pub(crate) mod builder;
pub(crate) mod header;
pub use self::{builder::Builder, header::Header};
use std::io;
use noodles_core::Position;
use noodles_fasta as fasta;
use noodles_sam as sam;
use super::{CompressionHeader, ReferenceSequenceContext};
use crate::{
container::Block,
record::resolve::{resolve_bases, resolve_quality_scores},
BitReader, Record,
};
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Slice {
header: Header,
core_data_block: Block,
external_blocks: Vec<Block>,
}
impl Slice {
pub(crate) fn builder() -> Builder {
Builder::default()
}
pub(crate) fn new(header: Header, core_data_block: Block, external_blocks: Vec<Block>) -> Self {
Self {
header,
core_data_block,
external_blocks,
}
}
pub(crate) fn header(&self) -> &Header {
&self.header
}
pub(crate) fn core_data_block(&self) -> &Block {
&self.core_data_block
}
pub(crate) fn external_blocks(&self) -> &[Block] {
&self.external_blocks
}
pub fn records(&self, compression_header: &CompressionHeader) -> io::Result<Vec<Record>> {
use crate::reader::record::ExternalDataReaders;
let core_data_reader = self
.core_data_block
.decompressed_data()
.map(BitReader::new)?;
let mut external_data_readers = ExternalDataReaders::new();
for block in self.external_blocks() {
let reader = block.decompressed_data()?;
external_data_readers.insert(block.content_id(), reader);
}
let mut record_reader = crate::reader::record::Reader::new(
compression_header,
core_data_reader,
external_data_readers,
self.header.reference_sequence_context(),
);
let record_count = self.header().record_count();
let mut records = Vec::with_capacity(record_count);
let start_id = self.header().record_counter();
let end_id = start_id + (record_count as u64);
for id in start_id..end_id {
let mut record = record_reader.read_record()?;
record.id = id;
records.push(record);
}
Ok(records)
}
pub fn resolve_records(
&self,
reference_sequence_repository: &fasta::Repository,
header: &sam::Header,
compression_header: &CompressionHeader,
records: &mut [Record],
) -> io::Result<()> {
resolve_mates(records)?;
self.resolve_bases(
reference_sequence_repository,
header,
compression_header,
records,
)?;
self.resolve_quality_scores(records);
Ok(())
}
fn resolve_bases(
&self,
reference_sequence_repository: &fasta::Repository,
header: &sam::Header,
compression_header: &CompressionHeader,
records: &mut [Record],
) -> io::Result<()> {
let embedded_reference_sequence = if let Some(block_content_id) =
self.header().embedded_reference_bases_block_content_id()
{
let block = self
.external_blocks()
.iter()
.find(|block| block.content_id() == block_content_id)
.expect("invalid block content ID");
let data = block.decompressed_data()?;
let sequence = fasta::record::Sequence::from(data);
Some(sequence)
} else {
None
};
if compression_header
.preservation_map()
.is_reference_required()
{
if let ReferenceSequenceContext::Some(context) =
self.header().reference_sequence_context()
{
let reference_sequence_name = header
.reference_sequences()
.get_index(context.reference_sequence_id())
.map(|(_, rs)| rs.name())
.expect("invalid slice reference sequence ID");
let sequence = reference_sequence_repository
.get(reference_sequence_name)
.transpose()?
.expect("invalid slice reference sequence name");
let start = context.alignment_start();
let end = context.alignment_end();
let actual_md5 =
builder::calculate_normalized_sequence_digest(&sequence[start..=end]);
let expected_md5 = self.header().reference_md5();
if actual_md5 != expected_md5 {
return Err(io::Error::new(
io::ErrorKind::InvalidData,
format!(
"reference sequence checksum mismatch: expected {:?}, got {:?}",
expected_md5, actual_md5
),
));
}
}
}
for record in records {
if record.bam_flags().is_unmapped() || record.cram_flags().decode_sequence_as_unknown()
{
continue;
}
let mut alignment_start = record.alignment_start.expect("invalid alignment start");
let reference_sequence = if compression_header
.preservation_map()
.is_reference_required()
{
let rs = record
.reference_sequence(header.reference_sequences())
.transpose()?
.expect("invalid reference sequence ID");
let sequence = reference_sequence_repository
.get(rs.name())
.transpose()?
.expect("invalid reference sequence name");
Some(sequence)
} else if let Some(ref sequence) = embedded_reference_sequence {
let offset = match self.header().reference_sequence_context() {
ReferenceSequenceContext::Some(context) => {
usize::from(context.alignment_start())
}
_ => panic!("invalid slice alignment start"),
};
let start = usize::from(alignment_start) - offset + 1;
alignment_start = Position::try_from(start)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
Some(sequence.clone())
} else {
None
};
let substitution_matrix = compression_header.preservation_map().substitution_matrix();
let bases = resolve_bases(
reference_sequence.as_ref(),
substitution_matrix,
record.features(),
alignment_start,
record.read_length(),
)?;
record.bases = bases;
}
Ok(())
}
fn resolve_quality_scores(&self, records: &mut [Record]) {
for record in records {
if !record.flags().is_unmapped()
&& !record.cram_flags().are_quality_scores_stored_as_array()
{
let quality_scores =
resolve_quality_scores(record.features(), record.read_length());
record.quality_scores = quality_scores;
}
}
}
}
fn resolve_mates(records: &mut [Record]) -> io::Result<()> {
let mut mate_indices = vec![None; records.len()];
for (i, record) in records.iter().enumerate() {
if let Some(distance_to_next_fragment) = record.distance_to_next_fragment() {
let mate_index = i + distance_to_next_fragment + 1;
mate_indices[i] = Some(mate_index);
}
}
let mut i = 0;
while i < records.len() - 1 {
let record = &mut records[i];
if record.read_name().is_none() {
let read_name = record
.id()
.to_string()
.parse()
.map(Some)
.map_err(|e| io::Error::new(io::ErrorKind::InvalidData, e))?;
record.read_name = read_name;
}
if mate_indices[i].is_none() {
i += 1;
continue;
}
let mut j = i;
while let Some(mate_index) = mate_indices[j] {
let mid = j + 1;
let (left, right) = records.split_at_mut(mid);
let record = &mut left[j];
let mate = &mut right[mate_index - mid];
set_mate(record, mate);
j = mate_index;
}
let (left, right) = records.split_at_mut(j);
let record = &mut right[0];
let mate = &mut left[i];
set_mate(record, mate);
let template_size = calculate_template_size(record, mate);
records[i].template_size = template_size;
let mut j = i;
while let Some(mate_index) = mate_indices[j] {
let record = &mut records[mate_index];
record.template_size = -template_size;
mate_indices[j] = None;
j = mate_index;
}
i += 1;
}
Ok(())
}
fn set_mate(mut record: &mut Record, mate: &mut Record) {
let mate_bam_flags = mate.bam_flags();
if mate_bam_flags.is_reverse_complemented() {
record.bam_bit_flags |= sam::record::Flags::MATE_REVERSE_COMPLEMENTED;
}
if mate_bam_flags.is_unmapped() {
record.bam_bit_flags |= sam::record::Flags::MATE_UNMAPPED;
}
if mate.read_name().is_none() {
mate.read_name = record.read_name().cloned();
}
record.next_fragment_reference_sequence_id = mate.reference_sequence_id();
record.next_mate_alignment_start = mate.alignment_start;
}
fn calculate_template_size(record: &Record, mate: &Record) -> i32 {
use std::cmp;
let start = cmp::min(record.alignment_start(), mate.alignment_start())
.map(usize::from)
.expect("invalid start positions");
let end = cmp::max(record.alignment_end(), mate.alignment_end())
.map(usize::from)
.expect("invalid end positions");
if start > end {
(start - end + 1) as i32
} else {
(end - start + 1) as i32
}
}
#[cfg(test)]
mod tests {
use noodles_core::Position;
use super::*;
#[test]
fn test_resolve_mates() -> Result<(), Box<dyn std::error::Error>> {
use sam::record::ReadName;
use crate::record::Flags;
let mut records = vec![
Record::builder()
.set_id(1)
.set_flags(Flags::HAS_MATE_DOWNSTREAM)
.set_reference_sequence_id(2)
.set_read_length(4)
.set_alignment_start(Position::try_from(5)?)
.set_distance_to_next_fragment(0)
.build(),
Record::builder()
.set_id(2)
.set_flags(Flags::HAS_MATE_DOWNSTREAM)
.set_reference_sequence_id(2)
.set_read_length(4)
.set_alignment_start(Position::try_from(8)?)
.set_distance_to_next_fragment(1)
.build(),
Record::builder().set_id(3).build(),
Record::builder()
.set_id(4)
.set_reference_sequence_id(2)
.set_read_length(4)
.set_alignment_start(Position::try_from(13)?)
.build(),
];
resolve_mates(&mut records)?;
let read_name_1 = ReadName::try_from(b"1".to_vec())?;
assert_eq!(records[0].read_name(), Some(&read_name_1));
assert_eq!(
records[0].next_fragment_reference_sequence_id(),
records[1].reference_sequence_id()
);
assert_eq!(
records[0].mate_alignment_start(),
records[1].alignment_start(),
);
assert_eq!(records[0].template_size(), 12);
assert_eq!(records[1].read_name(), Some(&read_name_1));
assert_eq!(
records[1].next_fragment_reference_sequence_id(),
records[3].reference_sequence_id()
);
assert_eq!(
records[1].mate_alignment_start(),
records[3].alignment_start(),
);
assert_eq!(records[1].template_size(), -12);
let read_name_3 = ReadName::try_from(b"3".to_vec())?;
assert_eq!(records[2].read_name(), Some(&read_name_3));
assert_eq!(records[3].read_name(), Some(&read_name_1));
assert_eq!(
records[3].next_fragment_reference_sequence_id(),
records[0].reference_sequence_id()
);
assert_eq!(
records[3].mate_alignment_start(),
records[0].alignment_start(),
);
assert_eq!(records[3].template_size(), -12);
Ok(())
}
#[test]
fn test_calculate_template_size() -> Result<(), noodles_core::position::TryFromIntError> {
use sam::record::Flags;
let record = Record::builder()
.set_alignment_start(Position::try_from(100)?)
.set_read_length(50)
.build();
let mate = Record::builder()
.set_alignment_start(Position::try_from(200)?)
.set_read_length(50)
.build();
assert_eq!(calculate_template_size(&record, &mate), 150);
assert_eq!(calculate_template_size(&mate, &record), 150);
let record = Record::builder()
.set_alignment_start(Position::try_from(100)?)
.set_read_length(50)
.build();
let mate = Record::builder()
.set_bam_flags(Flags::REVERSE_COMPLEMENTED)
.set_alignment_start(Position::try_from(200)?)
.set_read_length(50)
.build();
assert_eq!(calculate_template_size(&record, &mate), 150);
assert_eq!(calculate_template_size(&mate, &record), 150);
let record = Record::builder()
.set_bam_flags(Flags::REVERSE_COMPLEMENTED)
.set_alignment_start(Position::try_from(100)?)
.set_read_length(50)
.build();
let mate = Record::builder()
.set_alignment_start(Position::try_from(200)?)
.set_read_length(50)
.build();
assert_eq!(calculate_template_size(&record, &mate), 150);
assert_eq!(calculate_template_size(&mate, &record), 150);
let record = Record::builder()
.set_bam_flags(Flags::REVERSE_COMPLEMENTED)
.set_alignment_start(Position::try_from(100)?)
.set_read_length(50)
.build();
let mate = Record::builder()
.set_bam_flags(Flags::REVERSE_COMPLEMENTED)
.set_alignment_start(Position::try_from(200)?)
.set_read_length(50)
.build();
assert_eq!(calculate_template_size(&record, &mate), 150);
assert_eq!(calculate_template_size(&mate, &record), 150);
Ok(())
}
}