use std::collections::HashMap;
use std::convert::TryInto;
use std::io::Write;
use std::mem::size_of;
use crate::block::Block;
use crate::error::Error;
use crate::nucleotide::Nucleotide;
use crate::reader::{Field, SIGNATURE};
pub mod fasta;
const FIELD_SIZE: usize = size_of::<Field>();
#[derive(Clone)]
pub struct SequenceLength {
pub name: String,
pub length: usize,
}
impl SequenceLength {
pub fn new<S: ToString>(name: &S, length: usize) -> Self {
Self {
name: name.to_string(),
length,
}
}
#[must_use]
pub const fn len(&self) -> usize {
self.length
}
#[must_use]
pub const fn is_empty(&self) -> bool {
self.length == 0
}
}
pub trait SequenceRead<'a> {
fn sequence_lengths(&'a self) -> Result<&[SequenceLength], Box<dyn std::error::Error>>;
fn nucleotides(&self, chr: &str) -> Result<Box<dyn Nucleotides>, Box<dyn std::error::Error>>;
fn soft_masked_blocks(&'a self, chr: &str) -> Result<&'a [Block], Box<dyn std::error::Error>>;
fn hard_masked_blocks(&'a self, chr: &str) -> Result<&'a [Block], Box<dyn std::error::Error>>;
}
pub trait Nucleotides {
fn read_chunk(
&mut self,
buf: &mut [Nucleotide],
) -> std::result::Result<Option<usize>, Box<dyn std::error::Error>>;
}
#[allow(clippy::too_many_lines)]
pub fn to_2bit<'a>(
writer: &mut dyn Write,
extractor: &'a (dyn SequenceRead<'a> + 'a),
) -> Result<(), Box<dyn std::error::Error>> {
let seqs = extractor.sequence_lengths()?;
let signature: Field = SIGNATURE;
let version: Field = 0;
let sequence_count: Field = seqs.len() as Field;
let reserved: Field = 0;
for input in &[&signature, &version, &sequence_count, &reserved] {
writer.write_all(&input.to_ne_bytes())?;
}
let mut index_size: Field = 0;
let mut sequence_records_size: Field = 0;
let mut relative_sequence_offsets = HashMap::<&str, Field>::new();
for seq_length in seqs.iter() {
let chr = &seq_length.name;
if !chr.is_ascii() {
return Err(Box::new(Error::FileFormat(format!(
"Chromosome name is not ASCII: {}",
&chr
))));
}
let addition_to_index: Field = usize2field(1 + chr.len() + FIELD_SIZE)?;
index_size += addition_to_index;
relative_sequence_offsets.insert(chr, sequence_records_size);
let hard_masked_blocks = extractor.hard_masked_blocks(chr)?;
let soft_masked_blocks = extractor.soft_masked_blocks(chr)?;
let length: usize = seq_length.length;
sequence_records_size += {
let block_bytewidth = size_of::<u32>() * 2; let integer = FIELD_SIZE + FIELD_SIZE + block_bytewidth * hard_masked_blocks.len()
+ FIELD_SIZE + block_bytewidth * soft_masked_blocks.len()
+ FIELD_SIZE; let condensed_length = length >> 2; let adjust_partial_quartet = if length % 4 == 0 { 0 } else { 1 };
usize2field(integer + condensed_length + adjust_partial_quartet)?
};
}
let sequence_records_start: Field = 16 + index_size; for seq_length in seqs.iter() {
let chr = &seq_length.name;
let name = chr.as_bytes();
let name_size: u8 = name.len().try_into().map_err(|_| {
Error::FileFormat(format!(
"chromosome name is longer than 255 characters: {}",
name.len()
))
})?;
let mut buf: Vec<u8> = Vec::with_capacity(2 * FIELD_SIZE + name.len());
buf.push(name_size);
buf.extend_from_slice(name);
let sequence_offset: Field = sequence_records_start
+ *relative_sequence_offsets
.get(chr as &str)
.expect("previous loop");
buf.extend_from_slice(&sequence_offset.to_ne_bytes());
writer.write_all(&buf)?;
}
for seq_length in seqs.iter() {
let chr = &seq_length.name;
let length = seq_length.length;
let length_field = (length as Field).to_ne_bytes();
writer.write_all(&length_field)?; write_blocks(writer, extractor.hard_masked_blocks(chr)?)?;
write_blocks(writer, extractor.soft_masked_blocks(chr)?)?;
writer.write_all(&reserved.to_ne_bytes())?;
let mut data_buf = [Nucleotide::N; 80];
let mut nuc_modulo: u8 = 0;
let mut byte: u8 = 0;
let mut buf = [0_u8; 1];
let mut sequence_length = 0;
let mut nucleotides = extractor.nucleotides(chr)?;
while let Some(num_nucleotides_read) = nucleotides
.read_chunk(&mut data_buf)
.map_err(|e| Error::FileFormat(e.to_string()))?
{
for (i, nuc) in data_buf.iter().enumerate() {
if i == num_nucleotides_read {
break;
}
byte |= nuc.bits();
nuc_modulo += 1;
if nuc_modulo == 4 {
buf[0] = byte;
writer.write_all(&buf)?;
byte = 0;
nuc_modulo = 0;
} else {
byte <<= 2;
}
sequence_length += 1;
}
}
if nuc_modulo > 0 {
byte <<= 2 * (4 - nuc_modulo); buf[0] = byte;
writer.write_all(&buf)?;
}
if sequence_length != length {
let msg = format!(
"The reported sequence length of {} for sequence {} is wrong (seen {} nucleotides)",
length, chr, sequence_length,
);
return Err(Box::new(Error::FileFormat(msg)));
}
}
Ok(())
}
fn write_blocks(writer: &mut dyn Write, blocks: &[Block]) -> Result<(), Error> {
let num_blocks: Field = usize2field(blocks.len())?;
writer.write_all(&num_blocks.to_ne_bytes())?;
let mut block_lengths = Vec::with_capacity(&blocks.len() * 4); for block in blocks {
let start: Field = usize2field(block.start)?;
writer.write_all(&start.to_ne_bytes())?;
let distance: Field = {
if let Some(diff) = block.end.checked_sub(block.start) {
usize2field(diff)?
} else {
return Err(Error::FileFormat("Block end < Block start".to_string()));
}
};
block_lengths.extend_from_slice(&distance.to_ne_bytes());
}
writer.write_all(&block_lengths)?;
Ok(())
}
fn usize2field(v: usize) -> Result<Field, Error> {
v.try_into()
.map_err(|_| Error::FileFormat(format!("Number {} is too big to store in a 2bit Field", v)))
}