mod builder;
mod iter;
mod macros;
#[cfg(feature = "parallel")]
mod parallel;
mod reader;
#[cfg(feature = "serde")]
mod serde;
mod slice;
pub use builder::{SeqVecBuilder, SeqVecFromIterBuilder};
pub use iter::{SeqIter, SeqVecIntoIter, SeqVecIter};
pub use reader::SeqVecReader;
pub use slice::SeqVecSlice;
pub use crate::variable::codec::Codec;
#[allow(deprecated)]
pub use crate::variable::VariableCodecSpec;
use crate::common::codec_reader::CodecReader;
use crate::fixed::{Error as FixedVecError, FixedVec};
use crate::variable::traits::Storable;
use dsi_bitstream::{
dispatch::{Codes, CodesRead, StaticCodeRead},
impls::{BufBitWriter, MemWordWriterVec},
prelude::{BE, BitRead, BitSeek, BitWrite, CodesWrite, Endianness, LE},
};
use iter::SeqVecBitReader;
use mem_dbg::{DbgFlags, FlatType, MemDbgImpl, MemSize, SizeFlags};
use std::marker::PhantomData;
use std::{error::Error, fmt};
/// Errors produced while building or accessing a sequence vector.
#[derive(Debug)]
pub enum SeqVecError {
    /// Wraps an underlying [`std::io::Error`].
    Io(std::io::Error),
    /// Wraps a boxed error that is reported with a "Bitstream error" prefix.
    Bitstream(Box<dyn Error + Send + Sync>),
    /// The supplied parameters were rejected; the payload describes why.
    InvalidParameters(String),
    /// Reported with a "Codec dispatch error" prefix; the payload is the message.
    CodecDispatch(String),
    /// A sequence index was out of range; the payload is the offending index.
    IndexOutOfBounds(usize),
}

impl fmt::Display for SeqVecError {
    /// Writes a one-line, human-readable description: a fixed prefix naming the
    /// variant followed by its payload.
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
            Self::Io(cause) => write!(f, "I/O error: {}", cause),
            Self::Bitstream(cause) => write!(f, "Bitstream error: {}", cause),
            Self::InvalidParameters(detail) => write!(f, "Invalid parameters: {}", detail),
            Self::CodecDispatch(detail) => write!(f, "Codec dispatch error: {}", detail),
            Self::IndexOutOfBounds(index) => {
                write!(f, "Sequence index out of bounds: {}", index)
            }
        }
    }
}

impl Error for SeqVecError {
    /// Returns the wrapped error for the `Io` and `Bitstream` variants and
    /// `None` for the variants that only carry a message or an index.
    fn source(&self) -> Option<&(dyn Error + 'static)> {
        match self {
            Self::Io(cause) => Some(cause),
            Self::Bitstream(cause) => Some(cause.as_ref()),
            Self::InvalidParameters(_) | Self::CodecDispatch(_) | Self::IndexOutOfBounds(_) => {
                None
            }
        }
    }
}

impl From<std::io::Error> for SeqVecError {
    /// Wraps an I/O error in [`SeqVecError::Io`].
    fn from(io_error: std::io::Error) -> Self {
        Self::Io(io_error)
    }
}

impl From<core::convert::Infallible> for SeqVecError {
    /// Exists only so `?` can be used on `Result<_, Infallible>`; it can never
    /// actually run because `Infallible` has no values.
    fn from(never: core::convert::Infallible) -> Self {
        // An empty match on an uninhabited type is statically unreachable.
        match never {}
    }
}
impl From<FixedVecError> for SeqVecError {
fn from(e: FixedVecError) -> Self {
SeqVecError::InvalidParameters(e.to_string())
}
}
/// Crate-internal bit writer alias: a `BufBitWriter` with endianness `E` on top of a
/// `MemWordWriterVec` that stores `u64` words in a `Vec<u64>`.
pub(crate) type SeqVecBitWriter<E> = BufBitWriter<E, MemWordWriterVec<u64, Vec<u64>>>;
/// A collection of sequences of `T` values kept in a single encoded bitstream.
///
/// Each sequence is located through `bit_offsets`, which holds one start bit per
/// sequence followed by a final sentinel entry; the number of sequences is therefore
/// `bit_offsets.len() - 1`.
///
/// `B` is the backing storage type for both the encoded words and the offsets
/// (`Vec<u64>` by default).
#[derive(Debug, Clone)]
pub struct SeqVec<T: Storable, E: Endianness, B: AsRef<[u64]> = Vec<u64>> {
/// Backing `u64` words; the decoding routines read them as a bitstream.
data: B,
/// Start bit of every sequence plus a trailing sentinel entry, which `total_bits`
/// returns and which serves as the end bit of the last sequence.
bit_offsets: FixedVec<u64, u64, E, B>,
/// Optional per-sequence element counts. When present, decoding reads exactly that
/// many values; otherwise it reads until the next entry of `bit_offsets`.
seq_lengths: Option<FixedVec<u64, u64, E, Vec<u64>>>,
/// Code handed to the decoders for every value in `data`.
encoding: Codes,
/// Zero-sized marker for the element type `T` and the endianness `E`.
_markers: PhantomData<(T, E)>,
}
/// [`SeqVec`] with little-endian (`LE`) bit order.
pub type LESeqVec<T, B = Vec<u64>> = SeqVec<T, LE, B>;
/// [`SeqVec`] with big-endian (`BE`) bit order.
pub type BESeqVec<T, B = Vec<u64>> = SeqVec<T, BE, B>;
/// Alias of `SeqVec<T, LE, B>`, i.e. the same type as [`LESeqVec`].
// NOTE(review): the `U` prefix presumably signals unsigned element types, but the alias
// itself places no constraint on `T` — confirm the intended usage.
pub type USeqVec<T, B = Vec<u64>> = SeqVec<T, LE, B>;
/// Alias of `SeqVec<T, LE, B>`, i.e. the same type as [`LESeqVec`].
// NOTE(review): the `S` prefix presumably signals signed element types, but the alias
// itself places no constraint on `T` — confirm the intended usage.
pub type SSeqVec<T, B = Vec<u64>> = SeqVec<T, LE, B>;
/// Alias of `SeqVec<T, BE, B>`, i.e. the same type as [`BESeqVec`].
pub type BESSeqVec<T, B = Vec<u64>> = SeqVec<T, BE, B>;
/// Alias of `SeqVec<T, LE, B>`, i.e. the same type as [`LESeqVec`].
pub type LESSeqVec<T, B = Vec<u64>> = SeqVec<T, LE, B>;
/// Pair of slices over the same [`SeqVec`], as returned by `SeqVec::split_at`.
pub type SeqVecSlicePair<'a, T, E, B> = (SeqVecSlice<'a, T, E, B>, SeqVecSlice<'a, T, E, B>);
impl<T: Storable + 'static, E: Endianness> SeqVec<T, E, Vec<u64>> {
    /// Builds a `SeqVec` from a slice of sequences.
    ///
    /// This is a shorthand for configuring the builder with [`Codec::Auto`] and
    /// calling `build` on `sequences`.
    ///
    /// # Errors
    ///
    /// Returns whatever error the builder's `build` step reports.
    pub fn from_slices<S: AsRef<[T]>>(sequences: &[S]) -> Result<Self, SeqVecError>
    where
        SeqVecBitWriter<E>: BitWrite<E, Error = core::convert::Infallible> + CodesWrite<E>,
    {
        Self::builder()
            .codec(Codec::Auto)
            .build(sequences)
    }

    /// Consumes the collection and decodes every sequence into its own `Vec<T>`,
    /// preserving sequence order.
    pub fn into_vecs(self) -> Vec<Vec<T>>
    where
        for<'a> SeqVecBitReader<'a, E>: BitRead<E, Error = core::convert::Infallible>
            + CodesRead<E>
            + BitSeek<Error = core::convert::Infallible>,
    {
        let sequences = self.iter();
        sequences
            .map(|elements| elements.collect::<Vec<T>>())
            .collect()
    }
}
impl<T: Storable, E: Endianness, B: AsRef<[u64]>> SeqVec<T, E, B> {
    /// Assembles a `SeqVec` from raw parts, without stored sequence lengths.
    ///
    /// * `data` – encoded words.
    /// * `bit_offsets_data`, `bit_offsets_len`, `bit_offsets_num_bits` – forwarded to
    ///   `FixedVec::from_parts` to rebuild the offsets vector.
    /// * `encoding` – code used to decode the values.
    ///
    /// # Errors
    ///
    /// Returns [`SeqVecError::InvalidParameters`] when `bit_offsets_len` is zero or
    /// when `FixedVec::from_parts` rejects the offsets parameters.
    pub fn from_parts(
        data: B,
        bit_offsets_data: B,
        bit_offsets_len: usize,
        bit_offsets_num_bits: usize,
        encoding: Codes,
    ) -> Result<Self, SeqVecError> {
        // Same validation as the lengths-aware constructor; with `None` the
        // length-consistency check is simply skipped.
        Self::from_parts_with_lengths(
            data,
            bit_offsets_data,
            bit_offsets_len,
            bit_offsets_num_bits,
            None,
            encoding,
        )
    }

    /// Assembles a `SeqVec` from raw parts, optionally with per-sequence lengths.
    ///
    /// The checks run in this order: the offsets must have at least one entry
    /// (the sentinel); if `seq_lengths` is given it must hold exactly
    /// `bit_offsets_len - 1` entries; finally the offsets vector is rebuilt via
    /// `FixedVec::from_parts`.
    ///
    /// # Errors
    ///
    /// Returns [`SeqVecError::InvalidParameters`] if any of the checks fails.
    pub fn from_parts_with_lengths(
        data: B,
        bit_offsets_data: B,
        bit_offsets_len: usize,
        bit_offsets_num_bits: usize,
        seq_lengths: Option<FixedVec<u64, u64, E, Vec<u64>>>,
        encoding: Codes,
    ) -> Result<Self, SeqVecError> {
        if bit_offsets_len == 0 {
            let reason = "bit_offsets must have at least one entry (the sentinel)";
            return Err(SeqVecError::InvalidParameters(reason.to_string()));
        }

        let lengths_mismatch = matches!(
            &seq_lengths,
            Some(lengths) if lengths.len() + 1 != bit_offsets_len
        );
        if lengths_mismatch {
            let reason = "seq_lengths length must match number of sequences";
            return Err(SeqVecError::InvalidParameters(reason.to_string()));
        }

        let bit_offsets = FixedVec::<u64, u64, E, B>::from_parts(
            bit_offsets_data,
            bit_offsets_len,
            bit_offsets_num_bits,
        )?;

        Ok(Self {
            data,
            bit_offsets,
            seq_lengths,
            encoding,
            _markers: PhantomData,
        })
    }

    /// Assembles a `SeqVec` from already-built parts without any validation and
    /// without stored sequence lengths.
    ///
    /// # Safety
    ///
    /// No check is performed here. Other methods assume `bit_offsets` has at least
    /// one entry (the sentinel) and read it with unchecked accessors, so the caller
    /// must guarantee that, and that the offsets describe `data` under `encoding`.
    #[inline]
    pub unsafe fn from_parts_unchecked(
        data: B,
        bit_offsets: FixedVec<u64, u64, E, B>,
        encoding: Codes,
    ) -> Self {
        // SAFETY: the caller's obligations are exactly those of the delegate.
        unsafe { Self::from_parts_with_lengths_unchecked(data, bit_offsets, None, encoding) }
    }

    /// Assembles a `SeqVec` from already-built parts without any validation,
    /// optionally with per-sequence lengths.
    ///
    /// # Safety
    ///
    /// No check is performed here. The caller must guarantee that `bit_offsets` has
    /// at least one entry (the sentinel), that the offsets describe `data` under
    /// `encoding`, and that `seq_lengths`, when present, has one entry per sequence
    /// (it is read with unchecked accessors).
    #[inline]
    pub unsafe fn from_parts_with_lengths_unchecked(
        data: B,
        bit_offsets: FixedVec<u64, u64, E, B>,
        seq_lengths: Option<FixedVec<u64, u64, E, Vec<u64>>>,
        encoding: Codes,
    ) -> Self {
        Self {
            data,
            bit_offsets,
            seq_lengths,
            encoding,
            _markers: PhantomData,
        }
    }
}
impl<T: Storable, E: Endianness, B: AsRef<[u64]>> SeqVec<T, E, B> {
    /// Number of stored sequences: the offset entries minus the trailing sentinel.
    #[inline(always)]
    pub fn num_sequences(&self) -> usize {
        let offset_entries = self.bit_offsets.len();
        offset_entries - 1
    }

    /// Returns `true` when no sequence is stored.
    #[inline(always)]
    pub fn is_empty(&self) -> bool {
        0 == self.num_sequences()
    }

    /// Returns the code used for the stored values.
    #[inline(always)]
    pub fn encoding(&self) -> Codes {
        self.encoding
    }

    /// Borrows the backing storage as a slice of `u64` words.
    #[inline(always)]
    pub fn as_limbs(&self) -> &[u64] {
        self.data.as_ref()
    }

    /// Borrows the vector of bit offsets (including the sentinel entry).
    #[inline(always)]
    pub fn bit_offsets_ref(&self) -> &FixedVec<u64, u64, E, B> {
        &self.bit_offsets
    }

    /// Returns `true` when per-sequence lengths are stored.
    #[inline(always)]
    pub fn has_stored_lengths(&self) -> bool {
        self.seq_lengths.is_some()
    }

    /// Returns the stored element count of sequence `index`.
    ///
    /// Yields `None` when `index` is out of bounds or when no lengths are stored.
    #[inline(always)]
    pub fn sequence_len(&self, index: usize) -> Option<usize> {
        if index >= self.num_sequences() {
            return None;
        }
        let lengths = self.seq_lengths.as_ref()?;
        // SAFETY: `index` is below the sequence count; this relies on the lengths
        // vector holding one entry per sequence (checked by the validating
        // constructor, promised by callers of the unchecked ones).
        Some(unsafe { lengths.get_unchecked(index) as usize })
    }

    /// Returns the sentinel entry of the offsets, i.e. the last offset stored.
    #[inline(always)]
    pub fn total_bits(&self) -> u64 {
        let sentinel = self.bit_offsets.len() - 1;
        // SAFETY: `sentinel` is the last valid position, given that the offsets
        // hold at least one entry.
        unsafe { self.bit_offsets.get_unchecked(sentinel) }
    }

    /// Returns the bit position at which sequence `index` starts, or `None` when
    /// `index` is out of bounds.
    #[inline(always)]
    pub fn sequence_start_bit(&self, index: usize) -> Option<u64> {
        let in_bounds = index < self.num_sequences();
        // SAFETY: the closure only runs when `index` is below the sequence count,
        // which is itself below the number of offset entries.
        in_bounds.then(|| unsafe { self.bit_offsets.get_unchecked(index) })
    }

    /// Returns the bit position at which sequence `index` starts, without a bounds
    /// check in release builds.
    ///
    /// # Safety
    ///
    /// `index` must be less than [`Self::num_sequences`].
    #[inline(always)]
    pub unsafe fn sequence_start_bit_unchecked(&self, index: usize) -> u64 {
        debug_assert!(
            index < self.num_sequences(),
            "index {} out of bounds for {} sequences",
            index,
            self.num_sequences()
        );
        // SAFETY: guaranteed by the caller (see `# Safety`).
        unsafe { self.bit_offsets.get_unchecked(index) }
    }

    /// Returns the bit position at which sequence `index` ends (the next offset
    /// entry), without a bounds check in release builds.
    ///
    /// # Safety
    ///
    /// `index` must be less than [`Self::num_sequences`].
    #[inline]
    pub unsafe fn sequence_end_bit_unchecked(&self, index: usize) -> u64 {
        debug_assert!(
            index < self.num_sequences(),
            "index {} out of bounds for {} sequences",
            index,
            self.num_sequences()
        );
        let next_entry = index + 1;
        // SAFETY: `index < num_sequences` implies `index + 1 < bit_offsets.len()`.
        unsafe { self.bit_offsets.get_unchecked(next_entry) }
    }
}
impl<T: Storable, E: Endianness, B: AsRef<[u64]>> SeqVec<T, E, B>
where
    for<'a> SeqVecBitReader<'a, E>: BitRead<E, Error = core::convert::Infallible>
        + CodesRead<E>
        + BitSeek<Error = core::convert::Infallible>,
{
    /// Returns an iterator over the elements of sequence `index`, or `None` when
    /// `index` is out of bounds.
    #[inline(always)]
    pub fn get(&self, index: usize) -> Option<SeqIter<'_, T, E>> {
        if index < self.num_sequences() {
            // SAFETY: bounds checked just above.
            Some(unsafe { self.get_unchecked(index) })
        } else {
            None
        }
    }

    /// Returns an iterator over the elements of sequence `index` without a bounds
    /// check in release builds.
    ///
    /// The iterator is given the sequence's start and end bits, the encoding and,
    /// when lengths are stored, the element count.
    ///
    /// # Safety
    ///
    /// `index` must be less than [`Self::num_sequences`].
    #[inline(always)]
    pub unsafe fn get_unchecked(&self, index: usize) -> SeqIter<'_, T, E> {
        debug_assert!(
            index < self.num_sequences(),
            "index {} out of bounds for {} sequences",
            index,
            self.num_sequences()
        );
        // SAFETY: the caller guarantees `index < num_sequences`.
        let start_bit = unsafe { self.sequence_start_bit_unchecked(index) };
        // SAFETY: same guarantee as above.
        let end_bit = unsafe { self.sequence_end_bit_unchecked(index) };
        let stored_len = self.seq_lengths.as_ref().map(|lengths| {
            // SAFETY: relies on the lengths vector holding one entry per sequence.
            unsafe { lengths.get_unchecked(index) as usize }
        });
        let words = self.data.as_ref();
        SeqIter::new_with_len(words, start_bit, end_bit, self.encoding, stored_len)
    }

    /// Decodes sequence `index` into a fresh vector, or returns `None` when `index`
    /// is out of bounds.
    #[inline(always)]
    pub fn decode_vec(&self, index: usize) -> Option<Vec<T>> {
        if index < self.num_sequences() {
            // SAFETY: bounds checked just above.
            Some(unsafe { self.decode_vec_unchecked(index) })
        } else {
            None
        }
    }

    /// Decodes sequence `index` into a fresh vector without a bounds check in
    /// release builds.
    ///
    /// # Safety
    ///
    /// `index` must be less than [`Self::num_sequences`].
    #[inline(always)]
    pub unsafe fn decode_vec_unchecked(&self, index: usize) -> Vec<T> {
        // SAFETY: forwarded caller guarantee.
        let elements = unsafe { self.get_unchecked(index) };
        elements.collect()
    }

    /// Decodes sequence `index` into `buf`, replacing its previous contents.
    ///
    /// Returns the number of decoded elements, or `None` (leaving `buf` untouched)
    /// when `index` is out of bounds.
    #[inline(always)]
    pub fn decode_into(&self, index: usize, buf: &mut Vec<T>) -> Option<usize> {
        if index < self.num_sequences() {
            // SAFETY: bounds checked just above.
            Some(unsafe { self.decode_into_unchecked(index, buf) })
        } else {
            None
        }
    }

    /// Decodes sequence `index` into `buf` without a bounds check in release
    /// builds and returns the number of decoded elements.
    ///
    /// `buf` is cleared first. When lengths are stored, exactly that many values are
    /// read; otherwise values are read until the sequence's end bit is reached.
    ///
    /// # Safety
    ///
    /// `index` must be less than [`Self::num_sequences`].
    #[inline(always)]
    pub unsafe fn decode_into_unchecked(&self, index: usize, buf: &mut Vec<T>) -> usize {
        // SAFETY: the caller guarantees `index < num_sequences`.
        let start_bit = unsafe { self.sequence_start_bit_unchecked(index) };
        buf.clear();

        let words = dsi_bitstream::impls::MemWordReader::new_inf(self.data.as_ref());
        let mut reader = SeqVecBitReader::<E>::new(words);
        // The reader's error type is `Infallible`, so the result carries no information.
        let _ = reader.set_bit_pos(start_bit);
        let code_reader = CodecReader::new(self.encoding);

        match &self.seq_lengths {
            Some(lengths) => {
                // SAFETY: relies on the lengths vector holding one entry per sequence.
                let count = unsafe { lengths.get_unchecked(index) as usize };
                self.decode_counted(&mut reader, &code_reader, buf, count);
            }
            None => {
                // SAFETY: the caller guarantees `index < num_sequences`.
                let end_bit = unsafe { self.sequence_end_bit_unchecked(index) };
                self.decode_until(&mut reader, &code_reader, buf, end_bit);
            }
        }
        buf.len()
    }

    /// Appends exactly `count` decoded values to `buf`, reading from the reader's
    /// current position.
    #[inline(always)]
    fn decode_counted<'a>(
        &self,
        reader: &mut SeqVecBitReader<'a, E>,
        code_reader: &CodecReader<'a, E>,
        buf: &mut Vec<T>,
        count: usize,
    ) {
        buf.reserve(count);
        buf.extend((0..count).map(|_| {
            let word = code_reader.read(reader).unwrap();
            T::from_word(word)
        }));
    }

    /// Appends decoded values to `buf` for as long as the reader's bit position is
    /// below `end_bit`.
    #[inline(always)]
    fn decode_until<'a>(
        &self,
        reader: &mut SeqVecBitReader<'a, E>,
        code_reader: &CodecReader<'a, E>,
        buf: &mut Vec<T>,
        end_bit: u64,
    ) {
        loop {
            let position = reader.bit_pos().unwrap();
            if position >= end_bit {
                break;
            }
            let word = code_reader.read(reader).unwrap();
            buf.push(T::from_word(word));
        }
    }

    /// Returns an iterator that yields one element iterator per stored sequence.
    #[inline(always)]
    pub fn iter(&self) -> SeqVecIter<'_, T, E, B> {
        let words = self.data.as_ref();
        let lengths = self.seq_lengths.as_ref();
        let count = self.num_sequences();
        SeqVecIter::new(words, &self.bit_offsets, lengths, self.encoding, count)
    }

    /// Creates a [`SeqVecReader`] over this collection.
    #[inline(always)]
    pub fn reader(&self) -> SeqVecReader<'_, T, E, B> {
        SeqVecReader::new(self)
    }

    /// Returns a view over `len` sequences starting at `start`, or `None` when the
    /// requested range extends past the last sequence.
    pub fn slice(&self, start: usize, len: usize) -> Option<slice::SeqVecSlice<'_, T, E, B>> {
        // Saturating keeps the comparison meaningful if `start + len` would overflow.
        let requested_end = start.saturating_add(len);
        if requested_end <= self.num_sequences() {
            Some(slice::SeqVecSlice::new(self, start..start + len))
        } else {
            None
        }
    }

    /// Splits the collection into two views, `[0, mid)` and `[mid, num_sequences)`,
    /// or returns `None` when `mid` exceeds the number of sequences.
    pub fn split_at(&self, mid: usize) -> Option<SeqVecSlicePair<'_, T, E, B>> {
        if mid > self.num_sequences() {
            return None;
        }
        let head = slice::SeqVecSlice::new(self, 0..mid);
        let tail = slice::SeqVecSlice::new(self, mid..self.num_sequences());
        Some((head, tail))
    }
}
impl<T: Storable, E: Endianness, B: AsRef<[u64]> + MemSize + FlatType> MemSize for SeqVec<T, E, B> {
    /// Reports the size of the struct itself plus, for each owned container, the
    /// part of its reported size that exceeds its inline (stack) footprint.
    fn mem_size_rec(&self, flags: SizeFlags, _refs: &mut mem_dbg::HashMap<usize, usize>) -> usize {
        // Each field's inline size is already part of `size_of::<Self>()`, so it is
        // subtracted from the field's own total to avoid counting it twice.
        let data_extra = self.data.mem_size(flags) - core::mem::size_of::<B>();
        let offsets_extra =
            self.bit_offsets.mem_size(flags) - core::mem::size_of::<FixedVec<u64, u64, E, B>>();
        let lengths_extra = self.seq_lengths.as_ref().map_or(0, |lengths| {
            lengths.mem_size(flags) - core::mem::size_of::<FixedVec<u64, u64, E, Vec<u64>>>()
        });
        core::mem::size_of::<Self>() + data_extra + offsets_extra + lengths_extra
    }
}
/// Private wrapper borrowing a [`Codes`] value so that `MemSize` and `MemDbgImpl`
/// can be implemented for it inside this module.
struct CodeWrapper<'a>(&'a Codes);

impl MemSize for CodeWrapper<'_> {
    /// Reports the size of the wrapped `Codes` value; the wrapper's own reference
    /// is not counted.
    fn mem_size_rec(&self, _flags: SizeFlags, _refs: &mut mem_dbg::HashMap<usize, usize>) -> usize {
        core::mem::size_of::<Codes>()
    }
}
impl MemDbgImpl for CodeWrapper<'_> {
    /// Writes a single report line for the wrapped `Codes` value.
    ///
    /// The line contains, in order: the size (humanized or right-aligned raw
    /// bytes, depending on `DbgFlags::HUMANIZE`), an optional percentage of
    /// `total_size`, the tree prefix and branch glyph, the optional field name, the
    /// `Debug` rendering of the value when `DbgFlags::TYPE_NAME` is set (optionally
    /// colored), and the padding in bytes when it is non-zero.
    ///
    /// Nothing is written when `prefix` is longer than `max_depth`.
    fn _mem_dbg_depth_on(
        &self,
        writer: &mut impl core::fmt::Write,
        total_size: usize,
        max_depth: usize,
        prefix: &mut String,
        field_name: Option<&str>,
        is_last: bool,
        padded_size: usize,
        flags: DbgFlags,
        _dbg_refs: &mut mem_dbg::HashSet<usize>,
    ) -> core::fmt::Result {
        use core::fmt::Write;

        if prefix.len() > max_depth {
            return Ok(());
        }

        let real_size = self.mem_size(flags.to_size_flags());

        // The size columns are assembled first and emitted in one piece.
        let mut size_columns = String::new();
        if !flags.contains(DbgFlags::HUMANIZE) {
            write!(size_columns, "{:>9}", real_size)?;
        } else {
            let (value, uom) = mem_dbg::humanize_float(real_size);
            // Plain byte counts are printed as integers; larger units keep decimals.
            if uom == " B" {
                write!(size_columns, "{:>4}{}", value as usize, uom)?;
            } else {
                write!(size_columns, "{:>4.2}{}", value, uom)?;
            }
        }
        if flags.contains(DbgFlags::PERCENTAGE) {
            let share = 100.0 * real_size as f64 / total_size as f64;
            write!(size_columns, " {:>6.2}%", share)?;
        }
        writer.write_str(&size_columns)?;

        let branch = if is_last { "╰" } else { "├" };
        write!(writer, " {} {}", prefix, branch)?;
        if let Some(name) = field_name {
            write!(writer, "{}", name)?;
        }

        if flags.contains(DbgFlags::TYPE_NAME) {
            let colored = flags.contains(DbgFlags::COLOR);
            if colored {
                write!(writer, "{}", mem_dbg::type_color())?;
            }
            write!(writer, ": {:?}", self.0)?;
            if colored {
                write!(writer, "{}", mem_dbg::reset_color())?;
            }
        }

        let padding = padded_size - core::mem::size_of_val(self.0);
        if padding != 0 {
            write!(writer, " [{}B]", padding)?;
        }

        writeln!(writer)
    }

    /// Writes nothing: the wrapped value has no nested fields to report.
    fn _mem_dbg_rec_on(
        &self,
        _writer: &mut impl core::fmt::Write,
        _total_size: usize,
        _max_depth: usize,
        _prefix: &mut String,
        _is_last: bool,
        _flags: DbgFlags,
        _dbg_refs: &mut mem_dbg::HashSet<usize>,
    ) -> core::fmt::Result {
        Ok(())
    }
}
impl<T: Storable, E: Endianness, B: AsRef<[u64]> + MemDbgImpl + FlatType> MemDbgImpl for SeqVec<T, E, B> {
    /// Reports every field in declaration order: `data`, `bit_offsets`,
    /// `seq_lengths` (only when present), `encoding` and `_markers`.
    ///
    /// Each field is passed its own inline size as the padded size, and only
    /// `_markers` is flagged as the last entry.
    fn _mem_dbg_rec_on(
        &self,
        writer: &mut impl core::fmt::Write,
        total_size: usize,
        max_depth: usize,
        prefix: &mut String,
        _is_last: bool,
        flags: DbgFlags,
        _dbg_refs: &mut mem_dbg::HashSet<usize>,
    ) -> core::fmt::Result {
        let data_inline = core::mem::size_of_val(&self.data);
        self.data._mem_dbg_depth_on(
            writer,
            total_size,
            max_depth,
            prefix,
            Some("data"),
            false,
            data_inline,
            flags,
            _dbg_refs,
        )?;

        let offsets_inline = core::mem::size_of_val(&self.bit_offsets);
        self.bit_offsets._mem_dbg_depth_on(
            writer,
            total_size,
            max_depth,
            prefix,
            Some("bit_offsets"),
            false,
            offsets_inline,
            flags,
            _dbg_refs,
        )?;

        if let Some(lengths) = self.seq_lengths.as_ref() {
            let lengths_inline = core::mem::size_of_val(lengths);
            lengths._mem_dbg_depth_on(
                writer,
                total_size,
                max_depth,
                prefix,
                Some("seq_lengths"),
                false,
                lengths_inline,
                flags,
                _dbg_refs,
            )?;
        }

        // `Codes` is reported through the local wrapper that implements the traits.
        let encoding_inline = core::mem::size_of_val(&self.encoding);
        CodeWrapper(&self.encoding)._mem_dbg_depth_on(
            writer,
            total_size,
            max_depth,
            prefix,
            Some("encoding"),
            false,
            encoding_inline,
            flags,
            _dbg_refs,
        )?;

        let markers_inline = core::mem::size_of_val(&self._markers);
        self._markers._mem_dbg_depth_on(
            writer,
            total_size,
            max_depth,
            prefix,
            Some("_markers"),
            true,
            markers_inline,
            flags,
            _dbg_refs,
        )
    }
}
impl<T: Storable + PartialEq, E: Endianness, B: AsRef<[u64]>> PartialEq for SeqVec<T, E, B>
where
    for<'a> SeqVecBitReader<'a, E>: BitRead<E, Error = core::convert::Infallible>
        + CodesRead<E>
        + BitSeek<Error = core::convert::Infallible>,
{
    /// Two collections are equal when they hold the same number of sequences and
    /// every pair of corresponding sequences decodes to the same elements.
    ///
    /// The comparison is on decoded content only; the raw words, the offsets and
    /// the encoding are not compared directly.
    fn eq(&self, other: &Self) -> bool {
        let count = self.num_sequences();
        if count != other.num_sequences() {
            return false;
        }
        (0..count).all(|i| {
            // SAFETY: `i < count`, and both collections hold `count` sequences.
            let lhs = unsafe { self.get_unchecked(i) };
            // SAFETY: same bound as above.
            let rhs = unsafe { other.get_unchecked(i) };
            Iterator::eq(lhs, rhs)
        })
    }
}
impl<T: Storable, E: Endianness, B: AsRef<[u64]>> SeqVec<T, E, B>
where
    for<'a> SeqVecBitReader<'a, E>: BitRead<E, Error = core::convert::Infallible>
        + CodesRead<E>
        + BitSeek<Error = core::convert::Infallible>,
{
    /// Decodes the sequences at `indices`, returning them in the same order as
    /// `indices` (duplicates are allowed).
    ///
    /// # Errors
    ///
    /// Returns [`SeqVecError::IndexOutOfBounds`] carrying the first index in
    /// `indices` that is not below [`Self::num_sequences`].
    pub fn decode_many(&self, indices: &[usize]) -> Result<Vec<Vec<T>>, SeqVecError> {
        if indices.is_empty() {
            return Ok(Vec::new());
        }
        let limit = self.num_sequences();
        match indices.iter().copied().find(|&index| index >= limit) {
            Some(offending) => Err(SeqVecError::IndexOutOfBounds(offending)),
            // SAFETY: every index was just verified to be in bounds.
            None => Ok(unsafe { self.decode_many_unchecked(indices) }),
        }
    }

    /// Decodes the sequences at `indices` without validating them.
    ///
    /// Sequences are visited in ascending index order through a single reader,
    /// while each result is stored at the position its index had in `indices`.
    ///
    /// # Safety
    ///
    /// Every element of `indices` must be less than [`Self::num_sequences`].
    pub unsafe fn decode_many_unchecked(&self, indices: &[usize]) -> Vec<Vec<T>> {
        if indices.is_empty() {
            return Vec::new();
        }

        // (sequence index, position in `indices`), ordered by sequence index.
        let mut visit_order: Vec<(usize, usize)> = indices
            .iter()
            .copied()
            .enumerate()
            .map(|(slot, idx)| (idx, slot))
            .collect();
        visit_order.sort_unstable_by_key(|&(idx, _)| idx);

        let mut results: Vec<Vec<T>> = Vec::with_capacity(indices.len());
        for &idx in indices {
            // SAFETY: the caller guarantees every index is in bounds.
            let start = unsafe { self.sequence_start_bit_unchecked(idx) };
            // SAFETY: same guarantee as above.
            let end = unsafe { self.sequence_end_bit_unchecked(idx) };
            // Capacity heuristic: one slot per four encoded bits, at least one.
            let cap = ((end - start) / 4).max(1) as usize;
            results.push(Vec::with_capacity(cap));
        }

        let mut reader = self.reader();
        for (target_index, slot) in visit_order {
            let _ = reader.decode_into(target_index, &mut results[slot]);
        }
        results
    }

    /// Decodes the sequences at `indices` into `output`, which is cleared and
    /// resized to `indices.len()`.
    ///
    /// With empty `indices`, `output` is cleared and `Ok(())` is returned.
    ///
    /// # Errors
    ///
    /// Returns [`SeqVecError::IndexOutOfBounds`] carrying the first out-of-range
    /// index; in that case `output` is left unmodified.
    pub fn decode_many_into(
        &self,
        indices: &[usize],
        output: &mut Vec<Vec<T>>,
    ) -> Result<(), SeqVecError> {
        if indices.is_empty() {
            output.clear();
            return Ok(());
        }
        let limit = self.num_sequences();
        if let Some(offending) = indices.iter().copied().find(|&index| index >= limit) {
            return Err(SeqVecError::IndexOutOfBounds(offending));
        }

        output.clear();
        output.resize_with(indices.len(), Vec::new);
        // SAFETY: all indices were validated and `output` has one slot per index.
        unsafe { self.decode_many_into_unchecked(indices, output.as_mut_slice()) };
        Ok(())
    }

    /// Decodes the sequences at `indices` into the matching slots of `output`
    /// without validating the indices.
    ///
    /// Sequences are visited in ascending index order through a single reader;
    /// the result for `indices[i]` ends up in `output[i]`.
    ///
    /// # Safety
    ///
    /// Every element of `indices` must be less than [`Self::num_sequences`].
    /// `output` is expected to have the same length as `indices` (only asserted in
    /// debug builds).
    pub unsafe fn decode_many_into_unchecked(&self, indices: &[usize], output: &mut [Vec<T>]) {
        debug_assert_eq!(indices.len(), output.len());
        if indices.is_empty() {
            return;
        }

        // (sequence index, position in `indices`), ordered by sequence index.
        let mut visit_order: Vec<(usize, usize)> = indices
            .iter()
            .copied()
            .enumerate()
            .map(|(slot, idx)| (idx, slot))
            .collect();
        visit_order.sort_unstable_by_key(|&(idx, _)| idx);

        for (slot, &idx) in indices.iter().enumerate() {
            // SAFETY: the caller guarantees every index is in bounds.
            let start = unsafe { self.sequence_start_bit_unchecked(idx) };
            // SAFETY: same guarantee as above.
            let end = unsafe { self.sequence_end_bit_unchecked(idx) };
            // Capacity heuristic: one slot per four encoded bits, at least one.
            let cap = ((end - start) / 4).max(1) as usize;
            output[slot].reserve(cap);
        }

        let mut reader = self.reader();
        for (target_index, slot) in visit_order {
            let _ = reader.decode_into(target_index, &mut output[slot]);
        }
    }
}
impl<'a, T: Storable, E: Endianness, B: AsRef<[u64]>> IntoIterator for &'a SeqVec<T, E, B>
where
for<'b> SeqVecBitReader<'b, E>: BitRead<E, Error = core::convert::Infallible>
+ CodesRead<E>
+ BitSeek<Error = core::convert::Infallible>,
{
type Item = SeqIter<'a, T, E>;
type IntoIter = SeqVecIter<'a, T, E, B>;
#[inline]
fn into_iter(self) -> Self::IntoIter {
self.iter()
}
}
impl<T: Storable + 'static, E: Endianness + 'static> IntoIterator for SeqVec<T, E, Vec<u64>>
where
    for<'a> SeqVecBitReader<'a, E>: BitRead<E, Error = core::convert::Infallible>
        + CodesRead<E>
        + BitSeek<Error = core::convert::Infallible>,
{
    /// Iterator over the elements of one sequence.
    type Item = SeqIter<'static, T, E>;
    /// Owning iterator over all sequences.
    type IntoIter = SeqVecIntoIter<T, E>;

    /// Consumes the collection and hands it to a [`SeqVecIntoIter`].
    #[inline]
    fn into_iter(self) -> Self::IntoIter {
        SeqVecIntoIter::<T, E>::new(self)
    }
}