#![doc = include_str!("../README.md")]
use std::cmp::Ordering;
use std::collections::{BinaryHeap, HashMap};
use std::num::TryFromIntError;
use buffertk::Unpackable;
use prototk::FieldNumber;
pub mod binary_search;
pub mod bit_array;
pub mod bit_vector;
pub mod builder;
pub mod encoder;
pub mod isa;
pub mod psi;
pub mod sa;
pub mod sais;
pub mod sampled;
pub mod sigma;
pub mod wavelet_tree;
use crate::bit_vector::BitVector;
use crate::builder::{Builder, Helper};
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum Error {
IntoUsize,
Unparseable,
TextTooLong,
BadSearch,
BadTextOffset,
BadRecordOffset,
CouldNotConstructBitVector,
InvalidEncoder,
InvalidBitVector,
InvalidWaveletTree,
InvalidSigma,
InvalidSuffixArray,
InvalidInverseSuffixArray,
InvalidPsi,
InvalidDocument,
BadRank(usize),
BadSelect(usize),
BadIndex(usize),
LogicError(&'static str),
}
impl From<TryFromIntError> for Error {
fn from(_: TryFromIntError) -> Self {
Self::IntoUsize
}
}
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct TextOffset(pub usize);
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq, Ord, PartialOrd, Hash)]
pub struct RecordOffset(pub usize);
pub trait Document {
type Search: Iterator<Item = TextOffset>;
fn construct<H: Helper>(
text: Vec<u32>,
record_boundaries: Vec<usize>,
builder: &mut Builder<H>,
) -> Result<(), Error>;
fn len(&self) -> usize;
fn is_empty(&self) -> bool {
self.len() == 0
}
fn records(&self) -> usize;
fn search(&self, needle: &[u32]) -> Result<Self::Search, Error>;
fn count(&self, needle: &[u32]) -> Result<usize, Error>;
fn lookup(&self, offset: TextOffset) -> Result<RecordOffset, Error>;
fn retrieve(&self, record: RecordOffset) -> Result<Vec<u32>, Error>;
fn offset_of(&self, record: RecordOffset) -> Result<TextOffset, Error>;
}
#[derive(Clone, Debug, Default, prototk_derive::Message)]
struct ReferenceDocumentStub {
#[prototk(1, uint32)]
text: Vec<u32>,
#[prototk(2, uint64)]
record_boundaries: Vec<u64>,
}
impl From<ReferenceDocument> for ReferenceDocumentStub {
fn from(rd: ReferenceDocument) -> Self {
let ReferenceDocument {
text,
record_boundaries,
} = rd;
let record_boundaries = record_boundaries.into_iter().map(|x| x as u64).collect();
Self {
text,
record_boundaries,
}
}
}
impl TryFrom<ReferenceDocumentStub> for ReferenceDocument {
type Error = Error;
fn try_from(rds: ReferenceDocumentStub) -> Result<Self, Self::Error> {
let ReferenceDocumentStub {
text,
record_boundaries,
} = rds;
if record_boundaries.iter().any(|x| *x > usize::MAX as u64) {
return Err(Error::IntoUsize);
}
let record_boundaries = record_boundaries.into_iter().map(|x| x as usize).collect();
Ok(ReferenceDocument {
text,
record_boundaries,
})
}
}
pub struct ReferenceDocument {
text: Vec<u32>,
record_boundaries: Vec<usize>,
}
impl Document for ReferenceDocument {
type Search = std::vec::IntoIter<TextOffset>;
fn construct<H: Helper>(
text: Vec<u32>,
record_boundaries: Vec<usize>,
builder: &mut Builder<H>,
) -> Result<(), Error> {
check_record_boundaries(&text, &record_boundaries)?;
builder.append_vec_u32(FieldNumber::must(1), &text);
builder.append_vec_usize(FieldNumber::must(2), &record_boundaries);
Ok(())
}
fn len(&self) -> usize {
self.text.len()
}
fn records(&self) -> usize {
self.record_boundaries.len()
}
fn search(&self, needle: &[u32]) -> Result<Self::Search, Error> {
let offsets: Vec<TextOffset> = if needle.is_empty() {
(0..self.len()).map(TextOffset).collect()
} else {
let mut offsets = vec![];
for (idx, candidate) in self.text.windows(needle.len()).enumerate() {
if candidate == needle {
offsets.push(TextOffset(idx));
}
}
offsets
};
Ok(offsets.into_iter())
}
fn count(&self, needle: &[u32]) -> Result<usize, Error> {
Ok(self.search(needle)?.count())
}
fn lookup(&self, offset: TextOffset) -> Result<RecordOffset, Error> {
let partition_point = self.record_boundaries.partition_point(|b| *b < offset.0);
if partition_point >= self.record_boundaries.len() && !self.record_boundaries.is_empty() {
Ok(RecordOffset(self.record_boundaries.len() - 1))
} else if partition_point >= self.record_boundaries.len() {
Err(Error::BadTextOffset)
} else if self.record_boundaries[partition_point] <= offset.0 {
Ok(RecordOffset(partition_point))
} else {
Ok(RecordOffset(partition_point - 1))
}
}
fn retrieve(&self, record: RecordOffset) -> Result<Vec<u32>, Error> {
if record.0 >= self.record_boundaries.len() {
return Err(Error::BadRecordOffset);
}
let record_start = self.record_boundaries[record.0];
let record_limit = if record.0 + 1 >= self.record_boundaries.len() {
self.text.len()
} else {
self.record_boundaries[record.0 + 1]
};
Ok(Vec::from(&self.text[record_start..record_limit]))
}
fn offset_of(&self, record: RecordOffset) -> Result<TextOffset, Error> {
if record.0 >= self.record_boundaries.len() {
return Err(Error::BadRecordOffset);
}
Ok(TextOffset(self.record_boundaries[record.0]))
}
}
impl<'a> Unpackable<'a> for ReferenceDocument {
type Error = Error;
fn unpack<'b: 'a>(buf: &'b [u8]) -> Result<(Self, &'b [u8]), Self::Error> {
let (rds, buf) = <ReferenceDocumentStub as Unpackable>::unpack(buf)
.map_err(|_| Error::InvalidDocument)?;
Ok((rds.try_into()?, buf))
}
}
#[derive(Clone, Debug, Default, prototk_derive::Message)]
struct PsiDocumentStub<'a> {
#[prototk(1, bytes)]
record_boundaries: &'a [u8],
#[prototk(2, bytes)]
sigma: &'a [u8],
#[prototk(3, bytes)]
sa: &'a [u8],
#[prototk(4, bytes)]
isa: &'a [u8],
#[prototk(5, bytes)]
psi: &'a [u8],
}
pub struct PsiDocument<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
{
record_boundaries: bit_vector::sparse::BitVector<'a>,
sigma: sigma::Sigma<'a>,
sa: SA,
isa: ISA,
psi: PSI,
}
impl<'a, SA, ISA, PSI> PsiDocument<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
{
fn backwards_search(&self, needle: &[u32]) -> Result<(usize, usize), Error> {
let mut needle = needle.iter().rev();
let mut range = if let Some(t) = needle.next() {
self.sigma.sa_range_for(*t)?
} else {
return Ok((1usize, self.psi.len() - 1));
};
for t in needle {
range = self
.psi
.constrain(&self.sigma, self.sigma.sa_range_for(*t)?, range)?;
}
Ok(range)
}
}
impl<'a, SA, ISA, PSI> Document for PsiDocument<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
{
type Search = std::vec::IntoIter<TextOffset>;
fn construct<H: Helper>(
text: Vec<u32>,
record_boundaries: Vec<usize>,
builder: &mut Builder<H>,
) -> Result<(), Error> {
check_record_boundaries(&text, &record_boundaries)?;
let sparse_boundaries: Vec<usize> =
record_boundaries[1..].iter().map(|rb| rb - 1).collect();
bit_vector::sparse::BitVector::from_indices(
128,
text.len(),
&sparse_boundaries,
&mut builder.sub(FieldNumber::must(1)),
)
.ok_or(Error::InvalidBitVector)?;
let mut sigma_builder = builder.sub(FieldNumber::must(2));
sigma::Sigma::construct(text.iter().copied(), &mut sigma_builder)?;
let sigma_buf = sigma_builder.relative_bytes(0).to_vec();
let sigma = sigma::Sigma::unpack(&sigma_buf)?.0;
drop(sigma_builder);
let mut s = Vec::with_capacity(text.len() + 1);
for t in text.iter().copied() {
s.push(sigma.char_to_sigma(t).ok_or(Error::LogicError(
"freshly constructed sigma cannot translate text",
))?);
}
s.push(0);
drop(text);
let mut sa = vec![0usize; s.len()];
sais::sais(&sigma, &s, &mut sa)?;
drop(s);
SA::construct(6, &sa, &mut builder.sub(FieldNumber::must(3)))?;
let isa = inverse(&sa);
drop(sa);
ISA::construct(
&isa,
&record_boundaries,
&mut builder.sub(FieldNumber::must(4)),
)?;
let psi = psi::compute(&isa);
drop(isa);
PSI::construct(&sigma, &psi, &mut builder.sub(FieldNumber::must(5)))?;
Ok(())
}
fn len(&self) -> usize {
self.psi.len() - 1
}
fn records(&self) -> usize {
self.record_boundaries
.rank(self.record_boundaries.len())
.unwrap_or(0)
+ 1
}
fn search(&self, needle: &[u32]) -> Result<Self::Search, Error> {
let range = self.backwards_search(needle)?;
if range.0 > range.1 {
Ok(vec![].into_iter())
} else {
let mut result = Vec::with_capacity(range.1 - range.0 + 1);
for offset in range.0..=range.1 {
result.push(TextOffset(self.sa.lookup(
&self.sigma,
&self.psi,
offset,
)?));
}
result.sort();
Ok(result.into_iter())
}
}
fn count(&self, needle: &[u32]) -> Result<usize, Error> {
let range = self.backwards_search(needle)?;
if range.0 > range.1 {
Ok(0)
} else {
Ok(range.1 - range.0 + 1)
}
}
fn lookup(&self, offset: TextOffset) -> Result<RecordOffset, Error> {
Ok(RecordOffset(
self.record_boundaries
.rank(offset.0)
.ok_or(Error::BadRank(offset.0))?,
))
}
fn offset_of(&self, record: RecordOffset) -> Result<TextOffset, Error> {
Ok(TextOffset(
self.record_boundaries
.select(record.0)
.ok_or(Error::BadSelect(record.0))?,
))
}
fn retrieve(&self, record: RecordOffset) -> Result<Vec<u32>, Error> {
let start: usize = self
.record_boundaries
.select(record.0)
.ok_or(Error::BadSelect(record.0))?;
let limit: usize = self
.record_boundaries
.select(record.0 + 1)
.unwrap_or(self.len());
if start > limit {
return Err(Error::BadRecordOffset);
}
let mut idx = self.isa.lookup(start)?;
let mut result = Vec::with_capacity(limit - start);
for _ in start..limit {
result.push(self.sigma.sa_index_to_t(idx).ok_or(Error::InvalidSigma)?);
idx = self.psi.lookup(&self.sigma, idx)?;
}
Ok(result)
}
}
impl<'a, SA, ISA, PSI> Unpackable<'a> for PsiDocument<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray + Unpackable<'a>,
ISA: isa::InverseSuffixArray + Unpackable<'a>,
PSI: psi::Psi + Unpackable<'a>,
Error: From<<SA as Unpackable<'a>>::Error>
+ From<<ISA as Unpackable<'a>>::Error>
+ From<<PSI as Unpackable<'a>>::Error>,
{
type Error = Error;
fn unpack<'b: 'a>(buf: &'b [u8]) -> Result<(Self, &'b [u8]), Self::Error> {
let (stub, buf) =
<PsiDocumentStub as Unpackable>::unpack(buf).map_err(|_| Error::InvalidBitVector)?;
let record_boundaries = bit_vector::sparse::BitVector::new(stub.record_boundaries)
.ok_or(Error::InvalidBitVector)?;
let sigma = sigma::Sigma::unpack(stub.sigma)?.0;
let sa = SA::unpack(stub.sa)?.0;
let isa = ISA::unpack(stub.isa)?.0;
let psi = PSI::unpack(stub.psi)?.0;
Ok((
PsiDocument {
record_boundaries,
sigma,
sa,
isa,
psi,
},
buf,
))
}
}
pub type CompressedDocument<'a> = PsiDocument<
'a,
sa::SampledSuffixArray<'a>,
isa::SampledInverseSuffixArray<'a>,
psi::wavelet_tree::WaveletTreePsi<
'a,
wavelet_tree::prefix::WaveletTree<'a, encoder::HuffmanEncoder>,
>,
>;
#[derive(Clone, Debug, Default, Eq, PartialEq, Ord, PartialOrd)]
pub struct Exemplar {
count: usize,
needle: Vec<u32>,
}
impl Exemplar {
pub fn count(&self) -> usize {
self.count
}
pub fn text(&self) -> &[u32] {
&self.needle
}
}
impl From<CorrelateState> for Exemplar {
fn from(mut cs: CorrelateState) -> Self {
cs.needle.reverse();
Self {
count: cs
.docs
.values()
.map(|d| d.numer)
.fold(0, usize::saturating_add),
needle: cs.needle,
}
}
}
impl From<ExemplarState> for Exemplar {
fn from(mut es: ExemplarState) -> Self {
es.needle.reverse();
Self {
count: es
.docs
.values()
.map(|d| d.numer)
.fold(0, usize::saturating_add),
needle: es.needle,
}
}
}
#[derive(Debug, Default, Eq, PartialEq)]
struct CorrelateDocState {
numer: usize,
range: (usize, usize),
}
#[derive(Debug, Default, Eq, PartialEq)]
struct CorrelateState {
terminal: u32,
needle: Vec<u32>,
docs: HashMap<usize, CorrelateDocState>,
}
impl Ord for CorrelateState {
fn cmp(&self, other: &CorrelateState) -> Ordering {
if self == other {
Ordering::Equal
} else {
let numer_lhs = self
.docs
.values()
.map(|d| d.numer)
.fold(0, usize::saturating_add);
let numer_rhs = other
.docs
.values()
.map(|d| d.numer)
.fold(0, usize::saturating_add);
let denom_lhs = self
.docs
.values()
.map(|d| d.range.1 + 1 - d.range.0)
.fold(0, usize::saturating_add);
let denom_rhs = other
.docs
.values()
.map(|d| d.range.1 + 1 - d.range.0)
.fold(0, usize::saturating_add);
if denom_lhs == 0 && denom_rhs == 0 {
Ordering::Equal
} else if denom_lhs == 0 {
Ordering::Less
} else if denom_rhs == 0 {
Ordering::Greater
} else {
(numer_lhs as f64 / denom_lhs as f64)
.total_cmp(&(numer_rhs as f64 / denom_rhs as f64))
}
}
}
}
impl PartialOrd for CorrelateState {
fn partial_cmp(&self, other: &CorrelateState) -> Option<Ordering> {
Some(self.cmp(other))
}
}
pub fn correlate<'a, SA, ISA, PSI, F>(
docs: &'a [&'a PsiDocument<'a, SA, ISA, PSI>],
boundaries: &[(u32, u32)],
select: F,
) -> Correlate<'a, SA, ISA, PSI, F>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
F: Fn(usize, RecordOffset) -> bool,
{
let mut heap = BinaryHeap::new();
for boundary in boundaries.iter() {
let mut state = CorrelateState {
terminal: boundary.0,
needle: vec![boundary.1],
docs: HashMap::new(),
};
for (idx, doc) in docs.iter().enumerate() {
let Ok(range) = doc.sigma.sa_range_for(boundary.1) else {
continue;
};
if range.0 > range.1 {
continue;
}
let mut numer = 0;
for offset in range.0..=range.1 {
let Ok(text_offset) = doc.sa.lookup(&doc.sigma, &doc.psi, offset) else {
continue;
};
let Ok(record_offset) = doc.lookup(TextOffset(text_offset)) else {
continue;
};
if select(idx, record_offset) {
numer += 1;
}
}
if numer > 0 {
state.docs.insert(idx, CorrelateDocState { numer, range });
}
}
if !state.docs.is_empty() {
heap.push(state);
}
}
Correlate { docs, heap, select }
}
pub struct Correlate<'a, SA, ISA, PSI, F>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
F: Fn(usize, RecordOffset) -> bool,
{
docs: &'a [&'a PsiDocument<'a, SA, ISA, PSI>],
heap: BinaryHeap<CorrelateState>,
select: F,
}
impl<'a, SA, ISA, PSI, F> Iterator for Correlate<'a, SA, ISA, PSI, F>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
F: Fn(usize, RecordOffset) -> bool,
{
type Item = Exemplar;
fn next(&mut self) -> Option<Self::Item> {
while let Some(correlate) = self.heap.pop() {
if correlate.terminal == correlate.needle[correlate.needle.len() - 1] {
return Some(Exemplar::from(correlate));
}
let mut new_correlates: HashMap<u32, CorrelateState> = HashMap::new();
for (idx, doc_state) in correlate.docs.iter() {
let doc = &self.docs[*idx];
for i in 1..=doc.sigma.K() as u32 {
let Some(t) = doc.sigma.sigma_to_char(i) else {
continue;
};
let new_correlate = new_correlates.entry(t).or_insert_with(|| {
let mut needle = correlate.needle.clone();
needle.push(t);
CorrelateState {
terminal: correlate.terminal,
needle,
docs: HashMap::new(),
}
});
let Ok(range) = doc.sigma.sa_range_for_sigma(i) else {
continue;
};
if range.0 > range.1 {
continue;
}
let Ok(range) = doc.psi.constrain(&doc.sigma, range, doc_state.range) else {
continue;
};
if range.0 > range.1 {
continue;
}
let mut numer = 0;
for offset in range.0..=range.1 {
let Ok(text_offset) = doc.sa.lookup(&doc.sigma, &doc.psi, offset) else {
continue;
};
let Ok(record_offset) = doc.lookup(TextOffset(text_offset)) else {
continue;
};
if (self.select)(*idx, record_offset) {
numer += 1;
}
}
if numer > 0 {
new_correlate
.docs
.insert(*idx, CorrelateDocState { numer, range });
}
}
}
for (_, new_correlate) in new_correlates {
if !new_correlate.docs.is_empty() {
self.heap.push(new_correlate);
}
}
}
None
}
}
#[derive(Debug, Default, Eq, PartialEq)]
struct ExemplarState {
terminal: u32,
needle: Vec<u32>,
docs: HashMap<usize, CorrelateDocState>,
}
impl Ord for ExemplarState {
fn cmp(&self, other: &ExemplarState) -> Ordering {
if self == other {
Ordering::Equal
} else {
let numer_lhs = self
.docs
.values()
.map(|d| d.numer)
.fold(0, usize::saturating_add);
let numer_rhs = other
.docs
.values()
.map(|d| d.numer)
.fold(0, usize::saturating_add);
numer_lhs.cmp(&numer_rhs)
}
}
}
impl PartialOrd for ExemplarState {
fn partial_cmp(&self, other: &ExemplarState) -> Option<Ordering> {
Some(self.cmp(other))
}
}
pub fn exemplars<'a, SA, ISA, PSI>(
docs: &'a [&'a PsiDocument<'a, SA, ISA, PSI>],
boundaries: &[(u32, u32)],
) -> Exemplars<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
{
let mut heap = BinaryHeap::new();
for boundary in boundaries.iter() {
let mut state = ExemplarState {
terminal: boundary.0,
needle: vec![boundary.1],
docs: HashMap::new(),
};
for (idx, doc) in docs.iter().enumerate() {
let Ok(range) = doc.sigma.sa_range_for(boundary.1) else {
continue;
};
if range.0 > range.1 {
continue;
}
let numer = range.1 - range.0 + 1;
state.docs.insert(idx, CorrelateDocState { numer, range });
}
if !state.docs.is_empty() {
heap.push(state);
}
}
Exemplars { docs, heap }
}
pub struct Exemplars<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
{
docs: &'a [&'a PsiDocument<'a, SA, ISA, PSI>],
heap: BinaryHeap<ExemplarState>,
}
impl<'a, SA, ISA, PSI> Iterator for Exemplars<'a, SA, ISA, PSI>
where
SA: sa::SuffixArray,
ISA: isa::InverseSuffixArray,
PSI: psi::Psi,
{
type Item = Exemplar;
fn next(&mut self) -> Option<Self::Item> {
while let Some(correlate) = self.heap.pop() {
if correlate.terminal == correlate.needle[correlate.needle.len() - 1] {
return Some(Exemplar::from(correlate));
}
let mut new_correlates: HashMap<u32, ExemplarState> = HashMap::new();
for (idx, doc_state) in correlate.docs.iter() {
let doc = &self.docs[*idx];
for i in 1..=doc.sigma.K() as u32 {
let Some(t) = doc.sigma.sigma_to_char(i) else {
continue;
};
let new_correlate = new_correlates.entry(t).or_insert_with(|| {
let mut needle = correlate.needle.clone();
needle.push(t);
ExemplarState {
terminal: correlate.terminal,
needle,
docs: HashMap::new(),
}
});
let Ok(range) = doc.sigma.sa_range_for_sigma(i) else {
continue;
};
if range.0 > range.1 {
continue;
}
let Ok(range) = doc.psi.constrain(&doc.sigma, range, doc_state.range) else {
continue;
};
if range.0 > range.1 {
continue;
}
let numer = range.1 - range.0 + 1;
new_correlate
.docs
.insert(*idx, CorrelateDocState { numer, range });
}
}
for (_, new_correlate) in new_correlates {
if !new_correlate.docs.is_empty() {
self.heap.push(new_correlate);
}
}
}
None
}
}
fn inverse(x: &[usize]) -> Vec<usize> {
let mut ix = vec![0usize; x.len()];
for i in 0..x.len() {
ix[x[i]] = i
}
ix
}
fn check_record_boundaries(text: &[u32], record_boundaries: &[usize]) -> Result<(), Error> {
if record_boundaries.is_empty() {
return Err(Error::BadRecordOffset);
}
for (lhs, rhs) in std::iter::zip(record_boundaries.iter(), record_boundaries[1..].iter()) {
if lhs >= rhs {
return Err(Error::BadRecordOffset);
}
}
if record_boundaries[0] != 0 {
return Err(Error::BadRecordOffset);
}
if record_boundaries[record_boundaries.len() - 1] >= text.len() {
return Err(Error::BadRecordOffset);
}
Ok(())
}
#[cfg(test)]
pub mod test_util {
use crate::builder::Builder;
use crate::sigma::Sigma;
use super::*;
#[macro_export]
macro_rules! assert_eq_with_ctx {
(@inner [$($elems:tt)*] , $($rem:tt)*) => {
format!("{} = {:?}; {}", stringify!($($elems)*), $($elems)*, assert_eq_with_ctx!(@inner [] $($rem)*))
};
(@inner [$($elems:tt)*] $e:tt $($rem:tt)*) => {
assert_eq_with_ctx!(@inner [$($elems)* $e] $($rem)*)
};
(@inner [$($elems:tt)*]) => {
format!("{} = {:?}", stringify!($($elems)*), $($elems)*)
};
($lhs:expr, $rhs:expr, $($rem:expr),+) => {
assert_eq!($lhs, $rhs, "{} == {}; {}", stringify!($lhs), stringify!($rhs), assert_eq_with_ctx!(@inner [] $($rem),*));
};
($lhs:expr, $rhs:expr) => {
assert_eq!($lhs, $rhs, "{} == {}", stringify!($lhs), stringify!($rhs));
};
}
pub(crate) use assert_eq_with_ctx;
#[allow(non_snake_case)]
pub struct TestCase {
pub text: &'static str,
pub sigma2text: &'static [char],
pub boundaries: &'static [usize],
pub not_in_str: &'static [char],
pub S: &'static [u32],
pub SA: &'static [usize],
pub ISA: &'static [usize],
pub PSI: &'static [usize],
pub bucket_starts: &'static [usize],
pub bucket_limits: &'static [usize],
pub deref_SA: &'static [u32],
pub lstype: &'static str,
pub lmspos: &'static str,
pub searches: &'static [(&'static str, &'static [usize])],
pub table: &'static str,
#[allow(clippy::type_complexity)]
pub constrain: &'static [((usize, usize), (usize, usize), (usize, usize))],
}
impl TestCase {
pub fn sigma(&self) -> Vec<u8> {
let mut buf = Vec::new();
let mut builder = Builder::new(&mut buf);
Sigma::construct(self.text.chars().map(|c| c as u32), &mut builder)
.expect("test case should construct");
drop(builder);
buf
}
}
pub const BANANA: &TestCase = &TestCase {
text: "BANANA",
sigma2text: &['A', 'B', 'N'],
boundaries: &[1, 4, 5, 7],
not_in_str: &['C', 'D', 'E'],
S: &[2, 1, 3, 1, 3, 1, 0],
SA: &[6, 5, 3, 1, 0, 4, 2],
ISA: &[4, 3, 6, 2, 5, 1, 0],
PSI: &[4, 0, 5, 6, 3, 1, 2],
bucket_starts: &[0, 1, 4, 5],
bucket_limits: &[1, 4, 5, 7],
deref_SA: &[0, 1, 1, 1, 2, 3, 3],
lstype: "LSLSLLS",
lmspos: " * * *",
searches: &[("AN", &[1, 3]), ("NA", &[2, 4])],
table: "
A B N
[] [0] [] []
[] [] [] [1]
[] [] [3] [2]
[4] [] [] []
[] [5, 6] [] []
",
constrain: &[
((5, 6), (0, 7), (5, 6)),
((1, 3), (5, 6), (2, 3)),
((1, 3), (0, 7), (1, 3)),
((5, 6), (1, 3), (5, 6)),
((1, 3), (5, 6), (2, 3)),
((4, 4), (2, 3), (4, 4)),
],
};
pub const MISSISSIPPI: &TestCase = &TestCase {
text: "MISSISSIPPI",
sigma2text: &['I', 'M', 'P', 'S'],
boundaries: &[1, 5, 6, 8, 12],
not_in_str: &['A', 'B', 'N'],
S: &[2, 1, 4, 4, 1, 4, 4, 1, 3, 3, 1, 0],
SA: &[11, 10, 7, 4, 1, 0, 9, 8, 6, 3, 5, 2],
ISA: &[5, 4, 11, 9, 3, 10, 8, 2, 7, 6, 1, 0],
PSI: &[5, 0, 7, 10, 11, 4, 1, 6, 2, 3, 8, 9],
bucket_starts: &[0, 1, 5, 6, 8],
bucket_limits: &[1, 5, 6, 8, 12],
deref_SA: &[0, 1, 1, 1, 1, 2, 3, 3, 4, 4, 4, 4],
lstype: "LSLLSLLSLLLS",
lmspos: " * * * *",
searches: &[("ISS", &[1, 4])],
table: "
I M P S
[] [0] [] [] []
[] [] [] [1] []
[] [] [] [] [2]
[] [] [4] [] [3]
[5] [] [] [] []
[] [] [] [6] []
[] [7] [] [] []
[] [] [] [] [8, 9]
[] [10, 11] [] [] []
",
constrain: &[
((8, 11), (8, 11), (10, 11)),
((8, 11), (1, 4), (8, 9)),
((1, 4), (8, 11), (3, 4)),
((1, 4), (0, 0), (1, 1)),
((1, 4), (6, 7), (2, 2)),
((5, 5), (1, 4), (5, 5)),
((6, 7), (6, 7), (7, 7)),
],
};
pub const MISSISSIPPI_BANANA: &TestCase = &TestCase {
text: "MISSISSIPPIBANANA",
sigma2text: &['A', 'B', 'I', 'M', 'N', 'P', 'S'],
boundaries: &[1, 4, 5, 9, 10, 12, 14, 18],
not_in_str: &['C', 'D', 'E'],
S: &[4, 3, 7, 7, 3, 7, 7, 3, 6, 6, 3, 2, 1, 5, 1, 5, 1, 0],
SA: &[17, 16, 14, 12, 11, 10, 7, 4, 1, 0, 15, 13, 9, 8, 6, 3, 5, 2],
ISA: &[9, 8, 17, 15, 7, 16, 14, 6, 13, 12, 5, 4, 3, 11, 2, 10, 1, 0],
PSI: &[9, 0, 10, 11, 3, 4, 13, 16, 17, 8, 1, 2, 5, 12, 6, 7, 14, 15],
bucket_starts: &[0, 1, 4, 5, 9, 10, 12, 14],
bucket_limits: &[1, 4, 5, 9, 10, 12, 14, 18],
deref_SA: &[0, 1, 1, 1, 2, 3, 3, 3, 3, 4, 5, 5, 6, 6, 7, 7, 7, 7],
lstype: "LSLLSLLSLLLLSLSLLS",
lmspos: " * * * * * *",
searches: &[("ISS", &[1, 4])],
table: "
A B I M N P S
[] [0] [] [] [] [] [] []
[] [] [] [] [] [1] [] []
[] [] [3] [] [] [2] [] []
[] [] [] [4] [] [] [] []
[] [] [] [] [] [] [5] []
[] [] [] [] [] [] [] [6]
[] [] [] [] [8] [] [] [7]
[9] [] [] [] [] [] [] []
[] [10, 11] [] [] [] [] [] []
[] [] [] [] [] [] [12] []
[] [] [] [13] [] [] [] []
[] [] [] [] [] [] [] [14, 15]
[] [] [] [16, 17] [] [] [] []
",
constrain: &[],
};
pub const MIISSISSISSIPPI: &TestCase = &TestCase {
text: "MIISSISSISSIPPI",
sigma2text: &['I', 'M', 'P', 'S'],
boundaries: &[1, 7, 8, 10, 16],
not_in_str: &['A', 'B', 'N'],
S: &[2, 1, 1, 4, 4, 1, 4, 4, 1, 4, 4, 1, 3, 3, 1, 0],
SA: &[15, 14, 1, 11, 8, 5, 2, 0, 13, 12, 10, 7, 4, 9, 6, 3],
ISA: &[7, 2, 6, 15, 12, 5, 14, 11, 4, 13, 10, 3, 9, 8, 1, 0],
PSI: &[7, 0, 6, 9, 13, 14, 15, 2, 1, 8, 3, 4, 5, 10, 11, 12],
bucket_starts: &[0, 1, 7, 8, 10],
bucket_limits: &[1, 7, 8, 10, 16],
deref_SA: &[0, 1, 1, 1, 1, 1, 1, 2, 3, 3, 4, 4, 4, 4, 4, 4],
lstype: "LSSLLSLLSLLSLLLS",
lmspos: " * * * * *",
searches: &[("ISS", &[2, 5, 8])],
table: "
I M P S
[] [0] [] [] []
[] [] [] [1] []
[] [] [2] [] []
[] [] [] [] [3]
[] [6] [] [] [4, 5]
[7] [] [] [] []
[] [] [] [8] []
[] [9] [] [] []
[] [] [] [] [10, 11, 12]
[] [13, 14, 15] [] [] []
",
constrain: &[],
};
pub const MUTANT_BANANA: &TestCase = &TestCase {
text: "ANBNANANCANBNCNANANCNA",
sigma2text: &['A', 'B', 'C', 'N'],
boundaries: &[1, 8, 10, 13, 23],
not_in_str: &['D', 'E', 'F'],
S: &[
1, 4, 2, 4, 1, 4, 1, 4, 3, 1, 4, 2, 4, 3, 4, 1, 4, 1, 4, 3, 4, 1, 0,
],
SA: &[
22, 21, 4, 15, 0, 9, 6, 17, 2, 11, 8, 19, 13, 20, 3, 14, 5, 16, 1, 10, 7, 18, 12,
],
ISA: &[
4, 18, 8, 14, 2, 16, 6, 20, 10, 5, 19, 9, 22, 12, 15, 3, 17, 7, 21, 11, 13, 1, 0,
],
PSI: &[
4, 0, 16, 17, 18, 19, 20, 21, 14, 22, 5, 13, 15, 1, 2, 3, 6, 7, 8, 9, 10, 11, 12,
],
bucket_starts: &[0, 1, 8, 10, 13],
bucket_limits: &[1, 8, 10, 13, 23],
deref_SA: &[
0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
],
lstype: "SLSLSLSLLSLSLSLSLSLSLLS",
lmspos: " * * * * * * * * * *",
searches: &[("AN", &[0, 4, 6, 9, 15, 17])],
table: "
A B C N
[] [0] [] [] []
[] [] [] [] [1]
[4] [] [] [5] [2, 3, 6, 7]
[] [] [] [] [8, 9]
[] [] [] [] [10]
[] [] [] [] [11, 12]
[] [16, 17] [14] [13, 15] []
[] [18, 19] [] [] []
[] [20, 21] [22] [] []
",
constrain: &[],
};
pub const SAIS_EXAMPLE: &TestCase = &TestCase {
text: "baabababbab",
sigma2text: &['a', 'b'],
boundaries: &[1, 6, 12],
not_in_str: &['c', 'd', 'e'],
S: &[2, 1, 1, 2, 1, 2, 1, 2, 2, 1, 2, 0],
SA: &[11, 1, 9, 2, 4, 6, 10, 0, 8, 3, 5, 7],
ISA: &[7, 1, 3, 9, 4, 10, 5, 11, 8, 2, 6, 0],
PSI: &[7, 3, 6, 9, 10, 11, 0, 1, 2, 4, 5, 8],
bucket_starts: &[0, 1, 6],
bucket_limits: &[1, 6, 12],
deref_SA: &[0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2],
lstype: "LSSLSLSLLSLS",
lmspos: " * * * * *",
searches: &[("b", &[0, 3, 5, 7, 8, 10])],
table: "
a b
[] [] [0]
[] [] [1]
[] [3] [2, 4, 5]
[] [6] []
[7] [9, 10] [8]
[] [11] []
",
constrain: &[],
};
pub const BAD_CASE_1: &TestCase = &TestCase {
text: "1111111111112111",
sigma2text: &['1', '2'],
boundaries: &[1, 16, 17],
not_in_str: &['3'],
S: &[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 0],
SA: &[16, 15, 14, 13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
ISA: &[4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 3, 2, 1, 0],
PSI: &[4, 0, 1, 2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 3],
bucket_starts: &[0, 1, 16],
bucket_limits: &[1, 16, 17],
deref_SA: &[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2],
lstype: "SSSSSSSSSSSSLLLLS",
lmspos: " *",
searches: &[],
table: "
1 2
[] [0] []
[] [1] []
[4] [2, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14] [3]
[] [15] []
[] [16] []
",
constrain: &[((1, 15), (16, 16), (15, 15))],
};
pub const BAD_CASE_2: &TestCase = &TestCase {
text: "2121221211111132",
sigma2text: &['1', '2', '3'],
boundaries: &[1, 10, 16, 17],
not_in_str: &['4'],
S: &[2, 1, 2, 1, 2, 2, 1, 2, 1, 1, 1, 1, 1, 1, 3, 2, 0],
SA: &[16, 8, 9, 10, 11, 12, 6, 1, 3, 13, 15, 7, 5, 0, 2, 4, 14],
ISA: &[13, 7, 14, 8, 15, 12, 6, 11, 1, 2, 3, 4, 5, 9, 16, 10, 0],
PSI: &[13, 2, 3, 4, 5, 9, 11, 14, 15, 16, 0, 1, 6, 7, 8, 12, 10],
bucket_starts: &[0, 1, 10, 16],
bucket_limits: &[1, 10, 16, 17],
deref_SA: &[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3],
lstype: "LSLSLLSLSSSSSSLLS",
lmspos: " * * * * *",
searches: &[],
table: "
1 2 3
[] [] [0] []
[] [2, 3, 4, 5] [1] []
[] [] [6, 7, 8] []
[] [9] [] []
[] [] [] [10]
[13] [11, 14] [12] []
[] [15] [] []
[] [16] [] []
",
constrain: &[((16, 16), (1, 9), (16, 15))],
};
pub const BAD_CASE_3: &TestCase = &TestCase {
text: "2121221212121222",
sigma2text: &['1', '2'],
boundaries: &[1, 7, 17],
not_in_str: &['3'],
S: &[2, 1, 2, 1, 2, 2, 1, 2, 1, 2, 1, 2, 1, 2, 2, 2, 0],
SA: &[16, 6, 8, 1, 10, 3, 12, 15, 5, 7, 0, 9, 2, 11, 14, 4, 13],
ISA: &[10, 3, 12, 5, 15, 8, 1, 9, 2, 11, 4, 13, 6, 16, 14, 7, 0],
PSI: &[10, 9, 11, 12, 13, 15, 16, 0, 1, 2, 3, 4, 5, 6, 7, 8, 14],
bucket_starts: &[0, 1, 7],
bucket_limits: &[1, 7, 17],
deref_SA: &[0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2],
lstype: "LSLSLLSLSLSLSLLLS",
lmspos: " * * * * * * *",
searches: &[],
table: "
1 2
[] [] [0]
[] [] [1, 2, 3, 4, 5, 6]
[] [] [7]
[10] [9, 11, 12, 13] [8]
[] [15, 16] [14]
",
constrain: &[((1, 6), (1, 6), (1, 0))],
};
pub const BAD_CASE_4: &TestCase = &TestCase {
text: "12224122231222312222122221222212221122211222112221",
sigma2text: &['1', '2', '3', '4'],
boundaries: &[1, 15, 48, 50, 51],
not_in_str: &['5'],
S: &[1, 2, 2, 2, 4, 1, 2, 2, 2, 3, 1, 2, 2, 2, 3, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 2, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 1, 2, 2, 2, 1, 0],
SA: &[50, 49, 44, 39, 34, 45, 40, 35, 30, 25, 20, 15, 10, 5, 0, 48, 43, 38, 33, 29, 24, 19, 47, 42, 37, 32, 28, 23, 18, 46, 41, 36, 31, 27, 22, 17, 26, 21, 16, 11, 6, 1, 12, 7, 2, 13, 8, 3, 14, 9, 4],
ISA: &[14, 41, 44, 47, 50, 13, 40, 43, 46, 49, 12, 39, 42, 45, 48, 11, 38, 35, 28, 21, 10, 37, 34, 27, 20, 9, 36, 33, 26, 19, 8, 32, 25, 18, 4, 7, 31, 24, 17, 3, 6, 30, 23, 16, 2, 5, 29, 22, 15, 1, 0],
PSI: &[14, 0, 5, 6, 7, 29, 30, 31, 32, 36, 37, 38, 39, 40, 41, 1, 2, 3, 4, 8, 9, 10, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 33, 34, 35, 42, 43, 44, 45, 46, 47, 48, 49, 50, 11, 12, 13],
bucket_starts: &[0, 1, 15, 48, 50],
bucket_limits: &[1, 15, 48, 50, 51],
deref_SA: &[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 4],
lstype: "SSSSLSSSSLSSSSLSLLLLSLLLLSLLLLSLLLSSLLLSSLLLSSLLLLS",
lmspos: " * * * * * * * * * *",
searches: &[],
table: "
1 2 3 4
[] [0] [] [] []
[] [] [1] [] []
[] [] [2, 3, 4] [] []
[14] [5, 6, 7] [8, 9, 10] [11, 12] [13]
[] [] [15, 16, 17, 18, 19, 20, 21] [] []
[] [29, 30, 31, 32, 36, 37, 38, 39, 40, 41] [22, 23, 24, 25, 26, 27, 28, 33, 34, 35, 42, 43, 44] [] []
[] [] [45, 46] [] []
[] [] [47] [] []
[] [] [48, 49] [] []
[] [] [50] [] []
",
constrain: &[
((15, 47), (48, 49), (45, 46)),
((15, 47), (45, 46), (42, 43)),
((15, 47), (42, 43), (39, 40)),
((15, 47), (50, 50), (47, 47)),
((15, 47), (47, 47), (44, 44)),
],
};
#[macro_export]
macro_rules! test_cases_for {
($name:ident, $check:path) => {
mod $name {
#[test]
fn banana() {
$check($crate::test_util::BANANA);
}
#[test]
fn mississippi() {
$check($crate::test_util::MISSISSIPPI);
}
#[test]
fn mississippi_banana() {
$check($crate::test_util::MISSISSIPPI_BANANA);
}
#[test]
fn miissississippi() {
$check($crate::test_util::MIISSISSISSIPPI);
}
#[test]
fn mutant_banana() {
$check($crate::test_util::MUTANT_BANANA);
}
#[test]
fn sais_example() {
$check($crate::test_util::SAIS_EXAMPLE);
}
#[test]
fn bad_case1() {
$check($crate::test_util::BAD_CASE_1);
}
#[test]
fn bad_case2() {
$check($crate::test_util::BAD_CASE_2);
}
#[test]
fn bad_case3() {
$check($crate::test_util::BAD_CASE_3);
}
#[test]
fn bad_case4() {
$check($crate::test_util::BAD_CASE_4);
}
}
};
}
pub(crate) use test_cases_for;
#[macro_export]
macro_rules! search_cases_for {
($name:ident, $check:path) => {
mod $name {
use super::*;
fn build(t: &TestCase) -> Vec<u8> {
let text: Vec<u32> = t.text.chars().map(|c| c as u32).collect();
let record_boundaries = vec![0usize];
let mut buf = Vec::new();
let mut builder = Builder::new(&mut buf);
<$check>::construct(text, record_boundaries, &mut builder)
.expect("test case should index");
drop(builder);
buf
}
fn check_len(t: &TestCase) {
let doc = build(t);
let (doc, _) =
<$check>::unpack(&doc).expect("freshly created document should unpack");
assert_eq!(t.text.chars().count(), doc.len());
}
test_cases_for! {len, super::check_len}
fn check_retrieve(t: &TestCase) {
let text: Vec<u32> = t.text.chars().map(|c| c as u32).collect();
let doc = build(t);
let (doc, _) =
<$check>::unpack(&doc).expect("freshly created document should unpack");
assert_eq!(
text,
doc.retrieve(RecordOffset(0))
.expect("retrieve should succeed")
);
}
test_cases_for! {retrieve, super::check_retrieve}
fn check_search(t: &TestCase) {
let doc = build(t);
let (doc, _) =
<$check>::unpack(&doc).expect("freshly created document should unpack");
for (needle, offsets) in t.searches.iter() {
let expected: Vec<TextOffset> =
offsets.iter().map(|x| TextOffset(*x)).collect();
let needle_u32: Vec<u32> = needle.chars().map(|c| c as u32).collect();
let mut returned: Vec<TextOffset> = doc
.search(&needle_u32)
.expect("search should succeed")
.collect();
returned.sort();
assert_eq_with_ctx!(expected, returned, needle);
}
}
test_cases_for! {search, super::check_search}
fn check_count(t: &TestCase) {
let doc = build(t);
let (doc, _) =
<$check>::unpack(&doc).expect("freshly created document should unpack");
for (needle, offsets) in t.searches.iter() {
let needle_u32: Vec<u32> = needle.chars().map(|c| c as u32).collect();
let count: usize = doc.count(&needle_u32).expect("count should succeed");
assert_eq_with_ctx!(offsets.len(), count, needle);
}
}
test_cases_for! {count, super::check_count}
}
};
}
pub(crate) use search_cases_for;
}
#[cfg(test)]
mod tests {
use crate::isa::InverseSuffixArray;
use crate::sa::SuffixArray;
use super::psi::Psi;
use super::test_util::*;
use super::*;
#[test]
#[allow(clippy::assertions_on_constants)]
fn u32_as_usize() {
assert!(u32::BITS <= usize::BITS);
}
#[test]
#[allow(clippy::assertions_on_constants)]
fn usize_as_u64() {
assert!(usize::BITS <= u64::BITS);
}
#[test]
fn inverse() {
let x = &[8, 6, 9, 5, 0, 3, 1, 2, 7, 4];
let ix = &[4, 6, 7, 5, 9, 3, 1, 8, 0, 2];
let returned: &[usize] = &super::inverse(x);
assert_eq!(ix, returned);
let returned: &[usize] = &super::inverse(returned);
assert_eq!(x, returned);
}
fn check_string(t: &TestCase) {
let sigma = t.sigma();
let sigma = sigma::Sigma::unpack(&sigma).expect("test should unpack").0;
let mut s: Vec<u32> = t
.text
.chars()
.map(|x| sigma.char_to_sigma(x as u32).unwrap())
.collect();
s.push(0);
assert_eq!(t.S, s);
}
test_cases_for! {string, super::check_string}
fn check_isa(t: &TestCase) {
let returned = super::inverse(t.SA);
assert_eq_with_ctx!(t.ISA, returned);
}
test_cases_for! {isa, super::check_isa}
fn check_unpack_psi_document(t: &TestCase) {
let text: Vec<u32> = t.text.chars().map(|c| c as u32).collect();
let record_boundaries = vec![0usize];
let mut buf = Vec::new();
let mut builder = Builder::new(&mut buf);
type Document<'a> = super::PsiDocument<
'a,
super::sa::ReferenceSuffixArray,
super::isa::ReferenceInverseSuffixArray,
super::psi::ReferencePsi,
>;
Document::construct(text, record_boundaries, &mut builder).expect("test case should index");
drop(builder);
let doc = Document::unpack(&buf).expect("document should unpack").0;
for (idx, sa) in t.SA.iter().enumerate() {
assert_eq_with_ctx!(*sa, doc.sa.lookup(&doc.sigma, &doc.psi, idx).unwrap(), idx);
}
for (idx, isa) in t.ISA.iter().enumerate() {
assert_eq_with_ctx!(*isa, doc.isa.lookup(idx).unwrap(), idx);
}
assert_eq!(t.PSI.len(), doc.psi.len());
for (idx, psi) in t.PSI.iter().enumerate() {
assert_eq_with_ctx!(*psi, doc.psi.lookup(&doc.sigma, idx).unwrap(), idx);
}
}
test_cases_for! {unpack_psi_document, super::check_unpack_psi_document}
search_cases_for! {reference, crate::ReferenceDocument}
search_cases_for! {psi_with_all_reference, crate::PsiDocument::<crate::sa::ReferenceSuffixArray, crate::isa::ReferenceInverseSuffixArray, psi::ReferencePsi>}
search_cases_for! {psi_with_wavelet_psi, crate::PsiDocument::<crate::sa::ReferenceSuffixArray, crate::isa::ReferenceInverseSuffixArray, psi::wavelet_tree::WaveletTreePsi<wavelet_tree::ReferenceWaveletTree>>}
search_cases_for! {compressed_document, crate::CompressedDocument}
}