use crate::character::Character;
use crate::converter::{Converter, IndexWithConverter};
use crate::sais;
use crate::suffix_array::{ArraySampler, PartialArray, IndexWithSA};
use crate::util;
use crate::wavelet_matrix::WaveletMatrix;
use crate::{BackwardIterableIndex, ForwardIterableIndex};
use fid::FID;
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize)]
pub struct RLFMIndex<T, C, S> {
converter: C,
suffix_array: S,
s: WaveletMatrix,
b: fid::BitVector,
bp: fid::BitVector,
cs: Vec<u64>,
len: u64,
_t: std::marker::PhantomData<T>,
}
impl<T, C, S> RLFMIndex<T, C, S>
where
T: Character,
C: Converter<T>,
{
pub fn new<B: ArraySampler<S>>(text: Vec<T>, converter: C, sampler: B) -> Self {
let n = text.len();
let m = converter.len();
let sa = sais::sais(&text, &converter);
let mut c0 = T::zero();
let mut s = Vec::new();
let mut b = fid::BitVector::new();
let mut runs_by_char: Vec<Vec<usize>> = vec![vec![]; m as usize];
for &k in &sa {
let k = k as usize;
let c = converter.convert(if k > 0 { text[k - 1] } else { text[n - 1] });
if c0 != c {
s.push(c);
b.push(true);
runs_by_char[c.into() as usize].push(1);
} else {
b.push(false);
match runs_by_char[c.into() as usize].last_mut() {
Some(r) => *r += 1,
None => unreachable!(),
};
}
c0 = c;
}
let s = WaveletMatrix::new_with_size(s, util::log2(m - 1) + 1);
let mut bp = fid::BitVector::new();
let mut cs = vec![0u64; m as usize];
let mut c = 0;
for (rs, ci) in runs_by_char.into_iter().zip(&mut cs) {
*ci = c;
c += rs.len() as u64;
for r in rs {
bp.push(true);
for _ in 0..(r - 1) {
bp.push(false);
}
}
}
RLFMIndex {
converter,
suffix_array: sampler.sample(sa),
s,
b,
bp,
cs,
len: n as u64,
_t: std::marker::PhantomData::<T>,
}
}
pub fn runs(&self) -> u64 {
self.s.len()
}
pub fn len(&self) -> u64 {
self.len
}
pub fn is_empty(&self) -> bool {
self.len == 0
}
}
impl<T, C, S> BackwardIterableIndex for RLFMIndex<T, C, S>
where
T: Character,
C: Converter<T>,
{
type T = T;
fn len(&self) -> u64 {
self.len
}
fn get_l(&self, i: u64) -> T {
self.s.access(self.b.rank1(i + 1) - 1)
}
fn lf_map(&self, i: u64) -> u64 {
let c = self.get_l(i);
let j = self.b.rank1(i);
let nr = self.s.rank(c, j);
self.bp.select1(self.cs[c.into() as usize] + nr) + i - self.b.select1(j)
}
fn lf_map2(&self, c: T, i: u64) -> u64 {
let c = self.converter.convert(c);
let j = self.b.rank1(i);
let nr = self.s.rank(c, j);
if self.get_l(i) != c {
self.bp.select1(self.cs[c.into() as usize] + nr)
} else {
self.bp.select1(self.cs[c.into() as usize] + nr) + i - self.b.select1(j)
}
}
}
impl<T, C, S> ForwardIterableIndex for RLFMIndex<T, C, S>
where
T: Character,
C: Converter<T>,
{
type T = T;
fn get_f(&self, i: u64) -> Self::T {
let mut s = 0;
let mut e = self.cs.len() as usize;
let r = self.bp.rank1(i + 1) - 1;
while e - s > 1 {
let m = s + (e - s) / 2;
if self.cs[m] <= r {
s = m;
} else {
e = m;
}
}
T::from_u64(s as u64)
}
fn fl_map(&self, i: u64) -> u64 {
let c = self.get_f(i);
let j = self.bp.rank1(i + 1) - 1;
let p = self.bp.select1(j);
let m = self.s.select(c, j - self.cs[c.into() as usize]);
let n = self.b.select1(m);
n + i - p
}
fn fl_map2(&self, c: Self::T, i: u64) -> u64 {
let c = self.converter.convert(c);
let j = self.bp.rank1(i + 1) - 1;
let p = self.bp.select1(j);
let m = self.s.select(c, j - self.cs[c.into() as usize]);
let n = self.b.select1(m);
n + i - p
}
fn len(&self) -> u64 {
self.len
}
}
impl<T, C, S> IndexWithSA for RLFMIndex<T, C, S>
where
T: Character,
C: Converter<T>,
S: PartialArray,
{
fn get_sa(&self, mut i: u64) -> u64 {
let mut steps = 0;
loop {
match self.suffix_array.get(i) {
Some(sa) => {
return (sa + steps) % self.len();
}
None => {
i = self.lf_map(i);
steps += 1;
}
}
}
}
}
impl<T, C, S> IndexWithConverter<T> for RLFMIndex<T, C, S>
where
C: Converter<T>,
{
type C = C;
fn get_converter(&self) -> &Self::C {
&self.converter
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::converter::RangeConverter;
use crate::search::BackwardSearchIndex;
use crate::suffix_array::{NullSampler, RegularSampler};
use fid::FID;
#[test]
fn test_count() {
let text = "mississippi\0".to_string().into_bytes();
let ans = vec![
("m", 1),
("mi", 1),
("i", 4),
("iss", 2),
("ss", 2),
("p", 2),
("ppi", 1),
("z", 0),
("pps", 0),
];
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
for (pattern, expected) in ans {
let search = rlfmi.search_backward(pattern);
let actual = search.count();
assert_eq!(
expected, actual,
"pattern \"{}\" must occur {} times, but {}",
pattern, expected, actual,
);
}
}
#[test]
fn test_locate() {
let text = "mississippi\0".to_string().into_bytes();
let ans = vec![
("m", vec![0]),
("mi", vec![0]),
("i", vec![1, 4, 7, 10]),
("iss", vec![1, 4]),
("ss", vec![2, 5]),
("p", vec![8, 9]),
("ppi", vec![8]),
("z", vec![]),
("pps", vec![]),
];
let fm_index = RLFMIndex::new(
text,
RangeConverter::new(b'a', b'z'),
RegularSampler::new().level(2),
);
for (pattern, positions) in ans {
let search = fm_index.search_backward(pattern);
let expected = positions.len() as u64;
let actual = search.count();
assert_eq!(
expected,
actual,
"pattern \"{}\" must occur {} times, but {}: {:?}",
pattern,
expected,
actual,
search.locate()
);
let mut res = search.locate();
res.sort();
assert_eq!(res, positions);
}
}
#[test]
fn test_s() {
let text = "mississippi\0".to_string().into_bytes();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let ans = "ipsm\0pisi".to_string().into_bytes();
for (i, a) in ans.into_iter().enumerate() {
let l: u8 = rlfmi.s.access(i as u64);
assert_eq!(rlfmi.converter.convert_inv(l), a);
}
}
#[test]
fn test_b() {
let text = "mississippi\0".to_string().into_bytes();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let n = rlfmi.len();
let ans = vec![1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0];
assert_eq!(n, rlfmi.b.len());
for (i, a) in ans.into_iter().enumerate() {
assert_eq!(rlfmi.b.get(i as u64), a != 0, "failed at b[{}]", i);
}
}
#[test]
fn test_bp() {
let text = "mississippi\0".to_string().into_bytes();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let n = rlfmi.len();
let ans = vec![1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0];
assert_eq!(n, rlfmi.bp.len());
for (i, a) in ans.into_iter().enumerate() {
assert_eq!(rlfmi.bp.get(i as u64), a != 0, "failed at bp[{}]", i);
}
}
#[test]
fn test_cs() {
let text = "mississippi\0".to_string().into_bytes();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let ans = vec![(b'\0', 0), (b'i', 1), (b'm', 4), (b'p', 5), (b's', 7)];
for (c, a) in ans {
let c = rlfmi.converter.convert(c) as usize;
assert_eq!(rlfmi.cs[c], a);
}
}
#[test]
fn test_get_l() {
let text = "mississippi\0".to_string().into_bytes();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let ans = "ipssm\0pissii".to_string().into_bytes();
for (i, a) in ans.into_iter().enumerate() {
let l = rlfmi.get_l(i as u64);
assert_eq!(rlfmi.converter.convert_inv(l as u8), a);
}
}
#[test]
fn test_lf_map() {
let text = "mississippi\0".to_string().into_bytes();
let ans = vec![1, 6, 7, 2, 8, 10, 3, 9, 11, 4, 5, 0];
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let mut i = 0;
for a in ans {
let next_i = rlfmi.lf_map(i);
assert_eq!(next_i, a, "should be lf_map({}) == {}", i, a);
i = next_i;
}
}
#[test]
fn test_lf_map2() {
let text = "mississippi\0".to_string().into_bytes();
let n = text.len() as u64;
let ans = vec![
(b'\0', (0, 1)),
(b'i', (1, 5)),
(b'm', (5, 6)),
(b'p', (6, 8)),
(b's', (8, 12)),
];
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
for (c, r) in ans {
let s = rlfmi.lf_map2(c, 0);
let e = rlfmi.lf_map2(c, n);
assert_eq!(
(s, e),
r,
"character {:?}: (s, e) should be {:?}, actual: {:?}",
c as char,
r,
(s, e)
);
}
}
#[test]
fn test_search_backward() {
let text = "mississippi\0".to_string().into_bytes();
let ans = vec![
("iss", (3, 5)),
("ppi", (7, 8)),
("si", (8, 10)),
("ssi", (10, 12)),
];
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
for (s, r) in ans {
let search = rlfmi.search_backward(s);
assert_eq!(search.get_range(), r);
}
}
#[test]
fn test_iter_backward() {
let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.\0".to_string().into_bytes();
let index = RLFMIndex::new(text, RangeConverter::new(b' ', b'~'), NullSampler::new());
let search = index.search_backward("sit ");
let mut prev_seq = search.iter_backward(0).take(6).collect::<Vec<_>>();
prev_seq.reverse();
assert_eq!(prev_seq, b"dolor ".to_owned());
}
#[test]
fn test_iter_forward() {
let text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.\0".to_string().into_bytes();
let index = RLFMIndex::new(text, RangeConverter::new(b' ', b'~'), NullSampler::new());
let search = index.search_backward("sit ");
let next_seq = search.iter_forward(0).take(10).collect::<Vec<_>>();
assert_eq!(next_seq, b"sit amet, ".to_owned());
}
#[test]
fn test_get_f() {
let text = "mississippi\0".to_string().into_bytes();
let mut ans = text.clone();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
ans.sort();
for (i, a) in ans.into_iter().enumerate() {
let f = rlfmi.get_f(i as u64);
assert_eq!(rlfmi.converter.convert_inv(f as u8), a);
}
}
#[test]
fn test_fl_map() {
let text = "mississippi\0".to_string().into_bytes();
let rlfmi = RLFMIndex::new(text, RangeConverter::new(b'a', b'z'), NullSampler::new());
let cases = vec![5u64, 0, 7, 10, 11, 4, 1, 6, 2, 3, 8, 9];
for (i, expected) in cases.into_iter().enumerate() {
let actual = rlfmi.fl_map(i as u64);
assert_eq!(actual, expected);
}
}
}