use std::ops::Range;
#[cfg(feature = "pack")]
use std::{
io::{Read, Result, Write},
path::Path,
};
#[cfg(feature = "pack")]
use super::packed_sa::PackedSuffixArray;
use super::saca::saca;
use super::utils::*;
#[derive(Clone)]
pub struct SuffixArray<'a> {
s: &'a [u8],
sa: Vec<u32>,
bkt: Option<Vec<u32>>,
}
impl<'a> SuffixArray<'a> {
pub fn new(s: &'a [u8]) -> Self {
let mut sa = vec![0; s.len() + 1];
saca(s, &mut sa[..]);
SuffixArray { s, sa, bkt: None }
}
pub fn set(&mut self, s: &'a [u8]) {
self.sa.resize(s.len() + 1, 0);
saca(s, &mut self.sa[..]);
}
pub fn fit(&mut self) {
self.sa.shrink_to_fit()
}
pub fn len(&self) -> usize {
self.s.len()
}
pub fn is_empty(&self) -> bool {
self.len() == 0
}
pub fn into_parts(self) -> (&'a [u8], Vec<u32>) {
(self.s, self.sa)
}
pub fn from_parts(s: &'a [u8], sa: Vec<u32>) -> Option<Self> {
let compose = SuffixArray { s, sa, bkt: None };
if compose.check_integrity() {
Some(compose)
} else {
None
}
}
pub unsafe fn unchecked_from_parts(s: &'a [u8], sa: Vec<u32>) -> Self {
SuffixArray { s, sa, bkt: None }
}
fn check_integrity(&self) -> bool {
if self.s.len() + 1 != self.sa.len() {
return false;
}
for i in 1..self.sa.len() {
let x = &self.s[self.sa[i - 1] as usize..];
let y = &self.s[self.sa[i] as usize..];
if x >= y {
return false;
}
}
true
}
pub fn enable_buckets(&mut self) {
if self.bkt.is_some() {
return;
}
let mut bkt = vec![0; 256 * 257 + 1];
bkt[0] = 1;
if self.s.len() > 0 {
for i in 0..self.s.len() - 1 {
let c0 = unsafe { *self.s.get_unchecked(i) };
let c1 = unsafe { *self.s.get_unchecked(i + 1) };
let idx = (c0 as usize * 257) + (c1 as usize + 1) + 1;
bkt[idx] += 1;
}
let c0 = unsafe { *self.s.get_unchecked(self.s.len() - 1) };
let idx = (c0 as usize * 257) + 1;
bkt[idx] += 1;
}
let mut sum = 0;
for p in bkt.iter_mut() {
sum += *p;
*p = sum;
}
self.bkt = Some(bkt);
}
#[inline]
fn get_bucket(&self, pat: &[u8]) -> Range<usize> {
if let Some(ref bkt) = self.bkt {
if pat.len() > 1 {
let c0 = pat[0];
let c1 = pat[1];
let idx = (c0 as usize * 257) + (c1 as usize + 1) + 1;
bkt[idx - 1] as usize..bkt[idx] as usize
} else if pat.len() == 1 {
let c0 = pat[0];
let start_idx = c0 as usize * 257;
let end_idx = start_idx + 257;
bkt[start_idx] as usize..bkt[end_idx] as usize
} else {
0..1
}
} else {
0..self.sa.len()
}
}
#[inline]
fn get_top_bucket(&self, pat: &[u8]) -> Range<usize> {
if let Some(ref bkt) = self.bkt {
if pat.len() > 0 {
let c0 = pat[0];
let start_idx = c0 as usize * 257;
let end_idx = start_idx + 257;
bkt[start_idx] as usize..bkt[end_idx] as usize
} else {
0..1
}
} else {
0..self.sa.len()
}
}
pub fn contains(&self, pat: &[u8]) -> bool {
let s = self.s;
let sa = &self.sa[self.get_bucket(pat)];
sa.binary_search_by_key(&pat, |&i| trunc(&s[i as usize..], pat.len()))
.is_ok()
}
pub fn search_all(&self, pat: &[u8]) -> &[u32] {
let s = self.s;
let sa = if pat.len() > 0 {
&self.sa[self.get_bucket(pat)]
} else {
&self.sa[..]
};
let mut i = 0;
let mut k = sa.len();
while i < k {
let m = i + (k - i) / 2;
if pat > &s[sa[m] as usize..] {
i = m + 1;
} else {
k = m;
}
}
let mut j = i;
let mut k = sa.len();
while j < k {
let m = j + (k - j) / 2;
if s[sa[m] as usize..].starts_with(pat) {
j = m + 1;
} else {
k = m;
}
}
&sa[i..j]
}
pub fn search_lcp(&self, pat: &[u8]) -> Range<usize> {
let s = self.s;
let sa = &self.sa[self.get_bucket(pat)];
if sa.len() == 0 {
let sa = &self.sa[self.get_top_bucket(pat)];
if sa.len() > 0 {
let i = sa[0] as usize;
return i..i + 1;
} else {
return self.s.len()..self.s.len();
}
}
match sa.binary_search_by(|&i| s[i as usize..].cmp(pat)) {
Ok(i) => {
let start = sa[i] as usize;
start..s.len()
}
Err(i) => {
if i > 0 && i < sa.len() {
let start_a = sa[i - 1] as usize;
let start_b = sa[i] as usize;
let len_a = lcp(pat, &s[start_a..]);
let len_b = lcp(pat, &s[start_b..]);
if len_a > len_b {
start_a..start_a + len_a
} else {
start_b..start_b + len_b
}
} else if i == 0 {
let start = sa[i] as usize;
let len = lcp(pat, &s[start..]);
start..start + len
} else {
let start = sa[i - 1] as usize;
let len = lcp(pat, &s[start..]);
start..start + len
}
}
}
}
#[cfg(feature = "pack")]
pub fn dump<W: Write>(&self, file: W) -> Result<()> {
let psa = PackedSuffixArray::from_sa(&self.sa[..]);
psa.dump(file)
}
#[cfg(feature = "pack")]
pub fn dump_file<P: AsRef<Path>>(&self, name: P) -> Result<()> {
use std::fs::File;
use std::io::BufWriter;
let file = BufWriter::new(File::create(name)?);
let psa = PackedSuffixArray::from_sa(&self.sa[..]);
psa.dump(file)
}
#[cfg(feature = "pack")]
pub fn dump_bytes(&self) -> Result<Vec<u8>> {
let psa = PackedSuffixArray::from_sa(&self.sa[..]);
psa.dump_bytes()
}
#[cfg(feature = "pack")]
pub unsafe fn unchecked_load<R: Read>(
s: &'a [u8],
file: R,
) -> Result<Self> {
let psa = PackedSuffixArray::load(file)?;
let sa = psa.into_sa();
Ok(Self::unchecked_from_parts(s, sa))
}
#[cfg(feature = "pack")]
pub fn load<R: Read>(s: &'a [u8], file: R) -> Result<Self> {
use std::io::{Error, ErrorKind};
let sa = unsafe { Self::unchecked_load(s, file)? };
if !sa.check_integrity() {
Err(Error::new(
ErrorKind::InvalidData,
"inconsistent suffix array",
))
} else {
Ok(sa)
}
}
#[cfg(feature = "pack")]
pub unsafe fn unchecked_load_file<P: AsRef<Path>>(
s: &'a [u8],
name: P,
) -> Result<Self> {
use std::fs::File;
use std::io::BufReader;
let file = BufReader::new(File::open(name)?);
Self::unchecked_load(s, file)
}
#[cfg(feature = "pack")]
pub fn load_file<P: AsRef<Path>>(s: &'a [u8], name: P) -> Result<Self> {
use std::io::{Error, ErrorKind};
let sa = unsafe { Self::unchecked_load_file(s, name)? };
if !sa.check_integrity() {
Err(Error::new(
ErrorKind::InvalidData,
"inconsistent suffix array",
))
} else {
Ok(sa)
}
}
#[cfg(feature = "pack")]
pub unsafe fn unchecked_load_bytes(
s: &'a [u8],
bytes: &[u8],
) -> Result<Self> {
let psa = PackedSuffixArray::load_bytes(bytes)?;
let sa = psa.into_sa();
Ok(Self::unchecked_from_parts(s, sa))
}
#[cfg(feature = "pack")]
pub fn load_bytes(s: &'a [u8], bytes: &[u8]) -> Result<Self> {
use std::io::{Error, ErrorKind};
let sa = unsafe { Self::unchecked_load_bytes(s, bytes)? };
if !sa.check_integrity() {
Err(Error::new(
ErrorKind::InvalidData,
"inconsistent suffix array",
))
} else {
Ok(sa)
}
}
}
impl<'a> From<SuffixArray<'a>> for Vec<u32> {
fn from(sa: SuffixArray<'a>) -> Vec<u32> {
sa.sa
}
}
impl<'a> AsRef<[u8]> for SuffixArray<'a> {
fn as_ref(&self) -> &[u8] {
self.s
}
}