#![forbid(unsafe_code)]
use std::io::{Cursor, Result, Write};
use std::ops::Range;
use bzip2::write::BzEncoder;
use bzip2::Compression;
use rayon::prelude::*;
use suffix_array::SuffixArray;
pub use suffix_array::MAX_LENGTH;
use super::utils::*;
pub const SMALL_MATCH: usize = 12;
const MISMATCH_COUNT: usize = 8;
const LONG_SUFFIX: usize = 256;
pub const BUFFER_SIZE: usize = 4096;
pub const COMPRESSION_LEVEL: u32 = 6;
const MIN_CHUNK: usize = 256 * 1024;
const DEFAULT_CHUNK: usize = 512 * 1024;
const BSDIFF4_MAGIC: &[u8] = b"BSDIFF40";
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum ParallelScheme {
Never,
Auto,
ChunkSize(usize),
NumJobs(usize),
}
pub struct Bsdiff<'s, 't> {
source: &'s [u8],
target: &'t [u8],
parallel_scheme: ParallelScheme,
small_match: usize,
mismatch_count: usize,
long_suffix: usize,
buffer_size: usize,
compression_level: Compression,
}
impl<'s, 't> Bsdiff<'s, 't> {
pub fn new(source: &'s [u8], target: &'t [u8]) -> Self {
if source.len() > MAX_LENGTH {
panic!("source data is too large to be indexed");
}
Bsdiff {
source,
target,
parallel_scheme: ParallelScheme::Auto,
small_match: SMALL_MATCH,
mismatch_count: MISMATCH_COUNT,
long_suffix: LONG_SUFFIX,
compression_level: Compression::new(COMPRESSION_LEVEL),
buffer_size: BUFFER_SIZE,
}
}
pub fn source(mut self, source: &'s [u8]) -> Self {
self.source = source;
self
}
pub fn target(mut self, target: &'t [u8]) -> Self {
self.target = target;
self
}
pub fn parallel_scheme(mut self, mut parallel_scheme: ParallelScheme) -> Self {
use ParallelScheme::*;
if parallel_scheme == ChunkSize(0) || parallel_scheme == NumJobs(0) {
parallel_scheme = Auto;
}
self.parallel_scheme = parallel_scheme;
self
}
pub fn small_match(mut self, small_match: usize) -> Self {
self.small_match = small_match;
self
}
#[allow(unused)]
fn mismatch_count(mut self, mut mismatch_count: usize) -> Self {
if mismatch_count < 1 {
mismatch_count = 1;
}
self.mismatch_count = mismatch_count;
self
}
#[allow(unused)]
fn long_suffix(mut self, mut long_suffix: usize) -> Self {
if long_suffix < 64 {
long_suffix = 64;
}
self.long_suffix = long_suffix;
self
}
pub fn compression_level(mut self, compression_level: u32) -> Self {
self.compression_level = Compression::new(u32::min(u32::max(compression_level, 0), 9));
self
}
pub fn buffer_size(mut self, mut buffer_size: usize) -> Self {
if buffer_size < 128 {
buffer_size = 128;
}
self.buffer_size = buffer_size;
self
}
pub fn compare<P: Write>(&self, patch: P) -> Result<u64> {
use ParallelScheme::*;
let mut chunk = match self.parallel_scheme {
Never => self.target.len(),
ChunkSize(chunk) => chunk,
NumJobs(jobs) => div_ceil(self.target.len(), jobs),
Auto => DEFAULT_CHUNK,
};
chunk = Ord::max(chunk, MIN_CHUNK);
let mut suffix_array = SuffixArray::new(self.source);
suffix_array.enable_buckets();
if chunk >= self.target.len() {
let diff = SaDiff::new(
self.source,
self.target,
&suffix_array,
self.small_match,
self.mismatch_count,
self.long_suffix,
);
pack(
self.source,
self.target,
diff,
patch,
self.compression_level,
self.buffer_size,
)
} else {
let par_diff = ParSaDiff::new(
self.source,
self.target,
&suffix_array,
chunk,
self.small_match,
self.mismatch_count,
self.long_suffix,
);
let ctrls = par_diff.compute();
pack(
self.source,
self.target,
ctrls.into_iter(),
patch,
self.compression_level,
self.buffer_size,
)
}
}
}
#[inline]
fn div_ceil(x: usize, y: usize) -> usize {
if x % y == 0 {
x / y
} else {
x / y + 1
}
}
fn pack<D, P>(source: &[u8], target: &[u8], diff: D, mut patch: P, level: Compression, bsize: usize) -> Result<u64>
where
D: Iterator<Item = Control>,
P: Write,
{
let mut bz_ctrls = Vec::new();
let mut bz_delta = Vec::new();
let mut bz_extra = Vec::new();
{
let mut ctrls = BzEncoder::new(Cursor::new(&mut bz_ctrls), level);
let mut delta = BzEncoder::new(Cursor::new(&mut bz_delta), level);
let mut extra = BzEncoder::new(Cursor::new(&mut bz_extra), level);
let mut spos = 0;
let mut tpos = 0;
let mut cbuf = [0; 24];
let mut dat = Vec::with_capacity(bsize);
for ctrl in diff {
encode_int(ctrl.add as i64, &mut cbuf[0..8]);
encode_int(ctrl.copy as i64, &mut cbuf[8..16]);
encode_int(ctrl.seek, &mut cbuf[16..24]);
ctrls.write_all(&cbuf[..])?;
if ctrl.add > 0 {
let mut n = ctrl.add;
while n > 0 {
let k = Ord::min(n, bsize as u64) as usize;
dat.extend(
Iterator::zip(source[spos as usize..].iter(), target[tpos as usize..].iter())
.map(|(x, y)| y.wrapping_sub(*x))
.take(k),
);
delta.write_all(&dat[..])?;
dat.clear();
spos += k as u64;
tpos += k as u64;
n -= k as u64;
}
}
if ctrl.copy > 0 {
extra.write_all(&target[tpos as usize..(tpos + ctrl.copy) as usize])?;
tpos += ctrl.copy;
}
spos = spos.wrapping_add(ctrl.seek as u64);
}
ctrls.flush()?;
delta.flush()?;
extra.flush()?;
}
let mut header = [0; 32];
let csize = bz_ctrls.len() as u64;
let dsize = bz_delta.len() as u64;
let esize = bz_extra.len() as u64;
let tsize = target.len() as u64;
header[0..8].copy_from_slice(BSDIFF4_MAGIC);
encode_int(csize as i64, &mut header[8..16]);
encode_int(dsize as i64, &mut header[16..24]);
encode_int(tsize as i64, &mut header[24..32]);
patch.write_all(&header[..])?;
patch.write_all(&bz_ctrls[..])?;
patch.write_all(&bz_delta[..])?;
patch.write_all(&bz_extra[..])?;
patch.flush()?;
Ok(32 + csize + dsize + esize)
}
struct ParSaDiff<'s, 't> {
jobs: Vec<SaDiff<'s, 't>>,
}
impl<'s, 't> ParSaDiff<'s, 't> {
pub fn new(
s: &'s [u8],
t: &'t [u8],
sa: &'s SuffixArray<'s>,
chunk: usize,
small_match: usize,
mismatch_count: usize,
long_suffix: usize,
) -> Self {
let jobs = t
.chunks(chunk)
.map(|ti| SaDiff::new(s, ti, sa, small_match, mismatch_count, long_suffix))
.collect();
ParSaDiff { jobs }
}
pub fn compute(mut self) -> Vec<Control> {
self.jobs
.par_iter_mut()
.map(|diff| {
let mut pos = 0u64;
let mut ctrls = Vec::new();
for ctl in diff {
pos += ctl.add;
pos = pos.wrapping_add(ctl.seek as u64);
ctrls.push(ctl);
}
debug_assert!(pos <= i64::MAX as u64);
ctrls.push(Control {
add: 0,
copy: 0,
seek: -(pos as i64),
});
ctrls
})
.flatten()
.collect()
}
}
struct SaDiff<'s, 't> {
s: &'s [u8],
t: &'t [u8],
sa: &'s SuffixArray<'s>,
small_match: usize,
mismatch_count: usize,
long_suffix: usize,
i0: usize,
j0: usize,
n0: usize,
b0: usize,
}
impl<'s, 't> SaDiff<'s, 't> {
pub fn new(
s: &'s [u8],
t: &'t [u8],
sa: &'s SuffixArray<'s>,
small_match: usize,
mismatch_count: usize,
long_suffix: usize,
) -> Self {
SaDiff {
s,
t,
sa,
small_match,
mismatch_count,
long_suffix,
i0: 0,
j0: 0,
n0: 0,
b0: 0,
}
}
#[inline]
fn previous_state(&self) -> (usize, usize, usize, usize) {
(self.i0, self.j0, self.n0, self.b0)
}
#[inline]
fn update_state(&mut self, i0: usize, j0: usize, n0: usize, b0: usize) {
self.i0 = i0;
self.j0 = j0;
self.n0 = n0;
self.b0 = b0;
}
#[inline]
fn search_next(&mut self) -> Option<(usize, usize, usize)> {
if self.j0 == self.t.len() && self.b0 == 0 {
return None;
}
let mut j = self.j0 + self.n0;
let mut k = j;
let mut m = 0;
while j < self.t.len().saturating_sub(self.small_match) {
let (i, n) = range_to_extent(self.sa.search_lcp(&self.t[j..]));
while k < j + n {
let i = self.i0.saturating_add(k - self.j0);
if i < self.s.len() && self.s[i] == self.t[k] {
m += 1;
}
k += 1;
}
if n == 0 {
j += 1;
m = 0;
} else if m == n || n <= self.small_match {
j += n;
m = 0;
} else if n <= m + self.mismatch_count {
let next = if n <= self.long_suffix {
j + 1
} else {
let mut x = 0;
let mut y = n;
while x < y {
let z = x + (y - x) / 2;
let (iz, nz) = range_to_extent(self.sa.search_lcp(&self.t[j + z..]));
if i + n == iz + nz && j + n == j + z + nz {
x = z + 1;
} else {
y = z;
}
}
j + Ord::max(x, 1)
};
let mut i = self.i0.saturating_add(j - self.j0);
while j < next {
if i < self.s.len() && self.s[i] == self.t[j] {
m -= 1;
}
i += 1;
j += 1;
}
} else {
return Some((i, j, n));
}
}
Some((self.s.len(), self.t.len(), 0))
}
#[inline]
fn shrink_gap(&self, i: usize, j: usize) -> (usize, usize) {
let gap = &self.t[self.j0 + self.n0..j];
let suffix = &self.s[self.i0 + self.n0..];
let prefix = &self.s[..i];
let mut a0 = scan_similar(gap.iter(), suffix.iter());
let mut b = scan_similar(gap.iter().rev(), prefix.iter().rev());
if a0 + b > gap.len() {
let n = a0 + b - gap.len();
let xs = gap[gap.len() - b..a0].iter();
let ys = suffix[gap.len() - b..a0].iter();
let zs = prefix[prefix.len() - b..prefix.len() - b + n].iter();
let i = scan_divide(xs, ys, zs);
a0 -= n - i;
b -= i;
}
(a0, b)
}
}
impl<'s, 't> Iterator for SaDiff<'s, 't> {
type Item = Control;
fn next(&mut self) -> Option<Self::Item> {
if let Some((i, j, n)) = self.search_next() {
let (i0, j0, n0, b0) = self.previous_state();
let (a0, b) = self.shrink_gap(i, j);
let add = (b0 + n0 + a0) as u64;
let copy = ((j - b) - (j0 + n0 + a0)) as u64;
let seek = (i - b).wrapping_sub(i0 + n0 + a0) as isize as i64;
self.update_state(i, j, n, b);
Some(Control { add, copy, seek })
} else {
None
}
}
}
#[inline]
fn range_to_extent(range: Range<usize>) -> (usize, usize) {
let Range { start, end } = range;
(start, end.saturating_sub(start))
}
#[inline]
fn scan_similar<T: Eq, I: Iterator<Item = T>>(xs: I, ys: I) -> usize {
let mut i = 0;
let mut matched = 0;
let mut max_score = 0;
for (n, eq) in (1..).zip(xs.zip(ys).map(|(x, y)| x == y)) {
matched += usize::from(eq);
let mismatched = n - matched;
let score = matched.wrapping_sub(mismatched) as isize;
if score > max_score {
i = n;
max_score = score;
}
}
i
}
#[inline]
fn scan_divide<T: Eq, I: Iterator<Item = T>>(xs: I, ys: I, zs: I) -> usize {
let mut i = 0;
let mut y_matched = 0;
let mut z_matched = 0;
let mut max_score = 0;
let eqs = xs.zip(ys).zip(zs).map(|((x, y), z)| (x == y, x == z));
for (n, (y_eq, z_eq)) in (1..).zip(eqs) {
y_matched += usize::from(y_eq);
z_matched += usize::from(z_eq);
let score = y_matched.wrapping_sub(z_matched) as isize;
if score > max_score {
i = n;
max_score = score;
}
}
i
}