#[cfg(test)]
use crate::json_parser::deduplication::kmp_matcher::KMPMatcher;
use crate::json_parser::deduplication::rolling_hash::RollingHashWindow;
/// Outcome of the cheap length/equality checks performed before overlap scoring.
///
/// Produced by `check_overlap_preconditions` and consumed by the
/// threshold-aware snapshot helpers to decide whether scoring is needed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum OverlapPrecondition {
    /// `delta` and `accumulated` are the same string.
    ExactMatch,
    /// `delta` differs from `accumulated`, is at least as long as the
    /// short-chunk threshold, and is strictly longer than `accumulated`.
    NotShortNotIdentical,
    /// `delta` is below the short-chunk threshold or not longer than
    /// `accumulated`; no overlap analysis should be attempted.
    Reject,
}
/// Classifies a `delta`/`accumulated` pair before any overlap scoring is done.
///
/// - Identical strings yield `ExactMatch`.
/// - Otherwise, a `delta` whose byte length is at least
///   `thresholds.short_chunk_threshold` and strictly greater than the byte
///   length of `accumulated` yields `NotShortNotIdentical`.
/// - Everything else yields `Reject`.
fn check_overlap_preconditions(
    delta: &str,
    accumulated: &str,
    thresholds: &OverlapThresholds,
) -> OverlapPrecondition {
    let delta_len = delta.len();
    if delta == accumulated {
        OverlapPrecondition::ExactMatch
    } else if delta_len >= thresholds.short_chunk_threshold && delta_len > accumulated.len() {
        // Long enough to be worth scoring, and long enough to contain new content.
        OverlapPrecondition::NotShortNotIdentical
    } else {
        OverlapPrecondition::Reject
    }
}
/// Returns the part of `delta` that follows an overlapping prefix.
///
/// Despite its name, `char_count` is applied as a *byte* offset: it is
/// compared against `delta.len()` and used as the start of the slice.
///
/// Returns `None` when:
/// - `char_count` is zero (no overlap was found),
/// - `delta` is not strictly longer than the overlap (nothing new remains), or
/// - `char_count` does not fall on a UTF-8 character boundary of `delta`.
fn extract_suffix_by_overlap(delta: &str, char_count: usize) -> Option<&str> {
    if char_count == 0 || delta.len() <= char_count {
        return None;
    }
    // `str::get` yields `None` for an offset inside a multi-byte character,
    // where direct indexing (`&delta[char_count..]`) would panic.
    delta.get(char_count..)
}
/// Helper for detecting and trimming repeated ("snapshot") content in deltas.
///
/// The snapshot-detection functions on this type are associated functions that
/// take `delta` and `accumulated` as arguments; only `add_accumulated` and
/// `clear` touch the state stored here.
#[derive(Debug, Default, Clone)]
pub struct DeltaDeduplicator {
/// Rolling hash window fed by `add_accumulated` and reset by `clear`.
hash_window: RollingHashWindow,
}
impl DeltaDeduplicator {
    /// Creates a deduplicator in its default state. Only compiled for tests.
    #[cfg(test)]
    #[must_use]
    pub fn new() -> Self {
        Self::default()
    }

    /// Feeds `content` into the internal hash window. Only compiled for tests.
    #[cfg(test)]
    pub fn add_accumulated(&mut self, content: &str) {
        self.hash_window.add_content(content);
    }

    /// Extracts the portion of `delta` that follows `accumulated`, if `delta`
    /// begins with `accumulated`. Only compiled for tests.
    ///
    /// # Returns
    /// - `Some("")` when `delta` equals `accumulated`.
    /// - `Some(suffix)` when `delta` is longer than `accumulated`, the hash of
    ///   its leading `accumulated.len()` bytes matches the hash of
    ///   `accumulated`, and the KMP search finds `accumulated` at position 0.
    /// - `None` otherwise, including when `accumulated.len()` does not fall on
    ///   a UTF-8 character boundary of `delta`.
    #[cfg(test)]
    #[must_use]
    pub fn extract_new_content<'a>(delta: &'a str, accumulated: &str) -> Option<&'a str> {
        if delta == accumulated {
            return Some("");
        }
        let overlap_len = accumulated.len();
        if delta.len() <= overlap_len {
            return None;
        }

        // Checked slicing: if the cut lands inside a multi-byte character,
        // `delta` cannot start with `accumulated`, and plain indexing would
        // panic. `?` turns that case into `None`.
        let delta_prefix = delta.get(..overlap_len)?;
        let new_content = delta.get(overlap_len..)?;

        // Cheap hash comparison first; bail out before running KMP on a mismatch.
        let accumulated_hash = RollingHashWindow::compute_hash(accumulated);
        let delta_prefix_hash = RollingHashWindow::compute_hash(delta_prefix);
        if accumulated_hash != delta_prefix_hash {
            return None;
        }

        // Confirm with an actual search that `accumulated` sits at the very start.
        let kmp = KMPMatcher::new(accumulated);
        match kmp.find(delta) {
            Some(0) => Some(new_content),
            _ => None,
        }
    }

    /// Reports whether `delta` looks like a snapshot that restates `accumulated`.
    ///
    /// Returns `true` when the strings are equal, or when `delta` is longer
    /// than `accumulated` and the hash of its leading `accumulated.len()`
    /// bytes equals the hash of `accumulated`. Only hashes are compared in the
    /// second case; no byte-for-byte check is performed here.
    ///
    /// Returns `false` when `delta` is not longer than `accumulated`, or when
    /// `accumulated.len()` is not a UTF-8 character boundary of `delta`.
    #[must_use]
    pub fn is_likely_snapshot(delta: &str, accumulated: &str) -> bool {
        if delta == accumulated {
            return true;
        }
        if delta.len() <= accumulated.len() {
            return false;
        }
        // Checked slicing avoids a panic when the cut would split a multi-byte
        // character; in that case `delta` cannot begin with `accumulated`.
        match delta.get(..accumulated.len()) {
            Some(delta_prefix) => {
                RollingHashWindow::compute_hash(accumulated)
                    == RollingHashWindow::compute_hash(delta_prefix)
            }
            None => false,
        }
    }

    /// Threshold-aware variant of [`Self::is_likely_snapshot`].
    ///
    /// Uses the thresholds returned by `get_overlap_thresholds()`. An exact
    /// match is always a snapshot; a rejected precondition never is; otherwise
    /// the pair must both pass [`Self::is_likely_snapshot`] and have an
    /// overlap score (from `score_overlap`) that meets the thresholds.
    #[must_use]
    pub fn is_likely_snapshot_with_thresholds(delta: &str, accumulated: &str) -> bool {
        let thresholds = get_overlap_thresholds();
        match check_overlap_preconditions(delta, accumulated, &thresholds) {
            OverlapPrecondition::ExactMatch => true,
            OverlapPrecondition::NotShortNotIdentical => {
                // The hash check runs first so scoring is skipped on a mismatch.
                Self::is_likely_snapshot(delta, accumulated)
                    && score_overlap(delta, accumulated).meets_thresholds(&thresholds)
            }
            OverlapPrecondition::Reject => false,
        }
    }

    /// Threshold-aware extraction of the new content carried by `delta`.
    ///
    /// Uses the thresholds returned by `get_overlap_thresholds()`.
    ///
    /// # Returns
    /// - `Some("")` when `delta` equals `accumulated`.
    /// - The result of `extract_suffix_by_overlap(delta, score.char_count)`
    ///   when the preconditions pass and the overlap score meets the thresholds.
    /// - `None` when the preconditions reject the pair or the score falls short.
    #[must_use]
    pub fn extract_new_content_with_thresholds<'a>(
        delta: &'a str,
        accumulated: &str,
    ) -> Option<&'a str> {
        let thresholds = get_overlap_thresholds();
        match check_overlap_preconditions(delta, accumulated, &thresholds) {
            OverlapPrecondition::ExactMatch => Some(""),
            OverlapPrecondition::NotShortNotIdentical => {
                let score = score_overlap(delta, accumulated);
                if score.meets_thresholds(&thresholds) {
                    extract_suffix_by_overlap(delta, score.char_count)
                } else {
                    None
                }
            }
            OverlapPrecondition::Reject => None,
        }
    }

    /// Clears the internal hash window.
    pub fn clear(&mut self) {
        self.hash_window.clear();
    }
}