use std::collections::{BinaryHeap, HashMap, HashSet};
use super::{
dictionary_has_meaningful_savings, is_dictionary_compressed_output, DICTIONARY_MAX_DICT_SIZE,
DICTIONARY_MAX_INPUT_BYTES, DICTIONARY_MIN_ENTRY_SAVINGS_BYTES, DICTIONARY_MIN_OCCURRENCES,
LZW_DICTIONARY_HEADER as DICTIONARY_HEADER,
};
const MIN_SUBSTRING_LEN: usize = 15;
const MAX_SUBSTRING_LEN: usize = 120;
const MAX_REPETITION_SAMPLES: usize = 500;
const MIN_REPETITION_RATIO_PERCENT: usize = 5;
const ROLLING_HASH_BASE: u64 = 257;
const SELECTED_SUBSTRING_LENGTHS: &[usize] = &[15, 24, 32, 48, 64, 96, 120];
#[derive(Debug, Clone)]
struct Candidate<'a> {
original: &'a str,
count: usize,
savings: isize,
}
#[derive(Debug, Clone)]
struct SelectedEntry<'a> {
marker: String,
original: &'a str,
positions: Vec<usize>,
}
pub(super) fn compress_lzw_dictionary(output: &str) -> Option<String> {
if output.len() > DICTIONARY_MAX_INPUT_BYTES || is_dictionary_compressed_output(output) {
return None;
}
let offsets = char_offsets(output);
let char_count = offsets.len().saturating_sub(1);
if char_count < MIN_SUBSTRING_LEN || !has_sufficient_repetition(output, &offsets) {
return None;
}
let nonce = collision_free_nonce(output)?;
let candidates = find_repeated_substrings(output, &nonce);
if candidates.is_empty() {
return None;
}
let selected = select_non_overlapping(output, &candidates, &nonce);
if selected.is_empty() {
return None;
}
let compressed = apply_replacements(output, &selected);
let dictionary = build_dictionary(&selected)?;
let result = format!("{DICTIONARY_HEADER}\n{dictionary}\n{compressed}");
let savings = output.len().checked_sub(result.len())?;
if !dictionary_has_meaningful_savings(output.len(), savings) {
return None;
}
Some(result)
}
fn char_offsets(value: &str) -> Vec<usize> {
let mut offsets = value.char_indices().map(|(idx, _)| idx).collect::<Vec<_>>();
offsets.push(value.len());
offsets
}
fn has_sufficient_repetition<'a>(text: &'a str, offsets: &[usize]) -> bool {
let char_count = offsets.len().saturating_sub(1);
let max_pos = char_count.saturating_sub(MIN_SUBSTRING_LEN);
if max_pos == 0 {
return false;
}
let step = std::cmp::max(1, max_pos / MAX_REPETITION_SAMPLES);
let mut seen: HashSet<&'a str> = HashSet::new();
let mut repeated = 0usize;
let mut total = 0usize;
for start in (0..=max_pos).step_by(step) {
let Some(substring) = slice_chars(text, offsets, start, MIN_SUBSTRING_LEN) else {
continue;
};
if !eligible_substring(substring) {
continue;
}
total += 1;
if !seen.insert(substring) {
repeated += 1;
}
}
total > 0 && repeated.saturating_mul(100) / total >= MIN_REPETITION_RATIO_PERCENT
}
fn find_repeated_substrings<'a>(text: &'a str, nonce: &str) -> Vec<Candidate<'a>> {
let mut counts: HashMap<&'a str, usize> = HashMap::new();
for len in SELECTED_SUBSTRING_LENGTHS {
if *len < MIN_SUBSTRING_LEN || *len > MAX_SUBSTRING_LEN || *len > text.len() {
continue;
}
let mut hashed_positions: HashMap<u64, Vec<usize>> = HashMap::new();
collect_rolling_hash_positions(text.as_bytes(), *len, &mut hashed_positions);
for positions in hashed_positions.into_values() {
if positions.len() < DICTIONARY_MIN_OCCURRENCES {
continue;
}
let mut exact: HashMap<&'a str, usize> = HashMap::new();
for start in positions {
let end = start + *len;
if !text.is_char_boundary(start) || !text.is_char_boundary(end) {
continue;
}
let substring = &text[start..end];
if !eligible_substring(substring) {
continue;
}
*exact.entry(substring).or_insert(0) += 1;
}
for (substring, count) in exact {
if count >= DICTIONARY_MIN_OCCURRENCES {
let entry = counts.entry(substring).or_insert(0);
*entry = (*entry).max(count);
}
}
}
}
let mut candidates = counts
.into_iter()
.filter_map(|(original, count)| {
if count < DICTIONARY_MIN_OCCURRENCES {
return None;
}
let marker = marker_for(nonce, 0);
let savings = estimated_savings(original, count, marker.len());
(savings > 0).then_some(Candidate {
original,
count,
savings,
})
})
.collect::<Vec<_>>();
candidates.sort_by(|a, b| {
b.savings
.cmp(&a.savings)
.then_with(|| b.original.len().cmp(&a.original.len()))
.then_with(|| b.count.cmp(&a.count))
.then_with(|| a.original.cmp(b.original))
});
candidates
}
fn collect_rolling_hash_positions(
bytes: &[u8],
window_len: usize,
positions: &mut HashMap<u64, Vec<usize>>,
) {
if window_len == 0 || window_len > bytes.len() {
return;
}
let mut hash = 0u64;
let mut power = 1u64;
for (idx, byte) in bytes.iter().take(window_len).enumerate() {
hash = hash
.wrapping_mul(ROLLING_HASH_BASE)
.wrapping_add(*byte as u64);
if idx + 1 < window_len {
power = power.wrapping_mul(ROLLING_HASH_BASE);
}
}
positions.entry(hash).or_default().push(0);
for start in 1..=bytes.len() - window_len {
let outgoing = (bytes[start - 1] as u64).wrapping_mul(power);
hash = hash
.wrapping_sub(outgoing)
.wrapping_mul(ROLLING_HASH_BASE)
.wrapping_add(bytes[start + window_len - 1] as u64);
positions.entry(hash).or_default().push(start);
}
}
fn select_non_overlapping<'a>(
text: &'a str,
candidates: &[Candidate<'a>],
nonce: &str,
) -> Vec<SelectedEntry<'a>> {
let mut heap = BinaryHeap::<(isize, usize)>::with_capacity(candidates.len());
for (idx, c) in candidates.iter().enumerate() {
heap.push((c.savings, idx));
}
let mut selected = Vec::new();
let mut used = vec![false; text.len()];
let mut reeval_count = 0usize;
const MAX_REEVALS: usize = 256;
while let Some((estimate, idx)) = heap.pop() {
if selected.len() >= DICTIONARY_MAX_DICT_SIZE {
break;
}
let candidate = &candidates[idx];
let marker = marker_for(nonce, selected.len() + 1);
if text.contains(&marker) {
continue;
}
let positions = non_overlapping_positions(text, candidate.original, &used);
if positions.len() < DICTIONARY_MIN_OCCURRENCES {
continue;
}
let realized = estimated_savings(candidate.original, positions.len(), marker.len());
if realized < DICTIONARY_MIN_ENTRY_SAVINGS_BYTES as isize {
continue;
}
if realized < estimate {
let next_best = heap.peek().map(|&(s, _)| s).unwrap_or(isize::MIN);
if realized < next_best && reeval_count < MAX_REEVALS {
heap.push((realized, idx));
reeval_count += 1;
continue;
}
}
for pos in &positions {
mark_range(&mut used, *pos, candidate.original.len());
}
selected.push(SelectedEntry {
marker,
original: candidate.original,
positions,
});
}
selected
}
fn non_overlapping_positions(text: &str, needle: &str, used: &[bool]) -> Vec<usize> {
let mut positions = Vec::new();
let mut search_start = 0usize;
while search_start < text.len() {
let Some(relative) = text[search_start..].find(needle) else {
break;
};
let pos = search_start + relative;
if !range_overlaps(used, pos, needle.len()) {
positions.push(pos);
search_start = pos + needle.len();
} else {
search_start = advance_one_char(text, pos);
}
}
positions
}
fn apply_replacements(text: &str, selected: &[SelectedEntry<'_>]) -> String {
let mut replacements = selected
.iter()
.flat_map(|entry| {
entry.positions.iter().map(|pos| Replacement {
pos: *pos,
len: entry.original.len(),
marker: entry.marker.as_str(),
})
})
.collect::<Vec<_>>();
replacements.sort_by_key(|replacement| std::cmp::Reverse(replacement.pos));
let mut compressed = text.to_string();
for replacement in replacements {
compressed.replace_range(
replacement.pos..replacement.pos + replacement.len,
replacement.marker,
);
}
compressed
}
#[derive(Debug)]
struct Replacement<'a> {
pos: usize,
len: usize,
marker: &'a str,
}
fn build_dictionary(selected: &[SelectedEntry<'_>]) -> Option<String> {
let mut lines = Vec::with_capacity(selected.len() + 1);
for entry in selected {
let encoded = serde_json::to_string(entry.original).ok()?;
lines.push(format!("{} = {encoded}", entry.marker));
}
lines.push(String::new());
Some(lines.join("\n"))
}
fn slice_chars<'a>(text: &'a str, offsets: &[usize], start: usize, len: usize) -> Option<&'a str> {
let end = start.checked_add(len)?;
Some(&text[*offsets.get(start)?..*offsets.get(end)?])
}
fn eligible_substring(value: &str) -> bool {
if value.contains('\n') || value.contains('\r') {
return false;
}
!value
.chars()
.all(|ch| ch.is_whitespace() || matches!(ch, '-' | '_' | '='))
}
fn estimated_savings(original: &str, count: usize, marker_len: usize) -> isize {
let original_cost = count.saturating_mul(original.len());
let replacement_cost = count
.saturating_mul(marker_len)
.saturating_add(dictionary_entry_len(original, marker_len));
original_cost as isize - replacement_cost as isize
}
fn dictionary_entry_len(original: &str, marker_len: usize) -> usize {
let encoded_len = serde_json::to_string(original)
.map(|encoded| encoded.len())
.unwrap_or(original.len() + 2);
marker_len + " = ".len() + encoded_len + "\n".len()
}
fn marker_for(nonce: &str, index: usize) -> String {
format!("<<F{nonce}:{index}>>")
}
fn collision_free_nonce(text: &str) -> Option<String> {
for nonce in 1..=999usize {
let prefix = format!("<<F{nonce}:");
if !text.contains(&prefix) {
return Some(nonce.to_string());
}
}
None
}
fn range_overlaps(used: &[bool], pos: usize, len: usize) -> bool {
used.get(pos..pos + len)
.is_none_or(|range| range.iter().any(|used| *used))
}
fn mark_range(used: &mut [bool], pos: usize, len: usize) {
if let Some(range) = used.get_mut(pos..pos + len) {
for slot in range {
*slot = true;
}
}
}
fn advance_one_char(text: &str, pos: usize) -> usize {
let mut chars = text[pos..].char_indices();
let _ = chars.next();
chars.next().map(|(idx, _)| pos + idx).unwrap_or(text.len())
}
#[cfg(test)]
mod tests {
use super::*;
fn repeated_output() -> String {
(0..24)
.map(|_| "error: repeated dependency resolution failure in workspace crate alpha\n")
.collect::<String>()
}
fn decompress_lzw_dictionary(output: &str) -> String {
let Some(rest) = output.strip_prefix(DICTIONARY_HEADER) else {
return output.to_string();
};
let Some(rest) = rest.strip_prefix('\n') else {
return output.to_string();
};
let Some((dict, body)) = rest.split_once("\n\n") else {
return output.to_string();
};
let mut result = body.to_string();
for line in dict.lines().rev() {
let Some((marker, encoded)) = line.split_once(" = ") else {
continue;
};
let Ok(original) = serde_json::from_str::<String>(encoded) else {
continue;
};
result = result.replace(marker, &original);
}
result
}
#[test]
fn aggressive_lzw_compresses_repetitive_output() {
let raw = repeated_output();
let compressed = compress_lzw_dictionary(&raw).expect("compressible");
assert!(compressed.starts_with(DICTIONARY_HEADER));
assert!(compressed.len() < raw.len());
}
#[test]
fn aggressive_lzw_skips_non_repetitive_output() {
let raw = pseudo_random_ascii(2_000);
assert_eq!(compress_lzw_dictionary(&raw), None);
}
#[test]
fn aggressive_lzw_skips_oversized_output() {
let raw = "error: repeated dependency resolution failure "
.repeat(DICTIONARY_MAX_INPUT_BYTES / 10)
+ "tail";
assert_eq!(compress_lzw_dictionary(&raw), None);
}
#[test]
fn aggressive_lzw_handles_marker_collisions() {
let raw = format!("<<F1:1>>\n{}", repeated_output());
let compressed = compress_lzw_dictionary(&raw).expect("compressible");
assert!(compressed.contains("<<F2:"));
assert_eq!(decompress_lzw_dictionary(&compressed), raw);
}
#[test]
fn aggressive_lzw_round_trips_with_test_decompressor() {
let raw = repeated_output();
let compressed = compress_lzw_dictionary(&raw).expect("compressible");
assert_eq!(decompress_lzw_dictionary(&compressed), raw);
}
#[test]
fn aggressive_lzw_skips_already_compressed_output() {
let raw = format!("{DICTIONARY_HEADER}\n<<F1:1>> = \"value\"\n\nbody");
assert_eq!(compress_lzw_dictionary(&raw), None);
}
#[test]
fn aggressive_lzw_skips_repair_dictionary_output() {
let raw = format!(
"{}\n<<R1:1>> = \"value\"\n\nbody",
super::super::REPAIR_DICTIONARY_HEADER
);
assert_eq!(compress_lzw_dictionary(&raw), None);
}
fn pseudo_random_ascii(len: usize) -> String {
let mut seed = 0x1234_5678_9abc_def0u64;
let mut output = String::with_capacity(len);
for _ in 0..len {
seed = seed.wrapping_mul(6364136223846793005).wrapping_add(1);
let byte = b'a' + ((seed >> 32) % 26) as u8;
output.push(byte as char);
}
output
}
}