crush_gpu/vectorize.rs
1//! Vectorized (SIMD) string matching for improved compression ratios
2//!
3//! Provides an enhanced LZ77 matcher that uses longer hash chains and
4//! wider match scanning for text-heavy data. Activation is gated by a
5//! heuristic that checks string density (printable ASCII ratio > 70%)
6//! and entropy (< 6.0 bits/byte).
7
8use crate::entropy::calculate_entropy;
9
/// Lowest fraction of printable ASCII bytes at which vectorized matching
/// is activated.
const MIN_STRING_DENSITY: f64 = 0.7;

/// Entropy ceiling, in bits per byte, for activating vectorized matching.
const MAX_ENTROPY_FOR_VECTORIZED: f64 = 6.0;

/// Number of bytes sampled by the activation heuristic (1 MiB).
const SAMPLE_SIZE: usize = 1 << 20;
19
20/// Decide whether vectorized matching should be used for the given data.
21///
22/// Returns `true` when the data has high printable ASCII density (>70%)
23/// and low-enough entropy (<6.0 bits/byte), indicating text-heavy content
24/// that benefits from deeper match searching.
25#[must_use]
26pub fn should_use_vectorized(data: &[u8]) -> bool {
27 if data.is_empty() {
28 return false;
29 }
30
31 let sample = if data.len() > SAMPLE_SIZE {
32 &data[..SAMPLE_SIZE]
33 } else {
34 data
35 };
36
37 let density = string_density(sample);
38 if density < MIN_STRING_DENSITY {
39 return false;
40 }
41
42 let entropy = calculate_entropy(sample);
43 entropy < MAX_ENTROPY_FOR_VECTORIZED
44}
45
/// Fraction of the bytes in `data` that look like text.
///
/// A byte counts as text when it is printable ASCII (`0x20..=0x7E`) or one
/// of the whitespace controls `\t`, `\n`, `\r`. The result lies in
/// `0.0..=1.0`; an empty slice yields `0.0`.
// The `usize -> f64` casts are exact for any count below 2^53.
#[allow(clippy::cast_precision_loss)]
fn string_density(data: &[u8]) -> f64 {
    let total = data.len();
    if total == 0 {
        // Avoid a 0/0 division.
        return 0.0;
    }

    let mut text_bytes = 0_usize;
    for &byte in data {
        if matches!(byte, b'\t' | b'\n' | b'\r' | 0x20..=0x7E) {
            text_bytes += 1;
        }
    }

    text_bytes as f64 / total as f64
}