Skip to main content

crush_gpu/
vectorize.rs

//! Vectorized (SIMD) string matching for improved compression ratios
//!
//! Provides an enhanced LZ77 matcher that uses longer hash chains and
//! wider match scanning for text-heavy data.  Activation is gated by a
//! heuristic, evaluated on at most the first 1 MiB of input, that checks
//! string density (ratio of printable ASCII plus `\t`, `\n`, `\r` bytes
//! >= 70%) and entropy (< 6.0 bits/byte).
7
8use crate::entropy::calculate_entropy;
9
/// Lowest printable-ASCII fraction (on a 0.0 to 1.0 scale) at which
/// vectorized matching is still considered.
const MIN_STRING_DENSITY: f64 = 0.7;

/// Entropy ceiling in bits per byte; a sample must measure strictly below
/// this for vectorized matching to be enabled.
const MAX_ENTROPY_FOR_VECTORIZED: f64 = 6.0;

/// Number of leading bytes inspected by the activation heuristic (1 MiB).
const SAMPLE_SIZE: usize = 1024 * 1024;
19
20/// Decide whether vectorized matching should be used for the given data.
21///
22/// Returns `true` when the data has high printable ASCII density (>70%)
23/// and low-enough entropy (<6.0 bits/byte), indicating text-heavy content
24/// that benefits from deeper match searching.
25#[must_use]
26pub fn should_use_vectorized(data: &[u8]) -> bool {
27    if data.is_empty() {
28        return false;
29    }
30
31    let sample = if data.len() > SAMPLE_SIZE {
32        &data[..SAMPLE_SIZE]
33    } else {
34        data
35    };
36
37    let density = string_density(sample);
38    if density < MIN_STRING_DENSITY {
39        return false;
40    }
41
42    let entropy = calculate_entropy(sample);
43    entropy < MAX_ENTROPY_FOR_VECTORIZED
44}
45
/// Fraction of `data` made up of text-like bytes, in the range `0.0..=1.0`.
///
/// A byte counts as text-like when it is printable ASCII (`0x20..=0x7E`) or
/// one of the whitespace characters `\t`, `\n`, `\r`.  An empty slice
/// yields `0.0`.
// The byte counts are converted to `f64` only to form a ratio, so the
// precision loss the lint warns about is acceptable here.
#[allow(clippy::cast_precision_loss)]
fn string_density(data: &[u8]) -> f64 {
    let total = data.len();
    if total == 0 {
        return 0.0;
    }

    let mut text_like = 0_usize;
    for &byte in data {
        if matches!(byte, b'\t' | b'\n' | b'\r' | 0x20..=0x7E) {
            text_like += 1;
        }
    }

    text_like as f64 / total as f64
}