use std::io::Read;
/// Coarse classification of a sample of bytes.
///
/// Produced by `analyze_content_type` / `detect_content_type` and consumed by
/// `optimal_buffer_size`, which scales the buffer differently for each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ContentType {
/// Sample dominated by printable characters with few control characters.
Text,
/// Fallback for everything that is neither text nor random-like; also used for empty input.
Binary,
/// Sample whose byte-value histogram is close to uniform (see `is_random_like`).
Random,
}
/// Which compression implementation a run should use, as picked by
/// `choose_compression_backend`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CompressionBackend {
/// Picked when the input is larger than 64 KiB and more than one thread is available.
Parallel,
/// Picked for inputs of at most 64 KiB, or when only one thread is available.
SingleThreaded,
}
/// Tuning parameters for one compression run, as computed by `OptimizationConfig::new`.
#[derive(Debug, Clone)]
pub struct OptimizationConfig {
/// Thread count selected by `optimal_thread_count`.
pub thread_count: usize,
/// Buffer size selected by `optimal_buffer_size` (presumably in bytes — TODO confirm).
pub buffer_size: usize,
/// Backend selected by `choose_compression_backend`.
pub backend: CompressionBackend,
/// Content classification the configuration was built for.
// Nothing in the visible code reads this field back, hence the lint allowance.
#[allow(dead_code)]
pub content_type: ContentType,
/// Result of `should_use_numa_pinning` for this run.
// Nothing in the visible code reads this field back, hence the lint allowance.
#[allow(dead_code)]
pub use_numa_pinning: bool,
/// Compression level exactly as passed to `OptimizationConfig::new`; it is not validated here.
pub compression_level: u8,
}
impl OptimizationConfig {
    /// Derives a complete configuration from the caller's request and the input's traits.
    ///
    /// * `requested_threads` - upper bound on threads the caller is willing to use.
    /// * `file_size` - size of the input, compared against byte-sized thresholds.
    /// * `compression_level` - stored as given and forwarded to the thread/backend heuristics.
    /// * `content_type` - classification of the input, used for buffer sizing.
    pub fn new(
        requested_threads: usize,
        file_size: u64,
        compression_level: u8,
        content_type: ContentType,
    ) -> Self {
        // Resolved first because both the backend choice and the NUMA decision depend on it.
        let threads = optimal_thread_count(requested_threads, file_size, compression_level);

        Self {
            thread_count: threads,
            buffer_size: optimal_buffer_size(file_size, content_type),
            backend: choose_compression_backend(
                compression_level,
                content_type,
                file_size,
                threads,
            ),
            content_type,
            use_numa_pinning: should_use_numa_pinning(threads, file_size),
            compression_level,
        }
    }
}
/// Classifies the content behind `reader` from a sample of up to its first 8192 bytes.
///
/// The sample is filled until 8192 bytes have been collected or the reader reports end of
/// input. A single `read` call is allowed to return fewer bytes than are available, so relying
/// on one call could classify a large input from just a handful of bytes.
///
/// Returns [`ContentType::Binary`] when the reader yields no data at all; otherwise the result
/// of `analyze_content_type` on the collected sample.
///
/// # Errors
///
/// Propagates any I/O error raised by `reader`, except `ErrorKind::Interrupted`, which is
/// retried.
pub fn detect_content_type<R: Read>(reader: &mut R) -> std::io::Result<ContentType> {
    const SAMPLE_SIZE: usize = 8192;

    let mut sample = Vec::with_capacity(SAMPLE_SIZE);
    // `take` caps how much is consumed from the reader; `read_to_end` loops over short reads
    // and retries interrupted ones. The usize -> u64 cast is lossless for this constant.
    reader
        .by_ref()
        .take(SAMPLE_SIZE as u64)
        .read_to_end(&mut sample)?;

    if sample.is_empty() {
        return Ok(ContentType::Binary);
    }
    Ok(analyze_content_type(&sample))
}
/// Classifies a byte sample as text, random-like, or generic binary.
///
/// * An empty sample is [`ContentType::Binary`].
/// * A sample accepted by `is_random_like` is [`ContentType::Random`].
/// * Otherwise the sample is [`ContentType::Text`] when more than 80% of its bytes look
///   textual and fewer than 10% are control characters, and [`ContentType::Binary`] if not.
pub fn analyze_content_type(sample: &[u8]) -> ContentType {
    if sample.is_empty() {
        return ContentType::Binary;
    }

    // The uniformity test runs first and is deliberately NOT gated on a low text ratio:
    // uniformly distributed bytes land in the printable ASCII range about 38% of the time, so
    // a "mostly non-text" requirement can never hold together with a near-uniform histogram
    // and would make the `Random` result unreachable.
    if is_random_like(sample) {
        return ContentType::Random;
    }

    let mut text_chars = 0usize;
    let mut control_chars = 0usize;
    for &byte in sample {
        match byte {
            // Printable ASCII plus tab, line feed and carriage return.
            0x20..=0x7E | 0x09 | 0x0A | 0x0D => text_chars += 1,
            // Remaining C0 control characters.
            0x00..=0x08 | 0x0B | 0x0C | 0x0E..=0x1F => control_chars += 1,
            // High bytes only count as text when they could belong to UTF-8.
            0x80..=0xFF if is_likely_utf8_byte(byte) => text_chars += 1,
            // DEL (0x7F) and the remaining high bytes count as neither.
            _ => {}
        }
    }

    let total = sample.len() as f64;
    let text_ratio = text_chars as f64 / total;
    let control_ratio = control_chars as f64 / total;

    if text_ratio > 0.8 && control_ratio < 0.1 {
        ContentType::Text
    } else {
        ContentType::Binary
    }
}
/// Reports whether a non-ASCII byte can occur inside well-formed UTF-8.
///
/// Accepts continuation bytes (`0x80..=0xBF`) and the lead bytes of two- to four-byte
/// sequences (`0xC2..=0xF4`). `0xC0`/`0xC1` would only start overlong encodings and
/// `0xF5..=0xFF` would encode values beyond U+10FFFF, so none of them ever appears in valid
/// UTF-8 and they are rejected.
fn is_likely_utf8_byte(byte: u8) -> bool {
    matches!(byte, 0x80..=0xBF | 0xC2..=0xF4)
}
/// Heuristic test for a near-uniform byte distribution.
///
/// Samples shorter than 256 bytes are never considered random-like. Longer samples must pass
/// all three checks on their byte-value histogram:
///
/// * the variance of the bucket counts is below 1.5 times the mean bucket count,
/// * more than 80% of the 256 possible byte values occur at least once,
/// * the most frequent byte value makes up less than 10% of the sample.
fn is_random_like(sample: &[u8]) -> bool {
    let len = sample.len();
    if len < 256 {
        return false;
    }

    let mut histogram = [0u32; 256];
    for &b in sample {
        histogram[usize::from(b)] += 1;
    }

    // One pass over the histogram gathers everything the three checks need.
    let expected = len as f64 / 256.0;
    let mut squared_error = 0.0_f64;
    let mut distinct = 0u32;
    let mut peak = 0u32;
    for &n in histogram.iter() {
        let delta = f64::from(n) - expected;
        squared_error += delta * delta;
        if n != 0 {
            distinct += 1;
        }
        if n > peak {
            peak = n;
        }
    }
    let variance = squared_error / 256.0;

    let evenly_spread = variance < expected * 1.5;
    let diverse = f64::from(distinct) / 256.0 > 0.8;
    let undominated = f64::from(peak) / (len as f64) < 0.1;

    evenly_spread && diverse && undominated
}
/// Picks the number of threads for a run; the result is always at least 1.
///
/// The count is `requested` capped at the CPU count reported by `num_cpus::get()`. For
/// compression levels 2 through 5 on inputs of at most 102 400 bytes, only half of that cap is
/// used; every other combination uses the full cap.
///
/// A `requested` value of 0 is treated as 1, so no level/size combination can produce a
/// zero-thread configuration.
fn optimal_thread_count(requested: usize, file_size: u64, compression_level: u8) -> usize {
    const SMALL_FILE_LIMIT: u64 = 102_400;

    // Clamp once up front so every branch below shares the "at least one thread" guarantee.
    let max_threads = requested.min(num_cpus::get()).max(1);

    let is_mid_level = matches!(compression_level, 2..=5);
    if is_mid_level && file_size <= SMALL_FILE_LIMIT {
        (max_threads / 2).max(1)
    } else {
        max_threads
    }
}
/// Chooses a buffer size from the input size and its content classification.
///
/// The baseline grows with `file_size` in four steps (32 KiB up to 100 KiB of input, 64 KiB up
/// to 1 MiB, 128 KiB up to 10 MiB, 512 KiB beyond that). Text keeps the baseline, binary
/// content doubles it and random-like content halves it.
fn optimal_buffer_size(file_size: u64, content_type: ContentType) -> usize {
    const KIB: usize = 1024;
    const MIB: u64 = 1024 * 1024;

    let baseline = if file_size <= 100 * 1024 {
        32 * KIB
    } else if file_size <= MIB {
        64 * KIB
    } else if file_size <= 10 * MIB {
        128 * KIB
    } else {
        512 * KIB
    };

    match content_type {
        ContentType::Random => baseline / 2,
        ContentType::Binary => baseline * 2,
        ContentType::Text => baseline,
    }
}
/// Selects the backend for a run.
///
/// The parallel backend is used only when the input is larger than 64 KiB and more than one
/// thread is available; everything else runs single-threaded. The compression level and
/// content type are accepted but do not currently influence the choice.
fn choose_compression_backend(
    _compression_level: u8,
    _content_type: ContentType,
    file_size: u64,
    thread_count: usize,
) -> CompressionBackend {
    let big_enough = file_size > 65_536;
    let has_spare_threads = thread_count > 1;

    if big_enough && has_spare_threads {
        CompressionBackend::Parallel
    } else {
        CompressionBackend::SingleThreaded
    }
}
/// Decides whether NUMA pinning should be enabled.
///
/// Requires at least four threads and an input of at least 10 MiB.
fn should_use_numa_pinning(thread_count: usize, file_size: u64) -> bool {
    const MIN_THREADS: usize = 4;
    const MIN_FILE_SIZE: u64 = 10 * 1024 * 1024;

    if thread_count < MIN_THREADS {
        return false;
    }
    file_size >= MIN_FILE_SIZE
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Plain ASCII prose is recognised as text; a run of NUL bytes is not.
    #[test]
    fn test_content_type_detection() {
        let prose: &[u8] = b"Hello, world! This is a text file with normal content.";
        assert_eq!(analyze_content_type(prose), ContentType::Text);

        let zeros = [0u8; 100];
        assert_eq!(analyze_content_type(&zeros), ContentType::Binary);
    }

    /// The chosen thread count stays within `1..=requested`.
    #[test]
    fn test_thread_count_respects_request() {
        let requested = 4;
        let chosen = optimal_thread_count(requested, 10_000_000, 6);
        assert!((1..=requested).contains(&chosen));
    }

    /// A much larger input never gets a smaller buffer than a tiny one.
    #[test]
    fn test_buffer_sizing_scales_with_file_size() {
        let for_tiny_input = optimal_buffer_size(1024, ContentType::Text);
        let for_huge_input = optimal_buffer_size(100_000_000, ContentType::Text);
        assert!(for_huge_input >= for_tiny_input);
    }
}