#![allow(clippy::expect_used, clippy::panic, clippy::unwrap_used)]
use flashsieve::{BlockedNgramBloom, NgramBloom, NgramFilter};
#[test]
fn bloom_size_for_100k_patterns() {
    // Build 100K deterministic patterns of printable ASCII (0x20..0x7F),
    // with lengths cycling through 8..=31.
    let mut patterns: Vec<Vec<u8>> = Vec::with_capacity(100_000);
    for i in 0..100_000usize {
        let len = 8 + (i % 24);
        let pattern: Vec<u8> = (0..len)
            .map(|j| 0x20 + ((i * 31 + j * 17) % 95) as u8)
            .collect();
        patterns.push(pattern);
    }
    let pattern_refs: Vec<&[u8]> = patterns.iter().map(Vec::as_slice).collect();
    // Built only to prove construction completes; the size analysis below is
    // driven by the estimate, not the filter itself.
    let _filter = NgramFilter::from_patterns(&pattern_refs);
    // Rough expected count of distinct n-grams across the pattern union.
    let unique_ngrams_estimate = 5000usize;
    println!(
        "100K patterns produce ~{} unique union n-grams (estimated)",
        unique_ngrams_estimate
    );
    // Report the per-block memory cost for each candidate bloom size; sizes
    // >= 4096 bits also carry a 64 KB exact-pairs table.
    for bits in [1024, 2048, 4096, 8192, 16384] {
        let bytes = bits / 8;
        let exact_pairs_kb = if bits >= 4096 { 64 } else { 0 };
        println!(
            "Bloom {} bits ({} bytes) + exact_pairs ({} KB) = {} KB total per block",
            bits,
            bytes,
            exact_pairs_kb,
            (bytes as f64 / 1024.0) + exact_pairs_kb as f64
        );
    }
    assert!(
        unique_ngrams_estimate < 100_000,
        "CRITICAL: Too many unique n-grams ({}) for 100K patterns - filter efficiency concern",
        unique_ngrams_estimate
    );
}
#[test]
fn l2_cache_fit_analysis() {
    const L2_CACHE_SIZE_BYTES: usize = 256 * 1024;
    // (label, bloom size in bits, expected bloom storage in bytes).
    let configs = [
        ("Minimal (1024 bits)", 1024, 128),
        ("Small (2048 bits)", 2048, 256),
        ("Medium (4096 bits)", 4096, 512),
        ("Large (8192 bits)", 8192, 1024),
        ("XL (16384 bits)", 16384, 2048),
    ];
    for &(name, bits, expected_bytes) in &configs {
        // Configurations of 4096 bits and up also carry a 64 KB exact-pairs table.
        let exact_pairs_bytes = if bits >= 4096 { 64 * 1024 } else { 0 };
        let total_bytes = expected_bytes + exact_pairs_bytes;
        let fits_l2 = total_bytes <= L2_CACHE_SIZE_BYTES;
        let pct_of_l2 = (total_bytes as f64 / L2_CACHE_SIZE_BYTES as f64) * 100.0;
        let verdict = if fits_l2 { "FITS L2" } else { "EXCEEDS L2" };
        println!(
            "{}: {} bytes bloom + {} KB exact_pairs = {} total ({:.1}% of L2 cache) - {}",
            name,
            expected_bytes,
            exact_pairs_bytes / 1024,
            total_bytes,
            pct_of_l2,
            verdict
        );
        // Confirm the library's actual allocation matches the table above.
        let bloom = NgramBloom::new(bits).unwrap();
        let (actual_bits, actual_words) = bloom.raw_parts();
        assert_eq!(actual_bits, bits.next_power_of_two().max(64));
        assert_eq!(
            actual_words.len() * 8,
            expected_bytes,
            "{}: byte count mismatch",
            name
        );
    }
    let standard_config = NgramBloom::new(4096).unwrap();
    let (bits, _) = standard_config.raw_parts();
    assert_eq!(bits, 4096, "Standard config should be 4096 bits");
}
#[test]
fn blocked_vs_standard_bloom_memory() {
    // A 4096-bit standard bloom should occupy exactly 512 bytes; the blocked
    // variant is built with the same budget (constructed here to prove it works).
    let _blocked = BlockedNgramBloom::new(4096).unwrap();
    let standard = NgramBloom::new(4096).unwrap();
    let (bit_count, words) = standard.raw_parts();
    let byte_count = words.len() * 8;
    println!("Standard bloom: {} bits = {} bytes", bit_count, byte_count);
    println!("Blocked bloom: ~512 bytes (8 blocks * 64 bytes)");
    assert_eq!(byte_count, 512, "Standard bloom should be 512 bytes");
}
#[test]
fn memory_scaling_sublinear() {
    // Build filters from increasingly large pattern sets; the filter stores
    // only deduplicated n-grams, so its footprint should grow sublinearly
    // in the pattern count. This test only exercises construction.
    let pattern_counts = [100, 1_000, 10_000];
    for count in &pattern_counts {
        let patterns: Vec<Vec<u8>> = (0..*count)
            .map(|i| format!("SIG_{:08X}", i).into_bytes())
            .collect();
        let pattern_refs: Vec<&[u8]> = patterns.iter().map(|p| p.as_slice()).collect();
        // Leading underscore silences the unused-variable warning: the filter
        // is built only for its side effects, matching the `_filter` / `_blocked`
        // convention used by the other tests in this file.
        let _filter = NgramFilter::from_patterns(&pattern_refs);
        println!(
            "{} patterns: Filter stores deduplicated n-grams only",
            count
        );
    }
}
#[test]
fn compact_bloom_l1_cache_fit() {
    // The compact bloom variant must be strictly smaller than the standard
    // one while keeping a zero false-negative rate for inserted 2-grams.
    const L1_CACHE_SIZE_BYTES: usize = 32 * 1024;
    let data = b"test data for compact bloom sizing";
    let block_size = 16_384;

    let compact = NgramBloom::from_block_compact(data, block_size).unwrap();
    let (compact_bits, compact_words) = compact.raw_parts();
    let compact_bytes = compact_words.len() * 8;
    println!("Compact bloom: {} bits = {} bytes", compact_bits, compact_bytes);
    println!("L1 cache: {} bytes", L1_CACHE_SIZE_BYTES);
    println!("Fits in L1: {}", compact_bytes <= L1_CACHE_SIZE_BYTES);

    let standard = NgramBloom::from_block(data, block_size).unwrap();
    let (standard_bits, _) = standard.raw_parts();
    assert!(
        compact_bits < standard_bits,
        "Compact bloom should be smaller than standard: {} vs {}",
        compact_bits,
        standard_bits
    );

    // Every adjacent byte pair from the source data must be reported present
    // (blooms may false-positive but must never false-negative).
    for pair in data.windows(2) {
        assert!(
            compact.maybe_contains(pair[0], pair[1]),
            "CRITICAL: Compact bloom FNR for {:?}",
            pair
        );
    }
}
#[test]
fn realistic_workload_memory_estimate() {
    // Model the per-block index cost for an internet-scale corpus and print
    // a human-readable breakdown. The original version asserted nothing;
    // the totals are now pinned so a silent change in the model is caught.
    const NUM_FILES: usize = 1_000_000;
    const BLOCK_SIZE: usize = 256 * 1024;
    const AVG_FILE_SIZE: usize = 1_000_000;
    const BLOOM_BITS: usize = 4096;

    let (blocks_per_file, total_blocks, total_bloom_memory, total_histogram_memory) =
        workload_memory(NUM_FILES, BLOCK_SIZE, AVG_FILE_SIZE, BLOOM_BITS);
    let total_memory = total_bloom_memory + total_histogram_memory;

    println!("Realistic workload memory estimate:");
    println!("  Files: {}", NUM_FILES);
    println!("  Blocks per file: {}", blocks_per_file);
    println!("  Total blocks: {}", total_blocks);
    println!("  Bloom memory: {:.2} GB", total_bloom_memory as f64 / 1e9);
    println!(
        "  Histogram memory: {:.2} GB",
        total_histogram_memory as f64 / 1e9
    );
    println!("  Total memory: {:.2} GB", total_memory as f64 / 1e9);

    // ceil(1 MB / 256 KB) = 4 blocks; 4M blocks * (512 B bloom + 1 KB histogram).
    assert_eq!(blocks_per_file, 4);
    assert_eq!(total_blocks, 4_000_000);
    assert_eq!(total_bloom_memory, 2_048_000_000);
    assert_eq!(total_histogram_memory, 4_096_000_000);
    assert_eq!(total_memory, 6_144_000_000);
}

/// Computes `(blocks_per_file, total_blocks, bloom_bytes, histogram_bytes)`
/// for the given workload model. Each block carries a `bloom_bits`-bit bloom
/// filter plus a histogram of 256 u32 counters (1 KB).
fn workload_memory(
    num_files: usize,
    block_size: usize,
    avg_file_size: usize,
    bloom_bits: usize,
) -> (usize, usize, usize, usize) {
    let blocks_per_file = avg_file_size.div_ceil(block_size);
    let total_blocks = num_files * blocks_per_file;
    let bloom_bytes_per_block = bloom_bits / 8;
    let histogram_bytes_per_block = 256 * 4;
    (
        blocks_per_file,
        total_blocks,
        total_blocks * bloom_bytes_per_block,
        total_blocks * histogram_bytes_per_block,
    )
}
#[test]
fn memory_100k_patterns_no_oom() {
    // Construct 100K deterministic pseudo-random patterns and confirm that
    // building a filter from them completes without exhausting memory, and
    // that an inserted pattern is still matched afterwards.
    let mut patterns: Vec<Vec<u8>> = Vec::with_capacity(100_000);
    for i in 0..100_000usize {
        let len = 8 + (i % 24);
        let pattern: Vec<u8> = (0..len).map(|j| ((i * 31 + j * 17) % 256) as u8).collect();
        patterns.push(pattern);
    }
    let pattern_refs: Vec<&[u8]> = patterns.iter().map(Vec::as_slice).collect();
    let filter = NgramFilter::from_patterns(&pattern_refs);
    println!("100K patterns: Filter memory usage is O(unique_ngrams), not O(patterns)");
    println!("  Estimated unique n-grams: ~5000-10000 for typical pattern sets");
    println!("  Memory footprint: manageable for internet scale");
    // A pattern from the middle of the set must still be findable: build a
    // bloom over exactly that pattern's bytes and check the filter hits it.
    let target_pattern = &patterns[50_000];
    let bloom = NgramBloom::from_block(target_pattern, 8192).unwrap();
    assert!(
        filter.matches_bloom(&bloom),
        "CRITICAL: Filter failed to match a pattern that was inserted"
    );
}