use criterion::{black_box, criterion_group, criterion_main, Criterion};
use auto_encoder::{detect_encoding, detect_language, encode_bytes, encode_bytes_from_language, find_subsequence, is_binary_file, encoding_for_locale};
/// Returns a tiny single-line HTML document as owned bytes.
///
/// The markup declares `lang="en"` on the root element and a
/// `<meta charset="utf-8">` tag in the head.
fn small_html() -> Vec<u8> {
    const DOCUMENT: &[u8] = br#"<!DOCTYPE html><html lang="en"><head><meta charset="utf-8"><title>Test</title></head><body><p>Hello world</p></body></html>"#;
    DOCUMENT.to_vec()
}
/// Builds an HTML document made of a fixed header, 200 numbered `<p>`
/// paragraphs, and a closing `</body></html>`.
///
/// The header declares `lang="ja"` and `<meta charset="shift_jis">`; the
/// paragraph text itself is plain ASCII.
fn medium_html() -> Vec<u8> {
    const HEADER: &[u8] = br#"<!DOCTYPE html><html lang="ja"><head><meta charset="shift_jis"><title>Test Page</title></head><body>"#;
    const FOOTER: &[u8] = b"</body></html>";

    let mut page = HEADER.to_vec();
    // Paragraphs are numbered 0..=199 and appended in ascending order.
    page.extend((0..200).flat_map(|index| {
        format!(
            "<p>Paragraph {} with some content to fill space and make this a realistic document size for benchmarking purposes.</p>",
            index
        )
        .into_bytes()
    }));
    page.extend_from_slice(FOOTER);
    page
}
/// Builds an HTML document made of a fixed header, 2000 numbered
/// `<div class='row-N'>` blocks, and a closing `</body></html>`.
///
/// The header declares `lang="zh-cn"` and an `http-equiv` content-type meta
/// tag with `charset=gb18030`; the row text itself is plain ASCII.
fn large_html() -> Vec<u8> {
    let header: &[u8] = br#"<!DOCTYPE html><html lang="zh-cn"><head><meta http-equiv="Content-Type" content="text/html; charset=gb18030"><title>Large Page</title></head><body>"#;
    let footer: &[u8] = b"</body></html>";

    // Each row embeds its index twice: once in the class name, once in the text.
    let rows: String = (0..2000)
        .map(|row| {
            format!(
                "<div class='row-{}'><p>Content block {} with enough text to simulate a real-world HTML document that might be encountered during web crawling.</p></div>",
                row, row
            )
        })
        .collect();

    [header, rows.as_bytes(), footer].concat()
}
/// Benchmarks `detect_language` against the small, medium and large HTML
/// fixtures, registering one benchmark per fixture in the
/// `detect_language` group.
fn bench_detect_language(c: &mut Criterion) {
    // Fixtures are built once, outside the measured closures.
    let small = small_html();
    let medium = medium_html();
    let large = large_html();

    let mut group = c.benchmark_group("detect_language");
    for (id, document) in [
        ("small_120b", &small),
        ("medium_20kb", &medium),
        ("large_250kb", &large),
    ] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| detect_language(black_box(document)))
        });
    }
    group.finish();
}
/// Benchmarks `detect_encoding` against the three HTML fixtures plus a
/// head fragment whose charset declaration is the third `<meta>` tag.
fn bench_detect_encoding(c: &mut Criterion) {
    // Fixtures are built once, outside the measured closures.
    let small = small_html();
    let medium = medium_html();
    let large = large_html();
    // Two unrelated meta tags come before the charset declaration here.
    let charset_third = br#"<meta name="viewport" content="width=device-width"><meta name="description" content="test"><meta charset="utf-8"><title>Test</title>"#.to_vec();

    let mut group = c.benchmark_group("detect_encoding");
    for (id, document) in [
        ("small_120b", &small),
        ("medium_20kb", &medium),
        ("large_250kb", &large),
        ("third_meta_tag", &charset_third),
    ] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| detect_encoding(black_box(document)))
        });
    }
    group.finish();
}
/// Benchmarks `is_binary_file` on three short inputs: bytes beginning with
/// the JPEG marker `FF D8 FF`, bytes beginning with `89 'P' 'N' 'G'`, and a
/// short HTML snippet.
///
/// The benchmark IDs suggest the first two are expected to be recognised and
/// the third is not; the return value is not checked here.
fn bench_is_binary_file(c: &mut Criterion) {
    let jpeg_prefix: Vec<u8> = vec![0xFF, 0xD8, 0xFF, 0xE0, 0x00, 0x10];
    let png_prefix: Vec<u8> = vec![0x89, 0x50, 0x4E, 0x47, 0x0D, 0x0A];
    let html_snippet = b"<html>hello</html>".to_vec();

    let mut group = c.benchmark_group("is_binary_file");
    for (id, content) in [
        ("jpeg_match", &jpeg_prefix),
        ("png_match", &png_prefix),
        ("not_binary", &html_snippet),
    ] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| is_binary_file(black_box(content)))
        });
    }
    group.finish();
}
/// Benchmarks `find_subsequence` searching for `charset=` in a short head
/// fragment and in a buffer where the needle only appears after 900 bytes
/// of `x` padding.
fn bench_find_subsequence(c: &mut Criterion) {
    let short = br#"<html lang="en"><head><meta charset="utf-8">"#.to_vec();

    // 900 filler bytes followed by the meta tag, so the match is near the end.
    let mut padded = vec![b'x'; 900];
    padded.extend_from_slice(br#"<meta charset="utf-8">"#);

    let mut group = c.benchmark_group("find_subsequence");
    for (id, haystack) in [("small_44b", &short), ("1kb_needle_at_end", &padded)] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| find_subsequence(black_box(haystack), black_box(b"charset=")))
        });
    }
    group.finish();
}
/// Benchmarks `encode_bytes` with three inputs: the byte pair `82 A0`
/// repeated 5000 times under the `shift_jis` label, and two ASCII buffers
/// of different lengths under the `utf-8` label.
fn bench_encode_bytes(c: &mut Criterion) {
    // NOTE(review): 0x82 0xA0 looks like a single Shift_JIS hiragana — confirm.
    let shift_jis_input: Vec<u8> = [0x82u8, 0xA0].repeat(5000);
    let utf8_input = b"Hello world! ".repeat(1000);
    let utf8_long_input = b"Content ".repeat(5000);

    let mut group = c.benchmark_group("encode_bytes");
    for (id, bytes, label) in [
        ("shift_jis_10kb", &shift_jis_input, "shift_jis"),
        ("utf8_13kb", &utf8_input, "utf-8"),
        ("utf8_40kb", &utf8_long_input, "utf-8"),
    ] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| encode_bytes(black_box(bytes), black_box(label)))
        });
    }
    group.finish();
}
/// Benchmarks `encode_bytes_from_language` with the small HTML fixture under
/// the `en-us` language tag and with the byte pair `82 A0` repeated 500
/// times under the `ja` tag.
fn bench_encode_bytes_from_language(c: &mut Criterion) {
    let english_html = small_html();
    let japanese_bytes: Vec<u8> = [0x82u8, 0xA0].repeat(500);

    let mut group = c.benchmark_group("encode_bytes_from_language");
    for (id, bytes, language) in [
        ("en_us_120b", &english_html, "en-us"),
        ("ja_1kb_shift_jis", &japanese_bytes, "ja"),
    ] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| encode_bytes_from_language(black_box(bytes), black_box(language)))
        });
    }
    group.finish();
}
/// Benchmarks `encoding_for_locale` with three locale strings.
///
/// Judging by the benchmark IDs, `en-us` and `ja-jp` are expected to be
/// known locales and `xx-xx` an unknown one; the result is not checked here.
fn bench_encoding_for_locale(c: &mut Criterion) {
    let mut group = c.benchmark_group("encoding_for_locale");
    for (id, locale) in [
        ("hit_en_us", "en-us"),
        ("hit_ja_jp", "ja-jp"),
        ("miss", "xx-xx"),
    ] {
        group.bench_function(id, |bencher| {
            bencher.iter(|| encoding_for_locale(black_box(locale)))
        });
    }
    group.finish();
}
// Collect every benchmark function in this file into the `benches` group.
criterion_group!(
    benches,
    bench_detect_language,
    bench_detect_encoding,
    bench_is_binary_file,
    bench_find_subsequence,
    bench_encode_bytes,
    bench_encode_bytes_from_language,
    bench_encoding_for_locale
);
// Hand the `benches` group to Criterion's generated entry point.
criterion_main!(benches);