use std::hint::black_box;

use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};
use kham_core::fts::FtsTokenizer;
use kham_core::Tokenizer;
// Benchmark text fixtures, graded by byte length so throughput numbers are
// comparable across input sizes. All-Thai strings exercise the segmenter's
// dictionary path; MIXED interleaves Thai, ASCII letters, and digits to
// exercise script-boundary handling.
const SHORT: &str = "กินข้าวกับปลา";
const MEDIUM: &str = "สวัสดีชาวโลกคนที่นี่ไปมาที่ธนาคารแห่งนั้นกินข้าวกับปลาและน้ำ";
// LONG is MEDIUM repeated 3x, concatenated at compile time.
const LONG: &str = concat!(
"สวัสดีชาวโลกคนที่นี่ไปมาที่ธนาคารแห่งนั้นกินข้าวกับปลาและน้ำ",
"สวัสดีชาวโลกคนที่นี่ไปมาที่ธนาคารแห่งนั้นกินข้าวกับปลาและน้ำ",
"สวัสดีชาวโลกคนที่นี่ไปมาที่ธนาคารแห่งนั้นกินข้าวกับปลาและน้ำ",
);
// Thai text with embedded ASCII words and numbers (script transitions).
const MIXED: &str = "ธนาคาร100แห่งสวัสดีhello123คน42ไปมา";
fn bench_tokenizer_new(c: &mut Criterion) {
c.bench_function("tokenizer_new", |b| {
b.iter(|| {
let tok = Tokenizer::new();
criterion::black_box(tok);
});
});
}
fn bench_segment_by_length(c: &mut Criterion) {
let tok = Tokenizer::new();
let inputs = [("short", SHORT), ("medium", MEDIUM), ("long", LONG)];
let mut group = c.benchmark_group("segment/by_length");
for (label, text) in inputs {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_with_input(BenchmarkId::from_parameter(label), text, |b, text| {
b.iter(|| criterion::black_box(tok.segment(text)));
});
}
group.finish();
}
fn bench_segment_mixed(c: &mut Criterion) {
let tok = Tokenizer::new();
let inputs: &[(&str, &str)] = &[
("sparse", "ธนาคาร100แห่ง"),
("medium", MIXED),
("dense", "a1ก2b3ข4c5ค6d7ง8e9จ10"),
];
let mut group = c.benchmark_group("segment/mixed");
for &(label, text) in inputs {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_with_input(BenchmarkId::from_parameter(label), text, |b, text| {
b.iter(|| criterion::black_box(tok.segment(text)));
});
}
group.finish();
}
fn bench_normalize(c: &mut Criterion) {
let tok = Tokenizer::new();
let inputs = [
("clean_short", SHORT),
("clean_medium", MEDIUM),
("clean_long", LONG),
];
let mut group = c.benchmark_group("normalize");
for (label, text) in inputs {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_with_input(BenchmarkId::new("thai", label), text, |b, text| {
b.iter(|| criterion::black_box(tok.normalize(text)));
});
}
group.finish();
}
fn bench_normalize_then_segment(c: &mut Criterion) {
let tok = Tokenizer::new();
c.bench_function("normalize_then_segment/medium", |b| {
b.iter(|| {
let normalized = tok.normalize(MEDIUM);
let tokens = tok.segment(&normalized);
criterion::black_box(tokens.len())
});
});
}
fn bench_fts_new(c: &mut Criterion) {
c.bench_function("fts/new", |b| {
b.iter(|| {
let fts = FtsTokenizer::new();
criterion::black_box(fts);
});
});
}
fn bench_fts_segment_for_fts(c: &mut Criterion) {
let fts = FtsTokenizer::new();
let inputs = [("short", SHORT), ("medium", MEDIUM), ("long", LONG)];
let mut group = c.benchmark_group("fts/segment_for_fts");
for (label, text) in inputs {
group.throughput(Throughput::Bytes(text.len() as u64));
group.bench_with_input(BenchmarkId::from_parameter(label), text, |b, text| {
b.iter(|| criterion::black_box(fts.segment_for_fts(text)));
});
}
group.finish();
}
fn bench_fts_index_tokens(c: &mut Criterion) {
let fts = FtsTokenizer::new();
let mut group = c.benchmark_group("fts/index_tokens");
group.throughput(Throughput::Bytes(MEDIUM.len() as u64));
group.bench_with_input(BenchmarkId::from_parameter("medium"), MEDIUM, |b, text| {
b.iter(|| criterion::black_box(fts.index_tokens(text)));
});
group.finish();
}
// Register every benchmark in one group and generate the harness `main`.
criterion_group!(
benches,
bench_tokenizer_new,
bench_segment_by_length,
bench_segment_mixed,
bench_normalize,
bench_normalize_then_segment,
bench_fts_new,
bench_fts_segment_for_fts,
bench_fts_index_tokens,
);
criterion_main!(benches);