#![allow(clippy::useless_vec)]
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
use pdf_oxide::text::complex_script_detector::{
detect_complex_script, handle_devanagari_boundary, handle_indic_boundary,
handle_khmer_boundary, handle_thai_boundary, is_complex_script, is_devanagari_anusvar_visarga,
is_devanagari_consonant, is_devanagari_diacritic, is_devanagari_matra, is_devanagari_nukta,
is_devanagari_virama, is_thai_digit, is_thai_major_punctuation, is_thai_tone_mark,
is_thai_vowel_modifier,
};
use pdf_oxide::text::CharacterInfo;
use std::hint::black_box;
/// Benchmarks the Devanagari code-point predicates and the pairwise boundary handler.
///
/// Every measurement is registered in the `devanagari` Criterion group, which is
/// configured with a sample size of 50.
fn bench_devanagari(c: &mut Criterion) {
    let mut devanagari = c.benchmark_group("devanagari");
    devanagari.sample_size(50);

    devanagari.bench_function("diacritics", |bencher| {
        let inputs = vec![0x0902, 0x0903, 0x093C, 0x093E, 0x093F, 0x0940, 0x094D];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_devanagari_diacritic(black_box(code)));
            }
        });
    });

    devanagari.bench_function("virama", |bencher| {
        // U+094D is the virama; the remaining inputs are other Devanagari code points.
        let inputs = vec![0x094D, 0x0915, 0x0924, 0x093E];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_devanagari_virama(black_box(code)));
            }
        });
    });

    devanagari.bench_function("consonants", |bencher| {
        let inputs = vec![0x0915, 0x0916, 0x0917, 0x091A, 0x0924];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_devanagari_consonant(black_box(code)));
            }
        });
    });

    devanagari.bench_function("matras", |bencher| {
        let inputs = vec![0x093E, 0x093F, 0x0940, 0x0941, 0x0942];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_devanagari_matra(black_box(code)));
            }
        });
    });

    devanagari.bench_function("anusvara_visarga", |bencher| {
        let inputs = vec![0x0902, 0x0903, 0x0915, 0x0924];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_devanagari_anusvar_visarga(black_box(code)));
            }
        });
    });

    devanagari.bench_function("nukta", |bencher| {
        let inputs = vec![0x093C, 0x0915, 0x0924];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_devanagari_nukta(black_box(code)));
            }
        });
    });

    devanagari.bench_function("boundary_handling", |bencher| {
        // Six-character sequence; the entries built with width 0.0 are U+094D and U+0947.
        let glyphs = vec![
            create_char(0x0928, 0.0, 7.0),
            create_char(0x092E, 7.0, 7.0),
            create_char(0x0938, 14.0, 7.0),
            create_char(0x094D, 21.0, 0.0),
            create_char(0x0924, 21.0, 7.0),
            create_char(0x0947, 28.0, 0.0),
        ];
        bencher.iter(|| {
            // Visit each adjacent (current, next) pair in order.
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_devanagari_boundary(black_box(current), black_box(next)));
            }
        });
    });

    devanagari.finish();
}
/// Benchmarks the Thai code-point predicates and the pairwise boundary handler.
///
/// Every measurement is registered in the `thai` Criterion group, which is
/// configured with a sample size of 50.
fn bench_thai(c: &mut Criterion) {
    let mut thai = c.benchmark_group("thai");
    thai.sample_size(50);

    thai.bench_function("tone_marks", |bencher| {
        let inputs = vec![0x0E48, 0x0E49, 0x0E4A, 0x0E4B];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_thai_tone_mark(black_box(code)));
            }
        });
    });

    thai.bench_function("vowel_modifiers", |bencher| {
        let inputs = vec![0x0E31, 0x0E34, 0x0E35, 0x0E36, 0x0E37, 0x0E39, 0x0E3A];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_thai_vowel_modifier(black_box(code)));
            }
        });
    });

    thai.bench_function("digits", |bencher| {
        // Three code points from the Thai block followed by ASCII '0', '1', '2'.
        let inputs = vec![0x0E50, 0x0E51, 0x0E52, 0x0030, 0x0031, 0x0032];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_thai_digit(black_box(code)));
            }
        });
    });

    thai.bench_function("major_punctuation", |bencher| {
        let inputs = vec![0x0E2F, 0x0E46, 0x0E4F];
        bencher.iter(|| {
            for code in inputs.iter().copied() {
                black_box(is_thai_major_punctuation(black_box(code)));
            }
        });
    });

    thai.bench_function("boundary_handling", |bencher| {
        // U+0E48 is built with zero width at the same x position as the preceding character.
        let glyphs = vec![
            create_char(0x0E01, 0.0, 7.0),
            create_char(0x0E48, 0.0, 0.0),
            create_char(0x0E32, 7.0, 7.0),
            create_char(0x0E23, 14.0, 7.0),
        ];
        bencher.iter(|| {
            // Visit each adjacent (current, next) pair in order.
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_thai_boundary(black_box(current), black_box(next)));
            }
        });
    });

    thai.finish();
}
/// Benchmarks Khmer coeng / vowel-sign checks and the pairwise boundary handler.
///
/// Every measurement is registered in the `khmer` Criterion group, which is
/// configured with a sample size of 50.
///
/// The `coeng` and `vowel_signs` benchmarks time inline comparisons rather than
/// library calls, so each input is passed through `black_box` before the
/// comparison. Without that barrier the optimizer can see the literal vector
/// contents and may fold the comparisons away, leaving nothing to measure.
fn bench_khmer(c: &mut Criterion) {
    let mut group = c.benchmark_group("khmer");
    group.sample_size(50);

    group.bench_function("coeng", |b| {
        // Typed as `u32` to match the code-point type accepted by `create_char`.
        let codes: Vec<u32> = vec![0x17D2, 0x1780, 0x1781, 0x1782];
        b.iter(|| {
            for &code in &codes {
                black_box(black_box(code) == 0x17D2);
            }
        });
    });

    group.bench_function("vowel_signs", |b| {
        let vowels: Vec<u32> = vec![
            0x17B6, 0x17B7, 0x17B8, 0x17B9, 0x17BA, 0x17BB, 0x17BC, 0x17BD,
        ];
        b.iter(|| {
            for &code in &vowels {
                black_box(matches!(black_box(code), 0x17B6..=0x17BD));
            }
        });
    });

    group.bench_function("boundary_handling", |b| {
        // U+17D2 (the code point tested by the `coeng` benchmark) is built with zero width.
        let chars = vec![
            create_char(0x1780, 0.0, 7.0),
            create_char(0x17D2, 0.0, 0.0),
            create_char(0x1781, 0.0, 7.0),
            create_char(0x17B6, 7.0, 0.0),
        ];
        b.iter(|| {
            // Visit each adjacent (current, next) pair in order.
            for (current, next) in chars.iter().zip(chars.iter().skip(1)) {
                black_box(handle_khmer_boundary(black_box(current), black_box(next)));
            }
        });
    });

    group.finish();
}
/// Benchmarks the generic Indic boundary handler on three short character sequences.
///
/// The `tamil`, `telugu` and `bengali` measurements each use a three-character
/// sequence whose middle entry is built with width 0.0. All are registered in the
/// `indic_scripts` Criterion group with a sample size of 50.
fn bench_indic_scripts(c: &mut Criterion) {
    let mut indic = c.benchmark_group("indic_scripts");
    indic.sample_size(50);

    indic.bench_function("tamil", |bencher| {
        let glyphs = vec![
            create_char(0x0B95, 0.0, 7.0),
            create_char(0x0BBE, 7.0, 0.0),
            create_char(0x0B99, 7.0, 7.0),
        ];
        bencher.iter(|| {
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_indic_boundary(black_box(current), black_box(next)));
            }
        });
    });

    indic.bench_function("telugu", |bencher| {
        let glyphs = vec![
            create_char(0x0C15, 0.0, 7.0),
            create_char(0x0C3E, 7.0, 0.0),
            create_char(0x0C17, 7.0, 7.0),
        ];
        bencher.iter(|| {
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_indic_boundary(black_box(current), black_box(next)));
            }
        });
    });

    indic.bench_function("bengali", |bencher| {
        let glyphs = vec![
            create_char(0x0995, 0.0, 7.0),
            create_char(0x09BE, 7.0, 0.0),
            create_char(0x0997, 7.0, 7.0),
        ];
        bencher.iter(|| {
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_indic_boundary(black_box(current), black_box(next)));
            }
        });
    });

    indic.finish();
}
/// Benchmarks script detection over a spread of code points from many Unicode blocks.
///
/// `detect_all_scripts` feeds fifteen code points to `detect_complex_script`;
/// `is_complex_script` feeds seven code points to `is_complex_script`. Both are
/// registered in the `all_complex_scripts` Criterion group with a sample size of 50.
fn bench_all_complex_scripts(c: &mut Criterion) {
    let mut scripts = c.benchmark_group("all_complex_scripts");
    scripts.sample_size(50);

    scripts.bench_function("detect_all_scripts", |bencher| {
        let samples = vec![
            0x0928, 0x0995, 0x0A15, 0x0A95, 0x0B15, 0x0B95, 0x0C15, 0x0C95, 0x0D15, 0x0D95,
            0x0E01, 0x0E81, 0x1780, 0x1000, 0x1820,
        ];
        bencher.iter(|| {
            for code in samples.iter().copied() {
                black_box(detect_complex_script(black_box(code)));
            }
        });
    });

    scripts.bench_function("is_complex_script", |bencher| {
        // Includes U+0041 (Latin 'A'), U+4E00 (CJK) and U+0627 (Arabic) alongside
        // the Devanagari, Bengali, Thai and Khmer samples.
        let samples = vec![0x0928, 0x0995, 0x0E01, 0x1780, 0x0041, 0x4E00, 0x0627];
        bencher.iter(|| {
            for code in samples.iter().copied() {
                black_box(is_complex_script(black_box(code)));
            }
        });
    });

    scripts.finish();
}
/// Benchmarks per-code-point processing over generated inputs of 10, 100 and 1000 items.
///
/// For each input length three measurements are registered in the `batch_complex`
/// Criterion group (sample size 20): `devanagari`, `thai` and `mixed`. The input
/// text is generated once per measurement, outside the timed closure.
fn bench_batch_complex_processing(c: &mut Criterion) {
    let mut batch = c.benchmark_group("batch_complex");
    batch.sample_size(20);

    for len in [10, 100, 1000] {
        batch.bench_with_input(BenchmarkId::new("devanagari", len), &len, |bencher, &count| {
            let text = generate_devanagari_text(count);
            bencher.iter(|| {
                for code in text.iter().copied() {
                    black_box(detect_complex_script(black_box(code)));
                    black_box(is_devanagari_diacritic(black_box(code)));
                    black_box(is_devanagari_virama(black_box(code)));
                }
            });
        });

        batch.bench_with_input(BenchmarkId::new("thai", len), &len, |bencher, &count| {
            let text = generate_thai_text(count);
            bencher.iter(|| {
                for code in text.iter().copied() {
                    black_box(detect_complex_script(black_box(code)));
                    black_box(is_thai_tone_mark(black_box(code)));
                    black_box(is_thai_vowel_modifier(black_box(code)));
                }
            });
        });

        batch.bench_with_input(BenchmarkId::new("mixed", len), &len, |bencher, &count| {
            let text = generate_mixed_complex_text(count);
            bencher.iter(|| {
                for code in text.iter().copied() {
                    black_box(detect_complex_script(black_box(code)));
                }
            });
        });
    }

    batch.finish();
}
/// Benchmarks the boundary handlers on three-character sequences whose middle
/// entry is U+094D (Devanagari) or U+17D2 (Khmer).
///
/// In both sequences all three characters share x position 0.0 and the middle
/// one has zero width. Measurements are registered in the `virama_coeng`
/// Criterion group with a sample size of 50.
fn bench_virama_coeng_handling(c: &mut Criterion) {
    let mut joiners = c.benchmark_group("virama_coeng");
    joiners.sample_size(50);

    joiners.bench_function("devanagari_virama_sequence", |bencher| {
        let glyphs = vec![
            create_char(0x0915, 0.0, 7.0),
            create_char(0x094D, 0.0, 0.0),
            create_char(0x0937, 0.0, 7.0),
        ];
        bencher.iter(|| {
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_devanagari_boundary(black_box(current), black_box(next)));
            }
        });
    });

    joiners.bench_function("khmer_coeng_sequence", |bencher| {
        let glyphs = vec![
            create_char(0x1780, 0.0, 7.0),
            create_char(0x17D2, 0.0, 0.0),
            create_char(0x1781, 0.0, 7.0),
        ];
        bencher.iter(|| {
            for (current, next) in glyphs.iter().zip(glyphs.iter().skip(1)) {
                black_box(handle_khmer_boundary(black_box(current), black_box(next)));
            }
        });
    });

    joiners.finish();
}
/// Builds a `CharacterInfo` fixture for the boundary benchmarks.
///
/// Only the code point, x position and width vary between fixtures; every other
/// field is set to the same fixed value (glyph id 1, font size 12.0, no TJ
/// offset, not a ligature, not protected from splitting).
fn create_char(code: u32, x_pos: f32, width: f32) -> CharacterInfo {
    CharacterInfo {
        // Caller-supplied identity and geometry.
        code,
        x_position: x_pos,
        width,
        // Constants shared by every benchmark fixture.
        glyph_id: Some(1),
        font_size: 12.0,
        tj_offset: None,
        is_ligature: false,
        original_ligature: None,
        protected_from_split: false,
    }
}
/// Produces `size` Devanagari code points by repeating a fixed eight-entry pattern.
///
/// Element `i` of the result is entry `i % 8` of the pattern, so the output
/// restarts from the first entry after every eight items. Returns an empty
/// vector when `size` is 0.
fn generate_devanagari_text(size: usize) -> Vec<u32> {
    const PATTERN: [u32; 8] = [
        0x0928, 0x092E, 0x0938, 0x094D, 0x093E, 0x0947, 0x0902, 0x0903,
    ];
    PATTERN.iter().copied().cycle().take(size).collect()
}
/// Produces `size` Thai code points by repeating a fixed seven-entry pattern.
///
/// Element `i` of the result is entry `i % 7` of the pattern, so the output
/// restarts from the first entry after every seven items. Returns an empty
/// vector when `size` is 0.
fn generate_thai_text(size: usize) -> Vec<u32> {
    const PATTERN: [u32; 7] = [0x0E01, 0x0E02, 0x0E03, 0x0E48, 0x0E49, 0x0E31, 0x0E34];
    PATTERN.iter().copied().cycle().take(size).collect()
}
/// Produces `size` code points by repeating a fixed five-entry pattern that
/// mixes code points from several different Unicode blocks.
///
/// Element `i` of the result is entry `i % 5` of the pattern. Returns an empty
/// vector when `size` is 0.
fn generate_mixed_complex_text(size: usize) -> Vec<u32> {
    const PATTERN: [u32; 5] = [0x0928, 0x0995, 0x0B95, 0x0E01, 0x1780];
    PATTERN.iter().copied().cycle().take(size).collect()
}
// Collects every benchmark function defined in this file into a single
// Criterion group named `benches`.
criterion_group!(
benches,
bench_devanagari,
bench_thai,
bench_khmer,
bench_indic_scripts,
bench_all_complex_scripts,
bench_batch_complex_processing,
bench_virama_coeng_handling,
);
// Hands the `benches` group to Criterion's entry-point macro, which supplies
// the benchmark binary's `main`.
criterion_main!(benches);