use crate::text::cjk_punctuation;
use crate::text::complex_script_detector::{
detect_complex_script, handle_devanagari_boundary, handle_indic_boundary,
handle_khmer_boundary, handle_thai_boundary, ComplexScript,
};
use crate::text::rtl_detector::should_split_at_rtl_boundary;
use crate::text::script_detector::{
detect_cjk_script, handle_japanese_text, handle_korean_text, should_split_on_script_transition,
DocumentLanguage,
};
/// One extracted character together with the layout data the word-boundary
/// heuristics in this module consult.
#[derive(Clone, Debug)]
pub struct CharacterInfo {
/// Character code; this module compares it against Unicode code points and ranges
/// (e.g. `0x20`, `0x200B`, the CJK blocks).
pub code: u32,
/// Glyph identifier, when available. Not read by the code in this file.
pub glyph_id: Option<u16>,
/// Advance width; added to `x_position` to obtain the character's trailing edge.
// NOTE(review): expected to be in the same units as `x_position` — confirm with the producer.
pub width: f32,
/// Horizontal position of the character's leading edge.
pub x_position: f32,
/// TJ offset recorded for this character, if any. The boundary check reads it from the
/// *previous* character of a pair and compares it against the TJ threshold.
pub tj_offset: Option<i32>,
/// Font size recorded for this character. Not read by the code in this file
/// (thresholds use `BoundaryContext::font_size`).
pub font_size: f32,
/// Marks the character as belonging to a ligature; geometric gaps next to such a
/// character never produce a boundary.
pub is_ligature: bool,
/// The ligature this character came from, if any. Not read by the code in this file.
pub original_ligature: Option<char>,
/// When set on either character of a pair, no boundary is reported between them.
pub protected_from_split: bool,
}
/// Text-state values that scale the thresholds used during word-boundary detection.
#[derive(Clone, Debug)]
pub struct BoundaryContext {
    /// Font size the thresholds are derived from.
    pub font_size: f32,
    /// Horizontal scaling expressed as a percentage (`100.0` means unscaled).
    pub horizontal_scaling: f32,
    /// Word spacing; its magnitude widens the adaptive TJ threshold.
    pub word_spacing: f32,
    /// Character spacing; subtracted from measured gaps and also widens the
    /// adaptive TJ threshold.
    pub char_spacing: f32,
}

impl BoundaryContext {
    /// Creates a context for `font_size` with 100% horizontal scaling and no
    /// extra word or character spacing.
    pub fn new(font_size: f32) -> Self {
        let (horizontal_scaling, word_spacing, char_spacing) = (100.0, 0.0, 0.0);
        Self {
            font_size,
            horizontal_scaling,
            word_spacing,
            char_spacing,
        }
    }

    /// Font size after applying the horizontal scaling percentage.
    fn effective_font_size(&self) -> f32 {
        let scale_fraction = self.horizontal_scaling / 100.0;
        self.font_size * scale_fraction
    }
}
/// Dominant writing-system family of a document. `WordBoundaryDetector` uses it to
/// choose which script-specific rules run before the basic TJ/geometric heuristics.
// The variants keep the conventional all-caps acronyms (CJK, RTL), hence the lint allowance.
#[allow(clippy::upper_case_acronyms)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DocumentScript {
/// Only the basic TJ-offset / geometric-gap heuristics are applied.
Latin,
/// CJK script-transition rules are consulted first (when script detection is enabled).
CJK,
/// Right-to-left boundary rules are consulted first.
RTL,
/// Complex-script (Devanagari, Thai, Khmer, other Indic) rules are consulted first.
Complex,
/// RTL, CJK and complex-script rules are all consulted, in that order.
Mixed,
}
impl DocumentScript {
    /// Guesses the dominant script family of a document from its leading characters.
    ///
    /// At most the first 1000 characters are inspected. Each one is tested against
    /// three groups of Unicode ranges:
    ///
    /// * RTL: `U+0590..=U+08FF` and `U+FB1D..=U+FDFF`
    /// * CJK: Hiragana, Katakana, CJK Unified Ideographs and Hangul Syllables
    /// * Complex: Devanagari, Bengali, Tamil, Telugu, Kannada, Malayalam, Thai and
    ///   Khmer, i.e. the scripts `WordBoundaryDetector` has dedicated handlers for
    ///
    /// # Returns
    ///
    /// * [`DocumentScript::Latin`] when the input is empty or no group was seen,
    /// * the matching variant when exactly one group was seen,
    /// * [`DocumentScript::Mixed`] when two or more groups were seen, so that every
    ///   relevant script-specific rule stays active.
    pub fn detect_from_characters(characters: &[CharacterInfo]) -> Self {
        /// Upper bound on the number of characters inspected.
        const MAX_SAMPLE: usize = 1000;

        if characters.is_empty() {
            return Self::Latin;
        }

        let sample_size = characters.len().min(MAX_SAMPLE);
        let mut has_rtl = false;
        let mut has_cjk = false;
        let mut has_complex = false;

        // The three groups are disjoint, so a single match classifies each code.
        for ch in &characters[..sample_size] {
            match ch.code {
                0x0590..=0x08FF | 0xFB1D..=0xFDFF => has_rtl = true,
                0x3040..=0x309F | 0x30A0..=0x30FF | 0x4E00..=0x9FFF | 0xAC00..=0xD7AF => {
                    has_cjk = true;
                },
                // Devanagari + Bengali, Tamil through Malayalam, Thai, Khmer.
                0x0900..=0x09FF | 0x0B80..=0x0D7F | 0x0E00..=0x0E7F | 0x1780..=0x17FF => {
                    has_complex = true;
                },
                _ => {},
            }
        }

        // The trace macro may expand to nothing, which would otherwise make this
        // binding look like a needless let-and-return.
        #[allow(clippy::let_and_return)]
        let script = match (has_rtl, has_cjk, has_complex) {
            (false, false, false) => Self::Latin,
            (true, false, false) => Self::RTL,
            (false, true, false) => Self::CJK,
            (false, false, true) => Self::Complex,
            // Any combination of two or more families needs all rule sets.
            _ => Self::Mixed,
        };
        crate::extract_log_trace!(
            "Detected document script: {:?} (sampled {} characters)",
            script,
            sample_size
        );
        script
    }
}
/// Configurable detector that decides, for each pair of adjacent characters, whether a
/// word boundary lies between them.
#[derive(Debug)]
pub struct WordBoundaryDetector {
/// Fixed TJ-offset threshold, used only when `use_adaptive_threshold` is `false`.
tj_offset_threshold: i32,
/// Fraction of the effective font size a gap must exceed to count as a boundary.
geometric_gap_ratio: f32,
/// Enables splitting after every non-punctuation CJK character; only takes effect
/// when `detect_script_transitions` is `false`.
cjk_enabled: bool,
/// Enables the CJK script-transition rules in `CJK` and `Mixed` modes.
detect_script_transitions: bool,
/// Optional language hint that selects the Japanese or Korean handler for CJK text.
document_language: Option<DocumentLanguage>,
/// Script family that selects which script-specific rules are consulted.
primary_script: DocumentScript,
/// When `true`, the TJ threshold is derived from the `BoundaryContext` instead of
/// `tj_offset_threshold`.
use_adaptive_threshold: bool,
}
impl Default for WordBoundaryDetector {
fn default() -> Self {
Self::new()
}
}
impl WordBoundaryDetector {
/// Creates a detector with the default configuration: static TJ threshold `-100`,
/// geometric gap ratio `0.8`, CJK handling and script-transition detection enabled,
/// no language hint, `Mixed` primary script and the adaptive TJ threshold switched on.
pub fn new() -> Self {
    let tj_offset_threshold = -100;
    let geometric_gap_ratio = 0.8;
    Self {
        primary_script: DocumentScript::Mixed,
        document_language: None,
        tj_offset_threshold,
        geometric_gap_ratio,
        cjk_enabled: true,
        detect_script_transitions: true,
        use_adaptive_threshold: true,
    }
}
pub fn with_tj_threshold(mut self, threshold: i32) -> Self {
self.tj_offset_threshold = threshold;
self
}
pub fn with_geometric_gap_ratio(mut self, ratio: f32) -> Self {
self.geometric_gap_ratio = ratio;
self
}
pub fn with_cjk_enabled(mut self, enabled: bool) -> Self {
self.cjk_enabled = enabled;
self
}
pub fn with_script_detection(mut self, enabled: bool) -> Self {
self.detect_script_transitions = enabled;
self
}
pub fn with_document_language(mut self, lang: DocumentLanguage) -> Self {
self.document_language = Some(lang);
self
}
pub fn with_document_script(mut self, script: DocumentScript) -> Self {
self.primary_script = script;
self
}
pub fn with_adaptive_threshold(mut self, enabled: bool) -> Self {
self.use_adaptive_threshold = enabled;
self
}
/// Derives the TJ-offset threshold from the text state in `context`.
///
/// The result is `-(font size × horizontal scale × 0.025)` minus half the combined
/// magnitude of character and word spacing. The font size is clamped to at least
/// `1.0` and the scale fraction to at least `0.01`, so the result is always negative.
fn calculate_tj_threshold(&self, context: &BoundaryContext) -> f32 {
    let clamped_size = f32::max(context.font_size, 1.0);
    let scale_fraction = f32::max(context.horizontal_scaling / 100.0, 0.01);
    let spacing_total = context.char_spacing.abs() + context.word_spacing.abs();
    -clamped_size * scale_fraction * 0.025 - spacing_total * 0.5
}
/// Scans `characters` pairwise and returns the indices at which a new word starts.
///
/// An index `i` is reported when a boundary lies between `characters[i - 1]` and
/// `characters[i]`; index `0` is never reported. Empty input yields an empty vector
/// (and emits no trace message).
pub fn detect_word_boundaries(
    &self,
    characters: &[CharacterInfo],
    context: &BoundaryContext,
) -> Vec<usize> {
    if characters.is_empty() {
        return Vec::new();
    }

    // Pair every character with its successor; pair `n` ends at index `n + 1`.
    let adjacent_pairs = characters.iter().zip(characters.iter().skip(1));
    let mut boundaries = Vec::new();
    boundaries.extend(
        adjacent_pairs
            .enumerate()
            .filter(|&(_, (prev, curr))| self.is_word_boundary(prev, curr, context))
            .map(|(pair_index, _)| pair_index + 1),
    );

    crate::extract_log_trace!(
        "Word boundary detection: {} boundaries in {} characters",
        boundaries.len(),
        characters.len()
    );
    boundaries
}
/// Decides whether a word boundary lies between `prev_char` and `curr_char`.
///
/// Order of evaluation:
/// 1. a pair involving a split-protected character is never a boundary;
/// 2. a preceding space (`U+0020`) or zero-width space (`U+200B`) always is;
/// 3. the script-specific rules selected by the primary script are consulted, and
///    the first definite answer wins;
/// 4. otherwise the basic TJ-offset / geometric heuristics decide.
fn is_word_boundary(
    &self,
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
    context: &BoundaryContext,
) -> bool {
    let split_forbidden = prev_char.protected_from_split || curr_char.protected_from_split;
    if split_forbidden {
        return false;
    }
    if matches!(prev_char.code, 0x20 | 0x200B) {
        return true;
    }

    // Each rule is lazy so that it only runs when an earlier rule gave no verdict.
    let rtl_rule = || should_split_at_rtl_boundary(prev_char, curr_char, Some(context));
    let cjk_rule = || {
        if self.detect_script_transitions {
            self.should_split_at_cjk_boundary(prev_char, curr_char)
        } else {
            None
        }
    };
    let complex_rule = || self.should_split_at_complex_script_boundary(prev_char, curr_char);

    let script_verdict: Option<bool> = match self.primary_script {
        DocumentScript::Latin => None,
        DocumentScript::CJK => cjk_rule(),
        DocumentScript::RTL => rtl_rule(),
        DocumentScript::Complex => complex_rule(),
        DocumentScript::Mixed => rtl_rule().or_else(cjk_rule).or_else(complex_rule),
    };

    script_verdict.unwrap_or_else(|| self.is_word_boundary_basic(prev_char, curr_char, context))
}
/// Script-agnostic boundary test. Reports a boundary when any of the following holds,
/// checked in this order:
///
/// * the previous character's TJ offset is below the active threshold (adaptive or
///   static, depending on configuration);
/// * the geometric gap between the two characters is significant;
/// * CJK splitting is enabled, script-transition detection is disabled, and the
///   previous character is a CJK character that is not CJK punctuation.
fn is_word_boundary_basic(
    &self,
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
    context: &BoundaryContext,
) -> bool {
    let tj_break = match prev_char.tj_offset {
        None => false,
        Some(offset) => {
            let limit = if self.use_adaptive_threshold {
                self.calculate_tj_threshold(context)
            } else {
                self.tj_offset_threshold as f32
            };
            (offset as f32) < limit
        },
    };

    tj_break
        || self.has_significant_geometric_gap(prev_char, curr_char, context)
        || (self.cjk_enabled
            && !self.detect_script_transitions
            && self.is_cjk_character(prev_char.code)
            && !self.is_cjk_punctuation(prev_char.code))
}
fn should_split_at_complex_script_boundary(
&self,
prev_char: &CharacterInfo,
curr_char: &CharacterInfo,
) -> Option<bool> {
let prev_script = detect_complex_script(prev_char.code);
let curr_script = detect_complex_script(curr_char.code);
if prev_script.is_none() && curr_script.is_none() {
return None;
}
match (prev_script, curr_script) {
(Some(ComplexScript::Devanagari), _) | (_, Some(ComplexScript::Devanagari)) => {
handle_devanagari_boundary(prev_char, curr_char)
},
(Some(ComplexScript::Thai), _) | (_, Some(ComplexScript::Thai)) => {
handle_thai_boundary(prev_char, curr_char)
},
(Some(ComplexScript::Khmer), _) | (_, Some(ComplexScript::Khmer)) => {
handle_khmer_boundary(prev_char, curr_char)
},
(Some(ComplexScript::Tamil), _)
| (_, Some(ComplexScript::Tamil))
| (Some(ComplexScript::Telugu), _)
| (_, Some(ComplexScript::Telugu))
| (Some(ComplexScript::Kannada), _)
| (_, Some(ComplexScript::Kannada))
| (Some(ComplexScript::Malayalam), _)
| (_, Some(ComplexScript::Malayalam))
| (Some(ComplexScript::Bengali), _)
| (_, Some(ComplexScript::Bengali)) => handle_indic_boundary(prev_char, curr_char),
_ => None,
}
}
/// Applies the CJK-specific rules to a character pair.
///
/// Returns `Some(true)` when the previous character's CJK punctuation boundary score
/// is at least `0.9`, `None` when neither character is detected as a CJK script, and
/// otherwise whatever the language-specific handler decides (Japanese or Korean when
/// that language hint is set, the generic script-transition rule for Chinese or no
/// hint).
fn should_split_at_cjk_boundary(
    &self,
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
) -> Option<bool> {
    let punctuation_score =
        cjk_punctuation::get_cjk_punctuation_boundary_score(prev_char.code, None);
    if punctuation_score >= 0.9 {
        return Some(true);
    }

    let prev_script = detect_cjk_script(prev_char.code);
    let curr_script = detect_cjk_script(curr_char.code);
    if !(prev_script.is_some() || curr_script.is_some()) {
        return None;
    }

    let language = self.document_language;
    if let Some(DocumentLanguage::Japanese) = language {
        return handle_japanese_text(prev_char, curr_char, prev_script, curr_script);
    }
    if let Some(DocumentLanguage::Korean) = language {
        return handle_korean_text(prev_char, curr_char, prev_script, curr_script);
    }
    // Chinese, or no language hint at all.
    should_split_on_script_transition(prev_script, curr_script, language)
}
/// Returns `true` when either character is a ligature: its code is one of the Latin
/// presentation-form ligatures `U+FB00..=U+FB06`, or its `is_ligature` flag is set.
fn is_ligature_internal_gap(
    &self,
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
) -> bool {
    let touches_ligature =
        |ch: &CharacterInfo| (0xFB00..=0xFB06).contains(&ch.code) || ch.is_ligature;
    touches_ligature(prev_char) || touches_ligature(curr_char)
}
/// Returns `true` for the punctuation code points that get a reduced geometric gap
/// threshold: ASCII `! " ' , . : ; ?`, the dashes `U+2010..=U+2015`, and the
/// quotation marks `U+2018..=U+201F`.
pub fn is_punctuation(code: u32) -> bool {
    const ASCII_MARKS: [u32; 8] = [0x21, 0x22, 0x27, 0x2C, 0x2E, 0x3A, 0x3B, 0x3F];
    let is_dash = (0x2010..=0x2015).contains(&code);
    let is_quote = (0x2018..=0x201F).contains(&code);
    ASCII_MARKS.contains(&code) || is_dash || is_quote
}
/// Tests whether the horizontal gap between two characters is large enough to be a
/// word boundary.
///
/// The gap is measured from the previous character's trailing edge
/// (`x_position + width`) to the current character's `x_position`, minus the
/// context's character spacing. It must exceed the effective font size times the
/// geometric gap ratio; when the current character is punctuation only half of that
/// is required. Pairs touching a ligature never qualify.
fn has_significant_geometric_gap(
    &self,
    prev_char: &CharacterInfo,
    curr_char: &CharacterInfo,
    context: &BoundaryContext,
) -> bool {
    if self.is_ligature_internal_gap(prev_char, curr_char) {
        return false;
    }

    let gap = {
        let prev_right_edge = prev_char.x_position + prev_char.width;
        (curr_char.x_position - prev_right_edge) - context.char_spacing
    };

    let full_threshold = context.effective_font_size() * self.geometric_gap_ratio;
    let threshold = if Self::is_punctuation(curr_char.code) {
        full_threshold * 0.5
    } else {
        full_threshold
    };
    gap > threshold
}
/// Returns `true` when `code` falls in Hiragana, Katakana, CJK Unified Ideographs or
/// one of the ideograph extension ranges listed below. Hangul is not included.
fn is_cjk_character(&self, code: u32) -> bool {
    const CJK_RANGES: [(u32, u32); 9] = [
        (0x3040, 0x309F),
        (0x30A0, 0x30FF),
        (0x3400, 0x4DBF),
        (0x4E00, 0x9FFF),
        (0x20000, 0x2A6DF),
        (0x2A700, 0x2B73F),
        (0x2B740, 0x2B81F),
        (0x2B820, 0x2CEAF),
        (0x2CEB0, 0x2EBEF),
    ];
    CJK_RANGES
        .iter()
        .any(|&(first, last)| (first..=last).contains(&code))
}
/// Returns `true` for the CJK punctuation marks after which no per-character split is
/// made: `U+3001`–`U+3002`, the brackets `U+3008`–`U+3011`, and `U+3014`–`U+3015`.
fn is_cjk_punctuation(&self, code: u32) -> bool {
    matches!(code, 0x3001..=0x3002 | 0x3008..=0x3011 | 0x3014..=0x3015)
}
}
/// Convenience wrapper: runs a detector with the default configuration over
/// `characters` and returns the indices at which new words start.
pub fn detect_word_boundaries(
    characters: &[CharacterInfo],
    context: &BoundaryContext,
) -> Vec<usize> {
    WordBoundaryDetector::new().detect_word_boundaries(characters, context)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Font size shared by every fixture and context in this module.
    const FONT_SIZE: f32 = 12.0;

    /// Builds a plain (non-ligature, unprotected) character at `FONT_SIZE`.
    fn glyph(
        code: u32,
        glyph_id: Option<u16>,
        width: f32,
        x_position: f32,
        tj_offset: Option<i32>,
    ) -> CharacterInfo {
        CharacterInfo {
            code,
            glyph_id,
            width,
            x_position,
            tj_offset,
            font_size: FONT_SIZE,
            is_ligature: false,
            original_ligature: None,
            protected_from_split: false,
        }
    }

    /// Builds a character without glyph id or TJ offset, as used by the pairwise tests.
    fn letter(code: char, width: f32, x_position: f32) -> CharacterInfo {
        glyph(code as u32, None, width, x_position, None)
    }

    /// Builds one component of a decomposed ligature. `ligature` is the original
    /// presentation-form code point; it is written as a `\u{...}` escape at the call
    /// sites so that Unicode normalization of the source cannot expand it.
    fn ligature_part(component: char, width: f32, x_position: f32, ligature: char) -> CharacterInfo {
        CharacterInfo {
            is_ligature: true,
            original_ligature: Some(ligature),
            ..letter(component, width, x_position)
        }
    }

    /// Runs a default detector with a default 12pt context over `characters`.
    fn boundaries_at_12pt(characters: &[CharacterInfo]) -> Vec<usize> {
        WordBoundaryDetector::new()
            .detect_word_boundaries(characters, &BoundaryContext::new(FONT_SIZE))
    }

    /// "He W": two letters, a space, then a letter.
    fn letters_space_letter() -> Vec<CharacterInfo> {
        vec![
            glyph(0x48, Some(1), 0.5, 0.0, None),
            glyph(0x65, Some(2), 0.4, 6.0, None),
            glyph(0x20, Some(5), 0.25, 10.8, None),
            glyph(0x57, Some(6), 0.7, 16.2, None),
        ]
    }

    /// Three characters where the middle one carries a TJ offset of -200.
    fn run_with_large_tj_offset() -> Vec<CharacterInfo> {
        vec![
            glyph(0x54, Some(1), 0.5, 0.0, None),
            glyph(0x2D, Some(5), 0.25, 6.0, Some(-200)),
            glyph(0x6F, Some(6), 0.4, 18.0, None),
        ]
    }

    #[test]
    fn test_ascii_space_detection() {
        let boundaries = boundaries_at_12pt(&letters_space_letter());
        assert!(boundaries.contains(&3));
    }

    #[test]
    fn test_tj_offset_threshold() {
        let boundaries = boundaries_at_12pt(&run_with_large_tj_offset());
        assert!(boundaries.contains(&2));
    }

    #[test]
    fn test_geometric_gap_detection() {
        let characters = vec![
            glyph(0x54, Some(1), 0.5, 0.0, None),
            glyph(0x65, Some(2), 0.4, 6.0, None),
            glyph(0x78, Some(3), 0.4, 10.8, None),
            glyph(0x74, Some(4), 0.3, 15.6, None),
            glyph(0x42, Some(5), 0.5, 27.0, None),
        ];
        let boundaries = boundaries_at_12pt(&characters);
        assert!(boundaries.contains(&4), "Expected boundary at index 4, got: {:?}", boundaries);
    }

    #[test]
    fn test_cjk_character_boundaries() {
        let characters = vec![
            glyph(0x4E2D, Some(1), 1.0, 0.0, None),
            glyph(0x6587, Some(2), 1.0, 12.0, None),
            glyph(0x5B57, Some(3), 1.0, 24.0, None),
        ];
        let boundaries = boundaries_at_12pt(&characters);
        assert!(boundaries.contains(&1), "Expected boundary at index 1");
        assert!(boundaries.contains(&2), "Expected boundary at index 2");
    }

    #[test]
    fn test_zero_width_space() {
        let characters = vec![
            glyph(0x6E, Some(1), 0.4, 0.0, None),
            glyph(0x200B, Some(2), 0.0, 4.8, None),
            glyph(0x72, Some(3), 0.3, 4.8, None),
        ];
        let boundaries = boundaries_at_12pt(&characters);
        assert!(boundaries.contains(&2));
    }

    #[test]
    fn test_horizontal_scaling_affects_gap_threshold() {
        let characters = vec![
            glyph(0x41, Some(1), 0.5, 0.0, None),
            glyph(0x42, Some(2), 0.5, 8.0, None),
        ];
        let mut context = BoundaryContext::new(FONT_SIZE);
        context.horizontal_scaling = 100.0;
        let boundaries_normal =
            WordBoundaryDetector::new().detect_word_boundaries(&characters, &context);
        context.horizontal_scaling = 75.0;
        let boundaries_scaled =
            WordBoundaryDetector::new().detect_word_boundaries(&characters, &context);
        assert!(boundaries_normal.is_empty(), "Should have no boundaries at 100% scaling");
        assert!(boundaries_scaled.contains(&1), "Should have boundary at 75% scaling");
    }

    #[test]
    fn test_detect_word_boundaries_ascii_space() {
        let boundaries = boundaries_at_12pt(&letters_space_letter());
        assert!(boundaries.contains(&3), "Should have boundary after space");
    }

    #[test]
    fn test_detect_word_boundaries_tj_offset() {
        let boundaries = boundaries_at_12pt(&run_with_large_tj_offset());
        assert!(boundaries.contains(&2), "Should have boundary after large TJ offset");
    }

    #[test]
    fn test_detect_word_boundaries_cjk() {
        let characters = vec![
            glyph(0x4E2D, Some(1), 1.0, 0.0, None),
            glyph(0x6587, Some(2), 1.0, 12.0, None),
        ];
        let boundaries = boundaries_at_12pt(&characters);
        assert!(boundaries.contains(&1), "Should have boundary after first CJK character");
    }

    #[test]
    fn test_calculate_tj_threshold_default_font() {
        let detector = WordBoundaryDetector::new();
        let context = BoundaryContext::new(12.0);
        let threshold = detector.calculate_tj_threshold(&context);
        assert!(
            (threshold - (-0.3)).abs() < 0.01,
            "12pt font should give -0.3, got {}",
            threshold
        );
    }

    #[test]
    fn test_calculate_tj_threshold_large_font() {
        let detector = WordBoundaryDetector::new();
        let context = BoundaryContext::new(24.0);
        let threshold = detector.calculate_tj_threshold(&context);
        assert!(
            (threshold - (-0.6)).abs() < 0.01,
            "24pt font should give -0.6, got {}",
            threshold
        );
    }

    #[test]
    fn test_calculate_tj_threshold_with_char_spacing() {
        let detector = WordBoundaryDetector::new();
        let mut context = BoundaryContext::new(12.0);
        context.char_spacing = 2.0;
        let threshold = detector.calculate_tj_threshold(&context);
        assert!(
            (threshold - (-1.3)).abs() < 0.01,
            "With char_spacing=2.0, expected -1.3, got {}",
            threshold
        );
    }

    #[test]
    fn test_calculate_tj_threshold_with_word_spacing() {
        let detector = WordBoundaryDetector::new();
        let mut context = BoundaryContext::new(12.0);
        context.word_spacing = 3.0;
        let threshold = detector.calculate_tj_threshold(&context);
        assert!(
            (threshold - (-1.8)).abs() < 0.01,
            "With word_spacing=3.0, expected -1.8, got {}",
            threshold
        );
    }

    #[test]
    fn test_calculate_tj_threshold_with_horizontal_scaling() {
        let detector = WordBoundaryDetector::new();
        let mut context = BoundaryContext::new(12.0);
        context.horizontal_scaling = 80.0;
        let threshold = detector.calculate_tj_threshold(&context);
        assert!(
            (threshold - (-0.24)).abs() < 0.01,
            "With 80% scaling, expected -0.24, got {}",
            threshold
        );
    }

    #[test]
    fn test_adaptive_threshold_affects_boundary_detection() {
        let detector = WordBoundaryDetector::new().with_adaptive_threshold(true);
        let context = BoundaryContext::new(12.0);
        let prev = glyph('t' as u32, None, 5.0, 100.0, Some(-200));
        let curr = letter('h', 5.0, 110.0);
        let boundary = detector.is_word_boundary(&prev, &curr, &context);
        assert!(boundary, "TJ offset -200 should trigger boundary with 12pt font");
    }

    #[test]
    fn test_disable_adaptive_threshold_uses_static() {
        let detector = WordBoundaryDetector::new()
            .with_adaptive_threshold(false)
            .with_tj_threshold(-100);
        let context = BoundaryContext::new(12.0);
        let prev = glyph('a' as u32, None, 5.0, 100.0, Some(-50));
        let curr = letter('b', 5.0, 110.0);
        let boundary = detector.is_word_boundary(&prev, &curr, &context);
        assert!(!boundary, "TJ offset -50 should NOT trigger boundary when static -100 is used");
    }

    #[test]
    fn test_geometric_gap_basic() {
        let detector = WordBoundaryDetector::new();
        let context = BoundaryContext::new(12.0);
        let prev = letter('t', 5.0, 100.0);
        let curr = letter('h', 5.0, 115.0);
        assert!(
            detector.has_significant_geometric_gap(&prev, &curr, &context),
            "Gap of 10 units should exceed threshold of 9.6 (12pt * 0.8)"
        );
    }

    #[test]
    fn test_geometric_gap_with_char_spacing() {
        let detector = WordBoundaryDetector::new();
        let mut context = BoundaryContext::new(12.0);
        context.char_spacing = 2.0;
        let prev = letter('a', 5.0, 100.0);
        let curr = letter('b', 5.0, 115.0);
        assert!(
            !detector.has_significant_geometric_gap(&prev, &curr, &context),
            "Gap of 10 - 2.0 (Tc) = 8.0 should NOT exceed threshold of 9.6"
        );
    }

    #[test]
    fn test_ligature_internal_gap_fi() {
        let detector = WordBoundaryDetector::new();
        let context = BoundaryContext::new(12.0);
        // U+FB01 LATIN SMALL LIGATURE FI, decomposed into 'f' + 'i'.
        let prev = ligature_part('f', 5.0, 100.0, '\u{FB01}');
        let curr = ligature_part('i', 3.0, 120.0, '\u{FB01}');
        assert!(
            !detector.has_significant_geometric_gap(&prev, &curr, &context),
            "Ligature internal gap should NOT create boundary even with large gap"
        );
    }

    #[test]
    fn test_punctuation_reduced_threshold() {
        let detector = WordBoundaryDetector::new();
        let context = BoundaryContext::new(12.0);
        let prev = letter('d', 5.0, 100.0);
        let curr_period = letter('.', 2.0, 111.0);
        assert!(
            detector.has_significant_geometric_gap(&prev, &curr_period, &context),
            "Gap of 6 units should exceed punctuation threshold of 4.8 (50% of 9.6)"
        );
    }

    #[test]
    fn test_punctuation_does_not_trigger_on_normal_text() {
        let detector = WordBoundaryDetector::new();
        let context = BoundaryContext::new(12.0);
        let prev = letter('d', 5.0, 100.0);
        let curr = letter('e', 5.0, 111.0);
        assert!(
            !detector.has_significant_geometric_gap(&prev, &curr, &context),
            "Gap of 6 units should NOT exceed normal threshold of 9.6"
        );
    }

    #[test]
    fn test_is_punctuation_ascii() {
        assert!(WordBoundaryDetector::is_punctuation('.' as u32));
        assert!(WordBoundaryDetector::is_punctuation(',' as u32));
        assert!(WordBoundaryDetector::is_punctuation('!' as u32));
        assert!(WordBoundaryDetector::is_punctuation('?' as u32));
        assert!(WordBoundaryDetector::is_punctuation(':' as u32));
        assert!(WordBoundaryDetector::is_punctuation(';' as u32));
    }

    #[test]
    fn test_is_punctuation_non_punctuation() {
        assert!(!WordBoundaryDetector::is_punctuation('a' as u32));
        assert!(!WordBoundaryDetector::is_punctuation('1' as u32));
        assert!(!WordBoundaryDetector::is_punctuation(' ' as u32));
    }

    #[test]
    fn test_is_ligature_internal_gap_ffi() {
        let detector = WordBoundaryDetector::new();
        // U+FB04 LATIN SMALL LIGATURE FFL; both components carry the ligature flag.
        let prev = ligature_part('f', 5.0, 100.0, '\u{FB04}');
        let curr = ligature_part('f', 5.0, 110.0, '\u{FB04}');
        assert!(
            detector.is_ligature_internal_gap(&prev, &curr),
            "Should detect ligature internal gap when both have is_ligature=true"
        );
    }

    #[test]
    fn test_is_ligature_internal_gap_actual_ligature_code() {
        let detector = WordBoundaryDetector::new();
        // The previous character is the ligature code point itself, with no flag set.
        let prev = glyph(0xFB00, None, 10.0, 100.0, None);
        let curr = letter('i', 3.0, 115.0);
        assert!(
            detector.is_ligature_internal_gap(&prev, &curr),
            "Should detect ligature internal gap when prev code is U+FB00"
        );
    }
}