Skip to main content

ftui_text/
script_segmentation.rs

1#![forbid(unsafe_code)]
2
3//! Script segmentation and bidi-safe text run partitioning.
4//!
5//! This module provides deterministic text-run segmentation by Unicode script,
6//! bidi direction, and style — preparing robust shaping inputs and consistent
7//! cache keys for the downstream HarfBuzz shaping pipeline.
8//!
9//! # Design
10//!
11//! Text shaping engines (HarfBuzz, CoreText, DirectWrite) require input to be
12//! split into runs that share the same **script**, **direction**, and **style**.
13//! Mixing scripts in a single shaping call produces incorrect glyph selection
14//! and positioning.
15//!
16//! This module implements a three-phase algorithm:
17//!
18//! 1. **Raw classification** — assign each character its Unicode script via
19//!    block-range lookup (`char_script`).
20//! 2. **Common/Inherited resolution** — resolve `Common` and `Inherited`
21//!    characters by propagating adjacent specific scripts (UAX#24-inspired).
22//! 3. **Run grouping** — collect contiguous characters sharing the same
23//!    resolved script into [`ScriptRun`] spans.
24//!
25//! The [`TextRun`] type further subdivides by direction and style, producing
26//! the atomic units suitable for shaping. [`RunCacheKey`] provides a
27//! deterministic, hashable identifier for caching shaped glyph output.
28//!
29//! # Example
30//!
31//! ```
32//! use ftui_text::script_segmentation::{Script, ScriptRun, partition_by_script};
33//!
34//! let runs = partition_by_script("Hello مرحبا World");
35//! assert!(runs.len() >= 2); // At least Latin and Arabic runs
36//! assert_eq!(runs[0].script, Script::Latin);
37//! ```
38
39use std::hash::{Hash, Hasher};
40
41// ---------------------------------------------------------------------------
42// Script enum
43// ---------------------------------------------------------------------------
44
/// Unicode script tag attached to characters and runs before shaping.
///
/// The variants cover the scripts most often seen in terminal and UI text;
/// anything the classifier does not recognise is reported as `Unknown`.
///
/// Two variants are "weak": `Common` (script-neutral characters such as
/// spaces, digits and ASCII punctuation) and `Inherited` (combining marks
/// that take on the script of their base character). Run partitioning
/// replaces both with a concrete script.
///
/// Discriminants follow declaration order, starting at zero for `Common`.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(u8)]
pub enum Script {
    /// Script-neutral characters: spaces, digits, basic punctuation, symbols.
    Common,
    /// Combining marks that adopt the script of their base character.
    Inherited,
    /// Latin (English, French, German, Vietnamese, ...).
    Latin,
    /// Greek.
    Greek,
    /// Cyrillic (Russian, Ukrainian, Bulgarian, ...).
    Cyrillic,
    /// Armenian.
    Armenian,
    /// Hebrew.
    Hebrew,
    /// Arabic (Arabic, Persian, Urdu, ...).
    Arabic,
    /// Syriac.
    Syriac,
    /// Thaana (Maldivian).
    Thaana,
    /// Devanagari (Hindi, Sanskrit, Marathi, ...).
    Devanagari,
    /// Bengali.
    Bengali,
    /// Gurmukhi (Punjabi).
    Gurmukhi,
    /// Gujarati.
    Gujarati,
    /// Oriya.
    Oriya,
    /// Tamil.
    Tamil,
    /// Telugu.
    Telugu,
    /// Kannada.
    Kannada,
    /// Malayalam.
    Malayalam,
    /// Sinhala.
    Sinhala,
    /// Thai.
    Thai,
    /// Lao.
    Lao,
    /// Tibetan.
    Tibetan,
    /// Myanmar (Burmese).
    Myanmar,
    /// Georgian.
    Georgian,
    /// Hangul (Korean).
    Hangul,
    /// Ethiopic (Amharic, Tigrinya, ...).
    Ethiopic,
    /// Han ideographs (Chinese, Japanese Kanji, Korean Hanja).
    Han,
    /// Hiragana (Japanese).
    Hiragana,
    /// Katakana (Japanese).
    Katakana,
    /// Bopomofo (Chinese phonetic notation).
    Bopomofo,
    /// Any script the classifier does not recognise.
    Unknown,
}
122
123impl Script {
124    /// Whether this is a "weak" script that should be resolved from context.
125    #[inline]
126    pub const fn is_common_or_inherited(self) -> bool {
127        matches!(self, Script::Common | Script::Inherited)
128    }
129
130    /// Whether this script is typically written right-to-left.
131    #[inline]
132    pub const fn is_rtl(self) -> bool {
133        matches!(
134            self,
135            Script::Arabic | Script::Hebrew | Script::Syriac | Script::Thaana
136        )
137    }
138}
139
140// ---------------------------------------------------------------------------
141// Character-to-script detection
142// ---------------------------------------------------------------------------
143
144/// Classify a character's Unicode script via block-range lookup.
145///
146/// This uses hardcoded Unicode block ranges rather than an external crate,
147/// keeping the dependency footprint minimal. Coverage targets the scripts
148/// most commonly encountered in terminal/UI text. Characters outside
149/// recognized ranges return `Script::Unknown`.
150#[inline]
151pub fn char_script(c: char) -> Script {
152    let cp = c as u32;
153    match cp {
154        // ASCII and Basic Latin
155        // Letters are Latin; digits, punctuation, symbols are Common
156        0x0000..=0x0040 => Script::Common, // Controls, space, ! " # $ % & ' ( ) * + , - . / 0-9 : ; < = > ? @
157        0x0041..=0x005A => Script::Latin,  // A-Z
158        0x005B..=0x0060 => Script::Common, // [ \ ] ^ _ `
159        0x0061..=0x007A => Script::Latin,  // a-z
160        0x007B..=0x00BF => Script::Common, // { | } ~ DEL, Latin-1 Supplement (controls, symbols, punctuation)
161        0x00C0..=0x00D6 => Script::Latin,  // À-Ö
162        0x00D7 => Script::Common,          // ×
163        0x00D8..=0x00F6 => Script::Latin,  // Ø-ö
164        0x00F7 => Script::Common,          // ÷
165        0x00F8..=0x024F => Script::Latin,  // ø-ɏ (Latin Extended-A & B)
166        0x0250..=0x02AF => Script::Latin,  // IPA Extensions (Latin)
167        0x02B0..=0x02FF => Script::Common, // Spacing Modifier Letters
168        0x0300..=0x036F => Script::Inherited, // Combining Diacritical Marks
169
170        // Greek and Coptic
171        0x0370..=0x03FF => Script::Greek,
172        0x1F00..=0x1FFF => Script::Greek, // Greek Extended
173
174        // Cyrillic
175        0x0400..=0x04FF => Script::Cyrillic,
176        0x0500..=0x052F => Script::Cyrillic, // Cyrillic Supplement
177        0x2DE0..=0x2DFF => Script::Cyrillic, // Cyrillic Extended-A
178        0xA640..=0xA69F => Script::Cyrillic, // Cyrillic Extended-B
179        0x1C80..=0x1C8F => Script::Cyrillic, // Cyrillic Extended-C
180
181        // Armenian
182        0x0530..=0x058F => Script::Armenian,
183        0xFB13..=0xFB17 => Script::Armenian, // Armenian ligatures
184
185        // Hebrew
186        0x0590..=0x05FF => Script::Hebrew,
187        0xFB1D..=0xFB4F => Script::Hebrew, // Hebrew Presentation Forms
188
189        // Arabic
190        0x0600..=0x06FF => Script::Arabic,
191        0x0750..=0x077F => Script::Arabic, // Arabic Supplement
192        0x08A0..=0x08FF => Script::Arabic, // Arabic Extended-A
193        0xFB50..=0xFDFF => Script::Arabic, // Arabic Presentation Forms-A
194        0xFE70..=0xFEFF => Script::Arabic, // Arabic Presentation Forms-B
195
196        // Syriac
197        0x0700..=0x074F => Script::Syriac,
198        0x0860..=0x086F => Script::Syriac, // Syriac Supplement
199
200        // Thaana
201        0x0780..=0x07BF => Script::Thaana,
202
203        // Devanagari
204        0x0900..=0x097F => Script::Devanagari,
205        0xA8E0..=0xA8FF => Script::Devanagari, // Devanagari Extended
206
207        // Bengali
208        0x0980..=0x09FF => Script::Bengali,
209
210        // Gurmukhi
211        0x0A00..=0x0A7F => Script::Gurmukhi,
212
213        // Gujarati
214        0x0A80..=0x0AFF => Script::Gujarati,
215
216        // Oriya
217        0x0B00..=0x0B7F => Script::Oriya,
218
219        // Tamil
220        0x0B80..=0x0BFF => Script::Tamil,
221
222        // Telugu
223        0x0C00..=0x0C7F => Script::Telugu,
224
225        // Kannada
226        0x0C80..=0x0CFF => Script::Kannada,
227
228        // Malayalam
229        0x0D00..=0x0D7F => Script::Malayalam,
230
231        // Sinhala
232        0x0D80..=0x0DFF => Script::Sinhala,
233
234        // Thai
235        0x0E00..=0x0E7F => Script::Thai,
236
237        // Lao
238        0x0E80..=0x0EFF => Script::Lao,
239
240        // Tibetan
241        0x0F00..=0x0FFF => Script::Tibetan,
242
243        // Myanmar
244        0x1000..=0x109F => Script::Myanmar,
245        0xAA60..=0xAA7F => Script::Myanmar, // Myanmar Extended-A
246
247        // Georgian
248        0x10A0..=0x10FF => Script::Georgian,
249        0x2D00..=0x2D2F => Script::Georgian, // Georgian Supplement
250        0x1C90..=0x1CBF => Script::Georgian, // Georgian Extended
251
252        // Hangul
253        0x1100..=0x11FF => Script::Hangul, // Hangul Jamo
254        0x3130..=0x318F => Script::Hangul, // Hangul Compatibility Jamo
255        0xA960..=0xA97F => Script::Hangul, // Hangul Jamo Extended-A
256        0xAC00..=0xD7AF => Script::Hangul, // Hangul Syllables
257        0xD7B0..=0xD7FF => Script::Hangul, // Hangul Jamo Extended-B
258
259        // Ethiopic
260        0x1200..=0x137F => Script::Ethiopic,
261        0x1380..=0x139F => Script::Ethiopic, // Ethiopic Supplement
262        0x2D80..=0x2DDF => Script::Ethiopic, // Ethiopic Extended
263        0xAB00..=0xAB2F => Script::Ethiopic, // Ethiopic Extended-A
264
265        // Latin Extended Additional / Extended-C / Extended-D / Extended-E
266        0x1E00..=0x1EFF => Script::Latin, // Latin Extended Additional
267        0x2C60..=0x2C7F => Script::Latin, // Latin Extended-C
268        0xA720..=0xA7FF => Script::Latin, // Latin Extended-D
269        0xAB30..=0xAB6F => Script::Latin, // Latin Extended-E
270        0xFB00..=0xFB06 => Script::Latin, // Latin ligatures
271
272        // CJK / Han
273        0x2E80..=0x2EFF => Script::Han,   // CJK Radicals Supplement
274        0x2F00..=0x2FDF => Script::Han,   // Kangxi Radicals
275        0x3400..=0x4DBF => Script::Han,   // CJK Unified Ideographs Extension A
276        0x4E00..=0x9FFF => Script::Han,   // CJK Unified Ideographs
277        0xF900..=0xFAFF => Script::Han,   // CJK Compatibility Ideographs
278        0x20000..=0x2A6DF => Script::Han, // CJK Extension B
279        0x2A700..=0x2B73F => Script::Han, // CJK Extension C
280        0x2B740..=0x2B81F => Script::Han, // CJK Extension D
281        0x2B820..=0x2CEAF => Script::Han, // CJK Extension E
282        0x2CEB0..=0x2EBEF => Script::Han, // CJK Extension F
283        0x30000..=0x3134F => Script::Han, // CJK Extension G
284
285        // Hiragana
286        0x3040..=0x309F => Script::Hiragana,
287        0x1B001..=0x1B11F => Script::Hiragana, // Hiragana Extended
288
289        // Katakana
290        0x30A0..=0x30FF => Script::Katakana,
291        0x31F0..=0x31FF => Script::Katakana, // Katakana Phonetic Extensions
292        0xFF65..=0xFF9F => Script::Katakana, // Halfwidth Katakana
293
294        // Bopomofo
295        0x3100..=0x312F => Script::Bopomofo,
296        0x31A0..=0x31BF => Script::Bopomofo, // Bopomofo Extended
297
298        // CJK symbols and punctuation — Common (shared across CJK scripts)
299        0x3000..=0x303F => Script::Common,
300
301        // General Punctuation, Superscripts, Currency, Letterlike, Number Forms
302        0x2000..=0x206F => Script::Common, // General Punctuation
303        0x2070..=0x209F => Script::Common, // Superscripts and Subscripts
304        0x20A0..=0x20CF => Script::Common, // Currency Symbols
305        0x20D0..=0x20FF => Script::Inherited, // Combining Marks for Symbols
306        0x2100..=0x214F => Script::Common, // Letterlike Symbols
307        0x2150..=0x218F => Script::Common, // Number Forms
308        0x2190..=0x21FF => Script::Common, // Arrows
309        0x2200..=0x22FF => Script::Common, // Mathematical Operators
310        0x2300..=0x23FF => Script::Common, // Miscellaneous Technical
311        0x2400..=0x243F => Script::Common, // Control Pictures
312        0x2440..=0x245F => Script::Common, // OCR
313        0x2460..=0x24FF => Script::Common, // Enclosed Alphanumerics
314        0x2500..=0x257F => Script::Common, // Box Drawing
315        0x2580..=0x259F => Script::Common, // Block Elements
316        0x25A0..=0x25FF => Script::Common, // Geometric Shapes
317        0x2600..=0x26FF => Script::Common, // Miscellaneous Symbols
318        0x2700..=0x27BF => Script::Common, // Dingbats
319        0x27C0..=0x27EF => Script::Common, // Misc Mathematical Symbols-A
320        0x27F0..=0x27FF => Script::Common, // Supplemental Arrows-A
321        0x2800..=0x28FF => Script::Common, // Braille Patterns
322        0x2900..=0x297F => Script::Common, // Supplemental Arrows-B
323        0x2980..=0x29FF => Script::Common, // Misc Mathematical Symbols-B
324        0x2A00..=0x2AFF => Script::Common, // Supplemental Mathematical Operators
325        0x2B00..=0x2BFF => Script::Common, // Miscellaneous Symbols and Arrows
326
327        // Halfwidth and Fullwidth Forms (Latin part)
328        0xFF01..=0xFF5E => Script::Latin, // Fullwidth ASCII variants
329        0xFF61..=0xFF64 => Script::Common, // Halfwidth CJK punctuation
330
331        // Emoji and symbols (Common)
332        0xFE00..=0xFE0F => Script::Inherited, // Variation Selectors
333        0xE0100..=0xE01EF => Script::Inherited, // Variation Selectors Supplement
334        0x1F000..=0x1FAFF => Script::Common,  // Emoji and symbols blocks
335        0xFE10..=0xFE1F => Script::Common,    // Vertical Forms
336        0xFE20..=0xFE2F => Script::Inherited, // Combining Half Marks
337        0xFE30..=0xFE4F => Script::Common,    // CJK Compatibility Forms
338        0xFE50..=0xFE6F => Script::Common,    // Small Form Variants
339
340        // NKo
341        0x07C0..=0x07FF => Script::Arabic, // Treat NKo as Arabic for shaping
342
343        // Fallback
344        _ => Script::Unknown,
345    }
346}
347
348// ---------------------------------------------------------------------------
349// ScriptRun
350// ---------------------------------------------------------------------------
351
352/// A contiguous run of characters sharing the same resolved script.
353///
354/// Indices are byte offsets into the source string for efficient slicing.
355#[derive(Debug, Clone, PartialEq, Eq)]
356pub struct ScriptRun {
357    /// Start byte offset (inclusive) in the source string.
358    pub start: usize,
359    /// End byte offset (exclusive) in the source string.
360    pub end: usize,
361    /// Resolved script for this run.
362    pub script: Script,
363}
364
365impl ScriptRun {
366    /// The byte length of this run.
367    #[inline]
368    pub fn len(&self) -> usize {
369        self.end - self.start
370    }
371
372    /// Whether the run is empty.
373    #[inline]
374    pub fn is_empty(&self) -> bool {
375        self.start == self.end
376    }
377
378    /// Extract the text slice from the source string.
379    #[inline]
380    pub fn text<'a>(&self, source: &'a str) -> &'a str {
381        &source[self.start..self.end]
382    }
383}
384
385// ---------------------------------------------------------------------------
386// Script resolution (Common/Inherited → specific)
387// ---------------------------------------------------------------------------
388
389/// Resolve Common and Inherited scripts to the nearest specific script.
390///
391/// Uses a two-pass approach:
392/// 1. Forward pass: Inherited characters take the script of the preceding
393///    specific character.
394/// 2. Backward pass: Common characters at the start take the script of
395///    the first following specific character. Common characters between
396///    specific runs are assigned to the preceding run.
397fn resolve_scripts(chars: &[char]) -> Vec<Script> {
398    let n = chars.len();
399    if n == 0 {
400        return Vec::new();
401    }
402
403    let mut scripts: Vec<Script> = chars.iter().map(|&c| char_script(c)).collect();
404
405    // Forward pass: resolve Inherited from the left.
406    // Also resolve Common that follows a specific script.
407    let mut last_specific = Script::Common;
408    for script in &mut scripts {
409        if *script == Script::Inherited {
410            *script = if last_specific.is_common_or_inherited() {
411                Script::Common // will be resolved in backward pass
412            } else {
413                last_specific
414            };
415        } else if !script.is_common_or_inherited() {
416            last_specific = *script;
417        }
418    }
419
420    // Backward pass: resolve remaining Common characters.
421    // Find the first specific script and backfill leading Common chars.
422    let first_specific = scripts
423        .iter()
424        .find(|s| !s.is_common_or_inherited())
425        .copied()
426        .unwrap_or(Script::Latin); // All-Common text defaults to Latin
427
428    // Assign leading Common characters the first specific script.
429    for script in &mut scripts {
430        if script.is_common_or_inherited() {
431            *script = first_specific;
432        } else {
433            break;
434        }
435    }
436
437    // Forward pass again: remaining Common chars take the preceding specific.
438    let mut current = first_specific;
439    for script in &mut scripts {
440        if script.is_common_or_inherited() {
441            *script = current;
442        } else {
443            current = *script;
444        }
445    }
446
447    scripts
448}
449
450// ---------------------------------------------------------------------------
451// partition_by_script
452// ---------------------------------------------------------------------------
453
454/// Partition text into contiguous runs of the same Unicode script.
455///
456/// Common characters (spaces, digits, punctuation) and Inherited characters
457/// (combining marks) are resolved to their surrounding script context using
458/// a UAX#24-inspired algorithm, preventing unnecessary run breaks at
459/// whitespace and punctuation boundaries.
460///
461/// Returns an empty vec for empty input.
462///
463/// # Example
464///
465/// ```
466/// use ftui_text::script_segmentation::{Script, partition_by_script};
467///
468/// let runs = partition_by_script("Hello World");
469/// assert_eq!(runs.len(), 1);
470/// assert_eq!(runs[0].script, Script::Latin);
471///
472/// // Mixed scripts produce multiple runs
473/// let runs = partition_by_script("Helloこんにちは");
474/// assert!(runs.len() >= 2);
475/// ```
476pub fn partition_by_script(text: &str) -> Vec<ScriptRun> {
477    if text.is_empty() {
478        return Vec::new();
479    }
480
481    let chars: Vec<char> = text.chars().collect();
482    let resolved = resolve_scripts(&chars);
483
484    let mut runs = Vec::new();
485    let mut byte_offset = 0;
486    let mut run_start = 0;
487    let mut current_script = resolved[0];
488
489    for (i, ch) in chars.iter().enumerate() {
490        let char_len = ch.len_utf8();
491
492        if resolved[i] != current_script {
493            runs.push(ScriptRun {
494                start: run_start,
495                end: byte_offset,
496                script: current_script,
497            });
498            run_start = byte_offset;
499            current_script = resolved[i];
500        }
501
502        byte_offset += char_len;
503    }
504
505    // Final run.
506    runs.push(ScriptRun {
507        start: run_start,
508        end: byte_offset,
509        script: current_script,
510    });
511
512    runs
513}
514
515// ---------------------------------------------------------------------------
516// TextRun (script + direction + style)
517// ---------------------------------------------------------------------------
518
/// Writing direction assigned to a text run.
///
/// Defined locally so that run partitioning does not require the `bidi`
/// feature. When bidi is enabled, use `Direction::from_bidi()` for conversion.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
pub enum RunDirection {
    /// The run reads left-to-right.
    Ltr,
    /// The run reads right-to-left.
    Rtl,
}
530
531/// A fully partitioned text run suitable for shaping.
532///
533/// Combines script, direction, and style identity into the atomic unit
534/// that a shaping engine processes. Each field boundary triggers a new run.
535#[derive(Debug, Clone, PartialEq, Eq)]
536pub struct TextRun {
537    /// Start byte offset (inclusive) in the source string.
538    pub start: usize,
539    /// End byte offset (exclusive) in the source string.
540    pub end: usize,
541    /// Resolved Unicode script.
542    pub script: Script,
543    /// Text direction for this run.
544    pub direction: RunDirection,
545    /// Opaque style discriminant for cache keying.
546    /// Two runs with different styles must be shaped separately even if
547    /// script and direction match (e.g., bold vs regular affects glyph selection).
548    pub style_id: u64,
549}
550
551impl TextRun {
552    /// The byte length of this run.
553    #[inline]
554    pub fn len(&self) -> usize {
555        self.end - self.start
556    }
557
558    /// Whether the run is empty.
559    #[inline]
560    pub fn is_empty(&self) -> bool {
561        self.start == self.end
562    }
563
564    /// Extract the text slice from the source string.
565    #[inline]
566    pub fn text<'a>(&self, source: &'a str) -> &'a str {
567        &source[self.start..self.end]
568    }
569
570    /// Produce a deterministic cache key for this run's shaped output.
571    #[inline]
572    pub fn cache_key<'a>(&self, source: &'a str) -> RunCacheKey<'a> {
573        RunCacheKey {
574            text: self.text(source),
575            script: self.script,
576            direction: self.direction,
577            style_id: self.style_id,
578        }
579    }
580}
581
582// ---------------------------------------------------------------------------
583// RunCacheKey
584// ---------------------------------------------------------------------------
585
586/// Deterministic, hashable cache key for shaped glyph output.
587///
588/// Two runs producing equal `RunCacheKey` values can share the same
589/// shaped glyph buffer, enabling efficient caching of shaping results.
590#[derive(Debug, Clone, PartialEq, Eq)]
591pub struct RunCacheKey<'a> {
592    /// The text content of the run.
593    pub text: &'a str,
594    /// The resolved script.
595    pub script: Script,
596    /// The text direction.
597    pub direction: RunDirection,
598    /// Style discriminant (e.g., hash of font weight + style + size).
599    pub style_id: u64,
600}
601
602impl Hash for RunCacheKey<'_> {
603    fn hash<H: Hasher>(&self, state: &mut H) {
604        self.text.hash(state);
605        self.script.hash(state);
606        self.direction.hash(state);
607        self.style_id.hash(state);
608    }
609}
610
611// ---------------------------------------------------------------------------
612// partition_text_runs — full run partitioning
613// ---------------------------------------------------------------------------
614
615/// Partition text into fully-resolved text runs by script and direction.
616///
617/// This is the primary entry point for preparing shaping input. Each
618/// returned [`TextRun`] has a uniform script, direction, and style,
619/// suitable for passing directly to a shaping engine.
620///
621/// `direction_fn` provides per-byte-offset direction resolution. If `None`,
622/// direction is inferred from the script's natural direction (RTL for
623/// Arabic/Hebrew/Syriac/Thaana, LTR for everything else).
624///
625/// `style_fn` provides a style discriminant for each byte offset. Runs
626/// are split whenever the style changes. If `None`, all text is treated
627/// as having the same style (style_id = 0).
628///
629/// # Example
630///
631/// ```
632/// use ftui_text::script_segmentation::{partition_text_runs, Script, RunDirection};
633///
634/// let runs = partition_text_runs("Hello World", None, None);
635/// assert_eq!(runs.len(), 1);
636/// assert_eq!(runs[0].script, Script::Latin);
637/// assert_eq!(runs[0].direction, RunDirection::Ltr);
638/// ```
639pub fn partition_text_runs(
640    text: &str,
641    direction_fn: Option<&dyn Fn(usize) -> RunDirection>,
642    style_fn: Option<&dyn Fn(usize) -> u64>,
643) -> Vec<TextRun> {
644    if text.is_empty() {
645        return Vec::new();
646    }
647
648    let script_runs = partition_by_script(text);
649
650    let default_direction = |script: Script| -> RunDirection {
651        if script.is_rtl() {
652            RunDirection::Rtl
653        } else {
654            RunDirection::Ltr
655        }
656    };
657
658    let mut runs = Vec::new();
659
660    for sr in &script_runs {
661        // Further subdivide each script run by direction and style.
662        let sub_text = &text[sr.start..sr.end];
663        let mut sub_start = sr.start;
664
665        let first_dir = direction_fn
666            .as_ref()
667            .map_or_else(|| default_direction(sr.script), |f| f(sr.start));
668        let first_style = style_fn.as_ref().map_or(0u64, |f| f(sr.start));
669
670        let mut current_dir = first_dir;
671        let mut current_style = first_style;
672
673        for (i, ch) in sub_text.char_indices() {
674            let byte_pos = sr.start + i;
675            let dir = direction_fn
676                .as_ref()
677                .map_or_else(|| default_direction(sr.script), |f| f(byte_pos));
678            let style = style_fn.as_ref().map_or(0u64, |f| f(byte_pos));
679
680            if dir != current_dir || style != current_style {
681                // Emit run up to this point.
682                if byte_pos > sub_start {
683                    runs.push(TextRun {
684                        start: sub_start,
685                        end: byte_pos,
686                        script: sr.script,
687                        direction: current_dir,
688                        style_id: current_style,
689                    });
690                }
691                sub_start = byte_pos;
692                current_dir = dir;
693                current_style = style;
694            }
695
696            // Advance past this character (handled by char_indices).
697            let _ = ch;
698        }
699
700        // Final sub-run.
701        if sr.end > sub_start {
702            runs.push(TextRun {
703                start: sub_start,
704                end: sr.end,
705                script: sr.script,
706                direction: current_dir,
707                style_id: current_style,
708            });
709        }
710    }
711
712    runs
713}
714
715// ===========================================================================
716// Tests
717// ===========================================================================
718
719#[cfg(test)]
720mod tests {
721    use super::*;
722
723    // -----------------------------------------------------------------------
724    // char_script tests
725    // -----------------------------------------------------------------------
726
727    #[test]
728    fn script_ascii_letters() {
729        assert_eq!(char_script('A'), Script::Latin);
730        assert_eq!(char_script('z'), Script::Latin);
731        assert_eq!(char_script('M'), Script::Latin);
732    }
733
734    #[test]
735    fn script_ascii_digits_are_common() {
736        for d in '0'..='9' {
737            assert_eq!(char_script(d), Script::Common, "digit {d}");
738        }
739    }
740
741    #[test]
742    fn script_ascii_punctuation_is_common() {
743        for &c in &[' ', '!', '.', ',', ':', ';', '?', '-', '(', ')', '[', ']'] {
744            assert_eq!(char_script(c), Script::Common, "char {c:?}");
745        }
746    }
747
748    #[test]
749    fn script_latin_extended() {
750        assert_eq!(char_script('\u{00C0}'), Script::Latin); // À
751        assert_eq!(char_script('\u{00E9}'), Script::Latin); // é
752        assert_eq!(char_script('\u{0148}'), Script::Latin); // ň
753        assert_eq!(char_script('\u{1E00}'), Script::Latin); // Latin Extended Additional
754    }
755
756    #[test]
757    fn script_greek() {
758        assert_eq!(char_script('\u{0391}'), Script::Greek); // Α
759        assert_eq!(char_script('\u{03B1}'), Script::Greek); // α
760        assert_eq!(char_script('\u{03C9}'), Script::Greek); // ω
761    }
762
763    #[test]
764    fn script_cyrillic() {
765        assert_eq!(char_script('\u{0410}'), Script::Cyrillic); // А
766        assert_eq!(char_script('\u{044F}'), Script::Cyrillic); // я
767    }
768
769    #[test]
770    fn script_hebrew() {
771        assert_eq!(char_script('\u{05D0}'), Script::Hebrew); // א
772        assert_eq!(char_script('\u{05EA}'), Script::Hebrew); // ת
773    }
774
775    #[test]
776    fn script_arabic() {
777        assert_eq!(char_script('\u{0627}'), Script::Arabic); // ا
778        assert_eq!(char_script('\u{0645}'), Script::Arabic); // م
779    }
780
781    #[test]
782    fn script_devanagari() {
783        assert_eq!(char_script('\u{0905}'), Script::Devanagari); // अ
784        assert_eq!(char_script('\u{0939}'), Script::Devanagari); // ह
785    }
786
787    #[test]
788    fn script_thai() {
789        assert_eq!(char_script('\u{0E01}'), Script::Thai); // ก
790        assert_eq!(char_script('\u{0E3F}'), Script::Thai); // ฿
791    }
792
793    #[test]
794    fn script_hangul() {
795        assert_eq!(char_script('\u{AC00}'), Script::Hangul); // 가
796        assert_eq!(char_script('\u{D7A3}'), Script::Hangul); // 힣
797    }
798
799    #[test]
800    fn script_cjk_han() {
801        assert_eq!(char_script('\u{4E00}'), Script::Han); // 一
802        assert_eq!(char_script('\u{9FFF}'), Script::Han); // last CJK Unified
803    }
804
805    #[test]
806    fn script_hiragana_katakana() {
807        assert_eq!(char_script('\u{3042}'), Script::Hiragana); // あ
808        assert_eq!(char_script('\u{30A2}'), Script::Katakana); // ア
809    }
810
811    #[test]
812    fn script_combining_marks_are_inherited() {
813        assert_eq!(char_script('\u{0300}'), Script::Inherited); // combining grave
814        assert_eq!(char_script('\u{0301}'), Script::Inherited); // combining acute
815        assert_eq!(char_script('\u{036F}'), Script::Inherited); // last combining diacritical
816    }
817
818    #[test]
819    fn script_rtl_detection() {
820        assert!(Script::Arabic.is_rtl());
821        assert!(Script::Hebrew.is_rtl());
822        assert!(Script::Syriac.is_rtl());
823        assert!(Script::Thaana.is_rtl());
824        assert!(!Script::Latin.is_rtl());
825        assert!(!Script::Han.is_rtl());
826        assert!(!Script::Common.is_rtl());
827    }
828
829    #[test]
830    fn script_common_or_inherited() {
831        assert!(Script::Common.is_common_or_inherited());
832        assert!(Script::Inherited.is_common_or_inherited());
833        assert!(!Script::Latin.is_common_or_inherited());
834        assert!(!Script::Arabic.is_common_or_inherited());
835    }
836
837    // -----------------------------------------------------------------------
838    // resolve_scripts tests
839    // -----------------------------------------------------------------------
840
841    #[test]
842    fn resolve_empty() {
843        assert!(resolve_scripts(&[]).is_empty());
844    }
845
846    #[test]
847    fn resolve_pure_latin() {
848        let chars: Vec<char> = "Hello".chars().collect();
849        let resolved = resolve_scripts(&chars);
850        assert!(resolved.iter().all(|&s| s == Script::Latin));
851    }
852
/// Spaces, digits, and punctuation mixed into Latin text all resolve to Latin.
#[test]
fn resolve_common_absorbed_by_latin() {
    // "Hi 42!" contains a space, two digits and an exclamation mark.
    let input: Vec<char> = "Hi 42!".chars().collect();
    let resolved = resolve_scripts(&input);
    let every_char_latin = resolved.iter().all(|&script| script == Script::Latin);
    assert!(every_char_latin, "All should be Latin: {resolved:?}");
}
863
/// A space preceding Latin text resolves to Latin.
#[test]
fn resolve_leading_space() {
    let input: Vec<char> = " Hello".chars().collect();
    // Index 0 is the leading space.
    let leading = resolve_scripts(&input)[0];
    assert_eq!(leading, Script::Latin);
}
871
/// A combining acute following `e` takes on the base letter's Latin script.
#[test]
fn resolve_combining_mark_inherits() {
    // Decomposed "é": base letter followed by U+0301.
    let decomposed = ['e', '\u{0301}'];
    let resolved = resolve_scripts(&decomposed);

    let base = resolved[0];
    assert_eq!(base, Script::Latin);

    let mark = resolved[1];
    assert_eq!(mark, Script::Latin, "combining mark should inherit Latin");
}
884
/// In "Hello مرحبا" the space sides with the preceding Latin text; the Arabic letters stay Arabic.
#[test]
fn resolve_mixed_scripts() {
    let text = "Hello \u{0645}\u{0631}\u{062D}\u{0628}\u{0627}";
    let chars: Vec<char> = text.chars().collect();
    let resolved = resolve_scripts(&chars);

    // Indices 0..5: H, e, l, l, o.
    let latin_letters = resolved.iter().enumerate().take(5);
    for (i, &script) in latin_letters {
        assert_eq!(script, Script::Latin, "char {i}");
    }

    // Index 5: the separating space follows the preceding script.
    let space = resolved[5];
    assert_eq!(space, Script::Latin, "space");

    // Indices 6..11: the five Arabic letters.
    let arabic_letters = resolved.iter().enumerate().skip(6).take(5);
    for (i, &script) in arabic_letters {
        assert_eq!(script, Script::Arabic, "char {i}");
    }
}
903
/// Input made only of digits, a space and punctuation resolves entirely to Latin.
#[test]
fn resolve_all_common_defaults_to_latin() {
    let input: Vec<char> = "123 !?".chars().collect();
    let resolved = resolve_scripts(&input);
    let all_latin = resolved.iter().all(|&script| script == Script::Latin);
    assert!(all_latin, "All-Common should default to Latin");
}
913
914    // -----------------------------------------------------------------------
915    // partition_by_script tests
916    // -----------------------------------------------------------------------
917
/// Partitioning the empty string produces no runs.
#[test]
fn partition_empty() {
    let runs = partition_by_script("");
    assert!(runs.is_empty());
}
922
/// "Hello World" forms a single Latin run spanning offsets 0..11.
#[test]
fn partition_pure_latin() {
    let text = "Hello World";
    let runs = partition_by_script(text);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
    assert_eq!(only.start, 0);
    assert_eq!(only.end, 11);
    assert_eq!(only.text(text), "Hello World");
}
932
/// Five Arabic letters form a single Arabic run.
#[test]
fn partition_pure_arabic() {
    let arabic_word = "\u{0645}\u{0631}\u{062D}\u{0628}\u{0627}";
    let runs = partition_by_script(arabic_word);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Arabic);
}
940
/// Latin text followed by Arabic yields at least two runs: Latin first, Arabic last.
#[test]
fn partition_latin_then_arabic() {
    let text = "Hello \u{0645}\u{0631}\u{062D}\u{0628}\u{0627}";
    let runs = partition_by_script(text);
    assert!(runs.len() >= 2, "runs: {runs:?}");

    // Opening run: Latin, beginning with the word "Hello".
    let first = &runs[0];
    assert_eq!(first.script, Script::Latin);
    assert!(first.text(text).starts_with("Hello"));

    // Closing run: Arabic.
    let last = runs.last().unwrap();
    assert_eq!(last.script, Script::Arabic);
}
955
/// Han characters sandwiched between Latin words split the text into three runs.
#[test]
fn partition_latin_cjk_latin() {
    let text = "Hello\u{4E16}\u{754C}World";
    let runs = partition_by_script(text);
    assert_eq!(runs.len(), 3, "runs: {runs:?}");

    let (head, middle, tail) = (&runs[0], &runs[1], &runs[2]);
    assert_eq!(head.script, Script::Latin);
    assert_eq!(middle.script, Script::Han);
    assert_eq!(tail.script, Script::Latin);
}
965
/// Japanese text mixing Hiragana, Kanji and Katakana produces a run for each script.
#[test]
fn partition_japanese_mixed() {
    // Five Hiragana, two Kanji, one Katakana.
    let text = "\u{3053}\u{3093}\u{306B}\u{3061}\u{306F}\u{4E16}\u{754C}\u{30A2}";
    let runs = partition_by_script(text);
    assert!(runs.len() >= 2, "runs: {runs:?}");

    // Each expected script must label at least one run.
    let expected = [Script::Hiragana, Script::Han, Script::Katakana];
    for &want in expected.iter() {
        assert!(runs.iter().any(|run| run.script == want));
    }
}
979
/// Script runs start at 0, end at `text.len()`, and leave no gaps or overlaps between them.
#[test]
fn partition_runs_cover_full_text() {
    let text = "Hello \u{05E9}\u{05DC}\u{05D5}\u{05DD} World \u{4E16}\u{754C}";
    let runs = partition_by_script(text);

    let first_start = runs[0].start;
    assert_eq!(first_start, 0);
    let final_end = runs.last().unwrap().end;
    assert_eq!(final_end, text.len());

    // Each run must begin exactly where its predecessor ended.
    for pair in runs.windows(2) {
        let (prev, next) = (&pair[0], &pair[1]);
        assert_eq!(prev.end, next.start, "runs must be contiguous: {:?}", pair);
    }
}
996
/// Concatenating each run's text slice reproduces the original string.
#[test]
fn partition_run_text_slicing() {
    let text = "ABCdef";
    let runs = partition_by_script(text);
    let reconstructed = runs.iter().map(|run| run.text(text)).collect::<String>();
    assert_eq!(reconstructed, text);
}
1004
/// A base letter plus combining acute is kept together in one Latin run.
#[test]
fn partition_combining_mark_stays_with_base() {
    // Decomposed "é": 'e' followed by U+0301.
    let decomposed = "e\u{0301}";
    let runs = partition_by_script(decomposed);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
}
1013
/// Digits trailing a Latin word do not start a new run.
#[test]
fn partition_digits_absorbed() {
    let text = "Item 42";
    let runs = partition_by_script(text);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
}
1021
1022    // -----------------------------------------------------------------------
1023    // TextRun and partition_text_runs tests
1024    // -----------------------------------------------------------------------
1025
/// With no text and no direction/style callbacks, no text runs are produced.
#[test]
fn text_runs_empty() {
    let runs = partition_text_runs("", None, None);
    assert!(runs.is_empty());
}
1030
/// Plain Latin text without callbacks gives one LTR Latin run with style id 0.
#[test]
fn text_runs_simple_latin() {
    let runs = partition_text_runs("Hello World", None, None);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
    assert_eq!(only.direction, RunDirection::Ltr);
    assert_eq!(only.style_id, 0);
}
1039
/// Arabic text without a direction callback gives a single RTL Arabic run.
#[test]
fn text_runs_arabic_direction() {
    let arabic_word = "\u{0645}\u{0631}\u{062D}\u{0628}\u{0627}";
    let runs = partition_text_runs(arabic_word, None, None);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Arabic);
    assert_eq!(only.direction, RunDirection::Rtl);
}
1048
/// Latin/Han/Latin text splits into three runs, all of them LTR.
#[test]
fn text_runs_mixed_scripts() {
    let text = "Hello\u{4E16}\u{754C}World";
    let runs = partition_text_runs(text, None, None);
    assert_eq!(runs.len(), 3);

    for run in runs.iter().take(3) {
        assert_eq!(run.direction, RunDirection::Ltr);
    }
}
1058
/// A style change inside one script run splits it into two text runs at the change point.
#[test]
fn text_runs_style_split() {
    let text = "Hello World";
    // Byte offsets 0..=4 ("Hello") report style 1; the space and everything after report style 2.
    let style_fn = |offset: usize| -> u64 {
        match offset {
            0..=4 => 1,
            _ => 2,
        }
    };
    let runs = partition_text_runs(text, None, Some(&style_fn));
    assert_eq!(runs.len(), 2, "runs: {runs:?}");

    let (head, tail) = (&runs[0], &runs[1]);
    assert_eq!(head.style_id, 1);
    assert_eq!(head.text(text), "Hello");
    assert_eq!(tail.style_id, 2);
    assert_eq!(tail.text(text), " World");
}
1071
/// A direction callback that always answers RTL makes Latin text come out as an RTL run.
#[test]
fn text_runs_direction_override() {
    let text = "ABC";
    // Ignores the offset and reports RTL everywhere.
    let dir_fn = |_: usize| -> RunDirection { RunDirection::Rtl };
    let runs = partition_text_runs(text, Some(&dir_fn), None);
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.direction, RunDirection::Rtl);
}
1081
/// Text runs are contiguous, span the whole input, and reassemble into the original string.
#[test]
fn text_runs_cover_full_text() {
    let text = "Hello \u{05E9}\u{05DC}\u{05D5}\u{05DD} World";
    let runs = partition_text_runs(text, None, None);

    let first_start = runs[0].start;
    assert_eq!(first_start, 0);
    let final_end = runs.last().unwrap().end;
    assert_eq!(final_end, text.len());

    // Each run must begin exactly where its predecessor ended.
    for pair in runs.windows(2) {
        assert_eq!(pair[0].end, pair[1].start);
    }

    let reconstructed = runs.iter().map(|run| run.text(text)).collect::<String>();
    assert_eq!(reconstructed, text);
}
1096
1097    // -----------------------------------------------------------------------
1098    // RunCacheKey tests
1099    // -----------------------------------------------------------------------
1100
/// Two cache keys derived from the same run and text compare equal.
#[test]
fn cache_key_equality() {
    let text = "Hello";
    let run = TextRun {
        script: Script::Latin,
        direction: RunDirection::Ltr,
        style_id: 0,
        start: 0,
        end: 5,
    };

    let first = run.cache_key(text);
    let second = run.cache_key(text);
    assert_eq!(first, second);
}
1116
/// Keys that differ only in `script` are not equal.
#[test]
fn cache_key_differs_by_script() {
    // Everything except the script is held fixed.
    let key_with = |script| RunCacheKey {
        text: "abc",
        script,
        direction: RunDirection::Ltr,
        style_id: 0,
    };
    let k1 = key_with(Script::Latin);
    let k2 = key_with(Script::Greek);
    assert_ne!(k1, k2);
}
1133
/// Keys that differ only in `direction` are not equal.
#[test]
fn cache_key_differs_by_direction() {
    // Everything except the direction is held fixed.
    let key_with = |direction| RunCacheKey {
        text: "abc",
        script: Script::Latin,
        direction,
        style_id: 0,
    };
    let k1 = key_with(RunDirection::Ltr);
    let k2 = key_with(RunDirection::Rtl);
    assert_ne!(k1, k2);
}
1150
/// Keys that differ only in `style_id` are not equal.
#[test]
fn cache_key_differs_by_style() {
    // Everything except the style id is held fixed.
    let key_with = |style_id| RunCacheKey {
        text: "abc",
        script: Script::Latin,
        direction: RunDirection::Ltr,
        style_id,
    };
    let k1 = key_with(0);
    let k2 = key_with(1);
    assert_ne!(k1, k2);
}
1167
/// A `RunCacheKey` can be stored in a `HashSet` and found again by an equal key.
#[test]
fn cache_key_hashable() {
    use std::collections::HashSet;

    let key = RunCacheKey {
        text: "hello",
        script: Script::Latin,
        direction: RunDirection::Ltr,
        style_id: 0,
    };
    // Build a one-element set from a clone, then look it up with the original.
    let set: HashSet<_> = std::iter::once(key.clone()).collect();
    assert!(set.contains(&key));
}
1181
1182    // -----------------------------------------------------------------------
1183    // Edge cases
1184    // -----------------------------------------------------------------------
1185
/// A one-letter string yields one Latin run covering byte range 0..1.
#[test]
fn single_char() {
    let runs = partition_by_script("A");
    assert_eq!(runs.len(), 1);

    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
    assert_eq!(only.start, 0);
    assert_eq!(only.end, 1);
}
1194
/// Whitespace-only input yields a single run labelled Latin.
#[test]
fn only_spaces() {
    let blanks = "   ";
    let runs = partition_by_script(blanks);
    assert_eq!(runs.len(), 1);

    // With no specific script anywhere, the run falls back to Latin.
    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
}
1202
/// An emoji between Latin words does not break the surrounding Latin run.
#[test]
fn emoji_is_common() {
    // U+1F600 sits between two Latin words.
    let text = "Hello \u{1F600} World";
    let runs = partition_by_script(text);

    // The emoji is absorbed, so exactly one Latin run remains.
    assert_eq!(runs.len(), 1);
    let only = &runs[0];
    assert_eq!(only.script, Script::Latin);
}
1212
/// Run boundaries are byte offsets, so multi-byte characters advance them by their UTF-8 width.
#[test]
fn multibyte_utf8_offsets() {
    // é occupies 2 bytes, 一 occupies 3 bytes.
    let text = "\u{00E9}\u{4E00}";
    let runs = partition_by_script(text);
    assert!(runs.len() >= 2);

    let (latin, han) = (&runs[0], &runs[1]);
    assert_eq!(latin.end, 2); // after the 2-byte é
    assert_eq!(han.start, 2);
    assert_eq!(han.end, 5); // after the 3-byte 一
}
1224
/// `TextRun::len` and `TextRun::is_empty` agree for a 5-byte span and for a zero-width span.
#[test]
fn text_run_len_and_empty() {
    // Only the byte range varies between the two cases.
    let span = |start, end| TextRun {
        start,
        end,
        script: Script::Latin,
        direction: RunDirection::Ltr,
        style_id: 0,
    };

    let run = span(5, 10);
    assert_eq!(run.len(), 5);
    assert!(!run.is_empty());

    let empty = span(5, 5);
    assert_eq!(empty.len(), 0);
    assert!(empty.is_empty());
}
1247
/// `ScriptRun::len` and `ScriptRun::is_empty` agree for a non-empty span and for a zero-width one.
#[test]
fn script_run_len_and_empty() {
    let run = ScriptRun {
        start: 0,
        end: 5,
        script: Script::Latin,
    };
    assert_eq!(run.len(), 5);
    assert!(!run.is_empty());

    // Zero-width span (start == end): the "empty" half promised by the test name.
    let empty = ScriptRun {
        start: 5,
        end: 5,
        script: Script::Latin,
    };
    assert_eq!(empty.len(), 0);
    assert!(empty.is_empty());
}
1258
/// `Script` is sortable, and `Common` sorts ahead of `Arabic` and `Latin`.
#[test]
fn script_enum_ord() {
    let mut unsorted = [Script::Arabic, Script::Latin, Script::Common];
    unsorted.sort();

    let smallest = unsorted[0];
    assert_eq!(smallest, Script::Common);
}
1266
/// Five scripts back to back each get their own run, and the runs stay contiguous.
#[test]
fn many_script_transitions() {
    // "Hello" + two Greek + two Cyrillic + two Hebrew + two Arabic letters.
    let text = "Hello\u{0391}\u{0392}\u{0410}\u{0411}\u{05D0}\u{05D1}\u{0627}\u{0628}";
    let runs = partition_by_script(text);

    // Each expected script must label at least one run.
    let expected = [
        Script::Latin,
        Script::Greek,
        Script::Cyrillic,
        Script::Hebrew,
        Script::Arabic,
    ];
    for &want in expected.iter() {
        assert!(runs.iter().any(|run| run.script == want));
    }

    // Each run must begin exactly where its predecessor ended.
    for pair in runs.windows(2) {
        assert_eq!(pair[0].end, pair[1].start);
    }
}
1285}