Skip to main content

ftui_text/
cluster_map.rs

1#![forbid(unsafe_code)]
2
3//! Bidirectional cluster↔cell mapping for shaped text.
4//!
5//! This module defines the [`ClusterMap`] — a precomputed index that maps
6//! between source byte offsets, grapheme indices, and visual cell columns
7//! in both directions. It enables correct cursor movement, selection,
8//! copy extraction, and search highlighting over shaped text.
9//!
10//! # Invariants
11//!
12//! The cluster map guarantees:
13//!
14//! 1. **Round-trip preservation**: `byte → cell → byte` returns the original
15//!    cluster start (never a mid-cluster position).
16//! 2. **Monotonicity**: visual cell offsets increase with byte offsets.
17//! 3. **Boundary alignment**: lookups always snap to grapheme cluster
18//!    boundaries — never splitting a grapheme or shaped glyph cluster.
19//! 4. **Continuation cell handling**: wide characters that span 2+ cells
20//!    map back to the same source byte offset.
21//! 5. **Completeness**: every source byte offset and every visual cell
22//!    column has a defined mapping.
23//!
24//! # Example
25//!
26//! ```
27//! use ftui_text::cluster_map::ClusterMap;
28//!
29//! // Build a cluster map from plain text
30//! let map = ClusterMap::from_text("Hello 世界!");
31//!
32//! // Forward: byte offset → visual cell column
33//! assert_eq!(map.byte_to_cell(0), 0);  // 'H' at cell 0
34//! assert_eq!(map.byte_to_cell(6), 6);  // '世' at cell 6
35//! assert_eq!(map.byte_to_cell(9), 8);  // '界' at cell 8
36//!
37//! // Reverse: visual cell column → byte offset
38//! assert_eq!(map.cell_to_byte(0), 0);  // cell 0 → 'H'
39//! assert_eq!(map.cell_to_byte(6), 6);  // cell 6 → '世'
40//! assert_eq!(map.cell_to_byte(7), 6);  // cell 7 → '世' (continuation)
41//!
42//! // Selection: cell range → byte range
43//! let (start, end) = map.cell_range_to_byte_range(6, 10);
44//! assert_eq!(start, 6);   // '世'
45//! assert_eq!(end, 12);    // end of '界'
46//! ```
47
48use crate::shaping::ShapedRun;
49use unicode_segmentation::UnicodeSegmentation;
50
51// ---------------------------------------------------------------------------
52// ClusterEntry — per-grapheme-cluster record
53// ---------------------------------------------------------------------------
54
55/// A single entry in the cluster map, representing one grapheme cluster.
56#[derive(Debug, Clone, Copy, PartialEq, Eq)]
57pub struct ClusterEntry {
58    /// Start byte offset in the source string (inclusive).
59    pub byte_start: u32,
60    /// End byte offset in the source string (exclusive).
61    pub byte_end: u32,
62    /// Grapheme index (0-based position among graphemes).
63    pub grapheme_index: u32,
64    /// Start visual cell column (inclusive).
65    pub cell_start: u32,
66    /// Display width in cells (1 for normal, 2 for wide CJK/emoji).
67    pub cell_width: u8,
68}
69
70impl ClusterEntry {
71    /// The byte range of this cluster.
72    #[inline]
73    pub fn byte_range(&self) -> std::ops::Range<usize> {
74        self.byte_start as usize..self.byte_end as usize
75    }
76
77    /// The cell range of this cluster (start..start+width).
78    #[inline]
79    pub fn cell_range(&self) -> std::ops::Range<usize> {
80        self.cell_start as usize..(self.cell_start as usize + self.cell_width as usize)
81    }
82
83    /// End cell column (exclusive).
84    #[inline]
85    pub fn cell_end(&self) -> u32 {
86        self.cell_start + self.cell_width as u32
87    }
88}
89
90// ---------------------------------------------------------------------------
91// ClusterMap
92// ---------------------------------------------------------------------------
93
94/// Bidirectional mapping between source byte offsets and visual cell columns.
95///
96/// Built from text (optionally with shaped glyph data) and provides O(log n)
97/// lookups in both directions via binary search over the sorted cluster array.
98///
99/// # Construction
100///
101/// - [`from_text`](Self::from_text) — build from plain text (uses grapheme
102///   cluster widths, suitable for terminal/monospace rendering).
103/// - [`from_shaped_run`](Self::from_shaped_run) — build from a shaped glyph
104///   run (uses glyph cluster byte offsets and advances, suitable for
105///   proportional/shaped rendering).
106#[derive(Debug, Clone)]
107pub struct ClusterMap {
108    /// Sorted by byte_start (and equivalently by cell_start due to monotonicity).
109    entries: Vec<ClusterEntry>,
110    /// Total visual width in cells.
111    total_cells: u32,
112    /// Total byte length of the source text.
113    total_bytes: u32,
114}
115
116impl ClusterMap {
117    /// Build a cluster map from plain text using grapheme cluster boundaries.
118    ///
119    /// Each grapheme cluster maps to 1 or 2 cells based on `display_width`.
120    /// This is the appropriate constructor for terminal/monospace rendering.
121    pub fn from_text(text: &str) -> Self {
122        if text.is_empty() {
123            return Self {
124                entries: Vec::new(),
125                total_cells: 0,
126                total_bytes: 0,
127            };
128        }
129
130        let mut entries = Vec::new();
131        let mut cell_offset = 0u32;
132
133        for (grapheme_idx, (byte_offset, grapheme)) in text.grapheme_indices(true).enumerate() {
134            let width = crate::grapheme_width(grapheme) as u8;
135            let byte_end = byte_offset + grapheme.len();
136
137            entries.push(ClusterEntry {
138                byte_start: byte_offset as u32,
139                byte_end: byte_end as u32,
140                grapheme_index: grapheme_idx as u32,
141                cell_start: cell_offset,
142                cell_width: width,
143            });
144
145            cell_offset += width as u32;
146        }
147
148        Self {
149            entries,
150            total_cells: cell_offset,
151            total_bytes: text.len() as u32,
152        }
153    }
154
155    /// Build a cluster map from a shaped glyph run.
156    ///
157    /// Uses glyph cluster byte offsets from the `ShapedRun` to determine
158    /// cluster boundaries, with advances determining cell widths.
159    ///
160    /// For terminal rendering (NoopShaper), each glyph maps to one grapheme
161    /// cluster. For proportional rendering, multiple glyphs may share a
162    /// cluster (ligatures) or one glyph may span multiple characters.
163    pub fn from_shaped_run(text: &str, run: &ShapedRun) -> Self {
164        if text.is_empty() || run.is_empty() {
165            return Self {
166                entries: Vec::new(),
167                total_cells: 0,
168                total_bytes: 0,
169            };
170        }
171
172        // Group glyphs by cluster (byte offset).
173        // Shaped glyphs share a `cluster` value when they form a ligature
174        // or complex glyph group.
175        let mut entries = Vec::new();
176        let mut cell_offset = 0u32;
177        let mut grapheme_idx = 0u32;
178
179        let mut i = 0;
180        while i < run.glyphs.len() {
181            let cluster_byte = run.glyphs[i].cluster as usize;
182            let mut cluster_advance = 0i32;
183
184            // Accumulate all glyphs sharing this cluster.
185            let mut j = i;
186            while j < run.glyphs.len() && run.glyphs[j].cluster as usize == cluster_byte {
187                cluster_advance += run.glyphs[j].x_advance;
188                j += 1;
189            }
190
191            // Find the next cluster's byte offset to determine this cluster's byte range.
192            let next_byte = if j < run.glyphs.len() {
193                run.glyphs[j].cluster as usize
194            } else {
195                text.len()
196            };
197
198            // Use advance as cell width (for terminal, this is already in cells).
199            let width = cluster_advance.unsigned_abs().min(255) as u8;
200
201            entries.push(ClusterEntry {
202                byte_start: cluster_byte as u32,
203                byte_end: next_byte as u32,
204                grapheme_index: grapheme_idx,
205                cell_start: cell_offset,
206                cell_width: width,
207            });
208
209            cell_offset += width as u32;
210            grapheme_idx += 1;
211            i = j;
212        }
213
214        Self {
215            entries,
216            total_cells: cell_offset,
217            total_bytes: text.len() as u32,
218        }
219    }
220
221    // -----------------------------------------------------------------------
222    // Forward lookups (byte → cell)
223    // -----------------------------------------------------------------------
224
225    /// Map a byte offset to its visual cell column.
226    ///
227    /// If the byte offset falls mid-cluster, it snaps to the cluster's
228    /// start cell. Returns `total_cells` for offsets at or past the end.
229    pub fn byte_to_cell(&self, byte_offset: usize) -> usize {
230        if self.entries.is_empty() || byte_offset >= self.total_bytes as usize {
231            return self.total_cells as usize;
232        }
233
234        match self
235            .entries
236            .binary_search_by_key(&(byte_offset as u32), |e| e.byte_start)
237        {
238            Ok(idx) => self.entries[idx].cell_start as usize,
239            Err(idx) => {
240                // byte_offset is mid-cluster — snap to containing cluster.
241                if idx > 0 {
242                    self.entries[idx - 1].cell_start as usize
243                } else {
244                    0
245                }
246            }
247        }
248    }
249
250    /// Map a byte offset to the containing `ClusterEntry`.
251    ///
252    /// Returns `None` for empty maps or offsets past the end.
253    pub fn byte_to_entry(&self, byte_offset: usize) -> Option<&ClusterEntry> {
254        if self.entries.is_empty() {
255            return None;
256        }
257
258        match self
259            .entries
260            .binary_search_by_key(&(byte_offset as u32), |e| e.byte_start)
261        {
262            Ok(idx) => Some(&self.entries[idx]),
263            Err(idx) => {
264                if idx > 0 && (byte_offset as u32) < self.entries[idx - 1].byte_end {
265                    Some(&self.entries[idx - 1])
266                } else {
267                    None
268                }
269            }
270        }
271    }
272
273    /// Map a byte range to a visual cell range.
274    ///
275    /// Returns `(cell_start, cell_end)` covering all clusters that overlap
276    /// the given byte range.
277    pub fn byte_range_to_cell_range(&self, byte_start: usize, byte_end: usize) -> (usize, usize) {
278        if self.entries.is_empty() || byte_start >= byte_end {
279            return (0, 0);
280        }
281
282        let start_cell = self.byte_to_cell(byte_start);
283
284        // Find the cell_end for the cluster containing byte_end - 1.
285        let end_cell = if byte_end >= self.total_bytes as usize {
286            self.total_cells as usize
287        } else {
288            match self
289                .entries
290                .binary_search_by_key(&(byte_end as u32), |e| e.byte_start)
291            {
292                Ok(idx) => self.entries[idx].cell_start as usize,
293                Err(idx) => {
294                    if idx > 0 {
295                        self.entries[idx - 1].cell_end() as usize
296                    } else {
297                        0
298                    }
299                }
300            }
301        };
302
303        (start_cell, end_cell)
304    }
305
306    // -----------------------------------------------------------------------
307    // Reverse lookups (cell → byte)
308    // -----------------------------------------------------------------------
309
310    /// Map a visual cell column to a source byte offset.
311    ///
312    /// Continuation cells (cells within a wide character) map back to the
313    /// cluster's start byte. Returns `total_bytes` for cells at or past
314    /// the total width.
315    pub fn cell_to_byte(&self, cell_col: usize) -> usize {
316        if self.entries.is_empty() || cell_col >= self.total_cells as usize {
317            return self.total_bytes as usize;
318        }
319
320        match self
321            .entries
322            .binary_search_by_key(&(cell_col as u32), |e| e.cell_start)
323        {
324            Ok(idx) => self.entries[idx].byte_start as usize,
325            Err(idx) => {
326                // cell_col is a continuation cell — snap to containing cluster.
327                if idx > 0 {
328                    self.entries[idx - 1].byte_start as usize
329                } else {
330                    0
331                }
332            }
333        }
334    }
335
336    /// Map a visual cell column to the containing `ClusterEntry`.
337    ///
338    /// Returns `None` for empty maps or cells past the total width.
339    pub fn cell_to_entry(&self, cell_col: usize) -> Option<&ClusterEntry> {
340        if self.entries.is_empty() || cell_col >= self.total_cells as usize {
341            return None;
342        }
343
344        match self
345            .entries
346            .binary_search_by_key(&(cell_col as u32), |e| e.cell_start)
347        {
348            Ok(idx) => Some(&self.entries[idx]),
349            Err(idx) => {
350                if idx > 0 {
351                    let entry = &self.entries[idx - 1];
352                    if (cell_col as u32) < entry.cell_end() {
353                        Some(entry)
354                    } else {
355                        None
356                    }
357                } else {
358                    None
359                }
360            }
361        }
362    }
363
364    /// Map a visual cell range to a source byte range.
365    ///
366    /// Returns `(byte_start, byte_end)` covering all clusters that overlap
367    /// the given cell range. Continuation cells are resolved to their
368    /// owning cluster.
369    pub fn cell_range_to_byte_range(&self, cell_start: usize, cell_end: usize) -> (usize, usize) {
370        if self.entries.is_empty() || cell_start >= cell_end {
371            return (0, 0);
372        }
373
374        let start_byte = self.cell_to_byte(cell_start);
375
376        let end_byte = if cell_end >= self.total_cells as usize {
377            self.total_bytes as usize
378        } else {
379            // Find the cluster containing the last included cell and use its
380            // byte_end as the exclusive bound. This ensures wide characters
381            // partially covered by the cell range are fully included.
382            match self.cell_to_entry(cell_end.saturating_sub(1)) {
383                Some(entry) => entry.byte_end as usize,
384                None => self.total_bytes as usize,
385            }
386        };
387
388        (start_byte, end_byte.max(start_byte))
389    }
390
391    // -----------------------------------------------------------------------
392    // Grapheme-level accessors
393    // -----------------------------------------------------------------------
394
395    /// Map a grapheme index to a visual cell column.
396    pub fn grapheme_to_cell(&self, grapheme_index: usize) -> usize {
397        self.entries
398            .get(grapheme_index)
399            .map_or(self.total_cells as usize, |e| e.cell_start as usize)
400    }
401
402    /// Map a visual cell column to a grapheme index.
403    pub fn cell_to_grapheme(&self, cell_col: usize) -> usize {
404        self.cell_to_entry(cell_col)
405            .map_or(self.entries.len(), |e| e.grapheme_index as usize)
406    }
407
408    /// Map a grapheme index to a byte offset.
409    pub fn grapheme_to_byte(&self, grapheme_index: usize) -> usize {
410        self.entries
411            .get(grapheme_index)
412            .map_or(self.total_bytes as usize, |e| e.byte_start as usize)
413    }
414
415    /// Map a byte offset to a grapheme index.
416    pub fn byte_to_grapheme(&self, byte_offset: usize) -> usize {
417        self.byte_to_entry(byte_offset)
418            .map_or(self.entries.len(), |e| e.grapheme_index as usize)
419    }
420
421    // -----------------------------------------------------------------------
422    // Aggregate accessors
423    // -----------------------------------------------------------------------
424
425    /// Total visual width in cells.
426    #[inline]
427    pub fn total_cells(&self) -> usize {
428        self.total_cells as usize
429    }
430
431    /// Total byte length of the source text.
432    #[inline]
433    pub fn total_bytes(&self) -> usize {
434        self.total_bytes as usize
435    }
436
437    /// Number of grapheme clusters.
438    #[inline]
439    pub fn cluster_count(&self) -> usize {
440        self.entries.len()
441    }
442
443    /// Whether the map is empty.
444    #[inline]
445    pub fn is_empty(&self) -> bool {
446        self.entries.is_empty()
447    }
448
449    /// Iterate over all cluster entries.
450    #[inline]
451    pub fn entries(&self) -> &[ClusterEntry] {
452        &self.entries
453    }
454
455    /// Get the cluster entry at a grapheme index.
456    #[inline]
457    pub fn get(&self, grapheme_index: usize) -> Option<&ClusterEntry> {
458        self.entries.get(grapheme_index)
459    }
460
461    /// Extract text from the source string for a cell range.
462    ///
463    /// Returns the substring covering all clusters that overlap the
464    /// given visual cell range.
465    pub fn extract_text_for_cells<'a>(
466        &self,
467        source: &'a str,
468        cell_start: usize,
469        cell_end: usize,
470    ) -> &'a str {
471        let (byte_start, byte_end) = self.cell_range_to_byte_range(cell_start, cell_end);
472        if byte_start >= source.len() {
473            return "";
474        }
475        let end = byte_end.min(source.len());
476        &source[byte_start..end]
477    }
478}
479
480// ===========================================================================
481// Tests
482// ===========================================================================
483
484#[cfg(test)]
485mod tests {
486    use super::*;
487
488    // -----------------------------------------------------------------------
489    // Construction tests
490    // -----------------------------------------------------------------------
491
492    #[test]
493    fn empty_text() {
494        let map = ClusterMap::from_text("");
495        assert!(map.is_empty());
496        assert_eq!(map.total_cells(), 0);
497        assert_eq!(map.total_bytes(), 0);
498        assert_eq!(map.cluster_count(), 0);
499    }
500
501    #[test]
502    fn ascii_text() {
503        let map = ClusterMap::from_text("Hello");
504        assert_eq!(map.cluster_count(), 5);
505        assert_eq!(map.total_cells(), 5);
506        assert_eq!(map.total_bytes(), 5);
507
508        // Each ASCII char is 1 byte, 1 cell.
509        for i in 0..5 {
510            let e = map.get(i).unwrap();
511            assert_eq!(e.byte_start, i as u32);
512            assert_eq!(e.byte_end, (i + 1) as u32);
513            assert_eq!(e.cell_start, i as u32);
514            assert_eq!(e.cell_width, 1);
515        }
516    }
517
518    #[test]
519    fn wide_chars() {
520        // "世界" — 2 CJK chars, each 3 bytes and 2 cells wide
521        let text = "\u{4E16}\u{754C}";
522        let map = ClusterMap::from_text(text);
523
524        assert_eq!(map.cluster_count(), 2);
525        assert_eq!(map.total_bytes(), 6);
526        assert_eq!(map.total_cells(), 4);
527
528        let e0 = map.get(0).unwrap();
529        assert_eq!(e0.byte_start, 0);
530        assert_eq!(e0.byte_end, 3);
531        assert_eq!(e0.cell_start, 0);
532        assert_eq!(e0.cell_width, 2);
533
534        let e1 = map.get(1).unwrap();
535        assert_eq!(e1.byte_start, 3);
536        assert_eq!(e1.byte_end, 6);
537        assert_eq!(e1.cell_start, 2);
538        assert_eq!(e1.cell_width, 2);
539    }
540
541    #[test]
542    fn mixed_ascii_and_wide() {
543        // "Hi世界!" — 2 ASCII + 2 CJK + 1 ASCII
544        let text = "Hi\u{4E16}\u{754C}!";
545        let map = ClusterMap::from_text(text);
546
547        assert_eq!(map.cluster_count(), 5);
548        assert_eq!(map.total_bytes(), 9); // 2 + 3 + 3 + 1
549        assert_eq!(map.total_cells(), 7); // 1+1+2+2+1
550
551        // Verify cell starts.
552        assert_eq!(map.get(0).unwrap().cell_start, 0); // 'H'
553        assert_eq!(map.get(1).unwrap().cell_start, 1); // 'i'
554        assert_eq!(map.get(2).unwrap().cell_start, 2); // '世'
555        assert_eq!(map.get(3).unwrap().cell_start, 4); // '界'
556        assert_eq!(map.get(4).unwrap().cell_start, 6); // '!'
557    }
558
559    #[test]
560    fn combining_marks() {
561        // "é" as e + combining acute: single grapheme, 2 bytes, 1 cell
562        let text = "e\u{0301}";
563        let map = ClusterMap::from_text(text);
564
565        assert_eq!(map.cluster_count(), 1);
566        assert_eq!(map.total_bytes(), 3); // 'e' (1) + U+0301 (2)
567        assert_eq!(map.total_cells(), 1);
568
569        let e = map.get(0).unwrap();
570        assert_eq!(e.byte_start, 0);
571        assert_eq!(e.byte_end, 3);
572        assert_eq!(e.cell_width, 1);
573    }
574
575    // -----------------------------------------------------------------------
576    // Forward lookup tests (byte → cell)
577    // -----------------------------------------------------------------------
578
579    #[test]
580    fn byte_to_cell_ascii() {
581        let map = ClusterMap::from_text("Hello");
582        for i in 0..5 {
583            assert_eq!(map.byte_to_cell(i), i);
584        }
585        assert_eq!(map.byte_to_cell(5), 5); // past end
586    }
587
588    #[test]
589    fn byte_to_cell_wide() {
590        let text = "Hi\u{4E16}\u{754C}!";
591        let map = ClusterMap::from_text(text);
592
593        assert_eq!(map.byte_to_cell(0), 0); // 'H'
594        assert_eq!(map.byte_to_cell(1), 1); // 'i'
595        assert_eq!(map.byte_to_cell(2), 2); // '世' start
596        assert_eq!(map.byte_to_cell(5), 4); // '界' start
597        assert_eq!(map.byte_to_cell(8), 6); // '!'
598    }
599
600    #[test]
601    fn byte_to_cell_mid_cluster_snaps() {
602        let text = "\u{4E16}"; // '世' is 3 bytes
603        let map = ClusterMap::from_text(text);
604
605        // Mid-byte offsets snap to cluster start.
606        assert_eq!(map.byte_to_cell(0), 0);
607        assert_eq!(map.byte_to_cell(1), 0); // mid-cluster → cluster start
608        assert_eq!(map.byte_to_cell(2), 0); // mid-cluster → cluster start
609    }
610
611    #[test]
612    fn byte_to_entry() {
613        let text = "AB\u{4E16}C";
614        let map = ClusterMap::from_text(text);
615
616        let e = map.byte_to_entry(0).unwrap();
617        assert_eq!(e.byte_start, 0); // 'A'
618
619        let e = map.byte_to_entry(2).unwrap();
620        assert_eq!(e.byte_start, 2); // '世'
621
622        // Mid-cluster lookup.
623        let e = map.byte_to_entry(3).unwrap();
624        assert_eq!(e.byte_start, 2); // still '世'
625
626        assert!(map.byte_to_entry(100).is_none());
627    }
628
629    // -----------------------------------------------------------------------
630    // Reverse lookup tests (cell → byte)
631    // -----------------------------------------------------------------------
632
633    #[test]
634    fn cell_to_byte_ascii() {
635        let map = ClusterMap::from_text("Hello");
636        for i in 0..5 {
637            assert_eq!(map.cell_to_byte(i), i);
638        }
639        assert_eq!(map.cell_to_byte(5), 5); // past end
640    }
641
642    #[test]
643    fn cell_to_byte_wide() {
644        let text = "Hi\u{4E16}\u{754C}!";
645        let map = ClusterMap::from_text(text);
646
647        assert_eq!(map.cell_to_byte(0), 0); // 'H'
648        assert_eq!(map.cell_to_byte(1), 1); // 'i'
649        assert_eq!(map.cell_to_byte(2), 2); // '世'
650        assert_eq!(map.cell_to_byte(3), 2); // continuation → same '世'
651        assert_eq!(map.cell_to_byte(4), 5); // '界'
652        assert_eq!(map.cell_to_byte(5), 5); // continuation → same '界'
653        assert_eq!(map.cell_to_byte(6), 8); // '!'
654    }
655
656    #[test]
657    fn cell_to_entry_continuation() {
658        let text = "\u{4E16}"; // '世' — 2 cells
659        let map = ClusterMap::from_text(text);
660
661        // Both cells map to the same entry.
662        let e0 = map.cell_to_entry(0).unwrap();
663        let e1 = map.cell_to_entry(1).unwrap();
664        assert_eq!(e0, e1);
665        assert_eq!(e0.byte_start, 0);
666        assert_eq!(e0.cell_width, 2);
667    }
668
669    // -----------------------------------------------------------------------
670    // Range conversion tests
671    // -----------------------------------------------------------------------
672
673    #[test]
674    fn byte_range_to_cell_range_ascii() {
675        let map = ClusterMap::from_text("Hello World");
676        assert_eq!(map.byte_range_to_cell_range(0, 5), (0, 5)); // "Hello"
677        assert_eq!(map.byte_range_to_cell_range(6, 11), (6, 11)); // "World"
678    }
679
680    #[test]
681    fn byte_range_to_cell_range_wide() {
682        let text = "Hi\u{4E16}\u{754C}!"; // cells: H(0) i(1) 世(2,3) 界(4,5) !(6)
683        let map = ClusterMap::from_text(text);
684
685        // Byte range covering '世界' (bytes 2..8)
686        assert_eq!(map.byte_range_to_cell_range(2, 8), (2, 6));
687    }
688
689    #[test]
690    fn cell_range_to_byte_range_ascii() {
691        let map = ClusterMap::from_text("Hello World");
692        assert_eq!(map.cell_range_to_byte_range(0, 5), (0, 5));
693    }
694
695    #[test]
696    fn cell_range_to_byte_range_wide() {
697        let text = "Hi\u{4E16}\u{754C}!";
698        let map = ClusterMap::from_text(text);
699
700        // Cell range [2, 6) covers 世界
701        assert_eq!(map.cell_range_to_byte_range(2, 6), (2, 8));
702
703        // Cell range [3, 5) starts on continuation → snaps to 世, ends including 界
704        assert_eq!(map.cell_range_to_byte_range(3, 5), (2, 8));
705    }
706
707    // -----------------------------------------------------------------------
708    // Grapheme-level accessors
709    // -----------------------------------------------------------------------
710
711    #[test]
712    fn grapheme_to_cell_and_back() {
713        let text = "Hi\u{4E16}\u{754C}!";
714        let map = ClusterMap::from_text(text);
715
716        assert_eq!(map.grapheme_to_cell(0), 0); // 'H'
717        assert_eq!(map.grapheme_to_cell(2), 2); // '世'
718        assert_eq!(map.grapheme_to_cell(4), 6); // '!'
719        assert_eq!(map.grapheme_to_cell(5), 7); // past end
720
721        assert_eq!(map.cell_to_grapheme(0), 0); // 'H'
722        assert_eq!(map.cell_to_grapheme(2), 2); // '世'
723        assert_eq!(map.cell_to_grapheme(3), 2); // continuation → '世'
724    }
725
726    #[test]
727    fn grapheme_to_byte_and_back() {
728        let text = "A\u{4E16}B";
729        let map = ClusterMap::from_text(text);
730
731        assert_eq!(map.grapheme_to_byte(0), 0); // 'A'
732        assert_eq!(map.grapheme_to_byte(1), 1); // '世'
733        assert_eq!(map.grapheme_to_byte(2), 4); // 'B'
734
735        assert_eq!(map.byte_to_grapheme(0), 0); // 'A'
736        assert_eq!(map.byte_to_grapheme(1), 1); // '世'
737        assert_eq!(map.byte_to_grapheme(4), 2); // 'B'
738    }
739
740    // -----------------------------------------------------------------------
741    // Extract text
742    // -----------------------------------------------------------------------
743
744    #[test]
745    fn extract_text_for_cells_ascii() {
746        let text = "Hello World";
747        let map = ClusterMap::from_text(text);
748        assert_eq!(map.extract_text_for_cells(text, 0, 5), "Hello");
749        assert_eq!(map.extract_text_for_cells(text, 6, 11), "World");
750    }
751
752    #[test]
753    fn extract_text_for_cells_wide() {
754        let text = "Hi\u{4E16}\u{754C}!";
755        let map = ClusterMap::from_text(text);
756
757        // Extract just the CJK chars (cells 2..6).
758        assert_eq!(map.extract_text_for_cells(text, 2, 6), "\u{4E16}\u{754C}");
759
760        // Extract including continuation cell.
761        assert_eq!(map.extract_text_for_cells(text, 3, 5), "\u{4E16}\u{754C}");
762    }
763
764    #[test]
765    fn extract_text_empty_range() {
766        let text = "Hello";
767        let map = ClusterMap::from_text(text);
768        assert_eq!(map.extract_text_for_cells(text, 3, 3), "");
769    }
770
771    // -----------------------------------------------------------------------
772    // Invariant: round-trip
773    // -----------------------------------------------------------------------
774
775    #[test]
776    fn roundtrip_byte_cell_byte() {
777        let texts = [
778            "Hello",
779            "\u{4E16}\u{754C}",
780            "Hi\u{4E16}\u{754C}!",
781            "e\u{0301}f",
782            "\u{05E9}\u{05DC}\u{05D5}\u{05DD}",
783            "",
784        ];
785
786        for text in texts {
787            let map = ClusterMap::from_text(text);
788
789            for entry in map.entries() {
790                let byte = entry.byte_start as usize;
791                let cell = map.byte_to_cell(byte);
792                let back = map.cell_to_byte(cell);
793                assert_eq!(
794                    back, byte,
795                    "Round-trip failed for text={text:?} byte={byte}"
796                );
797            }
798        }
799    }
800
801    #[test]
802    fn roundtrip_cell_byte_cell() {
803        let texts = [
804            "Hello",
805            "\u{4E16}\u{754C}",
806            "Hi\u{4E16}\u{754C}!",
807            "e\u{0301}f",
808        ];
809
810        for text in texts {
811            let map = ClusterMap::from_text(text);
812
813            for entry in map.entries() {
814                let cell = entry.cell_start as usize;
815                let byte = map.cell_to_byte(cell);
816                let back = map.byte_to_cell(byte);
817                assert_eq!(
818                    back, cell,
819                    "Round-trip failed for text={text:?} cell={cell}"
820                );
821            }
822        }
823    }
824
825    // -----------------------------------------------------------------------
826    // Invariant: monotonicity
827    // -----------------------------------------------------------------------
828
829    #[test]
830    fn monotonicity() {
831        let texts = [
832            "Hello World",
833            "Hi\u{4E16}\u{754C}! \u{05E9}\u{05DC}\u{05D5}\u{05DD}",
834            "e\u{0301}\u{0302}",
835        ];
836
837        for text in texts {
838            let map = ClusterMap::from_text(text);
839
840            for window in map.entries().windows(2) {
841                assert!(
842                    window[0].byte_start < window[1].byte_start,
843                    "Byte monotonicity violated: {:?}",
844                    window
845                );
846                assert!(
847                    window[0].cell_start < window[1].cell_start,
848                    "Cell monotonicity violated: {:?}",
849                    window
850                );
851            }
852        }
853    }
854
855    // -----------------------------------------------------------------------
856    // Invariant: contiguity
857    // -----------------------------------------------------------------------
858
859    #[test]
860    fn contiguity() {
861        let text = "Hi\u{4E16}\u{754C}!";
862        let map = ClusterMap::from_text(text);
863
864        // Byte ranges are contiguous.
865        for window in map.entries().windows(2) {
866            assert_eq!(
867                window[0].byte_end, window[1].byte_start,
868                "Byte gap: {:?}",
869                window
870            );
871        }
872
873        // Cell ranges are contiguous.
874        for window in map.entries().windows(2) {
875            assert_eq!(
876                window[0].cell_end(),
877                window[1].cell_start,
878                "Cell gap: {:?}",
879                window
880            );
881        }
882
883        // First entry starts at 0.
884        assert_eq!(map.entries()[0].byte_start, 0);
885        assert_eq!(map.entries()[0].cell_start, 0);
886
887        // Last entry ends at total.
888        let last = map.entries().last().unwrap();
889        assert_eq!(last.byte_end, map.total_bytes() as u32);
890        assert_eq!(last.cell_end(), map.total_cells() as u32);
891    }
892
893    // -----------------------------------------------------------------------
894    // Shaped run integration
895    // -----------------------------------------------------------------------
896
897    #[test]
898    fn from_shaped_run_noop() {
899        use crate::script_segmentation::{RunDirection, Script};
900        use crate::shaping::{FontFeatures, NoopShaper, TextShaper};
901
902        let text = "Hi\u{4E16}!";
903        let shaper = NoopShaper;
904        let ff = FontFeatures::default();
905        let run = shaper.shape(text, Script::Latin, RunDirection::Ltr, &ff);
906
907        let map = ClusterMap::from_shaped_run(text, &run);
908
909        // Same result as from_text for NoopShaper.
910        let text_map = ClusterMap::from_text(text);
911
912        assert_eq!(map.cluster_count(), text_map.cluster_count());
913        assert_eq!(map.total_cells(), text_map.total_cells());
914        assert_eq!(map.total_bytes(), text_map.total_bytes());
915    }
916
917    #[test]
918    fn from_shaped_run_empty() {
919        use crate::shaping::ShapedRun;
920
921        let map = ClusterMap::from_shaped_run(
922            "",
923            &ShapedRun {
924                glyphs: vec![],
925                total_advance: 0,
926            },
927        );
928        assert!(map.is_empty());
929    }
930}