1#![forbid(unsafe_code)]
2
3use crate::shaping::ShapedRun;
49use unicode_segmentation::UnicodeSegmentation;
50
51#[derive(Debug, Clone, Copy, PartialEq, Eq)]
57pub struct ClusterEntry {
58 pub byte_start: u32,
60 pub byte_end: u32,
62 pub grapheme_index: u32,
64 pub cell_start: u32,
66 pub cell_width: u8,
68}
69
70impl ClusterEntry {
71 #[inline]
73 pub fn byte_range(&self) -> std::ops::Range<usize> {
74 self.byte_start as usize..self.byte_end as usize
75 }
76
77 #[inline]
79 pub fn cell_range(&self) -> std::ops::Range<usize> {
80 self.cell_start as usize..(self.cell_start as usize + self.cell_width as usize)
81 }
82
83 #[inline]
85 pub fn cell_end(&self) -> u32 {
86 self.cell_start + self.cell_width as u32
87 }
88}
89
90#[derive(Debug, Clone)]
107pub struct ClusterMap {
108 entries: Vec<ClusterEntry>,
110 total_cells: u32,
112 total_bytes: u32,
114}
115
116impl ClusterMap {
117 pub fn from_text(text: &str) -> Self {
122 if text.is_empty() {
123 return Self {
124 entries: Vec::new(),
125 total_cells: 0,
126 total_bytes: 0,
127 };
128 }
129
130 let mut entries = Vec::new();
131 let mut cell_offset = 0u32;
132
133 for (grapheme_idx, (byte_offset, grapheme)) in text.grapheme_indices(true).enumerate() {
134 let width = crate::grapheme_width(grapheme) as u8;
135 let byte_end = byte_offset + grapheme.len();
136
137 entries.push(ClusterEntry {
138 byte_start: byte_offset as u32,
139 byte_end: byte_end as u32,
140 grapheme_index: grapheme_idx as u32,
141 cell_start: cell_offset,
142 cell_width: width,
143 });
144
145 cell_offset += width as u32;
146 }
147
148 Self {
149 entries,
150 total_cells: cell_offset,
151 total_bytes: text.len() as u32,
152 }
153 }
154
155 pub fn from_shaped_run(text: &str, run: &ShapedRun) -> Self {
164 if text.is_empty() || run.is_empty() {
165 return Self {
166 entries: Vec::new(),
167 total_cells: 0,
168 total_bytes: 0,
169 };
170 }
171
172 let mut entries = Vec::new();
176 let mut cell_offset = 0u32;
177 let mut grapheme_idx = 0u32;
178
179 let mut i = 0;
180 while i < run.glyphs.len() {
181 let cluster_byte = run.glyphs[i].cluster as usize;
182 let mut cluster_advance = 0i32;
183
184 let mut j = i;
186 while j < run.glyphs.len() && run.glyphs[j].cluster as usize == cluster_byte {
187 cluster_advance += run.glyphs[j].x_advance;
188 j += 1;
189 }
190
191 let next_byte = if j < run.glyphs.len() {
193 run.glyphs[j].cluster as usize
194 } else {
195 text.len()
196 };
197
198 let width = cluster_advance.unsigned_abs().min(255) as u8;
200
201 entries.push(ClusterEntry {
202 byte_start: cluster_byte as u32,
203 byte_end: next_byte as u32,
204 grapheme_index: grapheme_idx,
205 cell_start: cell_offset,
206 cell_width: width,
207 });
208
209 cell_offset += width as u32;
210 grapheme_idx += 1;
211 i = j;
212 }
213
214 Self {
215 entries,
216 total_cells: cell_offset,
217 total_bytes: text.len() as u32,
218 }
219 }
220
221 pub fn byte_to_cell(&self, byte_offset: usize) -> usize {
230 if self.entries.is_empty() || byte_offset >= self.total_bytes as usize {
231 return self.total_cells as usize;
232 }
233
234 match self
235 .entries
236 .binary_search_by_key(&(byte_offset as u32), |e| e.byte_start)
237 {
238 Ok(idx) => self.entries[idx].cell_start as usize,
239 Err(idx) => {
240 if idx > 0 {
242 self.entries[idx - 1].cell_start as usize
243 } else {
244 0
245 }
246 }
247 }
248 }
249
250 pub fn byte_to_entry(&self, byte_offset: usize) -> Option<&ClusterEntry> {
254 if self.entries.is_empty() {
255 return None;
256 }
257
258 match self
259 .entries
260 .binary_search_by_key(&(byte_offset as u32), |e| e.byte_start)
261 {
262 Ok(idx) => Some(&self.entries[idx]),
263 Err(idx) => {
264 if idx > 0 && (byte_offset as u32) < self.entries[idx - 1].byte_end {
265 Some(&self.entries[idx - 1])
266 } else {
267 None
268 }
269 }
270 }
271 }
272
273 pub fn byte_range_to_cell_range(&self, byte_start: usize, byte_end: usize) -> (usize, usize) {
278 if self.entries.is_empty() || byte_start >= byte_end {
279 return (0, 0);
280 }
281
282 let start_cell = self.byte_to_cell(byte_start);
283
284 let end_cell = if byte_end >= self.total_bytes as usize {
286 self.total_cells as usize
287 } else {
288 match self
289 .entries
290 .binary_search_by_key(&(byte_end as u32), |e| e.byte_start)
291 {
292 Ok(idx) => self.entries[idx].cell_start as usize,
293 Err(idx) => {
294 if idx > 0 {
295 self.entries[idx - 1].cell_end() as usize
296 } else {
297 0
298 }
299 }
300 }
301 };
302
303 (start_cell, end_cell)
304 }
305
306 pub fn cell_to_byte(&self, cell_col: usize) -> usize {
316 if self.entries.is_empty() || cell_col >= self.total_cells as usize {
317 return self.total_bytes as usize;
318 }
319
320 match self
321 .entries
322 .binary_search_by_key(&(cell_col as u32), |e| e.cell_start)
323 {
324 Ok(idx) => self.entries[idx].byte_start as usize,
325 Err(idx) => {
326 if idx > 0 {
328 self.entries[idx - 1].byte_start as usize
329 } else {
330 0
331 }
332 }
333 }
334 }
335
336 pub fn cell_to_entry(&self, cell_col: usize) -> Option<&ClusterEntry> {
340 if self.entries.is_empty() || cell_col >= self.total_cells as usize {
341 return None;
342 }
343
344 match self
345 .entries
346 .binary_search_by_key(&(cell_col as u32), |e| e.cell_start)
347 {
348 Ok(idx) => Some(&self.entries[idx]),
349 Err(idx) => {
350 if idx > 0 {
351 let entry = &self.entries[idx - 1];
352 if (cell_col as u32) < entry.cell_end() {
353 Some(entry)
354 } else {
355 None
356 }
357 } else {
358 None
359 }
360 }
361 }
362 }
363
364 pub fn cell_range_to_byte_range(&self, cell_start: usize, cell_end: usize) -> (usize, usize) {
370 if self.entries.is_empty() || cell_start >= cell_end {
371 return (0, 0);
372 }
373
374 let start_byte = self.cell_to_byte(cell_start);
375
376 let end_byte = if cell_end >= self.total_cells as usize {
377 self.total_bytes as usize
378 } else {
379 match self.cell_to_entry(cell_end.saturating_sub(1)) {
383 Some(entry) => entry.byte_end as usize,
384 None => self.total_bytes as usize,
385 }
386 };
387
388 (start_byte, end_byte.max(start_byte))
389 }
390
391 pub fn grapheme_to_cell(&self, grapheme_index: usize) -> usize {
397 self.entries
398 .get(grapheme_index)
399 .map_or(self.total_cells as usize, |e| e.cell_start as usize)
400 }
401
402 pub fn cell_to_grapheme(&self, cell_col: usize) -> usize {
404 self.cell_to_entry(cell_col)
405 .map_or(self.entries.len(), |e| e.grapheme_index as usize)
406 }
407
408 pub fn grapheme_to_byte(&self, grapheme_index: usize) -> usize {
410 self.entries
411 .get(grapheme_index)
412 .map_or(self.total_bytes as usize, |e| e.byte_start as usize)
413 }
414
415 pub fn byte_to_grapheme(&self, byte_offset: usize) -> usize {
417 self.byte_to_entry(byte_offset)
418 .map_or(self.entries.len(), |e| e.grapheme_index as usize)
419 }
420
421 #[inline]
427 pub fn total_cells(&self) -> usize {
428 self.total_cells as usize
429 }
430
431 #[inline]
433 pub fn total_bytes(&self) -> usize {
434 self.total_bytes as usize
435 }
436
437 #[inline]
439 pub fn cluster_count(&self) -> usize {
440 self.entries.len()
441 }
442
443 #[inline]
445 pub fn is_empty(&self) -> bool {
446 self.entries.is_empty()
447 }
448
449 #[inline]
451 pub fn entries(&self) -> &[ClusterEntry] {
452 &self.entries
453 }
454
455 #[inline]
457 pub fn get(&self, grapheme_index: usize) -> Option<&ClusterEntry> {
458 self.entries.get(grapheme_index)
459 }
460
461 pub fn extract_text_for_cells<'a>(
466 &self,
467 source: &'a str,
468 cell_start: usize,
469 cell_end: usize,
470 ) -> &'a str {
471 let (byte_start, byte_end) = self.cell_range_to_byte_range(cell_start, cell_end);
472 if byte_start >= source.len() {
473 return "";
474 }
475 let end = byte_end.min(source.len());
476 &source[byte_start..end]
477 }
478}
479
480#[cfg(test)]
485mod tests {
486 use super::*;
487
488 #[test]
493 fn empty_text() {
494 let map = ClusterMap::from_text("");
495 assert!(map.is_empty());
496 assert_eq!(map.total_cells(), 0);
497 assert_eq!(map.total_bytes(), 0);
498 assert_eq!(map.cluster_count(), 0);
499 }
500
501 #[test]
502 fn ascii_text() {
503 let map = ClusterMap::from_text("Hello");
504 assert_eq!(map.cluster_count(), 5);
505 assert_eq!(map.total_cells(), 5);
506 assert_eq!(map.total_bytes(), 5);
507
508 for i in 0..5 {
510 let e = map.get(i).unwrap();
511 assert_eq!(e.byte_start, i as u32);
512 assert_eq!(e.byte_end, (i + 1) as u32);
513 assert_eq!(e.cell_start, i as u32);
514 assert_eq!(e.cell_width, 1);
515 }
516 }
517
518 #[test]
519 fn wide_chars() {
520 let text = "\u{4E16}\u{754C}";
522 let map = ClusterMap::from_text(text);
523
524 assert_eq!(map.cluster_count(), 2);
525 assert_eq!(map.total_bytes(), 6);
526 assert_eq!(map.total_cells(), 4);
527
528 let e0 = map.get(0).unwrap();
529 assert_eq!(e0.byte_start, 0);
530 assert_eq!(e0.byte_end, 3);
531 assert_eq!(e0.cell_start, 0);
532 assert_eq!(e0.cell_width, 2);
533
534 let e1 = map.get(1).unwrap();
535 assert_eq!(e1.byte_start, 3);
536 assert_eq!(e1.byte_end, 6);
537 assert_eq!(e1.cell_start, 2);
538 assert_eq!(e1.cell_width, 2);
539 }
540
541 #[test]
542 fn mixed_ascii_and_wide() {
543 let text = "Hi\u{4E16}\u{754C}!";
545 let map = ClusterMap::from_text(text);
546
547 assert_eq!(map.cluster_count(), 5);
548 assert_eq!(map.total_bytes(), 9); assert_eq!(map.total_cells(), 7); assert_eq!(map.get(0).unwrap().cell_start, 0); assert_eq!(map.get(1).unwrap().cell_start, 1); assert_eq!(map.get(2).unwrap().cell_start, 2); assert_eq!(map.get(3).unwrap().cell_start, 4); assert_eq!(map.get(4).unwrap().cell_start, 6); }
558
559 #[test]
560 fn combining_marks() {
561 let text = "e\u{0301}";
563 let map = ClusterMap::from_text(text);
564
565 assert_eq!(map.cluster_count(), 1);
566 assert_eq!(map.total_bytes(), 3); assert_eq!(map.total_cells(), 1);
568
569 let e = map.get(0).unwrap();
570 assert_eq!(e.byte_start, 0);
571 assert_eq!(e.byte_end, 3);
572 assert_eq!(e.cell_width, 1);
573 }
574
575 #[test]
580 fn byte_to_cell_ascii() {
581 let map = ClusterMap::from_text("Hello");
582 for i in 0..5 {
583 assert_eq!(map.byte_to_cell(i), i);
584 }
585 assert_eq!(map.byte_to_cell(5), 5); }
587
588 #[test]
589 fn byte_to_cell_wide() {
590 let text = "Hi\u{4E16}\u{754C}!";
591 let map = ClusterMap::from_text(text);
592
593 assert_eq!(map.byte_to_cell(0), 0); assert_eq!(map.byte_to_cell(1), 1); assert_eq!(map.byte_to_cell(2), 2); assert_eq!(map.byte_to_cell(5), 4); assert_eq!(map.byte_to_cell(8), 6); }
599
600 #[test]
601 fn byte_to_cell_mid_cluster_snaps() {
602 let text = "\u{4E16}"; let map = ClusterMap::from_text(text);
604
605 assert_eq!(map.byte_to_cell(0), 0);
607 assert_eq!(map.byte_to_cell(1), 0); assert_eq!(map.byte_to_cell(2), 0); }
610
611 #[test]
612 fn byte_to_entry() {
613 let text = "AB\u{4E16}C";
614 let map = ClusterMap::from_text(text);
615
616 let e = map.byte_to_entry(0).unwrap();
617 assert_eq!(e.byte_start, 0); let e = map.byte_to_entry(2).unwrap();
620 assert_eq!(e.byte_start, 2); let e = map.byte_to_entry(3).unwrap();
624 assert_eq!(e.byte_start, 2); assert!(map.byte_to_entry(100).is_none());
627 }
628
629 #[test]
634 fn cell_to_byte_ascii() {
635 let map = ClusterMap::from_text("Hello");
636 for i in 0..5 {
637 assert_eq!(map.cell_to_byte(i), i);
638 }
639 assert_eq!(map.cell_to_byte(5), 5); }
641
642 #[test]
643 fn cell_to_byte_wide() {
644 let text = "Hi\u{4E16}\u{754C}!";
645 let map = ClusterMap::from_text(text);
646
647 assert_eq!(map.cell_to_byte(0), 0); assert_eq!(map.cell_to_byte(1), 1); assert_eq!(map.cell_to_byte(2), 2); assert_eq!(map.cell_to_byte(3), 2); assert_eq!(map.cell_to_byte(4), 5); assert_eq!(map.cell_to_byte(5), 5); assert_eq!(map.cell_to_byte(6), 8); }
655
656 #[test]
657 fn cell_to_entry_continuation() {
658 let text = "\u{4E16}"; let map = ClusterMap::from_text(text);
660
661 let e0 = map.cell_to_entry(0).unwrap();
663 let e1 = map.cell_to_entry(1).unwrap();
664 assert_eq!(e0, e1);
665 assert_eq!(e0.byte_start, 0);
666 assert_eq!(e0.cell_width, 2);
667 }
668
669 #[test]
674 fn byte_range_to_cell_range_ascii() {
675 let map = ClusterMap::from_text("Hello World");
676 assert_eq!(map.byte_range_to_cell_range(0, 5), (0, 5)); assert_eq!(map.byte_range_to_cell_range(6, 11), (6, 11)); }
679
680 #[test]
681 fn byte_range_to_cell_range_wide() {
682 let text = "Hi\u{4E16}\u{754C}!"; let map = ClusterMap::from_text(text);
684
685 assert_eq!(map.byte_range_to_cell_range(2, 8), (2, 6));
687 }
688
689 #[test]
690 fn cell_range_to_byte_range_ascii() {
691 let map = ClusterMap::from_text("Hello World");
692 assert_eq!(map.cell_range_to_byte_range(0, 5), (0, 5));
693 }
694
695 #[test]
696 fn cell_range_to_byte_range_wide() {
697 let text = "Hi\u{4E16}\u{754C}!";
698 let map = ClusterMap::from_text(text);
699
700 assert_eq!(map.cell_range_to_byte_range(2, 6), (2, 8));
702
703 assert_eq!(map.cell_range_to_byte_range(3, 5), (2, 8));
705 }
706
707 #[test]
712 fn grapheme_to_cell_and_back() {
713 let text = "Hi\u{4E16}\u{754C}!";
714 let map = ClusterMap::from_text(text);
715
716 assert_eq!(map.grapheme_to_cell(0), 0); assert_eq!(map.grapheme_to_cell(2), 2); assert_eq!(map.grapheme_to_cell(4), 6); assert_eq!(map.grapheme_to_cell(5), 7); assert_eq!(map.cell_to_grapheme(0), 0); assert_eq!(map.cell_to_grapheme(2), 2); assert_eq!(map.cell_to_grapheme(3), 2); }
725
726 #[test]
727 fn grapheme_to_byte_and_back() {
728 let text = "A\u{4E16}B";
729 let map = ClusterMap::from_text(text);
730
731 assert_eq!(map.grapheme_to_byte(0), 0); assert_eq!(map.grapheme_to_byte(1), 1); assert_eq!(map.grapheme_to_byte(2), 4); assert_eq!(map.byte_to_grapheme(0), 0); assert_eq!(map.byte_to_grapheme(1), 1); assert_eq!(map.byte_to_grapheme(4), 2); }
739
740 #[test]
745 fn extract_text_for_cells_ascii() {
746 let text = "Hello World";
747 let map = ClusterMap::from_text(text);
748 assert_eq!(map.extract_text_for_cells(text, 0, 5), "Hello");
749 assert_eq!(map.extract_text_for_cells(text, 6, 11), "World");
750 }
751
752 #[test]
753 fn extract_text_for_cells_wide() {
754 let text = "Hi\u{4E16}\u{754C}!";
755 let map = ClusterMap::from_text(text);
756
757 assert_eq!(map.extract_text_for_cells(text, 2, 6), "\u{4E16}\u{754C}");
759
760 assert_eq!(map.extract_text_for_cells(text, 3, 5), "\u{4E16}\u{754C}");
762 }
763
764 #[test]
765 fn extract_text_empty_range() {
766 let text = "Hello";
767 let map = ClusterMap::from_text(text);
768 assert_eq!(map.extract_text_for_cells(text, 3, 3), "");
769 }
770
771 #[test]
776 fn roundtrip_byte_cell_byte() {
777 let texts = [
778 "Hello",
779 "\u{4E16}\u{754C}",
780 "Hi\u{4E16}\u{754C}!",
781 "e\u{0301}f",
782 "\u{05E9}\u{05DC}\u{05D5}\u{05DD}",
783 "",
784 ];
785
786 for text in texts {
787 let map = ClusterMap::from_text(text);
788
789 for entry in map.entries() {
790 let byte = entry.byte_start as usize;
791 let cell = map.byte_to_cell(byte);
792 let back = map.cell_to_byte(cell);
793 assert_eq!(
794 back, byte,
795 "Round-trip failed for text={text:?} byte={byte}"
796 );
797 }
798 }
799 }
800
801 #[test]
802 fn roundtrip_cell_byte_cell() {
803 let texts = [
804 "Hello",
805 "\u{4E16}\u{754C}",
806 "Hi\u{4E16}\u{754C}!",
807 "e\u{0301}f",
808 ];
809
810 for text in texts {
811 let map = ClusterMap::from_text(text);
812
813 for entry in map.entries() {
814 let cell = entry.cell_start as usize;
815 let byte = map.cell_to_byte(cell);
816 let back = map.byte_to_cell(byte);
817 assert_eq!(
818 back, cell,
819 "Round-trip failed for text={text:?} cell={cell}"
820 );
821 }
822 }
823 }
824
825 #[test]
830 fn monotonicity() {
831 let texts = [
832 "Hello World",
833 "Hi\u{4E16}\u{754C}! \u{05E9}\u{05DC}\u{05D5}\u{05DD}",
834 "e\u{0301}\u{0302}",
835 ];
836
837 for text in texts {
838 let map = ClusterMap::from_text(text);
839
840 for window in map.entries().windows(2) {
841 assert!(
842 window[0].byte_start < window[1].byte_start,
843 "Byte monotonicity violated: {:?}",
844 window
845 );
846 assert!(
847 window[0].cell_start < window[1].cell_start,
848 "Cell monotonicity violated: {:?}",
849 window
850 );
851 }
852 }
853 }
854
855 #[test]
860 fn contiguity() {
861 let text = "Hi\u{4E16}\u{754C}!";
862 let map = ClusterMap::from_text(text);
863
864 for window in map.entries().windows(2) {
866 assert_eq!(
867 window[0].byte_end, window[1].byte_start,
868 "Byte gap: {:?}",
869 window
870 );
871 }
872
873 for window in map.entries().windows(2) {
875 assert_eq!(
876 window[0].cell_end(),
877 window[1].cell_start,
878 "Cell gap: {:?}",
879 window
880 );
881 }
882
883 assert_eq!(map.entries()[0].byte_start, 0);
885 assert_eq!(map.entries()[0].cell_start, 0);
886
887 let last = map.entries().last().unwrap();
889 assert_eq!(last.byte_end, map.total_bytes() as u32);
890 assert_eq!(last.cell_end(), map.total_cells() as u32);
891 }
892
893 #[test]
898 fn from_shaped_run_noop() {
899 use crate::script_segmentation::{RunDirection, Script};
900 use crate::shaping::{FontFeatures, NoopShaper, TextShaper};
901
902 let text = "Hi\u{4E16}!";
903 let shaper = NoopShaper;
904 let ff = FontFeatures::default();
905 let run = shaper.shape(text, Script::Latin, RunDirection::Ltr, &ff);
906
907 let map = ClusterMap::from_shaped_run(text, &run);
908
909 let text_map = ClusterMap::from_text(text);
911
912 assert_eq!(map.cluster_count(), text_map.cluster_count());
913 assert_eq!(map.total_cells(), text_map.total_cells());
914 assert_eq!(map.total_bytes(), text_map.total_bytes());
915 }
916
917 #[test]
918 fn from_shaped_run_empty() {
919 use crate::shaping::ShapedRun;
920
921 let map = ClusterMap::from_shaped_run(
922 "",
923 &ShapedRun {
924 glyphs: vec![],
925 total_advance: 0,
926 },
927 );
928 assert!(map.is_empty());
929 }
930}