#![forbid(unsafe_code)]
use crate::shaping::ShapedRun;
use unicode_segmentation::UnicodeSegmentation;
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ClusterEntry {
pub byte_start: u32,
pub byte_end: u32,
pub grapheme_index: u32,
pub cell_start: u32,
pub cell_width: u8,
}
impl ClusterEntry {
#[inline]
pub fn byte_range(&self) -> std::ops::Range<usize> {
self.byte_start as usize..self.byte_end as usize
}
#[inline]
pub fn cell_range(&self) -> std::ops::Range<usize> {
self.cell_start as usize..(self.cell_start as usize + self.cell_width as usize)
}
#[inline]
pub fn cell_end(&self) -> u32 {
self.cell_start + self.cell_width as u32
}
}
#[derive(Debug, Clone)]
pub struct ClusterMap {
entries: Vec<ClusterEntry>,
total_cells: u32,
total_bytes: u32,
}
impl ClusterMap {
pub fn from_text(text: &str) -> Self {
if text.is_empty() {
return Self {
entries: Vec::new(),
total_cells: 0,
total_bytes: 0,
};
}
let mut entries = Vec::new();
let mut cell_offset = 0u32;
for (grapheme_idx, (byte_offset, grapheme)) in text.grapheme_indices(true).enumerate() {
let width = crate::grapheme_width(grapheme) as u8;
let byte_end = byte_offset + grapheme.len();
entries.push(ClusterEntry {
byte_start: byte_offset as u32,
byte_end: byte_end as u32,
grapheme_index: grapheme_idx as u32,
cell_start: cell_offset,
cell_width: width,
});
cell_offset += width as u32;
}
Self {
entries,
total_cells: cell_offset,
total_bytes: text.len() as u32,
}
}
pub fn from_shaped_run(text: &str, run: &ShapedRun) -> Self {
if text.is_empty() || run.is_empty() {
return Self {
entries: Vec::new(),
total_cells: 0,
total_bytes: 0,
};
}
let mut entries = Vec::new();
let mut cell_offset = 0u32;
let mut grapheme_idx = 0u32;
let mut i = 0;
while i < run.glyphs.len() {
let cluster_byte = run.glyphs[i].cluster as usize;
let mut cluster_advance = 0i32;
let mut j = i;
while j < run.glyphs.len() && run.glyphs[j].cluster as usize == cluster_byte {
cluster_advance += run.glyphs[j].x_advance;
j += 1;
}
let next_byte = if j < run.glyphs.len() {
run.glyphs[j].cluster as usize
} else {
text.len()
};
let width = cluster_advance.unsigned_abs().min(255) as u8;
entries.push(ClusterEntry {
byte_start: cluster_byte as u32,
byte_end: next_byte as u32,
grapheme_index: grapheme_idx,
cell_start: cell_offset,
cell_width: width,
});
cell_offset += width as u32;
grapheme_idx += 1;
i = j;
}
Self {
entries,
total_cells: cell_offset,
total_bytes: text.len() as u32,
}
}
pub fn byte_to_cell(&self, byte_offset: usize) -> usize {
if self.entries.is_empty() || byte_offset >= self.total_bytes as usize {
return self.total_cells as usize;
}
match self
.entries
.binary_search_by_key(&(byte_offset as u32), |e| e.byte_start)
{
Ok(idx) => self.entries[idx].cell_start as usize,
Err(idx) => {
if idx > 0 {
self.entries[idx - 1].cell_start as usize
} else {
0
}
}
}
}
pub fn byte_to_entry(&self, byte_offset: usize) -> Option<&ClusterEntry> {
if self.entries.is_empty() {
return None;
}
match self
.entries
.binary_search_by_key(&(byte_offset as u32), |e| e.byte_start)
{
Ok(idx) => Some(&self.entries[idx]),
Err(idx) => {
if idx > 0 && (byte_offset as u32) < self.entries[idx - 1].byte_end {
Some(&self.entries[idx - 1])
} else {
None
}
}
}
}
pub fn byte_range_to_cell_range(&self, byte_start: usize, byte_end: usize) -> (usize, usize) {
if self.entries.is_empty() || byte_start >= byte_end {
return (0, 0);
}
let start_cell = self.byte_to_cell(byte_start);
let end_cell = if byte_end >= self.total_bytes as usize {
self.total_cells as usize
} else {
match self
.entries
.binary_search_by_key(&(byte_end as u32), |e| e.byte_start)
{
Ok(idx) => self.entries[idx].cell_start as usize,
Err(idx) => {
if idx > 0 {
self.entries[idx - 1].cell_end() as usize
} else {
0
}
}
}
};
(start_cell, end_cell)
}
pub fn cell_to_byte(&self, cell_col: usize) -> usize {
if self.entries.is_empty() || cell_col >= self.total_cells as usize {
return self.total_bytes as usize;
}
match self
.entries
.binary_search_by_key(&(cell_col as u32), |e| e.cell_start)
{
Ok(idx) => self.entries[idx].byte_start as usize,
Err(idx) => {
if idx > 0 {
self.entries[idx - 1].byte_start as usize
} else {
0
}
}
}
}
pub fn cell_to_entry(&self, cell_col: usize) -> Option<&ClusterEntry> {
if self.entries.is_empty() || cell_col >= self.total_cells as usize {
return None;
}
match self
.entries
.binary_search_by_key(&(cell_col as u32), |e| e.cell_start)
{
Ok(idx) => Some(&self.entries[idx]),
Err(idx) => {
if idx > 0 {
let entry = &self.entries[idx - 1];
if (cell_col as u32) < entry.cell_end() {
Some(entry)
} else {
None
}
} else {
None
}
}
}
}
pub fn cell_range_to_byte_range(&self, cell_start: usize, cell_end: usize) -> (usize, usize) {
if self.entries.is_empty() || cell_start >= cell_end {
return (0, 0);
}
let start_byte = self.cell_to_byte(cell_start);
let end_byte = if cell_end >= self.total_cells as usize {
self.total_bytes as usize
} else {
match self.cell_to_entry(cell_end.saturating_sub(1)) {
Some(entry) => entry.byte_end as usize,
None => self.total_bytes as usize,
}
};
(start_byte, end_byte.max(start_byte))
}
pub fn grapheme_to_cell(&self, grapheme_index: usize) -> usize {
self.entries
.get(grapheme_index)
.map_or(self.total_cells as usize, |e| e.cell_start as usize)
}
pub fn cell_to_grapheme(&self, cell_col: usize) -> usize {
self.cell_to_entry(cell_col)
.map_or(self.entries.len(), |e| e.grapheme_index as usize)
}
pub fn grapheme_to_byte(&self, grapheme_index: usize) -> usize {
self.entries
.get(grapheme_index)
.map_or(self.total_bytes as usize, |e| e.byte_start as usize)
}
pub fn byte_to_grapheme(&self, byte_offset: usize) -> usize {
self.byte_to_entry(byte_offset)
.map_or(self.entries.len(), |e| e.grapheme_index as usize)
}
#[inline]
pub fn total_cells(&self) -> usize {
self.total_cells as usize
}
#[inline]
pub fn total_bytes(&self) -> usize {
self.total_bytes as usize
}
#[inline]
pub fn cluster_count(&self) -> usize {
self.entries.len()
}
#[inline]
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
#[inline]
pub fn entries(&self) -> &[ClusterEntry] {
&self.entries
}
#[inline]
pub fn get(&self, grapheme_index: usize) -> Option<&ClusterEntry> {
self.entries.get(grapheme_index)
}
pub fn extract_text_for_cells<'a>(
&self,
source: &'a str,
cell_start: usize,
cell_end: usize,
) -> &'a str {
let (byte_start, byte_end) = self.cell_range_to_byte_range(cell_start, cell_end);
if byte_start >= source.len() {
return "";
}
let end = byte_end.min(source.len());
&source[byte_start..end]
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty_text() {
let map = ClusterMap::from_text("");
assert!(map.is_empty());
assert_eq!(map.total_cells(), 0);
assert_eq!(map.total_bytes(), 0);
assert_eq!(map.cluster_count(), 0);
}
#[test]
fn ascii_text() {
let map = ClusterMap::from_text("Hello");
assert_eq!(map.cluster_count(), 5);
assert_eq!(map.total_cells(), 5);
assert_eq!(map.total_bytes(), 5);
for i in 0..5 {
let e = map.get(i).unwrap();
assert_eq!(e.byte_start, i as u32);
assert_eq!(e.byte_end, (i + 1) as u32);
assert_eq!(e.cell_start, i as u32);
assert_eq!(e.cell_width, 1);
}
}
#[test]
fn wide_chars() {
let text = "\u{4E16}\u{754C}";
let map = ClusterMap::from_text(text);
assert_eq!(map.cluster_count(), 2);
assert_eq!(map.total_bytes(), 6);
assert_eq!(map.total_cells(), 4);
let e0 = map.get(0).unwrap();
assert_eq!(e0.byte_start, 0);
assert_eq!(e0.byte_end, 3);
assert_eq!(e0.cell_start, 0);
assert_eq!(e0.cell_width, 2);
let e1 = map.get(1).unwrap();
assert_eq!(e1.byte_start, 3);
assert_eq!(e1.byte_end, 6);
assert_eq!(e1.cell_start, 2);
assert_eq!(e1.cell_width, 2);
}
#[test]
fn mixed_ascii_and_wide() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
assert_eq!(map.cluster_count(), 5);
assert_eq!(map.total_bytes(), 9); assert_eq!(map.total_cells(), 7);
assert_eq!(map.get(0).unwrap().cell_start, 0); assert_eq!(map.get(1).unwrap().cell_start, 1); assert_eq!(map.get(2).unwrap().cell_start, 2); assert_eq!(map.get(3).unwrap().cell_start, 4); assert_eq!(map.get(4).unwrap().cell_start, 6); }
#[test]
fn combining_marks() {
let text = "e\u{0301}";
let map = ClusterMap::from_text(text);
assert_eq!(map.cluster_count(), 1);
assert_eq!(map.total_bytes(), 3); assert_eq!(map.total_cells(), 1);
let e = map.get(0).unwrap();
assert_eq!(e.byte_start, 0);
assert_eq!(e.byte_end, 3);
assert_eq!(e.cell_width, 1);
}
#[test]
fn byte_to_cell_ascii() {
let map = ClusterMap::from_text("Hello");
for i in 0..5 {
assert_eq!(map.byte_to_cell(i), i);
}
assert_eq!(map.byte_to_cell(5), 5); }
#[test]
fn byte_to_cell_wide() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
assert_eq!(map.byte_to_cell(0), 0); assert_eq!(map.byte_to_cell(1), 1); assert_eq!(map.byte_to_cell(2), 2); assert_eq!(map.byte_to_cell(5), 4); assert_eq!(map.byte_to_cell(8), 6); }
#[test]
fn byte_to_cell_mid_cluster_snaps() {
let text = "\u{4E16}"; let map = ClusterMap::from_text(text);
assert_eq!(map.byte_to_cell(0), 0);
assert_eq!(map.byte_to_cell(1), 0); assert_eq!(map.byte_to_cell(2), 0); }
#[test]
fn byte_to_entry() {
let text = "AB\u{4E16}C";
let map = ClusterMap::from_text(text);
let e = map.byte_to_entry(0).unwrap();
assert_eq!(e.byte_start, 0);
let e = map.byte_to_entry(2).unwrap();
assert_eq!(e.byte_start, 2);
let e = map.byte_to_entry(3).unwrap();
assert_eq!(e.byte_start, 2);
assert!(map.byte_to_entry(100).is_none());
}
#[test]
fn cell_to_byte_ascii() {
let map = ClusterMap::from_text("Hello");
for i in 0..5 {
assert_eq!(map.cell_to_byte(i), i);
}
assert_eq!(map.cell_to_byte(5), 5); }
#[test]
fn cell_to_byte_wide() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
assert_eq!(map.cell_to_byte(0), 0); assert_eq!(map.cell_to_byte(1), 1); assert_eq!(map.cell_to_byte(2), 2); assert_eq!(map.cell_to_byte(3), 2); assert_eq!(map.cell_to_byte(4), 5); assert_eq!(map.cell_to_byte(5), 5); assert_eq!(map.cell_to_byte(6), 8); }
#[test]
fn cell_to_entry_continuation() {
let text = "\u{4E16}"; let map = ClusterMap::from_text(text);
let e0 = map.cell_to_entry(0).unwrap();
let e1 = map.cell_to_entry(1).unwrap();
assert_eq!(e0, e1);
assert_eq!(e0.byte_start, 0);
assert_eq!(e0.cell_width, 2);
}
#[test]
fn byte_range_to_cell_range_ascii() {
let map = ClusterMap::from_text("Hello World");
assert_eq!(map.byte_range_to_cell_range(0, 5), (0, 5)); assert_eq!(map.byte_range_to_cell_range(6, 11), (6, 11)); }
#[test]
fn byte_range_to_cell_range_wide() {
let text = "Hi\u{4E16}\u{754C}!"; let map = ClusterMap::from_text(text);
assert_eq!(map.byte_range_to_cell_range(2, 8), (2, 6));
}
#[test]
fn cell_range_to_byte_range_ascii() {
let map = ClusterMap::from_text("Hello World");
assert_eq!(map.cell_range_to_byte_range(0, 5), (0, 5));
}
#[test]
fn cell_range_to_byte_range_wide() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
assert_eq!(map.cell_range_to_byte_range(2, 6), (2, 8));
assert_eq!(map.cell_range_to_byte_range(3, 5), (2, 8));
}
#[test]
fn grapheme_to_cell_and_back() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
assert_eq!(map.grapheme_to_cell(0), 0); assert_eq!(map.grapheme_to_cell(2), 2); assert_eq!(map.grapheme_to_cell(4), 6); assert_eq!(map.grapheme_to_cell(5), 7);
assert_eq!(map.cell_to_grapheme(0), 0); assert_eq!(map.cell_to_grapheme(2), 2); assert_eq!(map.cell_to_grapheme(3), 2); }
#[test]
fn grapheme_to_byte_and_back() {
let text = "A\u{4E16}B";
let map = ClusterMap::from_text(text);
assert_eq!(map.grapheme_to_byte(0), 0); assert_eq!(map.grapheme_to_byte(1), 1); assert_eq!(map.grapheme_to_byte(2), 4);
assert_eq!(map.byte_to_grapheme(0), 0); assert_eq!(map.byte_to_grapheme(1), 1); assert_eq!(map.byte_to_grapheme(4), 2); }
#[test]
fn extract_text_for_cells_ascii() {
let text = "Hello World";
let map = ClusterMap::from_text(text);
assert_eq!(map.extract_text_for_cells(text, 0, 5), "Hello");
assert_eq!(map.extract_text_for_cells(text, 6, 11), "World");
}
#[test]
fn extract_text_for_cells_wide() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
assert_eq!(map.extract_text_for_cells(text, 2, 6), "\u{4E16}\u{754C}");
assert_eq!(map.extract_text_for_cells(text, 3, 5), "\u{4E16}\u{754C}");
}
#[test]
fn extract_text_empty_range() {
let text = "Hello";
let map = ClusterMap::from_text(text);
assert_eq!(map.extract_text_for_cells(text, 3, 3), "");
}
#[test]
fn roundtrip_byte_cell_byte() {
let texts = [
"Hello",
"\u{4E16}\u{754C}",
"Hi\u{4E16}\u{754C}!",
"e\u{0301}f",
"\u{05E9}\u{05DC}\u{05D5}\u{05DD}",
"",
];
for text in texts {
let map = ClusterMap::from_text(text);
for entry in map.entries() {
let byte = entry.byte_start as usize;
let cell = map.byte_to_cell(byte);
let back = map.cell_to_byte(cell);
assert_eq!(
back, byte,
"Round-trip failed for text={text:?} byte={byte}"
);
}
}
}
#[test]
fn roundtrip_cell_byte_cell() {
let texts = [
"Hello",
"\u{4E16}\u{754C}",
"Hi\u{4E16}\u{754C}!",
"e\u{0301}f",
];
for text in texts {
let map = ClusterMap::from_text(text);
for entry in map.entries() {
let cell = entry.cell_start as usize;
let byte = map.cell_to_byte(cell);
let back = map.byte_to_cell(byte);
assert_eq!(
back, cell,
"Round-trip failed for text={text:?} cell={cell}"
);
}
}
}
#[test]
fn monotonicity() {
let texts = [
"Hello World",
"Hi\u{4E16}\u{754C}! \u{05E9}\u{05DC}\u{05D5}\u{05DD}",
"e\u{0301}\u{0302}",
];
for text in texts {
let map = ClusterMap::from_text(text);
for window in map.entries().windows(2) {
assert!(
window[0].byte_start < window[1].byte_start,
"Byte monotonicity violated: {:?}",
window
);
assert!(
window[0].cell_start < window[1].cell_start,
"Cell monotonicity violated: {:?}",
window
);
}
}
}
#[test]
fn contiguity() {
let text = "Hi\u{4E16}\u{754C}!";
let map = ClusterMap::from_text(text);
for window in map.entries().windows(2) {
assert_eq!(
window[0].byte_end, window[1].byte_start,
"Byte gap: {:?}",
window
);
}
for window in map.entries().windows(2) {
assert_eq!(
window[0].cell_end(),
window[1].cell_start,
"Cell gap: {:?}",
window
);
}
assert_eq!(map.entries()[0].byte_start, 0);
assert_eq!(map.entries()[0].cell_start, 0);
let last = map.entries().last().unwrap();
assert_eq!(last.byte_end, map.total_bytes() as u32);
assert_eq!(last.cell_end(), map.total_cells() as u32);
}
#[test]
fn from_shaped_run_noop() {
use crate::script_segmentation::{RunDirection, Script};
use crate::shaping::{FontFeatures, NoopShaper, TextShaper};
let text = "Hi\u{4E16}!";
let shaper = NoopShaper;
let ff = FontFeatures::default();
let run = shaper.shape(text, Script::Latin, RunDirection::Ltr, &ff);
let map = ClusterMap::from_shaped_run(text, &run);
let text_map = ClusterMap::from_text(text);
assert_eq!(map.cluster_count(), text_map.cluster_count());
assert_eq!(map.total_cells(), text_map.total_cells());
assert_eq!(map.total_bytes(), text_map.total_bytes());
}
#[test]
fn from_shaped_run_empty() {
use crate::shaping::ShapedRun;
let map = ClusterMap::from_shaped_run(
"",
&ShapedRun {
glyphs: vec![],
total_advance: 0,
},
);
assert!(map.is_empty());
}
#[test]
fn from_shaped_run_ligature_cluster_boundaries() {
use crate::shaping::{ShapedGlyph, ShapedRun};
let text = "file";
let run = ShapedRun {
glyphs: vec![
ShapedGlyph {
glyph_id: 1,
cluster: 0, x_advance: 2,
y_advance: 0,
x_offset: 0,
y_offset: 0,
},
ShapedGlyph {
glyph_id: 2,
cluster: 2,
x_advance: 1,
y_advance: 0,
x_offset: 0,
y_offset: 0,
},
ShapedGlyph {
glyph_id: 3,
cluster: 3,
x_advance: 1,
y_advance: 0,
x_offset: 0,
y_offset: 0,
},
],
total_advance: 4,
};
let map = ClusterMap::from_shaped_run(text, &run);
assert_eq!(map.cluster_count(), 3);
assert_eq!(map.total_cells(), 4);
assert_eq!(map.byte_to_cell(0), 0);
assert_eq!(map.byte_to_cell(1), 0);
assert_eq!(map.byte_to_cell(2), 2);
assert_eq!(map.cell_to_byte(1), 0);
assert_eq!(map.cell_to_byte(2), 2);
assert_eq!(map.extract_text_for_cells(text, 0, 2), "fi");
}
}