/// Returns the number of display columns attributed to a single `char`.
///
/// The checks run in order, and the first match wins:
///
/// 1. Regional indicator symbols (flag halves) count as 2.
/// 2. Zero-width format characters, emoji modifiers/components and combining
///    marks count as 0, so callers can fold them into the preceding cluster.
/// 3. Characters accepted by `is_wide_char` count as 2.
/// 4. Everything else is delegated to the `unicode_width` crate; when that
///    crate reports no width (`None`) the character is counted as 1.
///
/// Only genuine emoji *components* are forced to zero width. Standalone emoji
/// such as U+1F9C0 (cheese wedge) or U+1F9D0..=U+1F9DF (person emoji) are not
/// components and are measured by the fallback like any other emoji.
pub fn grapheme_width(c: char) -> u8 {
    use unicode_width::UnicodeWidthChar;

    match c {
        // Regional indicator symbols A..Z.
        '\u{1F1E6}'..='\u{1F1FF}' => 2,

        // ZWSP, ZWNJ, ZWJ, word joiner, and BOM / zero-width no-break space.
        '\u{200B}'..='\u{200D}' | '\u{2060}' | '\u{FEFF}' => 0,

        // Skin tone modifiers and hair-style emoji components: they only
        // decorate a preceding emoji.
        '\u{1F3FB}'..='\u{1F3FF}' | '\u{1F9B0}'..='\u{1F9B3}' => 0,

        // Combining diacritical marks (basic, supplement, for symbols) and
        // combining half marks.
        '\u{0300}'..='\u{036F}'
        | '\u{1DC0}'..='\u{1DFF}'
        | '\u{20D0}'..='\u{20FF}'
        | '\u{FE20}'..='\u{FE2F}' => 0,

        // Combining kana voiced / semi-voiced sound marks. This arm must stay
        // ahead of the wide check, whose Hiragana range also contains them.
        '\u{3099}'..='\u{309A}' => 0,

        _ if is_wide_char(c) => 2,

        // The crate is expected to report only small column counts, so the
        // narrowing cast is not expected to truncate.
        _ => UnicodeWidthChar::width(c).unwrap_or(1) as u8,
    }
}
/// Reports whether `c` lies in one of the hard-coded East Asian code point
/// ranges that are treated as occupying two display columns.
///
/// This is a plain range lookup; it does not consult any Unicode property
/// tables, so characters outside the listed ranges always yield `false`.
fn is_wide_char(c: char) -> bool {
    // Inclusive (first, last) code point bounds, in ascending order.
    const WIDE_RANGES: &[(u32, u32)] = &[
        (0x2E80, 0x303E),   // CJK radicals .. CJK symbols and punctuation (stops before U+303F)
        (0x3040, 0x309F),   // Hiragana
        (0x30A0, 0x30FF),   // Katakana
        (0x3100, 0x312F),   // Bopomofo
        (0x3130, 0x318F),   // Hangul compatibility jamo
        (0x3190, 0x319F),   // Kanbun
        (0x31A0, 0x31BF),   // Bopomofo extended
        (0x31C0, 0x31EF),   // CJK strokes
        (0x31F0, 0x31FF),   // Katakana phonetic extensions
        (0x3200, 0x32FF),   // Enclosed CJK letters and months
        (0x3300, 0x4DBF),   // CJK compatibility .. CJK extension A
        (0x4E00, 0x9FFF),   // CJK unified ideographs
        (0xA000, 0xA48C),   // Yi syllables
        (0xA490, 0xA4CF),   // Yi radicals
        (0xF900, 0xFAFF),   // CJK compatibility ideographs
        (0x20000, 0x2A6DF), // CJK extension B
        (0x2A700, 0x2B73F), // CJK extension C
        (0x2B740, 0x2B81F), // CJK extension D
        (0x2B820, 0x2CEAF), // CJK extension E
        (0x2CEB0, 0x2EBEF), // CJK extension F
        (0x30000, 0x3134F), // CJK extension G
    ];

    let code_point = u32::from(c);
    WIDE_RANGES
        .iter()
        .any(|&(first, last)| first <= code_point && code_point <= last)
}
/// Splits `text` into display clusters and returns, for each cluster, the pair
/// `(byte offset of its first char, visual column at which it starts)`.
///
/// Clustering rules, as implemented here:
///
/// * Two consecutive regional indicator symbols form one flag cluster that is
///   2 columns wide. An unpaired regional indicator is its own cluster and
///   advances the column by the width `grapheme_width` reports for it.
/// * A character whose `grapheme_width` is 0 (or a zero-width joiner) that does
///   not follow a base character is skipped: it consumes bytes but produces no
///   entry and no columns.
/// * Any other character starts a cluster and then absorbs every following
///   skin tone modifier, zero-width character, and `ZWJ + next char` pair.
///   Each character joined through a ZWJ adds its own `grapheme_width` to the
///   running column, so a ZWJ sequence is measured as the sum of its parts.
///
/// Returns an empty vector for empty input.
pub fn grapheme_indices(text: &str) -> Vec<(usize, usize)> {
    const ZWJ: char = '\u{200D}';

    fn is_regional_indicator(c: char) -> bool {
        matches!(c, '\u{1F1E6}'..='\u{1F1FF}')
    }

    fn is_skin_tone_modifier(c: char) -> bool {
        matches!(c, '\u{1F3FB}'..='\u{1F3FF}')
    }

    // Heuristic pre-allocation carried over unchanged: at most one entry per
    // char, and many inputs mix 1-byte and multi-byte chars.
    let mut result = Vec::with_capacity(text.len() / 2);
    let mut byte_offset = 0usize;
    let mut visual_column = 0usize;
    let mut chars = text.chars().peekable();

    // `while let` rather than `for`: the body also pulls from `chars`.
    while let Some(c) = chars.next() {
        let width = usize::from(grapheme_width(c));

        if is_regional_indicator(c) {
            result.push((byte_offset, visual_column));
            byte_offset += c.len_utf8();
            if let Some(partner) = chars.next_if(|&next_c| is_regional_indicator(next_c)) {
                // A pair of indicators is one flag occupying two columns.
                byte_offset += partner.len_utf8();
                visual_column += 2;
            } else {
                // An unpaired indicator still takes up space; without this the
                // following cluster would be reported at the same column.
                visual_column += width;
            }
            continue;
        }

        // Zero-width characters with no base to attach to are dropped.
        if c == ZWJ || width == 0 {
            byte_offset += c.len_utf8();
            continue;
        }

        result.push((byte_offset, visual_column));
        byte_offset += c.len_utf8();
        visual_column += width;

        // Absorb everything that belongs to the cluster just started.
        while let Some(&next_c) = chars.peek() {
            if next_c == ZWJ {
                chars.next();
                byte_offset += ZWJ.len_utf8();
                // Whatever follows the joiner is pulled into this cluster.
                if let Some(joined) = chars.next() {
                    byte_offset += joined.len_utf8();
                    visual_column += usize::from(grapheme_width(joined));
                }
            } else if is_skin_tone_modifier(next_c) || grapheme_width(next_c) == 0 {
                chars.next();
                byte_offset += next_c.len_utf8();
            } else {
                break;
            }
        }
    }

    result
}
/// Unit tests for `grapheme_width` and `grapheme_indices`.
///
/// Invisible characters (zero-width joiners, modifiers) are always written as
/// `\u{...}` escapes so that the test inputs are unambiguous in source form.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_grapheme_width_ascii() {
        assert_eq!(grapheme_width('a'), 1);
        assert_eq!(grapheme_width('Z'), 1);
        assert_eq!(grapheme_width(' '), 1);
        assert_eq!(grapheme_width('!'), 1);
    }

    #[test]
    fn test_grapheme_width_zero_width() {
        assert_eq!(grapheme_width('\u{200D}'), 0);
        assert_eq!(grapheme_width('\u{200C}'), 0);
        assert_eq!(grapheme_width('\u{200B}'), 0);
        assert_eq!(grapheme_width('\u{FEFF}'), 0);
        assert_eq!(grapheme_width('\u{2060}'), 0);
    }

    #[test]
    fn test_grapheme_width_combining_marks() {
        assert_eq!(grapheme_width('\u{0300}'), 0);
        assert_eq!(grapheme_width('\u{0301}'), 0);
        assert_eq!(grapheme_width('\u{0327}'), 0);
        assert_eq!(grapheme_width('\u{036F}'), 0);
        // Kana voiced / semi-voiced sound marks.
        assert_eq!(grapheme_width('\u{3099}'), 0);
        assert_eq!(grapheme_width('\u{309A}'), 0);
    }

    #[test]
    fn test_grapheme_width_tone_modifiers() {
        assert_eq!(grapheme_width('\u{1F3FB}'), 0);
        assert_eq!(grapheme_width('\u{1F3FC}'), 0);
        assert_eq!(grapheme_width('\u{1F3FD}'), 0);
        assert_eq!(grapheme_width('\u{1F3FE}'), 0);
        assert_eq!(grapheme_width('\u{1F3FF}'), 0);
    }

    #[test]
    fn test_grapheme_width_regional_indicators() {
        assert_eq!(grapheme_width('\u{1F1FA}'), 2);
        assert_eq!(grapheme_width('\u{1F1EB}'), 2);
    }

    #[test]
    fn test_grapheme_width_cjk() {
        assert_eq!(grapheme_width('日'), 2);
        assert_eq!(grapheme_width('本'), 2);
        assert_eq!(grapheme_width('語'), 2);
        assert_eq!(grapheme_width('あ'), 2);
        assert_eq!(grapheme_width('ア'), 2);
    }

    #[test]
    fn test_grapheme_indices_basic() {
        let indices = grapheme_indices("hello");
        assert_eq!(indices.len(), 5);
        // Byte offsets.
        assert_eq!(indices[0].0, 0);
        assert_eq!(indices[1].0, 1);
        assert_eq!(indices[2].0, 2);
        assert_eq!(indices[3].0, 3);
        assert_eq!(indices[4].0, 4);
        // Visual columns.
        assert_eq!(indices[0].1, 0);
        assert_eq!(indices[1].1, 1);
        assert_eq!(indices[2].1, 2);
        assert_eq!(indices[3].1, 3);
        assert_eq!(indices[4].1, 4);
    }

    #[test]
    fn test_grapheme_indices_cjk() {
        let indices = grapheme_indices("日本語");
        assert_eq!(indices.len(), 3);
        assert_eq!(indices[0].0, 0);
        assert_eq!(indices[0].1, 0);
        assert_eq!(indices[1].1, 2);
        assert_eq!(indices[2].1, 4);
    }

    #[test]
    fn test_grapheme_indices_combining_marks() {
        let text = "e\u{0301}";
        let indices = grapheme_indices(text);
        assert_eq!(indices.len(), 1);
        assert_eq!(indices[0].0, 0);
        assert_eq!(indices[0].1, 0);
    }

    #[test]
    fn test_grapheme_indices_zwj_sequence() {
        // Family emoji: man + ZWJ + woman + ZWJ + girl.
        let text = "👨\u{200D}👩\u{200D}👧";
        let indices = grapheme_indices(text);
        assert_eq!(indices.len(), 1);
        assert_eq!(indices[0].0, 0);
        assert_eq!(indices[0].1, 0);
    }

    #[test]
    fn test_grapheme_indices_zwj_sequence_byte_offset() {
        // Three 4-byte emoji and two 3-byte joiners precede the 'a'.
        let text = "👨\u{200D}👩\u{200D}👧a";
        let indices = grapheme_indices(text);
        assert_eq!(indices.len(), 2);
        assert_eq!(indices[1].0, 18);
    }

    #[test]
    fn test_grapheme_indices_flag_emoji() {
        let text = "\u{1F1FA}\u{1F1F8}";
        let indices = grapheme_indices(text);
        assert_eq!(indices.len(), 1);
        assert_eq!(indices[0].0, 0);
        assert_eq!(indices[0].1, 0);
    }

    #[test]
    fn test_grapheme_indices_mixed() {
        let indices = grapheme_indices("Hello 世界 👋");
        assert_eq!(indices.len(), 10);
        assert_eq!(indices[0].1, 0);
        assert_eq!(indices[1].1, 1);
        assert_eq!(indices[2].1, 2);
        assert_eq!(indices[3].1, 3);
        assert_eq!(indices[4].1, 4);
        assert_eq!(indices[5].1, 5);
        assert_eq!(indices[6].1, 6);
        assert_eq!(indices[7].1, 8);
        assert_eq!(indices[8].1, 10);
        assert_eq!(indices[9].1, 11);
    }

    #[test]
    fn test_grapheme_indices_empty() {
        let indices = grapheme_indices("");
        assert!(indices.is_empty());
    }

    #[test]
    fn test_grapheme_indices_skin_tone_modifier() {
        let text = "👍\u{1F3FB}";
        let indices = grapheme_indices(text);
        assert_eq!(indices.len(), 1);
        assert_eq!(indices[0], (0, 0));
    }

    #[test]
    fn test_grapheme_indices_multiple_flags() {
        let text = "\u{1F1FA}\u{1F1F8} \u{1F1EB}\u{1F1F7}";
        let indices = grapheme_indices(text);
        assert_eq!(indices.len(), 3);
        assert_eq!(indices[0].1, 0);
        assert_eq!(indices[1].1, 2);
        assert_eq!(indices[2].1, 3);
    }
}