use unicode_properties::{EmojiStatus, UnicodeEmoji};
use unicode_segmentation::UnicodeSegmentation;
/// One piece of a classified string, as returned by [`classify`].
///
/// Both variants borrow from the string that was classified.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum InlineCluster<'a> {
    /// A run of one or more consecutive clusters that were classified as text.
    Text(&'a str),
    /// A single cluster that was classified as emoji.
    Emoji(EmojiCluster<'a>),
}
/// A cluster classified as emoji, together with the result of classification.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct EmojiCluster<'a> {
    /// The cluster's text, borrowed from the classified string.
    pub text: &'a str,
    /// Presentation style assigned to the cluster.
    pub presentation: EmojiPresentation,
    /// Kind of emoji sequence the cluster was recognised as.
    pub structure: EmojiStructure,
}
/// Presentation style recorded on an [`EmojiCluster`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmojiPresentation {
    /// Text-style presentation.
    Text,
    /// Emoji-style presentation.
    Emoji,
}
/// Kind of emoji sequence a cluster was recognised as.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum EmojiStructure {
    /// An emoji cluster that matched none of the sequence kinds below.
    Single,
    /// A keycap: `base` (a digit, `#` or `*`) in a cluster containing U+20E3.
    KeycapSequence { base: char },
    /// A cluster containing a skin-tone modifier; `base` is the emoji
    /// character reported as its base and `tone` the decoded modifier.
    ModifierSequence { base: char, tone: SkinTone },
    /// A flag; the payload says how the flag is encoded.
    FlagSequence(FlagKind),
    /// A cluster containing a zero-width joiner (U+200D).
    ZwjSequence,
}
/// Skin tone carried by one of the modifier characters U+1F3FB..=U+1F3FF.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SkinTone {
    /// U+1F3FB.
    Light,
    /// U+1F3FC.
    MediumLight,
    /// U+1F3FD.
    Medium,
    /// U+1F3FE.
    MediumDark,
    /// U+1F3FF.
    Dark,
}
/// How a flag emoji is encoded.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FlagKind {
    /// A pair of regional-indicator characters.
    Regional,
    /// U+1F3F4 followed by tag characters.
    Subdivision,
}
pub fn classify(text: &str) -> Vec<InlineCluster<'_>> {
let mut out = Vec::new();
let mut text_start: Option<usize> = None;
let mut text_end = 0usize;
let mut byte_offset = 0usize;
for cluster in text.graphemes(true) {
let cluster_len = cluster.len();
match classify_cluster(cluster) {
ClusterClass::Text => {
if text_start.is_none() {
text_start = Some(byte_offset);
}
text_end = byte_offset + cluster_len;
}
ClusterClass::Emoji {
presentation,
structure,
} => {
if let Some(s) = text_start.take() {
out.push(InlineCluster::Text(&text[s..text_end]));
}
out.push(InlineCluster::Emoji(EmojiCluster {
text: cluster,
presentation,
structure,
}));
}
}
byte_offset += cluster_len;
}
if let Some(s) = text_start {
out.push(InlineCluster::Text(&text[s..text_end]));
}
out
}
/// Internal verdict for a single cluster, produced by `classify_cluster`.
enum ClusterClass {
    /// The cluster is ordinary text.
    Text,
    /// The cluster is an emoji with the given classification.
    Emoji {
        /// Presentation style assigned to the cluster.
        presentation: EmojiPresentation,
        /// Kind of emoji sequence that was recognised.
        structure: EmojiStructure,
    },
}
fn classify_cluster(cluster: &str) -> ClusterClass {
let mut chars = cluster.chars();
let Some(first) = chars.next() else {
return ClusterClass::Text;
};
let chars: Vec<char> = std::iter::once(first).chain(chars).collect();
if first == '\u{1F3F4}' && chars.iter().any(|&c| is_tag(c)) {
return ClusterClass::Emoji {
presentation: EmojiPresentation::Emoji,
structure: EmojiStructure::FlagSequence(FlagKind::Subdivision),
};
}
let ris_count = chars.iter().filter(|&&c| is_regional_indicator(c)).count();
if ris_count == 2 && ris_count == chars.len() {
return ClusterClass::Emoji {
presentation: EmojiPresentation::Emoji,
structure: EmojiStructure::FlagSequence(FlagKind::Regional),
};
}
if chars.contains(&'\u{20E3}') && is_keycap_base(first) {
return ClusterClass::Emoji {
presentation: EmojiPresentation::Emoji,
structure: EmojiStructure::KeycapSequence { base: first },
};
}
let Some(emoji_base) = chars.iter().copied().find(|&c| char_is_emoji(c)) else {
return ClusterClass::Text;
};
let has_zwj = chars.contains(&'\u{200D}');
let has_vs15 = chars.contains(&'\u{FE0E}');
let has_vs16 = chars.contains(&'\u{FE0F}');
if has_zwj {
return ClusterClass::Emoji {
presentation: EmojiPresentation::Emoji,
structure: EmojiStructure::ZwjSequence,
};
}
if let Some(tone) = chars.iter().find_map(|&c| skin_tone_for(c)) {
return ClusterClass::Emoji {
presentation: EmojiPresentation::Emoji,
structure: EmojiStructure::ModifierSequence {
base: emoji_base,
tone,
},
};
}
if has_vs15 {
return ClusterClass::Text;
}
if has_vs16 || has_default_emoji_presentation(emoji_base) {
return ClusterClass::Emoji {
presentation: EmojiPresentation::Emoji,
structure: EmojiStructure::Single,
};
}
ClusterClass::Text
}
fn char_is_emoji(c: char) -> bool {
use EmojiStatus::*;
matches!(
c.emoji_status(),
EmojiPresentation
| EmojiModifierBase
| EmojiPresentationAndModifierBase
| EmojiOther
| EmojiPresentationAndEmojiComponent
| EmojiPresentationAndModifierAndEmojiComponent
| EmojiOtherAndEmojiComponent
)
}
fn has_default_emoji_presentation(c: char) -> bool {
use EmojiStatus::*;
matches!(
c.emoji_status(),
EmojiPresentation
| EmojiPresentationAndModifierBase
| EmojiPresentationAndEmojiComponent
| EmojiPresentationAndModifierAndEmojiComponent
)
}
/// Returns `true` when `c` lies in the regional-indicator range
/// U+1F1E6..=U+1F1FF.
fn is_regional_indicator(c: char) -> bool {
    ('\u{1F1E6}'..='\u{1F1FF}').contains(&c)
}
/// Returns `true` when `c` lies in the tag-character range
/// U+E0020..=U+E007F.
fn is_tag(c: char) -> bool {
    ('\u{E0020}'..='\u{E007F}').contains(&c)
}
/// Returns `true` for the characters accepted as a keycap base: the ASCII
/// digits `0`-`9`, `#` and `*`.
fn is_keycap_base(c: char) -> bool {
    c.is_ascii_digit() || c == '#' || c == '*'
}
/// Maps a skin-tone modifier character (U+1F3FB..=U+1F3FF) to its
/// [`SkinTone`]; returns `None` for every other character.
fn skin_tone_for(c: char) -> Option<SkinTone> {
    // Code point of the first modifier; the five modifiers are contiguous.
    const FIRST_MODIFIER: u32 = 0x1F3FB;
    // Tones in code-point order, so the offset from `FIRST_MODIFIER` indexes them.
    const TONES: [SkinTone; 5] = [
        SkinTone::Light,
        SkinTone::MediumLight,
        SkinTone::Medium,
        SkinTone::MediumDark,
        SkinTone::Dark,
    ];

    // Characters below the range fail the subtraction; characters above it
    // fall outside the table.
    let offset = u32::from(c).checked_sub(FIRST_MODIFIER)?;
    TONES.get(offset as usize).copied()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds the expected emoji entry; every case here expects emoji presentation.
    fn emoji_cluster(text: &str, structure: EmojiStructure) -> InlineCluster<'_> {
        InlineCluster::Emoji(EmojiCluster {
            text,
            presentation: EmojiPresentation::Emoji,
            structure,
        })
    }

    /// Builds the expected entry for a plain single emoji.
    fn single(text: &str) -> InlineCluster<'_> {
        emoji_cluster(text, EmojiStructure::Single)
    }

    /// Asserts that `input` classifies as exactly one text span covering all of it.
    #[track_caller]
    fn assert_plain_text(input: &str) {
        assert_eq!(classify(input), vec![InlineCluster::Text(input)]);
    }

    #[test]
    fn c1_pure_text() {
        assert_plain_text("hello");
    }

    #[test]
    fn c2_emoji_between_text() {
        let expected = vec![
            InlineCluster::Text("hi "),
            single("📞"),
            InlineCluster::Text(" there"),
        ];
        assert_eq!(classify("hi 📞 there"), expected);
    }

    #[test]
    fn c3_default_text_presentation_stays_text() {
        assert_plain_text("\u{260E}");
    }

    #[test]
    fn c4_vs16_promotes_to_emoji() {
        let input = "\u{260E}\u{FE0F}";
        assert_eq!(classify(input), vec![single(input)]);
    }

    #[test]
    fn c5_vs15_forces_text() {
        assert_plain_text("\u{260E}\u{FE0E}");
    }

    #[test]
    fn standalone_digit_is_text() {
        assert_plain_text("1");
    }

    #[test]
    fn multi_digit_is_single_text_span() {
        assert_plain_text("12345");
    }

    #[test]
    fn hash_and_star_alone_are_text() {
        assert_plain_text("#");
        assert_plain_text("*");
    }

    #[test]
    fn footer_page_number_text_is_not_rasterized() {
        let clusters = classify("Seite 1 von 2");
        assert_eq!(clusters, vec![InlineCluster::Text("Seite 1 von 2")]);
        for c in &clusters {
            assert!(
                matches!(c, InlineCluster::Text(_)),
                "no emoji clusters expected, got {c:?}"
            );
        }
    }

    #[test]
    fn iban_is_text_only() {
        assert_plain_text("IBAN: DE50 3705 0299 0000 3812 08");
    }

    #[test]
    fn phone_number_is_text_only() {
        assert_plain_text("0221 – 89 06 37 69");
    }

    #[test]
    fn digit_with_vs16_is_emoji() {
        let input = "1\u{FE0F}";
        assert_eq!(classify(input), vec![single(input)]);
    }

    #[test]
    fn c6_modifier_sequence() {
        let input = "\u{1F44D}\u{1F3FE}";
        let structure = EmojiStructure::ModifierSequence {
            base: '\u{1F44D}',
            tone: SkinTone::MediumDark,
        };
        assert_eq!(classify(input), vec![emoji_cluster(input, structure)]);
    }

    #[test]
    fn c7_zwj_sequence() {
        let text = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}";
        assert_eq!(
            classify(text),
            vec![emoji_cluster(text, EmojiStructure::ZwjSequence)]
        );
    }

    #[test]
    fn c8_regional_flag() {
        let text = "\u{1F1E9}\u{1F1EA}";
        let structure = EmojiStructure::FlagSequence(FlagKind::Regional);
        assert_eq!(classify(text), vec![emoji_cluster(text, structure)]);
    }

    #[test]
    fn c9_subdivision_flag() {
        let text = "\u{1F3F4}\u{E0067}\u{E0062}\u{E0073}\u{E0063}\u{E0074}\u{E007F}";
        let structure = EmojiStructure::FlagSequence(FlagKind::Subdivision);
        assert_eq!(classify(text), vec![emoji_cluster(text, structure)]);
    }

    #[test]
    fn c10_keycap() {
        let text = "1\u{FE0F}\u{20E3}";
        let structure = EmojiStructure::KeycapSequence { base: '1' };
        assert_eq!(classify(text), vec![emoji_cluster(text, structure)]);
    }

    #[test]
    fn c11_adjacent_emojis() {
        assert_eq!(
            classify("\u{1F4DE}\u{1F4E7}"),
            vec![single("\u{1F4DE}"), single("\u{1F4E7}")]
        );
    }

    #[test]
    fn c12_empty_string() {
        assert!(classify("").is_empty());
    }

    #[test]
    fn c13_combining_mark_then_emoji() {
        let text = "a\u{0301}\u{1F4DE}";
        let expected = vec![InlineCluster::Text("a\u{0301}"), single("\u{1F4DE}")];
        assert_eq!(classify(text), expected);
    }

    #[test]
    fn negative_zwj_between_letters_is_text() {
        let text = "a\u{200D}b";
        let clusters = classify(text);
        for cluster in &clusters {
            assert!(
                matches!(cluster, InlineCluster::Text(_)),
                "non-emoji ZWJ pairing must classify as text, got {cluster:?}"
            );
        }
    }

    /// Only checks that classification completes; the result is not inspected.
    #[test]
    fn negative_lone_regional_indicator_does_not_panic() {
        let _ = classify("\u{1F1E9}");
    }
}