use crate::tables;
/// U+200D ZERO WIDTH JOINER, the code point that glues emoji into a single joined sequence.
pub(crate) const ZWJ: char = '\u{200D}';
/// U+FE0F VARIATION SELECTOR-16 (requests emoji presentation of the preceding character).
pub(crate) const VS16: char = '\u{FE0F}';
/// U+FE0E VARIATION SELECTOR-15 (requests text presentation of the preceding character).
pub(crate) const VS15: char = '\u{FE0E}';
/// Size in bytes of the scratch buffer that `encode_key_into` writes lookup keys into
/// (test builds only).
#[cfg(test)]
const KEY_BUF_CAP: usize = 64; #[cfg(test)]
// Compile-time guard: the key buffer must hold at least `MAX_WINDOW * 6 - 1` bytes.
// NOTE(review): that bound looks like "5 hex digits plus one `_` separator per code point,
// no trailing separator". `encode_key_into` emits 6 digits for code points >= U+100000,
// which this bound does not cover — confirm such code points never reach the encoder.
const _: () = assert!(KEY_BUF_CAP >= MAX_WINDOW * 6 - 1);
/// Writes the lookup key for `cps` into `buf` and returns the number of bytes written.
///
/// Each code point is rendered as upper-case hexadecimal, zero-padded to four digits
/// (five or six digits for larger code points), and consecutive code points are joined
/// with `_`, e.g. `1F468_200D_1F469`.
///
/// # Panics
///
/// Panics (index out of bounds) if the key does not fit into `buf`.
#[cfg(test)]
fn encode_key_into(buf: &mut [u8; KEY_BUF_CAP], cps: &[char]) -> usize {
    const HEX_UPPER: &[u8; 16] = b"0123456789ABCDEF";

    let mut written = 0usize;
    for (index, &ch) in cps.iter().enumerate() {
        // Separator goes between code points, never before the first one.
        if index != 0 {
            buf[written] = b'_';
            written += 1;
        }

        let value = ch as u32;
        let width: u32 = match value {
            0..=0xFFFF => 4,
            0x1_0000..=0xF_FFFF => 5,
            _ => 6,
        };

        // Most significant nibble first.
        for shift in (0..width).rev().map(|digit| digit * 4) {
            let nibble = ((value >> shift) & 0xF) as usize;
            buf[written] = HEX_UPPER[nibble];
            written += 1;
        }
    }
    written
}
/// Tries to match an emoji at the start of `window`.
///
/// Returns the matched table name together with the number of characters of `window`
/// the match covers, or `None` when nothing matches.
///
/// If the first character is a multi-sequence starter, a sequence match from
/// `tables::match_emoji_sequence` is preferred. Otherwise (or when no sequence matches)
/// the first character is looked up on its own; a `VS16`/`VS15` directly after it is
/// counted as part of the match.
///
/// `window` must be non-empty; this is checked with a `debug_assert!` and indexing
/// panics on an empty slice.
pub(crate) fn match_emoji_at(window: &[char]) -> Option<(&'static str, usize)> {
    debug_assert!(
        !window.is_empty(),
        "match_emoji_at requires a non-empty window"
    );
    let first = window[0];

    let sequence_hit: Option<(&'static str, usize)> = if tables::is_emoji_multi_starter(first) {
        tables::match_emoji_sequence(window)
    } else {
        None
    };
    if sequence_hit.is_some() {
        return sequence_hit;
    }

    let name = tables::lookup_emoji_single(first)?;
    // A variation selector right after the emoji belongs to it and is swallowed too.
    let selector_follows = window
        .get(1)
        .is_some_and(|&next| next == VS16 || next == VS15);
    Some((name, if selector_follows { 2 } else { 1 }))
}
/// Key-string based reference matcher used by the tests to cross-check `match_emoji_at`.
///
/// For a multi-sequence starter it encodes up to `MAX_WINDOW` leading characters into a
/// `_`-joined hex key and probes `tables::lookup_emoji_multi` with the longest prefix
/// first, skipping prefixes whose last character is `ZWJ`, `VS16` or `VS15`. If nothing
/// matches it falls back to the single-character table, counting a directly following
/// `VS16`/`VS15` as consumed.
///
/// Indexing panics when `window` is empty.
#[cfg(test)]
fn match_emoji_at_reference(window: &[char]) -> Option<(&'static str, usize)> {
    let first = window[0];

    if tables::is_emoji_multi_starter(first) {
        let span = window.len().min(MAX_WINDOW);
        let mut key = [0u8; KEY_BUF_CAP];
        let key_len = encode_key_into(&mut key, &window[..span]);

        // Byte offsets of the `_` separators: entry `k` is where the key of the first
        // `k + 1` characters ends. There are `span - 1` of them, so they always fit.
        let mut separators = [0usize; MAX_WINDOW];
        let underscore_offsets = key[..key_len]
            .iter()
            .enumerate()
            .filter(|&(_, &byte)| byte == b'_')
            .map(|(offset, _)| offset);
        for (slot, offset) in separators.iter_mut().zip(underscore_offsets) {
            *slot = offset;
        }

        // Longest candidate first.
        for take in (2..=span).rev() {
            let tail = window[take - 1];
            if tail == ZWJ || tail == VS16 || tail == VS15 {
                continue;
            }
            let key_end = if take == span {
                key_len
            } else {
                separators[take - 1]
            };
            let candidate = std::str::from_utf8(&key[..key_end]).unwrap_or("");
            if let Some(name) = tables::lookup_emoji_multi(candidate) {
                return Some((name, take));
            }
        }
    }

    tables::lookup_emoji_single(first).map(|name| {
        let selector_follows = window.len() > 1 && (window[1] == VS16 || window[1] == VS15);
        (name, if selector_follows { 2 } else { 1 })
    })
}
/// Fixed-capacity look-ahead window over the characters of a string.
///
/// Up to `MAX_WINDOW` upcoming characters are held in `buf[..len]`; characters that have
/// not been buffered yet are pulled from `rest` as the window advances.
pub(crate) struct CharWindow<'a> {
    /// Buffered characters; only `buf[..len]` is meaningful.
    buf: [char; MAX_WINDOW],
    /// Number of valid characters at the front of `buf`.
    len: usize,
    /// Source of the characters that are not buffered yet.
    rest: std::str::Chars<'a>,
}

/// Capacity of the window, taken from `tables::max_emoji_seq_len()`.
const MAX_WINDOW: usize = tables::max_emoji_seq_len();

impl<'a> CharWindow<'a> {
    /// Creates a window pre-filled with up to `MAX_WINDOW` characters from `chars`.
    pub(crate) fn new(mut chars: std::str::Chars<'a>) -> Self {
        let mut buf = ['\0'; MAX_WINDOW];
        let len = Self::refill(&mut buf, 0, &mut chars);
        CharWindow {
            buf,
            len,
            rest: chars,
        }
    }

    /// Returns the character at the front of the window, or `None` once the input is
    /// exhausted.
    #[inline]
    pub(crate) fn current(&self) -> Option<char> {
        self.as_slice().first().copied()
    }

    /// Returns the buffered characters, front of the window first.
    #[inline]
    pub(crate) fn as_slice(&self) -> &[char] {
        &self.buf[..self.len]
    }

    /// Drops the first `n` characters and tops the window up from the remaining input.
    ///
    /// `n` must satisfy `0 < n <= len`; this is checked with a `debug_assert!`.
    pub(crate) fn advance(&mut self, n: usize) {
        debug_assert!(n > 0 && n <= self.len);
        // Slide the surviving characters to the front, then refill behind them.
        self.buf.copy_within(n..self.len, 0);
        self.len = Self::refill(&mut self.buf, self.len - n, &mut self.rest);
    }

    /// Fills `buf[start..]` from `source` until the buffer is full or `source` runs dry,
    /// and returns the new number of valid characters.
    ///
    /// `source` is only polled while a free slot exists, so no character is lost.
    fn refill(
        buf: &mut [char; MAX_WINDOW],
        start: usize,
        source: &mut std::str::Chars<'a>,
    ) -> usize {
        let mut filled = start;
        while filled < MAX_WINDOW {
            let Some(c) = source.next() else { break };
            buf[filled] = c;
            filled += 1;
        }
        filled
    }
}
/// Returns `true` when `ch` falls into one of the code point ranges this module treats
/// as emoji-related:
///
/// * `U+2600..=U+27BF`
/// * `U+2B50..=U+2B55`
/// * `U+FE00..=U+FE0F` (variation selectors)
/// * `U+1F000..=U+1FAFF`
/// * `U+1FC00..=U+1FFFF`
/// * `U+E0020..=U+E007F` (tag characters)
pub(crate) fn is_emoji_codepoint(ch: char) -> bool {
    const EMOJI_RANGES: [(u32, u32); 6] = [
        (0x2600, 0x27BF),
        (0x2B50, 0x2B55),
        (0xFE00, 0xFE0F),
        (0x1F000, 0x1FAFF),
        (0x1FC00, 0x1FFFF),
        (0xE0020, 0xE007F),
    ];

    let code = u32::from(ch);
    EMOJI_RANGES
        .iter()
        .any(|&(low, high)| (low..=high).contains(&code))
}
/// Returns `true` for code points that only modify or join a neighbouring emoji:
/// the zero width joiner (`U+200D`), variation selectors 15 and 16
/// (`U+FE0E..=U+FE0F`), the combining enclosing keycap (`U+20E3`), the skin tone
/// modifiers (`U+1F3FB..=U+1F3FF`) and tag characters (`U+E0020..=U+E007F`).
pub(crate) fn is_emoji_modifier(ch: char) -> bool {
    matches!(
        ch,
        '\u{200D}'
            | '\u{20E3}'
            | '\u{FE0E}'..='\u{FE0F}'
            | '\u{1F3FB}'..='\u{1F3FF}'
            | '\u{E0020}'..='\u{E007F}'
    )
}
/// Returns `name` without its modifier suffix when `strip_modifiers` is set.
///
/// The suffix is everything from the first `": "` onwards, so
/// `"man: light skin tone"` becomes `"man"`. With `strip_modifiers == false`, or when
/// `name` contains no `": "`, `name` is returned unchanged.
#[inline]
pub(crate) fn strip_modifier_suffix(name: &str, strip_modifiers: bool) -> &str {
    if !strip_modifiers {
        return name;
    }
    match name.split_once(": ") {
        Some((base, _modifier)) => base,
        None => name,
    }
}
/// Appends `text` to `result`, first inserting a single space when `result` is
/// non-empty and does not already end in whitespace.
#[inline]
pub(crate) fn pad_emoji_replacement(result: &mut String, text: &str) {
    // An empty buffer has no last character and therefore needs no separator.
    let needs_gap = result
        .chars()
        .next_back()
        .is_some_and(|last| !last.is_whitespace());
    if needs_gap {
        result.push(' ');
    }
    result.push_str(text);
}
/// Returns a copy of `text` processed by [`demojize_rust_into`].
///
/// `strip_modifiers` is forwarded unchanged. This is the allocating convenience form;
/// call `demojize_rust_into` directly to reuse an existing output buffer.
pub fn demojize_rust(text: &str, strip_modifiers: bool) -> String {
    let mut rendered = String::new();
    demojize_rust_into(text, strip_modifiers, &mut rendered);
    rendered
}
/// Rewrites `text` into `result`, replacing recognised emoji with their table names.
///
/// `result` is cleared first. Pure-ASCII input is copied through unchanged. Otherwise
/// the text is scanned through a `CharWindow`:
///
/// * stray `VS16`, `VS15` and `ZWJ` characters are dropped;
/// * a match from `match_emoji_at` is replaced by its name (shortened by
///   `strip_modifier_suffix` when `strip_modifiers` is set) and appended with
///   `pad_emoji_replacement`; modifier characters right after it are dropped;
/// * an unmatched character for which `is_emoji_codepoint` holds is dropped together
///   with the modifier characters after it;
/// * any other character is copied, preceded by a space when it is alphanumeric and
///   directly follows a replaced emoji.
pub fn demojize_rust_into(text: &str, strip_modifiers: bool, result: &mut String) {
    result.clear();
    if text.is_ascii() {
        // Fast path: nothing to scan for.
        result.push_str(text);
        return;
    }
    result.reserve(text.len());

    let mut window = CharWindow::new(text.chars());
    let mut after_emoji = false;

    while let Some(ch) = window.current() {
        if [VS16, VS15, ZWJ].contains(&ch) {
            window.advance(1);
            continue;
        }

        let hit = match_emoji_at(window.as_slice());
        // `step` is how far to move past the emoji; `replaced` says whether a name was written.
        let (step, replaced) = match hit {
            Some((name, consumed)) => {
                pad_emoji_replacement(result, strip_modifier_suffix(name, strip_modifiers));
                (consumed, true)
            }
            None if is_emoji_codepoint(ch) => (1, false),
            None => {
                // Ordinary character: keep it, separated from a preceding emoji name.
                if after_emoji && ch.is_alphanumeric() {
                    result.push(' ');
                }
                result.push(ch);
                after_emoji = false;
                window.advance(1);
                continue;
            }
        };

        window.advance(step);
        // Swallow modifiers left dangling behind the emoji.
        while window.current().is_some_and(is_emoji_modifier) {
            window.advance(1);
        }
        after_emoji = replaced;
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Encodes `cps` with `encode_key_into` and returns the key as an owned string.
    fn encoded(cps: &[char]) -> String {
        let mut buf = [0u8; KEY_BUF_CAP];
        let written = encode_key_into(&mut buf, cps);
        std::str::from_utf8(&buf[..written]).unwrap().to_owned()
    }

    #[test]
    fn test_encode_key_single() {
        assert_eq!(encoded(&['\u{1F600}']), "1F600");
    }

    #[test]
    fn test_encode_key_multi() {
        assert_eq!(encoded(&['\u{1F468}', ZWJ, '\u{1F469}']), "1F468_200D_1F469");
    }

    /// Parses a `_`-joined hex key (e.g. `1F468_200D_1F469`) back into its characters.
    fn key_to_chars(key: &str) -> Vec<char> {
        let mut chars = Vec::new();
        for hex in key.split('_') {
            let code = u32::from_str_radix(hex, 16).unwrap();
            chars.push(char::from_u32(code).unwrap());
        }
        chars
    }

    /// Cross-checks `match_emoji_at` against the key-based reference matcher for every
    /// multi-emoji key: on its own, followed by an unrelated character, and followed by
    /// another emoji sequence.
    #[test]
    fn emoji_trie_matches_reference() {
        let keys: Vec<&str> = crate::tables::emoji_data::EMOJI_MULTI
            .keys()
            .copied()
            .collect();
        assert!(keys.len() > 2000, "expected the full multi-emoji table");

        // Suffix used for the "chained" variant; identical for every key.
        let first_key_chars = key_to_chars(keys[0]);

        for &key in &keys {
            let seq = key_to_chars(key);
            assert_eq!(
                match_emoji_at(&seq),
                match_emoji_at_reference(&seq),
                "trie/reference disagree on key {key}"
            );

            let padded: Vec<char> = seq.iter().copied().chain(std::iter::once('x')).collect();
            assert_eq!(
                match_emoji_at(&padded),
                match_emoji_at_reference(&padded),
                "trie/reference disagree on padded key {key}"
            );

            let chained: Vec<char> = seq
                .iter()
                .chain(first_key_chars.iter())
                .copied()
                .collect();
            assert_eq!(
                match_emoji_at(&chained),
                match_emoji_at_reference(&chained),
                "trie/reference disagree on chained key {key}"
            );
        }
    }

    #[test]
    fn test_is_emoji_codepoint() {
        for emoji in ['\u{1F600}', '\u{2600}'] {
            assert!(is_emoji_codepoint(emoji));
        }
        for other in ['A', '€'] {
            assert!(!is_emoji_codepoint(other));
        }
    }

    #[test]
    fn test_is_emoji_modifier() {
        assert!(is_emoji_modifier(ZWJ));
        assert!(is_emoji_modifier(VS16));
        assert!(is_emoji_modifier('\u{1F3FB}'));
        assert!(!is_emoji_modifier('A'));
    }

    #[test]
    fn test_match_single_emoji() {
        let chars: Vec<char> = "😀".chars().collect();
        assert_eq!(match_emoji_at(&chars), Some(("grinning face", 1)));
    }

    #[test]
    fn test_demojize_rust_basic() {
        assert_eq!(
            demojize_rust("Hello 😀 world", false),
            "Hello grinning face world"
        );
    }

    #[test]
    fn test_demojize_rust_no_emoji() {
        assert_eq!(demojize_rust("Hello world", false), "Hello world");
    }

    #[test]
    fn test_demojize_rust_multiple() {
        assert_eq!(
            demojize_rust("😀😂", false),
            "grinning face face with tears of joy"
        );
    }

    #[test]
    fn test_demojize_rust_empty() {
        let output = demojize_rust("", false);
        assert_eq!(output, "");
    }
}