use std::collections::VecDeque;
use std::iter::Peekable;
use std::ops::Range;
use std::str::CharIndices;
use crate::unicode;
#[cfg(test)]
mod tests;
/// Presentation style requested by a presentation selector character.
///
/// Each variant corresponds to one selector constant in `crate::unicode`;
/// see `Presentation::as_selector` and `Presentation::from_selector` for the
/// mapping in each direction.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Presentation {
/// Corresponds to `unicode::TEXT_PRESENTATION_SELECTOR`.
Text,
/// Corresponds to `unicode::EMOJI_PRESENTATION_SELECTOR`.
Emoji,
}
impl Presentation {
    /// Returns the selector character that requests this presentation.
    ///
    /// This is the inverse of [`Presentation::from_selector`].
    #[must_use]
    pub(crate) fn as_selector(self) -> char {
        match self {
            Self::Emoji => unicode::EMOJI_PRESENTATION_SELECTOR,
            Self::Text => unicode::TEXT_PRESENTATION_SELECTOR,
        }
    }

    /// Classifies `ch` as a presentation selector.
    ///
    /// Returns `Some(Presentation::Text)` or `Some(Presentation::Emoji)` when
    /// `ch` equals the corresponding selector constant in `crate::unicode`,
    /// and `None` for every other character.
    #[must_use]
    pub const fn from_selector(ch: char) -> Option<Self> {
        // The text selector is tested first, mirroring the lookup order of
        // the two selector constants.
        if ch == unicode::TEXT_PRESENTATION_SELECTOR {
            Some(Self::Text)
        } else if ch == unicode::EMOJI_PRESENTATION_SELECTOR {
            Some(Self::Emoji)
        } else {
            None
        }
    }
}
fn is_presentation_selector(ch: char) -> bool {
Presentation::from_selector(ch).is_some()
}
/// A run of consecutive tag characters plus the presentation selectors that
/// directly follow it.
///
/// The scanner builds one by consuming every character accepted by
/// `unicode::is_tag` and then any presentation selectors that come next.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct EmojiTagRun {
/// The tag characters, in input order.
pub tag: Vec<char>,
/// Presentation selectors found immediately after the tag characters.
pub presentation_selectors_after_tag: Vec<Presentation>,
}
/// A modification that follows an emoji stem, together with any presentation
/// selectors that trail it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum EmojiModification {
/// A character accepted by `unicode::is_emoji_modifier`.
EmojiModifier {
/// The modifier character itself.
modifier: char,
/// Presentation selectors found immediately after the modifier.
presentation_selectors_after_modifier: Vec<Presentation>,
},
/// The `unicode::COMBINING_ENCLOSING_KEYCAP` character.
EnclosingKeycap {
/// Presentation selectors found immediately after the keycap character.
presentation_selectors_after_keycap: Vec<Presentation>,
},
/// One or more back-to-back tag runs; each run carries its own trailing
/// presentation selectors.
TagModifier(Vec<EmojiTagRun>),
}
/// The leading part of an emoji-like unit, before any modifications.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum EmojiStem {
/// A single base character: either a character accepted by
/// `unicode::is_emoji`, or a regional indicator that was not followed by a
/// second regional indicator.
SingletonBase {
/// The base character.
base: char,
/// Presentation selectors found immediately after the base.
presentation_selectors_after_base: Vec<Presentation>,
},
/// Two regional indicators (characters accepted by `unicode::is_ri`),
/// optionally separated and followed by presentation selectors.
Flag {
/// The first regional indicator.
first_ri: char,
/// Presentation selectors found between the two regional indicators.
presentation_selectors_after_first_ri: Vec<Presentation>,
/// The second regional indicator.
second_ri: char,
/// Presentation selectors found immediately after the second regional
/// indicator.
presentation_selectors_after_second_ri: Vec<Presentation>,
},
}
/// One emoji-like unit: a stem followed by zero or more modifications.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct EmojiLike {
/// The base character or flag pair that starts the unit.
pub stem: EmojiStem,
/// Modifications consumed after the stem, in input order.
pub modifiers: Vec<EmojiModification>,
}
/// A single `unicode::ZWJ` character plus the presentation selectors that
/// directly follow it.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct ZwjLink {
/// Presentation selectors found immediately after the ZWJ.
pub presentation_selectors_after_link: Vec<Presentation>,
}
/// An emoji-like unit attached to the preceding part of a sequence through
/// exactly one ZWJ link.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct ZwjJoinedEmoji {
/// The link that precedes `emoji`.
pub link: ZwjLink,
/// The emoji-like unit that follows the link.
pub emoji: EmojiLike,
}
/// A complete sequence of emoji-like units and ZWJ links as recognised by the
/// scanner.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum EmojiSequence {
/// One or more ZWJ links that were not preceded by an emoji-like unit.
LinksOnly(Vec<ZwjLink>),
/// A sequence that starts with an emoji-like unit.
EmojiHeaded {
/// The unit that opens the sequence.
first: EmojiLike,
/// Further units, each attached through exactly one ZWJ link.
joined: Vec<ZwjJoinedEmoji>,
/// Links after the last unit that did not end up joining another unit.
trailing_links: Vec<ZwjLink>,
},
}
/// One item produced by the scanner: a contiguous slice of the input together
/// with its classification.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub struct ScanItem<'a> {
/// The slice of the scanned input covered by this item.
pub raw: &'a str,
/// Byte range of `raw` within the scanned input.
pub span: Range<usize>,
/// What the scanner recognised in this slice.
pub kind: ScanKind,
}
/// Classification of a `ScanItem`.
#[derive(Debug, Clone, PartialEq, Eq)]
#[non_exhaustive]
pub enum ScanKind {
/// A stretch of input containing nothing the scanner treats as emoji
/// structure.
Passthrough,
/// A run of presentation selectors met at the top level of the scan loop,
/// i.e. not consumed as the trailing selectors of an emoji element or ZWJ
/// link.
UnsanctionedPresentationSelectors(Vec<Presentation>),
/// A parsed emoji sequence.
EmojiSequence(EmojiSequence),
}
/// Accumulator for the emoji sequence the scanner is currently assembling.
///
/// Mirrors the shape of `EmojiSequence`, plus an `end` byte offset recorded
/// each time an element is added.
#[derive(Debug, Default)]
enum EmojiSequenceInProgress {
/// Nothing has been accumulated.
#[default]
Empty,
/// Only ZWJ links have been accumulated so far.
LinksOnly {
/// Links collected so far, in input order.
links: Vec<ZwjLink>,
/// Offset supplied with the most recently pushed link.
end: usize,
},
/// A sequence that starts with an emoji-like unit.
EmojiHeaded {
/// The unit that opened the sequence.
first: EmojiLike,
/// Units attached so far, each through exactly one link.
joined: Vec<ZwjJoinedEmoji>,
/// Links pushed since the last unit that have not yet joined anything.
trailing_links: Vec<ZwjLink>,
/// Offset supplied with the most recently added element.
end: usize,
},
}
impl EmojiSequenceInProgress {
    /// Returns `true` when nothing has been accumulated.
    fn is_empty(&self) -> bool {
        match self {
            Self::Empty => true,
            Self::LinksOnly { .. } | Self::EmojiHeaded { .. } => false,
        }
    }

    /// Appends a ZWJ link and records `end` as the new end of the sequence.
    ///
    /// An empty accumulator becomes `LinksOnly`; otherwise the link is added
    /// to the list of links that are still waiting for an emoji.
    fn push_link(&mut self, link: ZwjLink, end: usize) {
        match self {
            Self::LinksOnly {
                links: pending,
                end: recorded_end,
            }
            | Self::EmojiHeaded {
                trailing_links: pending,
                end: recorded_end,
                ..
            } => {
                pending.push(link);
                *recorded_end = end;
            }
            Self::Empty => {
                *self = Self::LinksOnly {
                    links: vec![link],
                    end,
                };
            }
        }
    }

    /// Tries to attach `emoji` to the sequence through a pending ZWJ link.
    ///
    /// Succeeds only for an emoji-headed sequence with exactly one trailing
    /// link; that link is consumed and `end` becomes the new sequence end.
    ///
    /// # Errors
    ///
    /// Hands `emoji` back unchanged when the accumulator is not emoji-headed
    /// or does not have exactly one trailing link.
    fn try_push_awaited_emoji(&mut self, emoji: EmojiLike, end: usize) -> Result<(), EmojiLike> {
        match self {
            Self::EmojiHeaded {
                joined,
                trailing_links,
                end: recorded_end,
                ..
            } if trailing_links.len() == 1 => match trailing_links.pop() {
                Some(link) => {
                    joined.push(ZwjJoinedEmoji { link, emoji });
                    *recorded_end = end;
                    Ok(())
                }
                None => Err(emoji),
            },
            _ => Err(emoji),
        }
    }

    /// Starts a new emoji-headed sequence consisting only of `emoji`.
    fn from_emoji(emoji: EmojiLike, end: usize) -> Self {
        Self::EmojiHeaded {
            first: emoji,
            joined: Vec::new(),
            trailing_links: Vec::new(),
            end,
        }
    }

    /// Resets the accumulator to `Empty` and returns what it held.
    ///
    /// Returns `None` when it was already empty, otherwise the finished
    /// sequence paired with its recorded end offset.
    fn take_sequence(&mut self) -> Option<(EmojiSequence, usize)> {
        let finished = match std::mem::replace(self, Self::Empty) {
            Self::Empty => return None,
            Self::LinksOnly { links, end } => (EmojiSequence::LinksOnly(links), end),
            Self::EmojiHeaded {
                first,
                joined,
                trailing_links,
                end,
            } => (
                EmojiSequence::EmojiHeaded {
                    first,
                    joined,
                    trailing_links,
                },
                end,
            ),
        };
        Some(finished)
    }
}
/// Iterator state for scanning a string into `ScanItem`s; created by `scan`.
#[derive(Debug)]
pub struct Scanner<'a> {
// The full text being scanned; emitted items borrow slices of it.
input: &'a str,
// Items that have been produced but not yet yielded by the iterator.
ready: VecDeque<ScanItem<'a>>,
// Byte offset where the next emitted item starts (end of the last one).
ready_end: usize,
// Read position over `input`, with one character of lookahead.
cursor: Peekable<CharIndices<'a>>,
// Emoji sequence currently being assembled, if any.
sequence_in_progress: EmojiSequenceInProgress,
}
/// Creates a [`Scanner`] over `input`.
///
/// No scanning is done here; the returned scanner starts at byte offset 0
/// with nothing queued and no sequence in progress, and does its work as it
/// is iterated.
#[must_use]
pub fn scan(input: &str) -> Scanner<'_> {
    let cursor = input.char_indices().peekable();
    Scanner {
        input,
        ready: VecDeque::default(),
        ready_end: 0,
        cursor,
        sequence_in_progress: EmojiSequenceInProgress::default(),
    }
}
impl Scanner<'_> {
    /// Returns the byte offset of the next unread character, or the input
    /// length once the cursor is exhausted.
    fn offset(&mut self) -> usize {
        self.cursor.peek().map_or(self.input.len(), |&(i, _)| i)
    }

    /// Returns `true` when no characters remain to be read.
    fn at_eof(&mut self) -> bool {
        self.peek().is_none()
    }

    /// Returns the next character without consuming it.
    fn peek(&mut self) -> Option<char> {
        self.cursor.peek().map(|&(_, c)| c)
    }

    /// Consumes and returns the next character if `f` accepts it.
    fn next_if(&mut self, f: impl FnOnce(char) -> bool) -> Option<char> {
        self.cursor.next_if(|(_, c)| f(*c)).map(|(_, c)| c)
    }

    /// Consumes and returns the next character if it equals `ch`.
    fn next_if_eq(&mut self, ch: char) -> Option<char> {
        self.cursor.next_if(|(_, c)| *c == ch).map(|_| ch)
    }

    /// Consumes the next character if `f` maps it to `Some`, returning the
    /// mapped value; leaves the cursor untouched otherwise.
    fn next_if_map<R>(&mut self, f: impl FnOnce(char) -> Option<R>) -> Option<R> {
        let ch = self.peek()?;
        let mapped = f(ch)?;
        self.cursor.next();
        Some(mapped)
    }

    /// Consumes characters for as long as `f` accepts them.
    fn skip_while(&mut self, mut f: impl FnMut(char) -> bool) {
        while self.next_if(&mut f).is_some() {}
    }

    /// Consumes characters for as long as `f` accepts them and returns them.
    fn consume_while(&mut self, mut f: impl FnMut(char) -> bool) -> Vec<char> {
        std::iter::from_fn(|| self.next_if(&mut f)).collect()
    }

    /// Consumes characters for as long as `f` maps them to `Some` and returns
    /// the mapped values.
    fn consume_while_map<R>(&mut self, mut f: impl FnMut(char) -> Option<R>) -> Vec<R> {
        std::iter::from_fn(|| self.next_if_map(&mut f)).collect()
    }

    /// Consumes a (possibly empty) run of presentation selectors.
    fn consume_presentation_selectors(&mut self) -> Vec<Presentation> {
        self.consume_while_map(Presentation::from_selector)
    }

    /// Returns `true` when `ch` can start something other than passthrough
    /// text in [`Self::prepare_next_item`].
    ///
    /// This must cover every character that `prepare_next_item` dispatches
    /// on; otherwise such a character following plain text would be swallowed
    /// into the passthrough item. Regional indicators are therefore listed
    /// explicitly instead of relying on `unicode::is_emoji` to include them.
    fn is_structural_start(ch: char) -> bool {
        is_presentation_selector(ch)
            || ch == unicode::ZWJ
            || unicode::is_ri(ch)
            || unicode::is_emoji(ch)
    }

    /// Queues an item of the given `kind` covering `ready_end..end` and
    /// advances `ready_end` to `end`.
    fn emit_item(&mut self, kind: ScanKind, end: usize) {
        debug_assert!(
            end > self.ready_end,
            "scanner must not emit zero-width items"
        );
        self.ready.push_back(ScanItem {
            // Both bounds originate from `char_indices` offsets (or the input
            // length), so they always fall on character boundaries.
            #[allow(clippy::string_slice)]
            raw: &self.input[self.ready_end..end],
            span: self.ready_end..end,
            kind,
        });
        self.ready_end = end;
    }

    /// Queues a passthrough item ending at `end`.
    fn emit_passthrough(&mut self, end: usize) {
        self.emit_item(ScanKind::Passthrough, end);
    }

    /// Queues an item for presentation selectors that are not attached to
    /// any emoji element or ZWJ link.
    fn emit_unsanctioned_selectors(&mut self, selectors: Vec<Presentation>, end: usize) {
        self.emit_item(ScanKind::UnsanctionedPresentationSelectors(selectors), end);
    }

    /// Queues an emoji-sequence item ending at `end`.
    fn emit_sequence(&mut self, sequence: EmojiSequence, end: usize) {
        self.emit_item(ScanKind::EmojiSequence(sequence), end);
    }

    /// Queues the sequence currently being assembled, if there is one, and
    /// resets the accumulator.
    fn emit_sequence_in_progress(&mut self) {
        if let Some((sequence, end)) = self.sequence_in_progress.take_sequence() {
            self.emit_sequence(sequence, end);
        }
    }

    /// Adds an already-consumed ZWJ link to the sequence in progress,
    /// recording the current offset as the sequence end.
    fn fold_zwj_link(&mut self, presentation_selectors_after_link: Vec<Presentation>) {
        let end = self.offset();
        self.sequence_in_progress.push_link(
            ZwjLink {
                presentation_selectors_after_link,
            },
            end,
        );
    }

    /// Adds an already-consumed emoji-like unit to the sequence in progress.
    ///
    /// If the unit cannot be joined (the sequence is not waiting on exactly
    /// one ZWJ link), the sequence in progress is emitted and a new one is
    /// started with this unit.
    fn fold_emoji_like(&mut self, stem: EmojiStem, modifiers: Vec<EmojiModification>) {
        let emoji = EmojiLike { stem, modifiers };
        let end = self.offset();
        if let Err(emoji) = self.sequence_in_progress.try_push_awaited_emoji(emoji, end) {
            self.emit_sequence_in_progress();
            self.sequence_in_progress = EmojiSequenceInProgress::from_emoji(emoji, end);
        }
    }

    /// Consumes one run of tag characters and its trailing presentation
    /// selectors; returns `None` when the next character is not a tag.
    fn consume_emoji_tag_run(&mut self) -> Option<EmojiTagRun> {
        self.peek().is_some_and(unicode::is_tag).then(|| {
            let tag = self.consume_while(unicode::is_tag);
            let presentation_selectors_after_tag = self.consume_presentation_selectors();
            EmojiTagRun {
                tag,
                presentation_selectors_after_tag,
            }
        })
    }

    /// Consumes a ZWJ and the presentation selectors after it, returning
    /// those selectors; returns `None` when the next character is not a ZWJ.
    fn consume_zwj_link(&mut self) -> Option<Vec<Presentation>> {
        self.next_if_eq(unicode::ZWJ)
            .map(|_| self.consume_presentation_selectors())
    }

    /// Consumes a single modification (emoji modifier, enclosing keycap, or
    /// a series of tag runs); returns `None` when none is present.
    fn consume_emoji_modification(&mut self) -> Option<EmojiModification> {
        if let Some(modifier) = self.next_if(unicode::is_emoji_modifier) {
            Some(EmojiModification::EmojiModifier {
                modifier,
                presentation_selectors_after_modifier: self.consume_presentation_selectors(),
            })
        } else if self
            .next_if_eq(unicode::COMBINING_ENCLOSING_KEYCAP)
            .is_some()
        {
            Some(EmojiModification::EnclosingKeycap {
                presentation_selectors_after_keycap: self.consume_presentation_selectors(),
            })
        } else if self.peek().is_some_and(unicode::is_tag) {
            // Tag runs separated only by presentation selectors are grouped
            // into a single modification.
            Some(EmojiModification::TagModifier(
                std::iter::from_fn(|| self.consume_emoji_tag_run()).collect(),
            ))
        } else {
            None
        }
    }

    /// Consumes every modification that directly follows the current
    /// position.
    fn consume_emoji_modifications(&mut self) -> Vec<EmojiModification> {
        std::iter::from_fn(|| self.consume_emoji_modification()).collect()
    }

    /// Builds the stem for an already-consumed regional indicator.
    ///
    /// Produces a `Flag` when a second regional indicator follows (with
    /// optional presentation selectors in between), otherwise a
    /// `SingletonBase` holding `first_ri`.
    fn consume_regional_indicator_stem(&mut self, first_ri: char) -> EmojiStem {
        let presentation_selectors_after_first_ri = self.consume_presentation_selectors();
        let Some(second_ri) = self.next_if(unicode::is_ri) else {
            return EmojiStem::SingletonBase {
                base: first_ri,
                presentation_selectors_after_base: presentation_selectors_after_first_ri,
            };
        };
        let presentation_selectors_after_second_ri = self.consume_presentation_selectors();
        EmojiStem::Flag {
            first_ri,
            presentation_selectors_after_first_ri,
            second_ri,
            presentation_selectors_after_second_ri,
        }
    }

    /// Scans forward until at least one item is queued in `ready`.
    ///
    /// Returns `true` when an item is available, or `false` when the input
    /// is exhausted and nothing is left to emit.
    fn prepare_next_item(&mut self) -> bool {
        while self.ready.is_empty() {
            if let Some(selectors) = self.consume_zwj_link() {
                self.fold_zwj_link(selectors);
                continue;
            }
            if self.peek().is_some_and(is_presentation_selector) {
                // Every emoji element and link consumes its own trailing
                // selectors, so selectors seen here cannot directly follow a
                // sequence that is still being assembled.
                debug_assert!(self.sequence_in_progress.is_empty());
                let selectors = self.consume_presentation_selectors();
                let end = self.offset();
                self.emit_unsanctioned_selectors(selectors, end);
                continue;
            }
            // Regional indicators are tried before the general emoji case so
            // that two of them can pair up into a flag.
            if let Some(first_ri) = self.next_if(unicode::is_ri) {
                let stem = self.consume_regional_indicator_stem(first_ri);
                let modifiers = self.consume_emoji_modifications();
                self.fold_emoji_like(stem, modifiers);
                continue;
            }
            if let Some(base) = self.next_if(unicode::is_emoji) {
                let presentation_selectors_after_base = self.consume_presentation_selectors();
                let stem = EmojiStem::SingletonBase {
                    base,
                    presentation_selectors_after_base,
                };
                let modifiers = self.consume_emoji_modifications();
                self.fold_emoji_like(stem, modifiers);
                continue;
            }
            // Nothing structural follows: flush any pending sequence before
            // dealing with plain text or end of input.
            self.emit_sequence_in_progress();
            if !self.ready.is_empty() {
                continue;
            }
            if self.at_eof() {
                return false;
            }
            self.skip_while(|ch| !Self::is_structural_start(ch));
            let end = self.offset();
            self.emit_passthrough(end);
        }
        true
    }
}
impl<'a> Iterator for Scanner<'a> {
    type Item = ScanItem<'a>;

    /// Yields the next scan item, scanning further into the input only when
    /// no item is already queued. Returns `None` once nothing is left.
    fn next(&mut self) -> Option<ScanItem<'a>> {
        // Whether or not scanning produced anything, the front of the queue
        // is the answer: it stays empty exactly when the input is exhausted.
        if self.ready.is_empty() {
            self.prepare_next_item();
        }
        self.ready.pop_front()
    }
}