use unicode_segmentation::UnicodeSegmentation;
use crate::model::{Inline, RunElement, TextRun};
use crate::render::emoji::cluster::{self, EmojiPresentation, EmojiStructure, InlineCluster};
/// One entry of the sequence produced by `build_inline_units`: either a
/// stretch of joined text or a single inline passed through untouched.
#[derive(Debug)]
pub(super) enum InlineUnit<'a> {
/// Text gathered from one or more consecutive text-only runs.
TextSegment(JoinedTextSegment<'a>),
/// Any inline that is not a text-only run, borrowed from the input slice.
Discrete(&'a Inline),
}
/// Text of consecutive text-only runs joined into one string, together with
/// the run that contributed each character.
#[derive(Debug)]
pub(super) struct JoinedTextSegment<'a> {
// Concatenated text of the joined runs.
text: String,
// Owning run for each `char` of `text`; `SegmentBuilder::append` pushes
// exactly one entry per character it keeps, so the two stay index-aligned.
char_runs: Vec<&'a TextRun>,
}
impl<'a> JoinedTextSegment<'a> {
    /// Returns the joined text of this segment.
    // NOTE(review): within this file only the unit tests call this accessor,
    // which is presumably why the lint is silenced — confirm.
    #[allow(dead_code)]
    pub(super) fn text(&self) -> &str {
        self.text.as_str()
    }

    /// Returns the owning run of every `char` in the joined text, in order.
    pub(super) fn char_runs(&self) -> &[&'a TextRun] {
        self.char_runs.as_slice()
    }

    /// Consumes the segment and splits it into text and emoji pieces.
    ///
    /// The joined text is handed to `cluster::classify`. Text spans are walked
    /// grapheme by grapheme; each grapheme is attributed to the run of its
    /// first `char`, and consecutive graphemes attributed to the same run
    /// (compared by pointer identity) are merged into one
    /// [`SegmentPiece::Text`]. Every emoji cluster becomes one
    /// [`SegmentPiece::Emoji`] attributed to the run of its first `char`.
    ///
    /// # Panics
    ///
    /// Indexes `char_runs` by the running `char` offset, so it panics if that
    /// offset leaves the slice. This assumes `cluster::classify` yields
    /// clusters that cover the text in order — TODO confirm.
    pub(super) fn classify(self) -> Vec<SegmentPiece<'a>> {
        /// Turns a finished text buffer into its output piece.
        fn into_text_piece(done: TextBuffer<'_>) -> SegmentPiece<'_> {
            SegmentPiece::Text {
                run: done.run,
                text: done.text,
            }
        }

        let mut pieces = Vec::new();
        // Text piece currently being accumulated, if any.
        let mut pending: Option<TextBuffer<'a>> = None;
        // Offset, in `char`s, of the next unprocessed character.
        let mut cursor = 0usize;

        for inline_cluster in cluster::classify(&self.text) {
            match inline_cluster {
                InlineCluster::Text(span) => {
                    for grapheme in span.graphemes(true) {
                        // The first char of the grapheme decides its run.
                        let owner = self.char_runs[cursor];
                        match pending.as_mut() {
                            Some(open) if std::ptr::eq(open.run, owner) => {
                                open.text.push_str(grapheme);
                            }
                            _ => {
                                // No open buffer, or the run changed: start a
                                // new buffer and emit the previous one if any.
                                let fresh = TextBuffer {
                                    run: owner,
                                    text: grapheme.to_string(),
                                };
                                pieces.extend(pending.replace(fresh).map(into_text_piece));
                            }
                        }
                        cursor += grapheme.chars().count();
                    }
                }
                InlineCluster::Emoji(ec) => {
                    // An emoji always ends the text piece in progress.
                    pieces.extend(pending.take().map(into_text_piece));
                    let base_run = self.char_runs[cursor];
                    let width = ec.text.chars().count();
                    pieces.push(SegmentPiece::Emoji {
                        base_run,
                        text: ec.text.to_string(),
                        presentation: ec.presentation,
                        structure: ec.structure,
                    });
                    cursor += width;
                }
            }
        }

        pieces.extend(pending.map(into_text_piece));
        pieces
    }
}
// Accumulator used by `JoinedTextSegment::classify` for the text piece that
// is currently being built.
struct TextBuffer<'a> {
// Run the buffered text is attributed to.
run: &'a TextRun,
// Graphemes collected so far for that run.
text: String,
}
/// One piece produced by `JoinedTextSegment::classify`.
#[derive(Debug)]
pub(super) enum SegmentPiece<'a> {
/// Plain text attributed to a single run.
Text {
// Run the text is attributed to.
run: &'a TextRun,
// The text of this piece.
text: String,
},
/// One emoji cluster, possibly assembled from characters of several runs.
Emoji {
// Run that owns the first `char` of the cluster.
base_run: &'a TextRun,
// Full text of the cluster.
text: String,
// Presentation reported by the cluster classifier.
presentation: EmojiPresentation,
// Structure reported by the cluster classifier.
structure: EmojiStructure,
},
}
/// Groups `inlines` into units, joining consecutive text-only runs.
///
/// Text of adjacent runs that contain only `RunElement::Text` elements is
/// accumulated into a single [`InlineUnit::TextSegment`]. Every other inline
/// first closes the segment in progress (if it holds any text) and is then
/// emitted as [`InlineUnit::Discrete`]. A text-only run whose text ends up
/// empty contributes nothing and does not break the join.
pub(super) fn build_inline_units(inlines: &[Inline]) -> Vec<InlineUnit<'_>> {
    let mut units = Vec::new();
    let mut pending = SegmentBuilder::default();

    for inline in inlines {
        if let Inline::TextRun(run) = inline {
            if is_text_only_run(run) {
                for element in &run.content {
                    if let RunElement::Text(fragment) = element {
                        pending.append(fragment, run);
                    }
                }
                continue;
            }
        }

        // Anything else ends the current text segment and passes through.
        units.extend(pending.take().map(InlineUnit::TextSegment));
        units.push(InlineUnit::Discrete(inline));
    }

    // Emit whatever text is still buffered after the last inline.
    units.extend(pending.take().map(InlineUnit::TextSegment));
    units
}
/// Returns `true` when the run has at least one content element and every
/// element is a `RunElement::Text`.
fn is_text_only_run(tr: &TextRun) -> bool {
    // A run without any content is never treated as text-only.
    if tr.content.is_empty() {
        return false;
    }
    let has_non_text = tr
        .content
        .iter()
        .any(|element| !matches!(element, RunElement::Text(_)));
    !has_non_text
}
// Accumulates the text of consecutive text-only runs until `take` turns it
// into a `JoinedTextSegment`.
#[derive(Default)]
struct SegmentBuilder<'a> {
// Text collected so far.
text: String,
// Owning run of each `char` in `text`, pushed in lockstep by `append`.
char_runs: Vec<&'a TextRun>,
}
impl<'a> SegmentBuilder<'a> {
    /// Appends the characters of `raw`, recording `run` as the owner of each.
    ///
    /// Control characters are dropped, except for the tab character, which is
    /// kept.
    fn append(&mut self, raw: &str, run: &'a TextRun) {
        let kept = raw.chars().filter(|ch| *ch == '\t' || !ch.is_control());
        for ch in kept {
            // One `char_runs` entry per pushed char keeps both in lockstep.
            self.text.push(ch);
            self.char_runs.push(run);
        }
    }

    /// Hands out the accumulated segment and resets the builder.
    ///
    /// Returns `None`, leaving the builder untouched, when no text has been
    /// collected.
    fn take(&mut self) -> Option<JoinedTextSegment<'a>> {
        let has_text = !self.text.is_empty();
        has_text.then(|| JoinedTextSegment {
            text: std::mem::take(&mut self.text),
            char_runs: std::mem::take(&mut self.char_runs),
        })
    }
}
// Unit tests. The `a*` tests cover `build_inline_units` (joining runs into
// segments); the `b*` tests cover `JoinedTextSegment::classify` (splitting a
// segment into text and emoji pieces).
#[cfg(test)]
mod tests {
use super::*;
use crate::model::{RevisionIds, RunProperties, Symbol};
// Builds a text run inline holding a single `RunElement::Text` with default
// properties.
fn text_run(text: &str) -> Inline {
Inline::TextRun(Box::new(TextRun {
style_id: None,
properties: RunProperties::default(),
content: vec![RunElement::Text(text.into())],
rsids: RevisionIds::default(),
}))
}
// Builds a text run inline with exactly the given content elements.
fn run_with_elements(elements: Vec<RunElement>) -> Inline {
Inline::TextRun(Box::new(TextRun {
style_id: None,
properties: RunProperties::default(),
content: elements,
rsids: RevisionIds::default(),
}))
}
// Builds a symbol inline, used as a representative non-text inline.
fn discrete_inline() -> Inline {
Inline::Symbol(Symbol {
font: "Wingdings".into(),
char_code: 0xF0FE,
})
}
// Returns the `TextRun` inside `inline`, panicking on any other variant.
fn extract_run(inline: &Inline) -> &TextRun {
match inline {
Inline::TextRun(tr) => tr,
_ => panic!("expected TextRun"),
}
}
// Returns the segment inside `unit`, panicking if the unit is discrete.
fn segment_of<'a>(unit: &'a InlineUnit<'a>) -> &'a JoinedTextSegment<'a> {
match unit {
InlineUnit::TextSegment(s) => s,
other => panic!("expected TextSegment, got {other:?}"),
}
}
// A single run yields one segment whose every char maps back to that run.
#[test]
fn a1_single_text_run() {
let inlines = vec![text_run("hello")];
let r1 = extract_run(&inlines[0]);
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 1);
let seg = segment_of(&units[0]);
assert_eq!(seg.text(), "hello");
assert_eq!(seg.char_runs().len(), 5);
for &cr in seg.char_runs() {
assert!(std::ptr::eq(cr, r1));
}
}
// Two adjacent runs are joined into one segment; each char keeps its own run.
#[test]
fn a2_two_runs_join() {
let inlines = vec![text_run("ab"), text_run("cd")];
let r1 = extract_run(&inlines[0]);
let r2 = extract_run(&inlines[1]);
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 1);
let seg = segment_of(&units[0]);
assert_eq!(seg.text(), "abcd");
assert_eq!(seg.char_runs().len(), 4);
assert!(std::ptr::eq(seg.char_runs()[0], r1));
assert!(std::ptr::eq(seg.char_runs()[1], r1));
assert!(std::ptr::eq(seg.char_runs()[2], r2));
assert!(std::ptr::eq(seg.char_runs()[3], r2));
}
// A non-text inline between two runs splits them into separate segments.
#[test]
fn a3_non_text_inline_breaks_join() {
let inlines = vec![text_run("ab"), discrete_inline(), text_run("cd")];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 3);
assert_eq!(segment_of(&units[0]).text(), "ab");
assert!(matches!(units[1], InlineUnit::Discrete(Inline::Symbol(_))));
assert_eq!(segment_of(&units[2]).text(), "cd");
}
// A run that contains a `RunElement::Tab` is not text-only, so it is emitted
// as a discrete unit and breaks the join.
#[test]
fn a4_text_run_with_tab_is_discrete() {
let r1 = run_with_elements(vec![
RunElement::Text("a".into()),
RunElement::Tab,
RunElement::Text("b".into()),
]);
let inlines = vec![text_run("hello"), r1, text_run("world")];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 3);
assert_eq!(segment_of(&units[0]).text(), "hello");
assert!(matches!(units[1], InlineUnit::Discrete(Inline::TextRun(_))));
assert_eq!(segment_of(&units[2]).text(), "world");
}
// A run whose only text element is the empty string produces no unit at all.
#[test]
fn a5_empty_text_run_is_skipped() {
let inlines = vec![text_run("")];
let units = build_inline_units(&inlines);
assert!(units.is_empty(), "no segment from empty text");
}
// A run with no content elements is not text-only and is emitted as discrete.
#[test]
fn a5b_no_content_text_run_is_discrete() {
let inlines = vec![run_with_elements(vec![])];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 1);
assert!(matches!(units[0], InlineUnit::Discrete(_)));
}
// Control characters (here U+0001 and U+0002) are removed from the text and
// get no `char_runs` entry.
#[test]
fn a6_control_chars_stripped() {
let inlines = vec![text_run("\u{0001}hi\u{0002}")];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 1);
let seg = segment_of(&units[0]);
assert_eq!(seg.text(), "hi");
assert_eq!(seg.char_runs().len(), 2);
}
// A tab character inside a text element is kept even though it is a control
// character.
#[test]
fn a6b_tab_preserved_in_text() {
let inlines = vec![text_run("a\tb")];
let units = build_inline_units(&inlines);
let seg = segment_of(&units[0]);
assert_eq!(seg.text(), "a\tb");
}
// Invariant: the number of chars in the text equals the `char_runs` length.
#[test]
fn a7_three_runs_invariant() {
let inlines = vec![text_run("ab"), text_run("c"), text_run("def")];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 1);
let seg = segment_of(&units[0]);
assert_eq!(seg.text().chars().count(), seg.char_runs().len());
assert_eq!(seg.text(), "abcdef");
}
// A hyperlink is emitted as discrete and splits the surrounding text, even
// though its own content is a text run.
#[test]
fn a8_hyperlink_breaks_join() {
use crate::model::{Hyperlink, HyperlinkTarget, RelId};
let inlines = vec![
text_run("before "),
Inline::Hyperlink(Hyperlink {
target: HyperlinkTarget::External(RelId::new("rId1")),
content: vec![text_run("link")],
}),
text_run(" after"),
];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 3);
assert_eq!(segment_of(&units[0]).text(), "before ");
assert!(matches!(
units[1],
InlineUnit::Discrete(Inline::Hyperlink(_))
));
assert_eq!(segment_of(&units[2]).text(), " after");
}
// A field character is emitted as discrete and splits the surrounding text.
#[test]
fn a9_field_char_breaks_join() {
use crate::model::{FieldChar, FieldCharType};
let inlines = vec![
text_run("page "),
Inline::FieldChar(FieldChar {
field_char_type: FieldCharType::Begin,
dirty: None,
fld_lock: None,
}),
text_run("instr"),
];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 3);
assert_eq!(segment_of(&units[0]).text(), "page ");
assert!(matches!(
units[1],
InlineUnit::Discrete(Inline::FieldChar(_))
));
assert_eq!(segment_of(&units[2]).text(), "instr");
}
// Plain text from one run classifies to a single text piece for that run.
#[test]
fn b1_single_run_text() {
let inlines = vec![text_run("hello")];
let r1 = extract_run(&inlines[0]);
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
match &pieces[0] {
SegmentPiece::Text { run, text } => {
assert!(std::ptr::eq(*run, r1));
assert_eq!(text, "hello");
}
other => panic!("expected Text, got {other:?}"),
}
}
// Plain text joined from two runs is split back into one text piece per run.
#[test]
fn b2_two_runs_split_text() {
let inlines = vec![text_run("ab"), text_run("cd")];
let r1 = extract_run(&inlines[0]);
let r2 = extract_run(&inlines[1]);
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 2);
match &pieces[0] {
SegmentPiece::Text { run, text } => {
assert!(std::ptr::eq(*run, r1));
assert_eq!(text, "ab");
}
_ => panic!(),
}
match &pieces[1] {
SegmentPiece::Text { run, text } => {
assert!(std::ptr::eq(*run, r2));
assert_eq!(text, "cd");
}
_ => panic!(),
}
}
// "1", U+FE0F and U+20E3 spread over three runs are reassembled into one
// keycap emoji piece attributed to the first run.
#[test]
fn b3_cross_run_keycap_reassembles() {
let inlines = vec![text_run("1"), text_run("\u{FE0F}"), text_run("\u{20E3}")];
let r1 = extract_run(&inlines[0]);
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
match &pieces[0] {
SegmentPiece::Emoji {
base_run,
text,
presentation,
structure,
} => {
assert!(std::ptr::eq(*base_run, r1));
assert_eq!(text, "1\u{FE0F}\u{20E3}");
assert_eq!(*presentation, EmojiPresentation::Emoji);
assert!(matches!(
structure,
EmojiStructure::KeycapSequence { base: '1' }
));
}
other => panic!("expected Emoji KeycapSequence, got {other:?}"),
}
}
// Three emoji joined by U+200D, with every code point in its own run, become
// a single ZWJ-sequence emoji piece.
#[test]
fn b4_cross_run_zwj_family() {
let inlines = vec![
text_run("\u{1F468}"),
text_run("\u{200D}"),
text_run("\u{1F469}"),
text_run("\u{200D}"),
text_run("\u{1F467}"),
];
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
match &pieces[0] {
SegmentPiece::Emoji {
text, structure, ..
} => {
assert_eq!(text, "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}");
assert!(matches!(structure, EmojiStructure::ZwjSequence));
}
_ => panic!(),
}
}
// A base emoji (U+1F44D) and a modifier (U+1F3FF) in separate runs become one
// modifier-sequence emoji piece.
#[test]
fn b5_cross_run_modifier_sequence() {
let inlines = vec![text_run("\u{1F44D}"), text_run("\u{1F3FF}")];
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
match &pieces[0] {
SegmentPiece::Emoji {
text, structure, ..
} => {
assert_eq!(text, "\u{1F44D}\u{1F3FF}");
assert!(matches!(
structure,
EmojiStructure::ModifierSequence {
base: '\u{1F44D}',
..
}
));
}
_ => panic!(),
}
}
// A keycap whose base digit ends the first run: the digit is pulled out of
// the preceding text, and the emoji is attributed to the run of that digit.
#[test]
fn b6_mixed_text_and_keycap() {
let inlines = vec![
text_run("hi 1"),
text_run("\u{FE0F}\u{20E3}"),
text_run(" there"),
];
let r1 = extract_run(&inlines[0]);
let r3 = extract_run(&inlines[2]);
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 3);
match &pieces[0] {
SegmentPiece::Text { run, text } => {
assert!(std::ptr::eq(*run, r1));
assert_eq!(text, "hi ");
}
_ => panic!(),
}
match &pieces[1] {
SegmentPiece::Emoji {
base_run,
text,
structure,
..
} => {
assert!(std::ptr::eq(*base_run, r1));
assert_eq!(text, "1\u{FE0F}\u{20E3}");
assert!(matches!(
structure,
EmojiStructure::KeycapSequence { base: '1' }
));
}
_ => panic!(),
}
match &pieces[2] {
SegmentPiece::Text { run, text } => {
assert!(std::ptr::eq(*run, r3));
assert_eq!(text, " there");
}
_ => panic!(),
}
}
// Two unrelated emoji (U+1F4DE, U+1F4E7) in adjacent runs stay two separate
// emoji pieces.
#[test]
fn b7_adjacent_distinct_emojis() {
let inlines = vec![text_run("\u{1F4DE}"), text_run("\u{1F4E7}")];
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 2);
for p in &pieces {
assert!(matches!(p, SegmentPiece::Emoji { .. }));
}
}
// "a" followed by U+0301 in the next run forms one grapheme, so it stays a
// single text piece attributed to the first run.
#[test]
fn b8_cross_run_combining_mark() {
let inlines = vec![text_run("a"), text_run("\u{0301}")];
let r1 = extract_run(&inlines[0]);
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
match &pieces[0] {
SegmentPiece::Text { run, text } => {
assert!(std::ptr::eq(*run, r1));
assert_eq!(text, "a\u{0301}");
}
_ => panic!(),
}
}
// Two regional indicator symbols (U+1F1E9, U+1F1EA) in separate runs pair up
// into one regional flag emoji piece.
#[test]
fn b9_cross_run_ris_pair() {
let inlines = vec![text_run("\u{1F1E9}"), text_run("\u{1F1EA}")];
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
match &pieces[0] {
SegmentPiece::Emoji {
text, structure, ..
} => {
assert_eq!(text, "\u{1F1E9}\u{1F1EA}");
assert!(matches!(
structure,
EmojiStructure::FlagSequence(crate::render::emoji::cluster::FlagKind::Regional)
));
}
_ => panic!(),
}
}
// Bare digits with no keycap marks are not treated as emoji: the whole run
// stays one text piece.
#[test]
fn b10_digits_in_one_run_stay_text() {
let inlines = vec![text_run("Numbers: 1, 2, 3")];
let units = build_inline_units(&inlines);
let seg = match units.into_iter().next().unwrap() {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
let pieces = seg.classify();
assert_eq!(pieces.len(), 1);
assert!(matches!(pieces[0], SegmentPiece::Text { .. }));
}
// Runs on either side of a field character are joined only with their
// neighbours on the same side.
#[test]
fn b11_field_char_isolates_segments() {
use crate::model::{FieldChar, FieldCharType};
let inlines = vec![
text_run("a"),
text_run("b"),
Inline::FieldChar(FieldChar {
field_char_type: FieldCharType::Begin,
dirty: None,
fld_lock: None,
}),
text_run("c"),
text_run("d"),
];
let units = build_inline_units(&inlines);
assert_eq!(units.len(), 3);
let seg1 = match &units[0] {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
assert_eq!(seg1.text(), "ab");
let seg2 = match &units[2] {
InlineUnit::TextSegment(s) => s,
_ => panic!(),
};
assert_eq!(seg2.text(), "cd");
}
}