Skip to main content

devup_editor_html/
slice.rs

1//! UTF-16 offset-based TextSpan slicing.
2//!
3//! Mirrors `sliceContent()` from the React clipboard helpers so that
4//! partial-block copy (selecting the middle of a paragraph) produces the
5//! same output on both sides of the WASM boundary.
6
7use devup_editor_core::model::inline::{utf16_len, utf16_to_byte};
8use devup_editor_core::{TextSpan, normalize_spans};
9
10/// Slice `content` by the UTF-16 offset range `[start, end)`. Preserves
11/// per-span marks; spans that don't intersect the range are dropped.
12///
13/// `start >= end` returns an empty vector. Offsets beyond the content
14/// clamp to the end rather than panicking.
15#[must_use]
16pub fn slice_content(content: &[TextSpan], start: usize, end: usize) -> Vec<TextSpan> {
17    if start >= end {
18        return Vec::new();
19    }
20
21    let mut out: Vec<TextSpan> = Vec::new();
22    let mut cursor = 0usize;
23    for span in content {
24        let span_len = utf16_len(&span.text);
25        let span_start = cursor;
26        let span_end = cursor + span_len;
27        cursor = span_end;
28
29        if span_end <= start {
30            continue;
31        }
32        if span_start >= end {
33            break;
34        }
35
36        let from = start.saturating_sub(span_start);
37        let to = (end - span_start).min(span_len);
38        if from >= to {
39            continue;
40        }
41
42        let sliced = slice_utf16(&span.text, from, to);
43        if sliced.is_empty() {
44            continue;
45        }
46        out.push(TextSpan {
47            text: sliced,
48            marks: span.marks.clone(),
49        });
50    }
51
52    normalize_spans(&mut out);
53    out
54}
55
56/// Slice `s` by UTF-16 code unit range `[from, to)`. Both ends clamp
57/// forward through [`utf16_to_byte`], so mid-surrogate offsets always
58/// produce valid UTF-8.
59fn slice_utf16(s: &str, from: usize, to: usize) -> String {
60    let byte_from = utf16_to_byte(s, from);
61    let byte_to = utf16_to_byte(s, to);
62    if byte_from >= byte_to {
63        return String::new();
64    }
65    s[byte_from..byte_to].to_string()
66}
67
#[cfg(test)]
mod tests {
    //! Unit tests for [`slice_content`].
    //!
    //! Astral-plane characters and zero-width joiners are written as
    //! `\u{...}` escapes so the literals stay unambiguous even if the
    //! file is viewed or copied through a tool that mangles non-ASCII
    //! text; the glyph each escape stands for is named in a comment.

    use super::*;
    use devup_editor_core::Mark;

    /// Build an unmarked span.
    fn s(text: &str) -> TextSpan {
        TextSpan::plain(text)
    }

    /// Build a span carrying a single mark.
    fn sm(text: &str, mark: Mark) -> TextSpan {
        TextSpan::with_marks(text, vec![mark])
    }

    #[test]
    fn slice_empty_range_returns_empty() {
        let spans = vec![s("hello")];
        assert!(slice_content(&spans, 2, 2).is_empty());
        assert!(slice_content(&spans, 5, 2).is_empty());
    }

    #[test]
    fn slice_single_span_middle() {
        let spans = vec![s("hello world")];
        let out = slice_content(&spans, 6, 11);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "world");
    }

    #[test]
    fn slice_across_multiple_spans() {
        // lens: "hello " = 6, "bold " = 5, "world" = 5 → total 16.
        // Range [3, 13) = "lo " + "bold " + "wo".
        let spans = vec![s("hello "), sm("bold ", Mark::bold()), s("world")];
        let out = slice_content(&spans, 3, 13);
        assert_eq!(out.len(), 3);
        assert_eq!(out[0].text, "lo ");
        assert!(out[0].marks.is_empty());
        assert_eq!(out[1].text, "bold ");
        assert!(out[1].has_mark("bold"));
        assert_eq!(out[2].text, "wo");
        assert!(out[2].marks.is_empty());
    }

    #[test]
    fn slice_clamps_beyond_end() {
        let spans = vec![s("hi")];
        let out = slice_content(&spans, 0, 100);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "hi");
    }

    #[test]
    fn slice_preserves_utf16_offsets_with_surrogates() {
        // U+1F600 (grinning face) is a surrogate pair → UTF-16 length 2,
        // UTF-8 length 4.
        let spans = vec![s("a\u{1F600}b")];
        // UTF-16 indexes: a=0, U+1F600=1..3, b=3..4. Total len 4.
        let out = slice_content(&spans, 0, 3);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "a\u{1F600}");
        let out = slice_content(&spans, 1, 3);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "\u{1F600}");
    }

    #[test]
    fn slice_drops_empty_result_spans() {
        let spans = vec![s("hi "), sm("", Mark::bold()), s("there")];
        let out = slice_content(&spans, 0, 8);
        // Empty middle span never produces output; normalize_spans also
        // merges adjacent same-mark spans.
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "hi there");
    }

    #[test]
    fn slice_at_surrogate_boundary_clamps_forward() {
        // Offset 2 lands in the middle of U+1F600's surrogate pair. The
        // implementation advances until `units >= to`, which in the
        // mid-surrogate case means AFTER the emoji. We never slice
        // inside a char, so the emoji is included.
        let spans = vec![s("a\u{1F600}b")];
        let out = slice_content(&spans, 0, 2);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "a\u{1F600}");
    }

    #[test]
    fn slice_handles_korean_hangul() {
        // "한글테스트" — 5 Hangul syllables, each 1 UTF-16 unit and
        // 3 UTF-8 bytes. Range [2, 5) drops the first two syllables.
        let spans = vec![s("한글테스트")];
        let out = slice_content(&spans, 2, 5);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "테스트");
    }

    #[test]
    fn slice_handles_zwj_emoji_sequence() {
        // Family emoji: man, woman, girl, boy joined by U+200D (ZWJ).
        // 4 emoji (2 units each) + 3 ZWJ joiners = 11 UTF-16 units.
        // Slicing at a ZWJ boundary splits the visual grapheme but
        // produces valid UTF-8 — documented behaviour.
        let family = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}";
        let family_x = [family, "X"].concat();
        let spans = vec![s(&family_x)];
        let total = slice_content(&spans, 0, 12);
        assert_eq!(total[0].text, family_x);
        // Cut right before the trailing X — offset 11.
        let head = slice_content(&spans, 0, 11);
        assert_eq!(head[0].text, family);
    }

    #[test]
    fn slice_flag_emoji() {
        // South Korea flag = two regional indicator symbols
        // (U+1F1F0, U+1F1F7), each a surrogate pair.
        // Total 4 UTF-16 units.
        let spans = vec![s("\u{1F1F0}\u{1F1F7}hello")];
        // Skip the flag entirely: offset 4 = after flag.
        let out = slice_content(&spans, 4, 9);
        assert_eq!(out[0].text, "hello");
        // Take only the flag.
        let flag = slice_content(&spans, 0, 4);
        assert_eq!(flag[0].text, "\u{1F1F0}\u{1F1F7}");
    }
}