devup-editor-html 1.0.10

HTML ↔ Document conversion + clipboard-mode support (tables, Notion heuristics, data-devup-props round-trip) for devup-editor
Documentation
//! UTF-16 offset-based TextSpan slicing.
//!
//! Mirrors `sliceContent()` from the React clipboard helpers so that
//! partial-block copy (selecting the middle of a paragraph) produces the
//! same output on both sides of the WASM boundary.

use devup_editor_core::model::inline::{utf16_len, utf16_to_byte};
use devup_editor_core::{TextSpan, normalize_spans};

/// Extract the part of `content` covered by the half-open UTF-16 range
/// `[start, end)`, measured across the concatenated text of all spans.
///
/// Every emitted span carries a clone of the marks of the span it was cut
/// from. Spans lying entirely outside the range, and spans whose cut comes
/// out empty, are omitted. The collected spans are passed through
/// [`normalize_spans`] before being returned.
///
/// An empty or inverted range (`start >= end`) yields an empty vector.
/// An `end` past the total length is clamped per span, so oversized
/// ranges simply return everything from `start` onwards.
#[must_use]
pub fn slice_content(content: &[TextSpan], start: usize, end: usize) -> Vec<TextSpan> {
    if end <= start {
        return Vec::new();
    }

    let mut pieces: Vec<TextSpan> = Vec::new();
    // Running UTF-16 offset of the *next* span's first code unit.
    let mut offset = 0usize;

    for span in content {
        let len = utf16_len(&span.text);
        let base = offset;
        offset += len;

        // Everything from here on starts at or after the range end.
        if base >= end {
            break;
        }
        // Span finishes before the range begins.
        if offset <= start {
            continue;
        }

        // Translate the global range into offsets local to this span.
        let lo = start.saturating_sub(base);
        let hi = len.min(end - base);
        if lo >= hi {
            continue;
        }

        let text = slice_utf16(&span.text, lo, hi);
        if !text.is_empty() {
            pieces.push(TextSpan {
                text,
                marks: span.marks.clone(),
            });
        }
    }

    normalize_spans(&mut pieces);
    pieces
}

/// Return the substring of `s` covered by the UTF-16 code unit range
/// `[from, to)`.
///
/// Both offsets are converted to byte positions with [`utf16_to_byte`];
/// per this module's tests, an offset that falls inside a surrogate pair
/// resolves to the position just after that character, so the byte range
/// lands on char boundaries. An empty or inverted byte range yields an
/// empty string.
fn slice_utf16(s: &str, from: usize, to: usize) -> String {
    let lo = utf16_to_byte(s, from);
    let hi = utf16_to_byte(s, to);
    if lo < hi {
        s[lo..hi].to_owned()
    } else {
        String::new()
    }
}

/// Unit tests for [`slice_content`].
///
/// Non-ASCII fixtures are written with `\u{...}` escapes so the exact
/// code points under test cannot be altered by a file-encoding mishap;
/// the comment next to each fixture shows the rendered text.
#[cfg(test)]
mod tests {
    use super::*;
    use devup_editor_core::Mark;

    /// Build an unmarked span.
    fn s(text: &str) -> TextSpan {
        TextSpan::plain(text)
    }

    /// Build a span carrying exactly one mark.
    fn sm(text: &str, mark: Mark) -> TextSpan {
        TextSpan::with_marks(text, vec![mark])
    }

    #[test]
    fn slice_empty_range_returns_empty() {
        let spans = vec![s("hello")];
        assert!(slice_content(&spans, 2, 2).is_empty());
        assert!(slice_content(&spans, 5, 2).is_empty());
    }

    #[test]
    fn slice_single_span_middle() {
        let spans = vec![s("hello world")];
        let out = slice_content(&spans, 6, 11);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "world");
    }

    #[test]
    fn slice_across_multiple_spans() {
        // lens: "hello " = 6, "bold " = 5, "world" = 5 → total 16.
        // Range [3, 13) = "lo " + "bold " + "wo".
        let spans = vec![s("hello "), sm("bold ", Mark::bold()), s("world")];
        let out = slice_content(&spans, 3, 13);
        assert_eq!(out.len(), 3);
        assert_eq!(out[0].text, "lo ");
        assert!(out[0].marks.is_empty());
        assert_eq!(out[1].text, "bold ");
        assert!(out[1].has_mark("bold"));
        assert_eq!(out[2].text, "wo");
        assert!(out[2].marks.is_empty());
    }

    #[test]
    fn slice_clamps_beyond_end() {
        let spans = vec![s("hi")];
        let out = slice_content(&spans, 0, 100);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "hi");
    }

    #[test]
    fn slice_preserves_utf16_offsets_with_surrogates() {
        // U+1F600 (😀) is a surrogate pair → UTF-16 length 2, UTF-8 length 4.
        let spans = vec![s("a\u{1F600}b")];
        // UTF-16 indexes: a=0, 😀=1..3, b=3..4. Total len 4.
        let out = slice_content(&spans, 0, 3);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "a\u{1F600}");
        let out = slice_content(&spans, 1, 3);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "\u{1F600}");
    }

    #[test]
    fn slice_drops_empty_result_spans() {
        let spans = vec![s("hi "), sm("", Mark::bold()), s("there")];
        let out = slice_content(&spans, 0, 8);
        // Empty middle span never produces output; normalize_spans also
        // merges adjacent same-mark spans.
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "hi there");
    }

    #[test]
    fn slice_at_surrogate_boundary_clamps_forward() {
        // Offset 2 lands in the middle of 😀's surrogate pair. The
        // implementation advances until `units >= to`, which in the
        // mid-surrogate case means AFTER the emoji. We never slice
        // inside a char, so the emoji is included.
        let spans = vec![s("a\u{1F600}b")];
        let out = slice_content(&spans, 0, 2);
        assert_eq!(out.len(), 1);
        assert_eq!(out[0].text, "a\u{1F600}");
    }

    #[test]
    fn slice_handles_korean_hangul() {
        // "한글테스트" — 5 Hangul syllables, each 1 UTF-16 unit and
        // 3 UTF-8 bytes.
        let spans = vec![s("\u{D55C}\u{AE00}\u{D14C}\u{C2A4}\u{D2B8}")];
        let out = slice_content(&spans, 2, 5);
        assert_eq!(out.len(), 1);
        // "테스트"
        assert_eq!(out[0].text, "\u{D14C}\u{C2A4}\u{D2B8}");
    }

    #[test]
    fn slice_handles_zwj_emoji_sequence() {
        // Family emoji (👨‍👩‍👧‍👦): 4 emoji + 3 ZWJ joiners = 11 UTF-16 units.
        // Slicing at a ZWJ boundary splits the visual grapheme but
        // produces valid UTF-8 — documented behaviour.
        let family = "\u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467}\u{200D}\u{1F466}";
        let spans = vec![s(&format!("{family}X"))];
        let total = slice_content(&spans, 0, 12);
        assert_eq!(total[0].text, format!("{family}X"));
        // Cut right before the trailing X — offset 11.
        let head = slice_content(&spans, 0, 11);
        assert_eq!(head[0].text, family);
    }

    #[test]
    fn slice_flag_emoji() {
        // 🇰🇷 = two regional indicator symbols (U+1F1F0, U+1F1F7), each a
        // surrogate pair. Total 4 UTF-16 units.
        let spans = vec![s("\u{1F1F0}\u{1F1F7}hello")];
        // Skip the flag entirely: offset 4 = after flag.
        let out = slice_content(&spans, 4, 9);
        assert_eq!(out[0].text, "hello");
        // Take only the flag.
        let flag = slice_content(&spans, 0, 4);
        assert_eq!(flag[0].text, "\u{1F1F0}\u{1F1F7}");
    }
}