// devup-editor-html 0.1.3
//
// HTML ↔ Document conversion + clipboard-mode support (tables, Notion heuristics, data-devup-props round-trip) for devup-editor
// Documentation
//! Clipboard-specific helpers and shared types.
//!
//! All behaviour here mirrors the TypeScript `clipboardHtml.ts`
//! byte-for-byte where interop matters. Deviations are called out in
//! comments.

use std::collections::{BTreeSet, HashMap};

use base64::Engine;
use base64::engine::general_purpose::STANDARD as BASE64;
use devup_editor_core::{Block, BlockId};
use serde::{Deserialize, Serialize};
use serde_json::{Map, Value};

/// Subtree bundle produced by [`crate::html_to_copied_blocks`] and
/// consumed by [`crate::blocks_to_html`].
///
/// - `roots`: top-level blocks in document order.
/// - `by_id`: flat map containing **every** block (including descendants
///   of `table` / `table_row` / `toggle`) so pasting can reconstruct
///   nested structures faithfully.
///
/// The React side serialises / deserialises this across the WASM
/// boundary as JSON; the type declaration is therefore the source of
/// truth for the schema.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CopiedBlocks {
    /// Top-level blocks of the copied selection, in document order.
    pub roots: Vec<Block>,
    /// Flat lookup of every block in the bundle (descendants included),
    /// keyed by block id. Appears under the camelCase key `byId` in the
    /// serialised form.
    #[serde(rename = "byId")]
    pub by_id: HashMap<BlockId, Block>,
}

/// Attribute name used for lossless prop round-trip between devup
/// instances. [`encode_props`] produces the attribute value (see it for
/// the payload format) and [`decode_props`] reads it back.
pub const DEVUP_PROPS_ATTR: &str = "data-devup-props";

/// Prop keys that already have a structural representation in the
/// emitted HTML (a native attribute or a layout element). They are kept
/// out of the marker so the two copies cannot drift apart when the HTML
/// is edited externally and then re-imported.
///
/// The set is built on first use and shared for the rest of the
/// process.
fn props_skip_keys() -> &'static BTreeSet<&'static str> {
    use std::sync::OnceLock;

    const STRUCTURAL_KEYS: [&str; 4] = [
        // Native HTML table attributes.
        "colspan",
        "rowspan",
        // Encoded via <colgroup><col>.
        "columns",
        // Emitted by the enclosing emit() loop via data-indent.
        "indent",
    ];

    static SKIP_KEYS: OnceLock<BTreeSet<&'static str>> = OnceLock::new();
    SKIP_KEYS.get_or_init(|| STRUCTURAL_KEYS.iter().copied().collect())
}

/// Build the `data-devup-props` marker payload for a props map.
///
/// Entries whose key is in [`props_skip_keys`] or whose value is JSON
/// `null` are left out. The result is the empty string when `props` is
/// `None`, when nothing survives the filter, or when JSON serialisation
/// fails; callers are expected to omit the attribute in that case.
///
/// Payload: `base64(utf8_bytes(json_stringify(filtered_props)))` using
/// the standard base64 engine, which keeps the value attribute-safe.
#[must_use]
pub fn encode_props(props: Option<&Map<String, Value>>) -> String {
    let Some(props) = props else {
        return String::new();
    };

    let structural = props_skip_keys();
    let kept: Map<String, Value> = props
        .iter()
        .filter(|(key, value)| !structural.contains(key.as_str()) && !value.is_null())
        .map(|(key, value)| (key.clone(), value.clone()))
        .collect();

    if kept.is_empty() {
        return String::new();
    }

    match serde_json::to_string(&Value::Object(kept)) {
        Ok(json) => BASE64.encode(json.as_bytes()),
        Err(_) => String::new(),
    }
}

/// Turn a `data-devup-props` attribute value back into a props map.
///
/// Two encodings are accepted, tried in this order:
/// 1. the modern payload: standard base64 of UTF-8 JSON;
/// 2. a raw JSON object string, which the TS side emits when
///    `window.btoa` is unavailable (Node fallback path).
///
/// Returns `None` for an empty value or when neither form yields a JSON
/// object.
#[must_use]
pub fn decode_props(raw: &str) -> Option<Map<String, Value>> {
    /// Parse `text` as JSON and keep it only if the top level is an object.
    fn parse_object(text: &str) -> Option<Map<String, Value>> {
        match serde_json::from_str::<Value>(text) {
            Ok(Value::Object(map)) => Some(map),
            _ => None,
        }
    }

    if raw.is_empty() {
        return None;
    }

    BASE64
        .decode(raw.as_bytes())
        .ok()
        // Primary path: base64 → utf8 → JSON.
        .and_then(|bytes| {
            let text = std::str::from_utf8(&bytes).ok()?;
            parse_object(text)
        })
        // Fallback: treat the attribute value as raw JSON.
        .or_else(|| parse_object(raw))
}

/// Cheap test for markup-shaped input, so pure-Markdown text never has
/// to pay the html5ever cost.
///
/// After leading whitespace is skipped, the input qualifies when it
/// starts with `<?xml`, with `<!DOCTYPE` (exact case), or with `<`
/// immediately followed by an ASCII letter. So `<heading>` triggers,
/// while `<3` and `< x` do not.
///
/// The caller is expected to run [`strip_xml_prolog`] before parsing to
/// remove `<?xml ... ?>` declarations that html5ever does not
/// recognise.
#[must_use]
pub fn looks_like_xml(src: &str) -> bool {
    const PROLOG_OPENERS: [&str; 2] = ["<?xml", "<!DOCTYPE"];

    let head = src.trim_start();
    if PROLOG_OPENERS.iter().any(|opener| head.starts_with(*opener)) {
        return true;
    }

    // Bare tag: '<' directly followed by an ASCII letter.
    match head.strip_prefix('<') {
        Some(after_bracket) => {
            matches!(after_bracket.chars().next(), Some(c) if c.is_ascii_alphabetic())
        }
        None => false,
    }
}

/// Drop leading document headers (the `<?xml ... ?>` prolog,
/// `<!DOCTYPE ...>`, and leading `<!-- ... -->` comments) so the HTML5
/// parser is handed the first real element.
///
/// Headers are peeled off one at a time until none is left at the
/// front, so stacked inputs such as
/// `<?xml ?><!DOCTYPE><!-- generated-by -->` are handled in one call.
/// Whitespace before and between headers is skipped. A header whose
/// closing token is missing is left in place.
#[must_use]
pub fn strip_xml_prolog(src: &str) -> String {
    // (opening token, closing token) for each header kind.
    const HEADERS: [(&str, &str); 3] = [("<?xml", "?>"), ("<!DOCTYPE", ">"), ("<!--", "-->")];

    let mut remaining = src.trim_start();
    loop {
        // Text that follows the first complete header at the front, if any.
        let after_header = HEADERS.iter().find_map(|&(open, close)| {
            let body = remaining.strip_prefix(open)?;
            let close_at = body.find(close)?;
            Some(&body[close_at + close.len()..])
        });
        match after_header {
            Some(tail) => remaining = tail.trim_start(),
            None => break,
        }
    }
    remaining.to_string()
}

/// Remove Microsoft / HWP preprocessing artifacts that would otherwise
/// confuse the HTML parser.
///
/// The input is walked left to right. Wherever [`skip_matched_region`]
/// reports a disposable region the cursor jumps past it; everywhere
/// else exactly one whole character is copied, so multi-byte UTF-8 text
/// is never split.
///
/// Currently removes:
/// - `<!--StartFragment-->` and `<!--EndFragment-->` (MS clipboard
///   markers)
/// - `<o:p>…</o:p>` and bare `<o:p>` / `</o:p>` (Office VML namespace)
#[must_use]
pub fn clean_html(html: &str) -> String {
    let mut cleaned = String::with_capacity(html.len());
    let mut cursor = 0usize;
    while cursor < html.len() {
        cursor = match skip_matched_region(html, cursor) {
            // Disposable region: resume right after it, copying nothing.
            Some(resume_at) => resume_at,
            // Ordinary content: copy one full character.
            None => match html[cursor..].chars().next() {
                Some(ch) => {
                    cleaned.push(ch);
                    cursor + ch.len_utf8()
                }
                None => break,
            },
        };
    }
    cleaned
}

/// Peek at `html[i..]` and, if it starts with a recognised disposable
/// region, return the byte index *after* it. Returns `None` when the
/// current position should be copied verbatim.
///
/// Recognised regions:
/// - `<!--StartFragment-->` / `<!--EndFragment-->`, matched
///   case-insensitively and tolerant of whitespace inside the comment.
///   Any other comment, or an unterminated one, is left alone.
/// - `<o:p …>` through the next `</o:p>`, or just the opening tag when
///   no closing tag follows.
/// - A standalone `</o:p …>`.
///
/// `i` must be a character boundary of `html` no greater than
/// `html.len()`; the initial slice panics otherwise.
fn skip_matched_region(html: &str, i: usize) -> Option<usize> {
    const COMMENT_OPEN: &str = "<!--";
    const COMMENT_CLOSE: &str = "-->";
    const OP_OPEN: &str = "<o:p";
    const OP_CLOSE_PREFIX: &str = "</o:p";
    const OP_CLOSE: &str = "</o:p>";

    /// True when the byte at `at` terminates a tag name (`>`, `/`, or
    /// ASCII whitespace). End of input does not count.
    fn tag_name_ends_at(s: &str, at: usize) -> bool {
        matches!(
            s.as_bytes().get(at),
            Some(&b) if b == b'>' || b == b'/' || b.is_ascii_whitespace()
        )
    }

    let rest = &html[i..];

    // <!--StartFragment--> / <!--EndFragment-->.
    if let Some(after_open) = rest.strip_prefix(COMMENT_OPEN) {
        // Look for the terminator only *after* the opening token. Searching
        // from the start of `rest` would match the dashes of the opening
        // token itself in degenerate comments such as `<!-->` or `<!--->`,
        // giving an end offset that lies before the comment body.
        let inner_len = after_open.find(COMMENT_CLOSE)?;
        let inner = after_open[..inner_len].trim();
        let is_marker = inner.eq_ignore_ascii_case("startfragment")
            || inner.eq_ignore_ascii_case("endfragment");
        if is_marker {
            return Some(i + COMMENT_OPEN.len() + inner_len + COMMENT_CLOSE.len());
        }
        return None;
    }

    // <o:p …>…</o:p> — strip the whole pair.
    //
    // The tag name must end right after the `p`. A bare prefix test would
    // also accept other Office tags (`<o:Pages>`, `<o:PixelsPerInch>`, …)
    // and then delete everything up to the next `</o:p>`.
    // NOTE(review): if `clipboardHtml.ts` matches on the bare prefix, this
    // is a deliberate deviation — confirm against the TS side.
    if starts_with_case_insensitive(rest, OP_OPEN) && tag_name_ends_at(rest, OP_OPEN.len()) {
        let body_start = rest.find('>')? + 1;
        let region_len = match find_case_insensitive(&rest[body_start..], OP_CLOSE) {
            Some(close_rel) => body_start + close_rel + OP_CLOSE.len(),
            // No closing tag anywhere after it: drop only the opening tag.
            None => body_start,
        };
        return Some(i + region_len);
    }

    // Standalone </o:p …> for malformed inputs.
    if starts_with_case_insensitive(rest, OP_CLOSE_PREFIX)
        && tag_name_ends_at(rest, OP_CLOSE_PREFIX.len())
    {
        let tag_end = rest.find('>')?;
        return Some(i + tag_end + 1);
    }

    None
}

/// Report whether `haystack` begins with `needle`, ignoring ASCII case.
///
/// Returns `false` when `haystack` is shorter than `needle` or when
/// `needle.len()` does not fall on a character boundary of `haystack`.
fn starts_with_case_insensitive(haystack: &str, needle: &str) -> bool {
    // `str::get` yields `None` for an out-of-range or mid-character end.
    match haystack.get(..needle.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(needle),
        None => false,
    }
}

/// Find the first occurrence of `needle` in `haystack`, ignoring ASCII
/// case, and return its byte offset.
///
/// Returns `None` when `needle` is empty, when `haystack` is shorter
/// than `needle`, or when there is no match.
///
/// The search compares raw bytes rather than slicing `haystack` as a
/// `str`: slicing `haystack[i..i + needle.len()]` panics whenever the
/// end index lands inside a multi-byte character (e.g. searching
/// `"ab안녕</o:p>"` for `"</o:p>"`). A byte-level match is still
/// guaranteed to start on a character boundary, because `needle` is
/// valid UTF-8 and non-ASCII bytes only match themselves.
fn find_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let needle = needle.as_bytes();
    // `windows` panics on a zero width, so reject the empty needle first.
    if needle.is_empty() {
        return None;
    }
    haystack
        .as_bytes()
        .windows(needle.len())
        .position(|window| window.eq_ignore_ascii_case(needle))
}

#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    /// Build a props map from literal `(key, value)` pairs.
    fn props_from(entries: Vec<(&str, Value)>) -> Map<String, Value> {
        entries
            .into_iter()
            .map(|(key, value)| (key.to_owned(), value))
            .collect()
    }

    // ---- encode_props / decode_props ---------------------------------

    #[test]
    fn encode_empty_props() {
        assert_eq!(encode_props(None), "");
        assert_eq!(encode_props(Some(&Map::new())), "");
    }

    #[test]
    fn encode_skips_structural_keys() {
        let structural_only = props_from(vec![
            ("colspan", json!(2)),
            ("rowspan", json!(3)),
            ("columns", json!([])),
            ("indent", json!(1)),
        ]);
        assert_eq!(
            encode_props(Some(&structural_only)),
            "",
            "all four skip keys alone should produce no marker"
        );
    }

    #[test]
    fn encode_drops_null_values() {
        let props = props_from(vec![
            ("backgroundColor", Value::Null),
            ("borderColor", json!("#f59e0b")),
        ]);
        let marker = encode_props(Some(&props));
        let decoded = decode_props(&marker).unwrap();
        assert!(!decoded.contains_key("backgroundColor"));
        assert_eq!(decoded.get("borderColor"), Some(&json!("#f59e0b")));
    }

    #[test]
    fn roundtrip_preserves_arbitrary_props() {
        let props = props_from(vec![
            ("backgroundColor", json!("#fef3c7")),
            ("borderWidth", json!("2px")),
            ("padding", json!(12)),
            ("verticalAlign", json!("bottom")),
        ]);
        let marker = encode_props(Some(&props));
        let decoded = decode_props(&marker).unwrap();
        assert_eq!(decoded, props);
    }

    #[test]
    fn decode_plain_json_fallback() {
        // The TS fallback (no window.btoa) emits raw JSON. Must decode.
        let decoded = decode_props(r##"{"backgroundColor":"#fef3c7"}"##).unwrap();
        assert_eq!(decoded.get("backgroundColor"), Some(&json!("#fef3c7")));
    }

    #[test]
    fn decode_rejects_garbage() {
        for raw in ["", "not base64 and not json!"] {
            assert!(decode_props(raw).is_none(), "should reject {raw:?}");
        }
    }

    // ---- clean_html --------------------------------------------------

    #[test]
    fn clean_html_strips_ms_markers() {
        let cleaned = clean_html("<!--StartFragment--><p>hi</p><!--EndFragment-->");
        assert_eq!(cleaned, "<p>hi</p>");
    }

    #[test]
    fn clean_html_strips_o_p_tags() {
        // The bare-open form is closed by the final `>` so we strip it too.
        let cleaned = clean_html("<o:p>junk</o:p><p>real</p><o:p />");
        assert_eq!(cleaned, "<p>real</p>");
    }

    #[test]
    fn clean_html_case_insensitive() {
        let cleaned = clean_html("<!--STARTFRAGMENT--><p>x</p><!-- EndFragment -->");
        assert_eq!(cleaned, "<p>x</p>");
    }

    #[test]
    fn clean_html_preserves_unknown_comments() {
        let untouched = "<!-- keep me --><p>x</p>";
        assert_eq!(clean_html(untouched), untouched);
    }

    #[test]
    fn clean_html_leaves_unicode_intact() {
        let cleaned = clean_html("안녕<!--StartFragment-->세계");
        assert_eq!(cleaned, "안녕세계");
    }

    // ---- looks_like_xml ----------------------------------------------

    #[test]
    fn looks_like_xml_recognises_canonical_prolog() {
        for src in ["<?xml version='1.0'?>", "<!DOCTYPE html>"] {
            assert!(looks_like_xml(src), "should accept {src:?}");
        }
    }

    #[test]
    fn looks_like_xml_accepts_bare_tags() {
        for src in ["<h1>", "<paragraph>body</paragraph>"] {
            assert!(looks_like_xml(src), "should accept {src:?}");
        }
    }

    #[test]
    fn looks_like_xml_rejects_markdown_lookalikes() {
        for src in ["not xml", "<3 love you", "< malformed", ""] {
            assert!(!looks_like_xml(src), "should reject {src:?}");
        }
    }

    // ---- strip_xml_prolog --------------------------------------------

    #[test]
    fn strip_xml_prolog_removes_xml_declaration() {
        let stripped = strip_xml_prolog(r#"<?xml version="1.0"?><h1>T</h1>"#);
        assert_eq!(stripped, "<h1>T</h1>");
    }

    #[test]
    fn strip_xml_prolog_removes_doctype() {
        let stripped = strip_xml_prolog("<!DOCTYPE html><h1>T</h1>");
        assert_eq!(stripped, "<h1>T</h1>");
    }

    #[test]
    fn strip_xml_prolog_handles_combinations() {
        let stripped =
            strip_xml_prolog(r#"<?xml version="1.0"?><!DOCTYPE foo><!-- comment --><h1>T</h1>"#);
        assert_eq!(stripped, "<h1>T</h1>");
    }

    #[test]
    fn strip_xml_prolog_trims_leading_whitespace() {
        let stripped = strip_xml_prolog("   <?xml version=\"1.0\"?>  <h1>T</h1>");
        assert_eq!(stripped, "<h1>T</h1>");
    }
}