Skip to main content

devup_editor_html/
clipboard.rs

1//! Clipboard-specific helpers and shared types.
2//!
3//! All behaviour here mirrors the TypeScript `clipboardHtml.ts` byte-for-
4//! byte where interop matters. Deviations are called out in comments.
5
6use std::collections::{BTreeSet, HashMap};
7
8use base64::Engine;
9use base64::engine::general_purpose::STANDARD as BASE64;
10use devup_editor_core::{Block, BlockId};
11use serde::{Deserialize, Serialize};
12use serde_json::{Map, Value};
13
/// Subtree bundle produced by [`crate::html_to_copied_blocks`] and
/// consumed by [`crate::blocks_to_html`].
///
/// - `roots`: top-level blocks in document order.
/// - `by_id`: flat map containing **every** block (including descendants
///   of `table` / `table_row` / `toggle`) so pasting can reconstruct
///   nested structures faithfully.
///
/// The React side serialises / deserialises this across the WASM
/// boundary as JSON; the type declaration is therefore the source of
/// truth for the schema. Note that `by_id` is renamed to `byId` on the
/// wire, while `roots` keeps its Rust name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CopiedBlocks {
    /// Top-level blocks of the copied selection, in document order.
    pub roots: Vec<Block>,
    /// Lookup table from block id to block; serialised under the JSON
    /// key `byId`.
    #[serde(rename = "byId")]
    pub by_id: HashMap<BlockId, Block>,
}
31
/// Attribute name used for lossless prop round-trip between devup
/// instances. See [`encode_props`] for the payload format and
/// [`decode_props`] for the reverse direction.
pub const DEVUP_PROPS_ATTR: &str = "data-devup-props";
35
/// Prop keys that already have a structural representation in the
/// emitted HTML and are therefore left out of the props marker, so the
/// marker cannot drift from the markup when the HTML is edited
/// externally and re-imported.
///
/// The set is built once on first use and shared for the lifetime of
/// the process.
fn props_skip_keys() -> &'static BTreeSet<&'static str> {
    use std::sync::OnceLock;

    const STRUCTURAL_KEYS: [&str; 4] = [
        // Native HTML table attributes.
        "colspan",
        "rowspan",
        // Encoded via <colgroup><col>.
        "columns",
        // Emitted by the enclosing emit() loop via data-indent.
        "indent",
    ];

    static KEYS: OnceLock<BTreeSet<&'static str>> = OnceLock::new();
    KEYS.get_or_init(|| BTreeSet::from(STRUCTURAL_KEYS))
}
55
56/// Serialize a props map into the `data-devup-props` marker payload.
57///
58/// Returns an empty string when there is nothing worth marking (all
59/// keys either skipped, null, or the map was None). The caller is
60/// expected to omit the attribute entirely in that case.
61///
62/// Payload: `base64(utf8_bytes(json_stringify(filtered_props)))` using
63/// standard base64. Attribute-safe by construction.
64#[must_use]
65pub fn encode_props(props: Option<&Map<String, Value>>) -> String {
66    let Some(props) = props else {
67        return String::new();
68    };
69    let skip = props_skip_keys();
70    let mut filtered = Map::new();
71    for (k, v) in props {
72        if skip.contains(k.as_str()) {
73            continue;
74        }
75        if v.is_null() {
76            continue;
77        }
78        filtered.insert(k.clone(), v.clone());
79    }
80    if filtered.is_empty() {
81        return String::new();
82    }
83    let Ok(json) = serde_json::to_string(&Value::Object(filtered)) else {
84        return String::new();
85    };
86    BASE64.encode(json.as_bytes())
87}
88
89/// Decode the `data-devup-props` marker back into a props map.
90///
91/// Accepts both the modern base64 payload (standard alphabet) and, as
92/// a fallback, a raw JSON object string — the TS side emits the latter
93/// when `window.btoa` is unavailable (Node fallback path). Returns
94/// `None` when the attribute is absent or unparseable.
95#[must_use]
96pub fn decode_props(raw: &str) -> Option<Map<String, Value>> {
97    if raw.is_empty() {
98        return None;
99    }
100
101    // Primary path: base64 → utf8 → JSON.
102    if let Ok(bytes) = BASE64.decode(raw.as_bytes())
103        && let Ok(text) = std::str::from_utf8(&bytes)
104        && let Ok(Value::Object(map)) = serde_json::from_str::<Value>(text)
105    {
106        return Some(map);
107    }
108
109    // Fallback: treat the attribute value as raw JSON.
110    if let Ok(Value::Object(map)) = serde_json::from_str::<Value>(raw) {
111        return Some(map);
112    }
113
114    None
115}
116
/// Cheap pre-check for XML-shaped input so pure-Markdown text never
/// pays the html5ever cost.
///
/// After skipping leading whitespace, the input counts as XML-shaped
/// when it starts with:
///
/// - the `<?xml` declaration opener,
/// - an explicit `<!DOCTYPE` (matched case-sensitively), or
/// - `<` immediately followed by an ASCII letter (so `<heading>`
///   triggers but `<3` and `< x` don't).
///
/// The caller is expected to [`strip_xml_prolog`] before parsing to
/// remove `<?xml ... ?>` declarations that html5ever does not
/// recognise.
#[must_use]
pub fn looks_like_xml(src: &str) -> bool {
    let head = src.trim_start();
    // An ASCII-letter byte is always a complete character, so checking
    // the second byte is the same as checking the second character.
    head.starts_with("<?xml")
        || head.starts_with("<!DOCTYPE")
        || matches!(head.as_bytes(), [b'<', second, ..] if second.is_ascii_alphabetic())
}
141
/// Drop leading XML declarations (`<?xml ... ?>`), `<!DOCTYPE ...>`
/// headers and comments, together with the whitespace around them, so
/// html5ever's HTML5 parser doesn't see tokens it doesn't recognise.
///
/// Headers are peeled off repeatedly until the input no longer starts
/// with one, so stacked prefixes
/// (`<?xml ?><!DOCTYPE><!-- generated-by -->`) are handled in a single
/// call. A header whose terminator is missing is left in place and
/// ends the scan.
#[must_use]
pub fn strip_xml_prolog(src: &str) -> String {
    // (opening marker, terminator) for every strippable header kind.
    const HEADERS: [(&str, &str); 3] = [("<?xml", "?>"), ("<!DOCTYPE", ">"), ("<!--", "-->")];

    let mut remaining = src.trim_start();
    'scan: loop {
        for (opener, closer) in HEADERS {
            let Some(body) = remaining.strip_prefix(opener) else {
                continue;
            };
            let Some(at) = body.find(closer) else {
                continue;
            };
            remaining = body[at + closer.len()..].trim_start();
            // Something was removed: start over from the first header kind.
            continue 'scan;
        }
        break;
    }
    remaining.to_owned()
}
176
177/// Strip Microsoft / HWP preprocessing artifacts that would otherwise
178/// confuse the HTML parser. Safe to call on arbitrary input including
179/// multi-byte UTF-8.
180///
181/// Currently removes:
182/// - `<!--StartFragment-->` and `<!--EndFragment-->` (MS clipboard
183///   markers)
184/// - `<o:p>…</o:p>` and bare `<o:p>` / `</o:p>` (Office VML namespace)
185#[must_use]
186pub fn clean_html(html: &str) -> String {
187    let mut out = String::with_capacity(html.len());
188    let mut i = 0usize;
189    while i < html.len() {
190        if let Some(next) = skip_matched_region(html, i) {
191            i = next;
192            continue;
193        }
194        // Advance one full character to stay on UTF-8 boundaries.
195        let rest = &html[i..];
196        let Some(ch) = rest.chars().next() else { break };
197        out.push(ch);
198        i += ch.len_utf8();
199    }
200    out
201}
202
203/// Peek at `html[i..]` and, if it starts with a recognised disposable
204/// region, return the byte index *after* it. Returns `None` when the
205/// current position should be copied verbatim.
206fn skip_matched_region(html: &str, i: usize) -> Option<usize> {
207    let rest = &html[i..];
208
209    // <!--StartFragment--> / <!--EndFragment--> (case-insensitive,
210    // tolerant of whitespace inside the marker).
211    if rest.starts_with("<!--") {
212        let comment_end = rest.find("-->")?;
213        let inner = rest[4..comment_end].trim().to_ascii_lowercase();
214        if inner == "startfragment" || inner == "endfragment" {
215            return Some(i + comment_end + 3);
216        }
217        return None;
218    }
219
220    // <o:p …>…</o:p> — strip the whole pair.
221    if starts_with_case_insensitive(rest, "<o:p") {
222        let open_end = rest.find('>')?;
223        let after_open = &rest[open_end + 1..];
224        if let Some(close_rel) = find_case_insensitive(after_open, "</o:p>") {
225            return Some(i + open_end + 1 + close_rel + "</o:p>".len());
226        }
227        return Some(i + open_end + 1);
228    }
229
230    // Standalone </o:p …> for malformed inputs.
231    if starts_with_case_insensitive(rest, "</o:p") {
232        let end = rest.find('>')?;
233        return Some(i + end + 1);
234    }
235
236    None
237}
238
/// Report whether `haystack` begins with `needle`, ignoring ASCII case.
///
/// The comparison is done on bytes: non-ASCII bytes must match exactly,
/// so a successful match always ends on a character boundary of
/// `haystack`.
fn starts_with_case_insensitive(haystack: &str, needle: &str) -> bool {
    let wanted = needle.as_bytes();
    match haystack.as_bytes().get(..wanted.len()) {
        Some(prefix) => prefix.eq_ignore_ascii_case(wanted),
        // `haystack` is shorter than `needle`.
        None => false,
    }
}
244
/// Find the first occurrence of `needle` in `haystack`, ignoring ASCII
/// case, and return its starting byte index.
///
/// Returns `None` when `needle` is empty, longer than `haystack`, or
/// simply absent.
///
/// The scan compares raw bytes instead of slicing `haystack` as a
/// string: a `haystack[i..i + needle.len()]` slice panics whenever the
/// window *ends* inside a multi-byte character, which is easy to hit
/// with non-ASCII text before the match. Because `needle` is valid
/// UTF-8 and non-ASCII bytes must match exactly, a byte-level match
/// always starts on a character boundary, so the returned index is safe
/// to slice with.
fn find_case_insensitive(haystack: &str, needle: &str) -> Option<usize> {
    let wanted = needle.as_bytes();
    // `windows(0)` panics, and an empty needle never counted as a match.
    if wanted.is_empty() {
        return None;
    }
    haystack
        .as_bytes()
        .windows(wanted.len())
        .position(|window| window.eq_ignore_ascii_case(wanted))
}
260
/// Unit tests for the props marker codec, the clipboard HTML cleaner
/// and the XML prolog helpers defined in this module.
#[cfg(test)]
mod tests {
    use super::*;
    use serde_json::json;

    // --- encode_props / decode_props -------------------------------

    /// Both a missing map and an empty map produce no marker.
    #[test]
    fn encode_empty_props() {
        assert_eq!(encode_props(None), "");
        let empty = Map::new();
        assert_eq!(encode_props(Some(&empty)), "");
    }

    /// A map made up solely of structural keys produces no marker.
    #[test]
    fn encode_skips_structural_keys() {
        let mut m = Map::new();
        m.insert("colspan".into(), json!(2));
        m.insert("rowspan".into(), json!(3));
        m.insert("columns".into(), json!([]));
        m.insert("indent".into(), json!(1));
        assert_eq!(
            encode_props(Some(&m)),
            "",
            "all four skip keys alone should produce no marker"
        );
    }

    /// Null-valued entries are filtered out; other entries survive.
    #[test]
    fn encode_drops_null_values() {
        let mut m = Map::new();
        m.insert("backgroundColor".into(), Value::Null);
        m.insert("borderColor".into(), json!("#f59e0b"));
        let encoded = encode_props(Some(&m));
        let decoded = decode_props(&encoded).unwrap();
        assert!(!decoded.contains_key("backgroundColor"));
        assert_eq!(decoded.get("borderColor"), Some(&json!("#f59e0b")));
    }

    /// Encoding then decoding returns the original map unchanged.
    #[test]
    fn roundtrip_preserves_arbitrary_props() {
        let mut m = Map::new();
        m.insert("backgroundColor".into(), json!("#fef3c7"));
        m.insert("borderWidth".into(), json!("2px"));
        m.insert("padding".into(), json!(12));
        m.insert("verticalAlign".into(), json!("bottom"));
        let encoded = encode_props(Some(&m));
        let decoded = decode_props(&encoded).unwrap();
        assert_eq!(decoded, m);
    }

    #[test]
    fn decode_plain_json_fallback() {
        // The TS fallback (no window.btoa) emits raw JSON. Must decode.
        let raw = r##"{"backgroundColor":"#fef3c7"}"##;
        let decoded = decode_props(raw).unwrap();
        assert_eq!(decoded.get("backgroundColor"), Some(&json!("#fef3c7")));
    }

    /// Empty input and input that is neither base64 nor JSON decode to `None`.
    #[test]
    fn decode_rejects_garbage() {
        assert!(decode_props("").is_none());
        assert!(decode_props("not base64 and not json!").is_none());
    }

    // --- clean_html -------------------------------------------------

    /// The MS clipboard fragment markers are removed.
    #[test]
    fn clean_html_strips_ms_markers() {
        let input = "<!--StartFragment--><p>hi</p><!--EndFragment-->";
        assert_eq!(clean_html(input), "<p>hi</p>");
    }

    /// Both a paired `<o:p>…</o:p>` and a trailing `<o:p />` are removed.
    #[test]
    fn clean_html_strips_o_p_tags() {
        let input = "<o:p>junk</o:p><p>real</p><o:p />";
        // The bare-open form is closed by the final `>` so we strip it too.
        assert_eq!(clean_html(input), "<p>real</p>");
    }

    /// Marker matching ignores ASCII case and surrounding whitespace.
    #[test]
    fn clean_html_case_insensitive() {
        let input = "<!--STARTFRAGMENT--><p>x</p><!-- EndFragment -->";
        assert_eq!(clean_html(input), "<p>x</p>");
    }

    /// Comments other than the fragment markers pass through untouched.
    #[test]
    fn clean_html_preserves_unknown_comments() {
        let input = "<!-- keep me --><p>x</p>";
        assert_eq!(clean_html(input), "<!-- keep me --><p>x</p>");
    }

    /// Multi-byte text on either side of a marker is preserved.
    #[test]
    fn clean_html_leaves_unicode_intact() {
        let input = "안녕<!--StartFragment-->세계";
        assert_eq!(clean_html(input), "안녕세계");
    }

    // --- looks_like_xml ---------------------------------------------

    #[test]
    fn looks_like_xml_recognises_canonical_prolog() {
        assert!(looks_like_xml("<?xml version='1.0'?>"));
        assert!(looks_like_xml("<!DOCTYPE html>"));
    }

    #[test]
    fn looks_like_xml_accepts_bare_tags() {
        assert!(looks_like_xml("<h1>"));
        assert!(looks_like_xml("<paragraph>body</paragraph>"));
    }

    /// Plain text and `<` not followed by an ASCII letter are rejected.
    #[test]
    fn looks_like_xml_rejects_markdown_lookalikes() {
        assert!(!looks_like_xml("not xml"));
        assert!(!looks_like_xml("<3 love you"));
        assert!(!looks_like_xml("< malformed"));
        assert!(!looks_like_xml(""));
    }

    // --- strip_xml_prolog -------------------------------------------

    #[test]
    fn strip_xml_prolog_removes_xml_declaration() {
        assert_eq!(
            strip_xml_prolog(r#"<?xml version="1.0"?><h1>T</h1>"#),
            "<h1>T</h1>"
        );
    }

    #[test]
    fn strip_xml_prolog_removes_doctype() {
        assert_eq!(strip_xml_prolog("<!DOCTYPE html><h1>T</h1>"), "<h1>T</h1>");
    }

    /// Declaration, DOCTYPE and comment stacked together are all removed.
    #[test]
    fn strip_xml_prolog_handles_combinations() {
        let out =
            strip_xml_prolog(r#"<?xml version="1.0"?><!DOCTYPE foo><!-- comment --><h1>T</h1>"#);
        assert_eq!(out, "<h1>T</h1>");
    }

    /// Whitespace before and after a stripped header is dropped as well.
    #[test]
    fn strip_xml_prolog_trims_leading_whitespace() {
        assert_eq!(
            strip_xml_prolog("   <?xml version=\"1.0\"?>  <h1>T</h1>"),
            "<h1>T</h1>"
        );
    }
}