1use std::collections::HashMap;
10
11use devup_editor_core::{Block, BlockId, IdGenerator, TextSpan, normalize_spans};
12use html5ever::driver::{ParseOpts, parse_document};
13use html5ever::tendril::TendrilSink;
14use markup5ever_rcdom::{Handle, NodeData, RcDom};
15use serde_json::{Map, Value};
16
17use crate::clipboard::{CopiedBlocks, clean_html};
18
19mod dom;
20use dom::{
21 MarkSet, attr_value, attrs_contains, build_synthetic_parent, clone_node_without_checkboxes,
22 collect_inline_into, collect_raw_text, collect_table_rows, decode_props_from_element,
23 detect_any_checkbox, detect_direct_checkbox, direct_children_of_tag_any, element_attrs,
24 element_tag, extract_cell_props, extract_colgroup_widths, extract_row_props, extract_spans,
25 extract_spans_from_li, find_body, find_descendant_with_any_class, has_class,
26 has_descendant_with_class, is_all_whitespace, is_notion_v3_toggle, parse_inline_style,
27 strip_nested_blocks,
28};
29
/// Crate-internal entry point for turning clipboard HTML into blocks.
///
/// Forwards `input` and `id_gen` unchanged to [`html_to_copied_blocks`] and returns its result.
pub(crate) fn parse_html(input: &str, id_gen: &mut dyn IdGenerator) -> CopiedBlocks {
    html_to_copied_blocks(input, id_gen)
}
40
41pub fn html_to_copied_blocks(input: &str, id_gen: &mut dyn IdGenerator) -> CopiedBlocks {
45 let cleaned = clean_html(input.trim());
46 if cleaned.is_empty() {
47 return CopiedBlocks {
48 roots: Vec::new(),
49 by_id: HashMap::new(),
50 };
51 }
52
53 let dom = parse_document(RcDom::default(), ParseOpts::default()).one(cleaned);
54 let body = find_body(&dom.document).unwrap_or_else(|| dom.document.clone());
55
56 let mut ctx = Context::new(id_gen);
57 ctx.process_children_with_indent(&body, 0);
58 let roots = ctx.finalize_roots();
59 CopiedBlocks {
60 roots,
61 by_id: ctx.by_id,
62 }
63}
64
/// Mutable state carried through a single HTML-to-blocks conversion.
struct Context<'a> {
    /// Source of fresh block ids; every block created by the walker takes its id from here.
    id_gen: &'a mut dyn IdGenerator,
    /// Ids of the root blocks, in the order they were pushed.
    roots_order: Vec<BlockId>,
    /// Every block created so far, keyed by id. Holds roots as well as blocks stored via
    /// `insert` (table rows and table cells).
    by_id: HashMap<BlockId, Block>,
}
75
76impl<'a> Context<'a> {
    /// Creates an empty context that draws block ids from `id_gen`.
    fn new(id_gen: &'a mut dyn IdGenerator) -> Self {
        Self {
            id_gen,
            roots_order: Vec::new(),
            by_id: HashMap::new(),
        }
    }
84
    /// Returns the next block id from the underlying generator.
    fn next_id(&mut self) -> BlockId {
        self.id_gen.next_id()
    }
88
89 fn push_root(&mut self, block: Block) {
92 self.roots_order.push(block.id.clone());
93 self.by_id.insert(block.id.clone(), block);
94 }
95
    /// Stores `block` in `by_id` without registering it as a root and returns its id.
    fn insert(&mut self, block: Block) -> BlockId {
        let id = block.id.clone();
        self.by_id.insert(id.clone(), block);
        id
    }
104
105 fn finalize_roots(&mut self) -> Vec<Block> {
106 let mut out = Vec::with_capacity(self.roots_order.len());
107 for id in &self.roots_order {
108 if let Some(b) = self.by_id.get(id) {
109 out.push(b.clone());
110 }
111 }
112 out
113 }
114
115 fn process_children_with_indent(&mut self, node: &Handle, indent: i64) {
120 let mut inline_buf: Vec<Handle> = Vec::new();
123
124 for child in node.children.borrow().iter() {
125 match &child.data {
126 NodeData::Text { contents } => {
127 if !contents.borrow().trim().is_empty() {
128 inline_buf.push(child.clone());
129 }
130 continue;
131 }
132 NodeData::Comment { .. } | NodeData::Doctype { .. } => continue,
133 NodeData::Element { .. } => {}
134 _ => continue,
135 }
136
137 let tag = element_tag(child).unwrap_or_default();
138
139 if !BLOCK_TAGS.contains(&tag.as_str()) && tag != "details" {
140 inline_buf.push(child.clone());
141 continue;
142 }
143
144 self.flush_inline(&mut inline_buf, indent);
145
146 if matches!(tag.as_str(), "h1" | "h2" | "h3" | "h4" | "h5" | "h6") {
147 let level_digit = tag.chars().nth(1).and_then(|c| c.to_digit(10)).unwrap_or(1);
148 let level = u64::from(level_digit);
149 let spans = extract_spans(child);
150 if !is_all_whitespace(&spans) {
151 let mut props = Map::new();
152 props.insert("level".into(), Value::from(level));
153 if indent > 0 {
154 props.insert("indent".into(), Value::from(indent));
155 }
156 let id = self.next_id();
157 let mut b = Block::with_props(id, "heading", props);
158 b.content = spans;
159 self.push_root(b);
160 }
161 continue;
162 }
163
164 if tag == "blockquote" {
165 let spans = extract_spans(child);
166 if !is_all_whitespace(&spans) {
167 let id = self.next_id();
168 let mut b = new_block(id, "quote", indent);
169 b.content = spans;
170 self.push_root(b);
171 }
172 continue;
173 }
174
175 if tag == "pre" {
176 let code_child = child.children.borrow().iter().find_map(|c| {
180 if element_tag(c).as_deref() == Some("code") {
181 Some(c.clone())
182 } else {
183 None
184 }
185 });
186 let language = code_child
187 .as_ref()
188 .and_then(|code| attr_value(code, "class"))
189 .and_then(|cls| {
190 cls.split_whitespace()
191 .find_map(|c| c.strip_prefix("language-").map(String::from))
192 });
193 let text = match code_child.as_ref() {
194 Some(code) => collect_raw_text(code),
195 None => collect_raw_text(child),
196 };
197 let text = text.strip_prefix('\n').unwrap_or(&text).to_string();
198 if !text.is_empty() {
199 let id = self.next_id();
200 let ty = if language.is_some() {
201 "code"
202 } else {
203 "paragraph"
204 };
205 let mut props = Map::new();
206 if indent > 0 {
207 props.insert("indent".into(), Value::from(indent));
208 }
209 if let Some(lang) = language {
210 props.insert("language".into(), Value::String(lang));
211 }
212 let mut b = if props.is_empty() {
213 Block::new(id, ty)
214 } else {
215 Block::with_props(id, ty, props)
216 };
217 b.content = vec![TextSpan::plain(text)];
218 self.push_root(b);
219 }
220 continue;
221 }
222
223 if tag == "details" {
224 self.process_toggle_details(child, indent);
225 continue;
226 }
227
228 if tag == "ul" || tag == "ol" {
229 let attrs = element_attrs(child);
230 if attrs_contains(&attrs, "data-devup-type", "todo")
232 || has_class(&attrs, "to-do-list")
233 {
234 self.process_notion_todo_list(child);
235 continue;
236 }
237 if tag == "ul" && has_class(&attrs, "toggle") {
238 self.process_notion_toggle_list(child, indent);
239 continue;
240 }
241 self.process_list(child, tag == "ol", indent);
242 continue;
243 }
244
245 if tag == "table" {
246 self.process_table(child, indent);
247 continue;
248 }
249
250 if tag == "hr" {
251 let id = self.next_id();
252 self.push_root(new_block(id, "divider", indent));
253 continue;
254 }
255
256 if tag == "li" {
257 let spans = extract_spans(child);
260 if !is_all_whitespace(&spans) {
261 let id = self.next_id();
262 let mut b = new_block(id, "paragraph", indent);
263 b.content = spans;
264 self.push_root(b);
265 }
266 continue;
267 }
268
269 if tag == "p" {
270 let p_attrs = element_attrs(child);
274 if attrs_contains(&p_attrs, "data-type", "todo") {
275 let checked = attrs_contains(&p_attrs, "data-checked", "true");
276 let spans = extract_spans(child);
277 if !is_all_whitespace(&spans) {
278 let id = self.next_id();
279 let mut props = Map::new();
280 props.insert("checked".into(), Value::Bool(checked));
281 if indent > 0 {
282 props.insert("indent".into(), Value::from(indent));
283 }
284 let mut b = Block::with_props(id, "todo", props);
285 b.content = spans;
286 self.push_root(b);
287 }
288 continue;
289 }
290 let spans = extract_spans(child);
291 if !is_all_whitespace(&spans) {
292 let id = self.next_id();
293 let mut b = new_block(id, "paragraph", indent);
294 b.content = spans;
295 self.push_root(b);
296 }
297 continue;
298 }
299
300 let has_block_child = child.children.borrow().iter().any(|c| {
304 if let Some(t) = element_tag(c) {
305 BLOCK_TAGS.contains(&t.as_str()) || t == "details"
306 } else {
307 false
308 }
309 });
310 if has_block_child {
311 self.process_children_with_indent(child, indent);
312 } else {
313 let spans = extract_spans(child);
314 if !is_all_whitespace(&spans) {
315 let id = self.next_id();
316 let mut b = new_block(id, "paragraph", indent);
317 b.content = spans;
318 self.push_root(b);
319 }
320 }
321 }
322
323 self.flush_inline(&mut inline_buf, indent);
324 }
325
326 fn flush_inline(&mut self, buf: &mut Vec<Handle>, indent: i64) {
327 if buf.is_empty() {
328 return;
329 }
330 let mut spans: Vec<TextSpan> = Vec::new();
331 for n in buf.iter() {
332 let mark_set = MarkSet::empty();
333 collect_inline_into(n, &mut spans, &mark_set);
334 }
335 buf.clear();
336 normalize_spans(&mut spans);
337 if is_all_whitespace(&spans) {
338 return;
339 }
340 let id = self.next_id();
341 let mut b = new_block(id, "paragraph", indent);
342 b.content = spans;
343 self.push_root(b);
344 }
345
346 fn process_toggle_details(&mut self, details: &Handle, indent: i64) {
349 let summary_node = details.children.borrow().iter().find_map(|c| {
351 if element_tag(c).as_deref() == Some("summary") {
352 Some(c.clone())
353 } else {
354 None
355 }
356 });
357 let title_spans = summary_node.as_ref().map(extract_spans).unwrap_or_default();
358
359 let id = self.next_id();
360 let mut props = Map::new();
361 props.insert("collapsed".into(), Value::Bool(false));
362 if indent > 0 {
363 props.insert("indent".into(), Value::from(indent));
364 }
365 let mut toggle = Block::with_props(id, "toggle", props);
366 toggle.content = title_spans;
367 self.push_root(toggle);
368
369 let child_handle = details.clone();
373 let original_children: Vec<Handle> = child_handle
374 .children
375 .borrow()
376 .iter()
377 .filter(|c| element_tag(c).as_deref() != Some("summary"))
378 .cloned()
379 .collect();
380 self.process_handles_with_indent(&original_children, indent + 1);
381 }
382
383 fn process_notion_toggle_list(&mut self, ul: &Handle, indent: i64) {
384 for li in ul.children.borrow().iter() {
385 if element_tag(li).as_deref() != Some("li") {
386 continue;
387 }
388 let details = li.children.borrow().iter().find_map(|c| {
389 if element_tag(c).as_deref() == Some("details") {
390 Some(c.clone())
391 } else {
392 None
393 }
394 });
395 if let Some(det) = details {
396 self.process_toggle_details(&det, indent);
397 } else {
398 let spans = extract_spans_from_li(li);
399 if !is_all_whitespace(&spans) {
400 let id = self.next_id();
401 let mut props = Map::new();
402 props.insert("style".into(), Value::String("unordered".into()));
403 if indent > 0 {
404 props.insert("indent".into(), Value::from(indent));
405 }
406 let mut b = Block::with_props(id, "list", props);
407 b.content = spans;
408 self.push_root(b);
409 }
410 }
411 }
412 }
413
414 fn process_list(&mut self, list_el: &Handle, ordered: bool, indent: i64) {
415 let style = if ordered { "ordered" } else { "unordered" };
416 for li in list_el.children.borrow().iter() {
417 if element_tag(li).as_deref() != Some("li") {
418 continue;
419 }
420
421 if let Some(checked) = detect_direct_checkbox(li) {
423 let clone_without_cb = clone_node_without_checkboxes(li);
424 let spans = extract_spans(&clone_without_cb);
425 if !is_all_whitespace(&spans) {
426 let id = self.next_id();
427 let mut props = Map::new();
428 props.insert("checked".into(), Value::Bool(checked));
429 if indent > 0 {
430 props.insert("indent".into(), Value::from(indent));
431 }
432 let mut b = Block::with_props(id, "todo", props);
433 b.content = spans;
434 self.push_root(b);
435 }
436 self.recurse_nested_lists(li, indent);
438 continue;
439 }
440
441 if !ordered && is_notion_v3_toggle(li) {
443 let block_children: Vec<Handle> = li
444 .children
445 .borrow()
446 .iter()
447 .filter(|c| {
448 if let Some(t) = element_tag(c) {
449 matches!(
450 t.as_str(),
451 "p" | "div"
452 | "ul"
453 | "ol"
454 | "blockquote"
455 | "pre"
456 | "table"
457 | "details"
458 ) || is_heading_tag(&t)
459 } else {
460 false
461 }
462 })
463 .cloned()
464 .collect();
465 let title_el = &block_children[0];
466 let title_spans = extract_spans(title_el);
467 let id = self.next_id();
468 let mut props = Map::new();
469 props.insert("collapsed".into(), Value::Bool(false));
470 if indent > 0 {
471 props.insert("indent".into(), Value::from(indent));
472 }
473 let mut toggle = Block::with_props(id, "toggle", props);
474 toggle.content = title_spans;
475 self.push_root(toggle);
476 self.process_handles_with_indent(&block_children[1..], indent + 1);
477 continue;
478 }
479
480 let spans = extract_spans_from_li(li);
482 if !is_all_whitespace(&spans) {
483 let id = self.next_id();
484 let mut props = Map::new();
485 props.insert("style".into(), Value::String(style.into()));
486 if indent > 0 {
487 props.insert("indent".into(), Value::from(indent));
488 }
489 let mut b = Block::with_props(id, "list", props);
490 b.content = spans;
491 self.push_root(b);
492 }
493
494 self.recurse_nested_lists(li, indent);
495 }
496 }
497
498 fn recurse_nested_lists(&mut self, li: &Handle, indent: i64) {
499 for nested in li.children.borrow().iter() {
500 let Some(t) = element_tag(nested) else {
501 continue;
502 };
503 if t == "ul" {
504 let attrs = element_attrs(nested);
505 if has_class(&attrs, "toggle") {
506 self.process_notion_toggle_list(nested, indent + 1);
507 } else {
508 self.process_list(nested, false, indent + 1);
509 }
510 } else if t == "ol" {
511 self.process_list(nested, true, indent + 1);
512 } else if t == "details" {
513 self.process_toggle_details(nested, indent + 1);
514 }
515 }
516 }
517
518 fn process_notion_todo_list(&mut self, ul: &Handle) {
519 for li in ul.children.borrow().iter() {
520 if element_tag(li).as_deref() != Some("li") {
521 continue;
522 }
523
524 let attrs = element_attrs(li);
528 let marker_checked = attrs
529 .iter()
530 .find(|a| a.name.local.as_ref() == "data-checked")
531 .map(|a| a.value.as_ref().eq_ignore_ascii_case("true"));
532 let checkbox = detect_any_checkbox(li);
533 let notion_checked = has_descendant_with_class(li, "checkbox-on");
534
535 let checked = marker_checked
536 .or(checkbox)
537 .or(Some(notion_checked))
538 .unwrap_or(false);
539
540 let notion_wrapper = find_descendant_with_any_class(
544 li,
545 &["to-do-children-checked", "to-do-children-unchecked"],
546 );
547 let spans = if let Some(w) = notion_wrapper {
548 extract_spans(&w)
549 } else {
550 let clone = clone_node_without_checkboxes(li);
551 let clone = strip_nested_blocks(&clone);
552 extract_spans(&clone)
553 };
554 if !is_all_whitespace(&spans) {
555 let id = self.next_id();
556 let mut props = Map::new();
557 props.insert("checked".into(), Value::Bool(checked));
558 let mut b = Block::with_props(id, "todo", props);
559 b.content = spans;
560 self.push_root(b);
561 }
562 }
563 }
564
    /// Processes a loose list of nodes as if they were the children of a single parent.
    ///
    /// The handles are wrapped via `build_synthetic_parent` and the result is walked with
    /// `process_children_with_indent` at the given `indent`.
    fn process_handles_with_indent(&mut self, handles: &[Handle], indent: i64) {
        let synthetic = build_synthetic_parent(handles);
        self.process_children_with_indent(&synthetic, indent);
    }
571
572 fn process_table(&mut self, table_el: &Handle, indent: i64) {
575 let row_els: Vec<Handle> = collect_table_rows(table_el);
577 if row_els.is_empty() {
578 return;
579 }
580 let max_cols = row_els
581 .iter()
582 .map(|r| direct_children_of_tag_any(r, &["td", "th"]).len())
583 .max()
584 .unwrap_or(0);
585 if max_cols == 0 {
586 return;
587 }
588
589 let mut row_ids: Vec<BlockId> = Vec::new();
591 for tr in &row_els {
592 let cells = direct_children_of_tag_any(tr, &["td", "th"]);
593 let mut cell_ids: Vec<BlockId> = Vec::with_capacity(max_cols);
594 for c in 0..max_cols {
595 let spans = cells.get(c).map(extract_spans).unwrap_or_default();
596 let mut props = cells
597 .get(c)
598 .and_then(extract_cell_props)
599 .unwrap_or_default();
600 let cell_id = self.next_id();
603 let mut cell_block = if props.is_empty() {
604 Block::new(cell_id.clone(), "table_cell")
605 } else {
606 normalize_span_numbers(&mut props);
608 Block::with_props(cell_id.clone(), "table_cell", props)
609 };
610 cell_block.content = spans;
611 cell_ids.push(self.insert(cell_block));
612 }
613 let row_props = extract_row_props(tr).unwrap_or_default();
614 let row_id = self.next_id();
615 let mut row_block = if row_props.is_empty() {
616 Block::new(row_id.clone(), "table_row")
617 } else {
618 Block::with_props(row_id.clone(), "table_row", row_props)
619 };
620 row_block.children.clone_from(&cell_ids);
621 for cid in &cell_ids {
623 if let Some(c) = self.by_id.get_mut(cid) {
624 c.parent = Some(row_id.clone());
625 }
626 }
627 row_ids.push(self.insert(row_block));
628 }
629
630 let mut table_props = decode_props_from_element(table_el).unwrap_or_default();
632 if let Some(style_attr) = attr_value(table_el, "style") {
633 let decl = parse_inline_style(&style_attr);
634 if !table_props.contains_key("backgroundColor")
635 && let Some(v) = decl.background_color
636 {
637 table_props.insert("backgroundColor".into(), Value::String(v));
638 }
639 if !table_props.contains_key("borderColor")
640 && let Some(v) = decl.border_color
641 {
642 table_props.insert("borderColor".into(), Value::String(v));
643 }
644 if !table_props.contains_key("borderWidth")
645 && let Some(v) = decl.border_width
646 {
647 table_props.insert("borderWidth".into(), Value::String(v));
648 }
649 if !table_props.contains_key("borderStyle")
650 && let Some(v) = decl.border_style
651 {
652 table_props.insert("borderStyle".into(), Value::String(v));
653 }
654 if !table_props.contains_key("verticalAlign")
655 && let Some(v) = decl.vertical_align
656 {
657 table_props.insert("verticalAlign".into(), Value::String(v));
658 }
659 if !table_props.contains_key("padding")
660 && let Some(v) = decl.padding
661 {
662 table_props.insert("padding".into(), Value::String(v));
663 }
664 }
665 if !table_props.contains_key("columns") {
667 let widths = extract_colgroup_widths(table_el, max_cols);
668 let cols: Vec<Value> = match widths {
669 Some(ws) => ws
670 .into_iter()
671 .map(|w| {
672 let mut m = Map::new();
673 m.insert("width".into(), Value::from(w));
674 Value::Object(m)
675 })
676 .collect(),
677 None => (0..max_cols)
678 .map(|_| {
679 let mut m = Map::new();
680 m.insert("width".into(), Value::from(120u64));
681 Value::Object(m)
682 })
683 .collect(),
684 };
685 table_props.insert("columns".into(), Value::Array(cols));
686 }
687 if indent > 0 {
688 table_props.insert("indent".into(), Value::from(indent));
689 }
690 let table_id = self.next_id();
691 let mut table_block = Block::with_props(table_id.clone(), "table", table_props);
692 table_block.children.clone_from(&row_ids);
693 for rid in &row_ids {
695 if let Some(r) = self.by_id.get_mut(rid) {
696 r.parent = Some(table_id.clone());
697 }
698 }
699 self.push_root(table_block);
700 }
701}
702
703fn normalize_span_numbers(props: &mut Map<String, Value>) {
704 for key in ["colspan", "rowspan"] {
705 if let Some(v) = props.get(key) {
706 let n = match v {
707 Value::Number(n) => n.as_u64(),
708 Value::String(s) => s.parse::<u64>().ok(),
709 _ => None,
710 };
711 if let Some(n) = n {
712 props.insert(key.into(), Value::from(n));
713 }
714 }
715 }
716}
717
718fn new_block(id: BlockId, ty: &str, indent: i64) -> Block {
719 if indent > 0 {
720 let mut props = Map::new();
721 props.insert("indent".into(), Value::from(indent));
722 Block::with_props(id, ty, props)
723 } else {
724 Block::new(id, ty)
725 }
726}
727
728pub(super) fn is_heading_tag(tag: &str) -> bool {
731 matches!(tag, "h1" | "h2" | "h3" | "h4" | "h5" | "h6")
732}
733
/// Element names treated as block-level when walking the pasted DOM.
///
/// `details` is not listed here; code in this file that consults this table checks for
/// it separately.
#[rustfmt::skip]
static BLOCK_TAGS: &[&str] = &[
    "address", "article", "aside", "blockquote",
    "div", "dd", "dl", "dt",
    "figcaption", "figure", "footer",
    "h1", "h2", "h3", "h4", "h5", "h6",
    "header", "hr", "li", "main", "nav",
    "ol", "p", "pre", "section",
    "table", "tbody", "td", "th", "thead", "tfoot", "tr",
    "ul",
];
770
771pub(super) static TABLE_STRUCTURE_TAGS: &[&str] = &[
772 "table", "thead", "tbody", "tfoot", "tr", "td", "th", "col", "colgroup", "caption",
773];