use std::collections::HashMap;
use devup_editor_core::{Block, BlockId, IdGenerator, TextSpan, normalize_spans};
use html5ever::driver::{ParseOpts, parse_document};
use html5ever::tendril::TendrilSink;
use markup5ever_rcdom::{Handle, NodeData, RcDom};
use serde_json::{Map, Value};
use crate::clipboard::{CopiedBlocks, clean_html};
mod dom;
use dom::{
MarkSet, attr_value, attrs_contains, build_synthetic_parent, clone_node_without_checkboxes,
collect_inline_into, collect_raw_text, collect_table_rows, decode_props_from_element,
detect_any_checkbox, detect_direct_checkbox, direct_children_of_tag_any, element_attrs,
element_tag, extract_cell_props, extract_colgroup_widths, extract_row_props, extract_spans,
extract_spans_from_li, find_body, find_descendant_with_any_class, has_class,
has_descendant_with_class, is_all_whitespace, is_notion_v3_toggle, parse_inline_style,
strip_nested_blocks,
};
/// Crate-internal entry point for converting HTML into copied blocks.
///
/// Forwards `input` and `id_gen` unchanged to [`html_to_copied_blocks`] and returns its
/// result.
pub(crate) fn parse_html(input: &str, id_gen: &mut dyn IdGenerator) -> CopiedBlocks {
html_to_copied_blocks(input, id_gen)
}
/// Parses an HTML fragment or document into [`CopiedBlocks`].
///
/// The input is trimmed and run through `clean_html`. If nothing is left, the result has
/// no roots and an empty id map, and the HTML parser is never invoked. Otherwise the
/// cleaned markup is parsed with html5ever and the children of the node returned by
/// `find_body` (or of the document node when that returns `None`) are converted at
/// indent 0.
///
/// Ids for every created block are drawn from `id_gen`. The returned `roots` are clones
/// of the root blocks in creation order; `by_id` holds every block that was created.
pub fn html_to_copied_blocks(input: &str, id_gen: &mut dyn IdGenerator) -> CopiedBlocks {
    let cleaned = clean_html(input.trim());
    let mut ctx = Context::new(id_gen);

    // An untouched context already represents the "nothing to paste" result, so the
    // empty case simply skips parsing instead of building a separate value.
    if !cleaned.is_empty() {
        let dom = parse_document(RcDom::default(), ParseOpts::default()).one(cleaned);
        let start = match find_body(&dom.document) {
            Some(body) => body,
            None => dom.document.clone(),
        };
        ctx.process_children_with_indent(&start, 0);
    }

    let roots = ctx.finalize_roots();
    CopiedBlocks {
        roots,
        by_id: ctx.by_id,
    }
}
/// Mutable state carried through a single HTML-to-blocks conversion.
struct Context<'a> {
/// Source of ids for every block created during the conversion.
id_gen: &'a mut dyn IdGenerator,
/// Ids of the blocks registered through `push_root`, in the order they were pushed.
roots_order: Vec<BlockId>,
/// Every block created so far, keyed by id (roots as well as table rows and cells).
by_id: HashMap<BlockId, Block>,
}
impl<'a> Context<'a> {
/// Creates a context with no blocks that draws its block ids from `id_gen`.
fn new(id_gen: &'a mut dyn IdGenerator) -> Self {
Self {
id_gen,
roots_order: Vec::new(),
by_id: HashMap::new(),
}
}
/// Returns the next id produced by the wrapped `IdGenerator`.
fn next_id(&mut self) -> BlockId {
self.id_gen.next_id()
}
/// Registers `block` as a root: it is stored in `by_id` and its id is appended to
/// `roots_order`, which fixes its position in the final root list.
fn push_root(&mut self, block: Block) {
    let id = self.insert(block);
    self.roots_order.push(id);
}
/// Stores `block` in `by_id` without registering it as a root, and returns its id.
///
/// Blocks added this way (e.g. table rows and cells) are only reachable through the
/// `children` list of another block.
fn insert(&mut self, block: Block) -> BlockId {
let id = block.id.clone();
self.by_id.insert(id.clone(), block);
id
}
/// Returns clones of all root blocks in the order they were pushed.
///
/// Ids in `roots_order` that have no entry in `by_id` are silently skipped. The blocks
/// stay in `by_id`; the returned vector holds independent copies.
fn finalize_roots(&mut self) -> Vec<Block> {
    let blocks = &self.by_id;
    self.roots_order
        .iter()
        .filter_map(|id| blocks.get(id).cloned())
        .collect()
}
/// Converts the children of `node` into root blocks, each tagged with `indent`
/// (the `indent` prop is only written when `indent > 0`).
///
/// Non-blank text nodes and elements whose tag is neither listed in `BLOCK_TAGS` nor
/// `details` are buffered; the buffer is flushed into a single paragraph whenever a
/// block-level element is reached and once more after the last child. Comments,
/// doctypes and any other node kinds are ignored.
///
/// NOTE(review): text nodes consisting only of whitespace are never buffered, even when
/// they sit between two inline siblings, so `<b>a</b> <i>b</i>` directly under a block
/// container contributes no separator between the two runs — confirm this is intended.
///
/// Block-level elements are dispatched by tag:
/// - `h1`..`h6`: `heading` with a numeric `level` prop.
/// - `blockquote`: `quote`.
/// - `pre`: text is taken from the first direct `<code>` child when there is one,
///   otherwise from the `<pre>` itself, with one leading `\n` removed; empty text
///   produces nothing. The block is `code` (with a `language` prop) when the `<code>`
///   child has a `language-*` class, and `paragraph` otherwise.
/// - `details`: handled by `process_toggle_details`.
/// - `ul` / `ol`: to-do lists (`data-devup-type="todo"` or class `to-do-list`) go to
///   `process_notion_todo_list`, which is not given the indent; `ul.toggle` goes to
///   `process_notion_toggle_list`; everything else goes to `process_list`.
/// - `table`: handled by `process_table`.
/// - `hr`: `divider`.
/// - `li` found outside a list: `paragraph`.
/// - `p`: `todo` (with `checked` from `data-checked="true"`) when `data-type="todo"`,
///   otherwise `paragraph`.
/// - any other block tag: recursed into at the same indent when it has a block-level
///   direct child, otherwise converted to a `paragraph`.
///
/// Text-carrying blocks whose spans are all whitespace are not emitted.
fn process_children_with_indent(&mut self, node: &Handle, indent: i64) {
    let mut pending_inline: Vec<Handle> = Vec::new();

    for child in node.children.borrow().iter() {
        match &child.data {
            NodeData::Element { .. } => {}
            NodeData::Text { contents } => {
                let blank = contents.borrow().trim().is_empty();
                if !blank {
                    pending_inline.push(child.clone());
                }
                continue;
            }
            // Comments, doctypes and every other node kind carry no content.
            _ => continue,
        }

        let tag = element_tag(child).unwrap_or_default();
        let is_block = tag == "details" || BLOCK_TAGS.contains(&tag.as_str());
        if !is_block {
            pending_inline.push(child.clone());
            continue;
        }

        // A block element ends the current inline run.
        self.flush_inline(&mut pending_inline, indent);

        match tag.as_str() {
            "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
                let spans = extract_spans(child);
                if !is_all_whitespace(&spans) {
                    let digit = tag
                        .chars()
                        .nth(1)
                        .and_then(|c| c.to_digit(10))
                        .unwrap_or(1);
                    let mut props = Map::new();
                    props.insert("level".into(), Value::from(u64::from(digit)));
                    if indent > 0 {
                        props.insert("indent".into(), Value::from(indent));
                    }
                    let id = self.next_id();
                    let mut heading = Block::with_props(id, "heading", props);
                    heading.content = spans;
                    self.push_root(heading);
                }
            }
            "blockquote" => self.push_text_block(child, "quote", indent),
            "pre" => {
                let code_el = child
                    .children
                    .borrow()
                    .iter()
                    .find(|c| element_tag(c).as_deref() == Some("code"))
                    .cloned();
                let language = code_el
                    .as_ref()
                    .and_then(|code| attr_value(code, "class"))
                    .and_then(|classes| {
                        classes
                            .split_whitespace()
                            .find_map(|c| c.strip_prefix("language-").map(String::from))
                    });
                let raw = collect_raw_text(code_el.as_ref().unwrap_or(child));
                let text = raw.strip_prefix('\n').unwrap_or(&raw).to_string();
                if !text.is_empty() {
                    let id = self.next_id();
                    let mut props = Map::new();
                    if indent > 0 {
                        props.insert("indent".into(), Value::from(indent));
                    }
                    // Only a declared language turns the block into `code`.
                    let ty = match language {
                        Some(lang) => {
                            props.insert("language".into(), Value::String(lang));
                            "code"
                        }
                        None => "paragraph",
                    };
                    let mut block = if props.is_empty() {
                        Block::new(id, ty)
                    } else {
                        Block::with_props(id, ty, props)
                    };
                    block.content = vec![TextSpan::plain(text)];
                    self.push_root(block);
                }
            }
            "details" => self.process_toggle_details(child, indent),
            "ul" | "ol" => {
                let attrs = element_attrs(child);
                let is_todo_list = attrs_contains(&attrs, "data-devup-type", "todo")
                    || has_class(&attrs, "to-do-list");
                if is_todo_list {
                    self.process_notion_todo_list(child);
                } else if tag == "ul" && has_class(&attrs, "toggle") {
                    self.process_notion_toggle_list(child, indent);
                } else {
                    self.process_list(child, tag == "ol", indent);
                }
            }
            "table" => self.process_table(child, indent),
            "hr" => {
                let id = self.next_id();
                self.push_root(new_block(id, "divider", indent));
            }
            "li" => self.push_text_block(child, "paragraph", indent),
            "p" => {
                let attrs = element_attrs(child);
                if attrs_contains(&attrs, "data-type", "todo") {
                    let checked = attrs_contains(&attrs, "data-checked", "true");
                    let spans = extract_spans(child);
                    if !is_all_whitespace(&spans) {
                        let id = self.next_id();
                        let mut props = Map::new();
                        props.insert("checked".into(), Value::Bool(checked));
                        if indent > 0 {
                            props.insert("indent".into(), Value::from(indent));
                        }
                        let mut todo = Block::with_props(id, "todo", props);
                        todo.content = spans;
                        self.push_root(todo);
                    }
                } else {
                    self.push_text_block(child, "paragraph", indent);
                }
            }
            _ => {
                // Generic containers are transparent when they hold block children
                // and behave like a paragraph when they only hold inline content.
                let nests_block = child.children.borrow().iter().any(|c| {
                    element_tag(c)
                        .is_some_and(|t| t == "details" || BLOCK_TAGS.contains(&t.as_str()))
                });
                if nests_block {
                    self.process_children_with_indent(child, indent);
                } else {
                    self.push_text_block(child, "paragraph", indent);
                }
            }
        }
    }

    self.flush_inline(&mut pending_inline, indent);
}

/// Pushes a root block of type `ty` whose content is the spans extracted from `node`.
/// Nothing is pushed (and no id is consumed) when those spans are all whitespace.
fn push_text_block(&mut self, node: &Handle, ty: &str, indent: i64) {
    let spans = extract_spans(node);
    if is_all_whitespace(&spans) {
        return;
    }
    let id = self.next_id();
    let mut block = new_block(id, ty, indent);
    block.content = spans;
    self.push_root(block);
}
/// Turns the buffered inline nodes into one `paragraph` root block and empties `buf`.
///
/// The spans of all buffered nodes are collected (each node starting from an empty
/// mark set), then normalized with `normalize_spans`. No block is emitted when the
/// buffer is empty or the resulting spans are all whitespace; `buf` is left empty in
/// every case.
fn flush_inline(&mut self, buf: &mut Vec<Handle>, indent: i64) {
    if buf.is_empty() {
        return;
    }

    let mut spans: Vec<TextSpan> = Vec::new();
    // Draining both visits every node and leaves the buffer ready for the next run.
    for node in buf.drain(..) {
        collect_inline_into(&node, &mut spans, &MarkSet::empty());
    }
    normalize_spans(&mut spans);

    if is_all_whitespace(&spans) {
        return;
    }

    let id = self.next_id();
    let mut paragraph = new_block(id, "paragraph", indent);
    paragraph.content = spans;
    self.push_root(paragraph);
}
/// Converts a `<details>` element into a `toggle` block followed by its body blocks.
///
/// The toggle's content is the spans of the first direct `<summary>` child (empty when
/// there is none); it is always emitted, with `collapsed: false` and, when
/// `indent > 0`, an `indent` prop. All direct children other than `<summary>` elements
/// are then processed one indent level deeper.
fn process_toggle_details(&mut self, details: &Handle, indent: i64) {
    fn is_summary(n: &Handle) -> bool {
        element_tag(n).as_deref() == Some("summary")
    }

    let title_spans = details
        .children
        .borrow()
        .iter()
        .find(|n| is_summary(n))
        .map(extract_spans)
        .unwrap_or_default();

    let mut props = Map::new();
    props.insert("collapsed".into(), Value::Bool(false));
    if indent > 0 {
        props.insert("indent".into(), Value::from(indent));
    }
    let id = self.next_id();
    let mut toggle = Block::with_props(id, "toggle", props);
    toggle.content = title_spans;
    self.push_root(toggle);

    // Everything except the summary forms the toggle's body.
    let body: Vec<Handle> = details
        .children
        .borrow()
        .iter()
        .filter(|n| !is_summary(n))
        .cloned()
        .collect();
    self.process_handles_with_indent(&body, indent + 1);
}
/// Converts a `<ul class="toggle">` into blocks, one per direct `<li>` child.
///
/// An `<li>` with a direct `<details>` child is delegated to `process_toggle_details`
/// (using the first such child). Any other `<li>` becomes an unordered `list` block
/// built from `extract_spans_from_li`, unless its spans are all whitespace. Children
/// that are not `<li>` elements are ignored.
fn process_notion_toggle_list(&mut self, ul: &Handle, indent: i64) {
    let items = ul.children.borrow();
    for li in items
        .iter()
        .filter(|n| element_tag(n).as_deref() == Some("li"))
    {
        let details = li
            .children
            .borrow()
            .iter()
            .find(|n| element_tag(n).as_deref() == Some("details"))
            .cloned();

        match details {
            Some(det) => self.process_toggle_details(&det, indent),
            None => {
                let spans = extract_spans_from_li(li);
                if is_all_whitespace(&spans) {
                    continue;
                }
                let id = self.next_id();
                let mut props = Map::new();
                props.insert("style".into(), Value::String("unordered".into()));
                if indent > 0 {
                    props.insert("indent".into(), Value::from(indent));
                }
                let mut item = Block::with_props(id, "list", props);
                item.content = spans;
                self.push_root(item);
            }
        }
    }
}
/// Converts the direct `<li>` children of a `<ul>` / `<ol>` into root blocks.
///
/// `ordered` selects the `style` prop (`"ordered"` / `"unordered"`) of plain list
/// items; `indent` is written as an `indent` prop when it is greater than zero.
/// Children that are not `<li>` elements are ignored. Each `<li>` is handled by the
/// first matching rule:
///
/// 1. A direct checkbox (`detect_direct_checkbox`) makes it a `todo` block with the
///    detected `checked` state; the checkbox is removed before extracting spans.
///    Nested lists are then processed one level deeper.
/// 2. In an unordered list, an item recognised by `is_notion_v3_toggle` becomes a
///    `toggle`: its first block-level child provides the title and the remaining
///    block-level children are processed one level deeper.
/// 3. Otherwise it becomes a `list` block, followed by its nested lists one level
///    deeper.
///
/// Todo and list items whose spans are all whitespace are not emitted, but their
/// nested lists are still processed.
fn process_list(&mut self, list_el: &Handle, ordered: bool, indent: i64) {
    let style = if ordered { "ordered" } else { "unordered" };
    for li in list_el.children.borrow().iter() {
        if element_tag(li).as_deref() != Some("li") {
            continue;
        }

        if let Some(checked) = detect_direct_checkbox(li) {
            let without_checkbox = clone_node_without_checkboxes(li);
            let spans = extract_spans(&without_checkbox);
            if !is_all_whitespace(&spans) {
                let id = self.next_id();
                let mut props = Map::new();
                props.insert("checked".into(), Value::Bool(checked));
                if indent > 0 {
                    props.insert("indent".into(), Value::from(indent));
                }
                let mut todo = Block::with_props(id, "todo", props);
                todo.content = spans;
                self.push_root(todo);
            }
            self.recurse_nested_lists(li, indent);
            continue;
        }

        if !ordered && is_notion_v3_toggle(li) {
            let block_children: Vec<Handle> = li
                .children
                .borrow()
                .iter()
                .filter(|c| {
                    element_tag(c).is_some_and(|t| {
                        matches!(
                            t.as_str(),
                            "p" | "div" | "ul" | "ol" | "blockquote" | "pre" | "table" | "details"
                        ) || is_heading_tag(&t)
                    })
                })
                .cloned()
                .collect();

            // The input is arbitrary pasted HTML, so do not rely on the detector
            // guaranteeing a block-level child: indexing an empty list would panic.
            // Without a title element the item falls through to the plain list path.
            if let Some((title_el, body)) = block_children.split_first() {
                let title_spans = extract_spans(title_el);
                let id = self.next_id();
                let mut props = Map::new();
                props.insert("collapsed".into(), Value::Bool(false));
                if indent > 0 {
                    props.insert("indent".into(), Value::from(indent));
                }
                let mut toggle = Block::with_props(id, "toggle", props);
                toggle.content = title_spans;
                self.push_root(toggle);
                self.process_handles_with_indent(body, indent + 1);
                continue;
            }
        }

        let spans = extract_spans_from_li(li);
        if !is_all_whitespace(&spans) {
            let id = self.next_id();
            let mut props = Map::new();
            props.insert("style".into(), Value::String(style.into()));
            if indent > 0 {
                props.insert("indent".into(), Value::from(indent));
            }
            let mut item = Block::with_props(id, "list", props);
            item.content = spans;
            self.push_root(item);
        }
        self.recurse_nested_lists(li, indent);
    }
}
/// Processes the lists and toggles nested directly inside `li`, one indent level
/// deeper than `indent`.
///
/// `ul.toggle` goes to `process_notion_toggle_list`, other `ul` and every `ol` go to
/// `process_list`, and `details` goes to `process_toggle_details`. All other children
/// are ignored here.
fn recurse_nested_lists(&mut self, li: &Handle, indent: i64) {
    for nested in li.children.borrow().iter() {
        match element_tag(nested).as_deref() {
            Some("ul") => {
                let attrs = element_attrs(nested);
                if has_class(&attrs, "toggle") {
                    self.process_notion_toggle_list(nested, indent + 1);
                } else {
                    self.process_list(nested, false, indent + 1);
                }
            }
            Some("ol") => self.process_list(nested, true, indent + 1),
            Some("details") => self.process_toggle_details(nested, indent + 1),
            _ => {}
        }
    }
}
/// Converts the direct `<li>` children of a to-do list into `todo` root blocks.
///
/// The checked state is taken from the first source that provides one:
/// 1. the item's `data-checked` attribute (case-insensitive `"true"`),
/// 2. a checkbox found by `detect_any_checkbox`,
/// 3. the presence of a descendant with class `checkbox-on`.
///
/// Content comes from a descendant with class `to-do-children-checked` or
/// `to-do-children-unchecked` when there is one; otherwise from the item itself with
/// checkboxes and nested blocks stripped. Items whose spans are all whitespace are
/// skipped.
///
/// NOTE(review): this method takes no indent, so the blocks it creates never carry an
/// `indent` prop, and nested lists inside an item are not recursed into — confirm that
/// both are intended.
fn process_notion_todo_list(&mut self, ul: &Handle) {
    for li in ul.children.borrow().iter() {
        if element_tag(li).as_deref() != Some("li") {
            continue;
        }

        let attrs = element_attrs(li);
        let from_attr = attrs
            .iter()
            .find(|a| a.name.local.as_ref() == "data-checked")
            .map(|a| a.value.as_ref().eq_ignore_ascii_case("true"));
        let from_checkbox = detect_any_checkbox(li);
        let from_notion_class = has_descendant_with_class(li, "checkbox-on");
        let checked = from_attr.or(from_checkbox).unwrap_or(from_notion_class);

        let wrapper = find_descendant_with_any_class(
            li,
            &["to-do-children-checked", "to-do-children-unchecked"],
        );
        let spans = match wrapper {
            Some(w) => extract_spans(&w),
            None => {
                let without_checkbox = clone_node_without_checkboxes(li);
                let text_only = strip_nested_blocks(&without_checkbox);
                extract_spans(&text_only)
            }
        };
        if is_all_whitespace(&spans) {
            continue;
        }

        let id = self.next_id();
        let mut props = Map::new();
        props.insert("checked".into(), Value::Bool(checked));
        let mut todo = Block::with_props(id, "todo", props);
        todo.content = spans;
        self.push_root(todo);
    }
}
/// Processes an explicit list of nodes as if they were the children of one parent.
///
/// The handles are wrapped with `build_synthetic_parent`, and the resulting node is
/// passed to `process_children_with_indent` with the given `indent`.
fn process_handles_with_indent(&mut self, handles: &[Handle], indent: i64) {
let synthetic = build_synthetic_parent(handles);
self.process_children_with_indent(&synthetic, indent);
}
fn process_table(&mut self, table_el: &Handle, indent: i64) {
let row_els: Vec<Handle> = collect_table_rows(table_el);
if row_els.is_empty() {
return;
}
let max_cols = row_els
.iter()
.map(|r| direct_children_of_tag_any(r, &["td", "th"]).len())
.max()
.unwrap_or(0);
if max_cols == 0 {
return;
}
let mut row_ids: Vec<BlockId> = Vec::new();
for tr in &row_els {
let cells = direct_children_of_tag_any(tr, &["td", "th"]);
let mut cell_ids: Vec<BlockId> = Vec::with_capacity(max_cols);
for c in 0..max_cols {
let spans = cells.get(c).map(extract_spans).unwrap_or_default();
let mut props = cells
.get(c)
.and_then(extract_cell_props)
.unwrap_or_default();
let cell_id = self.next_id();
let mut cell_block = if props.is_empty() {
Block::new(cell_id.clone(), "table_cell")
} else {
normalize_span_numbers(&mut props);
Block::with_props(cell_id.clone(), "table_cell", props)
};
cell_block.content = spans;
cell_ids.push(self.insert(cell_block));
}
let row_props = extract_row_props(tr).unwrap_or_default();
let row_id = self.next_id();
let mut row_block = if row_props.is_empty() {
Block::new(row_id.clone(), "table_row")
} else {
Block::with_props(row_id.clone(), "table_row", row_props)
};
row_block.children.clone_from(&cell_ids);
for cid in &cell_ids {
if let Some(c) = self.by_id.get_mut(cid) {
c.parent = Some(row_id.clone());
}
}
row_ids.push(self.insert(row_block));
}
let mut table_props = decode_props_from_element(table_el).unwrap_or_default();
if let Some(style_attr) = attr_value(table_el, "style") {
let decl = parse_inline_style(&style_attr);
if !table_props.contains_key("backgroundColor")
&& let Some(v) = decl.background_color
{
table_props.insert("backgroundColor".into(), Value::String(v));
}
if !table_props.contains_key("borderColor")
&& let Some(v) = decl.border_color
{
table_props.insert("borderColor".into(), Value::String(v));
}
if !table_props.contains_key("borderWidth")
&& let Some(v) = decl.border_width
{
table_props.insert("borderWidth".into(), Value::String(v));
}
if !table_props.contains_key("borderStyle")
&& let Some(v) = decl.border_style
{
table_props.insert("borderStyle".into(), Value::String(v));
}
if !table_props.contains_key("verticalAlign")
&& let Some(v) = decl.vertical_align
{
table_props.insert("verticalAlign".into(), Value::String(v));
}
if !table_props.contains_key("padding")
&& let Some(v) = decl.padding
{
table_props.insert("padding".into(), Value::String(v));
}
}
if !table_props.contains_key("columns") {
let widths = extract_colgroup_widths(table_el, max_cols);
let cols: Vec<Value> = match widths {
Some(ws) => ws
.into_iter()
.map(|w| {
let mut m = Map::new();
m.insert("width".into(), Value::from(w));
Value::Object(m)
})
.collect(),
None => (0..max_cols)
.map(|_| {
let mut m = Map::new();
m.insert("width".into(), Value::from(120u64));
Value::Object(m)
})
.collect(),
};
table_props.insert("columns".into(), Value::Array(cols));
}
if indent > 0 {
table_props.insert("indent".into(), Value::from(indent));
}
let table_id = self.next_id();
let mut table_block = Block::with_props(table_id.clone(), "table", table_props);
table_block.children.clone_from(&row_ids);
for rid in &row_ids {
if let Some(r) = self.by_id.get_mut(rid) {
r.parent = Some(table_id.clone());
}
}
self.push_root(table_block);
}
}
/// Rewrites the `colspan` and `rowspan` entries of `props` as unsigned JSON numbers.
///
/// A value is replaced when it is a number representable as `u64` or a string that
/// parses as `u64`. Missing keys and any other values (negative or fractional numbers,
/// unparsable strings, other JSON types) are left untouched.
fn normalize_span_numbers(props: &mut Map<String, Value>) {
    for key in ["colspan", "rowspan"] {
        let parsed = props.get(key).and_then(|value| match value {
            Value::Number(num) => num.as_u64(),
            Value::String(text) => text.parse::<u64>().ok(),
            _ => None,
        });
        if let Some(n) = parsed {
            props.insert(key.into(), Value::from(n));
        }
    }
}
/// Creates a block of type `ty` with the given id.
///
/// The block gets an `indent` prop only when `indent` is greater than zero; otherwise
/// it is created without props.
fn new_block(id: BlockId, ty: &str, indent: i64) -> Block {
    if indent <= 0 {
        return Block::new(id, ty);
    }
    let mut props = Map::new();
    props.insert("indent".into(), Value::from(indent));
    Block::with_props(id, ty, props)
}
/// Returns `true` when `tag` is exactly one of `h1` through `h6`.
///
/// The comparison is case-sensitive, so `"H1"` is not a heading tag.
pub(super) fn is_heading_tag(tag: &str) -> bool {
    match tag.as_bytes() {
        [b'h', level] => (b'1'..=b'6').contains(level),
        _ => false,
    }
}
/// Tag names that `process_children_with_indent` treats as block-level.
///
/// Elements with any other tag are buffered as inline content, except `details`,
/// which is checked separately at each use site and is therefore not listed here.
static BLOCK_TAGS: &[&str] = &[
"address",
"article",
"aside",
"blockquote",
"div",
"dd",
"dl",
"dt",
"figcaption",
"figure",
"footer",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"header",
"hr",
"li",
"main",
"nav",
"ol",
"p",
"pre",
"section",
"table",
"tbody",
"td",
"th",
"thead",
"tfoot",
"tr",
"ul",
];
/// Tag names of the elements that make up a table's structure.
///
/// Not referenced in the code visible here.
// NOTE(review): presumably consumed by a sibling or child module — confirm the user.
pub(super) static TABLE_STRUCTURE_TAGS: &[&str] = &[
"table", "thead", "tbody", "tfoot", "tr", "td", "th", "col", "colgroup", "caption",
];