Skip to main content

taino_edit_core/
html.rs

1//! HTML serialization (doc → string) and a strict, dependency-free HTML
2//! parser (string → doc).
3//!
4//! Design and safety notes:
5//!
6//! - **No third-party HTML engine.** A small hand-written tokenizer keeps
7//!   `core` lean and `#![deny(unsafe_code)]`, and minimizes supply-chain and
8//!   parsing-attack surface. It deliberately understands only the subset an
9//!   editor needs; it is not a full HTML5 conformance parser.
10//! - **Output is always escaped.** Text and attribute values are HTML-escaped;
11//!   the serializer never emits raw markup, so a document cannot inject
12//!   `<script>` or break out of an attribute.
13//! - **Input is schema-gated.** Only tags for which the schema declares a
14//!   [`ParseRule`] become nodes/marks; unknown elements are unwrapped (their
15//!   children are kept) and the assembled tree is validated against the
16//!   schema, so structurally invalid HTML is rejected rather than trusted.
17//! - **Hostile input is bounded.** Nesting depth is capped
18//!   ([`MAX_DEPTH`]); pathological input yields [`DocError::HtmlParse`]
19//!   instead of unbounded recursion or memory growth.
20
21use std::collections::BTreeMap;
22
23use crate::attrs::Attrs;
24use crate::error::DocError;
25use crate::mark::Mark;
26use crate::node::Node;
27use crate::schema::Schema;
28
/// Maximum element nesting depth accepted by [`Schema::parse_html`]. Input
/// that nests deeper is rejected with [`DocError::HtmlParse`].
pub const MAX_DEPTH: usize = 100;

/// Tag names the tokenizer always treats as self-closing (childless), even
/// when the markup omits the trailing `/`.
const VOID_TAGS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track",
    "wbr",
];
37
/// Describes the HTML element a node or mark is rendered as.
///
/// Start from [`DomSpec::element`] (children are rendered inside it) or
/// [`DomSpec::void`] (no children, e.g. `<img>`), then add attributes by
/// chaining [`DomSpec::attr`].
#[derive(Debug, Clone)]
pub struct DomSpec {
    /// Element tag name.
    tag: String,
    /// `(name, value)` pairs in the order they were added.
    attrs: Vec<(String, String)>,
    /// `true` when the element wraps the node's content, `false` for a leaf.
    content_hole: bool,
}

impl DomSpec {
    /// Shared constructor: an attribute-less spec for `tag`.
    fn bare(tag: &str, content_hole: bool) -> Self {
        Self {
            tag: tag.to_owned(),
            attrs: Vec::new(),
            content_hole,
        }
    }

    /// A container element; the node's children are serialized inside it.
    pub fn element(tag: &str) -> Self {
        Self::bare(tag, true)
    }

    /// An element without children (e.g. `<img>`, `<hr>`).
    pub fn void(tag: &str) -> Self {
        Self::bare(tag, false)
    }

    /// Append an attribute and return the spec for further chaining.
    /// Attributes are kept in insertion order.
    pub fn attr(mut self, name: &str, value: impl Into<String>) -> Self {
        let pair = (name.to_owned(), value.into());
        self.attrs.push(pair);
        self
    }

    /// Tag name of the element.
    pub fn tag(&self) -> &str {
        self.tag.as_str()
    }

    /// Attributes, in the order they were added.
    pub fn attrs(&self) -> &[(String, String)] {
        self.attrs.as_slice()
    }

    /// `true` if the element holds the node's content, `false` for a void leaf.
    pub fn content_hole(&self) -> bool {
        self.content_hole
    }
}
90
/// The view of a parsed HTML element handed to [`ParseRule`] attribute
/// extractors.
#[derive(Debug, Clone)]
pub struct HtmlElement {
    /// Lowercased tag name.
    pub tag: String,
    /// Attribute values keyed by attribute name.
    attrs: BTreeMap<String, String>,
}

impl HtmlElement {
    /// Look up attribute `name` (exact key match); `None` when it is absent.
    pub fn attr(&self, name: &str) -> Option<&str> {
        let value = self.attrs.get(name)?;
        Some(value.as_str())
    }
}
105
106/// A rule mapping an HTML tag back to the node or mark type whose spec
107/// declares it.
108#[derive(Debug, Clone)]
109pub struct ParseRule {
110    /// Tag name to match (compared case-insensitively).
111    pub tag: String,
112    /// Derive attributes from the matched element. `None` means no
113    /// attributes; a closure returning `None` declines the match so another
114    /// rule may apply.
115    pub get_attrs: Option<fn(&HtmlElement) -> Option<Attrs>>,
116}
117
118impl ParseRule {
119    /// A rule matching `tag` with no attributes.
120    pub fn tag(tag: &str) -> Self {
121        ParseRule {
122            tag: tag.to_string(),
123            get_attrs: None,
124        }
125    }
126
127    /// A rule matching `tag` that derives attributes via `f`.
128    pub fn with_attrs(tag: &str, f: fn(&HtmlElement) -> Option<Attrs>) -> Self {
129        ParseRule {
130            tag: tag.to_string(),
131            get_attrs: Some(f),
132        }
133    }
134}
135
136// ---- serialization --------------------------------------------------------
137
/// Escape `s` for use as HTML text content: `&`, `<` and `>` become entities,
/// every other character is copied through unchanged.
fn escape_text(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            plain => {
                escaped.push(plain);
                continue;
            }
        };
        escaped.push_str(entity);
    }
    escaped
}
150
/// Escape `s` for use inside a double-quoted HTML attribute value: `&`, `<`,
/// `>` and `"` become entities, every other character is copied through
/// unchanged.
fn escape_attr(s: &str) -> String {
    let mut escaped = String::with_capacity(s.len());
    for ch in s.chars() {
        let entity = match ch {
            '&' => "&amp;",
            '<' => "&lt;",
            '>' => "&gt;",
            '"' => "&quot;",
            plain => {
                escaped.push(plain);
                continue;
            }
        };
        escaped.push_str(entity);
    }
    escaped
}
164
165fn open_tag(spec: &DomSpec) -> String {
166    let mut s = String::new();
167    s.push('<');
168    s.push_str(&spec.tag);
169    for (k, v) in &spec.attrs {
170        s.push(' ');
171        s.push_str(k);
172        s.push_str("=\"");
173        s.push_str(&escape_attr(v));
174        s.push('"');
175    }
176    s
177}
178
179impl Node {
180    /// Serialize this node (and its subtree) to an HTML string.
181    ///
182    /// Text and attribute values are HTML-escaped. A node type whose spec has
183    /// no `to_dom` is transparent — only its content is emitted (the usual
184    /// case for the document node), which is why a serialized document is a
185    /// run of its block children with no wrapper.
186    pub fn to_html(&self) -> String {
187        if let Some(text) = self.text() {
188            let mut s = escape_text(text);
189            for mark in self.marks() {
190                if let Some(f) = mark.mark_type().spec().to_dom {
191                    let spec = f(mark);
192                    s = format!("{}>{}</{}>", open_tag(&spec), s, spec.tag);
193                }
194            }
195            return s;
196        }
197
198        let children: String = self.content().iter().map(Node::to_html).collect();
199        match self.node_type().spec().to_dom {
200            None => children,
201            Some(f) => {
202                let spec = f(self);
203                if spec.content_hole {
204                    format!("{}>{}</{}>", open_tag(&spec), children, spec.tag)
205                } else {
206                    format!("{}/>", open_tag(&spec))
207                }
208            }
209        }
210    }
211}
212
213// ---- tokenizer ------------------------------------------------------------
214
/// One lexical unit produced by the tokenizer.
#[derive(Debug)]
enum Token {
    /// An opening tag.
    Open {
        /// Lowercased tag name.
        tag: String,
        /// Attributes keyed by lowercased name, values entity-decoded.
        attrs: BTreeMap<String, String>,
        /// `true` when the tag was written with a `/` or names a void element.
        self_closing: bool,
    },
    /// A closing tag, carrying its trimmed, lowercased name.
    Close(String),
    /// A run of character data with entities already decoded.
    Text(String),
}
225
/// Decode the character references this parser understands.
///
/// Recognized forms are `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`, decimal
/// `&#NNN;` and hexadecimal `&#xHHH;` / `&#XHHH;`. Anything else — an unknown
/// name, a missing `;`, a number that is not a valid `char`, or digits
/// carrying a sign — is left in the output exactly as written.
///
/// The search for the terminating `;` is limited to a short window after each
/// `&`, so the work done per `&` is constant and the whole function is linear
/// in the input length even for hostile input such as a long run of `&`.
fn decode_entities(s: &str) -> String {
    /// Longest entity body (the text between `&` and `;`) that is considered.
    /// The longest meaningful body is `#x10FFFF`; the slack tolerates
    /// leading zeros.
    const MAX_ENTITY_LEN: usize = 32;

    /// Parse `digits` in `radix` into a `char`. Requires at least one digit
    /// and nothing but digits, because `u32::from_str_radix` on its own would
    /// also accept a leading `+`.
    fn parse_code_point(digits: &str, radix: u32) -> Option<char> {
        if digits.is_empty() || !digits.chars().all(|c| c.is_digit(radix)) {
            return None;
        }
        u32::from_str_radix(digits, radix)
            .ok()
            .and_then(char::from_u32)
    }

    /// Try to decode an entity whose body starts at the beginning of
    /// `after_amp` (the text right after a `&`). On success returns the
    /// decoded character and the number of bytes consumed, `;` included.
    fn decode_one(after_amp: &str) -> Option<(char, usize)> {
        // `;` is ASCII, so its byte offset is always a char boundary.
        let semi = after_amp
            .bytes()
            .take(MAX_ENTITY_LEN + 1)
            .position(|b| b == b';')?;
        let body = &after_amp[..semi];
        let decoded = match body {
            "amp" => '&',
            "lt" => '<',
            "gt" => '>',
            "quot" => '"',
            "apos" => '\'',
            _ => {
                let numeric = body.strip_prefix('#')?;
                match numeric
                    .strip_prefix('x')
                    .or_else(|| numeric.strip_prefix('X'))
                {
                    Some(hex) => parse_code_point(hex, 16)?,
                    None => parse_code_point(numeric, 10)?,
                }
            }
        };
        Some((decoded, semi + 1))
    }

    if !s.contains('&') {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        let after_amp = &rest[amp + 1..];
        match decode_one(after_amp) {
            Some((decoded, consumed)) => {
                out.push(decoded);
                rest = &after_amp[consumed..];
            }
            None => {
                // Not an entity: keep the `&` literally and carry on right
                // after it, so a later `&` in the same run is still examined.
                out.push('&');
                rest = after_amp;
            }
        }
    }
    out.push_str(rest);
    out
}
265
266fn tokenize(html: &str) -> Vec<Token> {
267    let chars: Vec<char> = html.chars().collect();
268    let n = chars.len();
269    let mut i = 0;
270    let mut tokens = Vec::new();
271
272    while i < n {
273        if chars[i] == '<' {
274            // Comment / CDATA / doctype / processing instruction → skipped.
275            if chars[i + 1..].starts_with(&['!', '-', '-']) {
276                if let Some(end) = find_subseq(&chars, i + 4, &['-', '-', '>']) {
277                    i = end + 3;
278                } else {
279                    i = n;
280                }
281                continue;
282            }
283            if chars.get(i + 1) == Some(&'!') || chars.get(i + 1) == Some(&'?') {
284                i = chars[i..]
285                    .iter()
286                    .position(|&c| c == '>')
287                    .map_or(n, |p| i + p + 1);
288                continue;
289            }
290            if chars.get(i + 1) == Some(&'/') {
291                let mut j = i + 2;
292                let mut name = String::new();
293                while j < n && chars[j] != '>' {
294                    name.push(chars[j]);
295                    j += 1;
296                }
297                tokens.push(Token::Close(name.trim().to_ascii_lowercase()));
298                i = j + 1;
299                continue;
300            }
301            // Opening tag.
302            let mut j = i + 1;
303            let mut tag = String::new();
304            while j < n && !chars[j].is_whitespace() && chars[j] != '>' && chars[j] != '/' {
305                tag.push(chars[j]);
306                j += 1;
307            }
308            let mut attrs = BTreeMap::new();
309            let mut self_closing = false;
310            loop {
311                while j < n && chars[j].is_whitespace() {
312                    j += 1;
313                }
314                if j >= n || chars[j] == '>' {
315                    break;
316                }
317                if chars[j] == '/' {
318                    self_closing = true;
319                    j += 1;
320                    continue;
321                }
322                let mut name = String::new();
323                while j < n
324                    && !chars[j].is_whitespace()
325                    && chars[j] != '='
326                    && chars[j] != '>'
327                    && chars[j] != '/'
328                {
329                    name.push(chars[j]);
330                    j += 1;
331                }
332                while j < n && chars[j].is_whitespace() {
333                    j += 1;
334                }
335                let mut value = String::new();
336                if j < n && chars[j] == '=' {
337                    j += 1;
338                    while j < n && chars[j].is_whitespace() {
339                        j += 1;
340                    }
341                    if j < n && (chars[j] == '"' || chars[j] == '\'') {
342                        let quote = chars[j];
343                        j += 1;
344                        while j < n && chars[j] != quote {
345                            value.push(chars[j]);
346                            j += 1;
347                        }
348                        j += 1;
349                    } else {
350                        while j < n
351                            && !chars[j].is_whitespace()
352                            && chars[j] != '>'
353                            && chars[j] != '/'
354                        {
355                            value.push(chars[j]);
356                            j += 1;
357                        }
358                    }
359                }
360                if !name.is_empty() {
361                    attrs.insert(name.to_ascii_lowercase(), decode_entities(&value));
362                }
363            }
364            let tag = tag.to_ascii_lowercase();
365            if VOID_TAGS.contains(&tag.as_str()) {
366                self_closing = true;
367            }
368            tokens.push(Token::Open {
369                tag,
370                attrs,
371                self_closing,
372            });
373            i = j + 1;
374        } else {
375            let mut text = String::new();
376            while i < n && chars[i] != '<' {
377                text.push(chars[i]);
378                i += 1;
379            }
380            tokens.push(Token::Text(decode_entities(&text)));
381        }
382    }
383    tokens
384}
385
/// Index of the first occurrence of `needle` in `chars` at or after `from`,
/// or `None` when there is none or `from` lies past the end of `chars`.
fn find_subseq(chars: &[char], from: usize, needle: &[char]) -> Option<usize> {
    let tail = chars.get(from..)?;
    let offset = tail
        .windows(needle.len())
        .position(|window| window == needle)?;
    Some(from + offset)
}
395
396// ---- tree ----------------------------------------------------------------
397
/// A node of the intermediate tree assembled from the token stream, before
/// it is matched against the schema.
#[derive(Debug)]
enum DomTree {
    /// An element with its attributes and child nodes.
    Element {
        /// Tag name as produced by the tokenizer.
        tag: String,
        /// Attribute name → value.
        attrs: BTreeMap<String, String>,
        /// Child nodes in document order.
        children: Vec<DomTree>,
    },
    /// A run of character data.
    Text(String),
}
407
408struct Frame {
409    tag: String,
410    attrs: BTreeMap<String, String>,
411    children: Vec<DomTree>,
412}
413
414fn build_tree(tokens: Vec<Token>) -> Result<Vec<DomTree>, DocError> {
415    let mut root: Vec<DomTree> = Vec::new();
416    let mut stack: Vec<Frame> = Vec::new();
417
418    macro_rules! push_child {
419        ($node:expr) => {
420            match stack.last_mut() {
421                Some(f) => f.children.push($node),
422                None => root.push($node),
423            }
424        };
425    }
426
427    for tok in tokens {
428        match tok {
429            Token::Text(t) => push_child!(DomTree::Text(t)),
430            Token::Open {
431                tag,
432                attrs,
433                self_closing,
434            } => {
435                if self_closing {
436                    push_child!(DomTree::Element {
437                        tag,
438                        attrs,
439                        children: Vec::new()
440                    });
441                } else {
442                    if stack.len() >= MAX_DEPTH {
443                        return Err(DocError::HtmlParse(format!(
444                            "element nesting exceeds {MAX_DEPTH}"
445                        )));
446                    }
447                    stack.push(Frame {
448                        tag,
449                        attrs,
450                        children: Vec::new(),
451                    });
452                }
453            }
454            Token::Close(tag) => {
455                if let Some(depth) = stack.iter().rposition(|f| f.tag == tag) {
456                    // Auto-close any intervening unclosed elements.
457                    while stack.len() > depth {
458                        let f = stack.pop().unwrap();
459                        let el = DomTree::Element {
460                            tag: f.tag,
461                            attrs: f.attrs,
462                            children: f.children,
463                        };
464                        push_child!(el);
465                    }
466                }
467                // A stray close with no matching open is ignored.
468            }
469        }
470    }
471    // Unwind anything left open.
472    while let Some(f) = stack.pop() {
473        let el = DomTree::Element {
474            tag: f.tag,
475            attrs: f.attrs,
476            children: f.children,
477        };
478        push_child!(el);
479    }
480    Ok(root)
481}
482
483// ---- conversion to nodes -------------------------------------------------
484
485fn is_ws_text(n: &Node) -> bool {
486    n.text().is_some_and(|t| t.chars().all(char::is_whitespace))
487}
488
489impl Schema {
490    fn fill_mark_attrs(&self, mark: &str, mut given: Attrs) -> Attrs {
491        if let Some(mt) = self.mark_type(mark) {
492            for (k, s) in &mt.spec().attrs {
493                if !given.contains_key(k) {
494                    if let Some(d) = &s.default {
495                        given.insert(k.clone(), d.clone());
496                    }
497                }
498            }
499        }
500        given
501    }
502
503    fn match_mark(&self, el: &HtmlElement) -> Option<(String, Attrs)> {
504        for mt in self.mark_types() {
505            for rule in &mt.spec().parse_dom {
506                if rule.tag.eq_ignore_ascii_case(&el.tag) {
507                    let attrs = match rule.get_attrs {
508                        None => Some(Attrs::new()),
509                        Some(f) => f(el),
510                    };
511                    if let Some(a) = attrs {
512                        return Some((mt.name().to_string(), self.fill_mark_attrs(mt.name(), a)));
513                    }
514                }
515            }
516        }
517        None
518    }
519
520    fn match_node(&self, el: &HtmlElement) -> Option<(String, Attrs)> {
521        for nt in self.node_types() {
522            for rule in &nt.spec().parse_dom {
523                if rule.tag.eq_ignore_ascii_case(&el.tag) {
524                    let attrs = match rule.get_attrs {
525                        None => Some(Attrs::new()),
526                        Some(f) => f(el),
527                    };
528                    if let Some(a) = attrs {
529                        return Some((nt.name().to_string(), a));
530                    }
531                }
532            }
533        }
534        None
535    }
536
537    fn convert(
538        &self,
539        trees: &[DomTree],
540        marks: &[Mark],
541        depth: usize,
542    ) -> Result<Vec<Node>, DocError> {
543        if depth > MAX_DEPTH {
544            return Err(DocError::HtmlParse(format!(
545                "element nesting exceeds {MAX_DEPTH}"
546            )));
547        }
548        let mut out = Vec::new();
549        for tree in trees {
550            match tree {
551                DomTree::Text(t) => {
552                    if !t.is_empty() {
553                        out.push(self.text(t, marks.to_vec())?);
554                    }
555                }
556                DomTree::Element {
557                    tag,
558                    attrs,
559                    children,
560                } => {
561                    let el = HtmlElement {
562                        tag: tag.clone(),
563                        attrs: attrs.clone(),
564                    };
565                    if let Some((mark_name, mark_attrs)) = self.match_mark(&el) {
566                        let m = self.mark_type(&mark_name).unwrap().create(mark_attrs);
567                        let new_marks = m.add_to_set(marks);
568                        out.extend(self.convert(children, &new_marks, depth + 1)?);
569                    } else if let Some((node_name, node_attrs)) = self.match_node(&el) {
570                        // Marks are inline-scoped: a fresh element resets them.
571                        let kids = self.convert(children, &[], depth + 1)?;
572                        out.push(self.build_node(&node_name, node_attrs, kids)?);
573                    } else {
574                        // Unknown element: unwrap, keep its content.
575                        out.extend(self.convert(children, marks, depth + 1)?);
576                    }
577                }
578            }
579        }
580        Ok(out)
581    }
582
583    /// Build a node, retrying once without whitespace-only text children if
584    /// the first attempt violates the content expression (handles insignificant
585    /// inter-tag whitespace without loosening strictness for real content).
586    fn build_node(&self, name: &str, attrs: Attrs, kids: Vec<Node>) -> Result<Node, DocError> {
587        match self.node(name, attrs.clone(), kids.clone(), vec![]) {
588            Ok(n) => Ok(n),
589            Err(DocError::InvalidContent { .. }) => {
590                let filtered: Vec<Node> = kids.into_iter().filter(|n| !is_ws_text(n)).collect();
591                self.node(name, attrs, filtered, vec![])
592            }
593            Err(e) => Err(e),
594        }
595    }
596
597    /// Parse an HTML string into a document, strictly validated against this
598    /// schema.
599    ///
600    /// Recognized tags (those a node/mark spec declares via [`ParseRule`])
601    /// become nodes/marks; unknown elements are unwrapped. The result is
602    /// wrapped in (or returned as) the schema's top node and validated, so
603    /// content that cannot satisfy the schema yields
604    /// [`DocError::InvalidContent`]. Overly deep input yields
605    /// [`DocError::HtmlParse`].
606    pub fn parse_html(&self, html: &str) -> Result<Node, DocError> {
607        let trees = build_tree(tokenize(html))?;
608        let children = self.convert(&trees, &[], 0)?;
609
610        let top = self.top_node_type().name().to_string();
611        if children.len() == 1 && children[0].node_type().name() == top {
612            return Ok(children[0].clone());
613        }
614        self.build_node(&top, Attrs::new(), children)
615    }
616}