1use std::collections::BTreeMap;
22
23use crate::attrs::Attrs;
24use crate::error::DocError;
25use crate::mark::Mark;
26use crate::node::Node;
27use crate::schema::Schema;
28
/// Maximum element nesting depth accepted while parsing HTML.
///
/// Both the tree builder (open-element stack) and the DOM-to-document conversion
/// (recursion depth) compare against this limit and fail with
/// `DocError::HtmlParse` when it is exceeded.
pub const MAX_DEPTH: usize = 100;
32
/// Lowercase tag names the tokenizer always treats as self-closing, whether or
/// not the source wrote a trailing `/`.
const VOID_TAGS: &[&str] = &[
    "area", "base", "br", "col", "embed", "hr", "img", "input", "link", "meta", "source", "track",
    "wbr",
];
37
/// Description of the HTML element a node or mark renders to: a tag name, an
/// ordered attribute list, and whether the element wraps content.
#[derive(Debug, Clone)]
pub struct DomSpec {
    tag: String,
    attrs: Vec<(String, String)>,
    content_hole: bool,
}

impl DomSpec {
    /// Shared constructor: a spec with the given tag and no attributes yet.
    fn bare(tag: &str, content_hole: bool) -> Self {
        Self {
            tag: String::from(tag),
            attrs: Vec::new(),
            content_hole,
        }
    }

    /// Creates a spec for an element that wraps content (has a content hole).
    pub fn element(tag: &str) -> Self {
        Self::bare(tag, true)
    }

    /// Creates a spec for an element without a content hole.
    pub fn void(tag: &str) -> Self {
        Self::bare(tag, false)
    }

    /// Appends the attribute `name="value"` and returns the spec for chaining.
    ///
    /// Attributes keep insertion order; an existing attribute of the same name
    /// is not replaced.
    pub fn attr(mut self, name: &str, value: impl Into<String>) -> Self {
        let pair = (String::from(name), value.into());
        self.attrs.push(pair);
        self
    }

    /// Returns the tag name.
    pub fn tag(&self) -> &str {
        self.tag.as_str()
    }

    /// Returns the attributes in insertion order.
    pub fn attrs(&self) -> &[(String, String)] {
        self.attrs.as_slice()
    }

    /// Returns `true` when the element wraps content.
    pub fn content_hole(&self) -> bool {
        self.content_hole
    }
}
90
/// A parsed HTML element as seen by parse rules: its tag name plus its
/// attributes, looked up by name.
#[derive(Debug, Clone)]
pub struct HtmlElement {
    pub tag: String,
    attrs: BTreeMap<String, String>,
}

impl HtmlElement {
    /// Returns the value of attribute `name`, or `None` when it is absent.
    pub fn attr(&self, name: &str) -> Option<&str> {
        let value = self.attrs.get(name)?;
        Some(value.as_str())
    }
}
105
106#[derive(Debug, Clone)]
109pub struct ParseRule {
110 pub tag: String,
112 pub get_attrs: Option<fn(&HtmlElement) -> Option<Attrs>>,
116}
117
118impl ParseRule {
119 pub fn tag(tag: &str) -> Self {
121 ParseRule {
122 tag: tag.to_string(),
123 get_attrs: None,
124 }
125 }
126
127 pub fn with_attrs(tag: &str, f: fn(&HtmlElement) -> Option<Attrs>) -> Self {
129 ParseRule {
130 tag: tag.to_string(),
131 get_attrs: Some(f),
132 }
133 }
134}
135
/// Escapes `s` for use as HTML character data.
///
/// `&`, `<` and `>` are replaced by `&amp;`, `&lt;` and `&gt;`; every other
/// character is copied unchanged.
fn escape_text(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            // `&` must be escaped too, otherwise literal text such as "&lt;"
            // would turn into markup when the output is parsed again.
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            _ => out.push(c),
        }
    }
    out
}
150
/// Escapes `s` for use inside a double-quoted HTML attribute value.
///
/// `&`, `<`, `>` and `"` are replaced by `&amp;`, `&lt;`, `&gt;` and `&quot;`;
/// every other character is copied unchanged. Single quotes are left alone, so
/// the result is only safe between double quotes.
fn escape_attr(s: &str) -> String {
    let mut out = String::with_capacity(s.len());
    for c in s.chars() {
        match c {
            '&' => out.push_str("&amp;"),
            '<' => out.push_str("&lt;"),
            '>' => out.push_str("&gt;"),
            // An unescaped quote would terminate the attribute value early.
            '"' => out.push_str("&quot;"),
            _ => out.push(c),
        }
    }
    out
}
164
165fn open_tag(spec: &DomSpec) -> String {
166 let mut s = String::new();
167 s.push('<');
168 s.push_str(&spec.tag);
169 for (k, v) in &spec.attrs {
170 s.push(' ');
171 s.push_str(k);
172 s.push_str("=\"");
173 s.push_str(&escape_attr(v));
174 s.push('"');
175 }
176 s
177}
178
179impl Node {
180 pub fn to_html(&self) -> String {
187 if let Some(text) = self.text() {
188 let mut s = escape_text(text);
189 for mark in self.marks() {
190 if let Some(f) = mark.mark_type().spec().to_dom {
191 let spec = f(mark);
192 s = format!("{}>{}</{}>", open_tag(&spec), s, spec.tag);
193 }
194 }
195 return s;
196 }
197
198 let children: String = self.content().iter().map(Node::to_html).collect();
199 match self.node_type().spec().to_dom {
200 None => children,
201 Some(f) => {
202 let spec = f(self);
203 if spec.content_hole {
204 format!("{}>{}</{}>", open_tag(&spec), children, spec.tag)
205 } else {
206 format!("{}/>", open_tag(&spec))
207 }
208 }
209 }
210 }
211}
212
/// Lexical unit produced by `tokenize`.
#[derive(Debug)]
enum Token {
    /// Opening tag with its lowercased name and attributes.
    Open {
        tag: String,
        attrs: BTreeMap<String, String>,
        /// Set when the tag contained a `/` or names a void element; such a tag
        /// gets no children and expects no closing tag.
        self_closing: bool,
    },
    /// Closing tag, holding the trimmed, lowercased tag name.
    Close(String),
    /// Run of character data with entities already decoded.
    Text(String),
}
225
/// Decodes the character references this parser understands.
///
/// Recognized forms are the named references `&amp;`, `&lt;`, `&gt;`, `&quot;`
/// and `&apos;`, decimal references (`&#65;`) and hexadecimal references
/// (`&#x41;` / `&#X41;`). Anything else is copied through unchanged, including
/// unknown names, references without a terminating `;`, and numbers that are
/// not valid Unicode scalar values.
///
/// A reference body may only contain ASCII letters, digits and `#`. This keeps
/// signed forms such as `&#+65;` from being accepted by the integer parser, and
/// bounds the look-ahead per `&` so the whole pass stays linear in the input.
fn decode_entities(s: &str) -> String {
    // Maps the text between `&` and `;` to the character it denotes, if any.
    fn resolve(body: &str) -> Option<char> {
        match body {
            "amp" => Some('&'),
            "lt" => Some('<'),
            "gt" => Some('>'),
            "quot" => Some('"'),
            "apos" => Some('\''),
            _ => {
                let digits = body.strip_prefix('#')?;
                let hex = digits
                    .strip_prefix('x')
                    .or_else(|| digits.strip_prefix('X'));
                let code = match hex {
                    Some(hex) => u32::from_str_radix(hex, 16).ok()?,
                    None => digits.parse::<u32>().ok()?,
                };
                char::from_u32(code)
            }
        }
    }

    if !s.contains('&') {
        return s.to_string();
    }

    let mut out = String::with_capacity(s.len());
    let mut rest = s;
    while let Some(amp) = rest.find('&') {
        out.push_str(&rest[..amp]);
        // '&' is one byte, so `amp + 1` is always a char boundary.
        let after = &rest[amp + 1..];

        // Candidate body: the leading run of ASCII alphanumerics and '#'.
        // These are single-byte characters, so `body_len` is a valid boundary.
        let body_len = after
            .bytes()
            .take_while(|b| b.is_ascii_alphanumeric() || *b == b'#')
            .count();
        let decoded = if after.as_bytes().get(body_len) == Some(&b';') {
            resolve(&after[..body_len])
        } else {
            None
        };

        match decoded {
            Some(c) => {
                out.push(c);
                rest = &after[body_len + 1..];
            }
            None => {
                // Not a reference: keep the '&' and rescan what follows it.
                out.push('&');
                rest = after;
            }
        }
    }
    out.push_str(rest);
    out
}
265
266fn tokenize(html: &str) -> Vec<Token> {
267 let chars: Vec<char> = html.chars().collect();
268 let n = chars.len();
269 let mut i = 0;
270 let mut tokens = Vec::new();
271
272 while i < n {
273 if chars[i] == '<' {
274 if chars[i + 1..].starts_with(&['!', '-', '-']) {
276 if let Some(end) = find_subseq(&chars, i + 4, &['-', '-', '>']) {
277 i = end + 3;
278 } else {
279 i = n;
280 }
281 continue;
282 }
283 if chars.get(i + 1) == Some(&'!') || chars.get(i + 1) == Some(&'?') {
284 i = chars[i..]
285 .iter()
286 .position(|&c| c == '>')
287 .map_or(n, |p| i + p + 1);
288 continue;
289 }
290 if chars.get(i + 1) == Some(&'/') {
291 let mut j = i + 2;
292 let mut name = String::new();
293 while j < n && chars[j] != '>' {
294 name.push(chars[j]);
295 j += 1;
296 }
297 tokens.push(Token::Close(name.trim().to_ascii_lowercase()));
298 i = j + 1;
299 continue;
300 }
301 let mut j = i + 1;
303 let mut tag = String::new();
304 while j < n && !chars[j].is_whitespace() && chars[j] != '>' && chars[j] != '/' {
305 tag.push(chars[j]);
306 j += 1;
307 }
308 let mut attrs = BTreeMap::new();
309 let mut self_closing = false;
310 loop {
311 while j < n && chars[j].is_whitespace() {
312 j += 1;
313 }
314 if j >= n || chars[j] == '>' {
315 break;
316 }
317 if chars[j] == '/' {
318 self_closing = true;
319 j += 1;
320 continue;
321 }
322 let mut name = String::new();
323 while j < n
324 && !chars[j].is_whitespace()
325 && chars[j] != '='
326 && chars[j] != '>'
327 && chars[j] != '/'
328 {
329 name.push(chars[j]);
330 j += 1;
331 }
332 while j < n && chars[j].is_whitespace() {
333 j += 1;
334 }
335 let mut value = String::new();
336 if j < n && chars[j] == '=' {
337 j += 1;
338 while j < n && chars[j].is_whitespace() {
339 j += 1;
340 }
341 if j < n && (chars[j] == '"' || chars[j] == '\'') {
342 let quote = chars[j];
343 j += 1;
344 while j < n && chars[j] != quote {
345 value.push(chars[j]);
346 j += 1;
347 }
348 j += 1;
349 } else {
350 while j < n
351 && !chars[j].is_whitespace()
352 && chars[j] != '>'
353 && chars[j] != '/'
354 {
355 value.push(chars[j]);
356 j += 1;
357 }
358 }
359 }
360 if !name.is_empty() {
361 attrs.insert(name.to_ascii_lowercase(), decode_entities(&value));
362 }
363 }
364 let tag = tag.to_ascii_lowercase();
365 if VOID_TAGS.contains(&tag.as_str()) {
366 self_closing = true;
367 }
368 tokens.push(Token::Open {
369 tag,
370 attrs,
371 self_closing,
372 });
373 i = j + 1;
374 } else {
375 let mut text = String::new();
376 while i < n && chars[i] != '<' {
377 text.push(chars[i]);
378 i += 1;
379 }
380 tokens.push(Token::Text(decode_entities(&text)));
381 }
382 }
383 tokens
384}
385
/// Returns the index of the first occurrence of `needle` in `chars` at or after
/// `from`, or `None` when there is none or `from` is past the end.
///
/// An empty `needle` matches immediately at `from` (the underlying
/// `windows(0)` call would panic instead).
fn find_subseq(chars: &[char], from: usize, needle: &[char]) -> Option<usize> {
    let tail = chars.get(from..)?;
    if needle.is_empty() {
        return Some(from);
    }
    tail.windows(needle.len())
        .position(|w| w == needle)
        .map(|p| from + p)
}
395
/// Minimal DOM produced by `build_tree` from the token stream.
#[derive(Debug)]
enum DomTree {
    /// An element with its tag name, attributes and child nodes in order.
    Element {
        tag: String,
        attrs: BTreeMap<String, String>,
        children: Vec<DomTree>,
    },
    /// A run of character data.
    Text(String),
}
407
/// An element that has been opened but not yet closed while `build_tree` runs;
/// it collects children until it is popped and turned into a
/// `DomTree::Element`.
struct Frame {
    tag: String,
    attrs: BTreeMap<String, String>,
    children: Vec<DomTree>,
}
413
414fn build_tree(tokens: Vec<Token>) -> Result<Vec<DomTree>, DocError> {
415 let mut root: Vec<DomTree> = Vec::new();
416 let mut stack: Vec<Frame> = Vec::new();
417
418 macro_rules! push_child {
419 ($node:expr) => {
420 match stack.last_mut() {
421 Some(f) => f.children.push($node),
422 None => root.push($node),
423 }
424 };
425 }
426
427 for tok in tokens {
428 match tok {
429 Token::Text(t) => push_child!(DomTree::Text(t)),
430 Token::Open {
431 tag,
432 attrs,
433 self_closing,
434 } => {
435 if self_closing {
436 push_child!(DomTree::Element {
437 tag,
438 attrs,
439 children: Vec::new()
440 });
441 } else {
442 if stack.len() >= MAX_DEPTH {
443 return Err(DocError::HtmlParse(format!(
444 "element nesting exceeds {MAX_DEPTH}"
445 )));
446 }
447 stack.push(Frame {
448 tag,
449 attrs,
450 children: Vec::new(),
451 });
452 }
453 }
454 Token::Close(tag) => {
455 if let Some(depth) = stack.iter().rposition(|f| f.tag == tag) {
456 while stack.len() > depth {
458 let f = stack.pop().unwrap();
459 let el = DomTree::Element {
460 tag: f.tag,
461 attrs: f.attrs,
462 children: f.children,
463 };
464 push_child!(el);
465 }
466 }
467 }
469 }
470 }
471 while let Some(f) = stack.pop() {
473 let el = DomTree::Element {
474 tag: f.tag,
475 attrs: f.attrs,
476 children: f.children,
477 };
478 push_child!(el);
479 }
480 Ok(root)
481}
482
483fn is_ws_text(n: &Node) -> bool {
486 n.text().is_some_and(|t| t.chars().all(char::is_whitespace))
487}
488
489impl Schema {
490 fn fill_mark_attrs(&self, mark: &str, mut given: Attrs) -> Attrs {
491 if let Some(mt) = self.mark_type(mark) {
492 for (k, s) in &mt.spec().attrs {
493 if !given.contains_key(k) {
494 if let Some(d) = &s.default {
495 given.insert(k.clone(), d.clone());
496 }
497 }
498 }
499 }
500 given
501 }
502
503 fn match_mark(&self, el: &HtmlElement) -> Option<(String, Attrs)> {
504 for mt in self.mark_types() {
505 for rule in &mt.spec().parse_dom {
506 if rule.tag.eq_ignore_ascii_case(&el.tag) {
507 let attrs = match rule.get_attrs {
508 None => Some(Attrs::new()),
509 Some(f) => f(el),
510 };
511 if let Some(a) = attrs {
512 return Some((mt.name().to_string(), self.fill_mark_attrs(mt.name(), a)));
513 }
514 }
515 }
516 }
517 None
518 }
519
520 fn match_node(&self, el: &HtmlElement) -> Option<(String, Attrs)> {
521 for nt in self.node_types() {
522 for rule in &nt.spec().parse_dom {
523 if rule.tag.eq_ignore_ascii_case(&el.tag) {
524 let attrs = match rule.get_attrs {
525 None => Some(Attrs::new()),
526 Some(f) => f(el),
527 };
528 if let Some(a) = attrs {
529 return Some((nt.name().to_string(), a));
530 }
531 }
532 }
533 }
534 None
535 }
536
537 fn convert(
538 &self,
539 trees: &[DomTree],
540 marks: &[Mark],
541 depth: usize,
542 ) -> Result<Vec<Node>, DocError> {
543 if depth > MAX_DEPTH {
544 return Err(DocError::HtmlParse(format!(
545 "element nesting exceeds {MAX_DEPTH}"
546 )));
547 }
548 let mut out = Vec::new();
549 for tree in trees {
550 match tree {
551 DomTree::Text(t) => {
552 if !t.is_empty() {
553 out.push(self.text(t, marks.to_vec())?);
554 }
555 }
556 DomTree::Element {
557 tag,
558 attrs,
559 children,
560 } => {
561 let el = HtmlElement {
562 tag: tag.clone(),
563 attrs: attrs.clone(),
564 };
565 if let Some((mark_name, mark_attrs)) = self.match_mark(&el) {
566 let m = self.mark_type(&mark_name).unwrap().create(mark_attrs);
567 let new_marks = m.add_to_set(marks);
568 out.extend(self.convert(children, &new_marks, depth + 1)?);
569 } else if let Some((node_name, node_attrs)) = self.match_node(&el) {
570 let kids = self.convert(children, &[], depth + 1)?;
572 out.push(self.build_node(&node_name, node_attrs, kids)?);
573 } else {
574 out.extend(self.convert(children, marks, depth + 1)?);
576 }
577 }
578 }
579 }
580 Ok(out)
581 }
582
583 fn build_node(&self, name: &str, attrs: Attrs, kids: Vec<Node>) -> Result<Node, DocError> {
587 match self.node(name, attrs.clone(), kids.clone(), vec![]) {
588 Ok(n) => Ok(n),
589 Err(DocError::InvalidContent { .. }) => {
590 let filtered: Vec<Node> = kids.into_iter().filter(|n| !is_ws_text(n)).collect();
591 self.node(name, attrs, filtered, vec![])
592 }
593 Err(e) => Err(e),
594 }
595 }
596
597 pub fn parse_html(&self, html: &str) -> Result<Node, DocError> {
607 let trees = build_tree(tokenize(html))?;
608 let children = self.convert(&trees, &[], 0)?;
609
610 let top = self.top_node_type().name().to_string();
611 if children.len() == 1 && children[0].node_type().name() == top {
612 return Ok(children[0].clone());
613 }
614 self.build_node(&top, Attrs::new(), children)
615 }
616}