html_to_markdown_rs/
converter.rs

1//! HTML to Markdown conversion using the astral-tl parser.
2//!
3//! This module provides the core conversion logic for transforming HTML documents into Markdown.
4//! It uses the astral-tl parser for high-performance HTML parsing and supports 60+ HTML tags.
5//!
6
7#![allow(clippy::collapsible_match)]
8//! # Architecture
9//!
10//! The conversion process follows these steps:
11//! 1. Parse HTML into a DOM tree using the astral-tl parser
12//! 2. Walk the DOM tree recursively
13//! 3. Convert each node type to its Markdown equivalent
14//! 4. Apply text escaping and whitespace normalization
15//!
16//! # Whitespace Handling
17//!
18//! This library preserves whitespace exactly as it appears in the HTML source.
19//! Text nodes retain their original spacing, including multiple spaces and newlines.
20//!
21//! - **Raw text preservation**: All whitespace in text nodes is preserved
22//! - **No HTML5 normalization**: Whitespace is not collapsed according to HTML5 rules
23//! - **Full control**: Applications can handle whitespace as needed
24//!
25//! # Supported Features
26//!
27//! - **Block elements**: headings, paragraphs, lists, tables, blockquotes
28//! - **Inline formatting**: bold, italic, code, links, images, strikethrough
29//! - **Semantic HTML5**: article, section, nav, aside, header, footer
30//! - **Forms**: inputs, select, button, textarea, fieldset
31//! - **Media**: audio, video, picture, iframe, svg
32//! - **Advanced**: task lists, ruby annotations, definition lists
33//!
34//! # Examples
35//!
36//! ```rust
37//! use html_to_markdown_rs::{convert, ConversionOptions};
38//!
39//! let html = "<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";
40//! let markdown = convert(html, None).unwrap();
41//! assert_eq!(markdown, "# Title\n\nParagraph with **bold** text.\n");
42//! ```
43
44#[cfg(feature = "inline-images")]
45use std::cell::RefCell;
46use std::collections::{BTreeMap, HashMap};
47#[cfg(feature = "inline-images")]
48use std::rc::Rc;
49
50use std::borrow::Cow;
51use std::str;
52
53use crate::error::Result;
54#[cfg(feature = "inline-images")]
55use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
56use crate::options::{ConversionOptions, HeadingStyle, ListIndentType};
57use crate::text;
58
59#[cfg(feature = "inline-images")]
60type InlineCollectorHandle = Rc<RefCell<InlineImageCollector>>;
61#[cfg(not(feature = "inline-images"))]
62type InlineCollectorHandle = ();
63
/// Split inline content into its surrounding whitespace and its trimmed core.
///
/// Works like `text::chomp`, with one addition: a Markdown hard line break at
/// the very end of `text` (two spaces + newline, or backslash + newline) is
/// reported as the suffix so that callers can emit it outside the formatting
/// markers.
///
/// Returns `(prefix, suffix, trimmed)` where:
/// - `prefix` is `" "` when `text` starts with a space or tab, otherwise `""`;
/// - `suffix` is the trailing hard break if present, else `" "` when `text`
///   ends with a space or tab, else `""`;
/// - `trimmed` is `text` without the hard break (if any) and with surrounding
///   whitespace removed.
fn chomp_inline(text: &str) -> (&str, &str, &str) {
    const BLANKS: [char; 2] = [' ', '\t'];
    const HARD_BREAKS: [&str; 2] = ["  \n", "\\\n"];

    if text.is_empty() {
        return ("", "", "");
    }

    let prefix = if text.starts_with(BLANKS) { " " } else { "" };

    // A trailing hard break wins over plain trailing whitespace.
    for marker in HARD_BREAKS.iter().copied() {
        if let Some(before_break) = text.strip_suffix(marker) {
            return (prefix, marker, before_break.trim());
        }
    }

    let suffix = if text.ends_with(BLANKS) { " " } else { "" };
    (prefix, suffix, text.trim())
}
100
/// Strip every trailing space and tab from `output`, in place.
///
/// Newlines are not touched. Called before block separators or newlines are
/// appended so the generated Markdown carries no stray end-of-line whitespace.
fn trim_trailing_whitespace(output: &mut String) {
    let kept_len = output.trim_end_matches([' ', '\t']).len();
    output.truncate(kept_len);
}
110
/// Strip trailing spaces/tabs from every line of `output`, in place.
///
/// A line that ends with two spaces (a Markdown hard line break) keeps exactly
/// two trailing spaces after trimming. An empty `output` is left untouched.
///
/// Note that the rebuilt string always ends with one `\n` more than the input
/// had: every `\n`-separated segment, including the last one, is terminated
/// with a newline.
fn trim_line_end_whitespace(output: &mut String) {
    if output.is_empty() {
        return;
    }

    let mut rebuilt = String::with_capacity(output.len());
    for segment in output.split('\n') {
        rebuilt.push_str(segment.trim_end_matches([' ', '\t']));
        if segment.ends_with("  ") {
            // Re-attach the hard-break marker that trimming just removed.
            rebuilt.push_str("  ");
        }
        rebuilt.push('\n');
    }

    *output = rebuilt;
}
137
/// Shorten `value` to at most `max_len` bytes without splitting a character.
///
/// When `max_len` falls inside a multi-byte UTF-8 sequence, the cut moves back
/// to the nearest earlier character boundary. Strings that already fit are
/// left unchanged.
fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
    if value.len() <= max_len {
        return;
    }

    // Index 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&idx| value.is_char_boundary(idx))
        .unwrap_or(0);
    value.truncate(cut);
}
150
/// Remove shared leading indentation from the lines of a code block.
///
/// HTML authors often indent `<pre>` content for readability. This strips the
/// smallest leading-whitespace width (measured in bytes) found on any
/// non-blank line from every non-blank line. Blank or whitespace-only lines
/// are passed through unchanged and do not influence the width.
///
/// Lines are split with [`str::lines`] and re-joined with `\n`, so a trailing
/// newline in `content` is not preserved and `\r\n` endings become `\n`.
///
/// The cut position is clamped to a character boundary on each line. Without
/// that, lines indented with whitespace of different byte widths (for example
/// a space followed by U+00A0 on one line and two U+00A0 on another) would
/// make the slice land inside a multi-byte character and panic.
fn dedent_code_block(content: &str) -> String {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return String::new();
    }

    // Byte width of the leading whitespace, minimised over non-blank lines.
    let min_indent = lines
        .iter()
        .filter(|line| !line.trim().is_empty())
        .map(|line| line.len() - line.trim_start().len())
        .min()
        .unwrap_or(0);

    lines
        .iter()
        .map(|line| {
            if line.trim().is_empty() {
                return *line;
            }

            // Never slice through a multi-byte whitespace character; index 0
            // is always a boundary, so this loop terminates.
            let mut cut = min_indent.min(line.len());
            while !line.is_char_boundary(cut) {
                cut -= 1;
            }
            &line[cut..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
186
/// Number of indent units a continuation line needs at a given list depth.
///
/// The result counts indent *units*, not characters: callers repeat one tab,
/// or `list_indent_width` spaces, this many times.
///
/// - depth 0 needs no continuation indent (`0`);
/// - depth `d >= 1` needs `2 * d - 1` units: `d - 1` for the nesting level
///   plus `d` for the list item content.
///
/// | depth | units |
/// |-------|-------|
/// | 0     | 0     |
/// | 1     | 1     |
/// | 2     | 3     |
/// | 3     | 5     |
fn calculate_list_continuation_indent(depth: usize) -> usize {
    match depth {
        0 => 0,
        nested => 2 * nested - 1,
    }
}
210
211/// Check if a list (ul or ol) is "loose".
212///
213/// A loose list is one where any list item contains block-level elements
214/// like paragraphs (<p>). In loose lists, all items should have blank line
215/// separation (ending with \n\n) regardless of their own content.
216///
217/// # Examples
218///
219/// ```html
220/// <!-- Loose list (has <p> in an item) -->
221/// <ul>
222///   <li><p>Item 1</p></li>
223///   <li>Item 2</li>  <!-- Also gets \n\n ending -->
224/// </ul>
225///
226/// <!-- Tight list (no block elements) -->
227/// <ul>
228///   <li>Item 1</li>
229///   <li>Item 2</li>
230/// </ul>
231/// ```
232fn is_loose_list(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
233    if let Some(node) = node_handle.get(parser) {
234        if let tl::Node::Tag(tag) = node {
235            let children = tag.children();
236            {
237                for child_handle in children.top().iter() {
238                    if let Some(child_node) = child_handle.get(parser) {
239                        if let tl::Node::Tag(child_tag) = child_node {
240                            if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
241                                let li_children = child_tag.children();
242                                {
243                                    for li_child_handle in li_children.top().iter() {
244                                        if let Some(li_child_node) = li_child_handle.get(parser) {
245                                            if let tl::Node::Tag(li_child_tag) = li_child_node {
246                                                if tag_name_eq(li_child_tag.name().as_utf8_str(), "p") {
247                                                    return true;
248                                                }
249                                            }
250                                        }
251                                    }
252                                }
253                            }
254                        }
255                    }
256                }
257            }
258        }
259    }
260    false
261}
262
263/// Add list continuation indentation to output.
264///
265/// Used when block elements (like <p> or <div>) appear inside list items.
266/// Adds appropriate line separation and indentation to continue the list item.
267///
268/// # Arguments
269///
270/// * `output` - The output string to append to
271/// * `list_depth` - Current list nesting depth
272/// * `blank_line` - If true, adds blank line separation (\n\n); if false, single newline (\n)
273///
274/// # Examples
275///
276/// ```text
277/// Paragraph continuation (blank_line = true):
278///   * First para
279///
280///       Second para  (blank line + indentation)
281///
282/// Div continuation (blank_line = false):
283///   * First div
284///       Second div   (single newline + indentation)
285/// ```
286fn add_list_continuation_indent(output: &mut String, list_depth: usize, blank_line: bool, options: &ConversionOptions) {
287    trim_trailing_whitespace(output);
288
289    if blank_line {
290        if !output.ends_with("\n\n") {
291            if output.ends_with('\n') {
292                output.push('\n');
293            } else {
294                output.push_str("\n\n");
295            }
296        }
297    } else if !output.ends_with('\n') {
298        output.push('\n');
299    }
300
301    let indent_level = calculate_list_continuation_indent(list_depth);
302    let indent_char = match options.list_indent_type {
303        ListIndentType::Tabs => "\t",
304        ListIndentType::Spaces => &" ".repeat(options.list_indent_width),
305    };
306    output.push_str(&indent_char.repeat(indent_level));
307}
308
309/// Calculate the indentation string for list continuations based on depth and options.
310fn continuation_indent_string(list_depth: usize, options: &ConversionOptions) -> Option<String> {
311    let indent_level = calculate_list_continuation_indent(list_depth);
312    if indent_level == 0 {
313        return None;
314    }
315
316    let indent = match options.list_indent_type {
317        ListIndentType::Tabs => "\t".repeat(indent_level),
318        ListIndentType::Spaces => " ".repeat(options.list_indent_width * indent_level),
319    };
320    Some(indent)
321}
322
323/// Add appropriate leading separator before a list.
324///
325/// Lists need different separators depending on context:
326/// - In table cells: <br> tag if there's already content
327/// - Outside lists: blank line (\n\n) if needed
328/// - Inside list items: blank line before nested list
329fn add_list_leading_separator(output: &mut String, ctx: &Context) {
330    if ctx.in_table_cell {
331        let is_table_continuation =
332            !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
333        if is_table_continuation {
334            output.push_str("<br>");
335        }
336        return;
337    }
338
339    if !output.is_empty() && !ctx.in_list {
340        let needs_newline =
341            !output.ends_with("\n\n") && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
342        if needs_newline {
343            output.push_str("\n\n");
344        }
345        return;
346    }
347
348    if ctx.in_list_item && !output.is_empty() {
349        let needs_newline =
350            !output.ends_with('\n') && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
351        if needs_newline {
352            trim_trailing_whitespace(output);
353            output.push('\n');
354        }
355    }
356}
357
358/// Add appropriate trailing separator after a nested list.
359///
360/// Nested lists inside list items need trailing newlines to separate
361/// from following content. In loose lists, use blank line (\n\n). In tight lists, single newline (\n).
362fn add_nested_list_trailing_separator(output: &mut String, ctx: &Context) {
363    if !ctx.in_list_item {
364        return;
365    }
366
367    if ctx.loose_list {
368        if !output.ends_with("\n\n") {
369            if !output.ends_with('\n') {
370                output.push('\n');
371            }
372            output.push('\n');
373        }
374    } else if !output.ends_with('\n') {
375        output.push('\n');
376    }
377}
378
379/// Calculate the nesting depth for a list.
380///
381/// If we're in a list but NOT in a list item, this is incorrectly nested HTML
382/// and we need to increment the depth. If in a list item, the depth was already
383/// incremented by the <li> element.
384fn calculate_list_nesting_depth(ctx: &Context) -> usize {
385    if ctx.in_list && !ctx.in_list_item {
386        ctx.list_depth + 1
387    } else {
388        ctx.list_depth
389    }
390}
391
392/// Process a list's children, tracking which items had block elements.
393///
394/// This is used to determine proper spacing between list items.
395/// Returns true if the last processed item had block children.
396#[allow(clippy::too_many_arguments)]
397fn process_list_children(
398    node_handle: &tl::NodeHandle,
399    parser: &tl::Parser,
400    output: &mut String,
401    options: &ConversionOptions,
402    ctx: &Context,
403    depth: usize,
404    is_ordered: bool,
405    is_loose: bool,
406    nested_depth: usize,
407    start_counter: usize,
408    dom_ctx: &DomContext,
409) {
410    let mut counter = start_counter;
411
412    if let Some(node) = node_handle.get(parser) {
413        if let tl::Node::Tag(tag) = node {
414            let children = tag.children();
415            {
416                for child_handle in children.top().iter() {
417                    if let Some(child_node) = child_handle.get(parser) {
418                        if let tl::Node::Raw(bytes) = child_node {
419                            if bytes.as_utf8_str().trim().is_empty() {
420                                continue;
421                            }
422                        }
423                    }
424
425                    let list_ctx = Context {
426                        in_ordered_list: is_ordered,
427                        list_counter: if is_ordered { counter } else { 0 },
428                        in_list: true,
429                        list_depth: nested_depth,
430                        ul_depth: if is_ordered { ctx.ul_depth } else { ctx.ul_depth + 1 },
431                        loose_list: is_loose,
432                        prev_item_had_blocks: false,
433                        ..ctx.clone()
434                    };
435
436                    walk_node(child_handle, parser, output, options, &list_ctx, depth, dom_ctx);
437
438                    if is_ordered {
439                        if let Some(child_node) = child_handle.get(parser) {
440                            if let tl::Node::Tag(child_tag) = child_node {
441                                if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
442                                    counter += 1;
443                                }
444                            }
445                        }
446                    }
447                }
448            }
449        }
450    }
451}
452
/// Conversion context to track state during traversal.
///
/// A `Context` describes where in the document the walker currently is
/// (inside a list, a table cell, a heading, ...). It is passed by shared
/// reference; code that descends into a subtree with different state builds a
/// new value from a clone of the current one using struct-update syntax
/// (`Context { ..ctx.clone() }`), so changes never leak back to the caller.
#[derive(Debug, Clone)]
struct Context {
    /// Are we inside a code-like element (pre, code, kbd, samp)?
    in_code: bool,
    /// Current list item counter for ordered lists
    /// (set to 0 for items of unordered lists).
    list_counter: usize,
    /// Are we in an ordered list (vs unordered)?
    in_ordered_list: bool,
    /// Track if previous sibling in dl was a dt
    last_was_dt: bool,
    /// Blockquote nesting depth (0 when not inside a blockquote).
    blockquote_depth: usize,
    /// Are we inside a table cell (td/th)?
    in_table_cell: bool,
    /// Should we convert block elements as inline?
    convert_as_inline: bool,
    /// Depth of inline formatting elements (strong/emphasis/span/etc).
    inline_depth: usize,
    /// Are we inside a list item?
    in_list_item: bool,
    /// List nesting depth (for indentation)
    list_depth: usize,
    /// Unordered list nesting depth (for bullet cycling)
    ul_depth: usize,
    /// Are we inside any list (ul or ol)?
    in_list: bool,
    /// Is this a "loose" list where all items should have blank lines?
    loose_list: bool,
    /// Did a previous list item have block children?
    prev_item_had_blocks: bool,
    /// Are we inside a heading element (h1-h6)?
    in_heading: bool,
    /// Current heading tag (h1, h2, etc.) if in_heading is true
    heading_tag: Option<String>,
    /// Are we inside a paragraph element?
    in_paragraph: bool,
    /// Are we inside a ruby element?
    in_ruby: bool,
    /// Are we inside a `<strong>` / `<b>` element?
    in_strong: bool,
    #[cfg(feature = "inline-images")]
    /// Shared collector for inline images when enabled.
    inline_collector: Option<InlineCollectorHandle>,
    #[cfg(feature = "metadata")]
    /// Shared collector for metadata when enabled.
    metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
}
501
/// Precomputed parent/child lookup tables for a parsed document.
///
/// Nodes are keyed by the `u32` returned from `tl::NodeHandle::get_inner`.
/// The tables are filled once, by a recursive walk starting at the document's
/// top-level nodes.
struct DomContext {
    /// Node id -> id of its parent, or `None` for a top-level node.
    parent_map: HashMap<u32, Option<u32>>,
    /// Tag node id -> handles of its direct children. Only tag nodes get an
    /// entry here.
    children_map: HashMap<u32, Vec<tl::NodeHandle>>,
    /// Handles of the document's top-level nodes.
    root_children: Vec<tl::NodeHandle>,
    /// Node id -> handle, for every node visited by the walk.
    node_map: HashMap<u32, tl::NodeHandle>,
}
508
/// Escape square brackets in link text so it is safe inside `[...]`.
///
/// Markdown link text may contain brackets only when they are balanced or
/// backslash-escaped. This function leaves balanced pairs and already-escaped
/// brackets alone and adds a backslash in front of:
/// - every unescaped `]` that has no matching `[` before it, and
/// - every unescaped `[` that is never closed.
///
/// A bracket counts as escaped when it is preceded by an odd number of
/// consecutive backslashes. All other characters are copied through verbatim.
///
/// Returns the escaped label; an empty input yields an empty string.
fn escape_link_label(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    let mut result = String::with_capacity(text.len());
    let mut backslash_count = 0usize;
    // Byte offsets into `result` of unescaped '[' that still await a ']'.
    let mut open_brackets: Vec<usize> = Vec::new();

    for ch in text.chars() {
        if ch == '\\' {
            result.push('\\');
            backslash_count += 1;
            continue;
        }

        let is_escaped = backslash_count % 2 == 1;
        backslash_count = 0;

        match ch {
            '[' if !is_escaped => {
                open_brackets.push(result.len());
                result.push('[');
            }
            ']' if !is_escaped => {
                if open_brackets.pop().is_none() {
                    // Stray closer: it would end the link text early.
                    result.push('\\');
                }
                result.push(']');
            }
            _ => result.push(ch),
        }
    }

    // Whatever is left on the stack was never closed. Insert from the back so
    // the earlier offsets stay valid.
    for offset in open_brackets.into_iter().rev() {
        result.insert(offset, '\\');
    }

    result
}
547
548fn append_markdown_link(
549    output: &mut String,
550    label: &str,
551    href: &str,
552    title: Option<&str>,
553    raw_text: &str,
554    options: &ConversionOptions,
555) {
556    output.push('[');
557    output.push_str(label);
558    output.push_str("](");
559
560    if href.is_empty() {
561        output.push_str("<>");
562    } else if href.contains(' ') || href.contains('\n') {
563        output.push('<');
564        output.push_str(href);
565        output.push('>');
566    } else {
567        let open_count = href.chars().filter(|&c| c == '(').count();
568        let close_count = href.chars().filter(|&c| c == ')').count();
569
570        if open_count == close_count {
571            output.push_str(href);
572        } else {
573            let escaped_href = href.replace("(", "\\(").replace(")", "\\)");
574            output.push_str(&escaped_href);
575        }
576    }
577
578    if let Some(title_text) = title {
579        output.push_str(" \"");
580        if title_text.contains('"') {
581            let escaped_title = title_text.replace('"', "\\\"");
582            output.push_str(&escaped_title);
583        } else {
584            output.push_str(title_text);
585        }
586        output.push('"');
587    } else if options.default_title && raw_text == href {
588        output.push_str(" \"");
589        if href.contains('"') {
590            let escaped_href = href.replace('"', "\\\"");
591            output.push_str(&escaped_href);
592        } else {
593            output.push_str(href);
594        }
595        output.push('"');
596    }
597
598    output.push(')');
599}
600
/// Map a heading tag name to its level.
///
/// Returns `Some(1)` through `Some(6)` for exactly `"h1"` through `"h6"`
/// (lowercase), and `None` for every other string.
fn heading_level_from_name(name: &str) -> Option<usize> {
    match name.as_bytes() {
        [b'h', digit @ b'1'..=b'6'] => Some(usize::from(digit - b'0')),
        _ => None,
    }
}
612
613fn find_single_heading_child(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<(usize, tl::NodeHandle)> {
614    let node = node_handle.get(parser)?;
615
616    let tl::Node::Tag(tag) = node else {
617        return None;
618    };
619
620    let children = tag.children();
621    let mut heading_data: Option<(usize, tl::NodeHandle)> = None;
622
623    for child_handle in children.top().iter() {
624        let Some(child_node) = child_handle.get(parser) else {
625            continue;
626        };
627
628        match child_node {
629            tl::Node::Raw(bytes) => {
630                if !bytes.as_utf8_str().trim().is_empty() {
631                    return None;
632                }
633            }
634            tl::Node::Tag(child_tag) => {
635                let name = normalized_tag_name(child_tag.name().as_utf8_str());
636                if let Some(level) = heading_level_from_name(name.as_ref()) {
637                    if heading_data.is_some() {
638                        return None;
639                    }
640                    heading_data = Some((level, *child_handle));
641                } else {
642                    return None;
643                }
644            }
645            _ => return None,
646        }
647    }
648
649    heading_data
650}
651
652fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
653    if text.is_empty() {
654        return;
655    }
656
657    if ctx.convert_as_inline {
658        output.push_str(text);
659        return;
660    }
661
662    if ctx.in_table_cell {
663        let is_table_continuation =
664            !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
665        if is_table_continuation {
666            output.push_str("<br>");
667        }
668        output.push_str(text);
669        return;
670    }
671
672    if ctx.in_list_item {
673        if output.ends_with('\n') {
674            if let Some(indent) = continuation_indent_string(ctx.list_depth, options) {
675                output.push_str(&indent);
676            }
677        } else if !output.ends_with(' ') && !output.is_empty() {
678            output.push(' ');
679        }
680    } else if !output.is_empty() && !output.ends_with("\n\n") {
681        if output.ends_with('\n') {
682            output.push('\n');
683        } else {
684            trim_trailing_whitespace(output);
685            output.push_str("\n\n");
686        }
687    }
688
689    let heading_suffix = if ctx.in_list_item || ctx.blockquote_depth > 0 {
690        "\n"
691    } else {
692        "\n\n"
693    };
694
695    match options.heading_style {
696        HeadingStyle::Underlined => {
697            if level == 1 {
698                output.push_str(text);
699                output.push('\n');
700                output.push_str(&"=".repeat(text.len()));
701                output.push_str(heading_suffix);
702            } else if level == 2 {
703                output.push_str(text);
704                output.push('\n');
705                output.push_str(&"-".repeat(text.len()));
706                output.push_str(heading_suffix);
707            } else {
708                output.push_str(&"#".repeat(level));
709                output.push(' ');
710                output.push_str(text);
711                output.push_str(heading_suffix);
712            }
713        }
714        HeadingStyle::Atx => {
715            output.push_str(&"#".repeat(level));
716            output.push(' ');
717            output.push_str(text);
718            output.push_str(heading_suffix);
719        }
720        HeadingStyle::AtxClosed => {
721            output.push_str(&"#".repeat(level));
722            output.push(' ');
723            output.push_str(text);
724            output.push(' ');
725            output.push_str(&"#".repeat(level));
726            output.push_str(heading_suffix);
727        }
728    }
729}
730
/// Collapse line breaks inside heading text so the heading fits on one line.
///
/// Text without `\n` or `\r` is returned borrowed and unchanged. Otherwise a
/// new string is built in which:
/// - each run of line breaks, together with spaces/tabs that follow a break,
///   becomes a single space when more text follows it;
/// - no space is inserted if the text built so far already ends with one;
/// - line breaks before any text has been emitted are dropped, and a run at
///   the very end produces nothing.
///
/// Spaces and tabs that do not follow a line break are copied as-is.
fn normalize_heading_text<'a>(text: &'a str) -> Cow<'a, str> {
    let has_line_break = text.contains('\n') || text.contains('\r');
    if !has_line_break {
        return Cow::Borrowed(text);
    }

    let mut collapsed = String::with_capacity(text.len());
    // Set after a line break; resolved into one space by the next real character.
    let mut owes_space = false;

    for ch in text.chars() {
        if matches!(ch, '\n' | '\r') {
            owes_space = owes_space || !collapsed.is_empty();
            continue;
        }

        if owes_space {
            if matches!(ch, ' ' | '\t') {
                continue;
            }
            if !collapsed.ends_with(' ') {
                collapsed.push(' ');
            }
            owes_space = false;
        }

        collapsed.push(ch);
    }

    Cow::Owned(collapsed)
}
761
762fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser) -> DomContext {
763    let mut ctx = DomContext {
764        parent_map: HashMap::new(),
765        children_map: HashMap::new(),
766        root_children: dom.children().to_vec(),
767        node_map: HashMap::new(),
768    };
769
770    for child_handle in dom.children().iter() {
771        record_node_hierarchy(child_handle, None, parser, &mut ctx);
772    }
773
774    ctx
775}
776
777/// Detect block elements that were incorrectly nested under inline ancestors.
778fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
779    for handle in dom_ctx.node_map.values() {
780        if let Some(tl::Node::Tag(tag)) = handle.get(parser) {
781            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
782            if is_block_level_element(tag_name.as_ref()) {
783                let mut current = dom_ctx.parent_map.get(&handle.get_inner()).and_then(|p| *p);
784                while let Some(parent_id) = current {
785                    if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
786                        if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
787                            let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
788                            if is_inline_element(parent_name.as_ref()) {
789                                return true;
790                            }
791                        }
792                    }
793                    current = dom_ctx.parent_map.get(&parent_id).and_then(|p| *p);
794                }
795            }
796        }
797    }
798
799    false
800}
801
802/// Round-trip HTML through html5ever to repair malformed trees.
803fn repair_with_html5ever(input: &str) -> Option<String> {
804    use html5ever::serialize::{SerializeOpts, serialize};
805    use html5ever::tendril::TendrilSink;
806    use markup5ever_rcdom::{RcDom, SerializableHandle};
807
808    let dom = html5ever::parse_document(RcDom::default(), Default::default())
809        .from_utf8()
810        .read_from(&mut input.as_bytes())
811        .ok()?;
812
813    let mut buf = Vec::with_capacity(input.len());
814    let handle = SerializableHandle::from(dom.document.clone());
815    serialize(&mut buf, &handle, SerializeOpts::default()).ok()?;
816    String::from_utf8(buf).ok()
817}
818
819fn record_node_hierarchy(node_handle: &tl::NodeHandle, parent: Option<u32>, parser: &tl::Parser, ctx: &mut DomContext) {
820    let id = node_handle.get_inner();
821    ctx.parent_map.insert(id, parent);
822    ctx.node_map.insert(id, *node_handle);
823
824    if let Some(node) = node_handle.get(parser) {
825        if let tl::Node::Tag(tag) = node {
826            let children: Vec<_> = tag.children().top().iter().copied().collect();
827            ctx.children_map.insert(id, children.clone());
828            for child in children {
829                record_node_hierarchy(&child, Some(id), parser, ctx);
830            }
831        }
832    }
833}
834
835/// Check if a document is an hOCR (HTML-based OCR) document.
836///
837/// hOCR documents should have metadata extraction disabled to avoid
838/// including OCR metadata (system info, capabilities, etc.) in output.
839///
840/// Detection criteria:
841/// - meta tag with name="ocr-system" or name="ocr-capabilities"
842/// - Elements with classes: ocr_page, ocrx_word, ocr_carea, ocr_par, ocr_line
843fn is_hocr_document(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
844    fn check_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
845        if let Some(node) = node_handle.get(parser) {
846            match node {
847                tl::Node::Tag(tag) => {
848                    let tag_name = normalized_tag_name(tag.name().as_utf8_str());
849
850                    if tag_name == "meta" {
851                        if let Some(name_attr) = tag.attributes().get("name") {
852                            if let Some(name_bytes) = name_attr {
853                                let name_value = name_bytes.as_utf8_str();
854                                if name_value == "ocr-system" || name_value == "ocr-capabilities" {
855                                    return true;
856                                }
857                            }
858                        }
859                    }
860
861                    if let Some(class_attr) = tag.attributes().get("class") {
862                        if let Some(class_bytes) = class_attr {
863                            let class_value = class_bytes.as_utf8_str();
864                            if class_value.contains("ocr_page")
865                                || class_value.contains("ocrx_word")
866                                || class_value.contains("ocr_carea")
867                                || class_value.contains("ocr_par")
868                                || class_value.contains("ocr_line")
869                            {
870                                return true;
871                            }
872                        }
873                    }
874
875                    let children = tag.children();
876                    {
877                        for child_handle in children.top().iter() {
878                            if check_node(child_handle, parser) {
879                                return true;
880                            }
881                        }
882                    }
883                    false
884                }
885                _ => false,
886            }
887        } else {
888            false
889        }
890    }
891
892    check_node(node_handle, parser)
893}
894
895/// Extract metadata from HTML document head.
896///
897/// Extracts comprehensive document metadata including:
898/// - title: Document title from <title> tag
899/// - meta tags: description, keywords, author, etc.
900/// - Open Graph tags: og:title, og:description, og:image, etc.
901/// - Twitter Card tags: twitter:card, twitter:title, etc.
902/// - base-href: Base URL from <base> tag
903/// - canonical: Canonical URL from <link rel="canonical">
904/// - link relations: author, license, alternate links
905fn extract_metadata(
906    node_handle: &tl::NodeHandle,
907    parser: &tl::Parser,
908    options: &ConversionOptions,
909) -> BTreeMap<String, String> {
910    let mut metadata = BTreeMap::new();
911
912    fn find_head(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<tl::NodeHandle> {
913        if let Some(node) = node_handle.get(parser) {
914            if let tl::Node::Tag(tag) = node {
915                if tag_name_eq(tag.name().as_utf8_str(), "head") {
916                    return Some(*node_handle);
917                }
918                let children = tag.children();
919                {
920                    for child_handle in children.top().iter() {
921                        if let Some(result) = find_head(child_handle, parser) {
922                            return Some(result);
923                        }
924                    }
925                }
926            }
927        }
928        None
929    }
930
931    let head_handle = match find_head(node_handle, parser) {
932        Some(h) => h,
933        None => return metadata,
934    };
935
936    if let Some(head_node) = head_handle.get(parser) {
937        if let tl::Node::Tag(head_tag) = head_node {
938            let children = head_tag.children();
939            {
940                for child_handle in children.top().iter() {
941                    if let Some(child_node) = child_handle.get(parser) {
942                        if let tl::Node::Tag(child_tag) = child_node {
943                            let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
944
945                            match tag_name.as_ref() {
946                                "title" => {
947                                    if options.strip_tags.contains(&"title".to_string())
948                                        || options.preserve_tags.contains(&"title".to_string())
949                                    {
950                                    } else {
951                                        let title_children = child_tag.children();
952                                        {
953                                            if let Some(first_child) = title_children.top().iter().next() {
954                                                if let Some(text_node) = first_child.get(parser) {
955                                                    if let tl::Node::Raw(bytes) = text_node {
956                                                        let title = text::normalize_whitespace(&bytes.as_utf8_str())
957                                                            .trim()
958                                                            .to_string();
959                                                        if !title.is_empty() {
960                                                            metadata.insert("title".to_string(), title);
961                                                        }
962                                                    }
963                                                }
964                                            }
965                                        }
966                                    }
967                                }
968                                "base" => {
969                                    if let Some(href_attr) = child_tag.attributes().get("href") {
970                                        if let Some(href_bytes) = href_attr {
971                                            let href = href_bytes.as_utf8_str().to_string();
972                                            if !href.is_empty() {
973                                                metadata.insert("base-href".to_string(), href);
974                                            }
975                                        }
976                                    }
977                                }
978                                "meta" => {
979                                    if !options.strip_tags.contains(&"meta".to_string())
980                                        && !options.preserve_tags.contains(&"meta".to_string())
981                                    {
982                                        let mut name_attr = None;
983                                        let mut property_attr = None;
984                                        let mut http_equiv_attr = None;
985                                        let mut content_attr = None;
986
987                                        if let Some(attr) = child_tag.attributes().get("name") {
988                                            if let Some(bytes) = attr {
989                                                name_attr = Some(bytes.as_utf8_str().to_string());
990                                            }
991                                        }
992                                        if let Some(attr) = child_tag.attributes().get("property") {
993                                            if let Some(bytes) = attr {
994                                                property_attr = Some(bytes.as_utf8_str().to_string());
995                                            }
996                                        }
997                                        if let Some(attr) = child_tag.attributes().get("http-equiv") {
998                                            if let Some(bytes) = attr {
999                                                http_equiv_attr = Some(bytes.as_utf8_str().to_string());
1000                                            }
1001                                        }
1002                                        if let Some(attr) = child_tag.attributes().get("content") {
1003                                            if let Some(bytes) = attr {
1004                                                content_attr = Some(bytes.as_utf8_str().to_string());
1005                                            }
1006                                        }
1007
1008                                        if let Some(content) = content_attr {
1009                                            if let Some(name) = name_attr {
1010                                                let key = format!("meta-{}", name.to_lowercase());
1011                                                metadata.insert(key, content);
1012                                            } else if let Some(property) = property_attr {
1013                                                let key = format!("meta-{}", property.to_lowercase().replace(':', "-"));
1014                                                metadata.insert(key, content);
1015                                            } else if let Some(http_equiv) = http_equiv_attr {
1016                                                let key = format!("meta-{}", http_equiv.to_lowercase());
1017                                                metadata.insert(key, content);
1018                                            }
1019                                        }
1020                                    }
1021                                }
1022                                "link" => {
1023                                    let mut rel_attr = None;
1024                                    let mut href_attr = None;
1025
1026                                    if let Some(attr) = child_tag.attributes().get("rel") {
1027                                        if let Some(bytes) = attr {
1028                                            rel_attr = Some(bytes.as_utf8_str().to_string());
1029                                        }
1030                                    }
1031                                    if let Some(attr) = child_tag.attributes().get("href") {
1032                                        if let Some(bytes) = attr {
1033                                            href_attr = Some(bytes.as_utf8_str().to_string());
1034                                        }
1035                                    }
1036
1037                                    if let (Some(rel), Some(href)) = (rel_attr, href_attr) {
1038                                        let rel_lower = rel.to_lowercase();
1039                                        match rel_lower.as_str() {
1040                                            "canonical" => {
1041                                                metadata.insert("canonical".to_string(), href);
1042                                            }
1043                                            "author" | "license" | "alternate" => {
1044                                                metadata.insert(format!("link-{}", rel_lower), href);
1045                                            }
1046                                            _ => {}
1047                                        }
1048                                    }
1049                                }
1050                                _ => {}
1051                            }
1052                        }
1053                    }
1054                }
1055            }
1056        }
1057    }
1058
1059    metadata
1060}
1061
/// Format metadata as YAML frontmatter.
///
/// Returns an empty string for an empty map. Otherwise the result is a `---` delimited block
/// with one `key: value` line per entry (in the map's sorted key order) followed by a blank
/// line.
///
/// A value is written as a double-quoted YAML scalar when emitting it bare would change its
/// meaning or break the block: it contains `:`, `#`, `[`, `]` or a line break, it starts with a
/// YAML indicator character (such as `"`, `'`, `{`, `&`, `*`, `!`, `|`, `>`, `-`), or it has
/// leading/trailing whitespace. Inside the quotes `\`, `"` and line breaks are
/// backslash-escaped, so every entry stays on a single line. All other values, including empty
/// ones, are written bare.
fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    // Characters that force quoting wherever they appear in the value.
    const UNSAFE_ANYWHERE: &[char] = &[':', '#', '[', ']', '\n', '\r'];
    // Characters that begin a non-plain YAML construct when they lead the value.
    const UNSAFE_LEADING: &[char] = &[
        '"', '\'', '{', '}', ',', '&', '*', '!', '|', '>', '%', '@', '`', '-', '?',
    ];

    if metadata.is_empty() {
        return String::new();
    }

    let mut frontmatter = String::from("---\n");
    for (key, value) in metadata {
        let needs_quotes = value.contains(UNSAFE_ANYWHERE)
            || value.starts_with(UNSAFE_LEADING)
            || value.trim().len() != value.len();

        frontmatter.push_str(key);
        frontmatter.push_str(": ");
        if needs_quotes {
            frontmatter.push('"');
            for ch in value.chars() {
                match ch {
                    '\\' => frontmatter.push_str("\\\\"),
                    '"' => frontmatter.push_str("\\\""),
                    // A raw line break would spill the value onto a second line, where it could
                    // even be mistaken for the closing `---` delimiter.
                    '\n' => frontmatter.push_str("\\n"),
                    '\r' => frontmatter.push_str("\\r"),
                    other => frontmatter.push(other),
                }
            }
            frontmatter.push('"');
        } else {
            frontmatter.push_str(value);
        }
        frontmatter.push('\n');
    }
    frontmatter.push_str("---\n\n");

    frontmatter
}
1082
1083/// Check if a handle is an empty inline element (abbr, var, ins, dfn, etc. with no text content).
1084fn is_empty_inline_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
1085    const EMPTY_WHEN_NO_CONTENT_TAGS: &[&str] = &[
1086        "abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u",
1087    ];
1088
1089    if let Some(node) = node_handle.get(parser) {
1090        if let tl::Node::Tag(tag) = node {
1091            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1092            if EMPTY_WHEN_NO_CONTENT_TAGS.contains(&tag_name.as_ref()) {
1093                return get_text_content(node_handle, parser).trim().is_empty();
1094            }
1095        }
1096    }
1097    false
1098}
1099
1100/// Get the text content of a node and its children.
1101fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1102    let mut text = String::with_capacity(64);
1103    if let Some(node) = node_handle.get(parser) {
1104        match node {
1105            tl::Node::Raw(bytes) => {
1106                text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1107            }
1108            tl::Node::Tag(tag) => {
1109                let children = tag.children();
1110                {
1111                    for child_handle in children.top().iter() {
1112                        text.push_str(&get_text_content(child_handle, parser));
1113                    }
1114                }
1115            }
1116            _ => {}
1117        }
1118    }
1119    text
1120}
1121
1122/// Collect inline text for link labels, skipping block-level descendants.
1123fn collect_link_label_text(children: &[tl::NodeHandle], parser: &tl::Parser) -> (String, Vec<tl::NodeHandle>, bool) {
1124    let mut text = String::new();
1125    let mut saw_block = false;
1126    let mut block_nodes = Vec::new();
1127    let mut stack: Vec<_> = children.iter().rev().copied().collect();
1128
1129    while let Some(handle) = stack.pop() {
1130        if let Some(node) = handle.get(parser) {
1131            match node {
1132                tl::Node::Raw(bytes) => {
1133                    text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1134                }
1135                tl::Node::Tag(tag) => {
1136                    let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1137                    if is_block_level_element(tag_name.as_ref()) {
1138                        saw_block = true;
1139                        block_nodes.push(handle);
1140                        continue;
1141                    }
1142
1143                    let tag_children = tag.children();
1144                    {
1145                        let mut child_nodes: Vec<_> = tag_children.top().iter().copied().collect();
1146                        child_nodes.reverse();
1147                        for child in child_nodes {
1148                            stack.push(child);
1149                        }
1150                    }
1151                }
1152                _ => {}
1153            }
1154        }
1155    }
1156
1157    (text, block_nodes, saw_block)
1158}
1159
1160fn normalize_link_label(label: &str) -> String {
1161    let collapsed = label
1162        .chars()
1163        .map(|ch| if ch == '\n' || ch == '\r' { ' ' } else { ch })
1164        .collect::<String>();
1165    text::normalize_whitespace(&collapsed).trim().to_string()
1166}
1167
1168/// Serialize an element to HTML string (for SVG and Math elements).
1169fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1170    if let Some(node) = node_handle.get(parser) {
1171        if let tl::Node::Tag(tag) = node {
1172            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1173            let mut html = String::with_capacity(256);
1174            html.push('<');
1175            html.push_str(&tag_name);
1176
1177            for (key, value_opt) in tag.attributes().iter() {
1178                html.push(' ');
1179                html.push_str(&key);
1180                if let Some(value) = value_opt {
1181                    html.push_str("=\"");
1182                    html.push_str(&value);
1183                    html.push('"');
1184                }
1185            }
1186
1187            let has_children = !tag.children().top().is_empty();
1188            if !has_children {
1189                html.push_str(" />");
1190            } else {
1191                html.push('>');
1192                let children = tag.children();
1193                {
1194                    for child_handle in children.top().iter() {
1195                        html.push_str(&serialize_node(child_handle, parser));
1196                    }
1197                }
1198                html.push_str("</");
1199                html.push_str(&tag_name);
1200                html.push('>');
1201            }
1202            return html;
1203        }
1204    }
1205    String::new()
1206}
1207
/// Trim `value` and return it as an owned string, or `None` when nothing but whitespace remains.
#[cfg(feature = "inline-images")]
fn non_empty_trimmed(value: &str) -> Option<String> {
    Some(value.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .map(str::to_owned)
}
1217
/// Decode a base64 `data:` image URI and hand the result to the inline-image collector.
///
/// A `src` that is not a `data:` URI is ignored without touching the collector. Otherwise an
/// index is reserved from the collector and exactly one of two things happens: the decoded
/// image is pushed under that index, or `warn_skip` is called with the reason the URI was
/// rejected (malformed header, non-image MIME type, missing `base64` marker, undecodable or
/// empty payload, payload larger than the collector's maximum).
///
/// The description comes from `alt`, falling back to `title`. The filename candidate comes from
/// the `data-filename`, `filename` or `data-name` attribute, falling back to a `name=` /
/// `filename=` parameter of the URI header.
#[cfg(feature = "inline-images")]
fn handle_inline_data_image(
    collector_ref: &InlineCollectorHandle,
    src: &str,
    alt: &str,
    title: Option<&str>,
    attributes: BTreeMap<String, String>,
) {
    use base64::{Engine as _, engine::general_purpose::STANDARD};

    let uri = src.trim();
    if !uri.starts_with("data:") {
        return;
    }

    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    let Some((meta, payload)) = uri.split_once(',') else {
        collector.warn_skip(index, "missing data URI separator");
        return;
    };

    let payload = payload.trim();
    if payload.is_empty() {
        collector.warn_skip(index, "empty data URI payload");
        return;
    }

    let Some(header) = meta.strip_prefix("data:") else {
        collector.warn_skip(index, "invalid data URI scheme");
        return;
    };
    if header.is_empty() {
        collector.warn_skip(index, "missing MIME type");
        return;
    }

    // Header layout: `<mime>[;<parameter>]*`.
    let mut parameters = header.split(';');
    let mime = parameters.next().unwrap_or("");

    let Some((top_level, subtype)) = mime.split_once('/') else {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    };
    if !top_level.eq_ignore_ascii_case("image") {
        collector.warn_skip(index, format!("unsupported MIME type {mime}"));
        return;
    }

    let subtype = subtype.trim().to_ascii_lowercase();
    if subtype.is_empty() {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    }

    let mut is_base64 = false;
    let mut inline_name: Option<String> = None;
    for parameter in parameters {
        if parameter.eq_ignore_ascii_case("base64") {
            is_base64 = true;
            continue;
        }
        let raw_name = parameter
            .strip_prefix("name=")
            .or_else(|| parameter.strip_prefix("filename="));
        if let Some(raw_name) = raw_name {
            inline_name = non_empty_trimmed(raw_name.trim_matches('"'));
        }
    }

    if !is_base64 {
        collector.warn_skip(index, "missing base64 encoding marker");
        return;
    }

    let Ok(decoded) = STANDARD.decode(payload) else {
        collector.warn_skip(index, "invalid base64 payload");
        return;
    };
    if decoded.is_empty() {
        collector.warn_skip(index, "empty base64 payload");
        return;
    }

    let max_size = collector.max_decoded_size();
    if decoded.len() as u64 > max_size {
        let reason = format!(
            "decoded payload ({} bytes) exceeds configured max ({})",
            decoded.len(),
            max_size
        );
        collector.warn_skip(index, reason);
        return;
    }

    let format = match subtype.as_str() {
        "png" => InlineImageFormat::Png,
        "jpeg" | "jpg" => InlineImageFormat::Jpeg,
        "gif" => InlineImageFormat::Gif,
        "bmp" => InlineImageFormat::Bmp,
        "webp" => InlineImageFormat::Webp,
        "svg+xml" => InlineImageFormat::Svg,
        other => InlineImageFormat::Other(other.to_string()),
    };

    let description = non_empty_trimmed(alt).or_else(|| title.and_then(non_empty_trimmed));

    // Element attributes win over a name embedded in the URI header.
    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|key| attributes.get(*key).cloned())
        .or(inline_name);

    let dimensions = collector.infer_dimensions(index, &decoded, &format);

    let image = collector.build_image(
        decoded,
        format,
        filename_candidate,
        description,
        dimensions,
        InlineImageSource::ImgDataUri,
        attributes,
    );

    collector.push_image(index, image);
}
1354
/// Serialize an inline `<svg>` element and hand it to the inline-image collector.
///
/// Nothing happens when the collector reports that SVG capture is disabled. Otherwise an index
/// is reserved and either the serialized markup is pushed as an SVG image under that index, or
/// `warn_skip` is called (empty serialization, or markup larger than the collector's maximum).
///
/// The description comes from the `aria-label` attribute, falling back to `title_opt`. The
/// filename candidate comes from the `data-filename`, `filename` or `data-name` attribute.
#[cfg(feature = "inline-images")]
fn handle_inline_svg(
    collector_ref: &InlineCollectorHandle,
    node_handle: &tl::NodeHandle,
    parser: &tl::Parser,
    title_opt: Option<String>,
    attributes: BTreeMap<String, String>,
) {
    // The shared borrow ends with this condition, before the mutable borrow below is taken.
    if !collector_ref.borrow().capture_svg() {
        return;
    }

    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    let data = serialize_element(node_handle, parser).into_bytes();
    if data.is_empty() {
        collector.warn_skip(index, "unable to serialize SVG element");
        return;
    }

    let max_size = collector.max_decoded_size();
    if data.len() as u64 > max_size {
        let reason = format!(
            "serialized SVG payload ({} bytes) exceeds configured max ({})",
            data.len(),
            max_size
        );
        collector.warn_skip(index, reason);
        return;
    }

    let description = attributes
        .get("aria-label")
        .and_then(|label| non_empty_trimmed(label))
        .or_else(|| title_opt.as_deref().and_then(non_empty_trimmed));

    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|key| attributes.get(*key).cloned());

    let image = collector.build_image(
        data,
        InlineImageFormat::Svg,
        filename_candidate,
        description,
        None,
        InlineImageSource::SvgElement,
        attributes,
    );

    collector.push_image(index, image);
}
1416
1417/// Serialize a node to HTML string.
1418fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1419    if let Some(node) = node_handle.get(parser) {
1420        match node {
1421            tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
1422            tl::Node::Tag(_) => serialize_element(node_handle, parser),
1423            _ => String::new(),
1424        }
1425    } else {
1426        String::new()
1427    }
1428}
1429
1430/// Convert HTML to Markdown using tl DOM parser.
1431pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
1432    convert_html_impl(html, options, None, None)
1433}
1434
/// Convert HTML to Markdown while passing `collector` to the conversion as its inline-image
/// collector. No metadata collector is attached.
#[cfg(feature = "inline-images")]
pub(crate) fn convert_html_with_inline_collector(
    html: &str,
    options: &ConversionOptions,
    collector: InlineCollectorHandle,
) -> Result<String> {
    let inline_collector = Some(collector);
    convert_html_impl(html, options, inline_collector, None)
}
1443
/// Convert HTML to Markdown while passing `metadata_collector` to the conversion as its
/// metadata collector. No inline-image collector is attached.
#[cfg(feature = "metadata")]
pub(crate) fn convert_html_with_metadata(
    html: &str,
    options: &ConversionOptions,
    metadata_collector: crate::metadata::MetadataCollectorHandle,
) -> Result<String> {
    let collector = Some(metadata_collector);
    convert_html_impl(html, options, None, collector)
}
1452
1453#[cfg_attr(not(feature = "inline-images"), allow(unused_variables))]
1454#[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
1455fn convert_html_impl(
1456    html: &str,
1457    options: &ConversionOptions,
1458    inline_collector: Option<InlineCollectorHandle>,
1459    #[cfg(feature = "metadata")] metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
1460    #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
1461) -> Result<String> {
1462    let mut preprocessed = preprocess_html(html).into_owned();
1463    let mut preprocessed_len = preprocessed.len();
1464
1465    let parser_options = tl::ParserOptions::default();
1466    let mut dom_guard = unsafe {
1467        tl::parse_owned(preprocessed.clone(), parser_options)
1468            .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
1469    };
1470    let mut dom_ref = dom_guard.get_ref();
1471    let mut parser = dom_ref.parser();
1472    let mut dom_ctx = build_dom_context(dom_ref, parser);
1473    let mut output = String::with_capacity(preprocessed_len);
1474
1475    if has_inline_block_misnest(&dom_ctx, parser) {
1476        if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
1477            preprocessed = preprocess_html(&repaired_html).into_owned();
1478            preprocessed_len = preprocessed.len();
1479            dom_guard = unsafe {
1480                tl::parse_owned(preprocessed.clone(), parser_options)
1481                    .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
1482            };
1483            dom_ref = dom_guard.get_ref();
1484            parser = dom_ref.parser();
1485            dom_ctx = build_dom_context(dom_ref, parser);
1486            output = String::with_capacity(preprocessed_len);
1487        }
1488    }
1489
1490    let mut is_hocr = false;
1491    for child_handle in dom_ref.children().iter() {
1492        if is_hocr_document(child_handle, parser) {
1493            is_hocr = true;
1494            break;
1495        }
1496    }
1497
1498    if options.extract_metadata && !options.convert_as_inline && !is_hocr {
1499        for child_handle in dom_ref.children().iter() {
1500            let metadata = extract_metadata(child_handle, parser, options);
1501            if !metadata.is_empty() {
1502                let metadata_frontmatter = format_metadata_frontmatter(&metadata);
1503                output.push_str(&metadata_frontmatter);
1504                break;
1505            }
1506        }
1507    }
1508
1509    if is_hocr {
1510        use crate::hocr::{convert_to_markdown_with_options as convert_hocr_to_markdown, extract_hocr_document};
1511
1512        let (elements, metadata) = extract_hocr_document(dom_ref, options.debug);
1513
1514        if options.extract_metadata && !options.convert_as_inline {
1515            let mut metadata_map = BTreeMap::new();
1516            if let Some(system) = metadata.ocr_system {
1517                metadata_map.insert("ocr-system".to_string(), system);
1518            }
1519            if !metadata.ocr_capabilities.is_empty() {
1520                metadata_map.insert("ocr-capabilities".to_string(), metadata.ocr_capabilities.join(", "));
1521            }
1522            if let Some(pages) = metadata.ocr_number_of_pages {
1523                metadata_map.insert("ocr-number-of-pages".to_string(), pages.to_string());
1524            }
1525            if !metadata.ocr_langs.is_empty() {
1526                metadata_map.insert("ocr-langs".to_string(), metadata.ocr_langs.join(", "));
1527            }
1528            if !metadata.ocr_scripts.is_empty() {
1529                metadata_map.insert("ocr-scripts".to_string(), metadata.ocr_scripts.join(", "));
1530            }
1531
1532            if !metadata_map.is_empty() {
1533                output.push_str(&format_metadata_frontmatter(&metadata_map));
1534            }
1535        }
1536
1537        let mut markdown = convert_hocr_to_markdown(&elements, true, options.hocr_spatial_tables);
1538
1539        if markdown.trim().is_empty() {
1540            return Ok(output);
1541        }
1542
1543        markdown.truncate(markdown.trim_end().len());
1544        output.push_str(&markdown);
1545        output.push('\n');
1546
1547        return Ok(output);
1548    }
1549
1550    #[cfg(feature = "metadata")]
1551    if let Some(ref collector) = metadata_collector {
1552        if !is_hocr {
1553            for child_handle in dom_ref.children().iter() {
1554                let head_meta = extract_metadata(child_handle, parser, options);
1555                if !head_meta.is_empty() {
1556                    collector.borrow_mut().set_head_metadata(head_meta);
1557                    break;
1558                }
1559            }
1560        }
1561    }
1562
1563    #[cfg(feature = "metadata")]
1564    if let Some(ref collector) = metadata_collector {
1565        for child_handle in dom_ref.children().iter() {
1566            if let Some(tl::Node::Tag(tag)) = child_handle.get(parser) {
1567                let tag_name = tag.name().as_utf8_str();
1568                if tag_name == "html" || tag_name == "body" {
1569                    if let Some(lang) = tag.attributes().get("lang") {
1570                        if let Some(lang_bytes) = lang {
1571                            let lang_str = lang_bytes.as_utf8_str();
1572                            collector.borrow_mut().set_language(lang_str.to_string());
1573                        }
1574                    }
1575                    if let Some(dir) = tag.attributes().get("dir") {
1576                        if let Some(dir_bytes) = dir {
1577                            let dir_str = dir_bytes.as_utf8_str();
1578                            collector.borrow_mut().set_text_direction(dir_str.to_string());
1579                        }
1580                    }
1581                }
1582            }
1583        }
1584    }
1585
1586    let ctx = Context {
1587        in_code: false,
1588        list_counter: 0,
1589        in_ordered_list: false,
1590        last_was_dt: false,
1591        blockquote_depth: 0,
1592        in_table_cell: false,
1593        convert_as_inline: options.convert_as_inline,
1594        inline_depth: 0,
1595        in_list_item: false,
1596        list_depth: 0,
1597        ul_depth: 0,
1598        in_list: false,
1599        loose_list: false,
1600        prev_item_had_blocks: false,
1601        in_heading: false,
1602        heading_tag: None,
1603        in_paragraph: false,
1604        in_ruby: false,
1605        in_strong: false,
1606        #[cfg(feature = "inline-images")]
1607        inline_collector: inline_collector.clone(),
1608        #[cfg(feature = "metadata")]
1609        metadata_collector: metadata_collector.clone(),
1610    };
1611
1612    for child_handle in dom_ref.children().iter() {
1613        walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
1614    }
1615
1616    trim_line_end_whitespace(&mut output);
1617    let trimmed = output.trim_end_matches('\n');
1618    if trimmed.is_empty() {
1619        Ok(String::new())
1620    } else {
1621        Ok(format!("{}\n", trimmed))
1622    }
1623}
1624
1625fn preprocess_html(input: &str) -> Cow<'_, str> {
1626    const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
1627    const TAGS: [&[u8]; 2] = [b"script", b"style"];
1628    const SVG: &[u8] = b"svg";
1629    const DOCTYPE: &[u8] = b"doctype";
1630    const EMPTY_COMMENT: &[u8] = b"<!---->";
1631
1632    let bytes = input.as_bytes();
1633    let len = bytes.len();
1634    if len == 0 {
1635        return Cow::Borrowed(input);
1636    }
1637
1638    let mut idx = 0;
1639    let mut last = 0;
1640    let mut output: Option<String> = None;
1641    let mut svg_depth = 0usize;
1642
1643    while idx < len {
1644        if bytes[idx] == b'<' {
1645            if bytes[idx..].starts_with(EMPTY_COMMENT) {
1646                let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1647                out.push_str(&input[last..idx]);
1648                out.push_str("<!-- -->");
1649                idx += EMPTY_COMMENT.len();
1650                last = idx;
1651                continue;
1652            }
1653
1654            let mut replaced = false;
1655            for (pattern, replacement) in &SELF_CLOSING {
1656                if bytes[idx..].starts_with(pattern) {
1657                    let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1658                    out.push_str(&input[last..idx]);
1659                    out.push_str(replacement);
1660                    idx += pattern.len();
1661                    last = idx;
1662                    replaced = true;
1663                    break;
1664                }
1665            }
1666            if replaced {
1667                continue;
1668            }
1669
1670            if matches_tag_start(bytes, idx + 1, SVG) {
1671                if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1672                    svg_depth += 1;
1673                    idx = open_end;
1674                    continue;
1675                }
1676            } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1677                if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1678                    if svg_depth > 0 {
1679                        svg_depth = svg_depth.saturating_sub(1);
1680                    }
1681                    idx = close_end;
1682                    continue;
1683                }
1684            }
1685
1686            if svg_depth == 0 {
1687                let mut handled = false;
1688                for tag in TAGS {
1689                    if matches_tag_start(bytes, idx + 1, tag) {
1690                        if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1691                            let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1692                            let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1693                            out.push_str(&input[last..idx]);
1694                            out.push_str(&input[idx..open_end]);
1695                            out.push_str("</");
1696                            out.push_str(str::from_utf8(tag).unwrap());
1697                            out.push('>');
1698
1699                            last = remove_end;
1700                            idx = remove_end;
1701                            handled = true;
1702                        }
1703                    }
1704
1705                    if handled {
1706                        break;
1707                    }
1708                }
1709
1710                if handled {
1711                    continue;
1712                }
1713
1714                if idx + 2 < len && bytes[idx + 1] == b'!' {
1715                    let mut cursor = idx + 2;
1716                    while cursor < len && bytes[cursor].is_ascii_whitespace() {
1717                        cursor += 1;
1718                    }
1719
1720                    if cursor + DOCTYPE.len() <= len
1721                        && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
1722                    {
1723                        if let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len()) {
1724                            let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1725                            out.push_str(&input[last..idx]);
1726                            last = end;
1727                            idx = end;
1728                            continue;
1729                        }
1730                    }
1731                }
1732            }
1733
1734            let is_valid_tag = if idx + 1 < len {
1735                match bytes[idx + 1] {
1736                    b'!' => {
1737                        idx + 2 < len
1738                            && (bytes[idx + 2] == b'-'
1739                                || bytes[idx + 2].is_ascii_alphabetic()
1740                                || bytes[idx + 2].is_ascii_uppercase())
1741                    }
1742                    b'/' => {
1743                        idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
1744                    }
1745                    b'?' => true,
1746                    c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
1747                    _ => false,
1748                }
1749            } else {
1750                false
1751            };
1752
1753            if !is_valid_tag {
1754                let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1755                out.push_str(&input[last..idx]);
1756                out.push_str("&lt;");
1757                idx += 1;
1758                last = idx;
1759                continue;
1760            }
1761        }
1762
1763        idx += 1;
1764    }
1765
1766    if let Some(mut out) = output {
1767        if last < len {
1768            out.push_str(&input[last..]);
1769        }
1770        Cow::Owned(out)
1771    } else {
1772        Cow::Borrowed(input)
1773    }
1774}
1775
/// Rewrites the exact byte sequences `<br/>`, `<hr/>` and `<img/>` to their slash-less forms.
///
/// Input that contains none of the three sequences is returned borrowed; otherwise an owned
/// copy with every occurrence replaced is returned.
#[cfg(test)]
fn normalize_self_closing_tags(input: &str) -> Cow<'_, str> {
    const REPLACEMENTS: [(&str, &str); 3] = [("<br/>", "<br>"), ("<hr/>", "<hr>"), ("<img/>", "<img>")];

    let needs_rewrite = REPLACEMENTS.iter().any(|(pattern, _)| input.contains(*pattern));
    if !needs_rewrite {
        return Cow::Borrowed(input);
    }

    let mut rewritten = String::with_capacity(input.len());
    let mut copied_to = 0;

    // Every pattern starts with `<` and contains no other `<`, so visiting each `<` in order
    // sees every possible match exactly once and matches can never overlap.
    for (at, _) in input.match_indices('<') {
        let tail = &input[at..];
        if let Some((pattern, replacement)) = REPLACEMENTS.iter().find(|(candidate, _)| tail.starts_with(*candidate)) {
            rewritten.push_str(&input[copied_to..at]);
            rewritten.push_str(replacement);
            copied_to = at + pattern.len();
        }
    }

    rewritten.push_str(&input[copied_to..]);
    Cow::Owned(rewritten)
}
1816
/// Escape `<` characters that cannot begin HTML markup.
///
/// A `<` is kept only when it is followed by:
/// - an ASCII letter (start tag),
/// - `/` and an ASCII letter (end tag),
/// - `!` and either `-` or an ASCII letter (comment or declaration),
/// - `?` (processing instruction).
///
/// Every other `<`, including one at the very end of the input, is replaced by `&lt;`.
/// `>` characters are never modified. Input needing no change is returned borrowed.
///
/// # Examples
///
/// - `1<2` becomes `1&lt;2`
/// - `<div>1<2</div>` becomes `<div>1&lt;2</div>`
/// - `a <` becomes `a &lt;`
#[cfg(test)]
fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();

    // Decides from the one or two bytes after the `<` at `at` whether it may start markup.
    let opens_markup = |at: usize| match bytes.get(at + 1) {
        Some(b'!') => matches!(bytes.get(at + 2), Some(&b) if b == b'-' || b.is_ascii_alphabetic()),
        Some(b'/') => matches!(bytes.get(at + 2), Some(b) if b.is_ascii_alphabetic()),
        Some(b'?') => true,
        Some(other) => other.is_ascii_alphabetic(),
        None => false,
    };

    let mut escaped: Option<String> = None;
    let mut copied_to = 0;

    for (at, _) in input.match_indices('<') {
        if opens_markup(at) {
            continue;
        }
        let out = escaped.get_or_insert_with(|| String::with_capacity(input.len() + 4));
        out.push_str(&input[copied_to..at]);
        out.push_str("&lt;");
        copied_to = at + 1;
    }

    match escaped {
        Some(mut out) => {
            out.push_str(&input[copied_to..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
1881
/// Returns `raw` with ASCII upper-case letters lowered.
///
/// A name without any ASCII upper-case byte is handed back unchanged (no allocation);
/// otherwise the owned string is lowered in place and returned as `Cow::Owned`.
fn normalized_tag_name<'a>(raw: Cow<'a, str>) -> Cow<'a, str> {
    if !raw.bytes().any(|byte| byte.is_ascii_uppercase()) {
        return raw;
    }

    let mut lowered = raw.into_owned();
    lowered.make_ascii_lowercase();
    Cow::Owned(lowered)
}
1891
/// Compares a tag name with `needle`, ignoring ASCII case.
fn tag_name_eq(name: Cow<'_, str>, needle: &str) -> bool {
    str::eq_ignore_ascii_case(&name, needle)
}
1895
1896fn should_drop_for_preprocessing(
1897    node_handle: &tl::NodeHandle,
1898    tag_name: &str,
1899    tag: &tl::HTMLTag,
1900    parser: &tl::Parser,
1901    dom_ctx: &DomContext,
1902    options: &ConversionOptions,
1903) -> bool {
1904    if !options.preprocessing.enabled {
1905        return false;
1906    }
1907
1908    if options.preprocessing.remove_navigation {
1909        let has_nav_hint = element_has_navigation_hint(tag);
1910
1911        if tag_name == "nav" {
1912            return true;
1913        }
1914
1915        if tag_name == "header" {
1916            let inside_semantic_content = has_semantic_content_ancestor(node_handle, parser, dom_ctx);
1917            if !inside_semantic_content {
1918                return true;
1919            }
1920            if has_nav_hint {
1921                return true;
1922            }
1923        } else if tag_name == "footer" || tag_name == "aside" {
1924            if has_nav_hint {
1925                return true;
1926            }
1927        } else if has_nav_hint && !matches!(tag_name, "main" | "article" | "html" | "body" | "head") {
1928            return true;
1929        }
1930    }
1931
1932    if options.preprocessing.remove_forms {
1933        if tag_name == "form" {
1934            let preserves_form = options.preserve_tags.iter().any(|t| t == "form");
1935            if !preserves_form {
1936                return true;
1937            }
1938        } else if matches!(
1939            tag_name,
1940            "button" | "select" | "textarea" | "label" | "fieldset" | "legend"
1941        ) {
1942            return true;
1943        }
1944    }
1945
1946    false
1947}
1948
1949fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
1950    let mut current_id = node_handle.get_inner();
1951    while let Some(parent_id) = dom_ctx.parent_map.get(&current_id).copied().flatten() {
1952        if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
1953            if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
1954                let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
1955                if matches!(parent_name.as_ref(), "main" | "article" | "section") {
1956                    return true;
1957                }
1958                if tag_has_main_semantics(parent_tag) {
1959                    return true;
1960                }
1961            }
1962        }
1963        current_id = parent_id;
1964    }
1965    false
1966}
1967
1968fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
1969    if let Some(role_attr) = tag.attributes().get("role") {
1970        if let Some(role) = role_attr {
1971            let lowered = role.as_utf8_str().to_ascii_lowercase();
1972            if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
1973                return true;
1974            }
1975        }
1976    }
1977
1978    if let Some(class_attr) = tag.attributes().get("class") {
1979        if let Some(class_bytes) = class_attr {
1980            let class_value = class_bytes.as_utf8_str().to_ascii_lowercase();
1981            const MAIN_CLASS_HINTS: &[&str] = &[
1982                "mw-body",
1983                "mw-parser-output",
1984                "content-body",
1985                "content-container",
1986                "article-body",
1987                "article-content",
1988                "main-content",
1989                "page-content",
1990                "entry-content",
1991                "post-content",
1992                "document-body",
1993            ];
1994            if MAIN_CLASS_HINTS.iter().any(|hint| class_value.contains(hint)) {
1995                return true;
1996            }
1997        }
1998    }
1999
2000    false
2001}
2002
2003fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
2004    if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
2005        return true;
2006    }
2007
2008    if attribute_contains_any(
2009        tag,
2010        "aria-label",
2011        &["navigation", "menu", "contents", "table of contents", "toc"],
2012    ) {
2013        return true;
2014    }
2015
2016    const NAV_KEYWORDS: &[&str] = &[
2017        "nav",
2018        "navigation",
2019        "navbar",
2020        "breadcrumbs",
2021        "breadcrumb",
2022        "toc",
2023        "sidebar",
2024        "sidenav",
2025        "menu",
2026        "menubar",
2027        "mainmenu",
2028        "subnav",
2029        "tabs",
2030        "tablist",
2031        "toolbar",
2032        "pager",
2033        "pagination",
2034        "skipnav",
2035        "skip-link",
2036        "skiplinks",
2037        "site-nav",
2038        "site-menu",
2039        "site-header",
2040        "site-footer",
2041        "topbar",
2042        "bottombar",
2043        "masthead",
2044        "vector-nav",
2045        "vector-header",
2046        "vector-footer",
2047    ];
2048
2049    attribute_matches_any(tag, "class", NAV_KEYWORDS) || attribute_matches_any(tag, "id", NAV_KEYWORDS)
2050}
2051
2052fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2053    let Some(attr_value) = tag.attributes().get(attr) else {
2054        return false;
2055    };
2056    let Some(value) = attr_value else {
2057        return false;
2058    };
2059    let raw = value.as_utf8_str();
2060    raw.split_whitespace()
2061        .map(|token| {
2062            token
2063                .chars()
2064                .map(|c| match c {
2065                    '_' | ':' | '.' | '/' => '-',
2066                    _ => c,
2067                })
2068                .collect::<String>()
2069                .to_ascii_lowercase()
2070        })
2071        .filter(|token| !token.is_empty())
2072        .any(|token| keywords.iter().any(|kw| token == *kw))
2073}
2074
2075fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2076    let Some(attr_value) = tag.attributes().get(attr) else {
2077        return false;
2078    };
2079    let Some(value) = attr_value else {
2080        return false;
2081    };
2082    let lower = value.as_utf8_str().to_ascii_lowercase();
2083    keywords.iter().any(|kw| lower.contains(*kw))
2084}
2085
2086/// Serialize a tag and its children back to HTML.
2087///
2088/// This is used for the preserve_tags feature to output original HTML for specific elements.
2089fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
2090    let mut html = String::new();
2091    serialize_node_to_html(handle, parser, &mut html);
2092    html
2093}
2094
2095/// Recursively serialize a node to HTML.
2096fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
2097    match handle.get(parser) {
2098        Some(tl::Node::Tag(tag)) => {
2099            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2100
2101            output.push('<');
2102            output.push_str(&tag_name);
2103
2104            for (key, value) in tag.attributes().iter() {
2105                output.push(' ');
2106                output.push_str(&key);
2107                if let Some(val) = value {
2108                    output.push_str("=\"");
2109                    output.push_str(&val);
2110                    output.push('"');
2111                }
2112            }
2113
2114            output.push('>');
2115
2116            let children = tag.children();
2117            for child_handle in children.top().iter() {
2118                serialize_node_to_html(child_handle, parser, output);
2119            }
2120
2121            if !matches!(
2122                tag_name.as_ref(),
2123                "br" | "hr"
2124                    | "img"
2125                    | "input"
2126                    | "meta"
2127                    | "link"
2128                    | "area"
2129                    | "base"
2130                    | "col"
2131                    | "embed"
2132                    | "param"
2133                    | "source"
2134                    | "track"
2135                    | "wbr"
2136            ) {
2137                output.push_str("</");
2138                output.push_str(&tag_name);
2139                output.push('>');
2140            }
2141        }
2142        Some(tl::Node::Raw(bytes)) => {
2143            if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
2144                output.push_str(text);
2145            }
2146        }
2147        _ => {}
2148    }
2149}
2150
/// Blanks the body of every `<script>` and `<style>` element that is not nested in `<svg>`.
///
/// The opening tag is kept, everything up to and including the matching closing tag (or the
/// end of input when there is none) is dropped, and a fresh closing tag is emitted. Input
/// without such elements is returned borrowed.
///
/// A self-closing `<svg ... />` does not open an SVG scope: it has no matching `</svg>`, so
/// counting it would disable stripping for the rest of the input.
#[cfg(test)]
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
    const TAGS: [&[u8]; 2] = [b"script", b"style"];
    const SVG: &[u8] = b"svg";

    let bytes = input.as_bytes();
    let len = bytes.len();
    let mut cursor = 0;
    // Start of the input that has not been copied into `stripped` yet.
    let mut copied_to = 0;
    let mut stripped: Option<String> = None;
    let mut svg_nesting = 0usize;

    'scan: while cursor < len {
        if bytes[cursor] != b'<' {
            cursor += 1;
            continue;
        }
        let name_start = cursor + 1;

        if matches_tag_start(bytes, name_start, SVG) {
            if let Some(after_open) = find_tag_end(bytes, name_start + SVG.len()) {
                // `after_open` is at least `cursor + 5`, so the index cannot underflow.
                if bytes[after_open - 2] != b'/' {
                    svg_nesting += 1;
                }
                cursor = after_open;
                continue;
            }
        } else if matches_end_tag_start(bytes, name_start, SVG) {
            if let Some(after_close) = find_tag_end(bytes, name_start + 1 + SVG.len()) {
                svg_nesting = svg_nesting.saturating_sub(1);
                cursor = after_close;
                continue;
            }
        }

        if svg_nesting == 0 {
            for tag in TAGS {
                if !matches_tag_start(bytes, name_start, tag) {
                    continue;
                }
                let Some(after_open) = find_tag_end(bytes, name_start + tag.len()) else {
                    continue;
                };

                let resume_at = find_closing_tag(bytes, after_open, tag).unwrap_or(len);
                let out = stripped.get_or_insert_with(|| String::with_capacity(input.len()));
                // Pending text plus the opening tag itself, then a synthetic closing tag.
                out.push_str(&input[copied_to..after_open]);
                out.push_str("</");
                out.push_str(str::from_utf8(tag).unwrap());
                out.push('>');

                copied_to = resume_at;
                cursor = resume_at;
                continue 'scan;
            }
        }

        cursor += 1;
    }

    match stripped {
        Some(mut out) => {
            if copied_to < input.len() {
                out.push_str(&input[copied_to..]);
            }
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
2223
/// Reports whether the tag name `tag` begins at `bytes[start]`, ignoring ASCII case.
///
/// The name must be followed by `>`, `/`, a space, tab, newline or carriage return, or by
/// the end of the input. A `start` at or beyond the end of `bytes` never matches.
fn matches_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    let name_end = start + tag.len();
    let Some(candidate) = bytes.get(start..name_end) else {
        return false;
    };
    if !candidate.eq_ignore_ascii_case(tag) {
        return false;
    }

    // The byte after the name must terminate it; end of input counts as a terminator.
    matches!(
        bytes.get(name_end),
        None | Some(b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r')
    )
}
2245
/// Finds the end of a tag, scanning forward from `idx`.
///
/// Returns the index just past the first `>` that is not inside a quoted run. A run opens at
/// `"` or `'` and closes at the next occurrence of the same quote character. Returns `None`
/// when no such `>` exists (including when `idx` is past the end).
fn find_tag_end(bytes: &[u8], idx: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;

    for (offset, &byte) in bytes.iter().enumerate().skip(idx) {
        match (open_quote, byte) {
            (None, b'>') => return Some(offset + 1),
            (None, b'"' | b'\'') => open_quote = Some(byte),
            (Some(quote), _) if quote == byte => open_quote = None,
            _ => {}
        }
    }

    None
}
2269
2270fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
2271    let len = bytes.len();
2272    let mut depth = 1usize;
2273
2274    while idx < len {
2275        if bytes[idx] == b'<' {
2276            if matches_tag_start(bytes, idx + 1, tag) {
2277                if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
2278                    depth += 1;
2279                    idx = next;
2280                    continue;
2281                }
2282            } else if matches_end_tag_start(bytes, idx + 1, tag) {
2283                if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
2284                    depth -= 1;
2285                    if depth == 0 {
2286                        return Some(close);
2287                    }
2288                    idx = close;
2289                    continue;
2290                }
2291            }
2292        }
2293
2294        idx += 1;
2295    }
2296
2297    None
2298}
2299
2300fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
2301    if start >= bytes.len() || bytes[start] != b'/' {
2302        return false;
2303    }
2304    matches_tag_start(bytes, start + 1, tag)
2305}
2306
/// Check whether a tag name belongs to the fixed set of inline elements.
///
/// The comparison is exact, so callers must pass a lower-cased name.
fn is_inline_element(tag_name: &str) -> bool {
    const INLINE_TAGS: &[&str] = &[
        "a", "abbr", "b", "bdi", "bdo", "br", "cite", "code", "data", "dfn", "em", "i", "kbd", "mark", "q", "rp",
        "rt", "ruby", "s", "samp", "small", "span", "strong", "sub", "sup", "time", "u", "var", "wbr", "del", "ins",
        "img", "map", "area", "audio", "video", "picture", "source", "track", "embed", "object", "param", "input",
        "label", "button", "select", "textarea", "output", "progress", "meter",
    ];

    INLINE_TAGS.contains(&tag_name)
}
2362
2363/// Check if an element is block-level (not inline).
2364fn is_block_level_element(tag_name: &str) -> bool {
2365    !is_inline_element(tag_name)
2366        && matches!(
2367            tag_name,
2368            "address"
2369                | "article"
2370                | "aside"
2371                | "blockquote"
2372                | "canvas"
2373                | "dd"
2374                | "div"
2375                | "dl"
2376                | "dt"
2377                | "fieldset"
2378                | "figcaption"
2379                | "figure"
2380                | "footer"
2381                | "form"
2382                | "h1"
2383                | "h2"
2384                | "h3"
2385                | "h4"
2386                | "h5"
2387                | "h6"
2388                | "header"
2389                | "hr"
2390                | "li"
2391                | "main"
2392                | "nav"
2393                | "ol"
2394                | "p"
2395                | "pre"
2396                | "section"
2397                | "table"
2398                | "tfoot"
2399                | "ul"
2400        )
2401}
2402
2403fn get_next_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2404    let id = node_handle.get_inner();
2405    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2406
2407    let siblings = if let Some(parent_id) = parent {
2408        dom_ctx.children_map.get(&parent_id)?
2409    } else {
2410        &dom_ctx.root_children
2411    };
2412
2413    let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2414
2415    for sibling in siblings.iter().skip(position + 1) {
2416        if let Some(node) = sibling.get(parser) {
2417            match node {
2418                tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2419                tl::Node::Raw(raw) => {
2420                    if !raw.as_utf8_str().trim().is_empty() {
2421                        return None;
2422                    }
2423                }
2424                _ => {}
2425            }
2426        }
2427    }
2428
2429    None
2430}
2431
2432fn get_previous_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2433    let id = node_handle.get_inner();
2434    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2435
2436    let siblings = if let Some(parent_id) = parent {
2437        dom_ctx.children_map.get(&parent_id)?
2438    } else {
2439        &dom_ctx.root_children
2440    };
2441
2442    let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2443
2444    for sibling in siblings.iter().take(position).rev() {
2445        if let Some(node) = sibling.get(parser) {
2446            match node {
2447                tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2448                tl::Node::Raw(raw) => {
2449                    if !raw.as_utf8_str().trim().is_empty() {
2450                        return None;
2451                    }
2452                }
2453                _ => {}
2454            }
2455        }
2456    }
2457
2458    None
2459}
2460
2461fn previous_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2462    let id = node_handle.get_inner();
2463    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2464
2465    let siblings = if let Some(parent_id) = parent {
2466        if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2467            children
2468        } else {
2469            return false;
2470        }
2471    } else {
2472        &dom_ctx.root_children
2473    };
2474
2475    let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2476        return false;
2477    };
2478
2479    for sibling in siblings.iter().take(position).rev() {
2480        if let Some(node) = sibling.get(parser) {
2481            match node {
2482                tl::Node::Tag(tag) => {
2483                    let name = normalized_tag_name(tag.name().as_utf8_str());
2484                    return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2485                }
2486                tl::Node::Raw(raw) => {
2487                    if raw.as_utf8_str().trim().is_empty() {
2488                        continue;
2489                    }
2490                    return false;
2491                }
2492                _ => continue,
2493            }
2494        }
2495    }
2496
2497    false
2498}
2499
2500fn next_sibling_is_whitespace_text(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2501    let id = node_handle.get_inner();
2502    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2503
2504    let siblings = if let Some(parent_id) = parent {
2505        if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2506            children
2507        } else {
2508            return false;
2509        }
2510    } else {
2511        &dom_ctx.root_children
2512    };
2513
2514    let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2515        return false;
2516    };
2517
2518    for sibling in siblings.iter().skip(position + 1) {
2519        if let Some(node) = sibling.get(parser) {
2520            match node {
2521                tl::Node::Raw(raw) => return raw.as_utf8_str().trim().is_empty(),
2522                tl::Node::Tag(_) => return false,
2523                _ => continue,
2524            }
2525        }
2526    }
2527
2528    false
2529}
2530
2531fn next_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2532    let id = node_handle.get_inner();
2533    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2534
2535    let siblings = if let Some(parent_id) = parent {
2536        if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2537            children
2538        } else {
2539            return false;
2540        }
2541    } else {
2542        &dom_ctx.root_children
2543    };
2544
2545    let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2546        return false;
2547    };
2548
2549    for sibling in siblings.iter().skip(position + 1) {
2550        if let Some(node) = sibling.get(parser) {
2551            match node {
2552                tl::Node::Tag(tag) => {
2553                    let name = normalized_tag_name(tag.name().as_utf8_str());
2554                    return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2555                }
2556                tl::Node::Raw(raw) => {
2557                    if raw.as_utf8_str().trim().is_empty() {
2558                        continue;
2559                    }
2560                    return false;
2561                }
2562                _ => continue,
2563            }
2564        }
2565    }
2566
2567    false
2568}
2569
2570fn append_inline_suffix(
2571    output: &mut String,
2572    suffix: &str,
2573    has_core_content: bool,
2574    node_handle: &tl::NodeHandle,
2575    parser: &tl::Parser,
2576    dom_ctx: &DomContext,
2577) {
2578    if suffix.is_empty() {
2579        return;
2580    }
2581
2582    if suffix == " " && has_core_content && next_sibling_is_whitespace_text(node_handle, parser, dom_ctx) {
2583        return;
2584    }
2585
2586    output.push_str(suffix);
2587}
2588
2589/// Recursively walk DOM nodes and convert to Markdown.
2590#[allow(clippy::only_used_in_recursion)]
2591fn walk_node(
2592    node_handle: &tl::NodeHandle,
2593    parser: &tl::Parser,
2594    output: &mut String,
2595    options: &ConversionOptions,
2596    ctx: &Context,
2597    depth: usize,
2598    dom_ctx: &DomContext,
2599) {
2600    let Some(node) = node_handle.get(parser) else { return };
2601
2602    match node {
2603        tl::Node::Raw(bytes) => {
2604            let mut text = text::decode_html_entities(&bytes.as_utf8_str());
2605
2606            if text.is_empty() {
2607                return;
2608            }
2609
2610            let had_newlines = text.contains('\n');
2611
2612            if options.strip_newlines {
2613                text = text.replace(['\r', '\n'], " ");
2614            }
2615
2616            if text.trim().is_empty() {
2617                if ctx.in_code {
2618                    output.push_str(&text);
2619                    return;
2620                }
2621
2622                if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2623                    if ctx.convert_as_inline || ctx.in_table_cell || ctx.in_list_item {
2624                        output.push_str(&text);
2625                        return;
2626                    }
2627                    if text.contains("\n\n") || text.contains("\r\n\r\n") {
2628                        if !output.ends_with("\n\n") {
2629                            output.push('\n');
2630                        }
2631                        return;
2632                    }
2633                    output.push_str(&text);
2634                    return;
2635                }
2636
2637                if had_newlines {
2638                    if output.is_empty() {
2639                        return;
2640                    }
2641                    if !output.ends_with("\n\n") {
2642                        if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2643                            if is_inline_element(&next_tag) {
2644                                return;
2645                            }
2646                        }
2647                    }
2648                    return;
2649                }
2650
2651                if previous_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2652                    && next_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2653                {
2654                    if text.chars().count() > 1 {
2655                        if !output.ends_with(' ') {
2656                            output.push(' ');
2657                        }
2658                    } else {
2659                        output.push_str(&text);
2660                    }
2661                } else {
2662                    output.push_str(&text);
2663                }
2664                return;
2665            }
2666
2667            let processed_text = if ctx.in_code || ctx.in_ruby {
2668                text
2669            } else if ctx.in_table_cell {
2670                let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
2671                    let normalized_text = text::normalize_whitespace(&text);
2672                    text::escape(
2673                        &normalized_text,
2674                        options.escape_misc,
2675                        options.escape_asterisks,
2676                        options.escape_underscores,
2677                        options.escape_ascii,
2678                    )
2679                } else {
2680                    text::escape(
2681                        &text,
2682                        options.escape_misc,
2683                        options.escape_asterisks,
2684                        options.escape_underscores,
2685                        options.escape_ascii,
2686                    )
2687                };
2688                if options.escape_misc {
2689                    escaped
2690                } else {
2691                    escaped.replace('|', r"\|")
2692                }
2693            } else if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2694                text::escape(
2695                    &text,
2696                    options.escape_misc,
2697                    options.escape_asterisks,
2698                    options.escape_underscores,
2699                    options.escape_ascii,
2700                )
2701            } else {
2702                let has_trailing_single_newline =
2703                    text.ends_with('\n') && !text.ends_with("\n\n") && !text.ends_with("\r\n\r\n");
2704
2705                let normalized_text = text::normalize_whitespace(&text);
2706
2707                let (prefix, suffix, core) = text::chomp(&normalized_text);
2708
2709                let skip_prefix = output.ends_with("\n\n")
2710                    || output.ends_with("* ")
2711                    || output.ends_with("- ")
2712                    || output.ends_with(". ")
2713                    || output.ends_with("] ")
2714                    || (output.ends_with('\n') && prefix == " ")
2715                    || (output.ends_with(' ')
2716                        && prefix == " "
2717                        && !previous_sibling_is_inline_tag(node_handle, parser, dom_ctx));
2718
2719                let mut final_text = String::new();
2720                if !skip_prefix && !prefix.is_empty() {
2721                    final_text.push_str(prefix);
2722                }
2723
2724                let escaped_core = text::escape(
2725                    core,
2726                    options.escape_misc,
2727                    options.escape_asterisks,
2728                    options.escape_underscores,
2729                    options.escape_ascii,
2730                );
2731                final_text.push_str(&escaped_core);
2732
2733                if !suffix.is_empty() {
2734                    final_text.push_str(suffix);
2735                } else if has_trailing_single_newline {
2736                    let at_paragraph_break = output.ends_with("\n\n");
2737                    if options.debug {
2738                        eprintln!(
2739                            "[DEBUG] Text had trailing single newline that was chomped, at_paragraph_break={}",
2740                            at_paragraph_break
2741                        );
2742                    }
2743                    if !at_paragraph_break {
2744                        if text.contains("\n\n") || text.contains("\r\n\r\n") {
2745                            final_text.push('\n');
2746                        } else if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2747                            if options.debug {
2748                                eprintln!("[DEBUG] Next sibling tag after newline: {}", next_tag);
2749                            }
2750                            if matches!(next_tag.as_str(), "span") {
2751                            } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2752                                final_text.push(' ');
2753                            } else {
2754                                final_text.push('\n');
2755                            }
2756                        } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2757                            final_text.push(' ');
2758                        } else {
2759                            final_text.push('\n');
2760                        }
2761                    }
2762                }
2763
2764                final_text
2765            };
2766
2767            if ctx.in_list_item && processed_text.contains("\n\n") {
2768                let parts: Vec<&str> = processed_text.split("\n\n").collect();
2769                for (i, part) in parts.iter().enumerate() {
2770                    if i > 0 {
2771                        output.push_str("\n\n");
2772                        output.push_str(&" ".repeat(4 * ctx.list_depth));
2773                    }
2774                    output.push_str(part.trim());
2775                }
2776            } else {
2777                output.push_str(&processed_text);
2778            }
2779        }
2780
2781        tl::Node::Tag(tag) => {
2782            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2783
2784            if should_drop_for_preprocessing(node_handle, tag_name.as_ref(), tag, parser, dom_ctx, options) {
2785                trim_trailing_whitespace(output);
2786                if options.debug {
2787                    eprintln!("[DEBUG] Dropping <{}> subtree due to preprocessing settings", tag_name);
2788                }
2789                return;
2790            }
2791
2792            if options.strip_tags.iter().any(|t| t.as_str() == tag_name) {
2793                let children = tag.children();
2794                {
2795                    for child_handle in children.top().iter() {
2796                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2797                    }
2798                }
2799                return;
2800            }
2801
2802            if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
2803                let html = serialize_tag_to_html(node_handle, parser);
2804                output.push_str(&html);
2805                return;
2806            }
2807
2808            #[cfg(feature = "metadata")]
2809            if matches!(tag_name.as_ref(), "html" | "head" | "body") {
2810                if let Some(ref collector) = ctx.metadata_collector {
2811                    let mut c = collector.borrow_mut();
2812
2813                    if let Some(lang) = tag.attributes().get("lang").flatten() {
2814                        c.set_language(lang.as_utf8_str().to_string());
2815                    }
2816
2817                    if let Some(dir) = tag.attributes().get("dir").flatten() {
2818                        c.set_text_direction(dir.as_utf8_str().to_string());
2819                    }
2820                }
2821            }
2822
2823            match tag_name.as_ref() {
2824                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
2825                    let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
2826
2827                    let mut text = String::new();
2828                    let heading_ctx = Context {
2829                        in_heading: true,
2830                        convert_as_inline: true,
2831                        heading_tag: Some(tag_name.to_string()),
2832                        ..ctx.clone()
2833                    };
2834                    let children = tag.children();
2835                    {
2836                        for child_handle in children.top().iter() {
2837                            walk_node(
2838                                child_handle,
2839                                parser,
2840                                &mut text,
2841                                options,
2842                                &heading_ctx,
2843                                depth + 1,
2844                                dom_ctx,
2845                            );
2846                        }
2847                    }
2848                    let trimmed = text.trim();
2849                    if !trimmed.is_empty() {
2850                        let normalized = normalize_heading_text(trimmed);
2851                        push_heading(output, ctx, options, level, normalized.as_ref());
2852
2853                        #[cfg(feature = "metadata")]
2854                        if let Some(ref collector) = ctx.metadata_collector {
2855                            let id = tag
2856                                .attributes()
2857                                .get("id")
2858                                .flatten()
2859                                .map(|v| v.as_utf8_str().to_string());
2860                            collector
2861                                .borrow_mut()
2862                                .add_header(level as u8, normalized.to_string(), id, depth, 0);
2863                        }
2864                    }
2865                }
2866
2867                "p" => {
2868                    let content_start_pos = output.len();
2869
2870                    let is_table_continuation =
2871                        ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
2872
2873                    let is_list_continuation = ctx.in_list_item
2874                        && !output.is_empty()
2875                        && !output.ends_with("* ")
2876                        && !output.ends_with("- ")
2877                        && !output.ends_with(". ");
2878
2879                    let after_code_block = output.ends_with("```\n");
2880                    let needs_leading_sep = !ctx.in_table_cell
2881                        && !ctx.in_list_item
2882                        && !ctx.convert_as_inline
2883                        && ctx.blockquote_depth == 0
2884                        && !output.is_empty()
2885                        && !output.ends_with("\n\n")
2886                        && !after_code_block;
2887
2888                    if is_table_continuation {
2889                        trim_trailing_whitespace(output);
2890                        output.push_str("<br>");
2891                    } else if is_list_continuation {
2892                        add_list_continuation_indent(output, ctx.list_depth, true, options);
2893                    } else if needs_leading_sep {
2894                        trim_trailing_whitespace(output);
2895                        output.push_str("\n\n");
2896                    }
2897
2898                    let p_ctx = Context {
2899                        in_paragraph: true,
2900                        ..ctx.clone()
2901                    };
2902
2903                    let children = tag.children();
2904                    {
2905                        let child_handles: Vec<_> = children.top().iter().collect();
2906                        for (i, child_handle) in child_handles.iter().enumerate() {
2907                            if let Some(node) = child_handle.get(parser) {
2908                                if let tl::Node::Raw(bytes) = node {
2909                                    let text = bytes.as_utf8_str();
2910                                    if text.trim().is_empty() && i > 0 && i < child_handles.len() - 1 {
2911                                        let prev = &child_handles[i - 1];
2912                                        let next = &child_handles[i + 1];
2913                                        if is_empty_inline_element(prev, parser)
2914                                            && is_empty_inline_element(next, parser)
2915                                        {
2916                                            continue;
2917                                        }
2918                                    }
2919                                }
2920                            }
2921                            walk_node(child_handle, parser, output, options, &p_ctx, depth + 1, dom_ctx);
2922                        }
2923                    }
2924
2925                    let has_content = output.len() > content_start_pos;
2926
2927                    if has_content && !ctx.convert_as_inline && !ctx.in_table_cell {
2928                        output.push_str("\n\n");
2929                    }
2930                }
2931
2932                "strong" | "b" => {
2933                    if ctx.in_code {
2934                        let children = tag.children();
2935                        {
2936                            for child_handle in children.top().iter() {
2937                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2938                            }
2939                        }
2940                    } else {
2941                        let mut content = String::with_capacity(64);
2942                        let children = tag.children();
2943                        {
2944                            let strong_ctx = Context {
2945                                inline_depth: ctx.inline_depth + 1,
2946                                in_strong: true,
2947                                ..ctx.clone()
2948                            };
2949                            for child_handle in children.top().iter() {
2950                                walk_node(
2951                                    child_handle,
2952                                    parser,
2953                                    &mut content,
2954                                    options,
2955                                    &strong_ctx,
2956                                    depth + 1,
2957                                    dom_ctx,
2958                                );
2959                            }
2960                        }
2961                        let (prefix, suffix, trimmed) = chomp_inline(&content);
2962                        if !content.trim().is_empty() {
2963                            output.push_str(prefix);
2964                            if ctx.in_strong {
2965                                output.push_str(trimmed);
2966                            } else {
2967                                output.push(options.strong_em_symbol);
2968                                output.push(options.strong_em_symbol);
2969                                output.push_str(trimmed);
2970                                output.push(options.strong_em_symbol);
2971                                output.push(options.strong_em_symbol);
2972                            }
2973                            append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
2974                        } else if !content.is_empty() {
2975                            output.push_str(prefix);
2976                            append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
2977                        }
2978                    }
2979                }
2980
2981                "em" | "i" => {
2982                    if ctx.in_code {
2983                        let children = tag.children();
2984                        {
2985                            for child_handle in children.top().iter() {
2986                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2987                            }
2988                        }
2989                    } else {
2990                        let mut content = String::with_capacity(64);
2991                        let children = tag.children();
2992                        {
2993                            let em_ctx = Context {
2994                                inline_depth: ctx.inline_depth + 1,
2995                                ..ctx.clone()
2996                            };
2997                            for child_handle in children.top().iter() {
2998                                walk_node(child_handle, parser, &mut content, options, &em_ctx, depth + 1, dom_ctx);
2999                            }
3000                        }
3001                        let (prefix, suffix, trimmed) = chomp_inline(&content);
3002                        if !content.trim().is_empty() {
3003                            output.push_str(prefix);
3004                            output.push(options.strong_em_symbol);
3005                            output.push_str(trimmed);
3006                            output.push(options.strong_em_symbol);
3007                            append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3008                        } else if !content.is_empty() {
3009                            output.push_str(prefix);
3010                            append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3011                        } else if let Some(class_value) = tag
3012                            .attributes()
3013                            .get("class")
3014                            .and_then(|v| v.as_ref().map(|val| val.as_utf8_str().to_string()))
3015                        {
3016                            if class_value.contains("caret") && !output.ends_with(' ') {
3017                                output.push_str(" > ");
3018                            }
3019                        }
3020                    }
3021                }
3022
3023                "a" => {
3024                    const MAX_LINK_LABEL_LEN: usize = 512;
3025
3026                    let href_attr = tag
3027                        .attributes()
3028                        .get("href")
3029                        .flatten()
3030                        .map(|v| text::decode_html_entities(&v.as_utf8_str()));
3031                    let title = tag
3032                        .attributes()
3033                        .get("title")
3034                        .flatten()
3035                        .map(|v| v.as_utf8_str().to_string());
3036
3037                    if let Some(href) = href_attr {
3038                        let raw_text = text::normalize_whitespace(&get_text_content(node_handle, parser))
3039                            .trim()
3040                            .to_string();
3041
3042                        let is_autolink = options.autolinks
3043                            && !options.default_title
3044                            && !href.is_empty()
3045                            && (raw_text == href || (href.starts_with("mailto:") && raw_text == href[7..]));
3046
3047                        if is_autolink {
3048                            output.push('<');
3049                            if href.starts_with("mailto:") && raw_text == href[7..] {
3050                                output.push_str(&raw_text);
3051                            } else {
3052                                output.push_str(&href);
3053                            }
3054                            output.push('>');
3055                            return;
3056                        }
3057
3058                        if let Some((heading_level, heading_handle)) = find_single_heading_child(node_handle, parser) {
3059                            if let Some(heading_node) = heading_handle.get(parser) {
3060                                if let tl::Node::Tag(heading_tag) = heading_node {
3061                                    let heading_name =
3062                                        normalized_tag_name(heading_tag.name().as_utf8_str()).into_owned();
3063                                    let mut heading_text = String::new();
3064                                    let heading_ctx = Context {
3065                                        in_heading: true,
3066                                        convert_as_inline: true,
3067                                        heading_tag: Some(heading_name),
3068                                        ..ctx.clone()
3069                                    };
3070                                    walk_node(
3071                                        &heading_handle,
3072                                        parser,
3073                                        &mut heading_text,
3074                                        options,
3075                                        &heading_ctx,
3076                                        depth + 1,
3077                                        dom_ctx,
3078                                    );
3079                                    let trimmed_heading = heading_text.trim();
3080                                    if !trimmed_heading.is_empty() {
3081                                        let escaped_label = escape_link_label(trimmed_heading);
3082                                        let mut link_buffer = String::new();
3083                                        append_markdown_link(
3084                                            &mut link_buffer,
3085                                            &escaped_label,
3086                                            href.as_str(),
3087                                            title.as_deref(),
3088                                            raw_text.as_str(),
3089                                            options,
3090                                        );
3091                                        push_heading(output, ctx, options, heading_level, link_buffer.as_str());
3092                                        return;
3093                                    }
3094                                }
3095                            }
3096                        }
3097
3098                        let children: Vec<_> = tag.children().top().iter().copied().collect();
3099                        let (inline_label, _block_nodes, saw_block) = collect_link_label_text(&children, parser);
3100                        let mut label = if saw_block {
3101                            let mut content = String::new();
3102                            let link_ctx = Context {
3103                                inline_depth: ctx.inline_depth + 1,
3104                                convert_as_inline: true,
3105                                ..ctx.clone()
3106                            };
3107                            for child_handle in children.iter() {
3108                                let mut child_buf = String::new();
3109                                walk_node(
3110                                    child_handle,
3111                                    parser,
3112                                    &mut child_buf,
3113                                    options,
3114                                    &link_ctx,
3115                                    depth + 1,
3116                                    dom_ctx,
3117                                );
3118                                if !child_buf.trim().is_empty()
3119                                    && !content.is_empty()
3120                                    && !content.chars().last().map(|c| c.is_whitespace()).unwrap_or(true)
3121                                    && !child_buf.chars().next().map(|c| c.is_whitespace()).unwrap_or(true)
3122                                {
3123                                    content.push(' ');
3124                                }
3125                                content.push_str(&child_buf);
3126                            }
3127                            if content.trim().is_empty() {
3128                                normalize_link_label(&inline_label)
3129                            } else {
3130                                normalize_link_label(&content)
3131                            }
3132                        } else {
3133                            let mut content = String::new();
3134                            let link_ctx = Context {
3135                                inline_depth: ctx.inline_depth + 1,
3136                                ..ctx.clone()
3137                            };
3138                            for child_handle in children.iter() {
3139                                walk_node(
3140                                    child_handle,
3141                                    parser,
3142                                    &mut content,
3143                                    options,
3144                                    &link_ctx,
3145                                    depth + 1,
3146                                    dom_ctx,
3147                                );
3148                            }
3149                            normalize_link_label(&content)
3150                        };
3151
3152                        if label.is_empty() && saw_block {
3153                            let fallback = text::normalize_whitespace(&get_text_content(node_handle, parser));
3154                            label = normalize_link_label(&fallback);
3155                        }
3156
3157                        if label.is_empty() && !raw_text.is_empty() {
3158                            label = normalize_link_label(&raw_text);
3159                        }
3160
3161                        if label.is_empty() && !href.is_empty() && !children.is_empty() {
3162                            label = href.clone();
3163                        }
3164
3165                        if label.len() > MAX_LINK_LABEL_LEN {
3166                            truncate_at_char_boundary(&mut label, MAX_LINK_LABEL_LEN);
3167                            label.push('…');
3168                        }
3169
3170                        let escaped_label = escape_link_label(&label);
3171                        append_markdown_link(
3172                            output,
3173                            &escaped_label,
3174                            href.as_str(),
3175                            title.as_deref(),
3176                            label.as_str(),
3177                            options,
3178                        );
3179
3180                        #[cfg(feature = "metadata")]
3181                        if let Some(ref collector) = ctx.metadata_collector {
3182                            let rel_attr = tag
3183                                .attributes()
3184                                .get("rel")
3185                                .flatten()
3186                                .map(|v| v.as_utf8_str().to_string());
3187                            let mut attributes_map = BTreeMap::new();
3188                            for (key, value_opt) in tag.attributes().iter() {
3189                                let key_str = key.to_string();
3190                                if key_str == "href" {
3191                                    continue;
3192                                }
3193
3194                                let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3195                                attributes_map.insert(key_str, value);
3196                            }
3197                            collector.borrow_mut().add_link(
3198                                href.clone(),
3199                                label.clone(),
3200                                title.clone(),
3201                                rel_attr,
3202                                attributes_map,
3203                            );
3204                        }
3205                    } else {
3206                        let children = tag.children();
3207                        {
3208                            for child_handle in children.top().iter() {
3209                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3210                            }
3211                        }
3212                    }
3213                }
3214
3215                "img" => {
3216                    use std::borrow::Cow;
3217
3218                    let src = tag
3219                        .attributes()
3220                        .get("src")
3221                        .flatten()
3222                        .map(|v| v.as_utf8_str())
3223                        .unwrap_or(Cow::Borrowed(""));
3224
3225                    let alt = tag
3226                        .attributes()
3227                        .get("alt")
3228                        .flatten()
3229                        .map(|v| v.as_utf8_str())
3230                        .unwrap_or(Cow::Borrowed(""));
3231
3232                    let title = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str());
3233                    #[cfg(feature = "metadata")]
3234                    let mut attributes_map = BTreeMap::new();
3235                    #[cfg(feature = "metadata")]
3236                    let mut width: Option<u32> = None;
3237                    #[cfg(feature = "metadata")]
3238                    let mut height: Option<u32> = None;
3239                    #[cfg(feature = "metadata")]
3240                    for (key, value_opt) in tag.attributes().iter() {
3241                        let key_str = key.to_string();
3242                        if key_str == "src" {
3243                            continue;
3244                        }
3245                        let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3246                        if key_str == "width" {
3247                            if let Ok(parsed) = value.parse::<u32>() {
3248                                width = Some(parsed);
3249                            }
3250                        } else if key_str == "height" {
3251                            if let Ok(parsed) = value.parse::<u32>() {
3252                                height = Some(parsed);
3253                            }
3254                        }
3255                        attributes_map.insert(key_str, value);
3256                    }
3257
3258                    #[cfg(feature = "inline-images")]
3259                    if let Some(ref collector_ref) = ctx.inline_collector {
3260                        let mut attributes_map = BTreeMap::new();
3261                        for (key, value_opt) in tag.attributes().iter() {
3262                            let key_str = key.to_string();
3263                            let keep = key_str == "width"
3264                                || key_str == "height"
3265                                || key_str == "filename"
3266                                || key_str == "aria-label"
3267                                || key_str.starts_with("data-");
3268                            if keep {
3269                                let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
3270                                attributes_map.insert(key_str, value);
3271                            }
3272                        }
3273                        handle_inline_data_image(
3274                            collector_ref,
3275                            src.as_ref(),
3276                            alt.as_ref(),
3277                            title.as_deref(),
3278                            attributes_map,
3279                        );
3280                    }
3281
3282                    let keep_as_markdown = ctx.in_heading
3283                        && ctx
3284                            .heading_tag
3285                            .as_ref()
3286                            .is_some_and(|tag| options.keep_inline_images_in.iter().any(|t| t == tag));
3287
3288                    let should_use_alt_text = !keep_as_markdown
3289                        && (ctx.convert_as_inline
3290                            || (ctx.in_heading
3291                                && ctx
3292                                    .heading_tag
3293                                    .as_ref()
3294                                    .is_none_or(|tag| !options.keep_inline_images_in.iter().any(|t| t == tag))));
3295
3296                    if should_use_alt_text {
3297                        output.push_str(&alt);
3298                    } else {
3299                        output.push_str("![");
3300                        output.push_str(&alt);
3301                        output.push_str("](");
3302                        output.push_str(&src);
3303                        if let Some(ref title_text) = title {
3304                            output.push_str(" \"");
3305                            output.push_str(title_text);
3306                            output.push('"');
3307                        }
3308                        output.push(')');
3309                    }
3310
3311                    #[cfg(feature = "metadata")]
3312                    if let Some(ref collector) = ctx.metadata_collector {
3313                        if !src.is_empty() {
3314                            let dimensions = match (width, height) {
3315                                (Some(w), Some(h)) => Some((w, h)),
3316                                _ => None,
3317                            };
3318                            collector.borrow_mut().add_image(
3319                                src.to_string(),
3320                                if alt.is_empty() { None } else { Some(alt.to_string()) },
3321                                title.as_deref().map(|t| t.to_string()),
3322                                dimensions,
3323                                attributes_map.clone(),
3324                            );
3325                        }
3326                    }
3327                }
3328
3329                "mark" => {
3330                    if ctx.convert_as_inline {
3331                        let children = tag.children();
3332                        {
3333                            for child_handle in children.top().iter() {
3334                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3335                            }
3336                        }
3337                    } else {
3338                        use crate::options::HighlightStyle;
3339                        match options.highlight_style {
3340                            HighlightStyle::DoubleEqual => {
3341                                output.push_str("==");
3342                                let children = tag.children();
3343                                {
3344                                    for child_handle in children.top().iter() {
3345                                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3346                                    }
3347                                }
3348                                output.push_str("==");
3349                            }
3350                            HighlightStyle::Html => {
3351                                output.push_str("<mark>");
3352                                let children = tag.children();
3353                                {
3354                                    for child_handle in children.top().iter() {
3355                                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3356                                    }
3357                                }
3358                                output.push_str("</mark>");
3359                            }
3360                            HighlightStyle::Bold => {
3361                                let symbol = options.strong_em_symbol.to_string().repeat(2);
3362                                output.push_str(&symbol);
3363                                let bold_ctx = Context {
3364                                    in_strong: true,
3365                                    ..ctx.clone()
3366                                };
3367                                let children = tag.children();
3368                                {
3369                                    for child_handle in children.top().iter() {
3370                                        walk_node(child_handle, parser, output, options, &bold_ctx, depth + 1, dom_ctx);
3371                                    }
3372                                }
3373                                output.push_str(&symbol);
3374                            }
3375                            HighlightStyle::None => {
3376                                let children = tag.children();
3377                                {
3378                                    for child_handle in children.top().iter() {
3379                                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3380                                    }
3381                                }
3382                            }
3383                        }
3384                    }
3385                }
3386
3387                "del" | "s" => {
3388                    if ctx.in_code {
3389                        let children = tag.children();
3390                        {
3391                            for child_handle in children.top().iter() {
3392                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3393                            }
3394                        }
3395                    } else {
3396                        let mut content = String::with_capacity(32);
3397                        let children = tag.children();
3398                        {
3399                            for child_handle in children.top().iter() {
3400                                walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3401                            }
3402                        }
3403                        let (prefix, suffix, trimmed) = chomp_inline(&content);
3404                        if !content.trim().is_empty() {
3405                            output.push_str(prefix);
3406                            output.push_str("~~");
3407                            output.push_str(trimmed);
3408                            output.push_str("~~");
3409                            append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3410                        } else if !content.is_empty() {
3411                            output.push_str(prefix);
3412                            append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3413                        }
3414                    }
3415                }
3416
3417                "ins" => {
3418                    let mut content = String::with_capacity(32);
3419                    let children = tag.children();
3420                    {
3421                        for child_handle in children.top().iter() {
3422                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3423                        }
3424                    }
3425                    let (prefix, suffix, trimmed) = chomp_inline(&content);
3426                    if !trimmed.is_empty() {
3427                        output.push_str(prefix);
3428                        output.push_str("==");
3429                        output.push_str(trimmed);
3430                        output.push_str("==");
3431                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3432                    }
3433                }
3434
3435                "u" | "small" => {
3436                    let children = tag.children();
3437                    {
3438                        for child_handle in children.top().iter() {
3439                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3440                        }
3441                    }
3442                }
3443
3444                "sub" => {
3445                    if !ctx.in_code && !options.sub_symbol.is_empty() {
3446                        output.push_str(&options.sub_symbol);
3447                    }
3448                    let children = tag.children();
3449                    {
3450                        for child_handle in children.top().iter() {
3451                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3452                        }
3453                    }
3454                    if !ctx.in_code && !options.sub_symbol.is_empty() {
3455                        if options.sub_symbol.starts_with('<') && !options.sub_symbol.starts_with("</") {
3456                            output.push_str(&options.sub_symbol.replace('<', "</"));
3457                        } else {
3458                            output.push_str(&options.sub_symbol);
3459                        }
3460                    }
3461                }
3462
3463                "sup" => {
3464                    if !ctx.in_code && !options.sup_symbol.is_empty() {
3465                        output.push_str(&options.sup_symbol);
3466                    }
3467                    let children = tag.children();
3468                    {
3469                        for child_handle in children.top().iter() {
3470                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3471                        }
3472                    }
3473                    if !ctx.in_code && !options.sup_symbol.is_empty() {
3474                        if options.sup_symbol.starts_with('<') && !options.sup_symbol.starts_with("</") {
3475                            output.push_str(&options.sup_symbol.replace('<', "</"));
3476                        } else {
3477                            output.push_str(&options.sup_symbol);
3478                        }
3479                    }
3480                }
3481
3482                "kbd" | "samp" => {
3483                    let code_ctx = Context {
3484                        in_code: true,
3485                        ..ctx.clone()
3486                    };
3487                    let mut content = String::with_capacity(32);
3488                    let children = tag.children();
3489                    {
3490                        for child_handle in children.top().iter() {
3491                            walk_node(
3492                                child_handle,
3493                                parser,
3494                                &mut content,
3495                                options,
3496                                &code_ctx,
3497                                depth + 1,
3498                                dom_ctx,
3499                            );
3500                        }
3501                    }
3502                    let normalized = text::normalize_whitespace(&content);
3503                    let (prefix, suffix, trimmed) = chomp_inline(&normalized);
3504                    if !content.trim().is_empty() {
3505                        output.push_str(prefix);
3506                        output.push('`');
3507                        output.push_str(trimmed);
3508                        output.push('`');
3509                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3510                    } else if !content.is_empty() {
3511                        output.push_str(prefix);
3512                        append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3513                    }
3514                }
3515
3516                "var" => {
3517                    let mut content = String::with_capacity(32);
3518                    let children = tag.children();
3519                    {
3520                        for child_handle in children.top().iter() {
3521                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3522                        }
3523                    }
3524                    let (prefix, suffix, trimmed) = chomp_inline(&content);
3525                    if !trimmed.is_empty() {
3526                        output.push_str(prefix);
3527                        output.push(options.strong_em_symbol);
3528                        output.push_str(trimmed);
3529                        output.push(options.strong_em_symbol);
3530                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3531                    }
3532                }
3533
3534                "dfn" => {
3535                    let mut content = String::with_capacity(32);
3536                    let children = tag.children();
3537                    {
3538                        for child_handle in children.top().iter() {
3539                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3540                        }
3541                    }
3542                    let (prefix, suffix, trimmed) = chomp_inline(&content);
3543                    if !trimmed.is_empty() {
3544                        output.push_str(prefix);
3545                        output.push(options.strong_em_symbol);
3546                        output.push_str(trimmed);
3547                        output.push(options.strong_em_symbol);
3548                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3549                    }
3550                }
3551
3552                "abbr" => {
3553                    let mut content = String::with_capacity(32);
3554                    let children = tag.children();
3555                    {
3556                        for child_handle in children.top().iter() {
3557                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3558                        }
3559                    }
3560                    let trimmed = content.trim();
3561
3562                    if !trimmed.is_empty() {
3563                        output.push_str(trimmed);
3564
3565                        if let Some(title) = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str()) {
3566                            let trimmed_title = title.trim();
3567                            if !trimmed_title.is_empty() {
3568                                output.push_str(" (");
3569                                output.push_str(trimmed_title);
3570                                output.push(')');
3571                            }
3572                        }
3573                    }
3574                }
3575
3576                "time" | "data" => {
3577                    let children = tag.children();
3578                    {
3579                        for child_handle in children.top().iter() {
3580                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3581                        }
3582                    }
3583                }
3584
3585                "wbr" => {}
3586
3587                "code" => {
3588                    let code_ctx = Context {
3589                        in_code: true,
3590                        ..ctx.clone()
3591                    };
3592
3593                    if !ctx.in_code {
3594                        let mut content = String::with_capacity(32);
3595                        let children = tag.children();
3596                        {
3597                            for child_handle in children.top().iter() {
3598                                walk_node(
3599                                    child_handle,
3600                                    parser,
3601                                    &mut content,
3602                                    options,
3603                                    &code_ctx,
3604                                    depth + 1,
3605                                    dom_ctx,
3606                                );
3607                            }
3608                        }
3609
3610                        let trimmed = &content;
3611
3612                        if !content.trim().is_empty() {
3613                            let contains_backtick = trimmed.contains('`');
3614
3615                            let needs_delimiter_spaces = {
3616                                let first_char = trimmed.chars().next();
3617                                let last_char = trimmed.chars().last();
3618                                let starts_with_space = first_char == Some(' ');
3619                                let ends_with_space = last_char == Some(' ');
3620                                let starts_with_backtick = first_char == Some('`');
3621                                let ends_with_backtick = last_char == Some('`');
3622                                let all_spaces = trimmed.chars().all(|c| c == ' ');
3623
3624                                all_spaces
3625                                    || starts_with_backtick
3626                                    || ends_with_backtick
3627                                    || (starts_with_space && ends_with_space && contains_backtick)
3628                            };
3629
3630                            let (num_backticks, needs_spaces) = if contains_backtick {
3631                                let max_consecutive = trimmed
3632                                    .chars()
3633                                    .fold((0, 0), |(max, current), c| {
3634                                        if c == '`' {
3635                                            let new_current = current + 1;
3636                                            (max.max(new_current), new_current)
3637                                        } else {
3638                                            (max, 0)
3639                                        }
3640                                    })
3641                                    .0;
3642                                let num = if max_consecutive == 1 { 2 } else { 1 };
3643                                (num, needs_delimiter_spaces)
3644                            } else {
3645                                (1, needs_delimiter_spaces)
3646                            };
3647
3648                            for _ in 0..num_backticks {
3649                                output.push('`');
3650                            }
3651                            if needs_spaces {
3652                                output.push(' ');
3653                            }
3654                            output.push_str(trimmed);
3655                            if needs_spaces {
3656                                output.push(' ');
3657                            }
3658                            for _ in 0..num_backticks {
3659                                output.push('`');
3660                            }
3661                        }
3662                    } else {
3663                        let children = tag.children();
3664                        {
3665                            for child_handle in children.top().iter() {
3666                                walk_node(child_handle, parser, output, options, &code_ctx, depth + 1, dom_ctx);
3667                            }
3668                        }
3669                    }
3670                }
3671
3672                "pre" => {
3673                    let code_ctx = Context {
3674                        in_code: true,
3675                        ..ctx.clone()
3676                    };
3677
3678                    let mut content = String::with_capacity(256);
3679                    let children = tag.children();
3680                    {
3681                        for child_handle in children.top().iter() {
3682                            walk_node(
3683                                child_handle,
3684                                parser,
3685                                &mut content,
3686                                options,
3687                                &code_ctx,
3688                                depth + 1,
3689                                dom_ctx,
3690                            );
3691                        }
3692                    }
3693
3694                    if !content.is_empty() {
3695                        let leading_newlines = content.chars().take_while(|&c| c == '\n').count();
3696                        let trailing_newlines = content.chars().rev().take_while(|&c| c == '\n').count();
3697                        let core = content.trim_matches('\n');
3698                        let is_whitespace_only = core.trim().is_empty();
3699
3700                        let processed_content = if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
3701                            content
3702                        } else {
3703                            let mut core_text = if leading_newlines > 0 {
3704                                dedent_code_block(core)
3705                            } else {
3706                                core.to_string()
3707                            };
3708
3709                            if is_whitespace_only {
3710                                let mut rebuilt = String::new();
3711                                for _ in 0..leading_newlines {
3712                                    rebuilt.push('\n');
3713                                }
3714                                rebuilt.push_str(&core_text);
3715                                for _ in 0..trailing_newlines {
3716                                    rebuilt.push('\n');
3717                                }
3718                                rebuilt
3719                            } else {
3720                                for _ in 0..trailing_newlines {
3721                                    core_text.push('\n');
3722                                }
3723                                core_text
3724                            }
3725                        };
3726
3727                        match options.code_block_style {
3728                            crate::options::CodeBlockStyle::Indented => {
3729                                if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3730                                    if output.ends_with('\n') {
3731                                        output.push('\n');
3732                                    } else {
3733                                        output.push_str("\n\n");
3734                                    }
3735                                }
3736
3737                                let indented = processed_content
3738                                    .lines()
3739                                    .map(|line| {
3740                                        if line.is_empty() {
3741                                            String::new()
3742                                        } else {
3743                                            format!("    {}", line)
3744                                        }
3745                                    })
3746                                    .collect::<Vec<_>>()
3747                                    .join("\n");
3748                                output.push_str(&indented);
3749
3750                                output.push_str("\n\n");
3751                            }
3752                            crate::options::CodeBlockStyle::Backticks | crate::options::CodeBlockStyle::Tildes => {
3753                                if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3754                                    if output.ends_with('\n') {
3755                                        output.push('\n');
3756                                    } else {
3757                                        output.push_str("\n\n");
3758                                    }
3759                                }
3760
3761                                let fence = if options.code_block_style == crate::options::CodeBlockStyle::Backticks {
3762                                    "```"
3763                                } else {
3764                                    "~~~"
3765                                };
3766
3767                                output.push_str(fence);
3768                                if !options.code_language.is_empty() {
3769                                    output.push_str(&options.code_language);
3770                                }
3771                                output.push('\n');
3772                                output.push_str(&processed_content);
3773                                output.push('\n');
3774                                output.push_str(fence);
3775                                output.push('\n');
3776                            }
3777                        }
3778                    }
3779                }
3780
3781                "blockquote" => {
3782                    if ctx.convert_as_inline {
3783                        let children = tag.children();
3784                        {
3785                            for child_handle in children.top().iter() {
3786                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3787                            }
3788                        }
3789                        return;
3790                    }
3791
3792                    let cite = tag
3793                        .attributes()
3794                        .get("cite")
3795                        .flatten()
3796                        .map(|v| v.as_utf8_str().to_string());
3797
3798                    let blockquote_ctx = Context {
3799                        blockquote_depth: ctx.blockquote_depth + 1,
3800                        ..ctx.clone()
3801                    };
3802                    let mut content = String::with_capacity(256);
3803                    let children = tag.children();
3804                    {
3805                        for child_handle in children.top().iter() {
3806                            walk_node(
3807                                child_handle,
3808                                parser,
3809                                &mut content,
3810                                options,
3811                                &blockquote_ctx,
3812                                depth + 1,
3813                                dom_ctx,
3814                            );
3815                        }
3816                    }
3817
3818                    let trimmed_content = content.trim();
3819
3820                    if !trimmed_content.is_empty() {
3821                        if ctx.blockquote_depth > 0 {
3822                            output.push_str("\n\n\n");
3823                        } else if !output.is_empty() {
3824                            if !output.ends_with('\n') {
3825                                output.push('\n');
3826                            } else if output.ends_with("\n\n") {
3827                                output.truncate(output.len() - 1);
3828                            }
3829                        }
3830
3831                        let prefix = "> ";
3832
3833                        for line in trimmed_content.lines() {
3834                            output.push_str(prefix);
3835                            output.push_str(line.trim());
3836                            output.push('\n');
3837                        }
3838
3839                        if let Some(url) = cite {
3840                            output.push('\n');
3841                            output.push_str("— <");
3842                            output.push_str(&url);
3843                            output.push_str(">\n\n");
3844                        }
3845
3846                        while output.ends_with('\n') {
3847                            output.truncate(output.len() - 1);
3848                        }
3849                    }
3850                }
3851
3852                "br" => {
3853                    if ctx.in_heading {
3854                        trim_trailing_whitespace(output);
3855                        output.push_str("  ");
3856                    } else {
3857                        use crate::options::NewlineStyle;
3858                        if output.is_empty() || output.ends_with('\n') {
3859                            output.push('\n');
3860                        } else {
3861                            match options.newline_style {
3862                                NewlineStyle::Spaces => output.push_str("  \n"),
3863                                NewlineStyle::Backslash => output.push_str("\\\n"),
3864                            }
3865                        }
3866                    }
3867                }
3868
3869                "hr" => {
3870                    if !output.is_empty() {
3871                        let prev_tag = get_previous_sibling_tag(node_handle, parser, dom_ctx);
3872                        let last_line_is_blockquote = output
3873                            .rsplit('\n')
3874                            .find(|line| !line.trim().is_empty())
3875                            .map(|line| line.trim_start().starts_with('>'))
3876                            .unwrap_or(false);
3877                        let needs_blank_line = !ctx.in_paragraph
3878                            && !matches!(prev_tag.as_deref(), Some("blockquote"))
3879                            && !last_line_is_blockquote;
3880
3881                        if options.debug {
3882                            eprintln!(
3883                                "[DEBUG] <hr> prev_tag={:?} needs_blank_line={} in_paragraph={}",
3884                                prev_tag, needs_blank_line, ctx.in_paragraph
3885                            );
3886                        }
3887
3888                        if ctx.in_paragraph || !needs_blank_line {
3889                            if !output.ends_with('\n') {
3890                                output.push('\n');
3891                            }
3892                        } else {
3893                            trim_trailing_whitespace(output);
3894                            if output.ends_with('\n') {
3895                                if !output.ends_with("\n\n") {
3896                                    output.push('\n');
3897                                }
3898                            } else {
3899                                output.push_str("\n\n");
3900                            }
3901                        }
3902                    }
3903                    output.push_str("---\n");
3904                }
3905
3906                "ul" => {
3907                    add_list_leading_separator(output, ctx);
3908
3909                    let nested_depth = calculate_list_nesting_depth(ctx);
3910                    let is_loose = is_loose_list(node_handle, parser);
3911
3912                    process_list_children(
3913                        node_handle,
3914                        parser,
3915                        output,
3916                        options,
3917                        ctx,
3918                        depth,
3919                        false,
3920                        is_loose,
3921                        nested_depth,
3922                        1,
3923                        dom_ctx,
3924                    );
3925
3926                    add_nested_list_trailing_separator(output, ctx);
3927                }
3928
3929                "ol" => {
3930                    add_list_leading_separator(output, ctx);
3931
3932                    let nested_depth = calculate_list_nesting_depth(ctx);
3933                    let is_loose = is_loose_list(node_handle, parser);
3934
3935                    let start = tag
3936                        .attributes()
3937                        .get("start")
3938                        .flatten()
3939                        .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
3940                        .unwrap_or(1);
3941
3942                    process_list_children(
3943                        node_handle,
3944                        parser,
3945                        output,
3946                        options,
3947                        ctx,
3948                        depth,
3949                        true,
3950                        is_loose,
3951                        nested_depth,
3952                        start,
3953                        dom_ctx,
3954                    );
3955
3956                    add_nested_list_trailing_separator(output, ctx);
3957                }
3958
3959                "li" => {
3960                    if ctx.list_depth > 0 {
3961                        let indent = match options.list_indent_type {
3962                            ListIndentType::Tabs => "\t".repeat(ctx.list_depth),
3963                            ListIndentType::Spaces => " ".repeat(ctx.list_depth * options.list_indent_width),
3964                        };
3965                        output.push_str(&indent);
3966                    }
3967
3968                    let mut has_block_children = false;
3969                    let children = tag.children();
3970                    {
3971                        for child_handle in children.top().iter() {
3972                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
3973                                let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
3974                                if matches!(
3975                                    tag_name.as_ref(),
3976                                    "p" | "div" | "blockquote" | "pre" | "table" | "hr" | "dl"
3977                                ) {
3978                                    has_block_children = true;
3979                                    break;
3980                                }
3981                            }
3982                        }
3983                    }
3984
3985                    fn find_checkbox<'a>(
3986                        node_handle: &tl::NodeHandle,
3987                        parser: &'a tl::Parser<'a>,
3988                    ) -> Option<(bool, tl::NodeHandle)> {
3989                        if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
3990                            if tag_name_eq(node_tag.name().as_utf8_str(), "input") {
3991                                let input_type = node_tag.attributes().get("type").flatten().map(|v| v.as_utf8_str());
3992
3993                                if input_type.as_deref() == Some("checkbox") {
3994                                    let checked = node_tag.attributes().get("checked").is_some();
3995                                    return Some((checked, *node_handle));
3996                                }
3997                            }
3998
3999                            let children = node_tag.children();
4000                            {
4001                                for child_handle in children.top().iter() {
4002                                    if let Some(result) = find_checkbox(child_handle, parser) {
4003                                        return Some(result);
4004                                    }
4005                                }
4006                            }
4007                        }
4008                        None
4009                    }
4010
4011                    let (is_task_list, task_checked, checkbox_node) =
4012                        if let Some((checked, node)) = find_checkbox(node_handle, parser) {
4013                            (true, checked, Some(node))
4014                        } else {
4015                            (false, false, None)
4016                        };
4017
4018                    let li_ctx = Context {
4019                        in_list_item: true,
4020                        list_depth: ctx.list_depth + 1,
4021                        ..ctx.clone()
4022                    };
4023
4024                    if is_task_list {
4025                        output.push('-');
4026                        output.push(' ');
4027                        output.push_str(if task_checked { "[x]" } else { "[ ]" });
4028
4029                        fn is_checkbox_node(node_handle: &tl::NodeHandle, checkbox: &Option<tl::NodeHandle>) -> bool {
4030                            if let Some(cb) = checkbox {
4031                                node_handle == cb
4032                            } else {
4033                                false
4034                            }
4035                        }
4036
4037                        fn contains_checkbox<'a>(
4038                            node_handle: &tl::NodeHandle,
4039                            parser: &'a tl::Parser<'a>,
4040                            checkbox: &Option<tl::NodeHandle>,
4041                        ) -> bool {
4042                            if is_checkbox_node(node_handle, checkbox) {
4043                                return true;
4044                            }
4045                            if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4046                                let children = node_tag.children();
4047                                {
4048                                    for child_handle in children.top().iter() {
4049                                        if contains_checkbox(child_handle, parser, checkbox) {
4050                                            return true;
4051                                        }
4052                                    }
4053                                }
4054                            }
4055                            false
4056                        }
4057
4058                        #[allow(clippy::too_many_arguments)]
4059                        fn render_li_content<'a>(
4060                            node_handle: &tl::NodeHandle,
4061                            parser: &'a tl::Parser<'a>,
4062                            output: &mut String,
4063                            options: &ConversionOptions,
4064                            ctx: &Context,
4065                            depth: usize,
4066                            checkbox: &Option<tl::NodeHandle>,
4067                            dom_ctx: &DomContext,
4068                        ) {
4069                            if is_checkbox_node(node_handle, checkbox) {
4070                                return;
4071                            }
4072
4073                            if contains_checkbox(node_handle, parser, checkbox) {
4074                                if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4075                                    let children = node_tag.children();
4076                                    {
4077                                        for child_handle in children.top().iter() {
4078                                            render_li_content(
4079                                                child_handle,
4080                                                parser,
4081                                                output,
4082                                                options,
4083                                                ctx,
4084                                                depth,
4085                                                checkbox,
4086                                                dom_ctx,
4087                                            );
4088                                        }
4089                                    }
4090                                }
4091                            } else {
4092                                walk_node(node_handle, parser, output, options, ctx, depth, dom_ctx);
4093                            }
4094                        }
4095
4096                        let mut task_text = String::new();
4097                        let children = tag.children();
4098                        {
4099                            for child_handle in children.top().iter() {
4100                                render_li_content(
4101                                    child_handle,
4102                                    parser,
4103                                    &mut task_text,
4104                                    options,
4105                                    &li_ctx,
4106                                    depth + 1,
4107                                    &checkbox_node,
4108                                    dom_ctx,
4109                                );
4110                            }
4111                        }
4112                        output.push(' ');
4113                        let trimmed_task = task_text.trim();
4114                        if !trimmed_task.is_empty() {
4115                            output.push_str(trimmed_task);
4116                        }
4117                    } else {
4118                        if !ctx.in_table_cell {
4119                            if ctx.in_ordered_list {
4120                                output.push_str(&format!("{}. ", ctx.list_counter));
4121                            } else {
4122                                let bullets: Vec<char> = options.bullets.chars().collect();
4123                                let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
4124                                let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
4125                                output.push(bullet);
4126                                output.push(' ');
4127                            }
4128                        }
4129
4130                        let children = tag.children();
4131                        {
4132                            for child_handle in children.top().iter() {
4133                                walk_node(child_handle, parser, output, options, &li_ctx, depth + 1, dom_ctx);
4134                            }
4135                        }
4136
4137                        trim_trailing_whitespace(output);
4138                    }
4139
4140                    if !ctx.in_table_cell {
4141                        if has_block_children || ctx.loose_list || ctx.prev_item_had_blocks {
4142                            if !output.ends_with("\n\n") {
4143                                if output.ends_with('\n') {
4144                                    output.push('\n');
4145                                } else {
4146                                    output.push_str("\n\n");
4147                                }
4148                            }
4149                        } else if !output.ends_with('\n') {
4150                            output.push('\n');
4151                        }
4152                    }
4153                }
4154
4155                "table" => {
4156                    let mut table_output = String::new();
4157                    convert_table(node_handle, parser, &mut table_output, options, ctx, dom_ctx);
4158
4159                    if ctx.in_list_item {
4160                        let has_caption = table_output.starts_with('*');
4161
4162                        if !has_caption {
4163                            trim_trailing_whitespace(output);
4164                            if !output.is_empty() && !output.ends_with('\n') {
4165                                output.push('\n');
4166                            }
4167                        }
4168
4169                        let indented = indent_table_for_list(&table_output, ctx.list_depth, options);
4170                        output.push_str(&indented);
4171                    } else {
4172                        if !output.ends_with("\n\n") {
4173                            if output.is_empty() || !output.ends_with('\n') {
4174                                output.push_str("\n\n");
4175                            } else {
4176                                output.push('\n');
4177                            }
4178                        }
4179                        output.push_str(&table_output);
4180                    }
4181
4182                    if !output.ends_with('\n') {
4183                        output.push('\n');
4184                    }
4185                }
4186
4187                "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" => {}
4188
4189                "caption" => {
4190                    let mut text = String::new();
4191                    let children = tag.children();
4192                    {
4193                        for child_handle in children.top().iter() {
4194                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4195                        }
4196                    }
4197                    let text = text.trim();
4198                    if !text.is_empty() {
4199                        let escaped_text = text.replace('-', r"\-");
4200                        output.push('*');
4201                        output.push_str(&escaped_text);
4202                        output.push_str("*\n\n");
4203                    }
4204                }
4205
4206                "colgroup" | "col" => {}
4207
4208                "article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
4209                    if ctx.convert_as_inline {
4210                        let children = tag.children();
4211                        {
4212                            for child_handle in children.top().iter() {
4213                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4214                            }
4215                        }
4216                        return;
4217                    }
4218
4219                    let mut content = String::with_capacity(256);
4220                    let children = tag.children();
4221                    {
4222                        for child_handle in children.top().iter() {
4223                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4224                        }
4225                    }
4226                    if content.trim().is_empty() {
4227                        return;
4228                    }
4229
4230                    if !output.is_empty() && !output.ends_with("\n\n") {
4231                        output.push_str("\n\n");
4232                    }
4233                    output.push_str(&content);
4234                    if content.ends_with('\n') && !content.ends_with("\n\n") {
4235                        output.push('\n');
4236                    } else if !content.ends_with('\n') {
4237                        output.push_str("\n\n");
4238                    }
4239                }
4240
4241                "figure" => {
4242                    if ctx.convert_as_inline {
4243                        let children = tag.children();
4244                        {
4245                            for child_handle in children.top().iter() {
4246                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4247                            }
4248                        }
4249                        return;
4250                    }
4251
4252                    if !output.is_empty() && !output.ends_with("\n\n") {
4253                        output.push_str("\n\n");
4254                    }
4255
4256                    let mut figure_content = String::new();
4257                    let children = tag.children();
4258                    {
4259                        for child_handle in children.top().iter() {
4260                            walk_node(child_handle, parser, &mut figure_content, options, ctx, depth, dom_ctx);
4261                        }
4262                    }
4263
4264                    figure_content = figure_content.replace("\n![", "![");
4265                    figure_content = figure_content.replace(" ![", "![");
4266
4267                    let trimmed = figure_content.trim_matches(|c| c == '\n' || c == ' ' || c == '\t');
4268                    if !trimmed.is_empty() {
4269                        output.push_str(trimmed);
4270                        if !output.ends_with('\n') {
4271                            output.push('\n');
4272                        }
4273                        if !output.ends_with("\n\n") {
4274                            output.push('\n');
4275                        }
4276                    }
4277                }
4278
4279                "figcaption" => {
4280                    let mut text = String::new();
4281                    let children = tag.children();
4282                    {
4283                        for child_handle in children.top().iter() {
4284                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4285                        }
4286                    }
4287                    let text = text.trim();
4288                    if !text.is_empty() {
4289                        if !output.is_empty() {
4290                            if output.ends_with("```\n") {
4291                                output.push('\n');
4292                            } else {
4293                                trim_trailing_whitespace(output);
4294                                if output.ends_with('\n') && !output.ends_with("\n\n") {
4295                                    output.push('\n');
4296                                } else if !output.ends_with('\n') {
4297                                    output.push_str("\n\n");
4298                                }
4299                            }
4300                        }
4301                        output.push('*');
4302                        output.push_str(text);
4303                        output.push_str("*\n\n");
4304                    }
4305                }
4306
4307                "hgroup" => {
4308                    let children = tag.children();
4309                    {
4310                        for child_handle in children.top().iter() {
4311                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4312                        }
4313                    }
4314                }
4315
4316                "cite" => {
4317                    let mut content = String::with_capacity(32);
4318                    let children = tag.children();
4319                    {
4320                        for child_handle in children.top().iter() {
4321                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4322                        }
4323                    }
4324                    let trimmed = content.trim();
4325                    if !trimmed.is_empty() {
4326                        if ctx.convert_as_inline {
4327                            output.push_str(trimmed);
4328                        } else {
4329                            output.push('*');
4330                            output.push_str(trimmed);
4331                            output.push('*');
4332                        }
4333                    }
4334                }
4335
4336                "q" => {
4337                    let mut content = String::with_capacity(32);
4338                    let children = tag.children();
4339                    {
4340                        for child_handle in children.top().iter() {
4341                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4342                        }
4343                    }
4344                    let trimmed = content.trim();
4345                    if !trimmed.is_empty() {
4346                        if ctx.convert_as_inline {
4347                            output.push_str(trimmed);
4348                        } else {
4349                            output.push('"');
4350                            let escaped = trimmed.replace('\\', r"\\").replace('"', r#"\""#);
4351                            output.push_str(&escaped);
4352                            output.push('"');
4353                        }
4354                    }
4355                }
4356
4357                "dl" => {
4358                    if ctx.convert_as_inline {
4359                        let children = tag.children();
4360                        {
4361                            for child_handle in children.top().iter() {
4362                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4363                            }
4364                        }
4365                        return;
4366                    }
4367
4368                    let mut content = String::new();
4369                    let mut in_dt_group = false;
4370                    let children = tag.children();
4371                    {
4372                        for child_handle in children.top().iter() {
4373                            let (is_dt, is_dd) = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4374                                let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
4375                                (tag_name == "dt", tag_name == "dd")
4376                            } else {
4377                                (false, false)
4378                            };
4379
4380                            let child_ctx = Context {
4381                                last_was_dt: in_dt_group && is_dd,
4382                                ..ctx.clone()
4383                            };
4384                            walk_node(child_handle, parser, &mut content, options, &child_ctx, depth, dom_ctx);
4385
4386                            if is_dt {
4387                                in_dt_group = true;
4388                            } else if !is_dd {
4389                                in_dt_group = false;
4390                            }
4391                        }
4392                    }
4393
4394                    let trimmed = content.trim();
4395                    if !trimmed.is_empty() {
4396                        if !output.is_empty() && !output.ends_with("\n\n") {
4397                            output.push_str("\n\n");
4398                        }
4399                        output.push_str(trimmed);
4400                        output.push_str("\n\n");
4401                    }
4402                }
4403
4404                "dt" => {
4405                    let mut content = String::with_capacity(64);
4406                    let children = tag.children();
4407                    {
4408                        for child_handle in children.top().iter() {
4409                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4410                        }
4411                    }
4412                    let trimmed = content.trim();
4413                    if !trimmed.is_empty() {
4414                        if ctx.convert_as_inline {
4415                            output.push_str(trimmed);
4416                        } else {
4417                            output.push_str(trimmed);
4418                            output.push('\n');
4419                        }
4420                    }
4421                }
4422
4423                "dd" => {
4424                    let mut content = String::with_capacity(128);
4425                    let children = tag.children();
4426                    {
4427                        for child_handle in children.top().iter() {
4428                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4429                        }
4430                    }
4431
4432                    let trimmed = content.trim();
4433
4434                    if ctx.convert_as_inline {
4435                        if !trimmed.is_empty() {
4436                            output.push_str(trimmed);
4437                        }
4438                    } else if ctx.last_was_dt {
4439                        if !trimmed.is_empty() {
4440                            output.push_str(":   ");
4441                            output.push_str(trimmed);
4442                            output.push_str("\n\n");
4443                        } else {
4444                            output.push_str(":   \n\n");
4445                        }
4446                    } else if !trimmed.is_empty() {
4447                        output.push_str(trimmed);
4448                        output.push_str("\n\n");
4449                    }
4450                }
4451
4452                "details" => {
4453                    if ctx.convert_as_inline {
4454                        let children = tag.children();
4455                        {
4456                            for child_handle in children.top().iter() {
4457                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4458                            }
4459                        }
4460                        return;
4461                    }
4462
4463                    let mut content = String::with_capacity(256);
4464                    let children = tag.children();
4465                    {
4466                        for child_handle in children.top().iter() {
4467                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4468                        }
4469                    }
4470                    let trimmed = content.trim();
4471                    if !trimmed.is_empty() {
4472                        if !output.is_empty() && !output.ends_with("\n\n") {
4473                            output.push_str("\n\n");
4474                        }
4475                        output.push_str(trimmed);
4476                        output.push_str("\n\n");
4477                    }
4478                }
4479
4480                "summary" => {
4481                    let mut content = String::with_capacity(64);
4482                    let mut summary_ctx = ctx.clone();
4483                    if !ctx.convert_as_inline {
4484                        summary_ctx.in_strong = true;
4485                    }
4486                    let children = tag.children();
4487                    {
4488                        for child_handle in children.top().iter() {
4489                            walk_node(
4490                                child_handle,
4491                                parser,
4492                                &mut content,
4493                                options,
4494                                &summary_ctx,
4495                                depth + 1,
4496                                dom_ctx,
4497                            );
4498                        }
4499                    }
4500                    let trimmed = content.trim();
4501                    if !trimmed.is_empty() {
4502                        if ctx.convert_as_inline {
4503                            output.push_str(trimmed);
4504                        } else {
4505                            let symbol = options.strong_em_symbol.to_string().repeat(2);
4506                            output.push_str(&symbol);
4507                            output.push_str(trimmed);
4508                            output.push_str(&symbol);
4509                            output.push_str("\n\n");
4510                        }
4511                    }
4512                }
4513
4514                "dialog" => {
4515                    if ctx.convert_as_inline {
4516                        let children = tag.children();
4517                        {
4518                            for child_handle in children.top().iter() {
4519                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4520                            }
4521                        }
4522                        return;
4523                    }
4524
4525                    let content_start = output.len();
4526
4527                    let children = tag.children();
4528                    {
4529                        for child_handle in children.top().iter() {
4530                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4531                        }
4532                    }
4533
4534                    while output.len() > content_start && (output.ends_with(' ') || output.ends_with('\t')) {
4535                        output.pop();
4536                    }
4537
4538                    if output.len() > content_start && !output.ends_with("\n\n") {
4539                        output.push_str("\n\n");
4540                    }
4541                }
4542
4543                "menu" => {
4544                    let content_start = output.len();
4545
4546                    let menu_options = ConversionOptions {
4547                        bullets: "-".to_string(),
4548                        ..options.clone()
4549                    };
4550
4551                    let list_ctx = Context {
4552                        in_ordered_list: false,
4553                        list_counter: 0,
4554                        in_list: true,
4555                        list_depth: ctx.list_depth,
4556                        ..ctx.clone()
4557                    };
4558
4559                    let children = tag.children();
4560                    {
4561                        for child_handle in children.top().iter() {
4562                            walk_node(child_handle, parser, output, &menu_options, &list_ctx, depth, dom_ctx);
4563                        }
4564                    }
4565
4566                    if !ctx.convert_as_inline && output.len() > content_start {
4567                        if !output.ends_with("\n\n") {
4568                            if output.ends_with('\n') {
4569                                output.push('\n');
4570                            } else {
4571                                output.push_str("\n\n");
4572                            }
4573                        }
4574                    } else if ctx.convert_as_inline {
4575                        while output.ends_with('\n') {
4576                            output.pop();
4577                        }
4578                    }
4579                }
4580
4581                "audio" => {
4582                    use std::borrow::Cow;
4583
4584                    let src = tag
4585                        .attributes()
4586                        .get("src")
4587                        .flatten()
4588                        .map(|v| v.as_utf8_str())
4589                        .or_else(|| {
4590                            let children = tag.children();
4591                            {
4592                                for child_handle in children.top().iter() {
4593                                    if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4594                                        if tag_name_eq(child_tag.name().as_utf8_str(), "source") {
4595                                            return child_tag
4596                                                .attributes()
4597                                                .get("src")
4598                                                .flatten()
4599                                                .map(|v| v.as_utf8_str());
4600                                        }
4601                                    }
4602                                }
4603                            }
4604                            None
4605                        })
4606                        .unwrap_or(Cow::Borrowed(""));
4607
4608                    if !src.is_empty() {
4609                        output.push('[');
4610                        output.push_str(&src);
4611                        output.push_str("](");
4612                        output.push_str(&src);
4613                        output.push(')');
4614                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4615                            output.push_str("\n\n");
4616                        }
4617                    }
4618
4619                    let mut fallback = String::new();
4620                    let children = tag.children();
4621                    {
4622                        for child_handle in children.top().iter() {
4623                            let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4624                                tag_name_eq(child_tag.name().as_utf8_str(), "source")
4625                            } else {
4626                                false
4627                            };
4628
4629                            if !is_source {
4630                                walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4631                            }
4632                        }
4633                    }
4634                    if !fallback.is_empty() {
4635                        output.push_str(fallback.trim());
4636                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4637                            output.push_str("\n\n");
4638                        }
4639                    }
4640                }
4641
4642                "video" => {
4643                    use std::borrow::Cow;
4644
4645                    let src = tag
4646                        .attributes()
4647                        .get("src")
4648                        .flatten()
4649                        .map(|v| v.as_utf8_str())
4650                        .or_else(|| {
4651                            let children = tag.children();
4652                            {
4653                                for child_handle in children.top().iter() {
4654                                    if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4655                                        if tag_name_eq(child_tag.name().as_utf8_str(), "source") {
4656                                            return child_tag
4657                                                .attributes()
4658                                                .get("src")
4659                                                .flatten()
4660                                                .map(|v| v.as_utf8_str());
4661                                        }
4662                                    }
4663                                }
4664                            }
4665                            None
4666                        })
4667                        .unwrap_or(Cow::Borrowed(""));
4668
4669                    if !src.is_empty() {
4670                        output.push('[');
4671                        output.push_str(&src);
4672                        output.push_str("](");
4673                        output.push_str(&src);
4674                        output.push(')');
4675                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4676                            output.push_str("\n\n");
4677                        }
4678                    }
4679
4680                    let mut fallback = String::new();
4681                    let children = tag.children();
4682                    {
4683                        for child_handle in children.top().iter() {
4684                            let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4685                                tag_name_eq(child_tag.name().as_utf8_str(), "source")
4686                            } else {
4687                                false
4688                            };
4689
4690                            if !is_source {
4691                                walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4692                            }
4693                        }
4694                    }
4695                    if !fallback.is_empty() {
4696                        output.push_str(fallback.trim());
4697                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4698                            output.push_str("\n\n");
4699                        }
4700                    }
4701                }
4702
                // `<source>` emits nothing when visited on its own; the `audio`/`video` arms
                // read its `src` attribute directly from their children.
                "source" => {}
4704
4705                "picture" => {
4706                    let children = tag.children();
4707                    {
4708                        for child_handle in children.top().iter() {
4709                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4710                                if tag_name_eq(child_tag.name().as_utf8_str(), "img") {
4711                                    walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4712                                    break;
4713                                }
4714                            }
4715                        }
4716                    }
4717                }
4718
4719                "iframe" => {
4720                    use std::borrow::Cow;
4721
4722                    let src = tag
4723                        .attributes()
4724                        .get("src")
4725                        .flatten()
4726                        .map(|v| v.as_utf8_str())
4727                        .unwrap_or(Cow::Borrowed(""));
4728
4729                    if !src.is_empty() {
4730                        output.push('[');
4731                        output.push_str(&src);
4732                        output.push_str("](");
4733                        output.push_str(&src);
4734                        output.push(')');
4735                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4736                            output.push_str("\n\n");
4737                        }
4738                    }
4739                }
4740
4741                "svg" => {
4742                    let mut title = String::from("SVG Image");
4743                    let children = tag.children();
4744                    {
4745                        for child_handle in children.top().iter() {
4746                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4747                                if tag_name_eq(child_tag.name().as_utf8_str(), "title") {
4748                                    title = get_text_content(child_handle, parser).trim().to_string();
4749                                    break;
4750                                }
4751                            }
4752                        }
4753                    }
4754
4755                    #[cfg(feature = "inline-images")]
4756                    if let Some(ref collector_ref) = ctx.inline_collector {
4757                        let title_opt = if title == "SVG Image" {
4758                            None
4759                        } else {
4760                            Some(title.clone())
4761                        };
4762                        let mut attributes_map = BTreeMap::new();
4763                        for (key, value_opt) in tag.attributes().iter() {
4764                            let key_str = key.to_string();
4765                            let keep = key_str == "width"
4766                                || key_str == "height"
4767                                || key_str == "filename"
4768                                || key_str == "aria-label"
4769                                || key_str.starts_with("data-");
4770                            if keep {
4771                                let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
4772                                attributes_map.insert(key_str, value);
4773                            }
4774                        }
4775                        handle_inline_svg(collector_ref, node_handle, parser, title_opt, attributes_map);
4776                    }
4777
4778                    if ctx.convert_as_inline {
4779                        output.push_str(&title);
4780                    } else {
4781                        use base64::{Engine as _, engine::general_purpose::STANDARD};
4782
4783                        let svg_html = serialize_element(node_handle, parser);
4784
4785                        let base64_svg = STANDARD.encode(svg_html.as_bytes());
4786
4787                        output.push_str("![");
4788                        output.push_str(&title);
4789                        output.push_str("](data:image/svg+xml;base64,");
4790                        output.push_str(&base64_svg);
4791                        output.push(')');
4792                    }
4793                }
4794
4795                "math" => {
4796                    let text_content = get_text_content(node_handle, parser).trim().to_string();
4797
4798                    if text_content.is_empty() {
4799                        return;
4800                    }
4801
4802                    let math_html = serialize_element(node_handle, parser);
4803
4804                    let escaped_text = text::escape(
4805                        &text_content,
4806                        options.escape_misc,
4807                        options.escape_asterisks,
4808                        options.escape_underscores,
4809                        options.escape_ascii,
4810                    );
4811
4812                    let is_display_block = tag
4813                        .attributes()
4814                        .get("display")
4815                        .flatten()
4816                        .map(|v| v.as_utf8_str() == "block")
4817                        .unwrap_or(false);
4818
4819                    if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4820                        output.push_str("\n\n");
4821                    }
4822
4823                    output.push_str("<!-- MathML: ");
4824                    output.push_str(&math_html);
4825                    output.push_str(" --> ");
4826                    output.push_str(&escaped_text);
4827
4828                    if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4829                        output.push_str("\n\n");
4830                    }
4831                }
4832
4833                "form" => {
4834                    if ctx.convert_as_inline {
4835                        let children = tag.children();
4836                        {
4837                            for child_handle in children.top().iter() {
4838                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4839                            }
4840                        }
4841                        return;
4842                    }
4843
4844                    let mut content = String::new();
4845                    let children = tag.children();
4846                    {
4847                        for child_handle in children.top().iter() {
4848                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4849                        }
4850                    }
4851                    let trimmed = content.trim();
4852                    if !trimmed.is_empty() {
4853                        if !output.is_empty() && !output.ends_with("\n\n") {
4854                            output.push_str("\n\n");
4855                        }
4856                        output.push_str(trimmed);
4857                        output.push_str("\n\n");
4858                    }
4859                }
4860
4861                "fieldset" => {
4862                    if ctx.convert_as_inline {
4863                        let children = tag.children();
4864                        {
4865                            for child_handle in children.top().iter() {
4866                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4867                            }
4868                        }
4869                        return;
4870                    }
4871                    let mut content = String::new();
4872                    let children = tag.children();
4873                    {
4874                        for child_handle in children.top().iter() {
4875                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4876                        }
4877                    }
4878                    let trimmed = content.trim();
4879                    if !trimmed.is_empty() {
4880                        if !output.is_empty() && !output.ends_with("\n\n") {
4881                            output.push_str("\n\n");
4882                        }
4883                        output.push_str(trimmed);
4884                        output.push_str("\n\n");
4885                    }
4886                }
4887
4888                "legend" => {
4889                    let mut content = String::new();
4890                    let mut legend_ctx = ctx.clone();
4891                    if !ctx.convert_as_inline {
4892                        legend_ctx.in_strong = true;
4893                    }
4894                    let children = tag.children();
4895                    {
4896                        for child_handle in children.top().iter() {
4897                            walk_node(
4898                                child_handle,
4899                                parser,
4900                                &mut content,
4901                                options,
4902                                &legend_ctx,
4903                                depth + 1,
4904                                dom_ctx,
4905                            );
4906                        }
4907                    }
4908                    let trimmed = content.trim();
4909                    if !trimmed.is_empty() {
4910                        if ctx.convert_as_inline {
4911                            output.push_str(trimmed);
4912                        } else {
4913                            let symbol = options.strong_em_symbol.to_string().repeat(2);
4914                            output.push_str(&symbol);
4915                            output.push_str(trimmed);
4916                            output.push_str(&symbol);
4917                            output.push_str("\n\n");
4918                        }
4919                    }
4920                }
4921
4922                "label" => {
4923                    let mut content = String::new();
4924                    let children = tag.children();
4925                    {
4926                        for child_handle in children.top().iter() {
4927                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4928                        }
4929                    }
4930                    let trimmed = content.trim();
4931                    if !trimmed.is_empty() {
4932                        output.push_str(trimmed);
4933                        if !ctx.convert_as_inline {
4934                            output.push_str("\n\n");
4935                        }
4936                    }
4937                }
4938
                // `<input>` is matched explicitly and contributes nothing to the output.
                "input" => {}
4940
4941                "textarea" => {
4942                    let start_len = output.len();
4943                    let children = tag.children();
4944                    {
4945                        for child_handle in children.top().iter() {
4946                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4947                        }
4948                    }
4949
4950                    if !ctx.convert_as_inline && output.len() > start_len {
4951                        output.push_str("\n\n");
4952                    }
4953                }
4954
4955                "select" => {
4956                    let start_len = output.len();
4957                    let children = tag.children();
4958                    {
4959                        for child_handle in children.top().iter() {
4960                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4961                        }
4962                    }
4963
4964                    if !ctx.convert_as_inline && output.len() > start_len {
4965                        output.push('\n');
4966                    }
4967                }
4968
4969                "option" => {
4970                    let selected = tag.attributes().iter().any(|(name, _)| name.as_ref() == "selected");
4971
4972                    let mut text = String::new();
4973                    let children = tag.children();
4974                    {
4975                        for child_handle in children.top().iter() {
4976                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4977                        }
4978                    }
4979                    let trimmed = text.trim();
4980                    if !trimmed.is_empty() {
4981                        if selected && !ctx.convert_as_inline {
4982                            output.push_str("* ");
4983                        }
4984                        output.push_str(trimmed);
4985                        if !ctx.convert_as_inline {
4986                            output.push('\n');
4987                        }
4988                    }
4989                }
4990
4991                "optgroup" => {
4992                    use std::borrow::Cow;
4993
4994                    let label = tag
4995                        .attributes()
4996                        .get("label")
4997                        .flatten()
4998                        .map(|v| v.as_utf8_str())
4999                        .unwrap_or(Cow::Borrowed(""));
5000
5001                    if !label.is_empty() {
5002                        let symbol = options.strong_em_symbol.to_string().repeat(2);
5003                        output.push_str(&symbol);
5004                        output.push_str(&label);
5005                        output.push_str(&symbol);
5006                        output.push('\n');
5007                    }
5008
5009                    let children = tag.children();
5010                    {
5011                        for child_handle in children.top().iter() {
5012                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5013                        }
5014                    }
5015                }
5016
5017                "button" => {
5018                    let start_len = output.len();
5019                    let children = tag.children();
5020                    {
5021                        for child_handle in children.top().iter() {
5022                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5023                        }
5024                    }
5025
5026                    if !ctx.convert_as_inline && output.len() > start_len {
5027                        output.push_str("\n\n");
5028                    }
5029                }
5030
5031                "progress" => {
5032                    let start_len = output.len();
5033                    let children = tag.children();
5034                    {
5035                        for child_handle in children.top().iter() {
5036                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5037                        }
5038                    }
5039
5040                    if !ctx.convert_as_inline && output.len() > start_len {
5041                        output.push_str("\n\n");
5042                    }
5043                }
5044
5045                "meter" => {
5046                    let start_len = output.len();
5047                    let children = tag.children();
5048                    {
5049                        for child_handle in children.top().iter() {
5050                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5051                        }
5052                    }
5053
5054                    if !ctx.convert_as_inline && output.len() > start_len {
5055                        output.push_str("\n\n");
5056                    }
5057                }
5058
5059                "output" => {
5060                    let start_len = output.len();
5061                    let children = tag.children();
5062                    {
5063                        for child_handle in children.top().iter() {
5064                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5065                        }
5066                    }
5067
5068                    if !ctx.convert_as_inline && output.len() > start_len {
5069                        output.push_str("\n\n");
5070                    }
5071                }
5072
5073                "datalist" => {
5074                    let start_len = output.len();
5075                    let children = tag.children();
5076                    {
5077                        for child_handle in children.top().iter() {
5078                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5079                        }
5080                    }
5081
5082                    if !ctx.convert_as_inline && output.len() > start_len {
5083                        output.push('\n');
5084                    }
5085                }
5086
5087                "ruby" => {
5088                    let ruby_ctx = ctx.clone();
5089
5090                    let tag_sequence: Vec<String> = tag
5091                        .children()
5092                        .top()
5093                        .iter()
5094                        .filter_map(|child_handle| {
5095                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5096                                let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5097                                if matches!(tag_name.as_ref(), "rb" | "rt" | "rtc") {
5098                                    Some(tag_name.into_owned())
5099                                } else {
5100                                    None
5101                                }
5102                            } else {
5103                                None
5104                            }
5105                        })
5106                        .collect();
5107
5108                    let has_rtc = tag_sequence.iter().any(|tag| tag == "rtc");
5109
5110                    let is_interleaved = tag_sequence.windows(2).any(|w| w[0] == "rb" && w[1] == "rt");
5111
5112                    if is_interleaved && !has_rtc {
5113                        let mut current_base = String::new();
5114                        let children = tag.children();
5115                        {
5116                            for child_handle in children.top().iter() {
5117                                if let Some(node) = child_handle.get(parser) {
5118                                    match node {
5119                                        tl::Node::Tag(child_tag) => {
5120                                            let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5121                                            if tag_name == "rt" {
5122                                                let mut annotation = String::new();
5123                                                walk_node(
5124                                                    child_handle,
5125                                                    parser,
5126                                                    &mut annotation,
5127                                                    options,
5128                                                    &ruby_ctx,
5129                                                    depth,
5130                                                    dom_ctx,
5131                                                );
5132                                                if !current_base.is_empty() {
5133                                                    output.push_str(current_base.trim());
5134                                                    current_base.clear();
5135                                                }
5136                                                output.push_str(annotation.trim());
5137                                            } else if tag_name == "rb" {
5138                                                if !current_base.is_empty() {
5139                                                    output.push_str(current_base.trim());
5140                                                    current_base.clear();
5141                                                }
5142                                                walk_node(
5143                                                    child_handle,
5144                                                    parser,
5145                                                    &mut current_base,
5146                                                    options,
5147                                                    &ruby_ctx,
5148                                                    depth,
5149                                                    dom_ctx,
5150                                                );
5151                                            } else if tag_name != "rp" {
5152                                                walk_node(
5153                                                    child_handle,
5154                                                    parser,
5155                                                    &mut current_base,
5156                                                    options,
5157                                                    &ruby_ctx,
5158                                                    depth,
5159                                                    dom_ctx,
5160                                                );
5161                                            }
5162                                        }
5163                                        tl::Node::Raw(_) => {
5164                                            walk_node(
5165                                                child_handle,
5166                                                parser,
5167                                                &mut current_base,
5168                                                options,
5169                                                &ruby_ctx,
5170                                                depth,
5171                                                dom_ctx,
5172                                            );
5173                                        }
5174                                        _ => {}
5175                                    }
5176                                }
5177                            }
5178                        }
5179                        if !current_base.is_empty() {
5180                            output.push_str(current_base.trim());
5181                        }
5182                    } else {
5183                        let mut base_text = String::new();
5184                        let mut rt_annotations = Vec::new();
5185                        let mut rtc_content = String::new();
5186
5187                        let children = tag.children();
5188                        {
5189                            for child_handle in children.top().iter() {
5190                                if let Some(node) = child_handle.get(parser) {
5191                                    match node {
5192                                        tl::Node::Tag(child_tag) => {
5193                                            let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5194                                            if tag_name == "rt" {
5195                                                let mut annotation = String::new();
5196                                                walk_node(
5197                                                    child_handle,
5198                                                    parser,
5199                                                    &mut annotation,
5200                                                    options,
5201                                                    &ruby_ctx,
5202                                                    depth,
5203                                                    dom_ctx,
5204                                                );
5205                                                rt_annotations.push(annotation);
5206                                            } else if tag_name == "rtc" {
5207                                                walk_node(
5208                                                    child_handle,
5209                                                    parser,
5210                                                    &mut rtc_content,
5211                                                    options,
5212                                                    &ruby_ctx,
5213                                                    depth,
5214                                                    dom_ctx,
5215                                                );
5216                                            } else if tag_name != "rp" {
5217                                                walk_node(
5218                                                    child_handle,
5219                                                    parser,
5220                                                    &mut base_text,
5221                                                    options,
5222                                                    &ruby_ctx,
5223                                                    depth,
5224                                                    dom_ctx,
5225                                                );
5226                                            }
5227                                        }
5228                                        tl::Node::Raw(_) => {
5229                                            walk_node(
5230                                                child_handle,
5231                                                parser,
5232                                                &mut base_text,
5233                                                options,
5234                                                &ruby_ctx,
5235                                                depth,
5236                                                dom_ctx,
5237                                            );
5238                                        }
5239                                        _ => {}
5240                                    }
5241                                }
5242                            }
5243                        }
5244
5245                        let trimmed_base = base_text.trim();
5246
5247                        output.push_str(trimmed_base);
5248
5249                        if !rt_annotations.is_empty() {
5250                            let rt_text = rt_annotations.iter().map(|s| s.trim()).collect::<Vec<_>>().join("");
5251                            if !rt_text.is_empty() {
5252                                if has_rtc && !rtc_content.trim().is_empty() && rt_annotations.len() > 1 {
5253                                    output.push('(');
5254                                    output.push_str(&rt_text);
5255                                    output.push(')');
5256                                } else {
5257                                    output.push_str(&rt_text);
5258                                }
5259                            }
5260                        }
5261
5262                        if !rtc_content.trim().is_empty() {
5263                            output.push_str(rtc_content.trim());
5264                        }
5265                    }
5266                }
5267
5268                "rb" => {
5269                    let mut text = String::new();
5270                    let children = tag.children();
5271                    {
5272                        for child_handle in children.top().iter() {
5273                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5274                        }
5275                    }
5276                    output.push_str(text.trim());
5277                }
5278
5279                "rt" => {
5280                    let mut text = String::new();
5281                    let children = tag.children();
5282                    {
5283                        for child_handle in children.top().iter() {
5284                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5285                        }
5286                    }
5287                    let trimmed = text.trim();
5288
5289                    if output.ends_with('(') {
5290                        output.push_str(trimmed);
5291                    } else {
5292                        output.push('(');
5293                        output.push_str(trimmed);
5294                        output.push(')');
5295                    }
5296                }
5297
5298                "rp" => {
5299                    let mut content = String::new();
5300                    let children = tag.children();
5301                    {
5302                        for child_handle in children.top().iter() {
5303                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
5304                        }
5305                    }
5306                    let trimmed = content.trim();
5307                    if !trimmed.is_empty() {
5308                        output.push_str(trimmed);
5309                    }
5310                }
5311
5312                "rtc" => {
5313                    let children = tag.children();
5314                    {
5315                        for child_handle in children.top().iter() {
5316                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5317                        }
5318                    }
5319                }
5320
5321                "div" => {
5322                    if ctx.convert_as_inline {
5323                        let children = tag.children();
5324                        {
5325                            for child_handle in children.top().iter() {
5326                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5327                            }
5328                        }
5329                        return;
5330                    }
5331
5332                    let content_start_pos = output.len();
5333
5334                    let is_table_continuation =
5335                        ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
5336
5337                    let is_list_continuation = ctx.in_list_item
5338                        && !output.is_empty()
5339                        && !output.ends_with("* ")
5340                        && !output.ends_with("- ")
5341                        && !output.ends_with(". ");
5342
5343                    let needs_leading_sep = !ctx.in_table_cell
5344                        && !ctx.in_list_item
5345                        && !ctx.convert_as_inline
5346                        && !output.is_empty()
5347                        && !output.ends_with("\n\n");
5348
5349                    if is_table_continuation {
5350                        trim_trailing_whitespace(output);
5351                        output.push_str("<br>");
5352                    } else if is_list_continuation {
5353                        add_list_continuation_indent(output, ctx.list_depth, false, options);
5354                    } else if needs_leading_sep {
5355                        trim_trailing_whitespace(output);
5356                        output.push_str("\n\n");
5357                    }
5358
5359                    let children = tag.children();
5360                    {
5361                        for child_handle in children.top().iter() {
5362                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5363                        }
5364                    }
5365
5366                    let has_content = output.len() > content_start_pos;
5367
5368                    if has_content {
5369                        if content_start_pos == 0 && output.starts_with('\n') && !output.starts_with("\n\n") {
5370                            output.remove(0);
5371                        }
5372                        trim_trailing_whitespace(output);
5373
5374                        if ctx.in_table_cell {
5375                        } else if ctx.in_list_item {
5376                            if is_list_continuation {
5377                                if !output.ends_with('\n') {
5378                                    output.push('\n');
5379                                }
5380                            } else if !output.ends_with("\n\n") {
5381                                if output.ends_with('\n') {
5382                                    output.push('\n');
5383                                } else {
5384                                    output.push_str("\n\n");
5385                                }
5386                            }
5387                        } else if !ctx.in_list_item && !ctx.convert_as_inline {
5388                            if output.ends_with("\n\n") {
5389                            } else if output.ends_with('\n') {
5390                                output.push('\n');
5391                            } else {
5392                                output.push_str("\n\n");
5393                            }
5394                        }
5395                    }
5396                }
5397
5398                "head" => {
5399                    let children = tag.children();
5400                    let has_body_like = children.top().iter().any(|child_handle| {
5401                        if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5402                            let child_name = normalized_tag_name(child_tag.name().as_utf8_str());
5403                            matches!(
5404                                child_name.as_ref(),
5405                                "body" | "main" | "article" | "section" | "div" | "p"
5406                            )
5407                        } else {
5408                            false
5409                        }
5410                    });
5411
5412                    if has_body_like {
5413                        for child_handle in children.top().iter() {
5414                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5415                        }
5416                    }
5417                }
5418
5419                "script" =>
5420                {
5421                    #[cfg(feature = "metadata")]
5422                    if let Some(type_attr) = tag.attributes().get("type").flatten() {
5423                        if type_attr.as_utf8_str() == "application/ld+json" {
5424                            if let Some(ref collector) = ctx.metadata_collector {
5425                                let json = get_text_content(node_handle, parser);
5426                                collector.borrow_mut().add_json_ld(json);
5427                            }
5428                        }
5429                    }
5430                }
5431                "style" => {}
5432
5433                "span" => {
5434                    let is_hocr_word = tag.attributes().iter().any(|(name, value)| {
5435                        name.as_ref() == "class" && value.as_ref().is_some_and(|v| v.as_ref().contains("ocrx_word"))
5436                    });
5437
5438                    if is_hocr_word
5439                        && !output.is_empty()
5440                        && !output.ends_with(' ')
5441                        && !output.ends_with('\t')
5442                        && !output.ends_with('\n')
5443                    {
5444                        output.push(' ');
5445                    }
5446
5447                    if !ctx.in_code
5448                        && options.whitespace_mode == crate::options::WhitespaceMode::Normalized
5449                        && output.ends_with('\n')
5450                        && !output.ends_with("\n\n")
5451                    {
5452                        output.pop();
5453                    }
5454
5455                    let children = tag.children();
5456                    {
5457                        for child_handle in children.top().iter() {
5458                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5459                        }
5460                    }
5461                }
5462
5463                _ => {
5464                    let len_before = output.len();
5465                    let had_trailing_space = output.ends_with(' ');
5466
5467                    let children = tag.children();
5468                    {
5469                        for child_handle in children.top().iter() {
5470                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5471                        }
5472                    }
5473
5474                    let len_after = output.len();
5475                    if len_after > len_before {
5476                        let start_idx = if output.is_char_boundary(len_before) {
5477                            len_before
5478                        } else {
5479                            let capped = len_before.min(output.len());
5480                            output
5481                                .char_indices()
5482                                .map(|(idx, _)| idx)
5483                                .take_while(|idx| *idx <= capped)
5484                                .last()
5485                                .unwrap_or(capped)
5486                        };
5487
5488                        let added_content = output[start_idx..].to_string();
5489                        if options.debug {
5490                            eprintln!(
5491                                "[DEBUG] <{}> added {:?}, trim={:?}, had_trailing_space={}",
5492                                tag_name,
5493                                added_content,
5494                                added_content.trim(),
5495                                had_trailing_space
5496                            );
5497                        }
5498
5499                        let is_code_block = added_content.starts_with("    ")
5500                            || added_content.starts_with("```")
5501                            || added_content.starts_with("~~~");
5502
5503                        if options.debug && added_content.trim().is_empty() {
5504                            eprintln!(
5505                                "[DEBUG] Whitespace-only content, is_code_block={}, will_truncate={}",
5506                                is_code_block, !is_code_block
5507                            );
5508                        }
5509
5510                        if added_content.trim().is_empty() && !is_code_block {
5511                            output.truncate(start_idx);
5512                            if !had_trailing_space && added_content.contains(' ') {
5513                                output.push(' ');
5514                            }
5515                            if options.debug {
5516                                eprintln!(
5517                                    "[DEBUG] Truncated, output now ends with space: {}",
5518                                    output.ends_with(' ')
5519                                );
5520                            }
5521                        }
5522                    }
5523                }
5524            }
5525        }
5526
5527        tl::Node::Comment(_) => {}
5528    }
5529}
5530
5531/// Get colspan attribute value from element
5532fn get_colspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5533    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5534        if let Some(Some(bytes)) = tag.attributes().get("colspan") {
5535            if let Ok(colspan) = bytes.as_utf8_str().parse::<usize>() {
5536                return colspan;
5537            }
5538        }
5539    }
5540    1
5541}
5542
5543/// Get both colspan and rowspan in a single lookup
5544fn get_colspan_rowspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> (usize, usize) {
5545    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5546        let attrs = tag.attributes();
5547        let colspan = attrs
5548            .get("colspan")
5549            .flatten()
5550            .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5551            .unwrap_or(1);
5552        let rowspan = attrs
5553            .get("rowspan")
5554            .flatten()
5555            .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5556            .unwrap_or(1);
5557        (colspan, rowspan)
5558    } else {
5559        (1, 1)
5560    }
5561}
5562
5563/// Convert table cell (td or th)
5564fn convert_table_cell(
5565    node_handle: &tl::NodeHandle,
5566    parser: &tl::Parser,
5567    output: &mut String,
5568    options: &ConversionOptions,
5569    ctx: &Context,
5570    _tag_name: &str,
5571    dom_ctx: &DomContext,
5572) {
5573    let mut text = String::with_capacity(128);
5574
5575    let cell_ctx = Context {
5576        in_table_cell: true,
5577        ..ctx.clone()
5578    };
5579
5580    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5581        let children = tag.children();
5582        {
5583            for child_handle in children.top().iter() {
5584                walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
5585            }
5586        }
5587    }
5588
5589    let text = text.trim();
5590    let text = if options.br_in_tables {
5591        text.split('\n')
5592            .filter(|s| !s.is_empty())
5593            .collect::<Vec<_>>()
5594            .join("<br>")
5595    } else {
5596        text.replace('\n', " ")
5597    };
5598
5599    let colspan = get_colspan(node_handle, parser);
5600
5601    output.push(' ');
5602    output.push_str(&text);
5603    output.push_str(&" |".repeat(colspan));
5604}
5605
5606/// Convert table row (tr)
5607#[allow(clippy::too_many_arguments)]
5608fn convert_table_row(
5609    node_handle: &tl::NodeHandle,
5610    parser: &tl::Parser,
5611    output: &mut String,
5612    options: &ConversionOptions,
5613    ctx: &Context,
5614    row_index: usize,
5615    rowspan_tracker: &mut std::collections::HashMap<usize, (String, usize)>,
5616    dom_ctx: &DomContext,
5617) {
5618    let mut row_text = String::with_capacity(256);
5619    let mut cells = Vec::new();
5620
5621    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5622        let children = tag.children();
5623        {
5624            for child_handle in children.top().iter() {
5625                if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5626                    let cell_name = normalized_tag_name(child_tag.name().as_utf8_str());
5627                    if cell_name == "th" || cell_name == "td" {
5628                        cells.push(*child_handle);
5629                    }
5630                }
5631            }
5632        }
5633    }
5634
5635    let mut col_index = 0;
5636    let mut cell_iter = cells.iter();
5637
5638    loop {
5639        if let Some((_content, remaining_rows)) = rowspan_tracker.get_mut(&col_index) {
5640            if *remaining_rows > 0 {
5641                row_text.push(' ');
5642                row_text.push_str(" |");
5643                *remaining_rows -= 1;
5644                if *remaining_rows == 0 {
5645                    rowspan_tracker.remove(&col_index);
5646                }
5647                col_index += 1;
5648                continue;
5649            }
5650        }
5651
5652        if let Some(cell_handle) = cell_iter.next() {
5653            let cell_start = row_text.len();
5654            convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
5655
5656            let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
5657
5658            if rowspan > 1 {
5659                let cell_text = &row_text[cell_start..];
5660                let cell_content = cell_text
5661                    .trim_start_matches(' ')
5662                    .trim_end_matches(" |")
5663                    .trim()
5664                    .to_string();
5665                rowspan_tracker.insert(col_index, (cell_content, rowspan - 1));
5666            }
5667
5668            col_index += colspan;
5669        } else {
5670            break;
5671        }
5672    }
5673
5674    output.push('|');
5675    output.push_str(&row_text);
5676    output.push('\n');
5677
5678    let is_first_row = row_index == 0;
5679    if is_first_row {
5680        let total_cols = cells.iter().map(|h| get_colspan(h, parser)).sum::<usize>().max(1);
5681        output.push_str("| ");
5682        for i in 0..total_cols {
5683            if i > 0 {
5684                output.push_str(" | ");
5685            }
5686            output.push_str("---");
5687        }
5688        output.push_str(" |\n");
5689    }
5690}
5691
5692fn table_has_header(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5693    if let Some(node) = node_handle.get(parser) {
5694        if let tl::Node::Tag(tag) = node {
5695            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5696            if tag_name.as_ref() == "th" {
5697                return true;
5698            }
5699            let children = tag.children();
5700            for child in children.top().iter() {
5701                if table_has_header(child, parser) {
5702                    return true;
5703                }
5704            }
5705        }
5706    }
5707    false
5708}
5709
5710fn table_has_caption(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5711    if let Some(node) = node_handle.get(parser) {
5712        if let tl::Node::Tag(tag) = node {
5713            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5714            if tag_name.as_ref() == "caption" {
5715                return true;
5716            }
5717            let children = tag.children();
5718            for child in children.top().iter() {
5719                if table_has_caption(child, parser) {
5720                    return true;
5721                }
5722            }
5723        }
5724    }
5725    false
5726}
5727
5728fn table_contains_nested_table(node_handle: &tl::NodeHandle, parser: &tl::Parser, is_root: bool) -> bool {
5729    if let Some(node) = node_handle.get(parser) {
5730        if let tl::Node::Tag(tag) = node {
5731            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5732            if !is_root && tag_name.as_ref() == "table" {
5733                return true;
5734            }
5735
5736            for child in tag.children().top().iter() {
5737                if table_contains_nested_table(child, parser, false) {
5738                    return true;
5739                }
5740            }
5741        }
5742    }
5743    false
5744}
5745
5746fn collect_table_row_counts(
5747    node_handle: &tl::NodeHandle,
5748    parser: &tl::Parser,
5749    counts: &mut Vec<usize>,
5750    has_span: &mut bool,
5751) {
5752    if let Some(node) = node_handle.get(parser) {
5753        if let tl::Node::Tag(tag) = node {
5754            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5755            match tag_name.as_ref() {
5756                "tr" => {
5757                    let mut cell_count = 0;
5758                    for child in tag.children().top().iter() {
5759                        if let Some(tl::Node::Tag(cell_tag)) = child.get(parser) {
5760                            let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5761                            if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5762                                cell_count += 1;
5763                                let attrs = cell_tag.attributes();
5764                                if attrs.get("colspan").is_some() || attrs.get("rowspan").is_some() {
5765                                    *has_span = true;
5766                                }
5767                            }
5768                        }
5769                    }
5770                    counts.push(cell_count);
5771                }
5772                _ => {
5773                    for child in tag.children().top().iter() {
5774                        collect_table_row_counts(child, parser, counts, has_span);
5775                    }
5776                }
5777            }
5778        }
5779    }
5780}
5781
5782fn count_links(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5783    let mut total = 0;
5784    if let Some(node) = node_handle.get(parser) {
5785        if let tl::Node::Tag(tag) = node {
5786            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5787            if tag_name.as_ref() == "a" {
5788                total += 1;
5789            }
5790
5791            for child in tag.children().top().iter() {
5792                total += count_links(child, parser);
5793            }
5794        }
5795    }
5796    total
5797}
5798
5799fn append_layout_row(
5800    row_handle: &tl::NodeHandle,
5801    parser: &tl::Parser,
5802    output: &mut String,
5803    options: &ConversionOptions,
5804    ctx: &Context,
5805    dom_ctx: &DomContext,
5806) {
5807    if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5808        let mut row_text = String::new();
5809        let row_children = row_tag.children();
5810        for cell_handle in row_children.top().iter() {
5811            if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
5812                let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5813                if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5814                    let mut cell_text = String::new();
5815                    let cell_ctx = Context {
5816                        convert_as_inline: true,
5817                        ..ctx.clone()
5818                    };
5819                    let cell_children = cell_tag.children();
5820                    for cell_child in cell_children.top().iter() {
5821                        walk_node(cell_child, parser, &mut cell_text, options, &cell_ctx, 0, dom_ctx);
5822                    }
5823                    let cell_content = text::normalize_whitespace(&cell_text);
5824                    if !cell_content.trim().is_empty() {
5825                        if !row_text.is_empty() {
5826                            row_text.push(' ');
5827                        }
5828                        row_text.push_str(cell_content.trim());
5829                    }
5830                }
5831            }
5832        }
5833
5834        let trimmed = row_text.trim();
5835        if !trimmed.is_empty() {
5836            if !output.is_empty() && !output.ends_with('\n') {
5837                output.push('\n');
5838            }
5839            let formatted = trimmed.strip_prefix("- ").unwrap_or(trimmed).trim_start();
5840            output.push_str("- ");
5841            output.push_str(formatted);
5842            output.push('\n');
5843        }
5844    }
5845}
5846
5847/// Indent table lines so they stay within their parent list item.
5848fn indent_table_for_list(table_content: &str, list_depth: usize, options: &ConversionOptions) -> String {
5849    if list_depth == 0 {
5850        return table_content.to_string();
5851    }
5852
5853    let Some(mut indent) = continuation_indent_string(list_depth, options) else {
5854        return table_content.to_string();
5855    };
5856
5857    if matches!(options.list_indent_type, ListIndentType::Spaces) {
5858        let space_count = indent.chars().filter(|c| *c == ' ').count();
5859        if space_count < 4 {
5860            indent.push_str(&" ".repeat(4 - space_count));
5861        }
5862    }
5863
5864    let mut result = String::with_capacity(table_content.len() + indent.len() * 4);
5865    for segment in table_content.split_inclusive('\n') {
5866        if segment.starts_with('|') {
5867            result.push_str(&indent);
5868            result.push_str(segment);
5869        } else {
5870            result.push_str(segment);
5871        }
5872    }
5873    result
5874}
5875
5876/// Convert an entire table element
5877fn convert_table(
5878    node_handle: &tl::NodeHandle,
5879    parser: &tl::Parser,
5880    output: &mut String,
5881    options: &ConversionOptions,
5882    ctx: &Context,
5883    dom_ctx: &DomContext,
5884) {
5885    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5886        let mut row_counts = Vec::new();
5887        let mut has_span = false;
5888        collect_table_row_counts(node_handle, parser, &mut row_counts, &mut has_span);
5889
5890        let row_count = row_counts.len();
5891        let mut distinct_counts: Vec<_> = row_counts.into_iter().filter(|c| *c > 0).collect();
5892        distinct_counts.sort_unstable();
5893        distinct_counts.dedup();
5894
5895        let looks_like_layout =
5896            table_contains_nested_table(node_handle, parser, true) || has_span || distinct_counts.len() > 1;
5897        let link_count = count_links(node_handle, parser);
5898        let table_text = text::normalize_whitespace(&get_text_content(node_handle, parser));
5899        let is_blank_table = table_text.trim().is_empty();
5900
5901        if !table_has_header(node_handle, parser)
5902            && !table_has_caption(node_handle, parser)
5903            && (looks_like_layout || is_blank_table || (row_count <= 2 && link_count >= 3))
5904        {
5905            if is_blank_table {
5906                return;
5907            }
5908
5909            let table_children = tag.children();
5910            for child_handle in table_children.top().iter() {
5911                if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5912                    let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5913                    match tag_name.as_ref() {
5914                        "thead" | "tbody" | "tfoot" => {
5915                            for row_handle in child_tag.children().top().iter() {
5916                                if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5917                                    if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
5918                                        append_layout_row(row_handle, parser, output, options, ctx, dom_ctx);
5919                                    }
5920                                }
5921                            }
5922                        }
5923                        "tr" => append_layout_row(child_handle, parser, output, options, ctx, dom_ctx),
5924                        _ => {}
5925                    }
5926                }
5927            }
5928            if !output.ends_with('\n') {
5929                output.push('\n');
5930            }
5931            return;
5932        }
5933
5934        let mut row_index = 0;
5935        let mut rowspan_tracker = std::collections::HashMap::new();
5936
5937        let children = tag.children();
5938        {
5939            for child_handle in children.top().iter() {
5940                if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5941                    let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5942
5943                    match tag_name.as_ref() {
5944                        "caption" => {
5945                            let mut text = String::new();
5946                            let grandchildren = child_tag.children();
5947                            {
5948                                for grandchild_handle in grandchildren.top().iter() {
5949                                    walk_node(grandchild_handle, parser, &mut text, options, ctx, 0, dom_ctx);
5950                                }
5951                            }
5952                            let text = text.trim();
5953                            if !text.is_empty() {
5954                                let escaped_text = text.replace('-', r"\-");
5955                                output.push('*');
5956                                output.push_str(&escaped_text);
5957                                output.push_str("*\n\n");
5958                            }
5959                        }
5960
5961                        "thead" | "tbody" | "tfoot" => {
5962                            let section_children = child_tag.children();
5963                            {
5964                                for row_handle in section_children.top().iter() {
5965                                    if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5966                                        if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
5967                                            convert_table_row(
5968                                                row_handle,
5969                                                parser,
5970                                                output,
5971                                                options,
5972                                                ctx,
5973                                                row_index,
5974                                                &mut rowspan_tracker,
5975                                                dom_ctx,
5976                                            );
5977                                            row_index += 1;
5978                                        }
5979                                    }
5980                                }
5981                            }
5982                        }
5983
5984                        "tr" => {
5985                            convert_table_row(
5986                                child_handle,
5987                                parser,
5988                                output,
5989                                options,
5990                                ctx,
5991                                row_index,
5992                                &mut rowspan_tracker,
5993                                dom_ctx,
5994                            );
5995                            row_index += 1;
5996                        }
5997
5998                        "colgroup" | "col" => {}
5999
6000                        _ => {}
6001                    }
6002                }
6003            }
6004        }
6005    }
6006}
6007
6008#[cfg(test)]
6009mod tests {
6010    use super::*;
6011    use crate::options::HighlightStyle;
6012
6013    #[test]
6014    fn test_trim_trailing_whitespace() {
6015        let mut s = String::from("hello   ");
6016        trim_trailing_whitespace(&mut s);
6017        assert_eq!(s, "hello");
6018
6019        let mut s = String::from("hello\t\t");
6020        trim_trailing_whitespace(&mut s);
6021        assert_eq!(s, "hello");
6022
6023        let mut s = String::from("hello \t \t");
6024        trim_trailing_whitespace(&mut s);
6025        assert_eq!(s, "hello");
6026
6027        let mut s = String::from("hello");
6028        trim_trailing_whitespace(&mut s);
6029        assert_eq!(s, "hello");
6030
6031        let mut s = String::from("");
6032        trim_trailing_whitespace(&mut s);
6033        assert_eq!(s, "");
6034
6035        let mut s = String::from("hello\n");
6036        trim_trailing_whitespace(&mut s);
6037        assert_eq!(s, "hello\n");
6038    }
6039
6040    #[test]
6041    fn test_chomp_preserves_boundary_spaces() {
6042        assert_eq!(chomp_inline("  text  "), (" ", " ", "text"));
6043        assert_eq!(chomp_inline("text"), ("", "", "text"));
6044        assert_eq!(chomp_inline("  text"), (" ", "", "text"));
6045        assert_eq!(chomp_inline("text  "), ("", " ", "text"));
6046        assert_eq!(chomp_inline("   "), (" ", " ", ""));
6047        assert_eq!(chomp_inline(""), ("", "", ""));
6048    }
6049
6050    #[test]
6051    fn nested_strong_markup_is_normalized() {
6052        let html = "<strong><strong>Bold</strong></strong>";
6053        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6054        assert_eq!(result.trim(), "**Bold**");
6055    }
6056
6057    #[test]
6058    fn nested_strong_with_additional_text_is_normalized() {
6059        let html = "<strong>Hello <strong>World</strong></strong>";
6060        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6061        assert_eq!(result.trim(), "**Hello World**");
6062    }
6063
6064    #[test]
6065    fn nested_strong_partial_segments_are_normalized() {
6066        let html = "<b>bo<b>ld</b>er</b>";
6067        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6068        assert_eq!(result.trim(), "**bolder**");
6069    }
6070
6071    #[test]
6072    fn summary_with_inner_strong_is_not_double_wrapped() {
6073        let html = "<details><summary><strong>Title</strong></summary></details>";
6074        let mut options = ConversionOptions::default();
6075        options.preprocessing.remove_forms = false;
6076        let result = convert_html(html, &options).unwrap();
6077        assert_eq!(result.trim(), "**Title**");
6078    }
6079
6080    #[test]
6081    fn legend_with_inner_strong_is_not_double_wrapped() {
6082        let html = "<fieldset><legend><strong>Section</strong></legend></fieldset>";
6083        let mut options = ConversionOptions::default();
6084        options.preprocessing.remove_forms = false;
6085        let result = convert_html(html, &options).unwrap();
6086        assert_eq!(result.trim(), "**Section**");
6087    }
6088
6089    #[test]
6090    fn preprocessing_keeps_article_header_inside_main() {
6091        let html = r#"
6092        <body>
6093            <header class="global-header">
6094                <div>Global Navigation</div>
6095            </header>
6096            <main>
6097                <header class="article-header">
6098                    <h1>Primary Title</h1>
6099                </header>
6100                <p>Body content stays.</p>
6101            </main>
6102        </body>
6103        "#;
6104        let mut options = ConversionOptions::default();
6105        options.preprocessing.enabled = true;
6106        let result = convert_html(html, &options).unwrap();
6107        assert!(
6108            result.contains("Primary Title"),
6109            "article header was removed: {}",
6110            result
6111        );
6112        assert!(
6113            result.contains("Body content stays"),
6114            "main body content missing: {}",
6115            result
6116        );
6117        assert!(
6118            !result.contains("Global Navigation"),
6119            "site chrome unexpectedly rendered: {}",
6120            result
6121        );
6122    }
6123
6124    #[test]
6125    fn preprocessing_drops_nav_but_keeps_body() {
6126        let html = r##"
6127        <main>
6128            <nav aria-label="Primary navigation">
6129                <a href="#a">NavOnly</a>
6130            </nav>
6131            <article>
6132                <p>Important narrative</p>
6133            </article>
6134        </main>
6135        "##;
6136        let mut options = ConversionOptions::default();
6137        options.preprocessing.enabled = true;
6138        let result = convert_html(html, &options).unwrap();
6139        assert!(
6140            !result.contains("NavOnly"),
6141            "navigation text should not appear: {}",
6142            result
6143        );
6144        assert!(
6145            result.contains("Important narrative"),
6146            "article text should remain: {}",
6147            result
6148        );
6149    }
6150
6151    #[test]
6152    fn preprocessing_retains_section_headers_inside_articles() {
6153        let html = r#"
6154        <article>
6155            <header>
6156                <h2>Section Heading</h2>
6157            </header>
6158            <section>
6159                <p>Section body</p>
6160            </section>
6161        </article>
6162        "#;
6163        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6164        assert!(
6165            result.contains("Section Heading"),
6166            "section heading was stripped: {}",
6167            result
6168        );
6169        assert!(result.contains("Section body"), "section body missing: {}", result);
6170    }
6171
6172    #[test]
6173    fn bold_highlight_suppresses_nested_strong() {
6174        let mut options = ConversionOptions::default();
6175        options.highlight_style = HighlightStyle::Bold;
6176        let html = "<p><mark><strong>Hot</strong></mark></p>";
6177        let result = convert_html(html, &options).unwrap();
6178        assert_eq!(result.trim(), "**Hot**");
6179    }
6180
6181    #[test]
6182    fn atx_heading_swallows_layout_line_breaks() {
6183        let html = r#"<h2>
6184  Heading
6185  Text
6186  with
6187  Line
6188  Breaks
6189</h2>"#;
6190        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6191        assert_eq!(result.trim(), "## Heading Text with Line Breaks");
6192    }
6193
6194    #[test]
6195    fn doctype_is_removed() {
6196        let html = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
6197            <html>
6198                <head><title>Example</title></head>
6199                <body><p>Hello World</p></body>
6200            </html>"#;
6201        let mut options = ConversionOptions::default();
6202        options.extract_metadata = false;
6203        let result = convert_html(html, &options).unwrap();
6204        assert_eq!(result.trim(), "Hello World");
6205    }
6206
6207    #[test]
6208    fn test_calculate_list_continuation_indent() {
6209        assert_eq!(calculate_list_continuation_indent(0), 0);
6210
6211        assert_eq!(calculate_list_continuation_indent(1), 1);
6212
6213        assert_eq!(calculate_list_continuation_indent(2), 3);
6214
6215        assert_eq!(calculate_list_continuation_indent(3), 5);
6216
6217        assert_eq!(calculate_list_continuation_indent(4), 7);
6218    }
6219
6220    #[test]
6221    fn strips_script_sections_without_removing_following_content() {
6222        let input = "<div>before</div><script>1 < 2</script><p>after</p>";
6223        let stripped = strip_script_and_style_sections(input);
6224        assert_eq!(stripped, "<div>before</div><script></script><p>after</p>");
6225    }
6226
6227    #[test]
6228    fn strips_multiline_script_sections() {
6229        let input = "<html>\n<script>1 < 2</script>\nContent\n</html>";
6230        let stripped = strip_script_and_style_sections(input);
6231        assert!(stripped.contains("Content"));
6232        assert!(stripped.contains("<script"));
6233        assert!(!stripped.contains("1 < 2"));
6234    }
6235
6236    #[test]
6237    fn hr_inside_paragraph_matches_inline_expectation() {
6238        let mut options = ConversionOptions::default();
6239        options.extract_metadata = false;
6240        let markdown = convert_html("<p>Hello<hr>World</p>", &options).unwrap();
6241        assert_eq!(markdown, "Hello\n---\nWorld\n");
6242    }
6243
6244    #[test]
6245    fn hr_inside_paragraph_matches_inline_expectation_via_public_api() {
6246        let mut options = ConversionOptions::default();
6247        options.extract_metadata = false;
6248        let markdown = crate::convert("<p>Hello<hr>World</p>", Some(options)).unwrap();
6249        assert_eq!(markdown, "Hello\n---\nWorld\n");
6250    }
6251
6252    #[test]
6253    fn test_add_list_continuation_indent_blank_line() {
6254        let opts = ConversionOptions::default();
6255        let mut output = String::from("* First para");
6256        add_list_continuation_indent(&mut output, 1, true, &opts);
6257        assert_eq!(output, "* First para\n\n  ");
6258
6259        let mut output = String::from("* First para\n");
6260        add_list_continuation_indent(&mut output, 1, true, &opts);
6261        assert_eq!(output, "* First para\n\n  ");
6262
6263        let mut output = String::from("* First para\n\n");
6264        add_list_continuation_indent(&mut output, 1, true, &opts);
6265        assert_eq!(output, "* First para\n\n  ");
6266
6267        let mut output = String::from("* First para");
6268        add_list_continuation_indent(&mut output, 2, true, &opts);
6269        assert_eq!(output, "* First para\n\n      ");
6270    }
6271
6272    #[test]
6273    fn test_add_list_continuation_indent_single_line() {
6274        let opts = ConversionOptions::default();
6275        let mut output = String::from("* First div");
6276        add_list_continuation_indent(&mut output, 1, false, &opts);
6277        assert_eq!(output, "* First div\n  ");
6278
6279        let mut output = String::from("* First div\n");
6280        add_list_continuation_indent(&mut output, 1, false, &opts);
6281        assert_eq!(output, "* First div\n  ");
6282
6283        let mut output = String::from("* First div\n");
6284        add_list_continuation_indent(&mut output, 1, false, &opts);
6285        assert_eq!(output, "* First div\n  ");
6286    }
6287
6288    #[test]
6289    fn test_trim_trailing_whitespace_in_continuation() {
6290        let opts = ConversionOptions::default();
6291        let mut output = String::from("* First   ");
6292        add_list_continuation_indent(&mut output, 1, true, &opts);
6293        assert_eq!(output, "* First\n\n  ");
6294
6295        let mut output = String::from("* First\t\t");
6296        add_list_continuation_indent(&mut output, 1, false, &opts);
6297        assert_eq!(output, "* First\n  ");
6298    }
6299
6300    #[test]
6301    fn test_escape_malformed_angle_brackets_bare() {
6302        let input = "1<2";
6303        let escaped = escape_malformed_angle_brackets(input);
6304        assert_eq!(escaped, "1&lt;2");
6305    }
6306
6307    #[test]
6308    fn test_escape_malformed_angle_brackets_in_text() {
6309        let input = "<html>1<2 Content</html>";
6310        let escaped = escape_malformed_angle_brackets(input);
6311        assert_eq!(escaped, "<html>1&lt;2 Content</html>");
6312    }
6313
6314    #[test]
6315    fn test_escape_malformed_angle_brackets_multiple() {
6316        let input = "1 < 2 < 3";
6317        let escaped = escape_malformed_angle_brackets(input);
6318        assert_eq!(escaped, "1 &lt; 2 &lt; 3");
6319    }
6320
6321    #[test]
6322    fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
6323        let input = "<div>content</div>";
6324        let escaped = escape_malformed_angle_brackets(input);
6325        assert_eq!(escaped, "<div>content</div>");
6326    }
6327
6328    #[test]
6329    fn test_escape_malformed_angle_brackets_mixed() {
6330        let input = "<div>1<2</div><p>3<4</p>";
6331        let escaped = escape_malformed_angle_brackets(input);
6332        assert_eq!(escaped, "<div>1&lt;2</div><p>3&lt;4</p>");
6333    }
6334
6335    #[test]
6336    fn test_escape_malformed_angle_brackets_at_end() {
6337        let input = "test<";
6338        let escaped = escape_malformed_angle_brackets(input);
6339        assert_eq!(escaped, "test&lt;");
6340    }
6341
6342    #[test]
6343    fn test_escape_malformed_angle_brackets_preserves_comments() {
6344        let input = "<!-- comment -->1<2";
6345        let escaped = escape_malformed_angle_brackets(input);
6346        assert_eq!(escaped, "<!-- comment -->1&lt;2");
6347    }
6348
6349    #[test]
6350    fn test_escape_malformed_angle_brackets_preserves_doctype() {
6351        let input = "<!DOCTYPE html>1<2";
6352        let escaped = escape_malformed_angle_brackets(input);
6353        assert_eq!(escaped, "<!DOCTYPE html>1&lt;2");
6354    }
6355
6356    #[test]
6357    fn test_convert_with_malformed_angle_brackets() {
6358        let html = "<html>1<2\nContent</html>";
6359        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6360        assert!(
6361            result.contains("Content"),
6362            "Result should contain 'Content': {:?}",
6363            result
6364        );
6365        assert!(
6366            result.contains("1<2") || result.contains("1&lt;2"),
6367            "Result should contain escaped or unescaped comparison"
6368        );
6369    }
6370
6371    #[test]
6372    fn test_convert_with_malformed_angle_brackets_in_div() {
6373        let html = "<html><div>1<2</div><div>Content</div></html>";
6374        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6375        assert!(
6376            result.contains("Content"),
6377            "Result should contain 'Content': {:?}",
6378            result
6379        );
6380    }
6381
6382    #[test]
6383    fn test_convert_with_multiple_malformed_angle_brackets() {
6384        let html = "<html>1 < 2 < 3<p>Content</p></html>";
6385        let result = convert_html(html, &ConversionOptions::default()).unwrap();
6386        assert!(
6387            result.contains("Content"),
6388            "Result should contain 'Content': {:?}",
6389            result
6390        );
6391    }
6392
6393    #[test]
6394    fn test_preserve_tags_simple_table() {
6395        let html = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
6396        let options = ConversionOptions {
6397            preserve_tags: vec!["table".to_string()],
6398            ..Default::default()
6399        };
6400        let result = convert_html(html, &options).unwrap();
6401
6402        assert!(result.contains("<table>"), "Should preserve table tag");
6403        assert!(result.contains("</table>"), "Should have closing table tag");
6404        assert!(result.contains("<tr>"), "Should preserve tr tag");
6405        assert!(result.contains("<td>"), "Should preserve td tag");
6406        assert!(result.contains("Text"), "Should convert other elements");
6407    }
6408
6409    #[test]
6410    fn test_preserve_tags_with_attributes() {
6411        let html = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
6412        let options = ConversionOptions {
6413            preserve_tags: vec!["table".to_string()],
6414            ..Default::default()
6415        };
6416        let result = convert_html(html, &options).unwrap();
6417
6418        assert!(result.contains("<table"), "Should preserve table tag");
6419        assert!(result.contains("class="), "Should preserve class attribute");
6420        assert!(result.contains("id="), "Should preserve id attribute");
6421        assert!(result.contains("</table>"), "Should have closing tag");
6422    }
6423
6424    #[test]
6425    fn test_preserve_tags_multiple_tags() {
6426        let html = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
6427        let options = ConversionOptions {
6428            preserve_tags: vec!["table".to_string(), "form".to_string()],
6429            ..Default::default()
6430        };
6431        let result = convert_html(html, &options).unwrap();
6432
6433        assert!(result.contains("<table>"), "Should preserve table");
6434        assert!(result.contains("<form>"), "Should preserve form");
6435        assert!(result.contains("Text"), "Should convert paragraph");
6436    }
6437
6438    #[test]
6439    fn test_preserve_tags_nested_content() {
6440        let html = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
6441        let options = ConversionOptions {
6442            preserve_tags: vec!["table".to_string()],
6443            ..Default::default()
6444        };
6445        let result = convert_html(html, &options).unwrap();
6446
6447        assert!(result.contains("<thead>"), "Should preserve nested thead");
6448        assert!(result.contains("<tbody>"), "Should preserve nested tbody");
6449        assert!(result.contains("<th>"), "Should preserve th tag");
6450        assert!(result.contains("Header"), "Should preserve text content");
6451    }
6452
6453    #[test]
6454    fn test_preserve_tags_empty_list() {
6455        let html = r#"<table><tr><td>Cell</td></tr></table>"#;
6456        let options = ConversionOptions::default();
6457        let result = convert_html(html, &options).unwrap();
6458
6459        assert!(
6460            !result.contains("<table>"),
6461            "Should not preserve table without preserve_tags"
6462        );
6463    }
6464
6465    #[test]
6466    fn test_preserve_tags_vs_strip_tags() {
6467        let html = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
6468        let options = ConversionOptions {
6469            preserve_tags: vec!["table".to_string()],
6470            strip_tags: vec!["span".to_string()],
6471            ..Default::default()
6472        };
6473        let result = convert_html(html, &options).unwrap();
6474
6475        assert!(result.contains("<table>"), "Should preserve table");
6476        assert!(!result.contains("<span>"), "Should strip span tag");
6477        assert!(result.contains("Text"), "Should keep span text content");
6478    }
6479
6480    #[test]
6481    fn example_com_remains_visible() {
6482        let html = "<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href=\"https://iana.org/domains/example\">Learn more</a></div></body></html>";
6483
6484        let mut options = ConversionOptions::default();
6485        options.extract_metadata = false;
6486        let result = convert_html(html, &options).unwrap();
6487
6488        assert!(
6489            result.contains("Example Domain"),
6490            "content unexpectedly missing: {}",
6491            result
6492        );
6493    }
6494}
/// Input with no self-closing tags comes back as `Cow::Borrowed` and
/// compares equal to the original string.
#[test]
fn normalize_self_closing_tags_noop_when_absent() {
    let source = "<div><p>text</p></div>";
    let output = normalize_self_closing_tags(source);

    assert!(matches!(output, Cow::Borrowed(_)));
    assert_eq!(output.as_ref(), source);
}
6502
/// `<br/>`, `<hr/>` and `<img/>` are rewritten to their non-self-closing forms.
#[test]
fn normalize_self_closing_tags_replaces_targets() {
    let output = normalize_self_closing_tags("<br/><hr/><img/>");
    assert_eq!(output.as_ref(), "<br><hr><img>");
}