html_to_markdown_rs/
converter.rs

1//! HTML to Markdown conversion using the astral-tl parser.
2//!
3//! This module provides the core conversion logic for transforming HTML documents into Markdown.
4//! It uses the astral-tl parser for high-performance HTML parsing and supports 60+ HTML tags.
5//!
6
7#![allow(clippy::collapsible_match)]
8//! # Architecture
9//!
10//! The conversion process follows these steps:
11//! 1. Parse HTML into a DOM tree using the astral-tl parser
12//! 2. Walk the DOM tree recursively
13//! 3. Convert each node type to its Markdown equivalent
14//! 4. Apply text escaping and whitespace normalization
15//!
16//! # Whitespace Handling
17//!
18//! This library preserves whitespace exactly as it appears in the HTML source.
19//! Text nodes retain their original spacing, including multiple spaces and newlines.
20//!
21//! - **Raw text preservation**: All whitespace in text nodes is preserved
22//! - **No HTML5 normalization**: Whitespace is not collapsed according to HTML5 rules
23//! - **Full control**: Applications can handle whitespace as needed
24//!
25//! # Supported Features
26//!
27//! - **Block elements**: headings, paragraphs, lists, tables, blockquotes
28//! - **Inline formatting**: bold, italic, code, links, images, strikethrough
29//! - **Semantic HTML5**: article, section, nav, aside, header, footer
30//! - **Forms**: inputs, select, button, textarea, fieldset
31//! - **Media**: audio, video, picture, iframe, svg
32//! - **Advanced**: task lists, ruby annotations, definition lists
33//!
34//! # Examples
35//!
36//! ```rust
37//! use html_to_markdown_rs::{convert, ConversionOptions};
38//!
39//! let html = "<h1>Title</h1><p>Paragraph with <strong>bold</strong> text.</p>";
40//! let markdown = convert(html, None).unwrap();
41//! assert_eq!(markdown, "# Title\n\nParagraph with **bold** text.\n");
42//! ```
43
44#[cfg(feature = "inline-images")]
45use std::cell::RefCell;
46use std::collections::{BTreeMap, HashMap};
47#[cfg(feature = "inline-images")]
48use std::rc::Rc;
49
50use std::borrow::Cow;
51use std::str;
52
53use crate::error::Result;
54#[cfg(feature = "inline-images")]
55use crate::inline_images::{InlineImageCollector, InlineImageFormat, InlineImageSource};
56use crate::options::{ConversionOptions, HeadingStyle, ListIndentType};
57use crate::text;
58
59#[cfg(feature = "inline-images")]
60type InlineCollectorHandle = Rc<RefCell<InlineImageCollector>>;
61#[cfg(not(feature = "inline-images"))]
62type InlineCollectorHandle = ();
63
/// Split inline element content into its edge whitespace and trimmed text.
///
/// Similar to `text::chomp`, but a trailing Markdown hard line break (`"  \n"` or
/// `"\\\n"`, as emitted for `<br>`) is handed back as the suffix so the caller can
/// place it outside the inline formatting markers.
///
/// Returns `(prefix, suffix, trimmed_text)`:
/// - `prefix` is `" "` when the text starts with a space or tab, otherwise `""`.
/// - `suffix` is the trailing hard line break when present; otherwise `" "` when
///   the text ends with a space or tab, and `""` in every other case.
/// - `trimmed_text` is the text (without the hard line break) with all
///   surrounding whitespace removed.
fn chomp_inline(text: &str) -> (&str, &str, &str) {
    if text.is_empty() {
        return ("", "", "");
    }

    let is_blank = |c: char| c == ' ' || c == '\t';
    let prefix = if text.starts_with(is_blank) { " " } else { "" };

    // A hard line break wins over plain trailing whitespace; the two-space form
    // is checked before the backslash form.
    let (body, suffix) = if let Some(rest) = text.strip_suffix("  \n") {
        (rest, "  \n")
    } else if let Some(rest) = text.strip_suffix("\\\n") {
        (rest, "\\\n")
    } else if text.ends_with(is_blank) {
        (text, " ")
    } else {
        (text, "")
    };

    (prefix, suffix, body.trim())
}
100
/// Strip spaces and tabs from the very end of `output`, in place.
///
/// Newlines are not touched, so trimming stops at the last line break. Used
/// before block separators or newlines are appended, to keep stray whitespace
/// out of the generated Markdown.
fn trim_trailing_whitespace(output: &mut String) {
    let kept_len = output.trim_end_matches([' ', '\t']).len();
    output.truncate(kept_len);
}
110
/// Strip trailing spaces and tabs from every line of `output`, in place.
///
/// A line that ends with two spaces is treated as a Markdown hard break: its
/// trailing whitespace is normalised to exactly two spaces instead of being
/// removed. Empty input is left untouched. For non-empty input a `'\n'` is
/// always appended after the last line, even if the input already ended with one.
fn trim_line_end_whitespace(output: &mut String) {
    if output.is_empty() {
        return;
    }

    let mut rebuilt = String::with_capacity(output.len());
    let mut is_first_line = true;

    for raw_line in output.split('\n') {
        if is_first_line {
            is_first_line = false;
        } else {
            rebuilt.push('\n');
        }

        rebuilt.push_str(raw_line.trim_end_matches([' ', '\t']));
        // Re-attach the hard-break marker that the trim above just removed.
        if raw_line.ends_with("  ") {
            rebuilt.push_str("  ");
        }
    }

    rebuilt.push('\n');
    *output = rebuilt;
}
137
/// Shorten `value` to at most `max_len` bytes without splitting a character.
///
/// If byte offset `max_len` falls inside a multi-byte character, the cut moves
/// back to the start of that character. Strings already within the limit are
/// left unchanged.
fn truncate_at_char_boundary(value: &mut String, max_len: usize) {
    if max_len >= value.len() {
        return;
    }

    // Offset 0 is always a boundary, so the search cannot come up empty.
    let cut = (0..=max_len)
        .rev()
        .find(|&offset| value.is_char_boundary(offset))
        .unwrap_or(0);
    value.truncate(cut);
}
150
/// Remove the shared leading whitespace from all lines of a code block.
///
/// HTML authors often indent `<pre>` content for readability; this strips that
/// shared indentation without touching meaningful spacing.
///
/// The indentation width is the smallest number of leading whitespace *bytes*
/// found on any non-blank line. Blank (whitespace-only) lines are ignored when
/// measuring and are returned unchanged. Lines are re-joined with `'\n'`; a
/// trailing line terminator in `content` is not preserved.
///
/// When lines are indented with whitespace characters of different byte widths
/// (for example ASCII spaces on one line and U+3000 on another), the cut for a
/// line is moved back to the nearest character boundary so slicing never panics.
fn dedent_code_block(content: &str) -> String {
    let lines: Vec<&str> = content.lines().collect();
    if lines.is_empty() {
        return String::new();
    }

    let min_indent = lines
        .iter()
        .filter(|line| !line.trim().is_empty())
        .map(|line| line.len() - line.trim_start().len())
        .min()
        .unwrap_or(0);

    lines
        .iter()
        .map(|line| {
            if line.trim().is_empty() {
                return *line;
            }

            // `min_indent` is a byte count taken from another line; it may land
            // in the middle of a multi-byte whitespace character on this one.
            let mut cut = min_indent.min(line.len());
            while !line.is_char_boundary(cut) {
                cut -= 1;
            }
            &line[cut..]
        })
        .collect::<Vec<_>>()
        .join("\n")
}
186
/// Number of indent groups to put in front of continuation content in a list item.
///
/// A "group" is one unit of list indentation (one tab, or the configured number
/// of spaces). The result is `0` for depth `0` and `2 * depth - 1` otherwise,
/// which combines `depth - 1` groups for the nesting level with `depth` groups
/// for the item content.
///
/// # Examples
///
/// ```text
/// depth 0 -> 0 groups
/// depth 1 -> 1 group
/// depth 2 -> 3 groups
/// depth 3 -> 5 groups
/// ```
fn calculate_list_continuation_indent(depth: usize) -> usize {
    match depth {
        0 => 0,
        nested => 2 * nested - 1,
    }
}
210
211/// Check if a list (ul or ol) is "loose".
212///
213/// A loose list is one where any list item contains block-level elements
214/// like paragraphs (<p>). In loose lists, all items should have blank line
215/// separation (ending with \n\n) regardless of their own content.
216///
217/// # Examples
218///
219/// ```html
220/// <!-- Loose list (has <p> in an item) -->
221/// <ul>
222///   <li><p>Item 1</p></li>
223///   <li>Item 2</li>  <!-- Also gets \n\n ending -->
224/// </ul>
225///
226/// <!-- Tight list (no block elements) -->
227/// <ul>
228///   <li>Item 1</li>
229///   <li>Item 2</li>
230/// </ul>
231/// ```
232fn is_loose_list(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
233    if let Some(node) = node_handle.get(parser) {
234        if let tl::Node::Tag(tag) = node {
235            let children = tag.children();
236            {
237                for child_handle in children.top().iter() {
238                    if let Some(child_node) = child_handle.get(parser) {
239                        if let tl::Node::Tag(child_tag) = child_node {
240                            if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
241                                let li_children = child_tag.children();
242                                {
243                                    for li_child_handle in li_children.top().iter() {
244                                        if let Some(li_child_node) = li_child_handle.get(parser) {
245                                            if let tl::Node::Tag(li_child_tag) = li_child_node {
246                                                if tag_name_eq(li_child_tag.name().as_utf8_str(), "p") {
247                                                    return true;
248                                                }
249                                            }
250                                        }
251                                    }
252                                }
253                            }
254                        }
255                    }
256                }
257            }
258        }
259    }
260    false
261}
262
263/// Add list continuation indentation to output.
264///
265/// Used when block elements (like <p> or <div>) appear inside list items.
266/// Adds appropriate line separation and indentation to continue the list item.
267///
268/// # Arguments
269///
270/// * `output` - The output string to append to
271/// * `list_depth` - Current list nesting depth
272/// * `blank_line` - If true, adds blank line separation (\n\n); if false, single newline (\n)
273///
274/// # Examples
275///
276/// ```text
277/// Paragraph continuation (blank_line = true):
278///   * First para
279///
280///       Second para  (blank line + indentation)
281///
282/// Div continuation (blank_line = false):
283///   * First div
284///       Second div   (single newline + indentation)
285/// ```
286fn add_list_continuation_indent(output: &mut String, list_depth: usize, blank_line: bool, options: &ConversionOptions) {
287    trim_trailing_whitespace(output);
288
289    if blank_line {
290        if !output.ends_with("\n\n") {
291            if output.ends_with('\n') {
292                output.push('\n');
293            } else {
294                output.push_str("\n\n");
295            }
296        }
297    } else if !output.ends_with('\n') {
298        output.push('\n');
299    }
300
301    let indent_level = calculate_list_continuation_indent(list_depth);
302    let indent_char = match options.list_indent_type {
303        ListIndentType::Tabs => "\t",
304        ListIndentType::Spaces => &" ".repeat(options.list_indent_width),
305    };
306    output.push_str(&indent_char.repeat(indent_level));
307}
308
309/// Calculate the indentation string for list continuations based on depth and options.
310fn continuation_indent_string(list_depth: usize, options: &ConversionOptions) -> Option<String> {
311    let indent_level = calculate_list_continuation_indent(list_depth);
312    if indent_level == 0 {
313        return None;
314    }
315
316    let indent = match options.list_indent_type {
317        ListIndentType::Tabs => "\t".repeat(indent_level),
318        ListIndentType::Spaces => " ".repeat(options.list_indent_width * indent_level),
319    };
320    Some(indent)
321}
322
323/// Add appropriate leading separator before a list.
324///
325/// Lists need different separators depending on context:
326/// - In table cells: <br> tag if there's already content
327/// - Outside lists: blank line (\n\n) if needed
328/// - Inside list items: blank line before nested list
329fn add_list_leading_separator(output: &mut String, ctx: &Context) {
330    if ctx.in_table_cell {
331        let is_table_continuation =
332            !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
333        if is_table_continuation {
334            output.push_str("<br>");
335        }
336        return;
337    }
338
339    if !output.is_empty() && !ctx.in_list {
340        let needs_newline =
341            !output.ends_with("\n\n") && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
342        if needs_newline {
343            output.push_str("\n\n");
344        }
345        return;
346    }
347
348    if ctx.in_list_item && !output.is_empty() {
349        let needs_newline =
350            !output.ends_with('\n') && !output.ends_with("* ") && !output.ends_with("- ") && !output.ends_with(". ");
351        if needs_newline {
352            trim_trailing_whitespace(output);
353            output.push('\n');
354        }
355    }
356}
357
358/// Add appropriate trailing separator after a nested list.
359///
360/// Nested lists inside list items need trailing newlines to separate
361/// from following content. In loose lists, use blank line (\n\n). In tight lists, single newline (\n).
362fn add_nested_list_trailing_separator(output: &mut String, ctx: &Context) {
363    if !ctx.in_list_item {
364        return;
365    }
366
367    if ctx.loose_list {
368        if !output.ends_with("\n\n") {
369            if !output.ends_with('\n') {
370                output.push('\n');
371            }
372            output.push('\n');
373        }
374    } else if !output.ends_with('\n') {
375        output.push('\n');
376    }
377}
378
379/// Calculate the nesting depth for a list.
380///
381/// If we're in a list but NOT in a list item, this is incorrectly nested HTML
382/// and we need to increment the depth. If in a list item, the depth was already
383/// incremented by the <li> element.
384fn calculate_list_nesting_depth(ctx: &Context) -> usize {
385    if ctx.in_list && !ctx.in_list_item {
386        ctx.list_depth + 1
387    } else {
388        ctx.list_depth
389    }
390}
391
392/// Process a list's children, tracking which items had block elements.
393///
394/// This is used to determine proper spacing between list items.
395/// Returns true if the last processed item had block children.
396#[allow(clippy::too_many_arguments)]
397fn process_list_children(
398    node_handle: &tl::NodeHandle,
399    parser: &tl::Parser,
400    output: &mut String,
401    options: &ConversionOptions,
402    ctx: &Context,
403    depth: usize,
404    is_ordered: bool,
405    is_loose: bool,
406    nested_depth: usize,
407    start_counter: usize,
408    dom_ctx: &DomContext,
409) {
410    let mut counter = start_counter;
411
412    if let Some(node) = node_handle.get(parser) {
413        if let tl::Node::Tag(tag) = node {
414            let children = tag.children();
415            {
416                for child_handle in children.top().iter() {
417                    if let Some(child_node) = child_handle.get(parser) {
418                        if let tl::Node::Raw(bytes) = child_node {
419                            if bytes.as_utf8_str().trim().is_empty() {
420                                continue;
421                            }
422                        }
423                    }
424
425                    let list_ctx = Context {
426                        in_ordered_list: is_ordered,
427                        list_counter: if is_ordered { counter } else { 0 },
428                        in_list: true,
429                        list_depth: nested_depth,
430                        ul_depth: if is_ordered { ctx.ul_depth } else { ctx.ul_depth + 1 },
431                        loose_list: is_loose,
432                        prev_item_had_blocks: false,
433                        ..ctx.clone()
434                    };
435
436                    walk_node(child_handle, parser, output, options, &list_ctx, depth, dom_ctx);
437
438                    if is_ordered {
439                        if let Some(child_node) = child_handle.get(parser) {
440                            if let tl::Node::Tag(child_tag) = child_node {
441                                if tag_name_eq(child_tag.name().as_utf8_str(), "li") {
442                                    counter += 1;
443                                }
444                            }
445                        }
446                    }
447                }
448            }
449        }
450    }
451}
452
/// Traversal state carried along while walking the DOM.
///
/// The struct is `Clone` so that a modified copy can be handed to a subtree
/// (for example, list children receive a copy with the list fields overridden)
/// without affecting the caller's state.
#[derive(Debug, Clone)]
struct Context {
    /// True while inside a code-like element (`pre`, `code`, `kbd`, `samp`).
    in_code: bool,
    /// Number of the current item when inside an ordered list.
    list_counter: usize,
    /// True for an ordered list, false for an unordered one.
    in_ordered_list: bool,
    /// True when the previous sibling inside a `dl` was a `dt`.
    last_was_dt: bool,
    /// How many blockquotes enclose the current node.
    blockquote_depth: usize,
    /// True while inside a table cell (`td` / `th`).
    in_table_cell: bool,
    /// True when block elements should be rendered as inline content.
    convert_as_inline: bool,
    /// Nesting depth of inline formatting elements (strong/emphasis/span/etc).
    inline_depth: usize,
    /// True while inside a list item.
    in_list_item: bool,
    /// List nesting depth, used for indentation.
    list_depth: usize,
    /// Unordered-list nesting depth, used for bullet cycling.
    ul_depth: usize,
    /// True while inside any list (`ul` or `ol`).
    in_list: bool,
    /// True for a "loose" list, where every item gets blank-line separation.
    loose_list: bool,
    /// True when a previous list item had block children.
    prev_item_had_blocks: bool,
    /// True while inside a heading element (`h1`-`h6`).
    in_heading: bool,
    /// The current heading tag (`h1`, `h2`, ...) while `in_heading` is true.
    heading_tag: Option<String>,
    /// True while inside a paragraph element.
    in_paragraph: bool,
    /// True while inside a ruby element.
    in_ruby: bool,
    /// True while inside a `<strong>` / `<b>` element.
    in_strong: bool,
    /// Shared collector for inline images when enabled.
    #[cfg(feature = "inline-images")]
    inline_collector: Option<InlineCollectorHandle>,
    /// Shared collector for metadata when enabled.
    #[cfg(feature = "metadata")]
    metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
}
501
502struct DomContext {
503    parent_map: HashMap<u32, Option<u32>>,
504    children_map: HashMap<u32, Vec<tl::NodeHandle>>,
505    root_children: Vec<tl::NodeHandle>,
506    node_map: HashMap<u32, tl::NodeHandle>,
507}
508
/// Escape unbalanced square brackets in link label text.
///
/// Markdown link text may contain balanced `[...]` pairs, but a stray bracket
/// ends or splits the label. This function returns a copy of `text` in which:
/// - balanced, unescaped bracket pairs are kept as they are;
/// - an unescaped `]` with no matching opener is prefixed with a backslash;
/// - an unescaped `[` that is never closed is prefixed with a backslash.
///
/// A bracket counts as already escaped when it is preceded by an odd number of
/// consecutive backslashes; such brackets are copied through unchanged and do
/// not take part in the balancing.
fn escape_link_label(text: &str) -> String {
    if text.is_empty() {
        return String::new();
    }

    let mut result = String::with_capacity(text.len());
    let mut backslash_count = 0usize;
    // Byte offsets in `result` of unescaped '[' characters still waiting for
    // their closing ']'.
    let mut open_brackets: Vec<usize> = Vec::new();

    for ch in text.chars() {
        if ch == '\\' {
            result.push('\\');
            backslash_count += 1;
            continue;
        }

        let is_escaped = backslash_count % 2 == 1;
        backslash_count = 0;

        match ch {
            '[' if !is_escaped => {
                open_brackets.push(result.len());
                result.push('[');
            }
            ']' if !is_escaped => {
                if open_brackets.pop().is_none() {
                    result.push('\\');
                }
                result.push(']');
            }
            _ => result.push(ch),
        }
    }

    // Escape openers that never found a partner. Insert back-to-front so the
    // remaining recorded offsets stay valid.
    for offset in open_brackets.into_iter().rev() {
        result.insert(offset, '\\');
    }

    result
}
547
548fn append_markdown_link(
549    output: &mut String,
550    label: &str,
551    href: &str,
552    title: Option<&str>,
553    raw_text: &str,
554    options: &ConversionOptions,
555) {
556    output.push('[');
557    output.push_str(label);
558    output.push_str("](");
559
560    if href.is_empty() {
561        output.push_str("<>");
562    } else if href.contains(' ') || href.contains('\n') {
563        output.push('<');
564        output.push_str(href);
565        output.push('>');
566    } else {
567        let open_count = href.chars().filter(|&c| c == '(').count();
568        let close_count = href.chars().filter(|&c| c == ')').count();
569
570        if open_count == close_count {
571            output.push_str(href);
572        } else {
573            let escaped_href = href.replace("(", "\\(").replace(")", "\\)");
574            output.push_str(&escaped_href);
575        }
576    }
577
578    if let Some(title_text) = title {
579        output.push_str(" \"");
580        if title_text.contains('"') {
581            let escaped_title = title_text.replace('"', "\\\"");
582            output.push_str(&escaped_title);
583        } else {
584            output.push_str(title_text);
585        }
586        output.push('"');
587    } else if options.default_title && raw_text == href {
588        output.push_str(" \"");
589        if href.contains('"') {
590            let escaped_href = href.replace('"', "\\\"");
591            output.push_str(&escaped_href);
592        } else {
593            output.push_str(href);
594        }
595        output.push('"');
596    }
597
598    output.push(')');
599}
600
/// Map a lowercase heading tag name (`"h1"` .. `"h6"`) to its level.
///
/// Returns `None` for any other input; matching is exact and case-sensitive.
fn heading_level_from_name(name: &str) -> Option<usize> {
    let level_part = name.strip_prefix('h')?;
    match level_part.as_bytes() {
        [digit @ b'1'..=b'6'] => Some(usize::from(digit - b'0')),
        _ => None,
    }
}
612
613fn find_single_heading_child(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<(usize, tl::NodeHandle)> {
614    let node = node_handle.get(parser)?;
615
616    let tl::Node::Tag(tag) = node else {
617        return None;
618    };
619
620    let children = tag.children();
621    let mut heading_data: Option<(usize, tl::NodeHandle)> = None;
622
623    for child_handle in children.top().iter() {
624        let Some(child_node) = child_handle.get(parser) else {
625            continue;
626        };
627
628        match child_node {
629            tl::Node::Raw(bytes) => {
630                if !bytes.as_utf8_str().trim().is_empty() {
631                    return None;
632                }
633            }
634            tl::Node::Tag(child_tag) => {
635                let name = normalized_tag_name(child_tag.name().as_utf8_str());
636                if let Some(level) = heading_level_from_name(name.as_ref()) {
637                    if heading_data.is_some() {
638                        return None;
639                    }
640                    heading_data = Some((level, *child_handle));
641                } else {
642                    return None;
643                }
644            }
645            _ => return None,
646        }
647    }
648
649    heading_data
650}
651
652fn push_heading(output: &mut String, ctx: &Context, options: &ConversionOptions, level: usize, text: &str) {
653    if text.is_empty() {
654        return;
655    }
656
657    if ctx.convert_as_inline {
658        output.push_str(text);
659        return;
660    }
661
662    if ctx.in_table_cell {
663        let is_table_continuation =
664            !output.is_empty() && !output.ends_with('|') && !output.ends_with(' ') && !output.ends_with("<br>");
665        if is_table_continuation {
666            output.push_str("<br>");
667        }
668        output.push_str(text);
669        return;
670    }
671
672    if ctx.in_list_item {
673        if output.ends_with('\n') {
674            if let Some(indent) = continuation_indent_string(ctx.list_depth, options) {
675                output.push_str(&indent);
676            }
677        } else if !output.ends_with(' ') && !output.is_empty() {
678            output.push(' ');
679        }
680    } else if !output.is_empty() && !output.ends_with("\n\n") {
681        if output.ends_with('\n') {
682            output.push('\n');
683        } else {
684            trim_trailing_whitespace(output);
685            output.push_str("\n\n");
686        }
687    }
688
689    let heading_suffix = if ctx.in_list_item || ctx.blockquote_depth > 0 {
690        "\n"
691    } else {
692        "\n\n"
693    };
694
695    match options.heading_style {
696        HeadingStyle::Underlined => {
697            if level == 1 {
698                output.push_str(text);
699                output.push('\n');
700                output.push_str(&"=".repeat(text.len()));
701                output.push_str(heading_suffix);
702            } else if level == 2 {
703                output.push_str(text);
704                output.push('\n');
705                output.push_str(&"-".repeat(text.len()));
706                output.push_str(heading_suffix);
707            } else {
708                output.push_str(&"#".repeat(level));
709                output.push(' ');
710                output.push_str(text);
711                output.push_str(heading_suffix);
712            }
713        }
714        HeadingStyle::Atx => {
715            output.push_str(&"#".repeat(level));
716            output.push(' ');
717            output.push_str(text);
718            output.push_str(heading_suffix);
719        }
720        HeadingStyle::AtxClosed => {
721            output.push_str(&"#".repeat(level));
722            output.push(' ');
723            output.push_str(text);
724            output.push(' ');
725            output.push_str(&"#".repeat(level));
726            output.push_str(heading_suffix);
727        }
728    }
729}
730
/// Collapse line breaks inside heading text so the heading fits on one line.
///
/// Text without `'\n'` or `'\r'` is returned borrowed and unchanged. Otherwise
/// each run of line breaks, together with the spaces and tabs that follow it,
/// is replaced by a single space (no space is added if the text before the
/// break already ends with one). Line breaks at the very start or end of the
/// text are dropped.
fn normalize_heading_text<'a>(text: &'a str) -> Cow<'a, str> {
    let has_line_break = text.contains('\n') || text.contains('\r');
    if !has_line_break {
        return Cow::Borrowed(text);
    }

    let mut collapsed = String::with_capacity(text.len());
    // Set once a line break has been seen after some content; cleared when the
    // next non-blank character arrives.
    let mut after_break = false;

    for ch in text.chars() {
        if ch == '\n' || ch == '\r' {
            if !collapsed.is_empty() {
                after_break = true;
            }
            continue;
        }

        if after_break {
            if ch == ' ' || ch == '\t' {
                continue;
            }
            if !collapsed.ends_with(' ') {
                collapsed.push(' ');
            }
            after_break = false;
        }

        collapsed.push(ch);
    }

    Cow::Owned(collapsed)
}
761
762fn build_dom_context(dom: &tl::VDom, parser: &tl::Parser) -> DomContext {
763    let mut ctx = DomContext {
764        parent_map: HashMap::new(),
765        children_map: HashMap::new(),
766        root_children: dom.children().to_vec(),
767        node_map: HashMap::new(),
768    };
769
770    for child_handle in dom.children().iter() {
771        record_node_hierarchy(child_handle, None, parser, &mut ctx);
772    }
773
774    ctx
775}
776
777/// Detect block elements that were incorrectly nested under inline ancestors.
778fn has_inline_block_misnest(dom_ctx: &DomContext, parser: &tl::Parser) -> bool {
779    for handle in dom_ctx.node_map.values() {
780        if let Some(tl::Node::Tag(tag)) = handle.get(parser) {
781            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
782            if is_block_level_element(tag_name.as_ref()) {
783                let mut current = dom_ctx.parent_map.get(&handle.get_inner()).and_then(|p| *p);
784                while let Some(parent_id) = current {
785                    if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
786                        if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
787                            let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
788                            if is_inline_element(parent_name.as_ref()) {
789                                return true;
790                            }
791                        }
792                    }
793                    current = dom_ctx.parent_map.get(&parent_id).and_then(|p| *p);
794                }
795            }
796        }
797    }
798
799    false
800}
801
802/// Round-trip HTML through html5ever to repair malformed trees.
803fn repair_with_html5ever(input: &str) -> Option<String> {
804    use html5ever::serialize::{SerializeOpts, serialize};
805    use html5ever::tendril::TendrilSink;
806    use markup5ever_rcdom::{RcDom, SerializableHandle};
807
808    let dom = html5ever::parse_document(RcDom::default(), Default::default())
809        .from_utf8()
810        .read_from(&mut input.as_bytes())
811        .ok()?;
812
813    let mut buf = Vec::with_capacity(input.len());
814    let handle = SerializableHandle::from(dom.document.clone());
815    serialize(&mut buf, &handle, SerializeOpts::default()).ok()?;
816    String::from_utf8(buf).ok()
817}
818
819fn record_node_hierarchy(node_handle: &tl::NodeHandle, parent: Option<u32>, parser: &tl::Parser, ctx: &mut DomContext) {
820    let id = node_handle.get_inner();
821    ctx.parent_map.insert(id, parent);
822    ctx.node_map.insert(id, *node_handle);
823
824    if let Some(node) = node_handle.get(parser) {
825        if let tl::Node::Tag(tag) = node {
826            let children: Vec<_> = tag.children().top().iter().copied().collect();
827            ctx.children_map.insert(id, children.clone());
828            for child in children {
829                record_node_hierarchy(&child, Some(id), parser, ctx);
830            }
831        }
832    }
833}
834
835/// Check if a document is an hOCR (HTML-based OCR) document.
836///
837/// hOCR documents should have metadata extraction disabled to avoid
838/// including OCR metadata (system info, capabilities, etc.) in output.
839///
840/// Detection criteria:
841/// - meta tag with name="ocr-system" or name="ocr-capabilities"
842/// - Elements with classes: ocr_page, ocrx_word, ocr_carea, ocr_par, ocr_line
843fn is_hocr_document(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
844    fn check_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
845        if let Some(node) = node_handle.get(parser) {
846            match node {
847                tl::Node::Tag(tag) => {
848                    let tag_name = normalized_tag_name(tag.name().as_utf8_str());
849
850                    if tag_name == "meta" {
851                        if let Some(name_attr) = tag.attributes().get("name") {
852                            if let Some(name_bytes) = name_attr {
853                                let name_value = name_bytes.as_utf8_str();
854                                if name_value == "ocr-system" || name_value == "ocr-capabilities" {
855                                    return true;
856                                }
857                            }
858                        }
859                    }
860
861                    if let Some(class_attr) = tag.attributes().get("class") {
862                        if let Some(class_bytes) = class_attr {
863                            let class_value = class_bytes.as_utf8_str();
864                            if class_value.contains("ocr_page")
865                                || class_value.contains("ocrx_word")
866                                || class_value.contains("ocr_carea")
867                                || class_value.contains("ocr_par")
868                                || class_value.contains("ocr_line")
869                            {
870                                return true;
871                            }
872                        }
873                    }
874
875                    let children = tag.children();
876                    {
877                        for child_handle in children.top().iter() {
878                            if check_node(child_handle, parser) {
879                                return true;
880                            }
881                        }
882                    }
883                    false
884                }
885                _ => false,
886            }
887        } else {
888            false
889        }
890    }
891
892    check_node(node_handle, parser)
893}
894
895/// Extract metadata from HTML document head.
896///
897/// Extracts comprehensive document metadata including:
898/// - title: Document title from <title> tag
899/// - meta tags: description, keywords, author, etc.
900/// - Open Graph tags: og:title, og:description, og:image, etc.
901/// - Twitter Card tags: twitter:card, twitter:title, etc.
902/// - base-href: Base URL from <base> tag
903/// - canonical: Canonical URL from <link rel="canonical">
904/// - link relations: author, license, alternate links
905fn extract_metadata(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> BTreeMap<String, String> {
906    let mut metadata = BTreeMap::new();
907
908    fn find_head(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> Option<tl::NodeHandle> {
909        if let Some(node) = node_handle.get(parser) {
910            if let tl::Node::Tag(tag) = node {
911                if tag_name_eq(tag.name().as_utf8_str(), "head") {
912                    return Some(*node_handle);
913                }
914                let children = tag.children();
915                {
916                    for child_handle in children.top().iter() {
917                        if let Some(result) = find_head(child_handle, parser) {
918                            return Some(result);
919                        }
920                    }
921                }
922            }
923        }
924        None
925    }
926
927    let head_handle = match find_head(node_handle, parser) {
928        Some(h) => h,
929        None => return metadata,
930    };
931
932    if let Some(head_node) = head_handle.get(parser) {
933        if let tl::Node::Tag(head_tag) = head_node {
934            let children = head_tag.children();
935            {
936                for child_handle in children.top().iter() {
937                    if let Some(child_node) = child_handle.get(parser) {
938                        if let tl::Node::Tag(child_tag) = child_node {
939                            let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
940
941                            match tag_name.as_ref() {
942                                "title" => {
943                                    let title_children = child_tag.children();
944                                    {
945                                        if let Some(first_child) = title_children.top().iter().next() {
946                                            if let Some(text_node) = first_child.get(parser) {
947                                                if let tl::Node::Raw(bytes) = text_node {
948                                                    let title = text::normalize_whitespace(&bytes.as_utf8_str())
949                                                        .trim()
950                                                        .to_string();
951                                                    if !title.is_empty() {
952                                                        metadata.insert("title".to_string(), title);
953                                                    }
954                                                }
955                                            }
956                                        }
957                                    }
958                                }
959                                "base" => {
960                                    if let Some(href_attr) = child_tag.attributes().get("href") {
961                                        if let Some(href_bytes) = href_attr {
962                                            let href = href_bytes.as_utf8_str().to_string();
963                                            if !href.is_empty() {
964                                                metadata.insert("base-href".to_string(), href);
965                                            }
966                                        }
967                                    }
968                                }
969                                "meta" => {
970                                    let mut name_attr = None;
971                                    let mut property_attr = None;
972                                    let mut http_equiv_attr = None;
973                                    let mut content_attr = None;
974
975                                    if let Some(attr) = child_tag.attributes().get("name") {
976                                        if let Some(bytes) = attr {
977                                            name_attr = Some(bytes.as_utf8_str().to_string());
978                                        }
979                                    }
980                                    if let Some(attr) = child_tag.attributes().get("property") {
981                                        if let Some(bytes) = attr {
982                                            property_attr = Some(bytes.as_utf8_str().to_string());
983                                        }
984                                    }
985                                    if let Some(attr) = child_tag.attributes().get("http-equiv") {
986                                        if let Some(bytes) = attr {
987                                            http_equiv_attr = Some(bytes.as_utf8_str().to_string());
988                                        }
989                                    }
990                                    if let Some(attr) = child_tag.attributes().get("content") {
991                                        if let Some(bytes) = attr {
992                                            content_attr = Some(bytes.as_utf8_str().to_string());
993                                        }
994                                    }
995
996                                    if let Some(content) = content_attr {
997                                        if let Some(name) = name_attr {
998                                            let key = format!("meta-{}", name.to_lowercase());
999                                            metadata.insert(key, content);
1000                                        } else if let Some(property) = property_attr {
1001                                            let key = format!("meta-{}", property.to_lowercase().replace(':', "-"));
1002                                            metadata.insert(key, content);
1003                                        } else if let Some(http_equiv) = http_equiv_attr {
1004                                            let key = format!("meta-{}", http_equiv.to_lowercase());
1005                                            metadata.insert(key, content);
1006                                        }
1007                                    }
1008                                }
1009                                "link" => {
1010                                    let mut rel_attr = None;
1011                                    let mut href_attr = None;
1012
1013                                    if let Some(attr) = child_tag.attributes().get("rel") {
1014                                        if let Some(bytes) = attr {
1015                                            rel_attr = Some(bytes.as_utf8_str().to_string());
1016                                        }
1017                                    }
1018                                    if let Some(attr) = child_tag.attributes().get("href") {
1019                                        if let Some(bytes) = attr {
1020                                            href_attr = Some(bytes.as_utf8_str().to_string());
1021                                        }
1022                                    }
1023
1024                                    if let (Some(rel), Some(href)) = (rel_attr, href_attr) {
1025                                        let rel_lower = rel.to_lowercase();
1026                                        match rel_lower.as_str() {
1027                                            "canonical" => {
1028                                                metadata.insert("canonical".to_string(), href);
1029                                            }
1030                                            "author" | "license" | "alternate" => {
1031                                                metadata.insert(format!("link-{}", rel_lower), href);
1032                                            }
1033                                            _ => {}
1034                                        }
1035                                    }
1036                                }
1037                                _ => {}
1038                            }
1039                        }
1040                    }
1041                }
1042            }
1043        }
1044    }
1045
1046    metadata
1047}
1048
/// Format metadata as YAML frontmatter.
///
/// Returns an empty string for an empty map. Otherwise the result is a `---` line,
/// one `key: value` line per entry in the map's key order, a closing `---` line and
/// a trailing blank line.
///
/// A value is emitted as a double-quoted scalar when it contains `:`, `#`, `[`, `]`
/// or a line break, or when it starts with a quote character. Inside the quotes,
/// backslashes, double quotes and line breaks are escaped so every entry stays on a
/// single line; a raw line break would otherwise split the entry or let a value
/// inject a `---` terminator. All other values are written verbatim.
fn format_metadata_frontmatter(metadata: &BTreeMap<String, String>) -> String {
    if metadata.is_empty() {
        return String::new();
    }

    let mut frontmatter = String::from("---\n");
    for (key, value) in metadata {
        frontmatter.push_str(key);
        frontmatter.push_str(": ");

        let needs_quotes = value.contains([':', '#', '[', ']', '\n', '\r']) || value.starts_with(['"', '\'']);
        if needs_quotes {
            frontmatter.push('"');
            for ch in value.chars() {
                match ch {
                    '\\' => frontmatter.push_str("\\\\"),
                    '"' => frontmatter.push_str("\\\""),
                    // Keep the entry on one line; these are valid double-quoted YAML escapes.
                    '\n' => frontmatter.push_str("\\n"),
                    '\r' => frontmatter.push_str("\\r"),
                    other => frontmatter.push(other),
                }
            }
            frontmatter.push('"');
        } else {
            frontmatter.push_str(value);
        }
        frontmatter.push('\n');
    }
    frontmatter.push_str("---\n\n");

    frontmatter
}
1070
1071/// Check if a handle is an empty inline element (abbr, var, ins, dfn, etc. with no text content).
1072fn is_empty_inline_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
1073    const EMPTY_WHEN_NO_CONTENT_TAGS: &[&str] = &[
1074        "abbr", "var", "ins", "dfn", "time", "data", "cite", "q", "mark", "small", "u",
1075    ];
1076
1077    if let Some(node) = node_handle.get(parser) {
1078        if let tl::Node::Tag(tag) = node {
1079            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1080            if EMPTY_WHEN_NO_CONTENT_TAGS.contains(&tag_name.as_ref()) {
1081                return get_text_content(node_handle, parser).trim().is_empty();
1082            }
1083        }
1084    }
1085    false
1086}
1087
1088/// Get the text content of a node and its children.
1089fn get_text_content(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1090    let mut text = String::with_capacity(64);
1091    if let Some(node) = node_handle.get(parser) {
1092        match node {
1093            tl::Node::Raw(bytes) => {
1094                text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1095            }
1096            tl::Node::Tag(tag) => {
1097                let children = tag.children();
1098                {
1099                    for child_handle in children.top().iter() {
1100                        text.push_str(&get_text_content(child_handle, parser));
1101                    }
1102                }
1103            }
1104            _ => {}
1105        }
1106    }
1107    text
1108}
1109
1110/// Collect inline text for link labels, skipping block-level descendants.
1111fn collect_link_label_text(children: &[tl::NodeHandle], parser: &tl::Parser) -> (String, Vec<tl::NodeHandle>, bool) {
1112    let mut text = String::new();
1113    let mut saw_block = false;
1114    let mut block_nodes = Vec::new();
1115    let mut stack: Vec<_> = children.iter().rev().copied().collect();
1116
1117    while let Some(handle) = stack.pop() {
1118        if let Some(node) = handle.get(parser) {
1119            match node {
1120                tl::Node::Raw(bytes) => {
1121                    text.push_str(&text::decode_html_entities(&bytes.as_utf8_str()));
1122                }
1123                tl::Node::Tag(tag) => {
1124                    let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1125                    if is_block_level_element(tag_name.as_ref()) {
1126                        saw_block = true;
1127                        block_nodes.push(handle);
1128                        continue;
1129                    }
1130
1131                    let tag_children = tag.children();
1132                    {
1133                        let mut child_nodes: Vec<_> = tag_children.top().iter().copied().collect();
1134                        child_nodes.reverse();
1135                        for child in child_nodes {
1136                            stack.push(child);
1137                        }
1138                    }
1139                }
1140                _ => {}
1141            }
1142        }
1143    }
1144
1145    (text, block_nodes, saw_block)
1146}
1147
1148fn normalize_link_label(label: &str) -> String {
1149    let collapsed = label
1150        .chars()
1151        .map(|ch| if ch == '\n' || ch == '\r' { ' ' } else { ch })
1152        .collect::<String>();
1153    text::normalize_whitespace(&collapsed).trim().to_string()
1154}
1155
1156/// Serialize an element to HTML string (for SVG and Math elements).
1157fn serialize_element(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1158    if let Some(node) = node_handle.get(parser) {
1159        if let tl::Node::Tag(tag) = node {
1160            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
1161            let mut html = String::with_capacity(256);
1162            html.push('<');
1163            html.push_str(&tag_name);
1164
1165            // Serialize attributes
1166            for (key, value_opt) in tag.attributes().iter() {
1167                html.push(' ');
1168                html.push_str(&key);
1169                if let Some(value) = value_opt {
1170                    html.push_str("=\"");
1171                    html.push_str(&value);
1172                    html.push('"');
1173                }
1174            }
1175
1176            let has_children = !tag.children().top().is_empty();
1177            if !has_children {
1178                html.push_str(" />");
1179            } else {
1180                html.push('>');
1181                let children = tag.children();
1182                {
1183                    for child_handle in children.top().iter() {
1184                        html.push_str(&serialize_node(child_handle, parser));
1185                    }
1186                }
1187                html.push_str("</");
1188                html.push_str(&tag_name);
1189                html.push('>');
1190            }
1191            return html;
1192        }
1193    }
1194    String::new()
1195}
1196
1197#[cfg(feature = "inline-images")]
/// Trim `value` and return it as an owned string, or `None` when nothing but
/// whitespace remains.
fn non_empty_trimmed(value: &str) -> Option<String> {
    Some(value.trim())
        .filter(|trimmed| !trimmed.is_empty())
        .map(str::to_string)
}
1206
/// Decode a base64 `data:` image URI and hand the result to the inline image collector.
///
/// Sources that do not start with `data:` (after trimming) are ignored without touching
/// the collector. Otherwise an index is reserved and the URI is validated step by step;
/// each failed check records a skip warning for that index and returns. Checks, in order:
/// separator, non-empty payload, scheme, MIME type and subtype (must be `image/*`),
/// `base64` marker, decodable non-empty payload, and the collector's size limit.
///
/// The description comes from `alt`, falling back to `title`. The filename candidate is
/// the first of the `data-filename`, `filename` and `data-name` attributes, falling back
/// to a `name=`/`filename=` parameter of the data URI header.
#[cfg(feature = "inline-images")]
fn handle_inline_data_image(
    collector_ref: &InlineCollectorHandle,
    src: &str,
    alt: &str,
    title: Option<&str>,
    attributes: BTreeMap<String, String>,
) {
    let data_uri = src.trim();
    if !data_uri.starts_with("data:") {
        return;
    }

    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    let (meta, payload) = match data_uri.split_once(',') {
        Some(parts) => parts,
        None => {
            collector.warn_skip(index, "missing data URI separator");
            return;
        }
    };

    if payload.trim().is_empty() {
        collector.warn_skip(index, "empty data URI payload");
        return;
    }

    let header = match meta.strip_prefix("data:") {
        Some(rest) => rest,
        None => {
            collector.warn_skip(index, "invalid data URI scheme");
            return;
        }
    };
    if header.is_empty() {
        collector.warn_skip(index, "missing MIME type");
        return;
    }

    // The header is `<mime>[;param]*`; the first segment is the MIME type.
    let mut params = header.split(';');
    let mime = params.next().unwrap_or("");
    let (top_level, subtype_raw) = match mime.split_once('/') {
        Some(parts) => parts,
        None => {
            collector.warn_skip(index, "missing MIME subtype");
            return;
        }
    };

    if !top_level.eq_ignore_ascii_case("image") {
        collector.warn_skip(index, format!("unsupported MIME type {mime}"));
        return;
    }

    let subtype_raw = subtype_raw.trim();
    if subtype_raw.is_empty() {
        collector.warn_skip(index, "missing MIME subtype");
        return;
    }
    let subtype_lower = subtype_raw.to_ascii_lowercase();

    // Remaining segments: the `base64` marker and an optional embedded file name.
    let mut is_base64 = false;
    let mut inline_name: Option<String> = None;
    for param in params {
        if param.eq_ignore_ascii_case("base64") {
            is_base64 = true;
            continue;
        }
        let named = param
            .strip_prefix("name=")
            .or_else(|| param.strip_prefix("filename="));
        if let Some(raw_name) = named {
            inline_name = non_empty_trimmed(raw_name.trim_matches('"'));
        }
    }

    if !is_base64 {
        collector.warn_skip(index, "missing base64 encoding marker");
        return;
    }

    use base64::{Engine as _, engine::general_purpose::STANDARD};

    let decoded = match STANDARD.decode(payload.trim()) {
        Ok(bytes) => bytes,
        Err(_) => {
            collector.warn_skip(index, "invalid base64 payload");
            return;
        }
    };

    if decoded.is_empty() {
        collector.warn_skip(index, "empty base64 payload");
        return;
    }

    let max_size = collector.max_decoded_size();
    if decoded.len() as u64 > max_size {
        collector.warn_skip(
            index,
            format!(
                "decoded payload ({} bytes) exceeds configured max ({})",
                decoded.len(),
                max_size
            ),
        );
        return;
    }

    let format = match subtype_lower.as_str() {
        "png" => InlineImageFormat::Png,
        "jpeg" | "jpg" => InlineImageFormat::Jpeg,
        "gif" => InlineImageFormat::Gif,
        "bmp" => InlineImageFormat::Bmp,
        "webp" => InlineImageFormat::Webp,
        "svg+xml" => InlineImageFormat::Svg,
        other => InlineImageFormat::Other(other.to_string()),
    };

    let description = match non_empty_trimmed(alt) {
        Some(text) => Some(text),
        None => title.and_then(non_empty_trimmed),
    };

    // Element attributes take precedence over a name embedded in the URI header.
    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|attr_name| attributes.get(*attr_name).cloned())
        .or(inline_name);

    let dimensions = collector.infer_dimensions(index, &decoded, &format);

    let image = collector.build_image(
        decoded,
        format,
        filename_candidate,
        description,
        dimensions,
        InlineImageSource::ImgDataUri,
        attributes,
    );

    collector.push_image(index, image);
}
1343
/// Serialize an inline `<svg>` element and hand it to the inline image collector.
///
/// Does nothing when the collector has SVG capture disabled. Otherwise an index is
/// reserved, the element is serialized, and the result is skipped with a warning when
/// it is empty or larger than the collector's size limit. The description comes from
/// the `aria-label` attribute, falling back to `title_opt`; the filename candidate is
/// the first of the `data-filename`, `filename` and `data-name` attributes.
#[cfg(feature = "inline-images")]
fn handle_inline_svg(
    collector_ref: &InlineCollectorHandle,
    node_handle: &tl::NodeHandle,
    parser: &tl::Parser,
    title_opt: Option<String>,
    attributes: BTreeMap<String, String>,
) {
    // The shared borrow ends with the condition, before the mutable borrow below.
    if !collector_ref.borrow().capture_svg() {
        return;
    }

    let mut collector = collector_ref.borrow_mut();
    let index = collector.next_index();

    let serialized = serialize_element(node_handle, parser);
    if serialized.is_empty() {
        collector.warn_skip(index, "unable to serialize SVG element");
        return;
    }

    let data = serialized.into_bytes();
    let max_size = collector.max_decoded_size();
    if data.len() as u64 > max_size {
        collector.warn_skip(
            index,
            format!(
                "serialized SVG payload ({} bytes) exceeds configured max ({})",
                data.len(),
                max_size
            ),
        );
        return;
    }

    let aria_label = attributes
        .get("aria-label")
        .and_then(|label| non_empty_trimmed(label.as_str()));
    let description = match aria_label {
        Some(label) => Some(label),
        None => title_opt.as_deref().and_then(non_empty_trimmed),
    };

    let filename_candidate = ["data-filename", "filename", "data-name"]
        .iter()
        .find_map(|attr_name| attributes.get(*attr_name).cloned());

    let image = collector.build_image(
        data,
        InlineImageFormat::Svg,
        filename_candidate,
        description,
        None,
        InlineImageSource::SvgElement,
        attributes,
    );

    collector.push_image(index, image);
}
1405
1406/// Serialize a node to HTML string.
1407fn serialize_node(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
1408    if let Some(node) = node_handle.get(parser) {
1409        match node {
1410            tl::Node::Raw(bytes) => bytes.as_utf8_str().to_string(),
1411            tl::Node::Tag(_) => serialize_element(node_handle, parser),
1412            _ => String::new(),
1413        }
1414    } else {
1415        String::new()
1416    }
1417}
1418
1419/// Convert HTML to Markdown using tl DOM parser.
1420pub fn convert_html(html: &str, options: &ConversionOptions) -> Result<String> {
1421    convert_html_impl(html, options, None, None)
1422}
1423
/// Convert HTML to Markdown while passing `collector` to the shared conversion as the
/// inline-image collector. No metadata collector is attached.
#[cfg(feature = "inline-images")]
pub(crate) fn convert_html_with_inline_collector(
    html: &str,
    options: &ConversionOptions,
    collector: InlineCollectorHandle,
) -> Result<String> {
    let inline_collector = Some(collector);
    convert_html_impl(html, options, inline_collector, None)
}
1432
/// Convert HTML to Markdown while passing `metadata_collector` to the shared conversion.
/// No inline-image collector is attached.
#[cfg(feature = "metadata")]
pub(crate) fn convert_html_with_metadata(
    html: &str,
    options: &ConversionOptions,
    metadata_collector: crate::metadata::MetadataCollectorHandle,
) -> Result<String> {
    let metadata_collector = Some(metadata_collector);
    convert_html_impl(html, options, None, metadata_collector)
}
1441
1442#[cfg_attr(not(feature = "inline-images"), allow(unused_variables))]
1443#[cfg_attr(not(feature = "metadata"), allow(unused_variables))]
1444fn convert_html_impl(
1445    html: &str,
1446    options: &ConversionOptions,
1447    inline_collector: Option<InlineCollectorHandle>,
1448    #[cfg(feature = "metadata")] metadata_collector: Option<crate::metadata::MetadataCollectorHandle>,
1449    #[cfg(not(feature = "metadata"))] _metadata_collector: Option<()>,
1450) -> Result<String> {
1451    // Normalize problematic HTML constructs before parsing
1452    let mut preprocessed = preprocess_html(html).into_owned();
1453    let mut preprocessed_len = preprocessed.len();
1454
1455    let parser_options = tl::ParserOptions::default();
1456    let mut dom_guard = unsafe {
1457        tl::parse_owned(preprocessed.clone(), parser_options)
1458            .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
1459    };
1460    let mut dom_ref = dom_guard.get_ref();
1461    let mut parser = dom_ref.parser();
1462    let mut dom_ctx = build_dom_context(dom_ref, parser);
1463    let mut output = String::with_capacity(preprocessed_len);
1464
1465    if has_inline_block_misnest(&dom_ctx, parser) {
1466        if let Some(repaired_html) = repair_with_html5ever(&preprocessed) {
1467            preprocessed = preprocess_html(&repaired_html).into_owned();
1468            preprocessed_len = preprocessed.len();
1469            dom_guard = unsafe {
1470                tl::parse_owned(preprocessed.clone(), parser_options)
1471                    .map_err(|_| crate::error::ConversionError::ParseError("Failed to parse HTML".to_string()))?
1472            };
1473            dom_ref = dom_guard.get_ref();
1474            parser = dom_ref.parser();
1475            dom_ctx = build_dom_context(dom_ref, parser);
1476            output = String::with_capacity(preprocessed_len);
1477        }
1478    }
1479
1480    // Check for hOCR document and extract metadata by checking all top-level children
1481    let mut is_hocr = false;
1482    for child_handle in dom_ref.children().iter() {
1483        if is_hocr_document(child_handle, parser) {
1484            is_hocr = true;
1485            break;
1486        }
1487    }
1488
1489    if options.extract_metadata && !options.convert_as_inline && !is_hocr {
1490        for child_handle in dom_ref.children().iter() {
1491            let metadata = extract_metadata(child_handle, parser);
1492            if !metadata.is_empty() {
1493                let metadata_frontmatter = format_metadata_frontmatter(&metadata);
1494                output.push_str(&metadata_frontmatter);
1495                break;
1496            }
1497        }
1498    }
1499
1500    if is_hocr {
1501        use crate::hocr::{convert_to_markdown_with_options as convert_hocr_to_markdown, extract_hocr_document};
1502
1503        let (elements, metadata) = extract_hocr_document(dom_ref, options.debug);
1504
1505        // Extract hOCR metadata as YAML frontmatter
1506        if options.extract_metadata && !options.convert_as_inline {
1507            let mut metadata_map = BTreeMap::new();
1508            if let Some(system) = metadata.ocr_system {
1509                metadata_map.insert("ocr-system".to_string(), system);
1510            }
1511            if !metadata.ocr_capabilities.is_empty() {
1512                metadata_map.insert("ocr-capabilities".to_string(), metadata.ocr_capabilities.join(", "));
1513            }
1514            if let Some(pages) = metadata.ocr_number_of_pages {
1515                metadata_map.insert("ocr-number-of-pages".to_string(), pages.to_string());
1516            }
1517            if !metadata.ocr_langs.is_empty() {
1518                metadata_map.insert("ocr-langs".to_string(), metadata.ocr_langs.join(", "));
1519            }
1520            if !metadata.ocr_scripts.is_empty() {
1521                metadata_map.insert("ocr-scripts".to_string(), metadata.ocr_scripts.join(", "));
1522            }
1523
1524            if !metadata_map.is_empty() {
1525                output.push_str(&format_metadata_frontmatter(&metadata_map));
1526            }
1527        }
1528
1529        let mut markdown = convert_hocr_to_markdown(&elements, true, options.hocr_spatial_tables);
1530
1531        if markdown.trim().is_empty() {
1532            return Ok(output);
1533        }
1534
1535        markdown.truncate(markdown.trim_end().len());
1536        output.push_str(&markdown);
1537        output.push('\n');
1538
1539        return Ok(output);
1540    }
1541
1542    // Extract head metadata if metadata collector is provided
1543    #[cfg(feature = "metadata")]
1544    if let Some(ref collector) = metadata_collector {
1545        if !is_hocr {
1546            for child_handle in dom_ref.children().iter() {
1547                let head_meta = extract_metadata(child_handle, parser);
1548                if !head_meta.is_empty() {
1549                    collector.borrow_mut().set_head_metadata(head_meta);
1550                    break;
1551                }
1552            }
1553        }
1554    }
1555
1556    // Extract html/body attributes for language and text direction if metadata collector is provided
1557    #[cfg(feature = "metadata")]
1558    if let Some(ref collector) = metadata_collector {
1559        for child_handle in dom_ref.children().iter() {
1560            if let Some(tl::Node::Tag(tag)) = child_handle.get(parser) {
1561                let tag_name = tag.name().as_utf8_str();
1562                if tag_name == "html" || tag_name == "body" {
1563                    if let Some(lang) = tag.attributes().get("lang") {
1564                        if let Some(lang_bytes) = lang {
1565                            let lang_str = lang_bytes.as_utf8_str();
1566                            collector.borrow_mut().set_language(lang_str.to_string());
1567                        }
1568                    }
1569                    if let Some(dir) = tag.attributes().get("dir") {
1570                        if let Some(dir_bytes) = dir {
1571                            let dir_str = dir_bytes.as_utf8_str();
1572                            collector.borrow_mut().set_text_direction(dir_str.to_string());
1573                        }
1574                    }
1575                }
1576            }
1577        }
1578    }
1579
1580    let ctx = Context {
1581        in_code: false,
1582        list_counter: 0,
1583        in_ordered_list: false,
1584        last_was_dt: false,
1585        blockquote_depth: 0,
1586        in_table_cell: false,
1587        convert_as_inline: options.convert_as_inline,
1588        inline_depth: 0,
1589        in_list_item: false,
1590        list_depth: 0,
1591        ul_depth: 0,
1592        in_list: false,
1593        loose_list: false,
1594        prev_item_had_blocks: false,
1595        in_heading: false,
1596        heading_tag: None,
1597        in_paragraph: false,
1598        in_ruby: false,
1599        in_strong: false,
1600        #[cfg(feature = "inline-images")]
1601        inline_collector: inline_collector.clone(),
1602        #[cfg(feature = "metadata")]
1603        metadata_collector: metadata_collector.clone(),
1604    };
1605
1606    // Walk all top-level children
1607    for child_handle in dom_ref.children().iter() {
1608        walk_node(child_handle, parser, &mut output, options, &ctx, 0, &dom_ctx);
1609    }
1610
1611    // Trim trailing spaces per line, then trim trailing blank lines but preserve final newline
1612    trim_line_end_whitespace(&mut output);
1613    let trimmed = output.trim_end_matches('\n');
1614    if trimmed.is_empty() {
1615        Ok(String::new())
1616    } else {
1617        Ok(format!("{}\n", trimmed))
1618    }
1619}
1620
1621fn preprocess_html(input: &str) -> Cow<'_, str> {
1622    const SELF_CLOSING: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];
1623    const TAGS: [&[u8]; 2] = [b"script", b"style"];
1624    const SVG: &[u8] = b"svg";
1625    const DOCTYPE: &[u8] = b"doctype";
1626    const EMPTY_COMMENT: &[u8] = b"<!---->";
1627
1628    let bytes = input.as_bytes();
1629    let len = bytes.len();
1630    if len == 0 {
1631        return Cow::Borrowed(input);
1632    }
1633
1634    let mut idx = 0;
1635    let mut last = 0;
1636    let mut output: Option<String> = None;
1637    let mut svg_depth = 0usize;
1638
1639    while idx < len {
1640        if bytes[idx] == b'<' {
1641            if bytes[idx..].starts_with(EMPTY_COMMENT) {
1642                let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1643                out.push_str(&input[last..idx]);
1644                out.push_str("<!-- -->");
1645                idx += EMPTY_COMMENT.len();
1646                last = idx;
1647                continue;
1648            }
1649
1650            let mut replaced = false;
1651            for (pattern, replacement) in &SELF_CLOSING {
1652                if bytes[idx..].starts_with(pattern) {
1653                    let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1654                    out.push_str(&input[last..idx]);
1655                    out.push_str(replacement);
1656                    idx += pattern.len();
1657                    last = idx;
1658                    replaced = true;
1659                    break;
1660                }
1661            }
1662            if replaced {
1663                continue;
1664            }
1665
1666            if matches_tag_start(bytes, idx + 1, SVG) {
1667                if let Some(open_end) = find_tag_end(bytes, idx + 1 + SVG.len()) {
1668                    svg_depth += 1;
1669                    idx = open_end;
1670                    continue;
1671                }
1672            } else if matches_end_tag_start(bytes, idx + 1, SVG) {
1673                if let Some(close_end) = find_tag_end(bytes, idx + 2 + SVG.len()) {
1674                    if svg_depth > 0 {
1675                        svg_depth = svg_depth.saturating_sub(1);
1676                    }
1677                    idx = close_end;
1678                    continue;
1679                }
1680            }
1681
1682            if svg_depth == 0 {
1683                let mut handled = false;
1684                for tag in TAGS {
1685                    if matches_tag_start(bytes, idx + 1, tag) {
1686                        if let Some(open_end) = find_tag_end(bytes, idx + 1 + tag.len()) {
1687                            let remove_end = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
1688                            let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1689                            out.push_str(&input[last..idx]);
1690                            out.push_str(&input[idx..open_end]);
1691                            out.push_str("</");
1692                            out.push_str(str::from_utf8(tag).unwrap());
1693                            out.push('>');
1694
1695                            last = remove_end;
1696                            idx = remove_end;
1697                            handled = true;
1698                        }
1699                    }
1700
1701                    if handled {
1702                        break;
1703                    }
1704                }
1705
1706                if handled {
1707                    continue;
1708                }
1709
1710                if idx + 2 < len && bytes[idx + 1] == b'!' {
1711                    let mut cursor = idx + 2;
1712                    while cursor < len && bytes[cursor].is_ascii_whitespace() {
1713                        cursor += 1;
1714                    }
1715
1716                    if cursor + DOCTYPE.len() <= len
1717                        && bytes[cursor..cursor + DOCTYPE.len()].eq_ignore_ascii_case(DOCTYPE)
1718                    {
1719                        if let Some(end) = find_tag_end(bytes, cursor + DOCTYPE.len()) {
1720                            let out = output.get_or_insert_with(|| String::with_capacity(input.len()));
1721                            out.push_str(&input[last..idx]);
1722                            last = end;
1723                            idx = end;
1724                            continue;
1725                        }
1726                    }
1727                }
1728            }
1729
1730            let is_valid_tag = if idx + 1 < len {
1731                match bytes[idx + 1] {
1732                    b'!' => {
1733                        idx + 2 < len
1734                            && (bytes[idx + 2] == b'-'
1735                                || bytes[idx + 2].is_ascii_alphabetic()
1736                                || bytes[idx + 2].is_ascii_uppercase())
1737                    }
1738                    b'/' => {
1739                        idx + 2 < len && (bytes[idx + 2].is_ascii_alphabetic() || bytes[idx + 2].is_ascii_uppercase())
1740                    }
1741                    b'?' => true,
1742                    c if c.is_ascii_alphabetic() || c.is_ascii_uppercase() => true,
1743                    _ => false,
1744                }
1745            } else {
1746                false
1747            };
1748
1749            if !is_valid_tag {
1750                let out = output.get_or_insert_with(|| String::with_capacity(input.len() + 4));
1751                out.push_str(&input[last..idx]);
1752                out.push_str("&lt;");
1753                idx += 1;
1754                last = idx;
1755                continue;
1756            }
1757        }
1758
1759        idx += 1;
1760    }
1761
1762    if let Some(mut out) = output {
1763        if last < len {
1764            out.push_str(&input[last..]);
1765        }
1766        Cow::Owned(out)
1767    } else {
1768        Cow::Borrowed(input)
1769    }
1770}
1771
/// Rewrite the literal sequences `<br/>`, `<hr/>` and `<img/>` to their
/// slash-less forms. Returns the input borrowed when none of them occurs.
#[cfg(test)]
fn normalize_self_closing_tags(input: &str) -> Cow<'_, str> {
    const REPLACEMENTS: [(&[u8], &str); 3] = [(b"<br/>", "<br>"), (b"<hr/>", "<hr>"), (b"<img/>", "<img>")];

    let bytes = input.as_bytes();
    // Allocated lazily on the first match so unchanged input stays borrowed.
    let mut normalized: Option<String> = None;
    let mut copied_upto = 0;
    let mut pos = 0;

    while pos < bytes.len() {
        let rest = &bytes[pos..];
        match REPLACEMENTS.iter().find(|(pattern, _)| rest.starts_with(pattern)) {
            Some((pattern, replacement)) => {
                let out = normalized.get_or_insert_with(|| String::with_capacity(input.len()));
                out.push_str(&input[copied_upto..pos]);
                out.push_str(replacement);
                pos += pattern.len();
                copied_upto = pos;
            }
            None => pos += 1,
        }
    }

    match normalized {
        Some(mut out) => {
            out.push_str(&input[copied_upto..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
1812
/// Escape `<` characters that cannot be the start of a tag.
///
/// A `<` is left alone when it is followed by an ASCII letter (`<div`), by `/`
/// plus a letter (`</div`), by `?` (`<?xml`), or by `!` plus either `-` or a
/// letter (`<!--`, `<!doctype`). Every other `<`, including one at the very
/// end of the input, is replaced by `&lt;`. `>` characters are never modified.
///
/// Returns the input borrowed when nothing needs escaping.
///
/// # Examples
///
/// - `1<2` becomes `1&lt;2`
/// - `<div>1<2</div>` becomes `<div>1&lt;2</div>`
/// - `a > b` remains unchanged
#[cfg(test)]
fn escape_malformed_angle_brackets(input: &str) -> Cow<'_, str> {
    let bytes = input.as_bytes();
    // Allocated on the first escape only.
    let mut escaped: Option<String> = None;
    let mut copied_upto = 0;

    for (pos, _) in bytes.iter().enumerate().filter(|&(_, &b)| b == b'<') {
        let starts_markup = match (bytes.get(pos + 1), bytes.get(pos + 2)) {
            // Comment or declaration such as DOCTYPE.
            (Some(b'!'), Some(third)) => *third == b'-' || third.is_ascii_alphabetic(),
            // Closing tag.
            (Some(b'/'), Some(third)) => third.is_ascii_alphabetic(),
            // XML declaration / processing instruction.
            (Some(b'?'), _) => true,
            // Opening tag (also rejects a lone `<!` or `</` at end of input).
            (Some(second), _) => second.is_ascii_alphabetic(),
            // `<` is the last byte of the input.
            (None, _) => false,
        };
        if starts_markup {
            continue;
        }

        let out = escaped.get_or_insert_with(|| String::with_capacity(input.len() + 4));
        out.push_str(&input[copied_upto..pos]);
        out.push_str("&lt;");
        copied_upto = pos + 1;
    }

    match escaped {
        Some(mut out) => {
            out.push_str(&input[copied_upto..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
1889
/// Return the tag name with ASCII letters lower-cased.
///
/// A name without ASCII uppercase letters is returned as-is (so a borrowed
/// name stays borrowed); otherwise an owned lower-cased copy is produced.
fn normalized_tag_name<'a>(raw: Cow<'a, str>) -> Cow<'a, str> {
    let has_upper = raw.bytes().any(|b| b.is_ascii_uppercase());
    if !has_upper {
        return raw;
    }

    match raw {
        Cow::Borrowed(name) => Cow::Owned(name.to_ascii_lowercase()),
        Cow::Owned(mut name) => {
            // Reuse the existing allocation.
            name.make_ascii_lowercase();
            Cow::Owned(name)
        }
    }
}
1899
/// Compare a tag name against `needle`, ignoring ASCII case.
fn tag_name_eq(name: Cow<'_, str>, needle: &str) -> bool {
    let candidate: &str = name.as_ref();
    candidate.eq_ignore_ascii_case(needle)
}
1903
1904fn should_drop_for_preprocessing(
1905    node_handle: &tl::NodeHandle,
1906    tag_name: &str,
1907    tag: &tl::HTMLTag,
1908    parser: &tl::Parser,
1909    dom_ctx: &DomContext,
1910    options: &ConversionOptions,
1911) -> bool {
1912    if !options.preprocessing.enabled {
1913        return false;
1914    }
1915
1916    if options.preprocessing.remove_navigation {
1917        let has_nav_hint = element_has_navigation_hint(tag);
1918
1919        if tag_name == "nav" {
1920            return true;
1921        }
1922
1923        if tag_name == "header" {
1924            let inside_semantic_content = has_semantic_content_ancestor(node_handle, parser, dom_ctx);
1925            if !inside_semantic_content {
1926                return true;
1927            }
1928            if has_nav_hint {
1929                return true;
1930            }
1931        } else if tag_name == "footer" || tag_name == "aside" {
1932            if has_nav_hint {
1933                return true;
1934            }
1935        } else if has_nav_hint && !matches!(tag_name, "main" | "article" | "html" | "body" | "head") {
1936            return true;
1937        }
1938    }
1939
1940    if options.preprocessing.remove_forms {
1941        if tag_name == "form" {
1942            let preserves_form = options.preserve_tags.iter().any(|t| t == "form");
1943            if !preserves_form {
1944                return true;
1945            }
1946        } else if matches!(
1947            tag_name,
1948            "button" | "select" | "textarea" | "label" | "fieldset" | "legend"
1949        ) {
1950            return true;
1951        }
1952    }
1953
1954    false
1955}
1956
1957fn has_semantic_content_ancestor(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
1958    let mut current_id = node_handle.get_inner();
1959    while let Some(parent_id) = dom_ctx.parent_map.get(&current_id).copied().flatten() {
1960        if let Some(parent_handle) = dom_ctx.node_map.get(&parent_id) {
1961            if let Some(tl::Node::Tag(parent_tag)) = parent_handle.get(parser) {
1962                let parent_name = normalized_tag_name(parent_tag.name().as_utf8_str());
1963                if matches!(parent_name.as_ref(), "main" | "article" | "section") {
1964                    return true;
1965                }
1966                if tag_has_main_semantics(parent_tag) {
1967                    return true;
1968                }
1969            }
1970        }
1971        current_id = parent_id;
1972    }
1973    false
1974}
1975
1976fn tag_has_main_semantics(tag: &tl::HTMLTag) -> bool {
1977    if let Some(role_attr) = tag.attributes().get("role") {
1978        if let Some(role) = role_attr {
1979            let lowered = role.as_utf8_str().to_ascii_lowercase();
1980            if matches!(lowered.as_str(), "main" | "article" | "document" | "region") {
1981                return true;
1982            }
1983        }
1984    }
1985
1986    if let Some(class_attr) = tag.attributes().get("class") {
1987        if let Some(class_bytes) = class_attr {
1988            let class_value = class_bytes.as_utf8_str().to_ascii_lowercase();
1989            const MAIN_CLASS_HINTS: &[&str] = &[
1990                "mw-body",
1991                "mw-parser-output",
1992                "content-body",
1993                "content-container",
1994                "article-body",
1995                "article-content",
1996                "main-content",
1997                "page-content",
1998                "entry-content",
1999                "post-content",
2000                "document-body",
2001            ];
2002            if MAIN_CLASS_HINTS.iter().any(|hint| class_value.contains(hint)) {
2003                return true;
2004            }
2005        }
2006    }
2007
2008    false
2009}
2010
2011fn element_has_navigation_hint(tag: &tl::HTMLTag) -> bool {
2012    if attribute_matches_any(tag, "role", &["navigation", "menubar", "tablist", "toolbar"]) {
2013        return true;
2014    }
2015
2016    if attribute_contains_any(
2017        tag,
2018        "aria-label",
2019        &["navigation", "menu", "contents", "table of contents", "toc"],
2020    ) {
2021        return true;
2022    }
2023
2024    const NAV_KEYWORDS: &[&str] = &[
2025        "nav",
2026        "navigation",
2027        "navbar",
2028        "breadcrumbs",
2029        "breadcrumb",
2030        "toc",
2031        "sidebar",
2032        "sidenav",
2033        "menu",
2034        "menubar",
2035        "mainmenu",
2036        "subnav",
2037        "tabs",
2038        "tablist",
2039        "toolbar",
2040        "pager",
2041        "pagination",
2042        "skipnav",
2043        "skip-link",
2044        "skiplinks",
2045        "site-nav",
2046        "site-menu",
2047        "site-header",
2048        "site-footer",
2049        "topbar",
2050        "bottombar",
2051        "masthead",
2052        "vector-nav",
2053        "vector-header",
2054        "vector-footer",
2055    ];
2056
2057    attribute_matches_any(tag, "class", NAV_KEYWORDS) || attribute_matches_any(tag, "id", NAV_KEYWORDS)
2058}
2059
2060fn attribute_matches_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2061    let Some(attr_value) = tag.attributes().get(attr) else {
2062        return false;
2063    };
2064    let Some(value) = attr_value else {
2065        return false;
2066    };
2067    let raw = value.as_utf8_str();
2068    raw.split_whitespace()
2069        .map(|token| {
2070            token
2071                .chars()
2072                .map(|c| match c {
2073                    '_' | ':' | '.' | '/' => '-',
2074                    _ => c,
2075                })
2076                .collect::<String>()
2077                .to_ascii_lowercase()
2078        })
2079        .filter(|token| !token.is_empty())
2080        .any(|token| keywords.iter().any(|kw| token == *kw))
2081}
2082
2083fn attribute_contains_any(tag: &tl::HTMLTag, attr: &str, keywords: &[&str]) -> bool {
2084    let Some(attr_value) = tag.attributes().get(attr) else {
2085        return false;
2086    };
2087    let Some(value) = attr_value else {
2088        return false;
2089    };
2090    let lower = value.as_utf8_str().to_ascii_lowercase();
2091    keywords.iter().any(|kw| lower.contains(*kw))
2092}
2093
2094/// Serialize a tag and its children back to HTML.
2095///
2096/// This is used for the preserve_tags feature to output original HTML for specific elements.
2097fn serialize_tag_to_html(handle: &tl::NodeHandle, parser: &tl::Parser) -> String {
2098    let mut html = String::new();
2099    serialize_node_to_html(handle, parser, &mut html);
2100    html
2101}
2102
2103/// Recursively serialize a node to HTML.
2104fn serialize_node_to_html(handle: &tl::NodeHandle, parser: &tl::Parser, output: &mut String) {
2105    match handle.get(parser) {
2106        Some(tl::Node::Tag(tag)) => {
2107            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2108
2109            // Opening tag
2110            output.push('<');
2111            output.push_str(&tag_name);
2112
2113            // Attributes
2114            for (key, value) in tag.attributes().iter() {
2115                output.push(' ');
2116                output.push_str(&key);
2117                if let Some(val) = value {
2118                    output.push_str("=\"");
2119                    output.push_str(&val);
2120                    output.push('"');
2121                }
2122            }
2123
2124            output.push('>');
2125
2126            // Children
2127            let children = tag.children();
2128            for child_handle in children.top().iter() {
2129                serialize_node_to_html(child_handle, parser, output);
2130            }
2131
2132            // Closing tag (skip for self-closing tags)
2133            if !matches!(
2134                tag_name.as_ref(),
2135                "br" | "hr"
2136                    | "img"
2137                    | "input"
2138                    | "meta"
2139                    | "link"
2140                    | "area"
2141                    | "base"
2142                    | "col"
2143                    | "embed"
2144                    | "param"
2145                    | "source"
2146                    | "track"
2147                    | "wbr"
2148            ) {
2149                output.push_str("</");
2150                output.push_str(&tag_name);
2151                output.push('>');
2152            }
2153        }
2154        Some(tl::Node::Raw(bytes)) => {
2155            if let Ok(text) = std::str::from_utf8(bytes.as_bytes()) {
2156                output.push_str(text);
2157            }
2158        }
2159        _ => {}
2160    }
2161}
2162
/// Remove the bodies of `<script>` and `<style>` elements that are not
/// nested inside `<svg>`.
///
/// The opening tag is kept and immediately followed by a synthetic closing
/// tag; if no matching closing tag exists, everything up to the end of the
/// input is dropped. Returns the input borrowed when nothing was removed.
#[cfg(test)]
fn strip_script_and_style_sections(input: &str) -> Cow<'_, str> {
    const RAW_TEXT_TAGS: [&[u8]; 2] = [b"script", b"style"];
    const SVG: &[u8] = b"svg";

    let bytes = input.as_bytes();
    let len = bytes.len();
    // Allocated on the first removal only.
    let mut stripped: Option<String> = None;
    let mut copied_upto = 0;
    let mut pos = 0;
    let mut svg_depth = 0usize;

    while pos < len {
        if bytes[pos] != b'<' {
            pos += 1;
            continue;
        }

        // SVG tags are skipped as a whole; only the nesting depth is tracked.
        let after_lt = pos + 1;
        if matches_tag_start(bytes, after_lt, SVG) {
            if let Some(open_end) = find_tag_end(bytes, after_lt + SVG.len()) {
                svg_depth += 1;
                pos = open_end;
                continue;
            }
        } else if matches_end_tag_start(bytes, after_lt, SVG) {
            if let Some(close_end) = find_tag_end(bytes, after_lt + 1 + SVG.len()) {
                svg_depth = svg_depth.saturating_sub(1);
                pos = close_end;
                continue;
            }
        }

        if svg_depth == 0 {
            // First script/style tag whose opening tag is properly terminated.
            let raw_text_open = RAW_TEXT_TAGS.iter().find_map(|tag| {
                if !matches_tag_start(bytes, after_lt, tag) {
                    return None;
                }
                find_tag_end(bytes, after_lt + tag.len()).map(|open_end| (*tag, open_end))
            });

            if let Some((tag, open_end)) = raw_text_open {
                let resume_at = find_closing_tag(bytes, open_end, tag).unwrap_or(len);
                let out = stripped.get_or_insert_with(|| String::with_capacity(input.len()));
                out.push_str(&input[copied_upto..open_end]);
                out.push_str("</");
                out.push_str(str::from_utf8(tag).unwrap());
                out.push('>');

                copied_upto = resume_at;
                pos = resume_at;
                continue;
            }
        }

        pos += 1;
    }

    match stripped {
        Some(mut out) => {
            out.push_str(&input[copied_upto..]);
            Cow::Owned(out)
        }
        None => Cow::Borrowed(input),
    }
}
2235
/// Report whether `bytes[start..]` begins with the tag name `tag`
/// (ASCII case-insensitive) followed by a tag-name terminator.
///
/// Accepted terminators are `>`, `/`, space, tab, LF, CR, or the end of the
/// input. Returns `false` when `start` is at or past the end of `bytes`.
fn matches_tag_start(bytes: &[u8], mut start: usize, tag: &[u8]) -> bool {
    if start >= bytes.len() {
        return false;
    }

    // `get` yields `None` when the name would run past the end of the input.
    let Some(candidate) = bytes.get(start..start + tag.len()) else {
        return false;
    };
    if !candidate.eq_ignore_ascii_case(tag) {
        return false;
    }

    start += tag.len();
    bytes
        .get(start)
        .map_or(true, |next| matches!(next, b'>' | b'/' | b' ' | b'\t' | b'\n' | b'\r'))
}
2257
/// Find the end of a tag, scanning forward from `idx`.
///
/// Returns the index just past the first `>` that is not inside a single- or
/// double-quoted run, or `None` if no such `>` exists. A quote only closes a
/// run opened by the same quote character.
fn find_tag_end(bytes: &[u8], idx: usize) -> Option<usize> {
    let mut open_quote: Option<u8> = None;

    for (pos, &byte) in bytes.iter().enumerate().skip(idx) {
        match (byte, open_quote) {
            (b'>', None) => return Some(pos + 1),
            (b'"' | b'\'', None) => open_quote = Some(byte),
            (b'"' | b'\'', Some(opened_with)) if opened_with == byte => open_quote = None,
            _ => {}
        }
    }

    None
}
2281
2282fn find_closing_tag(bytes: &[u8], mut idx: usize, tag: &[u8]) -> Option<usize> {
2283    let len = bytes.len();
2284    let mut depth = 1usize;
2285
2286    while idx < len {
2287        if bytes[idx] == b'<' {
2288            if matches_tag_start(bytes, idx + 1, tag) {
2289                if let Some(next) = find_tag_end(bytes, idx + 1 + tag.len()) {
2290                    depth += 1;
2291                    idx = next;
2292                    continue;
2293                }
2294            } else if matches_end_tag_start(bytes, idx + 1, tag) {
2295                if let Some(close) = find_tag_end(bytes, idx + 2 + tag.len()) {
2296                    depth -= 1;
2297                    if depth == 0 {
2298                        return Some(close);
2299                    }
2300                    idx = close;
2301                    continue;
2302                }
2303            }
2304        }
2305
2306        idx += 1;
2307    }
2308
2309    None
2310}
2311
2312fn matches_end_tag_start(bytes: &[u8], start: usize, tag: &[u8]) -> bool {
2313    if start >= bytes.len() || bytes[start] != b'/' {
2314        return false;
2315    }
2316    matches_tag_start(bytes, start + 1, tag)
2317}
2318
/// Report whether `tag_name` is one of the elements this converter treats as
/// inline. The comparison is case-sensitive against lower-case names.
fn is_inline_element(tag_name: &str) -> bool {
    match tag_name {
        // Text-level semantics
        "a" | "abbr" | "b" | "bdi" | "bdo" | "br" | "cite" | "code" | "data" | "dfn" | "em" | "i" | "kbd"
        | "mark" | "q" | "rp" | "rt" | "ruby" | "s" | "samp" | "small" | "span" | "strong" | "sub" | "sup"
        | "time" | "u" | "var" | "wbr" => true,
        // Edits
        "del" | "ins" => true,
        // Embedded content
        "img" | "map" | "area" | "audio" | "video" | "picture" | "source" | "track" | "embed" | "object"
        | "param" => true,
        // Form controls
        "input" | "label" | "button" | "select" | "textarea" | "output" | "progress" | "meter" => true,
        _ => false,
    }
}
2374
2375/// Check if an element is block-level (not inline).
2376fn is_block_level_element(tag_name: &str) -> bool {
2377    !is_inline_element(tag_name)
2378        && matches!(
2379            tag_name,
2380            "address"
2381                | "article"
2382                | "aside"
2383                | "blockquote"
2384                | "canvas"
2385                | "dd"
2386                | "div"
2387                | "dl"
2388                | "dt"
2389                | "fieldset"
2390                | "figcaption"
2391                | "figure"
2392                | "footer"
2393                | "form"
2394                | "h1"
2395                | "h2"
2396                | "h3"
2397                | "h4"
2398                | "h5"
2399                | "h6"
2400                | "header"
2401                | "hr"
2402                | "li"
2403                | "main"
2404                | "nav"
2405                | "ol"
2406                | "p"
2407                | "pre"
2408                | "section"
2409                | "table"
2410                | "tfoot"
2411                | "ul"
2412        )
2413}
2414
2415fn get_next_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2416    let id = node_handle.get_inner();
2417    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2418
2419    let siblings = if let Some(parent_id) = parent {
2420        dom_ctx.children_map.get(&parent_id)?
2421    } else {
2422        &dom_ctx.root_children
2423    };
2424
2425    let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2426
2427    for sibling in siblings.iter().skip(position + 1) {
2428        if let Some(node) = sibling.get(parser) {
2429            match node {
2430                tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2431                tl::Node::Raw(raw) => {
2432                    if !raw.as_utf8_str().trim().is_empty() {
2433                        return None;
2434                    }
2435                }
2436                _ => {}
2437            }
2438        }
2439    }
2440
2441    None
2442}
2443
2444fn get_previous_sibling_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> Option<String> {
2445    let id = node_handle.get_inner();
2446    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2447
2448    let siblings = if let Some(parent_id) = parent {
2449        dom_ctx.children_map.get(&parent_id)?
2450    } else {
2451        &dom_ctx.root_children
2452    };
2453
2454    let position = siblings.iter().position(|handle| handle.get_inner() == id)?;
2455
2456    for sibling in siblings.iter().take(position).rev() {
2457        if let Some(node) = sibling.get(parser) {
2458            match node {
2459                tl::Node::Tag(tag) => return Some(normalized_tag_name(tag.name().as_utf8_str()).into_owned()),
2460                tl::Node::Raw(raw) => {
2461                    if !raw.as_utf8_str().trim().is_empty() {
2462                        return None;
2463                    }
2464                }
2465                _ => {}
2466            }
2467        }
2468    }
2469
2470    None
2471}
2472
2473fn previous_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2474    let id = node_handle.get_inner();
2475    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2476
2477    let siblings = if let Some(parent_id) = parent {
2478        if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2479            children
2480        } else {
2481            return false;
2482        }
2483    } else {
2484        &dom_ctx.root_children
2485    };
2486
2487    let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2488        return false;
2489    };
2490
2491    for sibling in siblings.iter().take(position).rev() {
2492        if let Some(node) = sibling.get(parser) {
2493            match node {
2494                tl::Node::Tag(tag) => {
2495                    let name = normalized_tag_name(tag.name().as_utf8_str());
2496                    return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2497                }
2498                tl::Node::Raw(raw) => {
2499                    if raw.as_utf8_str().trim().is_empty() {
2500                        continue;
2501                    }
2502                    return false;
2503                }
2504                _ => continue,
2505            }
2506        }
2507    }
2508
2509    false
2510}
2511
2512fn next_sibling_is_whitespace_text(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2513    let id = node_handle.get_inner();
2514    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2515
2516    let siblings = if let Some(parent_id) = parent {
2517        if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2518            children
2519        } else {
2520            return false;
2521        }
2522    } else {
2523        &dom_ctx.root_children
2524    };
2525
2526    let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2527        return false;
2528    };
2529
2530    for sibling in siblings.iter().skip(position + 1) {
2531        if let Some(node) = sibling.get(parser) {
2532            match node {
2533                tl::Node::Raw(raw) => return raw.as_utf8_str().trim().is_empty(),
2534                tl::Node::Tag(_) => return false,
2535                _ => continue,
2536            }
2537        }
2538    }
2539
2540    false
2541}
2542
2543fn next_sibling_is_inline_tag(node_handle: &tl::NodeHandle, parser: &tl::Parser, dom_ctx: &DomContext) -> bool {
2544    let id = node_handle.get_inner();
2545    let parent = dom_ctx.parent_map.get(&id).copied().flatten();
2546
2547    let siblings = if let Some(parent_id) = parent {
2548        if let Some(children) = dom_ctx.children_map.get(&parent_id) {
2549            children
2550        } else {
2551            return false;
2552        }
2553    } else {
2554        &dom_ctx.root_children
2555    };
2556
2557    let Some(position) = siblings.iter().position(|handle| handle.get_inner() == id) else {
2558        return false;
2559    };
2560
2561    for sibling in siblings.iter().skip(position + 1) {
2562        if let Some(node) = sibling.get(parser) {
2563            match node {
2564                tl::Node::Tag(tag) => {
2565                    let name = normalized_tag_name(tag.name().as_utf8_str());
2566                    return is_inline_element(name.as_ref()) || matches!(name.as_ref(), "script" | "style");
2567                }
2568                tl::Node::Raw(raw) => {
2569                    if raw.as_utf8_str().trim().is_empty() {
2570                        continue;
2571                    }
2572                    return false;
2573                }
2574                _ => continue,
2575            }
2576        }
2577    }
2578
2579    false
2580}
2581
2582fn append_inline_suffix(
2583    output: &mut String,
2584    suffix: &str,
2585    has_core_content: bool,
2586    node_handle: &tl::NodeHandle,
2587    parser: &tl::Parser,
2588    dom_ctx: &DomContext,
2589) {
2590    if suffix.is_empty() {
2591        return;
2592    }
2593
2594    if suffix == " " && has_core_content && next_sibling_is_whitespace_text(node_handle, parser, dom_ctx) {
2595        return;
2596    }
2597
2598    output.push_str(suffix);
2599}
2600
2601/// Recursively walk DOM nodes and convert to Markdown.
2602#[allow(clippy::only_used_in_recursion)]
2603fn walk_node(
2604    node_handle: &tl::NodeHandle,
2605    parser: &tl::Parser,
2606    output: &mut String,
2607    options: &ConversionOptions,
2608    ctx: &Context,
2609    depth: usize,
2610    dom_ctx: &DomContext,
2611) {
2612    let Some(node) = node_handle.get(parser) else { return };
2613
2614    match node {
2615        tl::Node::Raw(bytes) => {
2616            let mut text = text::decode_html_entities(&bytes.as_utf8_str());
2617
2618            if text.is_empty() {
2619                return;
2620            }
2621
2622            if options.strip_newlines {
2623                text = text.replace(['\r', '\n'], " ");
2624            }
2625
2626            if text.trim().is_empty() {
2627                if ctx.in_code {
2628                    output.push_str(&text);
2629                    return;
2630                }
2631
2632                if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2633                    if ctx.convert_as_inline || ctx.in_table_cell || ctx.in_list_item {
2634                        output.push_str(&text);
2635                        return;
2636                    }
2637                    if text.contains("\n\n") || text.contains("\r\n\r\n") {
2638                        if !output.ends_with("\n\n") {
2639                            output.push('\n');
2640                        }
2641                        return;
2642                    }
2643                    output.push_str(&text);
2644                    return;
2645                }
2646
2647                if text.contains('\n') {
2648                    if output.is_empty() {
2649                        return;
2650                    }
2651                    if !output.ends_with("\n\n") {
2652                        if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2653                            if is_inline_element(&next_tag) {
2654                                return;
2655                            }
2656                        }
2657                    }
2658                    return;
2659                }
2660
2661                if previous_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2662                    && next_sibling_is_inline_tag(node_handle, parser, dom_ctx)
2663                {
2664                    if text.chars().count() > 1 {
2665                        if !output.ends_with(' ') {
2666                            output.push(' ');
2667                        }
2668                    } else {
2669                        output.push_str(&text);
2670                    }
2671                } else {
2672                    output.push_str(&text);
2673                }
2674                return;
2675            }
2676
2677            let processed_text = if ctx.in_code || ctx.in_ruby {
2678                text
2679            } else if ctx.in_table_cell {
2680                let escaped = if options.whitespace_mode == crate::options::WhitespaceMode::Normalized {
2681                    let normalized_text = text::normalize_whitespace(&text);
2682                    text::escape(
2683                        &normalized_text,
2684                        options.escape_misc,
2685                        options.escape_asterisks,
2686                        options.escape_underscores,
2687                        options.escape_ascii,
2688                    )
2689                } else {
2690                    text::escape(
2691                        &text,
2692                        options.escape_misc,
2693                        options.escape_asterisks,
2694                        options.escape_underscores,
2695                        options.escape_ascii,
2696                    )
2697                };
2698                // Always escape pipes in table cells (unless escape_misc already did it)
2699                if options.escape_misc {
2700                    escaped
2701                } else {
2702                    escaped.replace('|', r"\|")
2703                }
2704            } else if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
2705                text::escape(
2706                    &text,
2707                    options.escape_misc,
2708                    options.escape_asterisks,
2709                    options.escape_underscores,
2710                    options.escape_ascii,
2711                )
2712            } else {
2713                let has_trailing_single_newline =
2714                    text.ends_with('\n') && !text.ends_with("\n\n") && !text.ends_with("\r\n\r\n");
2715
2716                let normalized_text = text::normalize_whitespace(&text);
2717
2718                let (prefix, suffix, core) = text::chomp(&normalized_text);
2719
2720                let skip_prefix = output.ends_with("\n\n")
2721                    || output.ends_with("* ")
2722                    || output.ends_with("- ")
2723                    || output.ends_with(". ")
2724                    || output.ends_with("] ")
2725                    || (output.ends_with('\n') && prefix == " ")
2726                    || (output.ends_with(' ')
2727                        && prefix == " "
2728                        && !previous_sibling_is_inline_tag(node_handle, parser, dom_ctx));
2729
2730                let mut final_text = String::new();
2731                if !skip_prefix && !prefix.is_empty() {
2732                    final_text.push_str(prefix);
2733                }
2734
2735                let escaped_core = text::escape(
2736                    core,
2737                    options.escape_misc,
2738                    options.escape_asterisks,
2739                    options.escape_underscores,
2740                    options.escape_ascii,
2741                );
2742                final_text.push_str(&escaped_core);
2743
2744                if !suffix.is_empty() {
2745                    final_text.push_str(suffix);
2746                } else if has_trailing_single_newline {
2747                    let at_paragraph_break = output.ends_with("\n\n");
2748                    if options.debug {
2749                        eprintln!(
2750                            "[DEBUG] Text had trailing single newline that was chomped, at_paragraph_break={}",
2751                            at_paragraph_break
2752                        );
2753                    }
2754                    if !at_paragraph_break {
2755                        if text.contains("\n\n") || text.contains("\r\n\r\n") {
2756                            final_text.push('\n');
2757                        } else if let Some(next_tag) = get_next_sibling_tag(node_handle, parser, dom_ctx) {
2758                            if options.debug {
2759                                eprintln!("[DEBUG] Next sibling tag after newline: {}", next_tag);
2760                            }
2761                            if matches!(next_tag.as_str(), "span") {
2762                                // Collapse formatting newlines between inline siblings like span
2763                            } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2764                                final_text.push(' ');
2765                            } else {
2766                                final_text.push('\n');
2767                            }
2768                        } else if ctx.inline_depth > 0 || ctx.convert_as_inline || ctx.in_paragraph {
2769                            final_text.push(' ');
2770                        } else {
2771                            final_text.push('\n');
2772                        }
2773                    }
2774                }
2775
2776                final_text
2777            };
2778
2779            if ctx.in_list_item && processed_text.contains("\n\n") {
2780                let parts: Vec<&str> = processed_text.split("\n\n").collect();
2781                for (i, part) in parts.iter().enumerate() {
2782                    if i > 0 {
2783                        output.push_str("\n\n");
2784                        output.push_str(&" ".repeat(4 * ctx.list_depth));
2785                    }
2786                    output.push_str(part.trim());
2787                }
2788            } else {
2789                output.push_str(&processed_text);
2790            }
2791        }
2792
2793        tl::Node::Tag(tag) => {
2794            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
2795
2796            if should_drop_for_preprocessing(node_handle, tag_name.as_ref(), tag, parser, dom_ctx, options) {
2797                trim_trailing_whitespace(output);
2798                if options.debug {
2799                    eprintln!("[DEBUG] Dropping <{}> subtree due to preprocessing settings", tag_name);
2800                }
2801                return;
2802            }
2803
2804            if options.strip_tags.iter().any(|t| t.as_str() == tag_name) {
2805                let children = tag.children();
2806                {
2807                    for child_handle in children.top().iter() {
2808                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2809                    }
2810                }
2811                return;
2812            }
2813
2814            // Preserve tags: output original HTML
2815            if options.preserve_tags.iter().any(|t| t.as_str() == tag_name) {
2816                let html = serialize_tag_to_html(node_handle, parser);
2817                output.push_str(&html);
2818                return;
2819            }
2820
2821            // NEW: Extract lang/dir from html, head, or body tags
2822            #[cfg(feature = "metadata")]
2823            if matches!(tag_name.as_ref(), "html" | "head" | "body") {
2824                if let Some(ref collector) = ctx.metadata_collector {
2825                    let mut c = collector.borrow_mut();
2826
2827                    if let Some(lang) = tag.attributes().get("lang").flatten() {
2828                        c.set_language(lang.as_utf8_str().to_string());
2829                    }
2830
2831                    if let Some(dir) = tag.attributes().get("dir").flatten() {
2832                        c.set_text_direction(dir.as_utf8_str().to_string());
2833                    }
2834                }
2835            }
2836
2837            match tag_name.as_ref() {
2838                "h1" | "h2" | "h3" | "h4" | "h5" | "h6" => {
2839                    let level = tag_name.chars().last().and_then(|c| c.to_digit(10)).unwrap_or(1) as usize;
2840
2841                    let mut text = String::new();
2842                    let heading_ctx = Context {
2843                        in_heading: true,
2844                        convert_as_inline: true,
2845                        heading_tag: Some(tag_name.to_string()),
2846                        ..ctx.clone()
2847                    };
2848                    let children = tag.children();
2849                    {
2850                        for child_handle in children.top().iter() {
2851                            walk_node(
2852                                child_handle,
2853                                parser,
2854                                &mut text,
2855                                options,
2856                                &heading_ctx,
2857                                depth + 1,
2858                                dom_ctx,
2859                            );
2860                        }
2861                    }
2862                    let trimmed = text.trim();
2863                    if !trimmed.is_empty() {
2864                        let normalized = normalize_heading_text(trimmed);
2865                        push_heading(output, ctx, options, level, normalized.as_ref());
2866
2867                        // NEW: Collect header metadata
2868                        #[cfg(feature = "metadata")]
2869                        if let Some(ref collector) = ctx.metadata_collector {
2870                            let id = tag
2871                                .attributes()
2872                                .get("id")
2873                                .flatten()
2874                                .map(|v| v.as_utf8_str().to_string());
2875                            collector
2876                                .borrow_mut()
2877                                .add_header(level as u8, normalized.to_string(), id, depth, 0);
2878                        }
2879                    }
2880                }
2881
2882                "p" => {
2883                    let content_start_pos = output.len();
2884
2885                    let is_table_continuation =
2886                        ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
2887
2888                    let is_list_continuation = ctx.in_list_item
2889                        && !output.is_empty()
2890                        && !output.ends_with("* ")
2891                        && !output.ends_with("- ")
2892                        && !output.ends_with(". ");
2893
2894                    let after_code_block = output.ends_with("```\n");
2895                    let needs_leading_sep = !ctx.in_table_cell
2896                        && !ctx.in_list_item
2897                        && !ctx.convert_as_inline
2898                        && ctx.blockquote_depth == 0
2899                        && !output.is_empty()
2900                        && !output.ends_with("\n\n")
2901                        && !after_code_block;
2902
2903                    if is_table_continuation {
2904                        trim_trailing_whitespace(output);
2905                        output.push_str("<br>");
2906                    } else if is_list_continuation {
2907                        add_list_continuation_indent(output, ctx.list_depth, true, options);
2908                    } else if needs_leading_sep {
2909                        trim_trailing_whitespace(output);
2910                        output.push_str("\n\n");
2911                    }
2912
2913                    let p_ctx = Context {
2914                        in_paragraph: true,
2915                        ..ctx.clone()
2916                    };
2917
2918                    let children = tag.children();
2919                    {
2920                        let child_handles: Vec<_> = children.top().iter().collect();
2921                        for (i, child_handle) in child_handles.iter().enumerate() {
2922                            // Skip whitespace-only text nodes between empty inline elements
2923                            if let Some(node) = child_handle.get(parser) {
2924                                if let tl::Node::Raw(bytes) = node {
2925                                    let text = bytes.as_utf8_str();
2926                                    if text.trim().is_empty() && i > 0 && i < child_handles.len() - 1 {
2927                                        let prev = &child_handles[i - 1];
2928                                        let next = &child_handles[i + 1];
2929                                        if is_empty_inline_element(prev, parser)
2930                                            && is_empty_inline_element(next, parser)
2931                                        {
2932                                            continue;
2933                                        }
2934                                    }
2935                                }
2936                            }
2937                            walk_node(child_handle, parser, output, options, &p_ctx, depth + 1, dom_ctx);
2938                        }
2939                    }
2940
2941                    let has_content = output.len() > content_start_pos;
2942
2943                    if has_content && !ctx.convert_as_inline && !ctx.in_table_cell {
2944                        output.push_str("\n\n");
2945                    }
2946                }
2947
2948                "strong" | "b" => {
2949                    if ctx.in_code {
2950                        let children = tag.children();
2951                        {
2952                            for child_handle in children.top().iter() {
2953                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
2954                            }
2955                        }
2956                    } else {
2957                        let mut content = String::with_capacity(64);
2958                        let children = tag.children();
2959                        {
2960                            let strong_ctx = Context {
2961                                inline_depth: ctx.inline_depth + 1,
2962                                in_strong: true,
2963                                ..ctx.clone()
2964                            };
2965                            for child_handle in children.top().iter() {
2966                                walk_node(
2967                                    child_handle,
2968                                    parser,
2969                                    &mut content,
2970                                    options,
2971                                    &strong_ctx,
2972                                    depth + 1,
2973                                    dom_ctx,
2974                                );
2975                            }
2976                        }
2977                        let (prefix, suffix, trimmed) = chomp_inline(&content);
2978                        if !content.trim().is_empty() {
2979                            output.push_str(prefix);
2980                            if ctx.in_strong {
2981                                output.push_str(trimmed);
2982                            } else {
2983                                output.push(options.strong_em_symbol);
2984                                output.push(options.strong_em_symbol);
2985                                output.push_str(trimmed);
2986                                output.push(options.strong_em_symbol);
2987                                output.push(options.strong_em_symbol);
2988                            }
2989                            append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
2990                        } else if !content.is_empty() {
2991                            output.push_str(prefix);
2992                            append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
2993                        }
2994                    }
2995                }
2996
2997                "em" | "i" => {
2998                    if ctx.in_code {
2999                        let children = tag.children();
3000                        {
3001                            for child_handle in children.top().iter() {
3002                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3003                            }
3004                        }
3005                    } else {
3006                        let mut content = String::with_capacity(64);
3007                        let children = tag.children();
3008                        {
3009                            let em_ctx = Context {
3010                                inline_depth: ctx.inline_depth + 1,
3011                                ..ctx.clone()
3012                            };
3013                            for child_handle in children.top().iter() {
3014                                walk_node(child_handle, parser, &mut content, options, &em_ctx, depth + 1, dom_ctx);
3015                            }
3016                        }
3017                        let (prefix, suffix, trimmed) = chomp_inline(&content);
3018                        if !content.trim().is_empty() {
3019                            output.push_str(prefix);
3020                            output.push(options.strong_em_symbol);
3021                            output.push_str(trimmed);
3022                            output.push(options.strong_em_symbol);
3023                            append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3024                        } else if !content.is_empty() {
3025                            output.push_str(prefix);
3026                            append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3027                        } else if let Some(class_value) = tag
3028                            .attributes()
3029                            .get("class")
3030                            .and_then(|v| v.as_ref().map(|val| val.as_utf8_str().to_string()))
3031                        {
3032                            if class_value.contains("caret") && !output.ends_with(' ') {
3033                                output.push_str(" > ");
3034                            }
3035                        }
3036                    }
3037                }
3038
3039                "a" => {
3040                    const MAX_LINK_LABEL_LEN: usize = 512;
3041
3042                    let href_attr = tag
3043                        .attributes()
3044                        .get("href")
3045                        .flatten()
3046                        .map(|v| text::decode_html_entities(&v.as_utf8_str()));
3047                    let title = tag
3048                        .attributes()
3049                        .get("title")
3050                        .flatten()
3051                        .map(|v| v.as_utf8_str().to_string());
3052
3053                    if let Some(href) = href_attr {
3054                        let raw_text = text::normalize_whitespace(&get_text_content(node_handle, parser))
3055                            .trim()
3056                            .to_string();
3057
3058                        let is_autolink = options.autolinks
3059                            && !options.default_title
3060                            && !href.is_empty()
3061                            && (raw_text == href || (href.starts_with("mailto:") && raw_text == href[7..]));
3062
3063                        if is_autolink {
3064                            output.push('<');
3065                            if href.starts_with("mailto:") && raw_text == href[7..] {
3066                                output.push_str(&raw_text);
3067                            } else {
3068                                output.push_str(&href);
3069                            }
3070                            output.push('>');
3071                            return;
3072                        }
3073
3074                        if let Some((heading_level, heading_handle)) = find_single_heading_child(node_handle, parser) {
3075                            if let Some(heading_node) = heading_handle.get(parser) {
3076                                if let tl::Node::Tag(heading_tag) = heading_node {
3077                                    let heading_name =
3078                                        normalized_tag_name(heading_tag.name().as_utf8_str()).into_owned();
3079                                    let mut heading_text = String::new();
3080                                    let heading_ctx = Context {
3081                                        in_heading: true,
3082                                        convert_as_inline: true,
3083                                        heading_tag: Some(heading_name),
3084                                        ..ctx.clone()
3085                                    };
3086                                    walk_node(
3087                                        &heading_handle,
3088                                        parser,
3089                                        &mut heading_text,
3090                                        options,
3091                                        &heading_ctx,
3092                                        depth + 1,
3093                                        dom_ctx,
3094                                    );
3095                                    let trimmed_heading = heading_text.trim();
3096                                    if !trimmed_heading.is_empty() {
3097                                        let escaped_label = escape_link_label(trimmed_heading);
3098                                        let mut link_buffer = String::new();
3099                                        append_markdown_link(
3100                                            &mut link_buffer,
3101                                            &escaped_label,
3102                                            href.as_str(),
3103                                            title.as_deref(),
3104                                            raw_text.as_str(),
3105                                            options,
3106                                        );
3107                                        push_heading(output, ctx, options, heading_level, link_buffer.as_str());
3108                                        return;
3109                                    }
3110                                }
3111                            }
3112                        }
3113
3114                        let children: Vec<_> = tag.children().top().iter().copied().collect();
3115                        let (inline_label, _block_nodes, saw_block) = collect_link_label_text(&children, parser);
3116                        let mut label = if saw_block {
3117                            let mut content = String::new();
3118                            let link_ctx = Context {
3119                                inline_depth: ctx.inline_depth + 1,
3120                                convert_as_inline: true,
3121                                ..ctx.clone()
3122                            };
3123                            for child_handle in children.iter() {
3124                                let mut child_buf = String::new();
3125                                walk_node(
3126                                    child_handle,
3127                                    parser,
3128                                    &mut child_buf,
3129                                    options,
3130                                    &link_ctx,
3131                                    depth + 1,
3132                                    dom_ctx,
3133                                );
3134                                if !child_buf.trim().is_empty()
3135                                    && !content.is_empty()
3136                                    && !content.chars().last().map(|c| c.is_whitespace()).unwrap_or(true)
3137                                    && !child_buf.chars().next().map(|c| c.is_whitespace()).unwrap_or(true)
3138                                {
3139                                    content.push(' ');
3140                                }
3141                                content.push_str(&child_buf);
3142                            }
3143                            if content.trim().is_empty() {
3144                                normalize_link_label(&inline_label)
3145                            } else {
3146                                normalize_link_label(&content)
3147                            }
3148                        } else {
3149                            let mut content = String::new();
3150                            let link_ctx = Context {
3151                                inline_depth: ctx.inline_depth + 1,
3152                                ..ctx.clone()
3153                            };
3154                            for child_handle in children.iter() {
3155                                walk_node(
3156                                    child_handle,
3157                                    parser,
3158                                    &mut content,
3159                                    options,
3160                                    &link_ctx,
3161                                    depth + 1,
3162                                    dom_ctx,
3163                                );
3164                            }
3165                            normalize_link_label(&content)
3166                        };
3167
3168                        if label.is_empty() && saw_block {
3169                            let fallback = text::normalize_whitespace(&get_text_content(node_handle, parser));
3170                            label = normalize_link_label(&fallback);
3171                        }
3172
3173                        if label.is_empty() && !raw_text.is_empty() {
3174                            label = normalize_link_label(&raw_text);
3175                        }
3176
3177                        if label.is_empty() && !href.is_empty() && !children.is_empty() {
3178                            label = href.clone();
3179                        }
3180
3181                        if label.len() > MAX_LINK_LABEL_LEN {
3182                            truncate_at_char_boundary(&mut label, MAX_LINK_LABEL_LEN);
3183                            label.push('…');
3184                        }
3185
3186                        let escaped_label = escape_link_label(&label);
3187                        append_markdown_link(
3188                            output,
3189                            &escaped_label,
3190                            href.as_str(),
3191                            title.as_deref(),
3192                            label.as_str(),
3193                            options,
3194                        );
3195
3196                        // NEW: Collect link metadata
3197                        #[cfg(feature = "metadata")]
3198                        if let Some(ref collector) = ctx.metadata_collector {
3199                            let rel_attr = tag
3200                                .attributes()
3201                                .get("rel")
3202                                .flatten()
3203                                .map(|v| v.as_utf8_str().to_string());
3204                            let mut attributes_map = BTreeMap::new();
3205                            for (key, value_opt) in tag.attributes().iter() {
3206                                let key_str = key.to_string();
3207                                if key_str == "href" {
3208                                    continue;
3209                                }
3210
3211                                let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3212                                attributes_map.insert(key_str, value);
3213                            }
3214                            collector.borrow_mut().add_link(
3215                                href.clone(),
3216                                label.clone(),
3217                                title.clone(),
3218                                rel_attr,
3219                                attributes_map,
3220                            );
3221                        }
3222                    } else {
3223                        let children = tag.children();
3224                        {
3225                            for child_handle in children.top().iter() {
3226                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3227                            }
3228                        }
3229                    }
3230                }
3231
3232                "img" => {
3233                    use std::borrow::Cow;
3234
3235                    let src = tag
3236                        .attributes()
3237                        .get("src")
3238                        .flatten()
3239                        .map(|v| v.as_utf8_str())
3240                        .unwrap_or(Cow::Borrowed(""));
3241
3242                    let alt = tag
3243                        .attributes()
3244                        .get("alt")
3245                        .flatten()
3246                        .map(|v| v.as_utf8_str())
3247                        .unwrap_or(Cow::Borrowed(""));
3248
3249                    let title = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str());
3250                    #[cfg(feature = "metadata")]
3251                    let mut attributes_map = BTreeMap::new();
3252                    #[cfg(feature = "metadata")]
3253                    let mut width: Option<u32> = None;
3254                    #[cfg(feature = "metadata")]
3255                    let mut height: Option<u32> = None;
3256                    #[cfg(feature = "metadata")]
3257                    for (key, value_opt) in tag.attributes().iter() {
3258                        let key_str = key.to_string();
3259                        if key_str == "src" {
3260                            continue;
3261                        }
3262                        let value = value_opt.map(|v| v.to_string()).unwrap_or_default();
3263                        if key_str == "width" {
3264                            if let Ok(parsed) = value.parse::<u32>() {
3265                                width = Some(parsed);
3266                            }
3267                        } else if key_str == "height" {
3268                            if let Ok(parsed) = value.parse::<u32>() {
3269                                height = Some(parsed);
3270                            }
3271                        }
3272                        attributes_map.insert(key_str, value);
3273                    }
3274
3275                    #[cfg(feature = "inline-images")]
3276                    if let Some(ref collector_ref) = ctx.inline_collector {
3277                        let mut attributes_map = BTreeMap::new();
3278                        for (key, value_opt) in tag.attributes().iter() {
3279                            let key_str = key.to_string();
3280                            let keep = key_str == "width"
3281                                || key_str == "height"
3282                                || key_str == "filename"
3283                                || key_str == "aria-label"
3284                                || key_str.starts_with("data-");
3285                            if keep {
3286                                let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
3287                                attributes_map.insert(key_str, value);
3288                            }
3289                        }
3290                        handle_inline_data_image(
3291                            collector_ref,
3292                            src.as_ref(),
3293                            alt.as_ref(),
3294                            title.as_deref(),
3295                            attributes_map,
3296                        );
3297                    }
3298
3299                    let keep_as_markdown = ctx.in_heading
3300                        && ctx
3301                            .heading_tag
3302                            .as_ref()
3303                            .is_some_and(|tag| options.keep_inline_images_in.iter().any(|t| t == tag));
3304
3305                    let should_use_alt_text = !keep_as_markdown
3306                        && (ctx.convert_as_inline
3307                            || (ctx.in_heading
3308                                && ctx
3309                                    .heading_tag
3310                                    .as_ref()
3311                                    .is_none_or(|tag| !options.keep_inline_images_in.iter().any(|t| t == tag))));
3312
3313                    if should_use_alt_text {
3314                        output.push_str(&alt);
3315                    } else {
3316                        output.push_str("![");
3317                        output.push_str(&alt);
3318                        output.push_str("](");
3319                        output.push_str(&src);
3320                        if let Some(ref title_text) = title {
3321                            output.push_str(" \"");
3322                            output.push_str(title_text);
3323                            output.push('"');
3324                        }
3325                        output.push(')');
3326                    }
3327
3328                    // Record this image with the metadata collector, if one is attached; images with an empty src are skipped
3329                    #[cfg(feature = "metadata")]
3330                    if let Some(ref collector) = ctx.metadata_collector {
3331                        if !src.is_empty() {
3332                            let dimensions = match (width, height) {
3333                                (Some(w), Some(h)) => Some((w, h)),
3334                                _ => None,
3335                            };
3336                            collector.borrow_mut().add_image(
3337                                src.to_string(),
3338                                if alt.is_empty() { None } else { Some(alt.to_string()) },
3339                                title.as_deref().map(|t| t.to_string()),
3340                                dimensions,
3341                                attributes_map.clone(),
3342                            );
3343                        }
3344                    }
3345                }
3346
3347                "mark" => {
3348                    if ctx.convert_as_inline {
3349                        let children = tag.children();
3350                        {
3351                            for child_handle in children.top().iter() {
3352                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3353                            }
3354                        }
3355                    } else {
3356                        use crate::options::HighlightStyle;
3357                        match options.highlight_style {
3358                            HighlightStyle::DoubleEqual => {
3359                                output.push_str("==");
3360                                let children = tag.children();
3361                                {
3362                                    for child_handle in children.top().iter() {
3363                                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3364                                    }
3365                                }
3366                                output.push_str("==");
3367                            }
3368                            HighlightStyle::Html => {
3369                                output.push_str("<mark>");
3370                                let children = tag.children();
3371                                {
3372                                    for child_handle in children.top().iter() {
3373                                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3374                                    }
3375                                }
3376                                output.push_str("</mark>");
3377                            }
3378                            HighlightStyle::Bold => {
3379                                let symbol = options.strong_em_symbol.to_string().repeat(2);
3380                                output.push_str(&symbol);
3381                                let bold_ctx = Context {
3382                                    in_strong: true,
3383                                    ..ctx.clone()
3384                                };
3385                                let children = tag.children();
3386                                {
3387                                    for child_handle in children.top().iter() {
3388                                        walk_node(child_handle, parser, output, options, &bold_ctx, depth + 1, dom_ctx);
3389                                    }
3390                                }
3391                                output.push_str(&symbol);
3392                            }
3393                            HighlightStyle::None => {
3394                                let children = tag.children();
3395                                {
3396                                    for child_handle in children.top().iter() {
3397                                        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3398                                    }
3399                                }
3400                            }
3401                        }
3402                    }
3403                }
3404
3405                "del" | "s" => {
3406                    if ctx.in_code {
3407                        let children = tag.children();
3408                        {
3409                            for child_handle in children.top().iter() {
3410                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3411                            }
3412                        }
3413                    } else {
3414                        let mut content = String::with_capacity(32);
3415                        let children = tag.children();
3416                        {
3417                            for child_handle in children.top().iter() {
3418                                walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3419                            }
3420                        }
3421                        let (prefix, suffix, trimmed) = chomp_inline(&content);
3422                        if !content.trim().is_empty() {
3423                            output.push_str(prefix);
3424                            output.push_str("~~");
3425                            output.push_str(trimmed);
3426                            output.push_str("~~");
3427                            append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3428                        } else if !content.is_empty() {
3429                            output.push_str(prefix);
3430                            append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3431                        }
3432                    }
3433                }
3434
3435                "ins" => {
3436                    let mut content = String::with_capacity(32);
3437                    let children = tag.children();
3438                    {
3439                        for child_handle in children.top().iter() {
3440                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3441                        }
3442                    }
3443                    let (prefix, suffix, trimmed) = chomp_inline(&content);
3444                    if !trimmed.is_empty() {
3445                        output.push_str(prefix);
3446                        output.push_str("==");
3447                        output.push_str(trimmed);
3448                        output.push_str("==");
3449                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3450                    }
3451                }
3452
3453                "u" | "small" => {
3454                    let children = tag.children();
3455                    {
3456                        for child_handle in children.top().iter() {
3457                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3458                        }
3459                    }
3460                }
3461
3462                "sub" => {
3463                    if !ctx.in_code && !options.sub_symbol.is_empty() {
3464                        output.push_str(&options.sub_symbol);
3465                    }
3466                    let children = tag.children();
3467                    {
3468                        for child_handle in children.top().iter() {
3469                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3470                        }
3471                    }
3472                    if !ctx.in_code && !options.sub_symbol.is_empty() {
3473                        if options.sub_symbol.starts_with('<') && !options.sub_symbol.starts_with("</") {
3474                            output.push_str(&options.sub_symbol.replace('<', "</"));
3475                        } else {
3476                            output.push_str(&options.sub_symbol);
3477                        }
3478                    }
3479                }
3480
3481                "sup" => {
3482                    if !ctx.in_code && !options.sup_symbol.is_empty() {
3483                        output.push_str(&options.sup_symbol);
3484                    }
3485                    let children = tag.children();
3486                    {
3487                        for child_handle in children.top().iter() {
3488                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3489                        }
3490                    }
3491                    if !ctx.in_code && !options.sup_symbol.is_empty() {
3492                        if options.sup_symbol.starts_with('<') && !options.sup_symbol.starts_with("</") {
3493                            output.push_str(&options.sup_symbol.replace('<', "</"));
3494                        } else {
3495                            output.push_str(&options.sup_symbol);
3496                        }
3497                    }
3498                }
3499
3500                "kbd" | "samp" => {
3501                    let code_ctx = Context {
3502                        in_code: true,
3503                        ..ctx.clone()
3504                    };
3505                    let mut content = String::with_capacity(32);
3506                    let children = tag.children();
3507                    {
3508                        for child_handle in children.top().iter() {
3509                            walk_node(
3510                                child_handle,
3511                                parser,
3512                                &mut content,
3513                                options,
3514                                &code_ctx,
3515                                depth + 1,
3516                                dom_ctx,
3517                            );
3518                        }
3519                    }
3520                    let normalized = text::normalize_whitespace(&content);
3521                    let (prefix, suffix, trimmed) = chomp_inline(&normalized);
3522                    if !content.trim().is_empty() {
3523                        output.push_str(prefix);
3524                        output.push('`');
3525                        output.push_str(trimmed);
3526                        output.push('`');
3527                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3528                    } else if !content.is_empty() {
3529                        output.push_str(prefix);
3530                        append_inline_suffix(output, suffix, false, node_handle, parser, dom_ctx);
3531                    }
3532                }
3533
3534                "var" => {
3535                    let mut content = String::with_capacity(32);
3536                    let children = tag.children();
3537                    {
3538                        for child_handle in children.top().iter() {
3539                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3540                        }
3541                    }
3542                    let (prefix, suffix, trimmed) = chomp_inline(&content);
3543                    if !trimmed.is_empty() {
3544                        output.push_str(prefix);
3545                        output.push(options.strong_em_symbol);
3546                        output.push_str(trimmed);
3547                        output.push(options.strong_em_symbol);
3548                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3549                    }
3550                }
3551
3552                "dfn" => {
3553                    let mut content = String::with_capacity(32);
3554                    let children = tag.children();
3555                    {
3556                        for child_handle in children.top().iter() {
3557                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3558                        }
3559                    }
3560                    let (prefix, suffix, trimmed) = chomp_inline(&content);
3561                    if !trimmed.is_empty() {
3562                        output.push_str(prefix);
3563                        output.push(options.strong_em_symbol);
3564                        output.push_str(trimmed);
3565                        output.push(options.strong_em_symbol);
3566                        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
3567                    }
3568                }
3569
3570                "abbr" => {
3571                    let mut content = String::with_capacity(32);
3572                    let children = tag.children();
3573                    {
3574                        for child_handle in children.top().iter() {
3575                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
3576                        }
3577                    }
3578                    let trimmed = content.trim();
3579
3580                    if !trimmed.is_empty() {
3581                        output.push_str(trimmed);
3582
3583                        if let Some(title) = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str()) {
3584                            let trimmed_title = title.trim();
3585                            if !trimmed_title.is_empty() {
3586                                output.push_str(" (");
3587                                output.push_str(trimmed_title);
3588                                output.push(')');
3589                            }
3590                        }
3591                    }
3592                }
3593
3594                "time" | "data" => {
3595                    let children = tag.children();
3596                    {
3597                        for child_handle in children.top().iter() {
3598                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3599                        }
3600                    }
3601                }
3602
3603                "wbr" => {}
3604
3605                "code" => {
3606                    let code_ctx = Context {
3607                        in_code: true,
3608                        ..ctx.clone()
3609                    };
3610
3611                    if !ctx.in_code {
3612                        let mut content = String::with_capacity(32);
3613                        let children = tag.children();
3614                        {
3615                            for child_handle in children.top().iter() {
3616                                walk_node(
3617                                    child_handle,
3618                                    parser,
3619                                    &mut content,
3620                                    options,
3621                                    &code_ctx,
3622                                    depth + 1,
3623                                    dom_ctx,
3624                                );
3625                            }
3626                        }
3627
3628                        let trimmed = &content;
3629
3630                        if !content.trim().is_empty() {
3631                            let contains_backtick = trimmed.contains('`');
3632
3633                            let needs_delimiter_spaces = {
3634                                let first_char = trimmed.chars().next();
3635                                let last_char = trimmed.chars().last();
3636                                let starts_with_space = first_char == Some(' ');
3637                                let ends_with_space = last_char == Some(' ');
3638                                let starts_with_backtick = first_char == Some('`');
3639                                let ends_with_backtick = last_char == Some('`');
3640                                let all_spaces = trimmed.chars().all(|c| c == ' ');
3641
3642                                all_spaces
3643                                    || starts_with_backtick
3644                                    || ends_with_backtick
3645                                    || (starts_with_space && ends_with_space && contains_backtick)
3646                            };
3647
3648                            let (num_backticks, needs_spaces) = if contains_backtick {
3649                                let max_consecutive = trimmed
3650                                    .chars()
3651                                    .fold((0, 0), |(max, current), c| {
3652                                        if c == '`' {
3653                                            let new_current = current + 1;
3654                                            (max.max(new_current), new_current)
3655                                        } else {
3656                                            (max, 0)
3657                                        }
3658                                    })
3659                                    .0;
3660                                let num = if max_consecutive == 1 { 2 } else { 1 };
3661                                (num, needs_delimiter_spaces)
3662                            } else {
3663                                (1, needs_delimiter_spaces)
3664                            };
3665
3666                            for _ in 0..num_backticks {
3667                                output.push('`');
3668                            }
3669                            if needs_spaces {
3670                                output.push(' ');
3671                            }
3672                            output.push_str(trimmed);
3673                            if needs_spaces {
3674                                output.push(' ');
3675                            }
3676                            for _ in 0..num_backticks {
3677                                output.push('`');
3678                            }
3679                        }
3680                    } else {
3681                        let children = tag.children();
3682                        {
3683                            for child_handle in children.top().iter() {
3684                                walk_node(child_handle, parser, output, options, &code_ctx, depth + 1, dom_ctx);
3685                            }
3686                        }
3687                    }
3688                }
3689
3690                "pre" => {
3691                    let code_ctx = Context {
3692                        in_code: true,
3693                        ..ctx.clone()
3694                    };
3695
3696                    let mut content = String::with_capacity(256);
3697                    let children = tag.children();
3698                    {
3699                        for child_handle in children.top().iter() {
3700                            walk_node(
3701                                child_handle,
3702                                parser,
3703                                &mut content,
3704                                options,
3705                                &code_ctx,
3706                                depth + 1,
3707                                dom_ctx,
3708                            );
3709                        }
3710                    }
3711
3712                    if !content.is_empty() {
3713                        let leading_newlines = content.chars().take_while(|&c| c == '\n').count();
3714                        let trailing_newlines = content.chars().rev().take_while(|&c| c == '\n').count();
3715                        let core = content.trim_matches('\n');
3716                        let is_whitespace_only = core.trim().is_empty();
3717
3718                        let processed_content = if options.whitespace_mode == crate::options::WhitespaceMode::Strict {
3719                            content
3720                        } else {
3721                            let mut core_text = if leading_newlines > 0 {
3722                                dedent_code_block(core)
3723                            } else {
3724                                core.to_string()
3725                            };
3726
3727                            if is_whitespace_only {
3728                                let mut rebuilt = String::new();
3729                                for _ in 0..leading_newlines {
3730                                    rebuilt.push('\n');
3731                                }
3732                                rebuilt.push_str(&core_text);
3733                                for _ in 0..trailing_newlines {
3734                                    rebuilt.push('\n');
3735                                }
3736                                rebuilt
3737                            } else {
3738                                for _ in 0..trailing_newlines {
3739                                    core_text.push('\n');
3740                                }
3741                                core_text
3742                            }
3743                        };
3744
3745                        match options.code_block_style {
3746                            crate::options::CodeBlockStyle::Indented => {
3747                                if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3748                                    if output.ends_with('\n') {
3749                                        output.push('\n');
3750                                    } else {
3751                                        output.push_str("\n\n");
3752                                    }
3753                                }
3754
3755                                let indented = processed_content
3756                                    .lines()
3757                                    .map(|line| {
3758                                        if line.is_empty() {
3759                                            String::new()
3760                                        } else {
3761                                            format!("    {}", line)
3762                                        }
3763                                    })
3764                                    .collect::<Vec<_>>()
3765                                    .join("\n");
3766                                output.push_str(&indented);
3767
3768                                output.push_str("\n\n");
3769                            }
3770                            crate::options::CodeBlockStyle::Backticks | crate::options::CodeBlockStyle::Tildes => {
3771                                if !ctx.convert_as_inline && !output.is_empty() && !output.ends_with("\n\n") {
3772                                    if output.ends_with('\n') {
3773                                        output.push('\n');
3774                                    } else {
3775                                        output.push_str("\n\n");
3776                                    }
3777                                }
3778
3779                                let fence = if options.code_block_style == crate::options::CodeBlockStyle::Backticks {
3780                                    "```"
3781                                } else {
3782                                    "~~~"
3783                                };
3784
3785                                output.push_str(fence);
3786                                if !options.code_language.is_empty() {
3787                                    output.push_str(&options.code_language);
3788                                }
3789                                output.push('\n');
3790                                output.push_str(&processed_content);
3791                                output.push('\n');
3792                                output.push_str(fence);
3793                                output.push('\n');
3794                            }
3795                        }
3796                    }
3797                }
3798
3799                "blockquote" => {
3800                    if ctx.convert_as_inline {
3801                        let children = tag.children();
3802                        {
3803                            for child_handle in children.top().iter() {
3804                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
3805                            }
3806                        }
3807                        return;
3808                    }
3809
3810                    let cite = tag
3811                        .attributes()
3812                        .get("cite")
3813                        .flatten()
3814                        .map(|v| v.as_utf8_str().to_string());
3815
3816                    let blockquote_ctx = Context {
3817                        blockquote_depth: ctx.blockquote_depth + 1,
3818                        ..ctx.clone()
3819                    };
3820                    let mut content = String::with_capacity(256);
3821                    let children = tag.children();
3822                    {
3823                        for child_handle in children.top().iter() {
3824                            walk_node(
3825                                child_handle,
3826                                parser,
3827                                &mut content,
3828                                options,
3829                                &blockquote_ctx,
3830                                depth + 1,
3831                                dom_ctx,
3832                            );
3833                        }
3834                    }
3835
3836                    let trimmed_content = content.trim();
3837
3838                    if !trimmed_content.is_empty() {
3839                        // Handle spacing before blockquote
3840                        if ctx.blockquote_depth > 0 {
3841                            // Nested blockquote
3842                            output.push_str("\n\n\n");
3843                        } else if !output.is_empty() {
3844                            // CommonMark: blockquote needs only single newline before it
3845                            if !output.ends_with('\n') {
3846                                output.push('\n');
3847                            } else if output.ends_with("\n\n") {
3848                                // Remove one trailing newline (paragraph already added \n\n)
3849                                output.truncate(output.len() - 1);
3850                            }
3851                        }
3852                        // If output.is_empty(), add nothing (no leading newline)
3853
3854                        let prefix = "> ";
3855
3856                        for line in trimmed_content.lines() {
3857                            output.push_str(prefix);
3858                            output.push_str(line.trim());
3859                            output.push('\n');
3860                        }
3861
3862                        // Append the cite attribution, if any; all trailing newlines are stripped just below
3863                        if let Some(url) = cite {
3864                            output.push('\n');
3865                            output.push_str("— <");
3866                            output.push_str(&url);
3867                            output.push_str(">\n\n");
3868                        }
3869
3870                        while output.ends_with('\n') {
3871                            output.truncate(output.len() - 1);
3872                        }
3873                    }
3874                }
3875
3876                "br" => {
3877                    if ctx.in_heading {
3878                        trim_trailing_whitespace(output);
3879                        output.push_str("  ");
3880                    } else {
3881                        use crate::options::NewlineStyle;
3882                        if output.is_empty() || output.ends_with('\n') {
3883                            output.push('\n');
3884                        } else {
3885                            match options.newline_style {
3886                                NewlineStyle::Spaces => output.push_str("  \n"),
3887                                NewlineStyle::Backslash => output.push_str("\\\n"),
3888                            }
3889                        }
3890                    }
3891                }
3892
3893                "hr" => {
3894                    // CommonMark: ensure a blank line before the hr so it is not interpreted as a setext heading underline
3895                    if !output.is_empty() {
3896                        let prev_tag = get_previous_sibling_tag(node_handle, parser, dom_ctx);
3897                        let last_line_is_blockquote = output
3898                            .rsplit('\n')
3899                            .find(|line| !line.trim().is_empty())
3900                            .map(|line| line.trim_start().starts_with('>'))
3901                            .unwrap_or(false);
3902                        let needs_blank_line = !ctx.in_paragraph
3903                            && !matches!(prev_tag.as_deref(), Some("blockquote"))
3904                            && !last_line_is_blockquote;
3905
3906                        if options.debug {
3907                            eprintln!(
3908                                "[DEBUG] <hr> prev_tag={:?} needs_blank_line={} in_paragraph={}",
3909                                prev_tag, needs_blank_line, ctx.in_paragraph
3910                            );
3911                        }
3912
3913                        if ctx.in_paragraph || !needs_blank_line {
3914                            if !output.ends_with('\n') {
3915                                output.push('\n');
3916                            }
3917                        } else {
3918                            trim_trailing_whitespace(output);
3919                            if output.ends_with('\n') {
3920                                if !output.ends_with("\n\n") {
3921                                    output.push('\n');
3922                                }
3923                            } else {
3924                                output.push_str("\n\n");
3925                            }
3926                        }
3927                    }
3928                    output.push_str("---\n");
3929                }
3930
3931                "ul" => {
3932                    add_list_leading_separator(output, ctx);
3933
3934                    let nested_depth = calculate_list_nesting_depth(ctx);
3935                    let is_loose = is_loose_list(node_handle, parser);
3936
3937                    process_list_children(
3938                        node_handle,
3939                        parser,
3940                        output,
3941                        options,
3942                        ctx,
3943                        depth,
3944                        false,
3945                        is_loose,
3946                        nested_depth,
3947                        1,
3948                        dom_ctx,
3949                    );
3950
3951                    add_nested_list_trailing_separator(output, ctx);
3952                }
3953
3954                "ol" => {
3955                    add_list_leading_separator(output, ctx);
3956
3957                    let nested_depth = calculate_list_nesting_depth(ctx);
3958                    let is_loose = is_loose_list(node_handle, parser);
3959
3960                    let start = tag
3961                        .attributes()
3962                        .get("start")
3963                        .flatten()
3964                        .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
3965                        .unwrap_or(1);
3966
3967                    process_list_children(
3968                        node_handle,
3969                        parser,
3970                        output,
3971                        options,
3972                        ctx,
3973                        depth,
3974                        true,
3975                        is_loose,
3976                        nested_depth,
3977                        start,
3978                        dom_ctx,
3979                    );
3980
3981                    add_nested_list_trailing_separator(output, ctx);
3982                }
3983
3984                "li" => {
3985                    if ctx.list_depth > 0 {
3986                        let indent = match options.list_indent_type {
3987                            ListIndentType::Tabs => "\t".repeat(ctx.list_depth),
3988                            ListIndentType::Spaces => " ".repeat(ctx.list_depth * options.list_indent_width),
3989                        };
3990                        output.push_str(&indent);
3991                    }
3992
3993                    let mut has_block_children = false;
3994                    let children = tag.children();
3995                    {
3996                        for child_handle in children.top().iter() {
3997                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
3998                                let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
3999                                if matches!(
4000                                    tag_name.as_ref(),
4001                                    "p" | "div" | "blockquote" | "pre" | "table" | "hr" | "dl"
4002                                ) {
4003                                    has_block_children = true;
4004                                    break;
4005                                }
4006                            }
4007                        }
4008                    }
4009
4010                    fn find_checkbox<'a>(
4011                        node_handle: &tl::NodeHandle,
4012                        parser: &'a tl::Parser<'a>,
4013                    ) -> Option<(bool, tl::NodeHandle)> {
4014                        if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4015                            if tag_name_eq(node_tag.name().as_utf8_str(), "input") {
4016                                let input_type = node_tag.attributes().get("type").flatten().map(|v| v.as_utf8_str());
4017
4018                                if input_type.as_deref() == Some("checkbox") {
4019                                    let checked = node_tag.attributes().get("checked").is_some();
4020                                    return Some((checked, *node_handle));
4021                                }
4022                            }
4023
4024                            let children = node_tag.children();
4025                            {
4026                                for child_handle in children.top().iter() {
4027                                    if let Some(result) = find_checkbox(child_handle, parser) {
4028                                        return Some(result);
4029                                    }
4030                                }
4031                            }
4032                        }
4033                        None
4034                    }
4035
4036                    let (is_task_list, task_checked, checkbox_node) =
4037                        if let Some((checked, node)) = find_checkbox(node_handle, parser) {
4038                            (true, checked, Some(node))
4039                        } else {
4040                            (false, false, None)
4041                        };
4042
4043                    let li_ctx = Context {
4044                        in_list_item: true,
4045                        list_depth: ctx.list_depth + 1,
4046                        ..ctx.clone()
4047                    };
4048
4049                    if is_task_list {
4050                        output.push('-');
4051                        output.push(' ');
4052                        output.push_str(if task_checked { "[x]" } else { "[ ]" });
4053
4054                        fn is_checkbox_node(node_handle: &tl::NodeHandle, checkbox: &Option<tl::NodeHandle>) -> bool {
4055                            if let Some(cb) = checkbox {
4056                                node_handle == cb
4057                            } else {
4058                                false
4059                            }
4060                        }
4061
4062                        fn contains_checkbox<'a>(
4063                            node_handle: &tl::NodeHandle,
4064                            parser: &'a tl::Parser<'a>,
4065                            checkbox: &Option<tl::NodeHandle>,
4066                        ) -> bool {
4067                            if is_checkbox_node(node_handle, checkbox) {
4068                                return true;
4069                            }
4070                            if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4071                                let children = node_tag.children();
4072                                {
4073                                    for child_handle in children.top().iter() {
4074                                        if contains_checkbox(child_handle, parser, checkbox) {
4075                                            return true;
4076                                        }
4077                                    }
4078                                }
4079                            }
4080                            false
4081                        }
4082
4083                        #[allow(clippy::too_many_arguments)]
4084                        fn render_li_content<'a>(
4085                            node_handle: &tl::NodeHandle,
4086                            parser: &'a tl::Parser<'a>,
4087                            output: &mut String,
4088                            options: &ConversionOptions,
4089                            ctx: &Context,
4090                            depth: usize,
4091                            checkbox: &Option<tl::NodeHandle>,
4092                            dom_ctx: &DomContext,
4093                        ) {
4094                            if is_checkbox_node(node_handle, checkbox) {
4095                                return;
4096                            }
4097
4098                            if contains_checkbox(node_handle, parser, checkbox) {
4099                                if let Some(tl::Node::Tag(node_tag)) = node_handle.get(parser) {
4100                                    let children = node_tag.children();
4101                                    {
4102                                        for child_handle in children.top().iter() {
4103                                            render_li_content(
4104                                                child_handle,
4105                                                parser,
4106                                                output,
4107                                                options,
4108                                                ctx,
4109                                                depth,
4110                                                checkbox,
4111                                                dom_ctx,
4112                                            );
4113                                        }
4114                                    }
4115                                }
4116                            } else {
4117                                walk_node(node_handle, parser, output, options, ctx, depth, dom_ctx);
4118                            }
4119                        }
4120
4121                        let mut task_text = String::new();
4122                        let children = tag.children();
4123                        {
4124                            for child_handle in children.top().iter() {
4125                                render_li_content(
4126                                    child_handle,
4127                                    parser,
4128                                    &mut task_text,
4129                                    options,
4130                                    &li_ctx,
4131                                    depth + 1,
4132                                    &checkbox_node,
4133                                    dom_ctx,
4134                                );
4135                            }
4136                        }
4137                        output.push(' ');
4138                        let trimmed_task = task_text.trim();
4139                        if !trimmed_task.is_empty() {
4140                            output.push_str(trimmed_task);
4141                        }
4142                    } else {
4143                        if !ctx.in_table_cell {
4144                            if ctx.in_ordered_list {
4145                                output.push_str(&format!("{}. ", ctx.list_counter));
4146                            } else {
4147                                let bullets: Vec<char> = options.bullets.chars().collect();
4148                                let bullet_index = if ctx.ul_depth > 0 { ctx.ul_depth - 1 } else { 0 };
4149                                let bullet = bullets.get(bullet_index % bullets.len()).copied().unwrap_or('*');
4150                                output.push(bullet);
4151                                output.push(' ');
4152                            }
4153                        }
4154
4155                        let children = tag.children();
4156                        {
4157                            for child_handle in children.top().iter() {
4158                                walk_node(child_handle, parser, output, options, &li_ctx, depth + 1, dom_ctx);
4159                            }
4160                        }
4161
4162                        trim_trailing_whitespace(output);
4163                    }
4164
4165                    if !ctx.in_table_cell {
4166                        if has_block_children || ctx.loose_list || ctx.prev_item_had_blocks {
4167                            if !output.ends_with("\n\n") {
4168                                if output.ends_with('\n') {
4169                                    output.push('\n');
4170                                } else {
4171                                    output.push_str("\n\n");
4172                                }
4173                            }
4174                        } else if !output.ends_with('\n') {
4175                            output.push('\n');
4176                        }
4177                    }
4178                }
4179
4180                "table" => {
4181                    let mut table_output = String::new();
4182                    convert_table(node_handle, parser, &mut table_output, options, ctx, dom_ctx);
4183
4184                    if ctx.in_list_item {
4185                        let has_caption = table_output.starts_with('*');
4186
4187                        if !has_caption {
4188                            trim_trailing_whitespace(output);
4189                            if !output.is_empty() && !output.ends_with('\n') {
4190                                output.push('\n');
4191                            }
4192                        }
4193
4194                        let indented = indent_table_for_list(&table_output, ctx.list_depth, options);
4195                        output.push_str(&indented);
4196                    } else {
4197                        if !output.ends_with("\n\n") {
4198                            if output.is_empty() || !output.ends_with('\n') {
4199                                output.push_str("\n\n");
4200                            } else {
4201                                output.push('\n');
4202                            }
4203                        }
4204                        output.push_str(&table_output);
4205                    }
4206
4207                    if !output.ends_with('\n') {
4208                        output.push('\n');
4209                    }
4210                }
4211
4212                "thead" | "tbody" | "tfoot" | "tr" | "th" | "td" => {}
4213
4214                "caption" => {
4215                    let mut text = String::new();
4216                    let children = tag.children();
4217                    {
4218                        for child_handle in children.top().iter() {
4219                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4220                        }
4221                    }
4222                    let text = text.trim();
4223                    if !text.is_empty() {
4224                        // Escape dashes in captions to avoid confusion with table separators
4225                        let escaped_text = text.replace('-', r"\-");
4226                        output.push('*');
4227                        output.push_str(&escaped_text);
4228                        output.push_str("*\n\n");
4229                    }
4230                }
4231
4232                "colgroup" | "col" => {}
4233
4234                "article" | "section" | "nav" | "aside" | "header" | "footer" | "main" => {
4235                    if ctx.convert_as_inline {
4236                        let children = tag.children();
4237                        {
4238                            for child_handle in children.top().iter() {
4239                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4240                            }
4241                        }
4242                        return;
4243                    }
4244
4245                    let mut content = String::with_capacity(256);
4246                    let children = tag.children();
4247                    {
4248                        for child_handle in children.top().iter() {
4249                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4250                        }
4251                    }
4252                    if content.trim().is_empty() {
4253                        return;
4254                    }
4255
4256                    if !output.is_empty() && !output.ends_with("\n\n") {
4257                        output.push_str("\n\n");
4258                    }
4259                    output.push_str(&content);
4260                    if content.ends_with('\n') && !content.ends_with("\n\n") {
4261                        output.push('\n');
4262                    } else if !content.ends_with('\n') {
4263                        output.push_str("\n\n");
4264                    }
4265                }
4266
4267                "figure" => {
4268                    if ctx.convert_as_inline {
4269                        let children = tag.children();
4270                        {
4271                            for child_handle in children.top().iter() {
4272                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4273                            }
4274                        }
4275                        return;
4276                    }
4277
4278                    if !output.is_empty() && !output.ends_with("\n\n") {
4279                        output.push_str("\n\n");
4280                    }
4281
4282                    let mut figure_content = String::new();
4283                    let children = tag.children();
4284                    {
4285                        for child_handle in children.top().iter() {
4286                            walk_node(child_handle, parser, &mut figure_content, options, ctx, depth, dom_ctx);
4287                        }
4288                    }
4289
4290                    figure_content = figure_content.replace("\n![", "![");
4291                    figure_content = figure_content.replace(" ![", "![");
4292
4293                    let trimmed = figure_content.trim_matches(|c| c == '\n' || c == ' ' || c == '\t');
4294                    if !trimmed.is_empty() {
4295                        output.push_str(trimmed);
4296                        if !output.ends_with('\n') {
4297                            output.push('\n');
4298                        }
4299                        if !output.ends_with("\n\n") {
4300                            output.push('\n');
4301                        }
4302                    }
4303                }
4304
4305                "figcaption" => {
4306                    let mut text = String::new();
4307                    let children = tag.children();
4308                    {
4309                        for child_handle in children.top().iter() {
4310                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
4311                        }
4312                    }
4313                    let text = text.trim();
4314                    if !text.is_empty() {
4315                        if !output.is_empty() {
4316                            if output.ends_with("```\n") {
4317                                output.push('\n');
4318                            } else {
4319                                trim_trailing_whitespace(output);
4320                                if output.ends_with('\n') && !output.ends_with("\n\n") {
4321                                    output.push('\n');
4322                                } else if !output.ends_with('\n') {
4323                                    output.push_str("\n\n");
4324                                }
4325                            }
4326                        }
4327                        output.push('*');
4328                        output.push_str(text);
4329                        output.push_str("*\n\n");
4330                    }
4331                }
4332
4333                "hgroup" => {
4334                    let children = tag.children();
4335                    {
4336                        for child_handle in children.top().iter() {
4337                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4338                        }
4339                    }
4340                }
4341
4342                "cite" => {
4343                    let mut content = String::with_capacity(32);
4344                    let children = tag.children();
4345                    {
4346                        for child_handle in children.top().iter() {
4347                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4348                        }
4349                    }
4350                    let trimmed = content.trim();
4351                    if !trimmed.is_empty() {
4352                        if ctx.convert_as_inline {
4353                            output.push_str(trimmed);
4354                        } else {
4355                            output.push('*');
4356                            output.push_str(trimmed);
4357                            output.push('*');
4358                        }
4359                    }
4360                }
4361
4362                "q" => {
4363                    let mut content = String::with_capacity(32);
4364                    let children = tag.children();
4365                    {
4366                        for child_handle in children.top().iter() {
4367                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4368                        }
4369                    }
4370                    let trimmed = content.trim();
4371                    if !trimmed.is_empty() {
4372                        if ctx.convert_as_inline {
4373                            output.push_str(trimmed);
4374                        } else {
4375                            output.push('"');
4376                            // Escape backslashes first, then quotes
4377                            let escaped = trimmed.replace('\\', r"\\").replace('"', r#"\""#);
4378                            output.push_str(&escaped);
4379                            output.push('"');
4380                        }
4381                    }
4382                }
4383
4384                "dl" => {
4385                    if ctx.convert_as_inline {
4386                        let children = tag.children();
4387                        {
4388                            for child_handle in children.top().iter() {
4389                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4390                            }
4391                        }
4392                        return;
4393                    }
4394
4395                    let mut content = String::new();
4396                    let mut in_dt_group = false;
4397                    let children = tag.children();
4398                    {
4399                        for child_handle in children.top().iter() {
4400                            let (is_dt, is_dd) = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4401                                let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
4402                                (tag_name == "dt", tag_name == "dd")
4403                            } else {
4404                                (false, false)
4405                            };
4406
4407                            let child_ctx = Context {
4408                                last_was_dt: in_dt_group && is_dd,
4409                                ..ctx.clone()
4410                            };
4411                            walk_node(child_handle, parser, &mut content, options, &child_ctx, depth, dom_ctx);
4412
4413                            if is_dt {
4414                                in_dt_group = true;
4415                            } else if !is_dd {
4416                                in_dt_group = false;
4417                            }
4418                        }
4419                    }
4420
4421                    let trimmed = content.trim();
4422                    if !trimmed.is_empty() {
4423                        if !output.is_empty() && !output.ends_with("\n\n") {
4424                            output.push_str("\n\n");
4425                        }
4426                        output.push_str(trimmed);
4427                        output.push_str("\n\n");
4428                    }
4429                }
4430
4431                "dt" => {
4432                    let mut content = String::with_capacity(64);
4433                    let children = tag.children();
4434                    {
4435                        for child_handle in children.top().iter() {
4436                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4437                        }
4438                    }
4439                    let trimmed = content.trim();
4440                    if !trimmed.is_empty() {
4441                        if ctx.convert_as_inline {
4442                            output.push_str(trimmed);
4443                        } else {
4444                            output.push_str(trimmed);
4445                            output.push('\n');
4446                        }
4447                    }
4448                }
4449
4450                "dd" => {
4451                    let mut content = String::with_capacity(128);
4452                    let children = tag.children();
4453                    {
4454                        for child_handle in children.top().iter() {
4455                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4456                        }
4457                    }
4458
4459                    let trimmed = content.trim();
4460
4461                    if ctx.convert_as_inline {
4462                        if !trimmed.is_empty() {
4463                            output.push_str(trimmed);
4464                        }
4465                    } else if ctx.last_was_dt {
4466                        if !trimmed.is_empty() {
4467                            output.push_str(":   ");
4468                            output.push_str(trimmed);
4469                            output.push_str("\n\n");
4470                        } else {
4471                            output.push_str(":   \n\n");
4472                        }
4473                    } else if !trimmed.is_empty() {
4474                        output.push_str(trimmed);
4475                        output.push_str("\n\n");
4476                    }
4477                }
4478
4479                "details" => {
4480                    if ctx.convert_as_inline {
4481                        let children = tag.children();
4482                        {
4483                            for child_handle in children.top().iter() {
4484                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4485                            }
4486                        }
4487                        return;
4488                    }
4489
4490                    let mut content = String::with_capacity(256);
4491                    let children = tag.children();
4492                    {
4493                        for child_handle in children.top().iter() {
4494                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4495                        }
4496                    }
4497                    let trimmed = content.trim();
4498                    if !trimmed.is_empty() {
4499                        if !output.is_empty() && !output.ends_with("\n\n") {
4500                            output.push_str("\n\n");
4501                        }
4502                        output.push_str(trimmed);
4503                        output.push_str("\n\n");
4504                    }
4505                }
4506
4507                "summary" => {
4508                    let mut content = String::with_capacity(64);
4509                    let mut summary_ctx = ctx.clone();
4510                    if !ctx.convert_as_inline {
4511                        summary_ctx.in_strong = true;
4512                    }
4513                    let children = tag.children();
4514                    {
4515                        for child_handle in children.top().iter() {
4516                            walk_node(
4517                                child_handle,
4518                                parser,
4519                                &mut content,
4520                                options,
4521                                &summary_ctx,
4522                                depth + 1,
4523                                dom_ctx,
4524                            );
4525                        }
4526                    }
4527                    let trimmed = content.trim();
4528                    if !trimmed.is_empty() {
4529                        if ctx.convert_as_inline {
4530                            output.push_str(trimmed);
4531                        } else {
4532                            let symbol = options.strong_em_symbol.to_string().repeat(2);
4533                            output.push_str(&symbol);
4534                            output.push_str(trimmed);
4535                            output.push_str(&symbol);
4536                            output.push_str("\n\n");
4537                        }
4538                    }
4539                }
4540
4541                "dialog" => {
4542                    if ctx.convert_as_inline {
4543                        let children = tag.children();
4544                        {
4545                            for child_handle in children.top().iter() {
4546                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4547                            }
4548                        }
4549                        return;
4550                    }
4551
4552                    let content_start = output.len();
4553
4554                    let children = tag.children();
4555                    {
4556                        for child_handle in children.top().iter() {
4557                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4558                        }
4559                    }
4560
4561                    while output.len() > content_start && (output.ends_with(' ') || output.ends_with('\t')) {
4562                        output.pop();
4563                    }
4564
4565                    if output.len() > content_start && !output.ends_with("\n\n") {
4566                        output.push_str("\n\n");
4567                    }
4568                }
4569
4570                "menu" => {
4571                    let content_start = output.len();
4572
4573                    let menu_options = ConversionOptions {
4574                        bullets: "-".to_string(),
4575                        ..options.clone()
4576                    };
4577
4578                    let list_ctx = Context {
4579                        in_ordered_list: false,
4580                        list_counter: 0,
4581                        in_list: true,
4582                        list_depth: ctx.list_depth,
4583                        ..ctx.clone()
4584                    };
4585
4586                    let children = tag.children();
4587                    {
4588                        for child_handle in children.top().iter() {
4589                            walk_node(child_handle, parser, output, &menu_options, &list_ctx, depth, dom_ctx);
4590                        }
4591                    }
4592
4593                    if !ctx.convert_as_inline && output.len() > content_start {
4594                        if !output.ends_with("\n\n") {
4595                            if output.ends_with('\n') {
4596                                output.push('\n');
4597                            } else {
4598                                output.push_str("\n\n");
4599                            }
4600                        }
4601                    } else if ctx.convert_as_inline {
4602                        while output.ends_with('\n') {
4603                            output.pop();
4604                        }
4605                    }
4606                }
4607
4608                "audio" => {
4609                    use std::borrow::Cow;
4610
4611                    let src = tag
4612                        .attributes()
4613                        .get("src")
4614                        .flatten()
4615                        .map(|v| v.as_utf8_str())
4616                        .or_else(|| {
4617                            let children = tag.children();
4618                            {
4619                                for child_handle in children.top().iter() {
4620                                    if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4621                                        if tag_name_eq(child_tag.name().as_utf8_str(), "source") {
4622                                            return child_tag
4623                                                .attributes()
4624                                                .get("src")
4625                                                .flatten()
4626                                                .map(|v| v.as_utf8_str());
4627                                        }
4628                                    }
4629                                }
4630                            }
4631                            None
4632                        })
4633                        .unwrap_or(Cow::Borrowed(""));
4634
4635                    if !src.is_empty() {
4636                        output.push('[');
4637                        output.push_str(&src);
4638                        output.push_str("](");
4639                        output.push_str(&src);
4640                        output.push(')');
4641                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4642                            output.push_str("\n\n");
4643                        }
4644                    }
4645
4646                    let mut fallback = String::new();
4647                    let children = tag.children();
4648                    {
4649                        for child_handle in children.top().iter() {
4650                            let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4651                                tag_name_eq(child_tag.name().as_utf8_str(), "source")
4652                            } else {
4653                                false
4654                            };
4655
4656                            if !is_source {
4657                                walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4658                            }
4659                        }
4660                    }
4661                    if !fallback.is_empty() {
4662                        output.push_str(fallback.trim());
4663                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4664                            output.push_str("\n\n");
4665                        }
4666                    }
4667                }
4668
4669                "video" => {
4670                    use std::borrow::Cow;
4671
4672                    let src = tag
4673                        .attributes()
4674                        .get("src")
4675                        .flatten()
4676                        .map(|v| v.as_utf8_str())
4677                        .or_else(|| {
4678                            let children = tag.children();
4679                            {
4680                                for child_handle in children.top().iter() {
4681                                    if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4682                                        if tag_name_eq(child_tag.name().as_utf8_str(), "source") {
4683                                            return child_tag
4684                                                .attributes()
4685                                                .get("src")
4686                                                .flatten()
4687                                                .map(|v| v.as_utf8_str());
4688                                        }
4689                                    }
4690                                }
4691                            }
4692                            None
4693                        })
4694                        .unwrap_or(Cow::Borrowed(""));
4695
4696                    if !src.is_empty() {
4697                        output.push('[');
4698                        output.push_str(&src);
4699                        output.push_str("](");
4700                        output.push_str(&src);
4701                        output.push(')');
4702                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4703                            output.push_str("\n\n");
4704                        }
4705                    }
4706
4707                    let mut fallback = String::new();
4708                    let children = tag.children();
4709                    {
4710                        for child_handle in children.top().iter() {
4711                            let is_source = if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4712                                tag_name_eq(child_tag.name().as_utf8_str(), "source")
4713                            } else {
4714                                false
4715                            };
4716
4717                            if !is_source {
4718                                walk_node(child_handle, parser, &mut fallback, options, ctx, depth + 1, dom_ctx);
4719                            }
4720                        }
4721                    }
4722                    if !fallback.is_empty() {
4723                        output.push_str(fallback.trim());
4724                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4725                            output.push_str("\n\n");
4726                        }
4727                    }
4728                }
4729
4730                "source" => {}
4731
4732                "picture" => {
4733                    let children = tag.children();
4734                    {
4735                        for child_handle in children.top().iter() {
4736                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4737                                if tag_name_eq(child_tag.name().as_utf8_str(), "img") {
4738                                    walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4739                                    break;
4740                                }
4741                            }
4742                        }
4743                    }
4744                }
4745
4746                "iframe" => {
4747                    use std::borrow::Cow;
4748
4749                    let src = tag
4750                        .attributes()
4751                        .get("src")
4752                        .flatten()
4753                        .map(|v| v.as_utf8_str())
4754                        .unwrap_or(Cow::Borrowed(""));
4755
4756                    if !src.is_empty() {
4757                        output.push('[');
4758                        output.push_str(&src);
4759                        output.push_str("](");
4760                        output.push_str(&src);
4761                        output.push(')');
4762                        if !ctx.in_paragraph && !ctx.convert_as_inline {
4763                            output.push_str("\n\n");
4764                        }
4765                    }
4766                }
4767
4768                "svg" => {
4769                    let mut title = String::from("SVG Image");
4770                    let children = tag.children();
4771                    {
4772                        for child_handle in children.top().iter() {
4773                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
4774                                if tag_name_eq(child_tag.name().as_utf8_str(), "title") {
4775                                    title = get_text_content(child_handle, parser).trim().to_string();
4776                                    break;
4777                                }
4778                            }
4779                        }
4780                    }
4781
4782                    #[cfg(feature = "inline-images")]
4783                    if let Some(ref collector_ref) = ctx.inline_collector {
4784                        let title_opt = if title == "SVG Image" {
4785                            None
4786                        } else {
4787                            Some(title.clone())
4788                        };
4789                        let mut attributes_map = BTreeMap::new();
4790                        for (key, value_opt) in tag.attributes().iter() {
4791                            let key_str = key.to_string();
4792                            let keep = key_str == "width"
4793                                || key_str == "height"
4794                                || key_str == "filename"
4795                                || key_str == "aria-label"
4796                                || key_str.starts_with("data-");
4797                            if keep {
4798                                let value = value_opt.map(|value| value.to_string()).unwrap_or_default();
4799                                attributes_map.insert(key_str, value);
4800                            }
4801                        }
4802                        handle_inline_svg(collector_ref, node_handle, parser, title_opt, attributes_map);
4803                    }
4804
4805                    if ctx.convert_as_inline {
4806                        output.push_str(&title);
4807                    } else {
4808                        use base64::{Engine as _, engine::general_purpose::STANDARD};
4809
4810                        let svg_html = serialize_element(node_handle, parser);
4811
4812                        let base64_svg = STANDARD.encode(svg_html.as_bytes());
4813
4814                        output.push_str("![");
4815                        output.push_str(&title);
4816                        output.push_str("](data:image/svg+xml;base64,");
4817                        output.push_str(&base64_svg);
4818                        output.push(')');
4819                    }
4820                }
4821
4822                "math" => {
4823                    let text_content = get_text_content(node_handle, parser).trim().to_string();
4824
4825                    if text_content.is_empty() {
4826                        return;
4827                    }
4828
4829                    let math_html = serialize_element(node_handle, parser);
4830
4831                    let escaped_text = text::escape(
4832                        &text_content,
4833                        options.escape_misc,
4834                        options.escape_asterisks,
4835                        options.escape_underscores,
4836                        options.escape_ascii,
4837                    );
4838
4839                    let is_display_block = tag
4840                        .attributes()
4841                        .get("display")
4842                        .flatten()
4843                        .map(|v| v.as_utf8_str() == "block")
4844                        .unwrap_or(false);
4845
4846                    if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4847                        output.push_str("\n\n");
4848                    }
4849
4850                    output.push_str("<!-- MathML: ");
4851                    output.push_str(&math_html);
4852                    output.push_str(" --> ");
4853                    output.push_str(&escaped_text);
4854
4855                    if is_display_block && !ctx.in_paragraph && !ctx.convert_as_inline {
4856                        output.push_str("\n\n");
4857                    }
4858                }
4859
4860                "form" => {
4861                    if ctx.convert_as_inline {
4862                        let children = tag.children();
4863                        {
4864                            for child_handle in children.top().iter() {
4865                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4866                            }
4867                        }
4868                        return;
4869                    }
4870
4871                    let mut content = String::new();
4872                    let children = tag.children();
4873                    {
4874                        for child_handle in children.top().iter() {
4875                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4876                        }
4877                    }
4878                    let trimmed = content.trim();
4879                    if !trimmed.is_empty() {
4880                        if !output.is_empty() && !output.ends_with("\n\n") {
4881                            output.push_str("\n\n");
4882                        }
4883                        output.push_str(trimmed);
4884                        output.push_str("\n\n");
4885                    }
4886                }
4887
4888                "fieldset" => {
4889                    if ctx.convert_as_inline {
4890                        let children = tag.children();
4891                        {
4892                            for child_handle in children.top().iter() {
4893                                walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
4894                            }
4895                        }
4896                        return;
4897                    }
4898                    let mut content = String::new();
4899                    let children = tag.children();
4900                    {
4901                        for child_handle in children.top().iter() {
4902                            walk_node(child_handle, parser, &mut content, options, ctx, depth, dom_ctx);
4903                        }
4904                    }
4905                    let trimmed = content.trim();
4906                    if !trimmed.is_empty() {
4907                        if !output.is_empty() && !output.ends_with("\n\n") {
4908                            output.push_str("\n\n");
4909                        }
4910                        output.push_str(trimmed);
4911                        output.push_str("\n\n");
4912                    }
4913                }
4914
4915                "legend" => {
4916                    let mut content = String::new();
4917                    let mut legend_ctx = ctx.clone();
4918                    if !ctx.convert_as_inline {
4919                        legend_ctx.in_strong = true;
4920                    }
4921                    let children = tag.children();
4922                    {
4923                        for child_handle in children.top().iter() {
4924                            walk_node(
4925                                child_handle,
4926                                parser,
4927                                &mut content,
4928                                options,
4929                                &legend_ctx,
4930                                depth + 1,
4931                                dom_ctx,
4932                            );
4933                        }
4934                    }
4935                    let trimmed = content.trim();
4936                    if !trimmed.is_empty() {
4937                        if ctx.convert_as_inline {
4938                            output.push_str(trimmed);
4939                        } else {
4940                            let symbol = options.strong_em_symbol.to_string().repeat(2);
4941                            output.push_str(&symbol);
4942                            output.push_str(trimmed);
4943                            output.push_str(&symbol);
4944                            output.push_str("\n\n");
4945                        }
4946                    }
4947                }
4948
4949                "label" => {
4950                    let mut content = String::new();
4951                    let children = tag.children();
4952                    {
4953                        for child_handle in children.top().iter() {
4954                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
4955                        }
4956                    }
4957                    let trimmed = content.trim();
4958                    if !trimmed.is_empty() {
4959                        output.push_str(trimmed);
4960                        if !ctx.convert_as_inline {
4961                            output.push_str("\n\n");
4962                        }
4963                    }
4964                }
4965
4966                "input" => {}
4967
4968                "textarea" => {
4969                    let start_len = output.len();
4970                    let children = tag.children();
4971                    {
4972                        for child_handle in children.top().iter() {
4973                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4974                        }
4975                    }
4976
4977                    if !ctx.convert_as_inline && output.len() > start_len {
4978                        output.push_str("\n\n");
4979                    }
4980                }
4981
4982                "select" => {
4983                    let start_len = output.len();
4984                    let children = tag.children();
4985                    {
4986                        for child_handle in children.top().iter() {
4987                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
4988                        }
4989                    }
4990
4991                    if !ctx.convert_as_inline && output.len() > start_len {
4992                        output.push('\n');
4993                    }
4994                }
4995
4996                "option" => {
4997                    let selected = tag.attributes().iter().any(|(name, _)| name.as_ref() == "selected");
4998
4999                    let mut text = String::new();
5000                    let children = tag.children();
5001                    {
5002                        for child_handle in children.top().iter() {
5003                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5004                        }
5005                    }
5006                    let trimmed = text.trim();
5007                    if !trimmed.is_empty() {
5008                        if selected && !ctx.convert_as_inline {
5009                            output.push_str("* ");
5010                        }
5011                        output.push_str(trimmed);
5012                        if !ctx.convert_as_inline {
5013                            output.push('\n');
5014                        }
5015                    }
5016                }
5017
5018                "optgroup" => {
5019                    use std::borrow::Cow;
5020
5021                    let label = tag
5022                        .attributes()
5023                        .get("label")
5024                        .flatten()
5025                        .map(|v| v.as_utf8_str())
5026                        .unwrap_or(Cow::Borrowed(""));
5027
5028                    if !label.is_empty() {
5029                        let symbol = options.strong_em_symbol.to_string().repeat(2);
5030                        output.push_str(&symbol);
5031                        output.push_str(&label);
5032                        output.push_str(&symbol);
5033                        output.push('\n');
5034                    }
5035
5036                    let children = tag.children();
5037                    {
5038                        for child_handle in children.top().iter() {
5039                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5040                        }
5041                    }
5042                }
5043
5044                "button" => {
5045                    let start_len = output.len();
5046                    let children = tag.children();
5047                    {
5048                        for child_handle in children.top().iter() {
5049                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5050                        }
5051                    }
5052
5053                    if !ctx.convert_as_inline && output.len() > start_len {
5054                        output.push_str("\n\n");
5055                    }
5056                }
5057
5058                "progress" => {
5059                    let start_len = output.len();
5060                    let children = tag.children();
5061                    {
5062                        for child_handle in children.top().iter() {
5063                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5064                        }
5065                    }
5066
5067                    if !ctx.convert_as_inline && output.len() > start_len {
5068                        output.push_str("\n\n");
5069                    }
5070                }
5071
5072                "meter" => {
5073                    let start_len = output.len();
5074                    let children = tag.children();
5075                    {
5076                        for child_handle in children.top().iter() {
5077                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5078                        }
5079                    }
5080
5081                    if !ctx.convert_as_inline && output.len() > start_len {
5082                        output.push_str("\n\n");
5083                    }
5084                }
5085
5086                "output" => {
5087                    let start_len = output.len();
5088                    let children = tag.children();
5089                    {
5090                        for child_handle in children.top().iter() {
5091                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5092                        }
5093                    }
5094
5095                    if !ctx.convert_as_inline && output.len() > start_len {
5096                        output.push_str("\n\n");
5097                    }
5098                }
5099
5100                "datalist" => {
5101                    let start_len = output.len();
5102                    let children = tag.children();
5103                    {
5104                        for child_handle in children.top().iter() {
5105                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5106                        }
5107                    }
5108
5109                    if !ctx.convert_as_inline && output.len() > start_len {
5110                        output.push('\n');
5111                    }
5112                }
5113
5114                "ruby" => {
5115                    let ruby_ctx = ctx.clone();
5116
5117                    let tag_sequence: Vec<String> = tag
5118                        .children()
5119                        .top()
5120                        .iter()
5121                        .filter_map(|child_handle| {
5122                            if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5123                                let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5124                                if matches!(tag_name.as_ref(), "rb" | "rt" | "rtc") {
5125                                    Some(tag_name.into_owned())
5126                                } else {
5127                                    None
5128                                }
5129                            } else {
5130                                None
5131                            }
5132                        })
5133                        .collect();
5134
5135                    let has_rtc = tag_sequence.iter().any(|tag| tag == "rtc");
5136
5137                    let is_interleaved = tag_sequence.windows(2).any(|w| w[0] == "rb" && w[1] == "rt");
5138
5139                    if is_interleaved && !has_rtc {
5140                        let mut current_base = String::new();
5141                        let children = tag.children();
5142                        {
5143                            for child_handle in children.top().iter() {
5144                                if let Some(node) = child_handle.get(parser) {
5145                                    match node {
5146                                        tl::Node::Tag(child_tag) => {
5147                                            let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5148                                            if tag_name == "rt" {
5149                                                let mut annotation = String::new();
5150                                                walk_node(
5151                                                    child_handle,
5152                                                    parser,
5153                                                    &mut annotation,
5154                                                    options,
5155                                                    &ruby_ctx,
5156                                                    depth,
5157                                                    dom_ctx,
5158                                                );
5159                                                if !current_base.is_empty() {
5160                                                    output.push_str(current_base.trim());
5161                                                    current_base.clear();
5162                                                }
5163                                                output.push_str(annotation.trim());
5164                                            } else if tag_name == "rb" {
5165                                                if !current_base.is_empty() {
5166                                                    output.push_str(current_base.trim());
5167                                                    current_base.clear();
5168                                                }
5169                                                walk_node(
5170                                                    child_handle,
5171                                                    parser,
5172                                                    &mut current_base,
5173                                                    options,
5174                                                    &ruby_ctx,
5175                                                    depth,
5176                                                    dom_ctx,
5177                                                );
5178                                            } else if tag_name != "rp" {
5179                                                walk_node(
5180                                                    child_handle,
5181                                                    parser,
5182                                                    &mut current_base,
5183                                                    options,
5184                                                    &ruby_ctx,
5185                                                    depth,
5186                                                    dom_ctx,
5187                                                );
5188                                            }
5189                                        }
5190                                        tl::Node::Raw(_) => {
5191                                            walk_node(
5192                                                child_handle,
5193                                                parser,
5194                                                &mut current_base,
5195                                                options,
5196                                                &ruby_ctx,
5197                                                depth,
5198                                                dom_ctx,
5199                                            );
5200                                        }
5201                                        _ => {}
5202                                    }
5203                                }
5204                            }
5205                        }
5206                        if !current_base.is_empty() {
5207                            output.push_str(current_base.trim());
5208                        }
5209                    } else {
5210                        let mut base_text = String::new();
5211                        let mut rt_annotations = Vec::new();
5212                        let mut rtc_content = String::new();
5213
5214                        let children = tag.children();
5215                        {
5216                            for child_handle in children.top().iter() {
5217                                if let Some(node) = child_handle.get(parser) {
5218                                    match node {
5219                                        tl::Node::Tag(child_tag) => {
5220                                            let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5221                                            if tag_name == "rt" {
5222                                                let mut annotation = String::new();
5223                                                walk_node(
5224                                                    child_handle,
5225                                                    parser,
5226                                                    &mut annotation,
5227                                                    options,
5228                                                    &ruby_ctx,
5229                                                    depth,
5230                                                    dom_ctx,
5231                                                );
5232                                                rt_annotations.push(annotation);
5233                                            } else if tag_name == "rtc" {
5234                                                walk_node(
5235                                                    child_handle,
5236                                                    parser,
5237                                                    &mut rtc_content,
5238                                                    options,
5239                                                    &ruby_ctx,
5240                                                    depth,
5241                                                    dom_ctx,
5242                                                );
5243                                            } else if tag_name != "rp" {
5244                                                walk_node(
5245                                                    child_handle,
5246                                                    parser,
5247                                                    &mut base_text,
5248                                                    options,
5249                                                    &ruby_ctx,
5250                                                    depth,
5251                                                    dom_ctx,
5252                                                );
5253                                            }
5254                                        }
5255                                        tl::Node::Raw(_) => {
5256                                            walk_node(
5257                                                child_handle,
5258                                                parser,
5259                                                &mut base_text,
5260                                                options,
5261                                                &ruby_ctx,
5262                                                depth,
5263                                                dom_ctx,
5264                                            );
5265                                        }
5266                                        _ => {}
5267                                    }
5268                                }
5269                            }
5270                        }
5271
5272                        let trimmed_base = base_text.trim();
5273
5274                        output.push_str(trimmed_base);
5275
5276                        if !rt_annotations.is_empty() {
5277                            let rt_text = rt_annotations.iter().map(|s| s.trim()).collect::<Vec<_>>().join("");
5278                            if !rt_text.is_empty() {
5279                                if has_rtc && !rtc_content.trim().is_empty() && rt_annotations.len() > 1 {
5280                                    output.push('(');
5281                                    output.push_str(&rt_text);
5282                                    output.push(')');
5283                                } else {
5284                                    output.push_str(&rt_text);
5285                                }
5286                            }
5287                        }
5288
5289                        if !rtc_content.trim().is_empty() {
5290                            output.push_str(rtc_content.trim());
5291                        }
5292                    }
5293                }
5294
5295                "rb" => {
5296                    let mut text = String::new();
5297                    let children = tag.children();
5298                    {
5299                        for child_handle in children.top().iter() {
5300                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5301                        }
5302                    }
5303                    output.push_str(text.trim());
5304                }
5305
5306                "rt" => {
5307                    let mut text = String::new();
5308                    let children = tag.children();
5309                    {
5310                        for child_handle in children.top().iter() {
5311                            walk_node(child_handle, parser, &mut text, options, ctx, depth + 1, dom_ctx);
5312                        }
5313                    }
5314                    let trimmed = text.trim();
5315
5316                    if output.ends_with('(') {
5317                        output.push_str(trimmed);
5318                    } else {
5319                        output.push('(');
5320                        output.push_str(trimmed);
5321                        output.push(')');
5322                    }
5323                }
5324
5325                "rp" => {
5326                    let mut content = String::new();
5327                    let children = tag.children();
5328                    {
5329                        for child_handle in children.top().iter() {
5330                            walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
5331                        }
5332                    }
5333                    let trimmed = content.trim();
5334                    if !trimmed.is_empty() {
5335                        output.push_str(trimmed);
5336                    }
5337                }
5338
5339                "rtc" => {
5340                    let children = tag.children();
5341                    {
5342                        for child_handle in children.top().iter() {
5343                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5344                        }
5345                    }
5346                }
5347
5348                "div" => {
5349                    if ctx.convert_as_inline {
5350                        let children = tag.children();
5351                        {
5352                            for child_handle in children.top().iter() {
5353                                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5354                            }
5355                        }
5356                        return;
5357                    }
5358
5359                    let content_start_pos = output.len();
5360
5361                    let is_table_continuation =
5362                        ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");
5363
5364                    let is_list_continuation = ctx.in_list_item
5365                        && !output.is_empty()
5366                        && !output.ends_with("* ")
5367                        && !output.ends_with("- ")
5368                        && !output.ends_with(". ");
5369
5370                    let needs_leading_sep = !ctx.in_table_cell
5371                        && !ctx.in_list_item
5372                        && !ctx.convert_as_inline
5373                        && !output.is_empty()
5374                        && !output.ends_with("\n\n");
5375
5376                    if is_table_continuation {
5377                        trim_trailing_whitespace(output);
5378                        output.push_str("<br>");
5379                    } else if is_list_continuation {
5380                        add_list_continuation_indent(output, ctx.list_depth, false, options);
5381                    } else if needs_leading_sep {
5382                        trim_trailing_whitespace(output);
5383                        output.push_str("\n\n");
5384                    }
5385
5386                    let children = tag.children();
5387                    {
5388                        for child_handle in children.top().iter() {
5389                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5390                        }
5391                    }
5392
5393                    let has_content = output.len() > content_start_pos;
5394
5395                    if has_content {
5396                        if content_start_pos == 0 && output.starts_with('\n') && !output.starts_with("\n\n") {
5397                            output.remove(0);
5398                        }
5399                        trim_trailing_whitespace(output);
5400
5401                        if ctx.in_table_cell {
5402                        } else if ctx.in_list_item {
5403                            if is_list_continuation {
5404                                if !output.ends_with('\n') {
5405                                    output.push('\n');
5406                                }
5407                            } else if !output.ends_with("\n\n") {
5408                                if output.ends_with('\n') {
5409                                    output.push('\n');
5410                                } else {
5411                                    output.push_str("\n\n");
5412                                }
5413                            }
5414                        } else if !ctx.in_list_item && !ctx.convert_as_inline {
5415                            if output.ends_with("\n\n") {
5416                            } else if output.ends_with('\n') {
5417                                output.push('\n');
5418                            } else {
5419                                output.push_str("\n\n");
5420                            }
5421                        }
5422                    }
5423                }
5424
5425                "head" => {
5426                    // Malformed pages sometimes place <body> or main content inside <head>.
5427                    // Only walk children if we detect non-head content to avoid rendering metadata.
5428                    let children = tag.children();
5429                    let has_body_like = children.top().iter().any(|child_handle| {
5430                        if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5431                            let child_name = normalized_tag_name(child_tag.name().as_utf8_str());
5432                            matches!(
5433                                child_name.as_ref(),
5434                                "body" | "main" | "article" | "section" | "div" | "p"
5435                            )
5436                        } else {
5437                            false
5438                        }
5439                    });
5440
5441                    if has_body_like {
5442                        for child_handle in children.top().iter() {
5443                            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
5444                        }
5445                    }
5446                }
5447
5448                "script" => {
5449                    // NEW: Extract JSON-LD structured data
5450                    #[cfg(feature = "metadata")]
5451                    if let Some(type_attr) = tag.attributes().get("type").flatten() {
5452                        if type_attr.as_utf8_str() == "application/ld+json" {
5453                            if let Some(ref collector) = ctx.metadata_collector {
5454                                let json = get_text_content(node_handle, parser);
5455                                collector.borrow_mut().add_json_ld(json);
5456                            }
5457                        }
5458                    }
5459                }
5460                "style" => {}
5461
5462                "span" => {
5463                    let is_hocr_word = tag.attributes().iter().any(|(name, value)| {
5464                        name.as_ref() == "class" && value.as_ref().is_some_and(|v| v.as_ref().contains("ocrx_word"))
5465                    });
5466
5467                    if is_hocr_word
5468                        && !output.is_empty()
5469                        && !output.ends_with(' ')
5470                        && !output.ends_with('\t')
5471                        && !output.ends_with('\n')
5472                    {
5473                        output.push(' ');
5474                    }
5475
5476                    if !ctx.in_code
5477                        && options.whitespace_mode == crate::options::WhitespaceMode::Normalized
5478                        && output.ends_with('\n')
5479                        && !output.ends_with("\n\n")
5480                    {
5481                        output.pop();
5482                    }
5483
5484                    let children = tag.children();
5485                    {
5486                        for child_handle in children.top().iter() {
5487                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5488                        }
5489                    }
5490                }
5491
5492                _ => {
5493                    let len_before = output.len();
5494                    let had_trailing_space = output.ends_with(' ');
5495
5496                    let children = tag.children();
5497                    {
5498                        for child_handle in children.top().iter() {
5499                            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
5500                        }
5501                    }
5502
5503                    let len_after = output.len();
5504                    if len_after > len_before {
5505                        // Child processing can pop a trailing byte before appending new content,
5506                        // so len_before might land inside a multi-byte char; clamp to a safe boundary.
5507                        let start_idx = if output.is_char_boundary(len_before) {
5508                            len_before
5509                        } else {
5510                            let capped = len_before.min(output.len());
5511                            output
5512                                .char_indices()
5513                                .map(|(idx, _)| idx)
5514                                .take_while(|idx| *idx <= capped)
5515                                .last()
5516                                .unwrap_or(capped)
5517                        };
5518
5519                        let added_content = output[start_idx..].to_string();
5520                        if options.debug {
5521                            eprintln!(
5522                                "[DEBUG] <{}> added {:?}, trim={:?}, had_trailing_space={}",
5523                                tag_name,
5524                                added_content,
5525                                added_content.trim(),
5526                                had_trailing_space
5527                            );
5528                        }
5529
5530                        // Don't truncate code blocks (indented or fenced)
5531                        let is_code_block = added_content.starts_with("    ")
5532                            || added_content.starts_with("```")
5533                            || added_content.starts_with("~~~");
5534
5535                        if options.debug && added_content.trim().is_empty() {
5536                            eprintln!(
5537                                "[DEBUG] Whitespace-only content, is_code_block={}, will_truncate={}",
5538                                is_code_block, !is_code_block
5539                            );
5540                        }
5541
5542                        if added_content.trim().is_empty() && !is_code_block {
5543                            output.truncate(start_idx);
5544                            if !had_trailing_space && added_content.contains(' ') {
5545                                output.push(' ');
5546                            }
5547                            if options.debug {
5548                                eprintln!(
5549                                    "[DEBUG] Truncated, output now ends with space: {}",
5550                                    output.ends_with(' ')
5551                                );
5552                            }
5553                        }
5554                    }
5555                }
5556            }
5557        }
5558
5559        tl::Node::Comment(_) => {
5560            // Comments are ignored
5561        }
5562    }
5563}
5564
5565/// Get colspan attribute value from element
5566fn get_colspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5567    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5568        if let Some(Some(bytes)) = tag.attributes().get("colspan") {
5569            if let Ok(colspan) = bytes.as_utf8_str().parse::<usize>() {
5570                return colspan;
5571            }
5572        }
5573    }
5574    1
5575}
5576
5577/// Get both colspan and rowspan in a single lookup
5578fn get_colspan_rowspan(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> (usize, usize) {
5579    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5580        let attrs = tag.attributes();
5581        let colspan = attrs
5582            .get("colspan")
5583            .flatten()
5584            .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5585            .unwrap_or(1);
5586        let rowspan = attrs
5587            .get("rowspan")
5588            .flatten()
5589            .and_then(|v| v.as_utf8_str().parse::<usize>().ok())
5590            .unwrap_or(1);
5591        (colspan, rowspan)
5592    } else {
5593        (1, 1)
5594    }
5595}
5596
5597/// Convert table cell (td or th)
5598fn convert_table_cell(
5599    node_handle: &tl::NodeHandle,
5600    parser: &tl::Parser,
5601    output: &mut String,
5602    options: &ConversionOptions,
5603    ctx: &Context,
5604    _tag_name: &str,
5605    dom_ctx: &DomContext,
5606) {
5607    let mut text = String::with_capacity(128);
5608
5609    let cell_ctx = Context {
5610        in_table_cell: true,
5611        ..ctx.clone()
5612    };
5613
5614    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5615        let children = tag.children();
5616        {
5617            for child_handle in children.top().iter() {
5618                walk_node(child_handle, parser, &mut text, options, &cell_ctx, 0, dom_ctx);
5619            }
5620        }
5621    }
5622
5623    let text = text.trim();
5624    let text = if options.br_in_tables {
5625        text.split('\n')
5626            .filter(|s| !s.is_empty())
5627            .collect::<Vec<_>>()
5628            .join("<br>")
5629    } else {
5630        text.replace('\n', " ")
5631    };
5632
5633    let colspan = get_colspan(node_handle, parser);
5634
5635    output.push(' ');
5636    output.push_str(&text);
5637    output.push_str(&" |".repeat(colspan));
5638}
5639
5640/// Convert table row (tr)
5641#[allow(clippy::too_many_arguments)]
5642fn convert_table_row(
5643    node_handle: &tl::NodeHandle,
5644    parser: &tl::Parser,
5645    output: &mut String,
5646    options: &ConversionOptions,
5647    ctx: &Context,
5648    row_index: usize,
5649    rowspan_tracker: &mut std::collections::HashMap<usize, (String, usize)>,
5650    dom_ctx: &DomContext,
5651) {
5652    let mut row_text = String::with_capacity(256);
5653    let mut cells = Vec::new();
5654
5655    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5656        let children = tag.children();
5657        {
5658            for child_handle in children.top().iter() {
5659                if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5660                    let cell_name = normalized_tag_name(child_tag.name().as_utf8_str());
5661                    if cell_name == "th" || cell_name == "td" {
5662                        cells.push(*child_handle);
5663                    }
5664                }
5665            }
5666        }
5667    }
5668
5669    let mut col_index = 0;
5670    let mut cell_iter = cells.iter();
5671
5672    loop {
5673        if let Some((_content, remaining_rows)) = rowspan_tracker.get_mut(&col_index) {
5674            if *remaining_rows > 0 {
5675                row_text.push(' ');
5676                row_text.push_str(" |");
5677                *remaining_rows -= 1;
5678                if *remaining_rows == 0 {
5679                    rowspan_tracker.remove(&col_index);
5680                }
5681                col_index += 1;
5682                continue;
5683            }
5684        }
5685
5686        if let Some(cell_handle) = cell_iter.next() {
5687            let cell_start = row_text.len();
5688            convert_table_cell(cell_handle, parser, &mut row_text, options, ctx, "", dom_ctx);
5689
5690            let (colspan, rowspan) = get_colspan_rowspan(cell_handle, parser);
5691
5692            if rowspan > 1 {
5693                // Extract the cell content that was just added (without separators)
5694                let cell_text = &row_text[cell_start..];
5695                // Strip leading space and trailing " |"
5696                let cell_content = cell_text
5697                    .trim_start_matches(' ')
5698                    .trim_end_matches(" |")
5699                    .trim()
5700                    .to_string();
5701                rowspan_tracker.insert(col_index, (cell_content, rowspan - 1));
5702            }
5703
5704            col_index += colspan;
5705        } else {
5706            break;
5707        }
5708    }
5709
5710    output.push('|');
5711    output.push_str(&row_text);
5712    output.push('\n');
5713
5714    let is_first_row = row_index == 0;
5715    if is_first_row {
5716        let total_cols = cells.iter().map(|h| get_colspan(h, parser)).sum::<usize>().max(1);
5717        output.push_str("| ");
5718        for i in 0..total_cols {
5719            if i > 0 {
5720                output.push_str(" | ");
5721            }
5722            output.push_str("---");
5723        }
5724        output.push_str(" |\n");
5725    }
5726}
5727
5728fn table_has_header(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5729    if let Some(node) = node_handle.get(parser) {
5730        if let tl::Node::Tag(tag) = node {
5731            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5732            if tag_name.as_ref() == "th" {
5733                return true;
5734            }
5735            let children = tag.children();
5736            for child in children.top().iter() {
5737                if table_has_header(child, parser) {
5738                    return true;
5739                }
5740            }
5741        }
5742    }
5743    false
5744}
5745
5746fn table_has_caption(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> bool {
5747    if let Some(node) = node_handle.get(parser) {
5748        if let tl::Node::Tag(tag) = node {
5749            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5750            if tag_name.as_ref() == "caption" {
5751                return true;
5752            }
5753            let children = tag.children();
5754            for child in children.top().iter() {
5755                if table_has_caption(child, parser) {
5756                    return true;
5757                }
5758            }
5759        }
5760    }
5761    false
5762}
5763
5764fn table_contains_nested_table(node_handle: &tl::NodeHandle, parser: &tl::Parser, is_root: bool) -> bool {
5765    if let Some(node) = node_handle.get(parser) {
5766        if let tl::Node::Tag(tag) = node {
5767            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5768            if !is_root && tag_name.as_ref() == "table" {
5769                return true;
5770            }
5771
5772            for child in tag.children().top().iter() {
5773                if table_contains_nested_table(child, parser, false) {
5774                    return true;
5775                }
5776            }
5777        }
5778    }
5779    false
5780}
5781
5782fn collect_table_row_counts(
5783    node_handle: &tl::NodeHandle,
5784    parser: &tl::Parser,
5785    counts: &mut Vec<usize>,
5786    has_span: &mut bool,
5787) {
5788    if let Some(node) = node_handle.get(parser) {
5789        if let tl::Node::Tag(tag) = node {
5790            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5791            match tag_name.as_ref() {
5792                "tr" => {
5793                    let mut cell_count = 0;
5794                    for child in tag.children().top().iter() {
5795                        if let Some(tl::Node::Tag(cell_tag)) = child.get(parser) {
5796                            let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5797                            if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5798                                cell_count += 1;
5799                                let attrs = cell_tag.attributes();
5800                                if attrs.get("colspan").is_some() || attrs.get("rowspan").is_some() {
5801                                    *has_span = true;
5802                                }
5803                            }
5804                        }
5805                    }
5806                    counts.push(cell_count);
5807                }
5808                _ => {
5809                    for child in tag.children().top().iter() {
5810                        collect_table_row_counts(child, parser, counts, has_span);
5811                    }
5812                }
5813            }
5814        }
5815    }
5816}
5817
5818fn count_links(node_handle: &tl::NodeHandle, parser: &tl::Parser) -> usize {
5819    let mut total = 0;
5820    if let Some(node) = node_handle.get(parser) {
5821        if let tl::Node::Tag(tag) = node {
5822            let tag_name = normalized_tag_name(tag.name().as_utf8_str());
5823            if tag_name.as_ref() == "a" {
5824                total += 1;
5825            }
5826
5827            for child in tag.children().top().iter() {
5828                total += count_links(child, parser);
5829            }
5830        }
5831    }
5832    total
5833}
5834
5835fn append_layout_row(
5836    row_handle: &tl::NodeHandle,
5837    parser: &tl::Parser,
5838    output: &mut String,
5839    options: &ConversionOptions,
5840    ctx: &Context,
5841    dom_ctx: &DomContext,
5842) {
5843    if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5844        let mut row_text = String::new();
5845        let row_children = row_tag.children();
5846        for cell_handle in row_children.top().iter() {
5847            if let Some(tl::Node::Tag(cell_tag)) = cell_handle.get(parser) {
5848                let cell_name = normalized_tag_name(cell_tag.name().as_utf8_str());
5849                if cell_name.as_ref() == "td" || cell_name.as_ref() == "th" {
5850                    let mut cell_text = String::new();
5851                    let cell_ctx = Context {
5852                        convert_as_inline: true,
5853                        ..ctx.clone()
5854                    };
5855                    let cell_children = cell_tag.children();
5856                    for cell_child in cell_children.top().iter() {
5857                        walk_node(cell_child, parser, &mut cell_text, options, &cell_ctx, 0, dom_ctx);
5858                    }
5859                    let cell_content = text::normalize_whitespace(&cell_text);
5860                    if !cell_content.trim().is_empty() {
5861                        if !row_text.is_empty() {
5862                            row_text.push(' ');
5863                        }
5864                        row_text.push_str(cell_content.trim());
5865                    }
5866                }
5867            }
5868        }
5869
5870        let trimmed = row_text.trim();
5871        if !trimmed.is_empty() {
5872            if !output.is_empty() && !output.ends_with('\n') {
5873                output.push('\n');
5874            }
5875            let formatted = trimmed.strip_prefix("- ").unwrap_or(trimmed).trim_start();
5876            output.push_str("- ");
5877            output.push_str(formatted);
5878            output.push('\n');
5879        }
5880    }
5881}
5882
5883/// Indent table lines so they stay within their parent list item.
5884fn indent_table_for_list(table_content: &str, list_depth: usize, options: &ConversionOptions) -> String {
5885    if list_depth == 0 {
5886        return table_content.to_string();
5887    }
5888
5889    let Some(mut indent) = continuation_indent_string(list_depth, options) else {
5890        return table_content.to_string();
5891    };
5892
5893    if matches!(options.list_indent_type, ListIndentType::Spaces) {
5894        let space_count = indent.chars().filter(|c| *c == ' ').count();
5895        if space_count < 4 {
5896            indent.push_str(&" ".repeat(4 - space_count));
5897        }
5898    }
5899
5900    let mut result = String::with_capacity(table_content.len() + indent.len() * 4);
5901    for segment in table_content.split_inclusive('\n') {
5902        if segment.starts_with('|') {
5903            result.push_str(&indent);
5904            result.push_str(segment);
5905        } else {
5906            result.push_str(segment);
5907        }
5908    }
5909    result
5910}
5911
5912/// Convert an entire table element
5913fn convert_table(
5914    node_handle: &tl::NodeHandle,
5915    parser: &tl::Parser,
5916    output: &mut String,
5917    options: &ConversionOptions,
5918    ctx: &Context,
5919    dom_ctx: &DomContext,
5920) {
5921    if let Some(tl::Node::Tag(tag)) = node_handle.get(parser) {
5922        let mut row_counts = Vec::new();
5923        let mut has_span = false;
5924        collect_table_row_counts(node_handle, parser, &mut row_counts, &mut has_span);
5925
5926        let row_count = row_counts.len();
5927        let mut distinct_counts: Vec<_> = row_counts.into_iter().filter(|c| *c > 0).collect();
5928        distinct_counts.sort_unstable();
5929        distinct_counts.dedup();
5930
5931        let looks_like_layout =
5932            table_contains_nested_table(node_handle, parser, true) || has_span || distinct_counts.len() > 1;
5933        let link_count = count_links(node_handle, parser);
5934        let table_text = text::normalize_whitespace(&get_text_content(node_handle, parser));
5935        let is_blank_table = table_text.trim().is_empty();
5936
5937        if !table_has_header(node_handle, parser)
5938            && !table_has_caption(node_handle, parser)
5939            && (looks_like_layout || is_blank_table || (row_count <= 2 && link_count >= 3))
5940        {
5941            if is_blank_table {
5942                return;
5943            }
5944
5945            let table_children = tag.children();
5946            for child_handle in table_children.top().iter() {
5947                if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5948                    let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5949                    match tag_name.as_ref() {
5950                        "thead" | "tbody" | "tfoot" => {
5951                            for row_handle in child_tag.children().top().iter() {
5952                                if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
5953                                    if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
5954                                        append_layout_row(row_handle, parser, output, options, ctx, dom_ctx);
5955                                    }
5956                                }
5957                            }
5958                        }
5959                        "tr" => append_layout_row(child_handle, parser, output, options, ctx, dom_ctx),
5960                        _ => {}
5961                    }
5962                }
5963            }
5964            if !output.ends_with('\n') {
5965                output.push('\n');
5966            }
5967            return;
5968        }
5969
5970        let mut row_index = 0;
5971        let mut rowspan_tracker = std::collections::HashMap::new();
5972
5973        let children = tag.children();
5974        {
5975            for child_handle in children.top().iter() {
5976                if let Some(tl::Node::Tag(child_tag)) = child_handle.get(parser) {
5977                    let tag_name = normalized_tag_name(child_tag.name().as_utf8_str());
5978
5979                    match tag_name.as_ref() {
5980                        "caption" => {
5981                            let mut text = String::new();
5982                            let grandchildren = child_tag.children();
5983                            {
5984                                for grandchild_handle in grandchildren.top().iter() {
5985                                    walk_node(grandchild_handle, parser, &mut text, options, ctx, 0, dom_ctx);
5986                                }
5987                            }
5988                            let text = text.trim();
5989                            if !text.is_empty() {
5990                                // Escape dashes in captions to avoid confusion with table separators
5991                                let escaped_text = text.replace('-', r"\-");
5992                                output.push('*');
5993                                output.push_str(&escaped_text);
5994                                output.push_str("*\n\n");
5995                            }
5996                        }
5997
5998                        "thead" | "tbody" | "tfoot" => {
5999                            let section_children = child_tag.children();
6000                            {
6001                                for row_handle in section_children.top().iter() {
6002                                    if let Some(tl::Node::Tag(row_tag)) = row_handle.get(parser) {
6003                                        if tag_name_eq(row_tag.name().as_utf8_str(), "tr") {
6004                                            convert_table_row(
6005                                                row_handle,
6006                                                parser,
6007                                                output,
6008                                                options,
6009                                                ctx,
6010                                                row_index,
6011                                                &mut rowspan_tracker,
6012                                                dom_ctx,
6013                                            );
6014                                            row_index += 1;
6015                                        }
6016                                    }
6017                                }
6018                            }
6019                        }
6020
6021                        "tr" => {
6022                            convert_table_row(
6023                                child_handle,
6024                                parser,
6025                                output,
6026                                options,
6027                                ctx,
6028                                row_index,
6029                                &mut rowspan_tracker,
6030                                dom_ctx,
6031                            );
6032                            row_index += 1;
6033                        }
6034
6035                        "colgroup" | "col" => {}
6036
6037                        _ => {}
6038                    }
6039                }
6040            }
6041        }
6042    }
6043}
6044
#[cfg(test)]
mod tests {
    use super::*;
    use crate::options::HighlightStyle;

    // Trailing spaces and tabs are stripped; a trailing newline is left untouched.
    #[test]
    fn test_trim_trailing_whitespace() {
        for (input, expected) in [
            ("hello   ", "hello"),
            ("hello\t\t", "hello"),
            ("hello \t \t", "hello"),
            ("hello", "hello"),
            ("", ""),
            ("hello\n", "hello\n"),
        ] {
            let mut buffer = String::from(input);
            trim_trailing_whitespace(&mut buffer);
            assert_eq!(buffer, expected);
        }
    }

    // Leading/trailing whitespace runs collapse to a single boundary space each.
    #[test]
    fn test_chomp_preserves_boundary_spaces() {
        for (input, expected) in [
            ("  text  ", (" ", " ", "text")),
            ("text", ("", "", "text")),
            ("  text", (" ", "", "text")),
            ("text  ", ("", " ", "text")),
            ("   ", (" ", " ", "")),
            ("", ("", "", "")),
        ] {
            assert_eq!(chomp_inline(input), expected);
        }
    }

    #[test]
    fn nested_strong_markup_is_normalized() {
        let markdown = convert_html("<strong><strong>Bold</strong></strong>", &ConversionOptions::default()).unwrap();
        assert_eq!(markdown.trim(), "**Bold**");
    }

    #[test]
    fn nested_strong_with_additional_text_is_normalized() {
        let markdown =
            convert_html("<strong>Hello <strong>World</strong></strong>", &ConversionOptions::default()).unwrap();
        assert_eq!(markdown.trim(), "**Hello World**");
    }

    #[test]
    fn nested_strong_partial_segments_are_normalized() {
        let markdown = convert_html("<b>bo<b>ld</b>er</b>", &ConversionOptions::default()).unwrap();
        assert_eq!(markdown.trim(), "**bolder**");
    }

    #[test]
    fn summary_with_inner_strong_is_not_double_wrapped() {
        let mut opts = ConversionOptions::default();
        opts.preprocessing.remove_forms = false;
        let source = "<details><summary><strong>Title</strong></summary></details>";
        let markdown = convert_html(source, &opts).unwrap();
        assert_eq!(markdown.trim(), "**Title**");
    }

    #[test]
    fn legend_with_inner_strong_is_not_double_wrapped() {
        // Form removal is switched off so the fieldset reaches the converter.
        let mut opts = ConversionOptions::default();
        opts.preprocessing.remove_forms = false;
        let source = "<fieldset><legend><strong>Section</strong></legend></fieldset>";
        let markdown = convert_html(source, &opts).unwrap();
        assert_eq!(markdown.trim(), "**Section**");
    }

    // With preprocessing on, the site-wide header goes away while the header
    // nested in <main> and the body text are kept.
    #[test]
    fn preprocessing_keeps_article_header_inside_main() {
        let source = r#"
        <body>
            <header class="global-header">
                <div>Global Navigation</div>
            </header>
            <main>
                <header class="article-header">
                    <h1>Primary Title</h1>
                </header>
                <p>Body content stays.</p>
            </main>
        </body>
        "#;
        let mut opts = ConversionOptions::default();
        opts.preprocessing.enabled = true;
        let markdown = convert_html(source, &opts).unwrap();

        let kept_title = markdown.contains("Primary Title");
        assert!(kept_title, "article header was removed: {}", markdown);

        let kept_body = markdown.contains("Body content stays");
        assert!(kept_body, "main body content missing: {}", markdown);

        let dropped_chrome = !markdown.contains("Global Navigation");
        assert!(dropped_chrome, "site chrome unexpectedly rendered: {}", markdown);
    }

    #[test]
    fn preprocessing_drops_nav_but_keeps_body() {
        let source = r##"
        <main>
            <nav aria-label="Primary navigation">
                <a href="#a">NavOnly</a>
            </nav>
            <article>
                <p>Important narrative</p>
            </article>
        </main>
        "##;
        let mut opts = ConversionOptions::default();
        opts.preprocessing.enabled = true;
        let markdown = convert_html(source, &opts).unwrap();

        let nav_removed = !markdown.contains("NavOnly");
        assert!(nav_removed, "navigation text should not appear: {}", markdown);

        let article_kept = markdown.contains("Important narrative");
        assert!(article_kept, "article text should remain: {}", markdown);
    }

    // NOTE(review): unlike the two preprocessing tests above, this one runs with
    // default options and never sets `preprocessing.enabled` — confirm that is intended.
    #[test]
    fn preprocessing_retains_section_headers_inside_articles() {
        let source = r#"
        <article>
            <header>
                <h2>Section Heading</h2>
            </header>
            <section>
                <p>Section body</p>
            </section>
        </article>
        "#;
        let markdown = convert_html(source, &ConversionOptions::default()).unwrap();

        let heading_kept = markdown.contains("Section Heading");
        assert!(heading_kept, "section heading was stripped: {}", markdown);

        let body_kept = markdown.contains("Section body");
        assert!(body_kept, "section body missing: {}", markdown);
    }

    #[test]
    fn bold_highlight_suppresses_nested_strong() {
        let opts = ConversionOptions {
            highlight_style: HighlightStyle::Bold,
            ..Default::default()
        };
        let markdown = convert_html("<p><mark><strong>Hot</strong></mark></p>", &opts).unwrap();
        assert_eq!(markdown.trim(), "**Hot**");
    }

    // Source line breaks inside a heading end up as single spaces.
    #[test]
    fn atx_heading_swallows_layout_line_breaks() {
        let source = r#"<h2>
  Heading
  Text
  with
  Line
  Breaks
</h2>"#;
        let markdown = convert_html(source, &ConversionOptions::default()).unwrap();
        assert_eq!(markdown.trim(), "## Heading Text with Line Breaks");
    }

    #[test]
    fn doctype_is_removed() {
        let source = r#"<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
            <html>
                <head><title>Example</title></head>
                <body><p>Hello World</p></body>
            </html>"#;
        let opts = ConversionOptions {
            extract_metadata: false,
            ..Default::default()
        };
        let markdown = convert_html(source, &opts).unwrap();
        assert_eq!(markdown.trim(), "Hello World");
    }

    #[test]
    fn test_calculate_list_continuation_indent() {
        for (depth, expected) in [(0, 0), (1, 1), (2, 3), (3, 5), (4, 7)] {
            assert_eq!(calculate_list_continuation_indent(depth), expected);
        }
    }

    // The script body is emptied, but the tags and everything after them survive.
    #[test]
    fn strips_script_sections_without_removing_following_content() {
        let cleaned = strip_script_and_style_sections("<div>before</div><script>1 < 2</script><p>after</p>");
        assert_eq!(cleaned, "<div>before</div><script></script><p>after</p>");
    }

    #[test]
    fn strips_multiline_script_sections() {
        let cleaned = strip_script_and_style_sections("<html>\n<script>1 < 2</script>\nContent\n</html>");
        assert!(cleaned.contains("Content"));
        assert!(cleaned.contains("<script"));
        assert!(!cleaned.contains("1 < 2"));
    }

    #[test]
    fn hr_inside_paragraph_matches_inline_expectation() {
        let opts = ConversionOptions {
            extract_metadata: false,
            ..Default::default()
        };
        let rendered = convert_html("<p>Hello<hr>World</p>", &opts).unwrap();
        assert_eq!(rendered, "Hello\n---\nWorld\n");
    }

    // Same expectation as above, but going through the crate-level entry point.
    #[test]
    fn hr_inside_paragraph_matches_inline_expectation_via_public_api() {
        let opts = ConversionOptions {
            extract_metadata: false,
            ..Default::default()
        };
        let rendered = crate::convert("<p>Hello<hr>World</p>", Some(opts)).unwrap();
        assert_eq!(rendered, "Hello\n---\nWorld\n");
    }

    // Blank-line mode: however many newlines the buffer already ends with, the
    // result is exactly one blank line followed by the indent.
    #[test]
    fn test_add_list_continuation_indent_blank_line() {
        let opts = ConversionOptions::default();
        for (initial, depth, expected) in [
            ("* First para", 1, "* First para\n\n  "),
            ("* First para\n", 1, "* First para\n\n  "),
            ("* First para\n\n", 1, "* First para\n\n  "),
            ("* First para", 2, "* First para\n\n      "),
        ] {
            let mut buffer = String::from(initial);
            add_list_continuation_indent(&mut buffer, depth, true, &opts);
            assert_eq!(buffer, expected);
        }
    }

    // Single-line mode: one newline followed by the indent.
    #[test]
    fn test_add_list_continuation_indent_single_line() {
        let opts = ConversionOptions::default();
        for (initial, expected) in [
            ("* First div", "* First div\n  "),
            ("* First div\n", "* First div\n  "),
            // NOTE(review): identical to the previous case; the blank-line test uses a
            // "\n\n" input in this position — confirm whether that was intended here.
            ("* First div\n", "* First div\n  "),
        ] {
            let mut buffer = String::from(initial);
            add_list_continuation_indent(&mut buffer, 1, false, &opts);
            assert_eq!(buffer, expected);
        }
    }

    // Trailing spaces/tabs are dropped before the continuation is appended.
    #[test]
    fn test_trim_trailing_whitespace_in_continuation() {
        let opts = ConversionOptions::default();
        for (initial, blank_line, expected) in [
            ("* First   ", true, "* First\n\n  "),
            ("* First\t\t", false, "* First\n  "),
        ] {
            let mut buffer = String::from(initial);
            add_list_continuation_indent(&mut buffer, 1, blank_line, &opts);
            assert_eq!(buffer, expected);
        }
    }

    #[test]
    fn test_escape_malformed_angle_brackets_bare() {
        assert_eq!(escape_malformed_angle_brackets("1<2"), "1&lt;2");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_in_text() {
        assert_eq!(
            escape_malformed_angle_brackets("<html>1<2 Content</html>"),
            "<html>1&lt;2 Content</html>"
        );
    }

    #[test]
    fn test_escape_malformed_angle_brackets_multiple() {
        assert_eq!(escape_malformed_angle_brackets("1 < 2 < 3"), "1 &lt; 2 &lt; 3");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_valid_tags() {
        assert_eq!(escape_malformed_angle_brackets("<div>content</div>"), "<div>content</div>");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_mixed() {
        assert_eq!(
            escape_malformed_angle_brackets("<div>1<2</div><p>3<4</p>"),
            "<div>1&lt;2</div><p>3&lt;4</p>"
        );
    }

    #[test]
    fn test_escape_malformed_angle_brackets_at_end() {
        assert_eq!(escape_malformed_angle_brackets("test<"), "test&lt;");
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_comments() {
        assert_eq!(
            escape_malformed_angle_brackets("<!-- comment -->1<2"),
            "<!-- comment -->1&lt;2"
        );
    }

    #[test]
    fn test_escape_malformed_angle_brackets_preserves_doctype() {
        assert_eq!(
            escape_malformed_angle_brackets("<!DOCTYPE html>1<2"),
            "<!DOCTYPE html>1&lt;2"
        );
    }

    #[test]
    fn test_convert_with_malformed_angle_brackets() {
        // Test the full conversion pipeline (issue #94)
        let markdown = convert_html("<html>1<2\nContent</html>", &ConversionOptions::default()).unwrap();

        let has_content = markdown.contains("Content");
        assert!(has_content, "Result should contain 'Content': {:?}", markdown);

        let has_comparison = markdown.contains("1<2") || markdown.contains("1&lt;2");
        assert!(
            has_comparison,
            "Result should contain escaped or unescaped comparison"
        );
    }

    #[test]
    fn test_convert_with_malformed_angle_brackets_in_div() {
        let markdown = convert_html(
            "<html><div>1<2</div><div>Content</div></html>",
            &ConversionOptions::default(),
        )
        .unwrap();
        let has_content = markdown.contains("Content");
        assert!(has_content, "Result should contain 'Content': {:?}", markdown);
    }

    #[test]
    fn test_convert_with_multiple_malformed_angle_brackets() {
        let markdown = convert_html("<html>1 < 2 < 3<p>Content</p></html>", &ConversionOptions::default()).unwrap();
        let has_content = markdown.contains("Content");
        assert!(has_content, "Result should contain 'Content': {:?}", markdown);
    }

    #[test]
    fn test_preserve_tags_simple_table() {
        let source = r#"<div><table><tr><td>Cell 1</td><td>Cell 2</td></tr></table><p>Text</p></div>"#;
        let opts = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let rendered = convert_html(source, &opts).unwrap();

        for (needle, why) in [
            ("<table>", "Should preserve table tag"),
            ("</table>", "Should have closing table tag"),
            ("<tr>", "Should preserve tr tag"),
            ("<td>", "Should preserve td tag"),
            ("Text", "Should convert other elements"),
        ] {
            assert!(rendered.contains(needle), "{}", why);
        }
    }

    #[test]
    fn test_preserve_tags_with_attributes() {
        let source = r#"<table class="data" id="mytable"><tr><td>Data</td></tr></table>"#;
        let opts = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let rendered = convert_html(source, &opts).unwrap();

        for (needle, why) in [
            ("<table", "Should preserve table tag"),
            ("class=", "Should preserve class attribute"),
            ("id=", "Should preserve id attribute"),
            ("</table>", "Should have closing tag"),
        ] {
            assert!(rendered.contains(needle), "{}", why);
        }
    }

    #[test]
    fn test_preserve_tags_multiple_tags() {
        let source = r#"<div><table><tr><td>Table</td></tr></table><form><input type="text"/></form><p>Text</p></div>"#;
        let opts = ConversionOptions {
            preserve_tags: vec!["table".to_string(), "form".to_string()],
            ..Default::default()
        };
        let rendered = convert_html(source, &opts).unwrap();

        for (needle, why) in [
            ("<table>", "Should preserve table"),
            ("<form>", "Should preserve form"),
            ("Text", "Should convert paragraph"),
        ] {
            assert!(rendered.contains(needle), "{}", why);
        }
    }

    #[test]
    fn test_preserve_tags_nested_content() {
        let source = r#"<table><thead><tr><th>Header</th></tr></thead><tbody><tr><td>Data</td></tr></tbody></table>"#;
        let opts = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            ..Default::default()
        };
        let rendered = convert_html(source, &opts).unwrap();

        for (needle, why) in [
            ("<thead>", "Should preserve nested thead"),
            ("<tbody>", "Should preserve nested tbody"),
            ("<th>", "Should preserve th tag"),
            ("Header", "Should preserve text content"),
        ] {
            assert!(rendered.contains(needle), "{}", why);
        }
    }

    #[test]
    fn test_preserve_tags_empty_list() {
        // Default options carry no preserve_tags entries.
        let opts = ConversionOptions::default();
        let rendered = convert_html(r#"<table><tr><td>Cell</td></tr></table>"#, &opts).unwrap();

        // Should convert to markdown table (or at least not preserve HTML)
        let kept_raw_table = rendered.contains("<table>");
        assert!(!kept_raw_table, "Should not preserve table without preserve_tags");
    }

    #[test]
    fn test_preserve_tags_vs_strip_tags() {
        let source = r#"<table><tr><td>Table</td></tr></table><div><span>Text</span></div>"#;
        let opts = ConversionOptions {
            preserve_tags: vec!["table".to_string()],
            strip_tags: vec!["span".to_string()],
            ..Default::default()
        };
        let rendered = convert_html(source, &opts).unwrap();

        assert!(rendered.contains("<table>"), "Should preserve table");
        assert!(!rendered.contains("<span>"), "Should strip span tag");
        assert!(rendered.contains("Text"), "Should keep span text content");
    }

    #[test]
    fn example_com_remains_visible() {
        let source = "<!doctype html><html lang=\"en\"><head><title>Example Domain</title><meta name=\"viewport\" content=\"width=device-width, initial-scale=1\"><style>body{background:#eee;width:60vw;margin:15vh auto;font-family:system-ui,sans-serif}h1{font-size:1.5em}div{opacity:0.8}a:link,a:visited{color:#348}</style><body><div><h1>Example Domain</h1><p>This domain is for use in documentation examples without needing permission. Avoid use in operations.<p><a href=\"https://iana.org/domains/example\">Learn more</a></div></body></html>";

        // matches CLI default
        let opts = ConversionOptions {
            extract_metadata: false,
            ..Default::default()
        };
        let markdown = convert_html(source, &opts).unwrap();

        let has_title = markdown.contains("Example Domain");
        assert!(has_title, "content unexpectedly missing: {}", markdown);
    }
}
// Input without self-closing tags must come back borrowed and unchanged.
#[test]
fn normalize_self_closing_tags_noop_when_absent() {
    let source = "<div><p>text</p></div>";
    let outcome = normalize_self_closing_tags(source);
    let is_borrowed = matches!(outcome, Cow::Borrowed(_));
    assert!(is_borrowed);
    assert_eq!(&*outcome, source);
}
6541
// `<br/>`, `<hr/>` and `<img/>` are rewritten to their non-self-closing forms.
#[test]
fn normalize_self_closing_tags_replaces_targets() {
    let outcome = normalize_self_closing_tags("<br/><hr/><img/>");
    assert_eq!(outcome.as_ref(), "<br><hr><img>");
}