html-to-markdown-rs 3.3.3

High-performance HTML to Markdown converter using the astral-tl parser. Part of the Kreuzberg ecosystem.
Documentation
//! Handler for div element.
//!
//! Converts HTML div elements to Markdown by processing children while maintaining
//! appropriate spacing and context awareness for:
//! - Table continuations: Uses table-specific line breaks
//! - List continuations: Uses list indentation
//! - Block context: Adds surrounding newlines for proper block separation

use crate::converter::main_helpers::trim_trailing_whitespace;
use crate::options::ConversionOptions;
use tl::{NodeHandle, Parser};

// Type aliases for Context and DomContext to avoid circular imports
type Context = crate::converter::Context;
type DomContext = crate::converter::DomContext;

/// Handles div elements.
///
/// Divs are generic container elements that need special handling based on context:
/// - When inline context: passes through children without separators
/// - When in table cell: uses table-specific line breaks (<br> or backslash)
/// - When in list item: uses list continuation indentation
/// - When in block context: adds appropriate newlines before/after content
///
/// # Note
/// This function references `walk_node` and helper functions from converter.rs
/// which must be accessible (pub(crate)) for this module to work correctly.
pub fn handle(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::walk_node;

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    // If inline conversion mode, just pass children through
    if ctx.convert_as_inline {
        let children = tag.children();
        {
            for child_handle in children.top().iter() {
                walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
            }
        }
        return;
    }

    let content_start_pos = output.len();

    let is_table_continuation =
        ctx.in_table_cell && !output.is_empty() && !output.ends_with('|') && !output.ends_with("<br>");

    let is_list_continuation = ctx.in_list_item
        && !output.is_empty()
        && !output.ends_with("* ")
        && !output.ends_with("- ")
        && !output.ends_with(". ");

    let needs_leading_sep = !ctx.in_table_cell
        && !ctx.in_list_item
        && !ctx.convert_as_inline
        && !output.is_empty()
        && !output.ends_with("\n\n");

    // Handle leading separators based on context
    if is_table_continuation {
        trim_trailing_whitespace(output);
        if options.br_in_tables {
            output.push_str("<br>");
        } else {
            use crate::options::NewlineStyle;
            match options.newline_style {
                NewlineStyle::Spaces => output.push_str("  \n"),
                NewlineStyle::Backslash => output.push_str("\\\n"),
            }
        }
    } else if is_list_continuation {
        add_list_continuation_indent(output, ctx.list_depth, false, options);
    } else if needs_leading_sep {
        trim_trailing_whitespace(output);
        output.push_str("\n\n");
    }

    // Process children
    let children = tag.children();
    {
        for child_handle in children.top().iter() {
            walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
        }
    }

    let has_content = output.len() > content_start_pos;

    if has_content {
        if content_start_pos == 0 && output.starts_with('\n') && !output.starts_with("\n\n") {
            output.remove(0);
        }
        trim_trailing_whitespace(output);

        if ctx.in_table_cell {
            // No trailing separator in table cells
        } else if ctx.in_list_item {
            if is_list_continuation {
                if !output.ends_with('\n') {
                    output.push('\n');
                }
            } else if !output.ends_with("\n\n") {
                if output.ends_with('\n') {
                    output.push('\n');
                } else {
                    output.push_str("\n\n");
                }
            }
        } else if !ctx.in_list_item && !ctx.convert_as_inline {
            if output.ends_with("\n\n") {
                // Already has proper spacing
            } else if output.ends_with('\n') {
                output.push('\n');
            } else {
                output.push_str("\n\n");
            }
        }
    }
}

/// Helper function to add list continuation indentation
fn add_list_continuation_indent(
    output: &mut String,
    list_depth: usize,
    _block_level: bool,
    _options: &ConversionOptions,
) {
    if !output.ends_with('\n') {
        output.push('\n');
    }

    for _ in 0..list_depth {
        output.push_str("  ");
    }
}