html-to-markdown-rs 3.3.3

High-performance HTML to Markdown converter using the astral-tl parser. Part of the Kreuzberg ecosystem.
Documentation
//! Handlers for typography and text semantic elements.
//!
//! Contains:
//! - Small text (pass through)
//! - Subscript and superscript with configurable symbols
//! - Variable (var) and definition (dfn) text with italic formatting
//! - Abbreviation (abbr) with optional title
//! - Span element with special OCR handling

use crate::options::{ConversionOptions, OutputFormat};
use tl::{NodeHandle, Parser};

type Context = crate::converter::Context;
type DomContext = crate::converter::DomContext;

/// Handle small element.
///
/// Small text has no direct Markdown equivalent, so just pass through content.
pub fn handle_small(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::walk_node;

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    let children = tag.children();
    for child_handle in children.top().iter() {
        walk_node(child_handle, parser, output, options, ctx, depth + 1, dom_ctx);
    }
}

/// Handle subscript element (sub tag).
///
/// Wraps content with configurable subscript symbol from options.
pub fn handle_subscript(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::{append_inline_suffix, chomp_inline, walk_node};

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    let mut content = String::with_capacity(32);
    let children = tag.children();
    for child_handle in children.top().iter() {
        walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
    }

    if ctx.in_code {
        output.push_str(&content);
        return;
    }

    let (prefix, suffix, trimmed) = chomp_inline(&content);
    if !trimmed.is_empty() {
        output.push_str(prefix);
        if options.output_format == OutputFormat::Djot {
            output.push('~');
            output.push_str(trimmed);
            output.push('~');
        } else if !options.sub_symbol.is_empty() {
            output.push_str(&options.sub_symbol);
            output.push_str(trimmed);
            if options.sub_symbol.starts_with('<') && !options.sub_symbol.starts_with("</") {
                output.push_str(&options.sub_symbol.replace('<', "</"));
            } else {
                output.push_str(&options.sub_symbol);
            }
        } else {
            output.push_str(trimmed);
        }
        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
    }
}

/// Handle superscript element (sup tag).
///
/// Wraps content with configurable superscript symbol from options.
pub fn handle_superscript(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::{append_inline_suffix, chomp_inline, walk_node};

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    let mut content = String::with_capacity(32);
    let children = tag.children();
    for child_handle in children.top().iter() {
        walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
    }

    if ctx.in_code {
        output.push_str(&content);
        return;
    }

    let (prefix, suffix, trimmed) = chomp_inline(&content);
    if !trimmed.is_empty() {
        output.push_str(prefix);
        if options.output_format == OutputFormat::Djot {
            output.push('^');
            output.push_str(trimmed);
            output.push('^');
        } else if !options.sup_symbol.is_empty() {
            output.push_str(&options.sup_symbol);
            output.push_str(trimmed);
            if options.sup_symbol.starts_with('<') && !options.sup_symbol.starts_with("</") {
                output.push_str(&options.sup_symbol.replace('<', "</"));
            } else {
                output.push_str(&options.sup_symbol);
            }
        } else {
            output.push_str(trimmed);
        }
        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
    }
}

/// Handle variable element (var tag).
///
/// Wraps content with italic symbol (strong_em_symbol from options).
pub fn handle_variable(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::{append_inline_suffix, chomp_inline, walk_node};

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    let mut content = String::with_capacity(32);
    let children = tag.children();
    for child_handle in children.top().iter() {
        walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
    }

    let (prefix, suffix, trimmed) = chomp_inline(&content);
    if !trimmed.is_empty() {
        output.push_str(prefix);
        output.push(options.strong_em_symbol);
        output.push_str(trimmed);
        output.push(options.strong_em_symbol);
        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
    }
}

/// Handle definition element (dfn tag).
///
/// Wraps content with italic symbol (strong_em_symbol from options).
pub fn handle_definition(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::{append_inline_suffix, chomp_inline, walk_node};

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    let mut content = String::with_capacity(32);
    let children = tag.children();
    for child_handle in children.top().iter() {
        walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
    }

    let (prefix, suffix, trimmed) = chomp_inline(&content);
    if !trimmed.is_empty() {
        output.push_str(prefix);
        output.push(options.strong_em_symbol);
        output.push_str(trimmed);
        output.push(options.strong_em_symbol);
        append_inline_suffix(output, suffix, !trimmed.is_empty(), node_handle, parser, dom_ctx);
    }
}

/// Handle abbreviation element (abbr tag).
///
/// Passes through content and optionally appends title attribute in parentheses.
pub fn handle_abbreviation(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::walk_node;

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    let mut content = String::with_capacity(32);
    let children = tag.children();
    for child_handle in children.top().iter() {
        walk_node(child_handle, parser, &mut content, options, ctx, depth + 1, dom_ctx);
    }

    let trimmed = content.trim();

    if !trimmed.is_empty() {
        output.push_str(trimmed);

        if let Some(title) = tag.attributes().get("title").flatten().map(|v| v.as_utf8_str()) {
            let trimmed_title = title.trim();
            if !trimmed_title.is_empty() {
                output.push_str(" (");
                output.push_str(trimmed_title);
                output.push(')');
            }
        }
    }
}

/// Handle span element.
///
/// Processes span elements with special handling for:
/// - OCR words (elements with class "ocrx_word"): adds space before if needed
/// - Whitespace normalization in normalized mode: removes single newlines
/// - Otherwise passes through content normally
pub fn handle_span(
    node_handle: &NodeHandle,
    parser: &Parser,
    output: &mut String,
    options: &ConversionOptions,
    ctx: &Context,
    depth: usize,
    dom_ctx: &DomContext,
) {
    use crate::converter::walk_node;

    let Some(node) = node_handle.get(parser) else { return };

    let tag = match node {
        tl::Node::Tag(tag) => tag,
        _ => return,
    };

    // Check if this is an OCR word span (class="ocrx_word")
    let is_hocr_word = tag.attributes().iter().any(|(name, value)| {
        name.as_ref() == "class" && value.as_ref().is_some_and(|v| v.as_ref().contains("ocrx_word"))
    });

    // Add space before OCR words if needed
    if is_hocr_word
        && !output.is_empty()
        && !output.ends_with(' ')
        && !output.ends_with('\t')
        && !output.ends_with('\n')
    {
        output.push(' ');
    }

    // Handle whitespace normalization
    if !ctx.in_code
        && options.whitespace_mode == crate::options::WhitespaceMode::Normalized
        && output.ends_with('\n')
        && !output.ends_with("\n\n")
    {
        output.pop();
    }

    // Process children normally
    let children = tag.children();
    {
        for child_handle in children.top().iter() {
            walk_node(child_handle, parser, output, options, ctx, depth, dom_ctx);
        }
    }
}