html-to-markdown-rs 3.4.0

High-performance HTML to Markdown converter using the astral-tl parser. Part of the Kreuzberg ecosystem.
Documentation
#![allow(missing_docs)]

fn convert(
    html: &str,
    opts: Option<html_to_markdown_rs::ConversionOptions>,
) -> html_to_markdown_rs::error::Result<String> {
    html_to_markdown_rs::convert(html, opts).map(|r| r.content.unwrap_or_default())
}

use html_to_markdown_rs::ConversionOptions;

#[test]
fn footer_without_navigation_hint_is_preserved() {
    let html = r#"<!DOCTYPE html>
<html lang="en">
  <body>
    <main>
      <h1>Simple Webpage</h1>
      <p>This is a simple webpage without external images.</p>
    </main>
    <footer>
      <p>Test page for processors validation</p>
    </footer>
  </body>
</html>"#;

    let markdown = convert(html, None).unwrap();
    assert!(
        markdown.contains("Test page for processors validation"),
        "footer content should be retained in markdown:\n{markdown}"
    );
}

#[test]
fn footer_with_navigation_hint_is_removed() {
    let html = r#"<!DOCTYPE html>
<html lang="en">
  <body>
    <main>
      <h1>Simple Webpage</h1>
    </main>
    <footer class="site-footer">
      <p>Test page for processors validation</p>
      <nav><a href="/about">About</a></nav>
    </footer>
  </body>
</html>"#;

    let options = ConversionOptions {
        preprocessing: html_to_markdown_rs::PreprocessingOptions {
            enabled: true,
            ..Default::default()
        },
        ..Default::default()
    };
    let markdown = convert(html, Some(options)).unwrap();
    assert!(
        !markdown.contains("processors validation"),
        "navigational footers should still be stripped entirely:\n{markdown}"
    );
}