// skymark 0.1.1
//
// HTML-to-Markdown converter prioritizing proper conversion for human readability.
// Documentation
#![allow(missing_docs, reason = "allow in tests")]
#![allow(clippy::unwrap_used, reason = "allow in tests")]

use regex::Regex;
use skymark::{CodeBlockStyle, HtmlToMarkdown, Options};

/// Translates the same two-image snippet with `keep_data_images` switched on and then off,
/// asserting that the `data:` image appears in the output only in the first case.
#[test]
fn keep_data_images_option() {
    let html = "<img alt=\"normal\" src=\"normal_img.jpg\">\n      <img src=\"data:image/gif;base64,R0lGODlhEA\"/>";
    let mut converter = HtmlToMarkdown::new();

    // Enabled: both images are expected in the output.
    converter.options_mut().keep_data_images = true;
    let with_data_images = converter.translate(html);
    assert_eq!(
        with_data_images,
        "![normal](normal_img.jpg) ![](data:image/gif;base64,R0lGODlhEA)"
    );

    // Disabled: only the regular image is expected.
    converter.options_mut().keep_data_images = false;
    let without_data_images = converter.translate(html);
    assert_eq!(without_data_images, "![normal](normal_img.jpg)");
}

/// Translates one document containing repeated, distinct, bare-URL and `href`-less anchors,
/// first with `use_link_reference_definitions` off (inline links expected) and then on
/// (numbered references plus a trailing definition list expected).
#[test]
fn use_link_reference_definitions_option() {
    let url = "http://www.github.com/crosstype";
    let html = format!(
        "Hello:&nbsp;\n        <a href=\"{url}\">a<br><br>b<strong>c</strong></a>\n        <a>a<strong>b</strong></a>\n        <a href=\"{url}/other\">link2</a>\n        <a href=\"{url}\">repeat link</a>\n        <a href=\"{url}\">{url}</a>&nbsp;Goodbye!\n    "
    );
    let mut converter = HtmlToMarkdown::new();

    // Reference definitions disabled.
    let inline_expected = format!(
        "Hello: [a b**c**]({url}) a**b** [link2]({url}/other) [repeat link]({url}) <{url}> Goodbye!"
    );
    converter.options_mut().use_link_reference_definitions = false;
    assert_eq!(converter.translate(&html), inline_expected);

    // Reference definitions enabled: the repeated URL is expected to reuse reference `[1]`.
    let reference_expected = format!(
        "Hello: [a b**c**][1] a**b** [link2][2] [repeat link][1] <{url}> Goodbye! \n\n[1]: {url}\n[2]: {url}/other"
    );
    converter.options_mut().use_link_reference_definitions = true;
    assert_eq!(converter.translate(&html), reference_expected);
}

/// Translates the same document with `use_inline_links` set to `false` and then `true`,
/// asserting that a link whose text equals its URL is rendered as `[url](url)` in the first
/// case and as `<url>` in the second.
#[test]
fn use_inline_links_option() {
    let url = "http://www.github.com/crosstype";
    let html = format!(
        "Hello:&nbsp;\n        <a href=\"{url}\">{url}</a>\n        <a>a<strong>b</strong></a>\n        <a href=\"{url}/other\">link2</a>\n        <a href=\"{url}\">repeat link</a> Goodbye!\n    "
    );
    let mut converter = HtmlToMarkdown::new();

    // (option value, expected translation), checked in this order.
    let expectations = [
        (
            false,
            format!(
                "Hello: [{url}]({url}) a**b** [link2]({url}/other) [repeat link]({url}) Goodbye!"
            ),
        ),
        (
            true,
            format!("Hello: <{url}> a**b** [link2]({url}/other) [repeat link]({url}) Goodbye!"),
        ),
    ];

    for (inline_links, expected) in expectations {
        converter.options_mut().use_inline_links = inline_links;
        assert_eq!(converter.translate(&html), expected);
    }
}

/// Translates a `language-fortran` code block with the default options and then with
/// `code_fence` set to `+++++`, asserting the fence string used and that the code text is
/// emitted unchanged in both cases.
#[test]
fn code_fence_option() {
    let text = "* test  \n\n1. test\n\\Test";
    let html = format!("<pre><code class=\"language-fortran\">{text}</code></pre>");
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    let default_fenced = format!("```fortran\n{text}\n```");
    assert_eq!(converter.translate(&html), default_fenced);

    // Custom fence.
    let custom_fenced = format!("+++++fortran\n{text}\n+++++");
    converter.options_mut().code_fence = String::from("+++++");
    assert_eq!(converter.translate(&html), custom_fenced);
}

/// Translates a two-line `<pre><code>` block under each `CodeBlockStyle` variant used here
/// (`Fenced`, then `Indented`) and asserts the expected rendering for each.
#[test]
fn code_block_style_option() {
    let html = "<pre><code>line1\nline2</code></pre>";
    let mut converter = HtmlToMarkdown::new();

    let expectations = [
        (CodeBlockStyle::Fenced, "```\nline1\nline2\n```"),
        (CodeBlockStyle::Indented, "    line1\n    line2"),
    ];

    for (style, expected) in expectations {
        converter.options_mut().code_block_style = style;
        assert_eq!(converter.translate(html), expected);
    }
}

/// Translates two adjacent `<em>` elements with the default options and then with
/// `em_delimiter` set to `|` and to `+++`, asserting the delimiter used each time.
#[test]
fn em_delimiter_option() {
    let html = "<em>some text</em><em>more text</em>";
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    assert_eq!(converter.translate(html), "_some text_ _more text_");

    // Custom delimiters, applied in order.
    let overrides = [
        ("|", "|some text| |more text|"),
        ("+++", "+++some text+++ +++more text+++"),
    ];
    for (delimiter, expected) in overrides {
        converter.options_mut().em_delimiter = delimiter.to_owned();
        assert_eq!(converter.translate(html), expected);
    }
}

/// Translates two adjacent `<strong>` elements with the default options and then with
/// `strong_delimiter` set to `|` and to `+++`, asserting the delimiter used each time.
#[test]
fn strong_delimiter_option() {
    let html = "<strong>some text</strong><strong>more text</strong>";
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    let default_expected = "**some text** **more text**";
    assert_eq!(converter.translate(html), default_expected);

    // Single-character delimiter.
    let pipe_expected = "|some text| |more text|";
    converter.options_mut().strong_delimiter = String::from("|");
    assert_eq!(converter.translate(html), pipe_expected);

    // Multi-character delimiter.
    let plus_expected = "+++some text+++ +++more text+++";
    converter.options_mut().strong_delimiter = String::from("+++");
    assert_eq!(converter.translate(html), plus_expected);
}

/// Translates `<strike>`, `<s>` and `<del>` elements with the default options and then with
/// `strike_delimiter` set to `~` and to `+++`, asserting that all three tags use the
/// configured delimiter.
#[test]
fn strike_delimiter_option() {
    let html = "<strike>some text</strike><s>more text</s><del>one more text</del>";
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    assert_eq!(
        converter.translate(html),
        "~~some text~~ ~~more text~~ ~~one more text~~"
    );

    // Custom delimiters, applied in order.
    let overrides = [
        ("~", "~some text~ ~more text~ ~one more text~"),
        (
            "+++",
            "+++some text+++ +++more text+++ +++one more text+++",
        ),
    ];
    for (delimiter, expected) in overrides {
        converter.options_mut().strike_delimiter = delimiter.to_owned();
        assert_eq!(converter.translate(html), expected);
    }
}

/// Installs a single `text_replace` rule mapping the pattern `abc` to `xyz` and asserts that
/// the replacement shows up in the translated heading.
#[test]
#[allow(clippy::trivial_regex, reason = "required for test")]
fn text_replace_option() {
    let pattern = Regex::new("abc").unwrap();
    let replacement = String::from("xyz");

    let mut converter = HtmlToMarkdown::new();
    converter.options_mut().text_replace = vec![(pattern, replacement)];

    let translated = converter.translate("<h1>hello abc</h1>");
    assert_eq!(translated, "# hello xyz");
}

/// Asserts that, with the default options, a leading `+` or `>` after a line break is
/// backslash-escaped, and that after replacing `line_start_escape` with a pattern whose
/// character class omits `+`, a leading `+` is emitted unescaped.
#[test]
fn line_start_escape_option() {
    let plus_lines = "<p>text<br>+ text<br>+ more text</p>";
    let quote_lines = "<p>text<br>> text<br>> more text</p>";
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    assert_eq!(
        converter.translate(plus_lines),
        "text  \n\\+ text  \n\\+ more text"
    );
    assert_eq!(
        converter.translate(quote_lines),
        "text  \n\\> text  \n\\> more text"
    );

    // Custom escape rule.
    let pattern = Regex::new(r"(?m)^(\s*?)((?:[=>-])|(?:#{1,6}\s))|(?:(\d+)(\.\s))").unwrap();
    let replacement = "$1$3\\$2$4".to_owned();
    converter.options_mut().line_start_escape = (pattern, replacement);

    assert_eq!(
        converter.translate(plus_lines),
        "text  \n+ text  \n+ more text"
    );
}

/// Asserts that `**` inside text is escaped with the default options, and that after
/// replacing `global_escape` with a pattern matching only `_`, `~`, `[` and `]`, asterisks
/// pass through unescaped while square brackets are still escaped.
#[test]
fn global_escape_option() {
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    let default_output = converter.translate("<strong>text**text</strong>");
    assert_eq!(default_output, "**text\\*\\*text**");

    // Custom escape rule.
    let escape_pattern = Regex::new(r"[_~\[\]]").unwrap();
    converter.options_mut().global_escape = (escape_pattern, r"\$0".to_owned());

    let expectations = [
        ("<i>text**text</i>", "_text**text_"),
        ("<h1>title [more words]</h1>", "# title \\[more words\\]"),
    ];
    for (html, expected) in expectations {
        assert_eq!(converter.translate(html), expected);
    }
}

/// Translates a two-item unordered list with the default options and then with
/// `bullet_marker` set to `-` and to `<->`, asserting the marker used each time.
#[test]
fn bullet_marker_option() {
    let html = "<ul><li>item1</li><li>item2</li></ul>";
    let mut converter = HtmlToMarkdown::new();

    // Default options.
    assert_eq!(converter.translate(html), "* item1\n* item2");

    // Custom markers, applied in order.
    let overrides = [("-", "- item1\n- item2"), ("<->", "<-> item1\n<-> item2")];
    for (marker, expected) in overrides {
        converter.options_mut().bullet_marker = marker.to_owned();
        assert_eq!(converter.translate(html), expected);
    }
}

/// Builds converters whose `ignore` list names `STRONG`, `EM`, or both, and asserts that
/// the listed elements are absent from the translation of a `<strong>` + `<em>` snippet.
#[test]
fn ignore_option() {
    let html = "<strong>some text</strong><em>more text</em>";

    // (ignored tag names, expected translation), checked in this order.
    let cases: [(&[&str], &str); 3] = [
        (&["STRONG"], "_more text_"),
        (&["EM"], "**some text**"),
        (&["EM", "STRONG"], ""),
    ];

    for (ignored_tags, expected) in cases {
        let options = Options {
            ignore: ignored_tags.iter().map(|&tag| tag.to_owned()).collect(),
            ..Options::default()
        };
        assert_eq!(
            HtmlToMarkdown::with_options(options).translate(html),
            expected
        );
    }
}

/// Lists `nav` in `ignore` and asserts that a `<nav>` element between two paragraphs is
/// dropped, leaving the paragraphs separated by a single blank line.
#[test]
fn ignore_block_elements_option() {
    let skip_nav = Options {
        ignore: vec![String::from("nav")],
        ..Options::default()
    };
    let page = "<p>Before</p><nav>Navigation content</nav><p>After</p>";

    assert_eq!(
        HtmlToMarkdown::with_options(skip_nav).translate(page),
        "Before\n\nAfter"
    );
}

/// Builds converters whose `block_elements` list names `STRONG` or `EM` and asserts where
/// blank-line separators appear in the translation of a mixed inline snippet.
#[test]
fn block_elements_option() {
    let html = "<em>x</em><strong>yyy</strong><em>x</em><span>text</span>";

    // (tag listed in `block_elements`, expected translation), checked in this order.
    let cases = [
        ("STRONG", "_x_\n\n**yyy**\n\n_x_text"),
        ("EM", "_x_\n\n**yyy**\n\n_x_\n\ntext"),
    ];

    for (block_tag, expected) in cases {
        let options = Options {
            block_elements: vec![block_tag.to_owned()],
            ..Options::default()
        };
        assert_eq!(
            HtmlToMarkdown::with_options(options).translate(html),
            expected
        );
    }
}

/// Feeds ten consecutive `<br/>` elements through the converter and asserts that three hard
/// line breaks are emitted with the default options and five after setting
/// `max_consecutive_newlines` to 5.
#[test]
fn max_consecutive_newlines_option() {
    let line_breaks = "<br/>".repeat(10);
    let html = format!("<b>text</b>{}<em>something</em>", line_breaks);
    // Expected output containing `count` hard line breaks between the two inline elements.
    let expected_with = |count: usize| format!("**text**{}_something_", "  \n".repeat(count));

    let mut converter = HtmlToMarkdown::new();

    // Default options.
    assert_eq!(converter.translate(&html), expected_with(3));

    // Raised limit.
    converter.options_mut().max_consecutive_newlines = 5;
    assert_eq!(converter.translate(&html), expected_with(5));
}