web-capture 0.3.13

CLI and microservice to render web pages as HTML, Markdown, or PNG
Documentation
use web_capture::gdocs::extract_base64_images;
use web_capture::markdown::convert_html_to_markdown;

/// Base64 payload that the tests below splice into `data:image/png;base64,`
/// URIs to produce inline `<img>` sources.
// NOTE(review): looks like a minimal 1x1 PNG — confirm if the exact image ever matters.
const PNG_B64: &str =
    "iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mNkYAAAAAYAAjCB0C8AAAAASUVORK5CYII=";

/// An image embedded in an `<h1>` must survive conversion: once the two
/// base64 images (one in the heading, one in a paragraph) are extracted, the
/// Markdown has to contain one `](images/` reference per extracted image.
#[test]
fn img_inside_heading_is_kept_in_markdown() {
    const IMAGE_LINK: &str = "](images/";

    // Assemble the page line by line; both <img> tags share the same data URI.
    let data_uri = format!("data:image/png;base64,{PNG_B64}");
    let html = [
        String::from("<html><body>"),
        format!(r#"<h1><img src="{data_uri}" alt=""> Title With Image</h1>"#),
        format!(r#"<p>Body <img src="{data_uri}" alt=""> paragraph.</p>"#),
        String::from("</body></html>"),
    ]
    .join("\n");

    let (local_html, images) = extract_base64_images(&html);
    let extracted = images.len();
    assert_eq!(extracted, 2, "two images should be extracted to files");

    let md = convert_html_to_markdown(&local_html, None).unwrap();
    let linked = md.matches(IMAGE_LINK).count();

    // Every extracted file needs a matching Markdown reference.
    assert_eq!(
        linked, extracted,
        "markdown image refs ({linked}) must match extracted image files ({extracted}). MD:\n{md}",
    );
}

/// Repeats the heading-image check for every heading level `<h1>`..`<h6>`:
/// each page holds a single image inside the heading, and the resulting
/// Markdown must contain exactly one `](images/` reference.
#[test]
fn img_inside_all_heading_levels_is_kept() {
    const IMAGE_LINK: &str = "](images/";

    for level in 1..=6 {
        // Tag name shared by the opening tag, the closing tag and the messages.
        let tag = format!("h{level}");
        let html = format!(
            r#"<html><body><{tag}><img src="data:image/png;base64,{PNG_B64}" alt="icon"> Heading {level}</{tag}></body></html>"#
        );

        let (local_html, images) = extract_base64_images(&html);
        assert_eq!(images.len(), 1, "{tag}: one image extracted");

        let md = convert_html_to_markdown(&local_html, None).unwrap();
        let linked = md.matches(IMAGE_LINK).count();
        assert_eq!(
            linked, 1,
            "{tag}: image ref missing from markdown. MD:\n{md}"
        );
    }
}

/// Heading-image check on markup where the `<img>` elements are wrapped in
/// classed `<span>`s and the page carries a `<style>` block: two images are
/// extracted and the Markdown must reference both.
#[test]
fn gdocs_style_heading_with_spans_keeps_img() {
    const IMAGE_LINK: &str = "](images/";

    let data_uri = format!("data:image/png;base64,{PNG_B64}");
    // The first and last lines are plain literals, so the CSS braces need no escaping.
    let html = [
        String::from("<html><head><style>body{margin:0}</style></head><body>"),
        format!(
            r#"<h1 id="h.abc" class="c12"><span class="c4"><img src="{data_uri}" alt="" style="width:32px"></span><span class="c4"> Title</span></h1>"#
        ),
        format!(
            r#"<p class="c5"><span><img src="{data_uri}" alt=""></span><span> Body.</span></p>"#
        ),
        String::from("</body></html>"),
    ]
    .join("\n");

    let (local_html, images) = extract_base64_images(&html);
    assert_eq!(images.len(), 2);

    let md = convert_html_to_markdown(&local_html, None).unwrap();
    let linked = md.matches(IMAGE_LINK).count();
    assert_eq!(
        linked,
        images.len(),
        "GDocs-style heading lost image. MD:\n{md}"
    );
}

/// Cross-checks three counts for a page mixing headings and paragraphs with
/// and without images: `<img ` tags in the rewritten HTML, extracted images,
/// and `](images/` references in the Markdown must all be equal.
#[test]
fn image_count_parity_across_pipeline() {
    // Renders one inline-image tag with the given alt text.
    let img = |alt: &str| format!(r#"<img src="data:image/png;base64,{PNG_B64}" alt="{alt}">"#);

    let html = [
        String::from("<html><body>"),
        format!("<h1>{} Chapter</h1>", img("h1")),
        format!("<p>Text {} here.</p>", img("p1")),
        format!("<h2>{} Section</h2>", img("h2")),
        format!("<p>{}</p>", img("p2")),
        String::from("<h3>No image heading</h3>"),
        format!("<p>Final {} paragraph.</p>", img("p3")),
        String::from("</body></html>"),
    ]
    .join("\n");

    let (local_html, images) = extract_base64_images(&html);
    let tags_in_html = local_html.matches("<img ").count();
    let md = convert_html_to_markdown(&local_html, None).unwrap();
    let refs_in_md = md.matches("](images/").count();
    let extracted = images.len();

    // HTML side: one <img> tag per extracted file.
    assert_eq!(
        tags_in_html, extracted,
        "HTML img tags ({tags_in_html}) != extracted files ({extracted})",
    );
    // Markdown side: one image reference per extracted file.
    assert_eq!(
        refs_in_md, extracted,
        "MD refs ({refs_in_md}) != extracted files ({extracted}). MD:\n{md}",
    );
}