Skip to main content

web_capture/
archive.rs

1//! Build a self-contained ZIP archive from raw HTML.
2//!
3//! Pins the default `--format archive` layout contract (see issue #113): the
4//! produced zip contains **exactly** `document.md`, `document.html`, and an
5//! `images/` folder, identical across every capture path.
6
7use crate::extract_images::extract_base64_to_buffers;
8use crate::gdocs::{create_archive_zip, ExtractedImage, GDocsArchiveResult};
9use crate::markdown::convert_html_to_markdown;
10
11/// Build a default `--format archive` ZIP (`Vec<u8>`) from raw HTML.
12///
13/// The archive contains exactly:
14/// - `document.md` — markdown that references images by **relative** path to
15///   the bundled `images/` folder.
16/// - `document.html` — the source HTML the markdown was derived from, for
17///   reference only (so reviewers can verify the conversion).
18/// - `images/` — every inline base64 image as a separate file, in its original
19///   format (PNG/JPEG/SVG…).
20///
21/// # Arguments
22///
23/// * `html` - Source HTML to convert.
24/// * `base_url` - Base URL used to resolve relative links during conversion.
25///
26/// # Errors
27///
28/// Returns an error if HTML→Markdown conversion or ZIP creation fails.
29pub fn build_zip_from_html(html: &str, base_url: &str) -> crate::Result<Vec<u8>> {
30    let markdown = convert_html_to_markdown(html, Some(base_url))?;
31    let buffers = extract_base64_to_buffers(&markdown, "images")?;
32
33    let archive = GDocsArchiveResult {
34        html: html.to_string(),
35        markdown: buffers.markdown,
36        images: buffers
37            .images
38            .into_iter()
39            .map(|b| ExtractedImage {
40                filename: b.filename,
41                data: b.data,
42                mime_type: String::new(),
43            })
44            .collect(),
45        document_id: String::new(),
46        export_url: base_url.to_string(),
47    };
48
49    create_archive_zip(&archive, true)
50}