web_capture/archive.rs
1//! Build a self-contained ZIP archive from raw HTML.
2//!
3//! Pins the default `--format archive` layout contract (see issue #113): the
4//! produced zip contains **exactly** `document.md`, `document.html`, and an
5//! `images/` folder, identical across every capture path.
6
7use crate::extract_images::extract_base64_to_buffers;
8use crate::gdocs::{create_archive_zip, ExtractedImage, GDocsArchiveResult};
9use crate::markdown::convert_html_to_markdown;
10
11/// Build a default `--format archive` ZIP (`Vec<u8>`) from raw HTML.
12///
13/// The archive contains exactly:
14/// - `document.md` — markdown that references images by **relative** path to
15/// the bundled `images/` folder.
16/// - `document.html` — the source HTML the markdown was derived from, for
17/// reference only (so reviewers can verify the conversion).
18/// - `images/` — every inline base64 image as a separate file, in its original
19/// format (PNG/JPEG/SVG…).
20///
21/// # Arguments
22///
23/// * `html` - Source HTML to convert.
24/// * `base_url` - Base URL used to resolve relative links during conversion.
25///
26/// # Errors
27///
28/// Returns an error if HTML→Markdown conversion or ZIP creation fails.
29pub fn build_zip_from_html(html: &str, base_url: &str) -> crate::Result<Vec<u8>> {
30 let markdown = convert_html_to_markdown(html, Some(base_url))?;
31 let buffers = extract_base64_to_buffers(&markdown, "images")?;
32
33 let archive = GDocsArchiveResult {
34 html: html.to_string(),
35 markdown: buffers.markdown,
36 images: buffers
37 .images
38 .into_iter()
39 .map(|b| ExtractedImage {
40 filename: b.filename,
41 data: b.data,
42 mime_type: String::new(),
43 })
44 .collect(),
45 document_id: String::new(),
46 export_url: base_url.to_string(),
47 };
48
49 create_archive_zip(&archive, true)
50}