mdz_rs/
lib.rs

1//! # MDZ - Markdown Zip Library
2//!
3//! A Rust library for creating and working with MDZ (Markdown Zip) files.
4//! MDZ is a ZIP-based archive format that bundles Markdown documents with their
5//! embedded assets (images, videos, audio, and other files) into a single, portable file.
6//!
7//! ## Features
8//!
9//! - **Automatic Asset Processing**: Download network images and copy local files
10//! - **Smart Link Resolution**: Convert absolute paths to relative paths for maximum compatibility
11//! - **UUID-based Naming**: Use UUIDs for downloaded assets to avoid conflicts
12//! - **Backward Compatibility**: Handle legacy MDZ files seamlessly
13//! - **Rich Metadata**: Complete manifest with document and asset information
14//!
15//! ## Quick Start
16//!
17//! ```rust,no_run
18//! use mdz_rs::{pack, unpack};
19//!
20//! #[tokio::main]
21//! async fn main() -> Result<(), Box<dyn std::error::Error>> {
22//!     // Pack a markdown file with assets
23//!     pack("document.md", "document.mdz").await?;
24//!
25//!     // Unpack MDZ file to extract content and assets
26//!     unpack("document.mdz", Some("output/"))?;
27//!
28//!     Ok(())
29//! }
30//! ```
31//!
32//! ## MDZ Format Structure
33//!
34//! ```text
35//! document.mdz (ZIP archive)
36//! ├── index.md              # Updated markdown with relative asset links
37//! ├── manifest.json         # Metadata and asset mapping
38//! └── assets/               # Organized asset files
39//!     ├── images/
40//!     ├── videos/
41//!     ├── audio/
42//!     └── files/
43//! ```
44//!
45//! ## Asset Handling
46//!
47//! The library automatically processes:
48//!
49//! - **Network Images**: Downloaded asynchronously with UUID filenames
50//! - **Local Files**: Copied with conflict resolution (counter suffixes)
51//! - **Link Updates**: Markdown links are updated to use relative paths (`./assets/...`)
52//! - **Metadata**: Complete manifest.json with asset mapping and document info
53
54use std::collections::HashMap;
55use std::fs;
56use std::io::Read;
57use std::path::{Path, PathBuf};
58use anyhow::{anyhow, Result};
59use regex::Regex;
60use serde::{Deserialize, Serialize};
61use url::Url;
62
// Include the unit-test module (compiled only for test builds)
64#[cfg(test)]
65mod tests;
66
67/// Represents an embedded asset within an MDZ archive.
68///
69/// Assets are stored in the manifest.json file and reference actual files
70/// within the archive's assets/ directory.
71///
72/// # Examples
73///
74/// ```json
75/// {
76///   "id": "image1",
77///   "path": "assets/images/image1.png",
78///   "type": "image",
79///   "alt": "Example image",
80///   "title": "A sample image"
81/// }
82/// ```
83#[derive(Debug, Clone, Serialize, Deserialize)]
84pub struct Asset {
85    /// Unique identifier for the asset, used for internal reference
86    pub id: String,
87
88    /// Relative path to the asset file within the MDZ archive
89    pub path: String,
90
91    /// Type of the asset (image, video, audio, file)
92    #[serde(rename = "type")]
93    pub asset_type: String,
94
95    /// Alt text for images (accessibility)
96    pub alt: Option<String>,
97
98    /// Title or description of the asset
99    pub title: Option<String>,
100}
101
102/// Manifest structure for MDZ files.
103///
104/// The manifest.json file contains metadata about the document and all embedded assets.
105/// It's stored in the root of the MDZ archive and provides information about the
106/// original document structure.
107///
108/// # Examples
109///
110/// ```json
111/// {
112///   "version": "1.1.0",
113///   "title": "My Document",
114///   "author": "John Doe",
115///   "date": "2025-12-13",
116///   "filename": "document.md",
117///   "assets": [...]
118/// }
119/// ```
120#[derive(Debug, Serialize, Deserialize)]
121pub struct Manifest {
122    /// MDZ specification version (e.g., "1.1.0")
123    pub version: String,
124
125    /// Document title
126    pub title: String,
127
128    /// Document author (optional)
129    pub author: Option<String>,
130
131    /// Creation or modification date (ISO 8601 format, optional)
132    pub date: Option<String>,
133
134    /// Original markdown filename (optional since v1.1.0)
135    pub filename: Option<String>,
136
137    /// List of all embedded assets
138    pub assets: Vec<Asset>,
139}
140
141/// Determines if a given path is a URL or a local file path.
142///
143/// This function uses URL parsing to determine if the provided string represents
144/// a valid URL (HTTP, HTTPS, FTP, etc.) or a local file system path.
145///
146/// # Arguments
147///
148/// * `path` - The path string to check
149///
150/// # Returns
151///
152/// Returns `true` if the path is a valid URL, `false` otherwise.
153///
154/// # Examples
155///
156/// ```
157/// use mdz_rs::is_url;
158///
159/// assert!(is_url("https://example.com/image.jpg"));
160/// assert!(is_url("http://localhost:8080/file.pdf"));
161/// assert!(!is_url("./local/image.png"));
162/// assert!(!is_url("/absolute/path/file.jpg"));
163/// ```
164pub fn is_url(path: &str) -> bool {
165    Url::parse(path).is_ok()
166}
167
/// Classify a file as `image`, `video`, `audio` or `file` from its extension.
///
/// The comparison is case-insensitive; a missing or unrecognized extension
/// yields `file`.
fn get_asset_type(path: &Path) -> String {
    const IMAGE_EXTENSIONS: &[&str] = &["png", "jpg", "jpeg", "gif", "bmp", "svg", "webp", "ico"];
    const VIDEO_EXTENSIONS: &[&str] = &["mp4", "avi", "mov", "wmv", "webm", "mkv", "flv"];
    const AUDIO_EXTENSIONS: &[&str] = &["mp3", "wav", "ogg", "flac", "aac"];

    let lowered = match path.extension().and_then(|raw| raw.to_str()) {
        Some(raw) => raw.to_lowercase(),
        None => return String::from("file"),
    };
    let ext = lowered.as_str();

    let kind = if IMAGE_EXTENSIONS.contains(&ext) {
        "image"
    } else if VIDEO_EXTENSIONS.contains(&ext) {
        "video"
    } else if AUDIO_EXTENSIONS.contains(&ext) {
        "audio"
    } else {
        "file"
    };

    kind.to_owned()
}
183
/// Map an asset type to the name of its subdirectory under `assets/`.
///
/// Any type other than `image`, `video` or `audio` is stored in `files`.
fn get_asset_subdir(asset_type: &str) -> &'static str {
    const KNOWN_SUBDIRS: [(&str, &str); 3] = [
        ("image", "images"),
        ("video", "videos"),
        ("audio", "audio"),
    ];

    KNOWN_SUBDIRS
        .iter()
        .find(|(kind, _)| *kind == asset_type)
        .map_or("files", |(_, subdir)| *subdir)
}
193
194/// Download an image from URL to local file
195async fn download_image(url: &str, dest_path: &Path) -> Result<()> {
196    let response = reqwest::get(url).await?;
197
198    if !response.status().is_success() {
199        return Err(anyhow!("Failed to download image: {}", response.status()));
200    }
201
202    let content = response.bytes().await?;
203
204    // Create parent directories if they don't exist
205    if let Some(parent) = dest_path.parent() {
206        fs::create_dir_all(parent)?;
207    }
208
209    fs::write(dest_path, content)?;
210    Ok(())
211}
212
213/// Copy local file to destination
214fn copy_local_file(src_path: &Path, dest_path: &Path) -> Result<()> {
215    // Create parent directories if they don't exist
216    if let Some(parent) = dest_path.parent() {
217        fs::create_dir_all(parent)?;
218    }
219
220    fs::copy(src_path, dest_path)?;
221    Ok(())
222}
223
224/// Extract image URLs and paths from Markdown content
225fn extract_images_from_markdown(content: &str) -> Vec<(String, Option<String>)> {
226    let mut images = Vec::new();
227
228    // Regex patterns for different Markdown image syntaxes
229    let patterns = vec![
230        // ![alt](url) - standard Markdown image
231        r#"\!\[([^\]]*)\]\(([^)]+)\)"#,
232        // <img src="url" alt="alt"> - HTML img tag
233        r#"<img[^>]+src=["']([^"']+)["'][^>]*>"#,
234    ];
235
236    // Pre-compile regex for extracting alt attribute
237    let alt_regex = Regex::new(r#"alt=["']([^"']*)["']"#).unwrap();
238
239    for pattern in patterns {
240        let regex = Regex::new(pattern).unwrap();
241        for captures in regex.captures_iter(content) {
242            match pattern {
243                p if p.starts_with(r#"\!\["#) => {
244                    // Markdown syntax
245                    let alt = captures.get(1).map(|m| m.as_str().to_string());
246                    let url = captures.get(2).unwrap().as_str().to_string();
247                    images.push((url, alt));
248                }
249                p if p.starts_with(r#"<img"#) => {
250                    // HTML img tag
251                    let url = captures.get(1).unwrap().as_str().to_string();
252                    // Try to extract alt attribute from the full match
253                    let full_match = captures.get(0).unwrap().as_str();
254                    let alt = alt_regex.captures(full_match)
255                        .and_then(|c| c.get(1))
256                        .map(|m| m.as_str().to_string());
257                    images.push((url, alt));
258                }
259                _ => {}
260            }
261        }
262    }
263
264    images
265}
266
267/// Update markdown content to use relative paths to assets
268fn update_markdown_links(content: &str, assets: &[(Asset, String)]) -> Result<String> {
269    let mut updated_content = content.to_string();
270
271    // Create a mapping from original URLs to new relative paths (./assets/...)
272    let mut url_mapping: HashMap<String, String> = HashMap::new();
273
274    for (asset, original_url) in assets {
275        let new_url = format!("./{}", asset.path);
276        url_mapping.insert(original_url.clone(), new_url);
277    }
278
279    // Update Markdown image links: ![alt](url)
280    let markdown_regex = Regex::new(r#"(\!\[([^\]]*)\]\()([^)]+)\)"#).unwrap();
281    updated_content = markdown_regex.replace_all(&updated_content, |caps: &regex::Captures| {
282        let alt = caps.get(2).unwrap().as_str();     // alt text
283        let url = caps.get(3).unwrap().as_str();     // url
284
285        if let Some(new_url) = url_mapping.get(url) {
286            format!("![{}]({})", alt, new_url)
287        } else {
288            format!("![{}]({})", alt, url)
289        }
290    }).to_string();
291
292    // Update HTML img tags: <img src="url" ...>
293    let html_regex = Regex::new(r#"(<img[^>]+src=["'])([^"']+)(["'][^>]*>)"#).unwrap();
294    updated_content = html_regex.replace_all(&updated_content, |caps: &regex::Captures| {
295        let prefix = caps.get(1).unwrap().as_str();
296        let url = caps.get(2).unwrap().as_str();
297        let suffix = caps.get(3).unwrap().as_str();
298
299        if let Some(new_url) = url_mapping.get(url) {
300            format!("{}{}{}", prefix, new_url, suffix)
301        } else {
302            format!("{}{}{}", prefix, url, suffix)
303        }
304    }).to_string();
305
306    Ok(updated_content)
307}
308
309/// Packs a Markdown file and its assets into an MDZ archive.
310///
311/// This function reads a markdown file, extracts all referenced assets, downloads
312/// network images, copies local files, updates links to use relative paths, and
313/// bundles everything into a single MDZ file.
314///
315/// # Arguments
316///
317/// * `markdown_file` - Path to the source markdown file
318/// * `output_file` - Path where the MDZ file should be created
319///
320/// # Returns
321///
322/// Returns `Ok(())` on success, or an error if packing fails.
323///
324/// # Errors
325///
326/// This function will return an error if:
327/// - The markdown file cannot be read
328/// - Network images cannot be downloaded
329/// - Local files cannot be copied
330/// - The MDZ file cannot be created
331///
332/// # Examples
333///
334/// ```rust,no_run
335/// use mdz_rs::pack;
336///
337/// #[tokio::main]
338/// async fn main() -> Result<(), Box<dyn std::error::Error>> {
339///     pack("document.md", "document.mdz").await?;
340///     println!("Successfully packed document.mdz");
341///     Ok(())
342/// }
343/// ```
344///
345/// # Asset Processing
346///
347/// The function automatically handles:
348/// - **Network Images**: Downloaded with UUID filenames (e.g., `12345678-... .jpg`)
349/// - **Local Files**: Copied with conflict resolution (adds counter suffixes)
350/// - **Link Updates**: Updated to use relative paths (`./assets/...`)
351/// - **Directory Structure**: Organized by type in `assets/` subdirectories
352pub async fn pack(
353    markdown_file: &str,
354    output_file: &str,
355) -> Result<()> {
356    // Read the markdown file
357    let markdown_path = Path::new(markdown_file);
358    let markdown_content = fs::read_to_string(markdown_path)?;
359
360    // Extract images from markdown
361    let extracted_images = extract_images_from_markdown(&markdown_content);
362
363    // Create a temporary directory for the MDZ structure
364    let temp_dir = std::env::temp_dir().join(format!("mdz_assets_{}", uuid::Uuid::new_v4()));
365    fs::create_dir_all(&temp_dir)?;
366
367    // Create assets directory structure
368    let assets_dir = temp_dir.join("assets");
369    fs::create_dir_all(&assets_dir)?;
370    let images_dir = assets_dir.join("images");
371    let videos_dir = assets_dir.join("videos");
372    let audio_dir = assets_dir.join("audio");
373    let files_dir = assets_dir.join("files");
374    fs::create_dir_all(&images_dir)?;
375    fs::create_dir_all(&videos_dir)?;
376    fs::create_dir_all(&audio_dir)?;
377    fs::create_dir_all(&files_dir)?;
378
379    let mut assets = Vec::new();
380    let mut asset_counter = HashMap::new();
381    let mut assets_with_original_urls = Vec::new();
382
383    // Process each image
384    for (image_url, alt_text) in extracted_images {
385        // Skip if it's already an assets:// URL
386        if image_url.starts_with("assets://") {
387            continue;
388        }
389
390        // Determine if it's a URL or local path
391        let is_remote = is_url(&image_url);
392
393        // Determine file extension first
394        let extension = if is_remote {
395            Path::new(&image_url)
396                .extension()
397                .and_then(|ext| ext.to_str())
398                .unwrap_or("png")
399        } else {
400            Path::new(&image_url)
401                .extension()
402                .and_then(|ext| ext.to_str())
403                .unwrap_or("png")
404        };
405
406        // Generate UUID-based filename for remote images, keep original for local files
407        let (_filename, _asset_path_in_archive, should_process) = if is_remote {
408            let uuid_filename = format!("{}.{}", uuid::Uuid::new_v4(), extension);
409            let asset_type = get_asset_type(Path::new(&uuid_filename));
410            let subdir = get_asset_subdir(&asset_type);
411            let path = format!("assets/{}/{}", subdir, uuid_filename);
412            let asset_dest_dir = match asset_type.as_str() {
413                "image" => &images_dir,
414                "video" => &videos_dir,
415                "audio" => &audio_dir,
416                _ => &files_dir,
417            };
418            let final_asset_path = asset_dest_dir.join(&uuid_filename);
419
420            // Try to download the remote image
421            match download_image(&image_url, &final_asset_path).await {
422                Ok(()) => {
423                    let asset = Asset {
424                        id: uuid_filename.clone(),
425                        path: path.clone(),
426                        asset_type,
427                        alt: alt_text.clone(),
428                        title: None,
429                    };
430                    (uuid_filename, path, Some((asset, image_url.clone())))
431                }
432                Err(e) => {
433                    // Download failed, skip this image and keep original link
434                    eprintln!("Warning: Failed to download image '{}': {}. Keeping original link.", image_url, e);
435                    continue;
436                }
437            }
438        } else {
439            // Local file - use original filename logic
440            let base_name = Path::new(&image_url)
441                .file_stem()
442                .and_then(|s| s.to_str())
443                .unwrap_or("image")
444                .to_string();
445
446            // Ensure unique ID
447            let count = asset_counter.entry(base_name.clone()).or_insert(0);
448            let asset_id = if *count == 0 {
449                base_name.clone()
450            } else {
451                format!("{}_{}", base_name, count)
452            };
453            *count += 1;
454
455            let local_filename = format!("{}.{}", asset_id, extension);
456            let asset_type = get_asset_type(Path::new(&local_filename));
457            let subdir = get_asset_subdir(&asset_type);
458            let path = format!("assets/{}/{}", subdir, local_filename);
459            let asset_dest_dir = match asset_type.as_str() {
460                "image" => &images_dir,
461                "video" => &videos_dir,
462                "audio" => &audio_dir,
463                _ => &files_dir,
464            };
465            let final_asset_path = asset_dest_dir.join(&local_filename);
466
467            // Copy local file
468            let full_image_path = if Path::new(&image_url).is_absolute() {
469                PathBuf::from(&image_url)
470            } else {
471                markdown_path.parent()
472                    .unwrap_or_else(|| Path::new("."))
473                    .join(&image_url)
474            };
475
476            match copy_local_file(&full_image_path, &final_asset_path) {
477                Ok(()) => {
478                    let asset = Asset {
479                        id: asset_id,
480                        path: path.clone(),
481                        asset_type,
482                        alt: alt_text.clone(),
483                        title: None,
484                    };
485                    (local_filename, path, Some((asset, image_url.clone())))
486                }
487                Err(e) => {
488                    // Copy failed, skip this image and keep original link
489                    eprintln!("Warning: Failed to copy file '{}': {}. Keeping original link.", image_url, e);
490                    continue;
491                }
492            }
493        };
494
495        // Add asset to lists if processing was successful
496        if let Some((asset, original_url)) = should_process {
497            assets_with_original_urls.push((asset.clone(), original_url));
498            assets.push(asset);
499        }
500    }
501
502    // Create manifest
503    let original_filename = markdown_path
504        .file_name()
505        .and_then(|s| s.to_str())
506        .unwrap_or("index.md")
507        .to_string();
508
509    let manifest = Manifest {
510        version: "1.0.0".to_string(),
511        title: markdown_path
512            .file_stem()
513            .and_then(|s| s.to_str())
514            .unwrap_or("Untitled")
515            .to_string(),
516        author: None,
517        date: Some(chrono::Utc::now().date_naive().to_string()),
518        filename: Some(original_filename.clone()),
519        assets,
520    };
521
522    // Write manifest.json
523    let manifest_path = temp_dir.join("manifest.json");
524    fs::write(&manifest_path, serde_json::to_string_pretty(&manifest)?)?;
525
526    // Update markdown content with assets:// links and write index.md
527    let updated_markdown_content = update_markdown_links(&markdown_content, &assets_with_original_urls)?;
528    let index_path = temp_dir.join("index.md");
529    fs::write(&index_path, updated_markdown_content)?;
530
531    // Create ZIP file
532    create_zip_file(&temp_dir, output_file)?;
533
534    // Clean up temporary directory
535    fs::remove_dir_all(&temp_dir)?;
536
537    Ok(())
538}
539
540/// Create a ZIP file from the given directory
541fn create_zip_file(source_dir: &Path, output_file: &str) -> Result<()> {
542    use zip::{ZipWriter, write::FileOptions};
543    use std::io::Write;
544    use std::fs::File;
545
546    let file = File::create(output_file)?;
547    let mut zip = ZipWriter::new(file);
548    let options = FileOptions::<'_, ()>::default()
549        .compression_method(zip::CompressionMethod::Deflated)
550        .unix_permissions(0o755);
551
552    // Add all files to the ZIP
553    for entry in walkdir::WalkDir::new(source_dir) {
554        let entry = entry?;
555        let path = entry.path();
556
557        if path.is_file() {
558            let name = path.strip_prefix(source_dir)?;
559            let name_str = name.to_str().ok_or_else(|| anyhow!("Invalid path"))?;
560
561            zip.start_file(name_str, options)?;
562
563            let mut f = fs::File::open(path)?;
564            let mut buffer = Vec::new();
565            f.read_to_end(&mut buffer)?;
566            zip.write_all(&buffer)?;
567        }
568    }
569
570    zip.finish()?;
571    Ok(())
572}
573
574/// Convert assets:// links to local relative paths
575fn convert_assets_to_local(content: &str) -> String {
576    let mut updated_content = content.to_string();
577
578    // Update Markdown image links: ![alt](assets://path)
579    let markdown_regex = Regex::new(r#"(\!\[([^\]]*)\]\()assets://([^)]+)\)"#).unwrap();
580    updated_content = markdown_regex.replace_all(&updated_content, |caps: &regex::Captures| {
581        let alt = caps.get(2).unwrap().as_str();
582        let asset_path = caps.get(3).unwrap().as_str();
583        format!("![{}]({})", alt, asset_path)
584    }).to_string();
585
586    // Update HTML img tags: <img src="assets://path" ...>
587    let html_regex = Regex::new(r#"(<img[^>]+src=["'])assets://([^"']+)(["'][^>]*>)"#).unwrap();
588    updated_content = html_regex.replace_all(&updated_content, |caps: &regex::Captures| {
589        let prefix = caps.get(1).unwrap().as_str();
590        let asset_path = caps.get(2).unwrap().as_str();
591        let suffix = caps.get(3).unwrap().as_str();
592        format!("{}{}{}", prefix, asset_path, suffix)
593    }).to_string();
594
595    updated_content
596}
597
598/// Unpacks an MDZ archive to extract the markdown file and embedded assets.
599///
600/// This function reads a MDZ file, extracts the manifest to determine the original
601/// filename, and extracts all files to the specified directory. It handles both
602/// v1.1.0 (with filename field) and v1.0.0 (without filename field) MDZ files.
603///
604/// # Arguments
605///
606/// * `input_file` - Path to the MDZ file to unpack
607/// * `output_dir` - Directory where files should be extracted (None for parent directory)
608///
609/// # Returns
610///
611/// Returns `Ok(())` on success, or an error if unpacking fails.
612///
613/// # Errors
614///
615/// This function will return an error if:
616/// - The MDZ file cannot be opened or is not a valid ZIP archive
617/// - The manifest.json file is missing or malformed
618/// - Files cannot be extracted to the output directory
619///
620/// # Examples
621///
622/// ```rust,no_run
623/// use mdz_rs::unpack;
624///
625/// fn main() -> Result<(), Box<dyn std::error::Error>> {
626///     // Unpack to current directory
627///     unpack("document.mdz", None)?;
628///
629///     // Unpack to specific directory
630///     unpack("document.mdz", Some("output/"))?;
631///
632///     println!("Successfully unpacked MDZ file");
633///     Ok(())
634/// }
635/// ```
636///
637/// # Backward Compatibility
638///
639/// This function handles MDZ files created with different specification versions:
640/// - **v1.1.0+**: Uses the filename field from manifest.json
641/// - **v1.0.0**: Derives filename from the MDZ file basename
642pub fn unpack(input_file: &str, output_dir: Option<&str>) -> Result<()> {
643    use zip::ZipArchive;
644    use std::fs::File;
645    use std::io::{Read, Write};
646
647    let file = File::open(input_file)?;
648    let mut archive = ZipArchive::new(file)?;
649
650    // Read manifest first to get original filename
651    let manifest_content = {
652        let mut manifest_file = archive.by_name("manifest.json")?;
653        let mut content = String::new();
654        manifest_file.read_to_string(&mut content)?;
655        content
656    };
657
658    let manifest: Manifest = serde_json::from_str(&manifest_content)?;
659    let output_md_filename = manifest.filename.unwrap_or_else(|| {
660        // 向后兼容:如果没有 filename 字段,从输入文件路径推导
661        let input_path = Path::new(input_file);
662        input_path
663            .file_stem()
664            .and_then(|s| s.to_str())
665            .unwrap_or("index")
666            .to_string() + ".md"
667    });
668
669    // Determine output directory
670    let base_output_path = Path::new(output_dir.unwrap_or("."));
671
672    // Extract all files
673    for i in 0..archive.len() {
674        let mut file = archive.by_index(i)?;
675        let filepath = file.name().to_string();  // Clone the string to avoid borrow issues
676
677        // Skip manifest.json as it's only used internally
678        if filepath == "manifest.json" {
679            continue;
680        }
681
682        // For index.md, use the original filename and convert assets:// links
683        let relative_path = if filepath == "index.md" {
684            &output_md_filename
685        } else {
686            &filepath
687        };
688
689        let outpath = base_output_path.join(relative_path);
690
691        if filepath.ends_with('/') {
692            fs::create_dir_all(outpath)?;
693        } else {
694            if let Some(parent) = outpath.parent() {
695                fs::create_dir_all(parent)?;
696            }
697
698            // Handle text files (Markdown) differently from binary files (images)
699            if filepath == "index.md" {
700                let mut content = String::new();
701                file.read_to_string(&mut content)?;
702                content = convert_assets_to_local(&content);
703                let mut outfile = fs::File::create(outpath)?;
704                outfile.write_all(content.as_bytes())?;
705            } else {
706                // For binary files, copy directly
707                let mut outfile = fs::File::create(outpath)?;
708                std::io::copy(&mut file, &mut outfile)?;
709            }
710        }
711    }
712
713    Ok(())
714}
715