md_prune_image/
parser.rs

1use crate::error::{Error, Result};
2use percent_encoding::percent_decode_str;
3use regex::Regex;
4use std::collections::HashSet;
5use std::fs;
6use std::path::{Path, PathBuf};
7
8pub fn extract_image_references(markdown_path: &Path, base_dir: &Path) -> Result<HashSet<PathBuf>> {
9    let content = fs::read_to_string(markdown_path).map_err(|source| Error::ReadFile {
10        path: markdown_path.to_path_buf(),
11        source,
12    })?;
13
14    let mut references = HashSet::new();
15    let markdown_dir = markdown_path.parent().unwrap_or(base_dir);
16
17    // Regex for markdown image syntax: ![alt](path) and ![alt](path "title")
18    let img_pattern = Regex::new(r#"!\[.*?]\(([^)]+?)(?:\s+["'].*?["'])?\)"#)?;
19
20    // Regex for HTML img tags: <img src="path">
21    let html_pattern = Regex::new(r#"<img[^>]+src=["']([^"']+)["']"#)?;
22
23    for cap in img_pattern.captures_iter(&content) {
24        if let Some(path_match) = cap.get(1) {
25            let img_path = path_match.as_str().trim();
26
27            if is_url(img_path) {
28                continue;
29            }
30
31            if let Some(resolved) = resolve_image_path(img_path, markdown_dir, base_dir) {
32                references.insert(resolved);
33            }
34        }
35    }
36
37    for cap in html_pattern.captures_iter(&content) {
38        if let Some(path_match) = cap.get(1) {
39            let img_path = path_match.as_str().trim();
40
41            if is_url(img_path) {
42                continue;
43            }
44
45            if let Some(resolved) = resolve_image_path(img_path, markdown_dir, base_dir) {
46                references.insert(resolved);
47            }
48        }
49    }
50
51    Ok(references)
52}
53
/// Returns `true` when `path` points at a remote or inline resource rather than a local file.
///
/// Recognised forms are `http://`, `https://`, protocol-relative `//`, and `data:` URIs.
/// URI schemes are case-insensitive, so `HTTPS://...` and `Data:...` are detected as well.
fn is_url(path: &str) -> bool {
    const REMOTE_PREFIXES: [&str; 4] = ["http://", "https://", "//", "data:"];

    // Compare on bytes so slicing can never land inside a multi-byte character.
    let bytes = path.as_bytes();
    REMOTE_PREFIXES.iter().any(|prefix| {
        bytes
            .get(..prefix.len())
            .is_some_and(|head| head.eq_ignore_ascii_case(prefix.as_bytes()))
    })
}
60
61pub(crate) fn resolve_image_path(
62    img_path: &str,
63    markdown_dir: &Path,
64    base_dir: &Path,
65) -> Option<PathBuf> {
66    let decoded_path = percent_decode_str(img_path)
67        .decode_utf8()
68        .map(|s| s.into_owned())
69        .unwrap_or_else(|_| img_path.to_string());
70
71    let clean_path = decoded_path
72        .split('#')
73        .next()
74        .and_then(|s| s.split('?').next())
75        .unwrap_or(&decoded_path);
76
77    try_resolve_path(clean_path, markdown_dir, base_dir).or_else(|| {
78        if clean_path != img_path {
79            let clean_original = img_path
80                .split('#')
81                .next()
82                .and_then(|s| s.split('?').next())
83                .unwrap_or(img_path);
84            try_resolve_path(clean_original, markdown_dir, base_dir)
85        } else {
86            None
87        }
88    })
89}
90
/// Looks up `img_path` on disk and returns its canonical path if it lies inside `base_dir`.
///
/// Candidates are tried in order: relative to `markdown_dir`, then relative to `base_dir`.
/// An absolute `img_path` needs no separate case, because `Path::join` with an absolute
/// argument yields that argument unchanged and so the first candidate already covers it.
///
/// A candidate is accepted only when it can be canonicalized (so it must exist) and the
/// canonical form starts with the canonical `base_dir`. Returns `None` when `base_dir` itself
/// cannot be canonicalized or when no candidate qualifies.
fn try_resolve_path(img_path: &str, markdown_dir: &Path, base_dir: &Path) -> Option<PathBuf> {
    // Canonicalize the base once; every candidate is checked against the same root.
    let canonical_base = base_dir.canonicalize().ok()?;

    let candidates = [markdown_dir.join(img_path), base_dir.join(img_path)];

    // The iterator is lazy, so the second candidate is only touched when the first one is
    // missing or falls outside the base directory.
    candidates
        .iter()
        .filter_map(|candidate| candidate.canonicalize().ok())
        .find(|canonical| canonical.starts_with(&canonical_base))
}