Skip to main content

docrawl/
util.rs

1use std::path::{Path, PathBuf};
2
3use url::Url;
4
5pub fn normalize_url(u: &Url) -> String {
6    // Manual, conservative normalization: drop fragment, default ports, normalize path
7    let mut clone = u.clone();
8    clone.set_fragment(None);
9    if (clone.scheme() == "http" && clone.port() == Some(80))
10        || (clone.scheme() == "https" && clone.port() == Some(443))
11    {
12        clone.set_port(None).ok();
13    }
14    // Remove trailing slash except for root
15    let mut path = clone.path().to_string();
16    if path.len() > 1 && path.ends_with('/') {
17        path.pop();
18    }
19    clone.set_path(&path);
20    // Remove empty query
21    if clone.query().map(|q| q.trim().is_empty()).unwrap_or(false) {
22        clone.set_query(None);
23    }
24    clone.as_str().to_string()
25}
26
27pub fn is_same_host(a: &Url, b: &Url) -> bool {
28    fn bare_domain(u: &Url) -> Option<&str> {
29        u.domain().map(|d| d.strip_prefix("www.").unwrap_or(d))
30    }
31    bare_domain(a) == bare_domain(b)
32        && a.scheme() == b.scheme()
33        && a.port_or_known_default() == b.port_or_known_default()
34}
35
36pub fn site_name_from_url(u: &Url) -> String {
37    u.host_str().unwrap_or("site").to_string()
38}
39
/// Turns one URL path segment into a string that is safe to use as a single
/// file-system path component.
///
/// Steps:
/// 1. surrounding whitespace is trimmed;
/// 2. `\ / : * ? " < > |`, space, tab and newline are each replaced by `-`;
/// 3. runs of `-` are collapsed to a single `-`;
/// 4. leading and trailing `-` are removed;
/// 5. a result of exactly `.` or `..` is rewritten to `_` / `__`.
///
/// Step 5 matters because inputs such as `-..-` or `*.*` are not dot segments
/// themselves, yet would otherwise sanitize to `..` / `.` and let a joined path
/// climb out of the intended directory.
///
/// The result may be empty (for example when the input is `???`).
fn sanitize_segment(seg: &str) -> String {
    let mut out = String::with_capacity(seg.len());
    for ch in seg.trim().chars() {
        let ch = match ch {
            '\\' | '/' | ':' | '*' | '?' | '"' | '<' | '>' | '|' | ' ' | '\t' | '\n' => '-',
            other => other,
        };
        // Collapse repeated '-' while building instead of re-scanning afterwards.
        if ch == '-' && out.ends_with('-') {
            continue;
        }
        out.push(ch);
    }

    match out.trim_matches('-') {
        // Never hand back a component that the file system treats as navigation.
        "." => "_".to_string(),
        ".." => "__".to_string(),
        cleaned => cleaned.to_string(),
    }
}
51
52pub fn path_for_url(output_root: &Path, base: &Url, target: &Url) -> PathBuf {
53    let base_host = site_name_from_url(base);
54    let mut rel = PathBuf::new();
55    // Ensure host folder exists one level up (already in main), we only build inside it
56    let path = target.path();
57    let mut segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
58
59    // Determine filename
60    let (mut file_stem, ext): (String, &str) =
61        if path.ends_with('/') || path.is_empty() || segments.is_empty() {
62            // Directory or root path: keep directory segments and use index.md
63            if !segments.is_empty() {
64                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
65            }
66            ("index".to_string(), "md")
67        } else if let Some(last) = segments.last() {
68            if last.ends_with(".html") || last.ends_with(".htm") {
69                let stem = last.trim_end_matches(".html").trim_end_matches(".htm");
70                segments.pop();
71                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
72                (sanitize_segment(stem), "md")
73            } else if last.contains('.') {
74                // looks like a file with other extension → treat as a directory index
75                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
76                ("index".to_string(), "md")
77            } else {
78                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
79                ("index".to_string(), "md")
80            }
81        } else {
82            ("index".to_string(), "md")
83        };
84
85    // Encode query into file name if present (to avoid collisions)
86    if let Some(q) = target.query() {
87        use xxhash_rust::xxh3::xxh3_64;
88        let h = xxh3_64(q.as_bytes());
89        file_stem.push_str(&format!("__q_{:x}", h));
90    }
91
92    let file_name = format!("{}.{}", file_stem, ext);
93
94    output_root.join(base_host).join(rel).join(file_name)
95}
96
97pub fn now_rfc3339() -> String {
98    let t = chrono::Utc::now();
99    t.to_rfc3339()
100}
101
/// Creates the directory that contains `path`, including any missing ancestors.
///
/// Succeeds without doing anything when `path` has no parent component.
///
/// # Errors
/// Returns the error reported by `std::fs::create_dir_all`.
pub fn ensure_parent_dir(path: &Path) -> std::io::Result<()> {
    match path.parent() {
        Some(dir) => std::fs::create_dir_all(dir),
        None => Ok(()),
    }
}
108
109pub fn path_for_asset(output_root: &Path, base: &Url, asset: &Url) -> PathBuf {
110    let base_host = site_name_from_url(base);
111    let mut rel = PathBuf::new();
112    let path = asset.path();
113    let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
114
115    // If ends with '/', generate a name from hash
116    let file_name = if let Some(last) = segments.last() {
117        if last.ends_with('/') || last.is_empty() {
118            None
119        } else {
120            Some(last.to_string())
121        }
122    } else {
123        None
124    };
125
126    rel.extend(segments.into_iter().map(sanitize_segment));
127
128    let final_name = match file_name {
129        Some(n) => sanitize_segment(&n),
130        None => {
131            use xxhash_rust::xxh3::xxh3_64;
132            let h = xxh3_64(asset.as_str().as_bytes());
133            format!("asset_{:x}", h)
134        }
135    };
136
137    output_root
138        .join(base_host)
139        .join(rel)
140        .with_file_name(final_name)
141}
142
143pub fn relpath(from: &Path, to: &Path) -> Option<PathBuf> {
144    pathdiff::diff_paths(to, from.parent().unwrap_or_else(|| Path::new(".")))
145}