docrawl/
util.rs

1use std::path::{Path, PathBuf};
2
3use url::Url;
4
5pub fn normalize_url(u: &Url) -> String {
6    // Manual, conservative normalization: drop fragment, default ports, normalize path
7    let mut clone = u.clone();
8    clone.set_fragment(None);
9    if (clone.scheme() == "http" && clone.port() == Some(80))
10        || (clone.scheme() == "https" && clone.port() == Some(443))
11    {
12        clone.set_port(None).ok();
13    }
14    // Remove trailing slash except for root
15    let mut path = clone.path().to_string();
16    if path.len() > 1 && path.ends_with('/') {
17        path.pop();
18    }
19    clone.set_path(&path);
20    // Remove empty query
21    if clone.query().map(|q| q.trim().is_empty()).unwrap_or(false) {
22        clone.set_query(None);
23    }
24    clone.as_str().to_string()
25}
26
27pub fn is_same_host(a: &Url, b: &Url) -> bool {
28    a.domain() == b.domain() && a.scheme() == b.scheme()
29}
30
31pub fn site_name_from_url(u: &Url) -> String {
32    u.host_str().unwrap_or("site").to_string()
33}
34
/// Turns one URL path segment into a string usable as a file-system name.
///
/// Surrounding whitespace is trimmed; each of `\ / : * ? " < > |`, space, tab
/// and newline becomes `-`; runs of `-` collapse to a single `-`; leading and
/// trailing `-` are removed. The result may be empty.
fn sanitize_segment(seg: &str) -> String {
    const REPLACED: [char; 12] = [
        '\\', '/', ':', '*', '?', '"', '<', '>', '|', ' ', '\t', '\n',
    ];

    let mut cleaned = String::with_capacity(seg.len());
    for ch in seg.trim().chars() {
        let mapped = if REPLACED.contains(&ch) { '-' } else { ch };
        // Skip a dash that would directly follow another dash, so runs collapse
        // as they are produced.
        if mapped == '-' && cleaned.ends_with('-') {
            continue;
        }
        cleaned.push(mapped);
    }

    cleaned.trim_matches('-').to_string()
}
46
47pub fn path_for_url(output_root: &Path, base: &Url, target: &Url) -> PathBuf {
48    let base_host = site_name_from_url(base);
49    let mut rel = PathBuf::new();
50    // Ensure host folder exists one level up (already in main), we only build inside it
51    let path = target.path();
52    let mut segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
53
54    // Determine filename
55    let (mut file_stem, ext): (String, &str) =
56        if path.ends_with('/') || path.is_empty() || segments.is_empty() {
57            // Directory or root path: keep directory segments and use index.md
58            if !segments.is_empty() {
59                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
60            }
61            ("index".to_string(), "md")
62        } else if let Some(last) = segments.last() {
63            if last.ends_with(".html") || last.ends_with(".htm") {
64                let stem = last.trim_end_matches(".html").trim_end_matches(".htm");
65                segments.pop();
66                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
67                (sanitize_segment(stem), "md")
68            } else if last.contains('.') {
69                // looks like a file with other extension → treat as a directory index
70                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
71                ("index".to_string(), "md")
72            } else {
73                rel.extend(segments.iter().map(|s| sanitize_segment(s)));
74                ("index".to_string(), "md")
75            }
76        } else {
77            ("index".to_string(), "md")
78        };
79
80    // Encode query into file name if present (to avoid collisions)
81    if let Some(q) = target.query() {
82        use xxhash_rust::xxh3::xxh3_64;
83        let h = xxh3_64(q.as_bytes());
84        file_stem.push_str(&format!("__q_{:x}", h));
85    }
86
87    let file_name = format!("{}.{}", file_stem, ext);
88
89    output_root.join(base_host).join(rel).join(file_name)
90}
91
92pub fn now_rfc3339() -> String {
93    let t = chrono::Utc::now();
94    t.to_rfc3339()
95}
96
/// Creates the parent directory of `path`, including any missing ancestors.
///
/// Does nothing and returns `Ok(())` when `path` has no parent.
///
/// # Errors
/// Returns the I/O error reported by `std::fs::create_dir_all`.
pub fn ensure_parent_dir(path: &Path) -> std::io::Result<()> {
    match path.parent() {
        Some(dir) => std::fs::create_dir_all(dir),
        None => Ok(()),
    }
}
103
104pub fn path_for_asset(output_root: &Path, base: &Url, asset: &Url) -> PathBuf {
105    let base_host = site_name_from_url(base);
106    let mut rel = PathBuf::new();
107    let path = asset.path();
108    let segments: Vec<&str> = path.split('/').filter(|s| !s.is_empty()).collect();
109
110    // If ends with '/', generate a name from hash
111    let file_name = if let Some(last) = segments.last() {
112        if last.ends_with('/') || last.is_empty() {
113            None
114        } else {
115            Some(last.to_string())
116        }
117    } else {
118        None
119    };
120
121    rel.extend(segments.into_iter().map(sanitize_segment));
122
123    let final_name = match file_name {
124        Some(n) => sanitize_segment(&n),
125        None => {
126            use xxhash_rust::xxh3::xxh3_64;
127            let h = xxh3_64(asset.as_str().as_bytes());
128            format!("asset_{:x}", h)
129        }
130    };
131
132    output_root
133        .join(base_host)
134        .join(rel)
135        .with_file_name(final_name)
136}
137
138pub fn relpath(from: &Path, to: &Path) -> Option<PathBuf> {
139    pathdiff::diff_paths(to, from.parent().unwrap_or_else(|| Path::new(".")))
140}