1use sha2::{Digest, Sha256};
5use std::collections::HashMap;
6use std::path::{Path, PathBuf};
7use walkdir::WalkDir;
8
9use super::link_parser;
10
/// A markdown file found by the scanner, together with the metadata derived
/// from its path and contents.
#[derive(Clone, Debug)]
pub struct ScannedFile {
    /// Path relative to the vault root, with `/` as the separator.
    pub relative_path: String,
    /// Path of the file on disk, exactly as it was handed to the scanner.
    pub absolute_path: PathBuf,
    /// File text with CRLF line endings converted to LF and a leading BOM removed.
    pub content: String,
    /// Hex-encoded SHA-256 digest of `content`.
    pub content_hash: String,
    /// Display title: the file stem of `relative_path`.
    pub title: String,
    /// Slug built from the parent directories and file stem of `relative_path`.
    pub slug: String,
}
27
28pub fn content_hash(content: &str) -> String {
30 let mut hasher = Sha256::new();
31 hasher.update(content.as_bytes());
32 hex::encode(hasher.finalize())
33}
34
/// Derives a display title from a relative path.
///
/// The title is the final path component with its last extension removed
/// (`"notes/My Page.md"` -> `"My Page"`). When the path has no file stem at
/// all, `relative_path` is returned unchanged.
pub fn title_from_path(relative_path: &str) -> String {
    match Path::new(relative_path).file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => relative_path.to_string(),
    }
}
41
42pub fn slug_from_path(relative_path: &str) -> String {
45 let path = Path::new(relative_path);
46
47 let parent = path.parent().and_then(|p| {
48 let s = p.to_string_lossy().replace('\\', "/");
49 if s.is_empty() {
50 None
51 } else {
52 Some(s)
53 }
54 });
55
56 let stem = path
57 .file_stem()
58 .map(|s| s.to_string_lossy().into_owned())
59 .unwrap_or_default();
60
61 let name = match parent {
62 Some(dir) => format!("{dir}/{stem}"),
63 None => stem,
64 };
65
66 link_parser::slugify(&name)
67}
68
69const UNSAFE_FILENAME_CHARS: &[char] = &['/', '\\', ':', '*', '?', '"', '<', '>', '|'];
71
72const WINDOWS_RESERVED: &[&str] = &[
74 "CON", "PRN", "AUX", "NUL",
75 "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
76 "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
77];
78
79pub fn sanitize_filename(title: &str) -> String {
86 let mut result: String = title
88 .chars()
89 .map(|c| {
90 if c == '\0' || UNSAFE_FILENAME_CHARS.contains(&c) {
91 '-'
92 } else {
93 c
94 }
95 })
96 .collect();
97
98 while result.contains("--") {
100 result = result.replace("--", "-");
101 }
102
103 let trimmed = result.trim_matches(|c: char| c == '.' || c == ' ' || c == '-');
105
106 if trimmed.is_empty() {
107 let slug = link_parser::slugify(title);
109 if slug.is_empty() { "untitled".to_string() } else { slug }
110 } else {
111 let mut name = trimmed.to_string();
112 let upper = name.to_uppercase();
114 if WINDOWS_RESERVED.contains(&upper.as_str()) {
115 name.push('_');
116 }
117 name
118 }
119}
120
121fn should_skip(entry: &walkdir::DirEntry) -> bool {
122 let name = entry.file_name().to_string_lossy();
123
124 if name.starts_with('.') {
125 return true;
126 }
127
128 if entry.file_type().is_dir() {
129 return matches!(
130 name.as_ref(),
131 "node_modules" | "__pycache__" | ".git" | ".lore"
132 );
133 }
134
135 false
136}
137
138pub fn scan_single_file(vault_path: &Path, absolute_path: &Path) -> Option<ScannedFile> {
140 let extension = absolute_path
141 .extension()
142 .map(|e| e.to_string_lossy().to_lowercase());
143
144 if extension.as_deref() != Some("md") {
145 return None;
146 }
147
148 if let Ok(relative) = absolute_path.strip_prefix(vault_path) {
149 for component in relative.components() {
150 let name = component.as_os_str().to_string_lossy();
151 if name.starts_with('.')
152 || matches!(
153 name.as_ref(),
154 "node_modules" | "__pycache__" | ".git" | ".lore"
155 )
156 {
157 return None;
158 }
159 }
160 }
161
162 let content = match std::fs::read_to_string(absolute_path) {
163 Ok(c) => {
164 let normalized = c.replace("\r\n", "\n");
165 if let Some(stripped) = normalized.strip_prefix('\u{FEFF}') {
167 stripped.to_string()
168 } else {
169 normalized
170 }
171 }
172 Err(e) => {
173 log::warn!("Failed to read {}: {e}", absolute_path.display());
174 return None;
175 }
176 };
177
178 let relative = match absolute_path.strip_prefix(vault_path) {
179 Ok(rel) => rel.to_string_lossy().replace('\\', "/"),
180 Err(_) => {
181 log::warn!(
182 "File {} is outside vault root {}, skipping",
183 absolute_path.display(),
184 vault_path.display()
185 );
186 return None;
187 }
188 };
189
190 let hash = content_hash(&content);
191 let title = title_from_path(&relative);
192 let slug = slug_from_path(&relative);
193
194 Some(ScannedFile {
195 relative_path: relative,
196 absolute_path: absolute_path.to_path_buf(),
197 content,
198 content_hash: hash,
199 title,
200 slug,
201 })
202}
203
204pub fn scan_folder(vault_path: &Path) -> Result<Vec<ScannedFile>, std::io::Error> {
206 let mut files = Vec::new();
207
208 let walker = WalkDir::new(vault_path)
209 .follow_links(false)
210 .into_iter()
211 .filter_entry(|e| !should_skip(e));
212
213 for entry in walker {
214 let entry = entry?;
215
216 if !entry.file_type().is_file() {
217 continue;
218 }
219
220 if let Some(scanned) = scan_single_file(vault_path, entry.path()) {
221 files.push(scanned);
222 }
223 }
224
225 log::info!(
226 "Scanned {} markdown files in {}",
227 files.len(),
228 vault_path.display()
229 );
230 Ok(files)
231}
232
233pub fn diff_scan<'a>(
236 scanned: &'a [ScannedFile],
237 existing_hashes: &HashMap<String, String>,
238) -> (Vec<&'a ScannedFile>, Vec<String>) {
239 let scanned_slugs: std::collections::HashSet<&str> =
240 scanned.iter().map(|f| f.slug.as_str()).collect();
241
242 let new_or_changed: Vec<&ScannedFile> = scanned
243 .iter()
244 .filter(|f| match existing_hashes.get(&f.slug) {
245 Some(old_hash) => old_hash != &f.content_hash,
246 None => true,
247 })
248 .collect();
249
250 let deleted: Vec<String> = existing_hashes
251 .keys()
252 .filter(|slug| !scanned_slugs.contains(slug.as_str()))
253 .cloned()
254 .collect();
255
256 (new_or_changed, deleted)
257}
258
#[cfg(test)]
mod tests {
    use super::*;

    /// Asserts that `f` maps every input in `cases` to its paired expected output.
    fn check(f: fn(&str) -> String, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(f(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn test_content_hash_deterministic() {
        let first = content_hash("hello world");
        assert_eq!(first, content_hash("hello world"));
        // Length of the hex string produced for the digest.
        assert_eq!(first.len(), 64);
    }

    #[test]
    fn test_title_from_path() {
        check(
            title_from_path,
            &[
                ("My Page.md", "My Page"),
                ("notes/My Page.md", "My Page"),
                ("README.md", "README"),
            ],
        );
    }

    #[test]
    fn test_slug_from_path() {
        check(
            slug_from_path,
            &[
                ("My Page.md", "my-page"),
                ("notes/My Page.md", "notes/my-page"),
                ("README.md", "readme"),
                ("deep/nested/Page.md", "deep/nested/page"),
            ],
        );
    }

    #[test]
    fn test_slug_from_path_backslashes() {
        check(slug_from_path, &[("notes\\My Page.md", "notes/my-page")]);
    }

    #[test]
    fn test_sanitize_filename_passthrough() {
        check(
            sanitize_filename,
            &[("My Page", "My Page"), ("simple", "simple")],
        );
    }

    #[test]
    fn test_sanitize_filename_special_chars() {
        check(
            sanitize_filename,
            &[
                ("Notes/Ideas", "Notes-Ideas"),
                ("file:name*here", "file-name-here"),
                ("a<b>c?d", "a-b-c-d"),
                ("pipe|test", "pipe-test"),
                ("back\\slash", "back-slash"),
                ("quote\"mark", "quote-mark"),
            ],
        );
    }

    #[test]
    fn test_sanitize_filename_windows_reserved() {
        check(
            sanitize_filename,
            &[
                ("CON", "CON_"),
                ("con", "con_"),
                ("PRN", "PRN_"),
                ("NUL", "NUL_"),
                ("COM1", "COM1_"),
                ("LPT3", "LPT3_"),
            ],
        );
    }

    #[test]
    fn test_sanitize_filename_empty_fallback() {
        check(
            sanitize_filename,
            &[("***", "untitled"), ("...", "untitled"), ("", "untitled")],
        );
    }

    #[test]
    fn test_sanitize_filename_leading_trailing_dots() {
        check(
            sanitize_filename,
            &[("...title...", "title"), (" spaces ", "spaces")],
        );
    }
}