Skip to main content

lore_engine/engine/
file_loader.rs

1//! Filesystem scanning — discovers `.md` files in a vault and computes
2//! slugs, titles, and content hashes for incremental sync.
3
4use sha2::{Digest, Sha256};
5use std::collections::HashMap;
6use std::path::{Path, PathBuf};
7use walkdir::WalkDir;
8
9use super::link_parser;
10
/// One markdown file found by a vault scan, bundled with the metadata
/// derived from its path and content.
#[derive(Debug, Clone)]
pub struct ScannedFile {
    /// Path relative to the vault root; separators are always `/`.
    pub relative_path: String,
    /// Absolute location of the file on disk.
    pub absolute_path: PathBuf,
    /// The file's raw markdown text.
    pub content: String,
    /// Hex-encoded SHA-256 digest of `content`.
    pub content_hash: String,
    /// Human-readable title taken from the filename (e.g. "My Page").
    pub title: String,
    /// Normalized slug (e.g. "my-page" or "folder/my-page").
    pub slug: String,
}
27
28/// Compute SHA-256 hex digest of a string.
29pub fn content_hash(content: &str) -> String {
30    let mut hasher = Sha256::new();
31    hasher.update(content.as_bytes());
32    hex::encode(hasher.finalize())
33}
34
/// Derive a page title from a relative file path.
///
/// The title is the final path component without its extension:
/// "notes/My Page.md" -> "My Page". When the path has no file stem
/// (for example an empty string), the input is returned unchanged.
pub fn title_from_path(relative_path: &str) -> String {
    match Path::new(relative_path).file_stem() {
        Some(stem) => stem.to_string_lossy().into_owned(),
        None => relative_path.to_string(),
    }
}
41
42/// Derive a slug from a relative file path.
43/// "notes/My Page.md" -> "notes/my-page"
44pub fn slug_from_path(relative_path: &str) -> String {
45    let path = Path::new(relative_path);
46
47    let parent = path.parent().and_then(|p| {
48        let s = p.to_string_lossy().replace('\\', "/");
49        if s.is_empty() {
50            None
51        } else {
52            Some(s)
53        }
54    });
55
56    let stem = path
57        .file_stem()
58        .map(|s| s.to_string_lossy().into_owned())
59        .unwrap_or_default();
60
61    let name = match parent {
62        Some(dir) => format!("{dir}/{stem}"),
63        None => stem,
64    };
65
66    link_parser::slugify(&name)
67}
68
/// Characters that cannot be used safely in a filename on every platform.
const UNSAFE_FILENAME_CHARS: &[char] = &[
    // Path separators.
    '/', '\\',
    // Remaining punctuation treated as unsafe.
    ':', '*', '?', '"', '<', '>', '|',
];
71
/// Windows reserved device names, stored uppercase so callers can compare
/// case-insensitively.
///
/// Besides the classic `COM1`-`COM9` / `LPT1`-`LPT9`, the list includes
/// `COM0`, `LPT0`, and the superscript-digit variants, which Microsoft's
/// file-naming documentation also lists as reserved.
const WINDOWS_RESERVED: &[&str] = &[
    "CON", "PRN", "AUX", "NUL",
    "COM0", "COM1", "COM2", "COM3", "COM4", "COM5", "COM6", "COM7", "COM8", "COM9",
    // Superscript one, two, three (U+00B9, U+00B2, U+00B3).
    "COM\u{b9}", "COM\u{b2}", "COM\u{b3}",
    "LPT0", "LPT1", "LPT2", "LPT3", "LPT4", "LPT5", "LPT6", "LPT7", "LPT8", "LPT9",
    "LPT\u{b9}", "LPT\u{b2}", "LPT\u{b3}",
];
78
79/// Sanitize a page title for use as a filename (without extension).
80///
81/// Replaces filesystem-unsafe characters with `-`, strips leading/trailing
82/// dots and spaces, collapses consecutive hyphens, and appends `_` to
83/// Windows reserved names. Falls back to the slugified title if the result
84/// would be empty.
85pub fn sanitize_filename(title: &str) -> String {
86    // Replace unsafe chars and null bytes with '-'
87    let mut result: String = title
88        .chars()
89        .map(|c| {
90            if c == '\0' || UNSAFE_FILENAME_CHARS.contains(&c) {
91                '-'
92            } else {
93                c
94            }
95        })
96        .collect();
97
98    // Collapse consecutive hyphens
99    while result.contains("--") {
100        result = result.replace("--", "-");
101    }
102
103    // Strip leading/trailing dots, spaces, and hyphens
104    let trimmed = result.trim_matches(|c: char| c == '.' || c == ' ' || c == '-');
105
106    if trimmed.is_empty() {
107        // Fall back to slug
108        let slug = link_parser::slugify(title);
109        if slug.is_empty() { "untitled".to_string() } else { slug }
110    } else {
111        let mut name = trimmed.to_string();
112        // Check Windows reserved names (case-insensitive, with or without extension)
113        let upper = name.to_uppercase();
114        if WINDOWS_RESERVED.contains(&upper.as_str()) {
115            name.push('_');
116        }
117        name
118    }
119}
120
121fn should_skip(entry: &walkdir::DirEntry) -> bool {
122    let name = entry.file_name().to_string_lossy();
123
124    if name.starts_with('.') {
125        return true;
126    }
127
128    if entry.file_type().is_dir() {
129        return matches!(
130            name.as_ref(),
131            "node_modules" | "__pycache__" | ".git" | ".lore"
132        );
133    }
134
135    false
136}
137
138/// Scan a single file and return a `ScannedFile` if it's a valid .md file.
139pub fn scan_single_file(vault_path: &Path, absolute_path: &Path) -> Option<ScannedFile> {
140    let extension = absolute_path
141        .extension()
142        .map(|e| e.to_string_lossy().to_lowercase());
143
144    if extension.as_deref() != Some("md") {
145        return None;
146    }
147
148    if let Ok(relative) = absolute_path.strip_prefix(vault_path) {
149        for component in relative.components() {
150            let name = component.as_os_str().to_string_lossy();
151            if name.starts_with('.')
152                || matches!(
153                    name.as_ref(),
154                    "node_modules" | "__pycache__" | ".git" | ".lore"
155                )
156            {
157                return None;
158            }
159        }
160    }
161
162    let content = match std::fs::read_to_string(absolute_path) {
163        Ok(c) => {
164            let normalized = c.replace("\r\n", "\n");
165            // Strip UTF-8 BOM if present
166            if let Some(stripped) = normalized.strip_prefix('\u{FEFF}') {
167                stripped.to_string()
168            } else {
169                normalized
170            }
171        }
172        Err(e) => {
173            log::warn!("Failed to read {}: {e}", absolute_path.display());
174            return None;
175        }
176    };
177
178    let relative = match absolute_path.strip_prefix(vault_path) {
179        Ok(rel) => rel.to_string_lossy().replace('\\', "/"),
180        Err(_) => {
181            log::warn!(
182                "File {} is outside vault root {}, skipping",
183                absolute_path.display(),
184                vault_path.display()
185            );
186            return None;
187        }
188    };
189
190    let hash = content_hash(&content);
191    let title = title_from_path(&relative);
192    let slug = slug_from_path(&relative);
193
194    Some(ScannedFile {
195        relative_path: relative,
196        absolute_path: absolute_path.to_path_buf(),
197        content,
198        content_hash: hash,
199        title,
200        slug,
201    })
202}
203
204/// Recursively scan a folder for .md files.
205pub fn scan_folder(vault_path: &Path) -> Result<Vec<ScannedFile>, std::io::Error> {
206    let mut files = Vec::new();
207
208    let walker = WalkDir::new(vault_path)
209        .follow_links(false)
210        .into_iter()
211        .filter_entry(|e| !should_skip(e));
212
213    for entry in walker {
214        let entry = entry?;
215
216        if !entry.file_type().is_file() {
217            continue;
218        }
219
220        if let Some(scanned) = scan_single_file(vault_path, entry.path()) {
221            files.push(scanned);
222        }
223    }
224
225    log::info!(
226        "Scanned {} markdown files in {}",
227        files.len(),
228        vault_path.display()
229    );
230    Ok(files)
231}
232
233/// Diff scanned files against existing DB hashes.
234/// Returns (`new_or_changed` files, deleted slugs).
235pub fn diff_scan<'a>(
236    scanned: &'a [ScannedFile],
237    existing_hashes: &HashMap<String, String>,
238) -> (Vec<&'a ScannedFile>, Vec<String>) {
239    let scanned_slugs: std::collections::HashSet<&str> =
240        scanned.iter().map(|f| f.slug.as_str()).collect();
241
242    let new_or_changed: Vec<&ScannedFile> = scanned
243        .iter()
244        .filter(|f| match existing_hashes.get(&f.slug) {
245            Some(old_hash) => old_hash != &f.content_hash,
246            None => true,
247        })
248        .collect();
249
250    let deleted: Vec<String> = existing_hashes
251        .keys()
252        .filter(|slug| !scanned_slugs.contains(slug.as_str()))
253        .cloned()
254        .collect();
255
256    (new_or_changed, deleted)
257}
258
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert `slug_from_path(input) == expected` for every `(input, expected)` pair.
    fn assert_slugs(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(slug_from_path(input), expected, "input: {input:?}");
        }
    }

    /// Assert `sanitize_filename(input) == expected` for every `(input, expected)` pair.
    fn assert_sanitized(cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(sanitize_filename(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn test_content_hash_deterministic() {
        let first = content_hash("hello world");
        let second = content_hash("hello world");
        assert_eq!(first, second);
        // SHA-256 is 32 bytes, i.e. 64 hex characters.
        assert_eq!(first.len(), 64);
    }

    #[test]
    fn test_title_from_path() {
        let cases = [
            ("My Page.md", "My Page"),
            ("notes/My Page.md", "My Page"),
            ("README.md", "README"),
        ];
        for (input, expected) in cases {
            assert_eq!(title_from_path(input), expected, "input: {input:?}");
        }
    }

    #[test]
    fn test_slug_from_path() {
        assert_slugs(&[
            ("My Page.md", "my-page"),
            ("notes/My Page.md", "notes/my-page"),
            ("README.md", "readme"),
            ("deep/nested/Page.md", "deep/nested/page"),
        ]);
    }

    #[test]
    fn test_slug_from_path_backslashes() {
        assert_slugs(&[("notes\\My Page.md", "notes/my-page")]);
    }

    #[test]
    fn test_sanitize_filename_passthrough() {
        assert_sanitized(&[("My Page", "My Page"), ("simple", "simple")]);
    }

    #[test]
    fn test_sanitize_filename_special_chars() {
        assert_sanitized(&[
            ("Notes/Ideas", "Notes-Ideas"),
            ("file:name*here", "file-name-here"),
            ("a<b>c?d", "a-b-c-d"),
            ("pipe|test", "pipe-test"),
            ("back\\slash", "back-slash"),
            ("quote\"mark", "quote-mark"),
        ]);
    }

    #[test]
    fn test_sanitize_filename_windows_reserved() {
        assert_sanitized(&[
            ("CON", "CON_"),
            ("con", "con_"),
            ("PRN", "PRN_"),
            ("NUL", "NUL_"),
            ("COM1", "COM1_"),
            ("LPT3", "LPT3_"),
        ]);
    }

    #[test]
    fn test_sanitize_filename_empty_fallback() {
        assert_sanitized(&[("***", "untitled"), ("...", "untitled"), ("", "untitled")]);
    }

    #[test]
    fn test_sanitize_filename_leading_trailing_dots() {
        assert_sanitized(&[("...title...", "title"), ("  spaces  ", "spaces")]);
    }
}
332}