Skip to main content

mdvault_core/index/
builder.rs

1//! Index building orchestration.
2
3use std::path::Path;
4
5use chrono::{DateTime, Utc};
6use thiserror::Error;
7
8use super::db::{IndexDb, IndexError};
9use super::types::{IndexedLink, IndexedNote};
10use crate::vault::{
11    VaultWalker, VaultWalkerError, WalkedFile, content_hash, extract_note,
12};
13
/// Errors that can occur while building or updating the vault index.
#[derive(Debug, Error)]
pub enum BuilderError {
    /// Failure reported by the vault walker (converted automatically via `From`).
    #[error("Vault walker error: {0}")]
    Walker(#[from] VaultWalkerError),

    /// Failure reported by the index database (converted automatically via `From`).
    #[error("Index database error: {0}")]
    Index(#[from] IndexError),

    /// A file's content, hash, or metadata could not be read from disk.
    #[error("Failed to read file {path}: {source}")]
    FileRead {
        /// Display form of the path that could not be read.
        path: String,
        /// Underlying I/O error.
        #[source]
        source: std::io::Error,
    },
}
29
/// Statistics from an indexing operation.
///
/// All counters start at zero (via `Default`). The counters marked
/// "incremental mode only" are never touched by a full reindex and therefore
/// stay at zero in that mode.
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Number of files discovered by the vault walk.
    pub files_found: usize,
    /// Number of notes indexed (new or updated) successfully.
    pub notes_indexed: usize,
    /// Number of notes skipped due to errors (each one is logged as a warning).
    pub notes_skipped: usize,
    /// Number of links indexed, summed over all successfully indexed notes.
    pub links_indexed: usize,
    /// Number of broken links reported by the database after link resolution.
    pub broken_links: usize,
    /// Indexing duration in milliseconds (wall-clock, measured for the whole run).
    pub duration_ms: u64,
    /// Number of files unchanged (hash match) - incremental mode only.
    pub files_unchanged: usize,
    /// Number of files added (new to index) - incremental mode only.
    pub files_added: usize,
    /// Number of files updated (content changed) - incremental mode only.
    pub files_updated: usize,
    /// Number of files deleted (removed from vault) - incremental mode only.
    pub files_deleted: usize,
}
54
/// File change classification for incremental updates.
///
/// Produced by comparing a file's current content hash with the hash stored
/// in the index for the same relative path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FileChange {
    /// File is new (no content hash is stored for its path).
    Added,
    /// File content has changed (stored hash differs from the current hash).
    Modified,
    /// File content unchanged (stored hash equals the current hash).
    Unchanged,
}
65
/// Progress callback for indexing operations.
///
/// Invoked once per discovered file, before that file is processed.
/// Parameters: (current, total, current_path)
/// - `current`: 1-based position of the file in the walk.
/// - `total`: total number of files discovered.
/// - `current_path`: the file's vault-relative path, lossily converted to UTF-8.
pub type ProgressCallback = Box<dyn Fn(usize, usize, &str)>;
69
/// Builder for populating the vault index.
///
/// Borrows the database and the vault root for `'a`; it owns only the list of
/// excluded folders.
pub struct IndexBuilder<'a> {
    /// Index database that notes and links are written to.
    db: &'a IndexDb,
    /// Root directory of the vault; relative note paths are joined onto it.
    vault_root: &'a Path,
    /// Folders handed to the vault walker as exclusions (empty when none).
    excluded_folders: Vec<std::path::PathBuf>,
}
76
77impl<'a> IndexBuilder<'a> {
78    /// Create a new index builder.
79    pub fn new(db: &'a IndexDb, vault_root: &'a Path) -> Self {
80        Self { db, vault_root, excluded_folders: Vec::new() }
81    }
82
83    /// Create a new index builder with folder exclusions.
84    pub fn with_exclusions(
85        db: &'a IndexDb,
86        vault_root: &'a Path,
87        excluded_folders: Vec<std::path::PathBuf>,
88    ) -> Self {
89        Self { db, vault_root, excluded_folders }
90    }
91
92    /// Perform a full reindex of the vault.
93    /// Clears existing data and rebuilds from scratch.
94    pub fn full_reindex(
95        &self,
96        progress: Option<ProgressCallback>,
97    ) -> Result<IndexStats, BuilderError> {
98        let start = std::time::Instant::now();
99        let mut stats = IndexStats::default();
100
101        // Walk the vault with exclusions
102        let walker =
103            VaultWalker::with_exclusions(self.vault_root, self.excluded_folders.clone())?;
104        let files = walker.walk()?;
105        stats.files_found = files.len();
106
107        // Clear existing index
108        self.db.clear_all()?;
109
110        // Phase 1: Index all notes
111        for (i, file) in files.iter().enumerate() {
112            if let Some(ref cb) = progress {
113                cb(i + 1, files.len(), &file.relative_path.to_string_lossy());
114            }
115
116            match self.index_note(file) {
117                Ok(link_count) => {
118                    stats.notes_indexed += 1;
119                    stats.links_indexed += link_count;
120                }
121                Err(e) => {
122                    // Log error but continue indexing
123                    tracing::warn!(
124                        "Failed to index {}: {}",
125                        file.relative_path.display(),
126                        e
127                    );
128                    stats.notes_skipped += 1;
129                }
130            }
131        }
132
133        // Phase 2: Resolve link targets
134        self.db.resolve_link_targets()?;
135        stats.broken_links = self.db.count_broken_links()? as usize;
136
137        stats.duration_ms = start.elapsed().as_millis() as u64;
138        Ok(stats)
139    }
140
141    /// Perform an incremental reindex of the vault.
142    /// Only processes files that have changed since last index.
143    pub fn incremental_reindex(
144        &self,
145        progress: Option<ProgressCallback>,
146    ) -> Result<IndexStats, BuilderError> {
147        let start = std::time::Instant::now();
148        let mut stats = IndexStats::default();
149
150        // Phase 1: Walk the vault and collect all current files (with exclusions)
151        let walker =
152            VaultWalker::with_exclusions(self.vault_root, self.excluded_folders.clone())?;
153        let files = walker.walk()?;
154        stats.files_found = files.len();
155
156        // Phase 2: Get all currently indexed paths for deletion detection
157        let indexed_paths: std::collections::HashSet<std::path::PathBuf> =
158            self.db.get_all_paths()?.into_iter().collect();
159
160        // Track which paths we've seen in the vault
161        let mut seen_paths: std::collections::HashSet<std::path::PathBuf> =
162            std::collections::HashSet::with_capacity(files.len());
163
164        // Phase 3: Classify and process each file
165        for (i, file) in files.iter().enumerate() {
166            if let Some(ref cb) = progress {
167                cb(i + 1, files.len(), &file.relative_path.to_string_lossy());
168            }
169
170            seen_paths.insert(file.relative_path.clone());
171
172            // Classify the change
173            let change = self.classify_change(file)?;
174
175            match change {
176                FileChange::Unchanged => {
177                    stats.files_unchanged += 1;
178                }
179                FileChange::Added | FileChange::Modified => match self.index_note(file) {
180                    Ok(link_count) => {
181                        stats.notes_indexed += 1;
182                        stats.links_indexed += link_count;
183                        if change == FileChange::Added {
184                            stats.files_added += 1;
185                        } else {
186                            stats.files_updated += 1;
187                        }
188                    }
189                    Err(e) => {
190                        tracing::warn!(
191                            "Failed to index {}: {}",
192                            file.relative_path.display(),
193                            e
194                        );
195                        stats.notes_skipped += 1;
196                    }
197                },
198            }
199        }
200
201        // Phase 4: Detect and delete removed files
202        for indexed_path in &indexed_paths {
203            if !seen_paths.contains(indexed_path) && self.db.delete_note(indexed_path)? {
204                stats.files_deleted += 1;
205                tracing::debug!("Deleted from index: {}", indexed_path.display());
206            }
207        }
208
209        // Phase 5: Resolve link targets (handles newly valid links)
210        self.db.resolve_link_targets()?;
211        stats.broken_links = self.db.count_broken_links()? as usize;
212
213        stats.duration_ms = start.elapsed().as_millis() as u64;
214        Ok(stats)
215    }
216
217    /// Classify a file's change status by comparing content hashes.
218    fn classify_change(&self, file: &WalkedFile) -> Result<FileChange, BuilderError> {
219        // Get stored hash (if any)
220        let stored_hash = self.db.get_content_hash(&file.relative_path)?;
221
222        match stored_hash {
223            None => Ok(FileChange::Added),
224            Some(stored) => {
225                // Compute current hash
226                let current = content_hash(&file.absolute_path).map_err(|e| {
227                    BuilderError::FileRead {
228                        path: file.absolute_path.display().to_string(),
229                        source: e,
230                    }
231                })?;
232
233                if current == stored {
234                    Ok(FileChange::Unchanged)
235                } else {
236                    Ok(FileChange::Modified)
237                }
238            }
239        }
240    }
241
242    /// Index a single note file.
243    /// Returns the number of links indexed.
244    fn index_note(&self, file: &WalkedFile) -> Result<usize, BuilderError> {
245        // Read file content
246        let content = std::fs::read_to_string(&file.absolute_path).map_err(|e| {
247            BuilderError::FileRead {
248                path: file.absolute_path.display().to_string(),
249                source: e,
250            }
251        })?;
252
253        // Compute content hash
254        let hash =
255            content_hash(&file.absolute_path).map_err(|e| BuilderError::FileRead {
256                path: file.absolute_path.display().to_string(),
257                source: e,
258            })?;
259
260        // Extract note metadata
261        let extracted = extract_note(&content, &file.relative_path);
262
263        // Convert modified time to DateTime<Utc>
264        let modified: DateTime<Utc> = file.modified.into();
265
266        // Create indexed note
267        let note = IndexedNote {
268            id: None,
269            path: file.relative_path.clone(),
270            note_type: extracted.note_type,
271            title: extracted.title,
272            created: None, // Could extract from frontmatter if present
273            modified,
274            frontmatter_json: extracted.frontmatter_json,
275            content_hash: hash,
276        };
277
278        // Insert note and get ID
279        let note_id = self.db.upsert_note(&note)?;
280
281        // Delete existing links for this note (in case of update)
282        self.db.delete_links_from(note_id)?;
283
284        // Insert links
285        let link_count = extracted.links.len();
286        for link in extracted.links {
287            let indexed_link = IndexedLink {
288                id: None,
289                source_id: note_id,
290                target_id: None, // Resolved in phase 2
291                target_path: link.target,
292                link_text: link.text,
293                link_type: link.link_type,
294                context: link.context,
295                line_number: Some(link.line_number),
296            };
297            self.db.insert_link(&indexed_link)?;
298        }
299
300        Ok(link_count)
301    }
302
303    /// Reindex a single file by its path relative to the vault root.
304    pub fn reindex_file(&self, relative_path: &Path) -> Result<(), BuilderError> {
305        let absolute_path = self.vault_root.join(relative_path);
306        let metadata =
307            std::fs::metadata(&absolute_path).map_err(|e| BuilderError::FileRead {
308                path: absolute_path.display().to_string(),
309                source: e,
310            })?;
311        let file = WalkedFile {
312            absolute_path,
313            relative_path: relative_path.to_path_buf(),
314            modified: metadata.modified().unwrap_or(std::time::SystemTime::now()),
315            size: metadata.len(),
316        };
317        self.index_note(&file)?;
318        Ok(())
319    }
320}
321
322#[cfg(test)]
323mod tests {
324    use super::*;
325    use std::fs;
326    use tempfile::TempDir;
327
328    fn create_test_vault() -> TempDir {
329        let dir = TempDir::new().unwrap();
330        let root = dir.path();
331
332        // Create some markdown files with links
333        fs::write(
334            root.join("note1.md"),
335            r#"---
336title: Note One
337type: zettel
338---
339# Note One
340
341This links to [[note2]] and [[missing-note]].
342"#,
343        )
344        .unwrap();
345
346        fs::write(
347            root.join("note2.md"),
348            r#"---
349title: Note Two
350type: task
351project: note1
352---
353# Note Two
354
355Back to [[note1]].
356"#,
357        )
358        .unwrap();
359
360        fs::create_dir(root.join("subdir")).unwrap();
361        fs::write(
362            root.join("subdir/note3.md"),
363            r#"# Note Three
364
365Links to [Note One](../note1.md).
366"#,
367        )
368        .unwrap();
369
370        dir
371    }
372
373    #[test]
374    fn test_full_reindex() {
375        let vault = create_test_vault();
376        let db = IndexDb::open_in_memory().unwrap();
377
378        let builder = IndexBuilder::new(&db, vault.path());
379        let stats = builder.full_reindex(None).unwrap();
380
381        assert_eq!(stats.files_found, 3);
382        assert_eq!(stats.notes_indexed, 3);
383        assert_eq!(stats.notes_skipped, 0);
384        assert!(stats.links_indexed >= 4); // At least 4 links across all notes
385    }
386
387    #[test]
388    fn test_notes_are_indexed_correctly() {
389        let vault = create_test_vault();
390        let db = IndexDb::open_in_memory().unwrap();
391
392        let builder = IndexBuilder::new(&db, vault.path());
393        builder.full_reindex(None).unwrap();
394
395        // Check note1 is indexed
396        let note1 = db
397            .get_note_by_path(Path::new("note1.md"))
398            .unwrap()
399            .expect("note1 should exist");
400        assert_eq!(note1.title, "Note One");
401        assert_eq!(note1.note_type, crate::index::types::NoteType::Zettel);
402
403        // Check note2 is indexed
404        let note2 = db
405            .get_note_by_path(Path::new("note2.md"))
406            .unwrap()
407            .expect("note2 should exist");
408        assert_eq!(note2.title, "Note Two");
409        assert_eq!(note2.note_type, crate::index::types::NoteType::Task);
410    }
411
412    #[test]
413    fn test_links_are_indexed() {
414        let vault = create_test_vault();
415        let db = IndexDb::open_in_memory().unwrap();
416
417        let builder = IndexBuilder::new(&db, vault.path());
418        builder.full_reindex(None).unwrap();
419
420        let note1 = db
421            .get_note_by_path(Path::new("note1.md"))
422            .unwrap()
423            .expect("note1 should exist");
424
425        let outgoing = db.get_outgoing_links(note1.id.unwrap()).unwrap();
426        assert_eq!(outgoing.len(), 2); // [[note2]] and [[missing-note]]
427    }
428
429    #[test]
430    fn test_link_targets_resolved() {
431        let vault = create_test_vault();
432        let db = IndexDb::open_in_memory().unwrap();
433
434        let builder = IndexBuilder::new(&db, vault.path());
435        let stats = builder.full_reindex(None).unwrap();
436
437        // At least one broken link (missing-note)
438        assert!(stats.broken_links >= 1);
439
440        // Check that existing links have target_id resolved
441        let note2 = db
442            .get_note_by_path(Path::new("note2.md"))
443            .unwrap()
444            .expect("note2 should exist");
445
446        let backlinks = db.get_backlinks(note2.id.unwrap()).unwrap();
447        // note1 links to note2
448        assert!(!backlinks.is_empty());
449    }
450
451    #[test]
452    fn test_reindex_clears_old_data() {
453        let vault = create_test_vault();
454        let db = IndexDb::open_in_memory().unwrap();
455
456        let builder = IndexBuilder::new(&db, vault.path());
457
458        // Index twice
459        builder.full_reindex(None).unwrap();
460        let stats = builder.full_reindex(None).unwrap();
461
462        // Should still have same counts (not doubled)
463        assert_eq!(stats.notes_indexed, 3);
464        assert_eq!(db.count_notes().unwrap(), 3);
465    }
466
467    // ─────────────────────────────────────────────────────────────────────────
468    // Incremental reindex tests
469    // ─────────────────────────────────────────────────────────────────────────
470
471    #[test]
472    fn test_incremental_first_run() {
473        let vault = create_test_vault();
474        let db = IndexDb::open_in_memory().unwrap();
475        let builder = IndexBuilder::new(&db, vault.path());
476
477        let stats = builder.incremental_reindex(None).unwrap();
478
479        assert_eq!(stats.files_found, 3);
480        assert_eq!(stats.files_added, 3);
481        assert_eq!(stats.files_unchanged, 0);
482        assert_eq!(stats.files_updated, 0);
483        assert_eq!(stats.files_deleted, 0);
484        assert_eq!(stats.notes_indexed, 3);
485    }
486
487    #[test]
488    fn test_incremental_no_changes() {
489        let vault = create_test_vault();
490        let db = IndexDb::open_in_memory().unwrap();
491        let builder = IndexBuilder::new(&db, vault.path());
492
493        builder.incremental_reindex(None).unwrap();
494        let stats = builder.incremental_reindex(None).unwrap();
495
496        assert_eq!(stats.files_found, 3);
497        assert_eq!(stats.files_unchanged, 3);
498        assert_eq!(stats.files_added, 0);
499        assert_eq!(stats.files_updated, 0);
500        assert_eq!(stats.files_deleted, 0);
501        assert_eq!(stats.notes_indexed, 0);
502    }
503
504    #[test]
505    fn test_incremental_file_modified() {
506        let vault = create_test_vault();
507        let db = IndexDb::open_in_memory().unwrap();
508        let builder = IndexBuilder::new(&db, vault.path());
509
510        builder.incremental_reindex(None).unwrap();
511
512        // Modify a file
513        fs::write(vault.path().join("note1.md"), "# Note 1 Modified\n\nNew content.")
514            .unwrap();
515
516        let stats = builder.incremental_reindex(None).unwrap();
517
518        assert_eq!(stats.files_unchanged, 2);
519        assert_eq!(stats.files_updated, 1);
520        assert_eq!(stats.files_added, 0);
521        assert_eq!(stats.notes_indexed, 1);
522    }
523
524    #[test]
525    fn test_incremental_file_added() {
526        let vault = create_test_vault();
527        let db = IndexDb::open_in_memory().unwrap();
528        let builder = IndexBuilder::new(&db, vault.path());
529
530        builder.incremental_reindex(None).unwrap();
531
532        // Add a new file
533        fs::write(vault.path().join("note4.md"), "# Note 4\n\nBrand new note.").unwrap();
534
535        let stats = builder.incremental_reindex(None).unwrap();
536
537        assert_eq!(stats.files_found, 4);
538        assert_eq!(stats.files_unchanged, 3);
539        assert_eq!(stats.files_added, 1);
540        assert_eq!(stats.files_updated, 0);
541        assert_eq!(stats.notes_indexed, 1);
542    }
543
544    #[test]
545    fn test_incremental_file_deleted() {
546        let vault = create_test_vault();
547        let db = IndexDb::open_in_memory().unwrap();
548        let builder = IndexBuilder::new(&db, vault.path());
549
550        builder.incremental_reindex(None).unwrap();
551
552        // Delete a file
553        fs::remove_file(vault.path().join("note2.md")).unwrap();
554
555        let stats = builder.incremental_reindex(None).unwrap();
556
557        assert_eq!(stats.files_found, 2);
558        assert_eq!(stats.files_deleted, 1);
559        assert_eq!(stats.files_unchanged, 2);
560
561        // Verify it's gone from the index
562        assert!(db.get_note_by_path(Path::new("note2.md")).unwrap().is_none());
563        assert_eq!(db.count_notes().unwrap(), 2);
564    }
565
566    #[test]
567    fn test_incremental_links_updated_on_change() {
568        let vault = create_test_vault();
569        let db = IndexDb::open_in_memory().unwrap();
570        let builder = IndexBuilder::new(&db, vault.path());
571
572        builder.incremental_reindex(None).unwrap();
573
574        let note1 = db.get_note_by_path(Path::new("note1.md")).unwrap().unwrap();
575        let links_before = db.get_outgoing_links(note1.id.unwrap()).unwrap();
576        assert_eq!(links_before.len(), 2); // [[note2]] and [[missing-note]]
577
578        // Modify to have different links
579        fs::write(vault.path().join("note1.md"), "# Note 1\n\n[[note3]] only now.")
580            .unwrap();
581        builder.incremental_reindex(None).unwrap();
582
583        let note1 = db.get_note_by_path(Path::new("note1.md")).unwrap().unwrap();
584        let links_after = db.get_outgoing_links(note1.id.unwrap()).unwrap();
585        assert_eq!(links_after.len(), 1);
586        assert_eq!(links_after[0].target_path, "note3");
587    }
588
589    #[test]
590    fn test_incremental_broken_links_resolved() {
591        let vault = create_test_vault();
592        let db = IndexDb::open_in_memory().unwrap();
593        let builder = IndexBuilder::new(&db, vault.path());
594
595        // note1 links to note2 and missing-note
596        let stats1 = builder.incremental_reindex(None).unwrap();
597        assert!(stats1.broken_links > 0); // missing-note is broken
598
599        // Now create the missing note
600        fs::write(vault.path().join("missing-note.md"), "# Missing Note\n\nNow exists!")
601            .unwrap();
602
603        let stats2 = builder.incremental_reindex(None).unwrap();
604        assert_eq!(stats2.files_added, 1);
605
606        // The link to missing-note should now be resolved
607        let missing = db.get_note_by_path(Path::new("missing-note.md")).unwrap().unwrap();
608        let backlinks = db.get_backlinks(missing.id.unwrap()).unwrap();
609        assert!(!backlinks.is_empty());
610    }
611}