rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: bincode-serialized WorkspaceIndex]
19//! ```
20
21use serde::{Deserialize, Serialize};
22use std::collections::{HashMap, HashSet};
23use std::path::{Path, PathBuf};
24
/// Four-byte signature written at the very start of a workspace index cache file
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// On-disk format version; bump it whenever the serialized form of `WorkspaceIndex` changes
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 3;

/// Name of the cache file inside the cache directory
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
36
37/// Workspace-wide index for cross-file analysis
38///
39/// Contains pre-extracted information from all markdown files in the workspace,
40/// enabling rules to validate cross-file references efficiently.
41#[derive(Debug, Default, Clone, Serialize, Deserialize)]
42pub struct WorkspaceIndex {
43    /// Map from file path to its extracted data
44    files: HashMap<PathBuf, FileIndex>,
45    /// Reverse dependency graph: target file → files that link to it
46    /// Used to efficiently re-lint dependent files when a target changes
47    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
48    /// Version counter for cache invalidation (incremented on any change)
49    version: u64,
50}
51
52/// Index data extracted from a single file
53#[derive(Debug, Clone, Default, Serialize, Deserialize)]
54pub struct FileIndex {
55    /// Headings in this file with their anchors
56    pub headings: Vec<HeadingIndex>,
57    /// Reference links in this file (for cross-file analysis)
58    pub reference_links: Vec<ReferenceLinkIndex>,
59    /// Cross-file links in this file (for MD051 cross-file validation)
60    pub cross_file_links: Vec<CrossFileLinkIndex>,
61    /// Defined reference IDs (e.g., from [ref]: url definitions)
62    /// Used to filter out reference links that have explicit definitions
63    pub defined_references: HashSet<String>,
64    /// Content hash for change detection
65    pub content_hash: String,
66    /// O(1) anchor lookup: lowercased anchor → heading index
67    /// Includes both auto-generated and custom anchors
68    anchor_to_heading: HashMap<String, usize>,
69    /// Rules disabled for the entire file (from inline comments)
70    /// Used by cross-file rules to respect inline disable directives
71    pub file_disabled_rules: HashSet<String>,
72    /// Rules disabled at specific lines (line number -> set of rule names)
73    /// Merges both persistent disables and line-specific disables
74    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
75}
76
77/// Information about a heading for cross-file lookup
78#[derive(Debug, Clone, Serialize, Deserialize)]
79pub struct HeadingIndex {
80    /// The heading text (e.g., "Installation Guide")
81    pub text: String,
82    /// Auto-generated anchor (e.g., "installation-guide")
83    pub auto_anchor: String,
84    /// Custom anchor if present (e.g., "install")
85    pub custom_anchor: Option<String>,
86    /// Line number (1-indexed)
87    pub line: usize,
88}
89
90/// Information about a reference link for cross-file analysis
91#[derive(Debug, Clone, Serialize, Deserialize)]
92pub struct ReferenceLinkIndex {
93    /// The reference ID (the part in [text][ref])
94    pub reference_id: String,
95    /// Line number (1-indexed)
96    pub line: usize,
97    /// Column number (1-indexed)
98    pub column: usize,
99}
100
101/// Information about a cross-file link for validation
102#[derive(Debug, Clone, Serialize, Deserialize)]
103pub struct CrossFileLinkIndex {
104    /// The target file path (relative, as it appears in the link)
105    pub target_path: String,
106    /// The fragment/anchor being linked to (without #)
107    pub fragment: String,
108    /// Line number (1-indexed)
109    pub line: usize,
110    /// Column number (1-indexed)
111    pub column: usize,
112}
113
114/// Information about a vulnerable anchor (heading without custom ID)
115#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct VulnerableAnchor {
117    /// File path where the heading is located
118    pub file: PathBuf,
119    /// Line number of the heading
120    pub line: usize,
121    /// The heading text
122    pub text: String,
123}
124
125impl WorkspaceIndex {
126    /// Create a new empty workspace index
127    pub fn new() -> Self {
128        Self::default()
129    }
130
131    /// Get the current version (for cache invalidation)
132    pub fn version(&self) -> u64 {
133        self.version
134    }
135
136    /// Get the number of indexed files
137    pub fn file_count(&self) -> usize {
138        self.files.len()
139    }
140
141    /// Check if a file is in the index
142    pub fn contains_file(&self, path: &Path) -> bool {
143        self.files.contains_key(path)
144    }
145
146    /// Get the index data for a specific file
147    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
148        self.files.get(path)
149    }
150
151    /// Insert or update a file's index data
152    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
153        self.files.insert(path, index);
154        self.version = self.version.wrapping_add(1);
155    }
156
157    /// Remove a file from the index
158    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
159        // Clean up reverse deps for this file
160        self.clear_reverse_deps_for(path);
161
162        let result = self.files.remove(path);
163        if result.is_some() {
164            self.version = self.version.wrapping_add(1);
165        }
166        result
167    }
168
169    /// Build a map of all "vulnerable" anchors across the workspace
170    ///
171    /// A vulnerable anchor is an auto-generated anchor for a heading that
172    /// does NOT have a custom anchor defined. These are problematic for
173    /// translated content because the anchor changes when the heading is translated.
174    ///
175    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
176    /// Multiple files can have headings with the same auto-generated anchor,
177    /// so we collect all occurrences.
178    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
179        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
180
181        for (file_path, file_index) in &self.files {
182            for heading in &file_index.headings {
183                // Only include headings WITHOUT custom anchors
184                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
185                    let anchor_key = heading.auto_anchor.to_lowercase();
186                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
187                        file: file_path.clone(),
188                        line: heading.line,
189                        text: heading.text.clone(),
190                    });
191                }
192            }
193        }
194
195        vulnerable
196    }
197
198    /// Get all headings across the workspace (for debugging/testing)
199    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
200        self.files
201            .iter()
202            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
203    }
204
205    /// Iterate over all files in the index
206    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
207        self.files.iter().map(|(p, i)| (p.as_path(), i))
208    }
209
210    /// Clear the entire index
211    pub fn clear(&mut self) {
212        self.files.clear();
213        self.reverse_deps.clear();
214        self.version = self.version.wrapping_add(1);
215    }
216
217    /// Update a file's index and maintain reverse dependencies
218    ///
219    /// This method:
220    /// 1. Removes this file as a source (dependent) from all reverse deps
221    /// 2. Inserts the new file index
222    /// 3. Builds new reverse deps from cross_file_links
223    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
224        // Remove this file as a source (dependent) from all target entries
225        // Note: We don't remove it as a target - other files may still link to it
226        self.clear_reverse_deps_as_source(path);
227
228        // Build new reverse deps from cross_file_links
229        for link in &index.cross_file_links {
230            let target = self.resolve_target_path(path, &link.target_path);
231            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
232        }
233
234        self.files.insert(path.to_path_buf(), index);
235        self.version = self.version.wrapping_add(1);
236    }
237
238    /// Get files that depend on (link to) the given file
239    ///
240    /// Returns a list of file paths that contain links targeting this file.
241    /// Used to re-lint dependent files when a target file changes.
242    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
243        self.reverse_deps
244            .get(path)
245            .map(|set| set.iter().cloned().collect())
246            .unwrap_or_default()
247    }
248
249    /// Check if a file needs re-indexing based on its content hash
250    ///
251    /// Returns `true` if the file is not in the index or has a different hash.
252    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
253        self.files
254            .get(path)
255            .map(|f| f.content_hash != current_hash)
256            .unwrap_or(true)
257    }
258
259    /// Retain only files that exist in the given set, removing deleted files
260    ///
261    /// This prunes stale entries from the cache for files that no longer exist.
262    /// Returns the number of files removed.
263    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
264        let before_count = self.files.len();
265
266        // Collect files to remove
267        let to_remove: Vec<PathBuf> = self
268            .files
269            .keys()
270            .filter(|path| !current_files.contains(*path))
271            .cloned()
272            .collect();
273
274        // Remove each file properly (clears reverse deps)
275        for path in &to_remove {
276            self.remove_file(path);
277        }
278
279        before_count - self.files.len()
280    }
281
282    /// Save the workspace index to a cache file
283    ///
284    /// Uses bincode for efficient binary serialization with:
285    /// - Magic header for file type validation
286    /// - Format version for compatibility detection
287    /// - Atomic writes (temp file + rename) to prevent corruption
288    #[cfg(feature = "native")]
289    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
290        use std::fs;
291        use std::io::Write;
292
293        // Ensure cache directory exists
294        fs::create_dir_all(cache_dir)?;
295
296        // Serialize the index data using bincode 2.x serde compatibility
297        let encoded = bincode::serde::encode_to_vec(self, bincode::config::standard())
298            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
299
300        // Build versioned cache file: [magic][version][data]
301        let mut cache_data = Vec::with_capacity(8 + encoded.len());
302        cache_data.extend_from_slice(CACHE_MAGIC);
303        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
304        cache_data.extend_from_slice(&encoded);
305
306        // Write atomically: write to temp file then rename
307        let final_path = cache_dir.join(CACHE_FILE_NAME);
308        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
309
310        // Write to temp file
311        {
312            let mut file = fs::File::create(&temp_path)?;
313            file.write_all(&cache_data)?;
314            file.sync_all()?;
315        }
316
317        // Atomic rename
318        fs::rename(&temp_path, &final_path)?;
319
320        log::debug!(
321            "Saved workspace index to cache: {} files, {} bytes (format v{})",
322            self.files.len(),
323            cache_data.len(),
324            CACHE_FORMAT_VERSION
325        );
326
327        Ok(())
328    }
329
330    /// Load the workspace index from a cache file
331    ///
332    /// Returns `None` if:
333    /// - Cache file doesn't exist
334    /// - Magic header doesn't match
335    /// - Format version is incompatible
336    /// - Data is corrupted
337    #[cfg(feature = "native")]
338    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
339        use std::fs;
340
341        let path = cache_dir.join(CACHE_FILE_NAME);
342        let data = fs::read(&path).ok()?;
343
344        // Validate header: need at least 8 bytes for magic + version
345        if data.len() < 8 {
346            log::warn!("Workspace index cache too small, discarding");
347            let _ = fs::remove_file(&path);
348            return None;
349        }
350
351        // Check magic header
352        if &data[0..4] != CACHE_MAGIC {
353            log::warn!("Workspace index cache has invalid magic header, discarding");
354            let _ = fs::remove_file(&path);
355            return None;
356        }
357
358        // Check format version
359        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
360        if version != CACHE_FORMAT_VERSION {
361            log::info!(
362                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
363            );
364            let _ = fs::remove_file(&path);
365            return None;
366        }
367
368        // Deserialize the index data using bincode 2.x serde compatibility
369        match bincode::serde::decode_from_slice(&data[8..], bincode::config::standard()) {
370            Ok((index, _bytes_read)) => {
371                let index: Self = index;
372                log::debug!(
373                    "Loaded workspace index from cache: {} files (format v{})",
374                    index.files.len(),
375                    version
376                );
377                Some(index)
378            }
379            Err(e) => {
380                log::warn!("Failed to deserialize workspace index cache: {e}");
381                let _ = fs::remove_file(&path);
382                None
383            }
384        }
385    }
386
387    /// Remove a file as a source from all reverse dependency entries
388    ///
389    /// This removes the file from being listed as a dependent in all target entries.
390    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
391    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
392        for deps in self.reverse_deps.values_mut() {
393            deps.remove(path);
394        }
395        // Clean up empty entries
396        self.reverse_deps.retain(|_, deps| !deps.is_empty());
397    }
398
399    /// Remove a file completely from reverse dependency tracking
400    ///
401    /// Removes the file as both a source (dependent) and as a target.
402    /// Used when deleting a file from the index.
403    fn clear_reverse_deps_for(&mut self, path: &Path) {
404        // Remove as source (dependent)
405        self.clear_reverse_deps_as_source(path);
406
407        // Also remove as target
408        self.reverse_deps.remove(path);
409    }
410
411    /// Resolve a relative path from a source file to an absolute target path
412    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
413        // Get the directory containing the source file
414        let source_dir = source_file.parent().unwrap_or(Path::new(""));
415
416        // Join with the relative target and normalize
417        let target = source_dir.join(relative_target);
418
419        // Normalize the path (handle .., ., etc.)
420        Self::normalize_path(&target)
421    }
422
423    /// Normalize a path by resolving . and .. components
424    fn normalize_path(path: &Path) -> PathBuf {
425        let mut components = Vec::new();
426
427        for component in path.components() {
428            match component {
429                std::path::Component::ParentDir => {
430                    // Go up one level if possible
431                    if !components.is_empty() {
432                        components.pop();
433                    }
434                }
435                std::path::Component::CurDir => {
436                    // Skip current directory markers
437                }
438                _ => {
439                    components.push(component);
440                }
441            }
442        }
443
444        components.iter().collect()
445    }
446}
447
448impl FileIndex {
449    /// Create a new empty file index
450    pub fn new() -> Self {
451        Self::default()
452    }
453
454    /// Create a file index with the given content hash
455    pub fn with_hash(content_hash: String) -> Self {
456        Self {
457            content_hash,
458            ..Default::default()
459        }
460    }
461
462    /// Add a heading to the index
463    ///
464    /// Also updates the anchor lookup map for O(1) anchor queries
465    pub fn add_heading(&mut self, heading: HeadingIndex) {
466        let index = self.headings.len();
467
468        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
469        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
470
471        // Add custom anchor if present
472        if let Some(ref custom) = heading.custom_anchor {
473            self.anchor_to_heading.insert(custom.to_lowercase(), index);
474        }
475
476        self.headings.push(heading);
477    }
478
479    /// Check if an anchor exists in this file (O(1) lookup)
480    ///
481    /// Returns true if the anchor matches either an auto-generated or custom anchor.
482    /// Matching is case-insensitive.
483    pub fn has_anchor(&self, anchor: &str) -> bool {
484        self.anchor_to_heading.contains_key(&anchor.to_lowercase())
485    }
486
487    /// Get the heading index for an anchor (O(1) lookup)
488    ///
489    /// Returns the index into `self.headings` if found.
490    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
491        self.anchor_to_heading
492            .get(&anchor.to_lowercase())
493            .and_then(|&idx| self.headings.get(idx))
494    }
495
496    /// Add a reference link to the index
497    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
498        self.reference_links.push(link);
499    }
500
501    /// Check if a rule is disabled at a specific line
502    ///
503    /// Used by cross-file rules to respect inline disable directives.
504    /// Checks both file-wide disables and line-specific disables.
505    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
506        // Check file-wide disables (highest priority)
507        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
508            return true;
509        }
510
511        // Check line-specific disables
512        if let Some(rules) = self.line_disabled_rules.get(&line) {
513            return rules.contains("*") || rules.contains(rule_name);
514        }
515
516        false
517    }
518
519    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line, column)
520    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
521        // Deduplicate: multiple rules may contribute the same link
522        let is_duplicate = self.cross_file_links.iter().any(|existing| {
523            existing.target_path == link.target_path
524                && existing.fragment == link.fragment
525                && existing.line == link.line
526                && existing.column == link.column
527        });
528        if !is_duplicate {
529            self.cross_file_links.push(link);
530        }
531    }
532
533    /// Add a defined reference ID (e.g., from [ref]: url)
534    pub fn add_defined_reference(&mut self, ref_id: String) {
535        self.defined_references.insert(ref_id);
536    }
537
538    /// Check if a reference ID has an explicit definition
539    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
540        self.defined_references.contains(ref_id)
541    }
542
543    /// Check if the content hash matches
544    pub fn hash_matches(&self, hash: &str) -> bool {
545        self.content_hash == hash
546    }
547
548    /// Get the number of headings
549    pub fn heading_count(&self) -> usize {
550        self.headings.len()
551    }
552
553    /// Get the number of reference links
554    pub fn reference_link_count(&self) -> usize {
555        self.reference_links.len()
556    }
557}
558
559#[cfg(test)]
560mod tests {
561    use super::*;
562
563    #[test]
564    fn test_workspace_index_basic() {
565        let mut index = WorkspaceIndex::new();
566        assert_eq!(index.file_count(), 0);
567        assert_eq!(index.version(), 0);
568
569        let mut file_index = FileIndex::with_hash("abc123".to_string());
570        file_index.add_heading(HeadingIndex {
571            text: "Installation".to_string(),
572            auto_anchor: "installation".to_string(),
573            custom_anchor: None,
574            line: 1,
575        });
576
577        index.insert_file(PathBuf::from("docs/install.md"), file_index);
578        assert_eq!(index.file_count(), 1);
579        assert_eq!(index.version(), 1);
580
581        assert!(index.contains_file(Path::new("docs/install.md")));
582        assert!(!index.contains_file(Path::new("docs/other.md")));
583    }
584
585    #[test]
586    fn test_vulnerable_anchors() {
587        let mut index = WorkspaceIndex::new();
588
589        // File 1: heading without custom anchor (vulnerable)
590        let mut file1 = FileIndex::new();
591        file1.add_heading(HeadingIndex {
592            text: "Getting Started".to_string(),
593            auto_anchor: "getting-started".to_string(),
594            custom_anchor: None,
595            line: 1,
596        });
597        index.insert_file(PathBuf::from("docs/guide.md"), file1);
598
599        // File 2: heading with custom anchor (not vulnerable)
600        let mut file2 = FileIndex::new();
601        file2.add_heading(HeadingIndex {
602            text: "Installation".to_string(),
603            auto_anchor: "installation".to_string(),
604            custom_anchor: Some("install".to_string()),
605            line: 1,
606        });
607        index.insert_file(PathBuf::from("docs/install.md"), file2);
608
609        let vulnerable = index.get_vulnerable_anchors();
610        assert_eq!(vulnerable.len(), 1);
611        assert!(vulnerable.contains_key("getting-started"));
612        assert!(!vulnerable.contains_key("installation"));
613
614        let anchors = vulnerable.get("getting-started").unwrap();
615        assert_eq!(anchors.len(), 1);
616        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
617        assert_eq!(anchors[0].text, "Getting Started");
618    }
619
620    #[test]
621    fn test_vulnerable_anchors_multiple_files_same_anchor() {
622        // Multiple files can have headings with the same auto-generated anchor
623        // get_vulnerable_anchors() should collect all of them
624        let mut index = WorkspaceIndex::new();
625
626        // File 1: has "Installation" heading (vulnerable)
627        let mut file1 = FileIndex::new();
628        file1.add_heading(HeadingIndex {
629            text: "Installation".to_string(),
630            auto_anchor: "installation".to_string(),
631            custom_anchor: None,
632            line: 1,
633        });
634        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
635
636        // File 2: also has "Installation" heading with same anchor (vulnerable)
637        let mut file2 = FileIndex::new();
638        file2.add_heading(HeadingIndex {
639            text: "Installation".to_string(),
640            auto_anchor: "installation".to_string(),
641            custom_anchor: None,
642            line: 5,
643        });
644        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
645
646        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
647        let mut file3 = FileIndex::new();
648        file3.add_heading(HeadingIndex {
649            text: "Installation".to_string(),
650            auto_anchor: "installation".to_string(),
651            custom_anchor: Some("install".to_string()),
652            line: 10,
653        });
654        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
655
656        let vulnerable = index.get_vulnerable_anchors();
657        assert_eq!(vulnerable.len(), 1); // One unique anchor
658        assert!(vulnerable.contains_key("installation"));
659
660        let anchors = vulnerable.get("installation").unwrap();
661        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
662        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
663
664        // Verify both files are represented
665        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
666        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
667        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
668    }
669
670    #[test]
671    fn test_file_index_hash() {
672        let index = FileIndex::with_hash("hash123".to_string());
673        assert!(index.hash_matches("hash123"));
674        assert!(!index.hash_matches("other"));
675    }
676
677    #[test]
678    fn test_version_increment() {
679        let mut index = WorkspaceIndex::new();
680        assert_eq!(index.version(), 0);
681
682        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
683        assert_eq!(index.version(), 1);
684
685        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
686        assert_eq!(index.version(), 2);
687
688        index.remove_file(Path::new("a.md"));
689        assert_eq!(index.version(), 3);
690
691        // Removing non-existent file doesn't increment
692        index.remove_file(Path::new("nonexistent.md"));
693        assert_eq!(index.version(), 3);
694    }
695
696    #[test]
697    fn test_reverse_deps_basic() {
698        let mut index = WorkspaceIndex::new();
699
700        // File A links to file B
701        let mut file_a = FileIndex::new();
702        file_a.add_cross_file_link(CrossFileLinkIndex {
703            target_path: "b.md".to_string(),
704            fragment: "section".to_string(),
705            line: 10,
706            column: 5,
707        });
708        index.update_file(Path::new("docs/a.md"), file_a);
709
710        // Check that B has A as a dependent
711        let dependents = index.get_dependents(Path::new("docs/b.md"));
712        assert_eq!(dependents.len(), 1);
713        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
714
715        // A has no dependents
716        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
717        assert!(a_dependents.is_empty());
718    }
719
720    #[test]
721    fn test_reverse_deps_multiple() {
722        let mut index = WorkspaceIndex::new();
723
724        // Files A and C both link to B
725        let mut file_a = FileIndex::new();
726        file_a.add_cross_file_link(CrossFileLinkIndex {
727            target_path: "../b.md".to_string(),
728            fragment: "".to_string(),
729            line: 1,
730            column: 1,
731        });
732        index.update_file(Path::new("docs/sub/a.md"), file_a);
733
734        let mut file_c = FileIndex::new();
735        file_c.add_cross_file_link(CrossFileLinkIndex {
736            target_path: "b.md".to_string(),
737            fragment: "".to_string(),
738            line: 1,
739            column: 1,
740        });
741        index.update_file(Path::new("docs/c.md"), file_c);
742
743        // B should have both A and C as dependents
744        let dependents = index.get_dependents(Path::new("docs/b.md"));
745        assert_eq!(dependents.len(), 2);
746        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
747        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
748    }
749
750    #[test]
751    fn test_reverse_deps_update_clears_old() {
752        let mut index = WorkspaceIndex::new();
753
754        // File A initially links to B
755        let mut file_a = FileIndex::new();
756        file_a.add_cross_file_link(CrossFileLinkIndex {
757            target_path: "b.md".to_string(),
758            fragment: "".to_string(),
759            line: 1,
760            column: 1,
761        });
762        index.update_file(Path::new("docs/a.md"), file_a);
763
764        // Verify B has A as dependent
765        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
766
767        // Update A to link to C instead of B
768        let mut file_a_updated = FileIndex::new();
769        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
770            target_path: "c.md".to_string(),
771            fragment: "".to_string(),
772            line: 1,
773            column: 1,
774        });
775        index.update_file(Path::new("docs/a.md"), file_a_updated);
776
777        // B should no longer have A as dependent
778        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
779
780        // C should now have A as dependent
781        let c_deps = index.get_dependents(Path::new("docs/c.md"));
782        assert_eq!(c_deps.len(), 1);
783        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
784    }
785
786    #[test]
787    fn test_reverse_deps_remove_file() {
788        let mut index = WorkspaceIndex::new();
789
790        // File A links to B
791        let mut file_a = FileIndex::new();
792        file_a.add_cross_file_link(CrossFileLinkIndex {
793            target_path: "b.md".to_string(),
794            fragment: "".to_string(),
795            line: 1,
796            column: 1,
797        });
798        index.update_file(Path::new("docs/a.md"), file_a);
799
800        // Verify B has A as dependent
801        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
802
803        // Remove file A
804        index.remove_file(Path::new("docs/a.md"));
805
806        // B should no longer have any dependents
807        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
808    }
809
810    #[test]
811    fn test_normalize_path() {
812        // Test .. handling
813        let path = Path::new("docs/sub/../other.md");
814        let normalized = WorkspaceIndex::normalize_path(path);
815        assert_eq!(normalized, PathBuf::from("docs/other.md"));
816
817        // Test . handling
818        let path2 = Path::new("docs/./other.md");
819        let normalized2 = WorkspaceIndex::normalize_path(path2);
820        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
821
822        // Test multiple ..
823        let path3 = Path::new("a/b/c/../../d.md");
824        let normalized3 = WorkspaceIndex::normalize_path(path3);
825        assert_eq!(normalized3, PathBuf::from("a/d.md"));
826    }
827
828    #[test]
829    fn test_clear_clears_reverse_deps() {
830        let mut index = WorkspaceIndex::new();
831
832        // File A links to B
833        let mut file_a = FileIndex::new();
834        file_a.add_cross_file_link(CrossFileLinkIndex {
835            target_path: "b.md".to_string(),
836            fragment: "".to_string(),
837            line: 1,
838            column: 1,
839        });
840        index.update_file(Path::new("docs/a.md"), file_a);
841
842        // Verify B has A as dependent
843        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
844
845        // Clear the index
846        index.clear();
847
848        // Both files and reverse deps should be cleared
849        assert_eq!(index.file_count(), 0);
850        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
851    }
852
853    #[test]
854    fn test_is_file_stale() {
855        let mut index = WorkspaceIndex::new();
856
857        // Non-existent file is always stale
858        assert!(index.is_file_stale(Path::new("nonexistent.md"), "hash123"));
859
860        // Add a file with known hash
861        let file_index = FileIndex::with_hash("hash123".to_string());
862        index.insert_file(PathBuf::from("docs/test.md"), file_index);
863
864        // Same hash means not stale
865        assert!(!index.is_file_stale(Path::new("docs/test.md"), "hash123"));
866
867        // Different hash means stale
868        assert!(index.is_file_stale(Path::new("docs/test.md"), "different_hash"));
869    }
870
871    #[cfg(feature = "native")]
872    #[test]
873    fn test_cache_roundtrip() {
874        use std::fs;
875
876        // Create a temp directory
877        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_roundtrip");
878        let _ = fs::remove_dir_all(&temp_dir);
879        fs::create_dir_all(&temp_dir).unwrap();
880
881        // Create an index with some data
882        let mut index = WorkspaceIndex::new();
883
884        let mut file1 = FileIndex::with_hash("abc123".to_string());
885        file1.add_heading(HeadingIndex {
886            text: "Test Heading".to_string(),
887            auto_anchor: "test-heading".to_string(),
888            custom_anchor: Some("test".to_string()),
889            line: 1,
890        });
891        file1.add_cross_file_link(CrossFileLinkIndex {
892            target_path: "./other.md".to_string(),
893            fragment: "section".to_string(),
894            line: 5,
895            column: 3,
896        });
897        index.update_file(Path::new("docs/file1.md"), file1);
898
899        let mut file2 = FileIndex::with_hash("def456".to_string());
900        file2.add_heading(HeadingIndex {
901            text: "Another Heading".to_string(),
902            auto_anchor: "another-heading".to_string(),
903            custom_anchor: None,
904            line: 1,
905        });
906        index.update_file(Path::new("docs/other.md"), file2);
907
908        // Save to cache
909        index.save_to_cache(&temp_dir).expect("Failed to save cache");
910
911        // Verify cache file exists
912        assert!(temp_dir.join("workspace_index.bin").exists());
913
914        // Load from cache
915        let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");
916
917        // Verify data matches
918        assert_eq!(loaded.file_count(), 2);
919        assert!(loaded.contains_file(Path::new("docs/file1.md")));
920        assert!(loaded.contains_file(Path::new("docs/other.md")));
921
922        // Check file1 details
923        let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
924        assert_eq!(file1_loaded.content_hash, "abc123");
925        assert_eq!(file1_loaded.headings.len(), 1);
926        assert_eq!(file1_loaded.headings[0].text, "Test Heading");
927        assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
928        assert_eq!(file1_loaded.cross_file_links.len(), 1);
929        assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");
930
931        // Check reverse deps were serialized correctly
932        let dependents = loaded.get_dependents(Path::new("docs/other.md"));
933        assert_eq!(dependents.len(), 1);
934        assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));
935
936        // Clean up
937        let _ = fs::remove_dir_all(&temp_dir);
938    }
939
940    #[cfg(feature = "native")]
941    #[test]
942    fn test_cache_missing_file() {
943        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
944        let _ = std::fs::remove_dir_all(&temp_dir);
945
946        // Should return None for non-existent cache
947        let result = WorkspaceIndex::load_from_cache(&temp_dir);
948        assert!(result.is_none());
949    }
950
951    #[cfg(feature = "native")]
952    #[test]
953    fn test_cache_corrupted_file() {
954        use std::fs;
955
956        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_corrupted");
957        let _ = fs::remove_dir_all(&temp_dir);
958        fs::create_dir_all(&temp_dir).unwrap();
959
960        // Write corrupted data (too small for header)
961        fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();
962
963        // Should return None for corrupted cache (and remove the file)
964        let result = WorkspaceIndex::load_from_cache(&temp_dir);
965        assert!(result.is_none());
966
967        // Corrupted file should be removed
968        assert!(!temp_dir.join("workspace_index.bin").exists());
969
970        // Clean up
971        let _ = fs::remove_dir_all(&temp_dir);
972    }
973
974    #[cfg(feature = "native")]
975    #[test]
976    fn test_cache_invalid_magic() {
977        use std::fs;
978
979        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_invalid_magic");
980        let _ = fs::remove_dir_all(&temp_dir);
981        fs::create_dir_all(&temp_dir).unwrap();
982
983        // Write data with wrong magic header
984        let mut data = Vec::new();
985        data.extend_from_slice(b"XXXX"); // Wrong magic
986        data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
987        data.extend_from_slice(&[0; 100]); // Some garbage data
988        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
989
990        // Should return None for invalid magic
991        let result = WorkspaceIndex::load_from_cache(&temp_dir);
992        assert!(result.is_none());
993
994        // File should be removed
995        assert!(!temp_dir.join("workspace_index.bin").exists());
996
997        // Clean up
998        let _ = fs::remove_dir_all(&temp_dir);
999    }
1000
1001    #[cfg(feature = "native")]
1002    #[test]
1003    fn test_cache_version_mismatch() {
1004        use std::fs;
1005
1006        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_version_mismatch");
1007        let _ = fs::remove_dir_all(&temp_dir);
1008        fs::create_dir_all(&temp_dir).unwrap();
1009
1010        // Write data with correct magic but wrong version
1011        let mut data = Vec::new();
1012        data.extend_from_slice(b"RWSI"); // Correct magic
1013        data.extend_from_slice(&999u32.to_le_bytes()); // Future version
1014        data.extend_from_slice(&[0; 100]); // Some garbage data
1015        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1016
1017        // Should return None for version mismatch
1018        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1019        assert!(result.is_none());
1020
1021        // File should be removed to trigger rebuild
1022        assert!(!temp_dir.join("workspace_index.bin").exists());
1023
1024        // Clean up
1025        let _ = fs::remove_dir_all(&temp_dir);
1026    }
1027
1028    #[cfg(feature = "native")]
1029    #[test]
1030    fn test_cache_atomic_write() {
1031        use std::fs;
1032
1033        // Test that atomic writes work (no temp files left behind)
1034        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_atomic");
1035        let _ = fs::remove_dir_all(&temp_dir);
1036        fs::create_dir_all(&temp_dir).unwrap();
1037
1038        let index = WorkspaceIndex::new();
1039        index.save_to_cache(&temp_dir).expect("Failed to save");
1040
1041        // Only the final cache file should exist, no temp files
1042        let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
1043        assert_eq!(entries.len(), 1);
1044        assert!(temp_dir.join("workspace_index.bin").exists());
1045
1046        // Clean up
1047        let _ = fs::remove_dir_all(&temp_dir);
1048    }
1049
1050    #[test]
1051    fn test_has_anchor_auto_generated() {
1052        let mut file_index = FileIndex::new();
1053        file_index.add_heading(HeadingIndex {
1054            text: "Installation Guide".to_string(),
1055            auto_anchor: "installation-guide".to_string(),
1056            custom_anchor: None,
1057            line: 1,
1058        });
1059
1060        // Should find by auto-generated anchor
1061        assert!(file_index.has_anchor("installation-guide"));
1062
1063        // Case-insensitive matching
1064        assert!(file_index.has_anchor("Installation-Guide"));
1065        assert!(file_index.has_anchor("INSTALLATION-GUIDE"));
1066
1067        // Should not find non-existent anchor
1068        assert!(!file_index.has_anchor("nonexistent"));
1069    }
1070
1071    #[test]
1072    fn test_has_anchor_custom() {
1073        let mut file_index = FileIndex::new();
1074        file_index.add_heading(HeadingIndex {
1075            text: "Installation Guide".to_string(),
1076            auto_anchor: "installation-guide".to_string(),
1077            custom_anchor: Some("install".to_string()),
1078            line: 1,
1079        });
1080
1081        // Should find by auto-generated anchor
1082        assert!(file_index.has_anchor("installation-guide"));
1083
1084        // Should also find by custom anchor
1085        assert!(file_index.has_anchor("install"));
1086        assert!(file_index.has_anchor("Install")); // case-insensitive
1087
1088        // Should not find non-existent anchor
1089        assert!(!file_index.has_anchor("nonexistent"));
1090    }
1091
1092    #[test]
1093    fn test_get_heading_by_anchor() {
1094        let mut file_index = FileIndex::new();
1095        file_index.add_heading(HeadingIndex {
1096            text: "Installation Guide".to_string(),
1097            auto_anchor: "installation-guide".to_string(),
1098            custom_anchor: Some("install".to_string()),
1099            line: 10,
1100        });
1101        file_index.add_heading(HeadingIndex {
1102            text: "Configuration".to_string(),
1103            auto_anchor: "configuration".to_string(),
1104            custom_anchor: None,
1105            line: 20,
1106        });
1107
1108        // Get by auto anchor
1109        let heading = file_index.get_heading_by_anchor("installation-guide");
1110        assert!(heading.is_some());
1111        assert_eq!(heading.unwrap().text, "Installation Guide");
1112        assert_eq!(heading.unwrap().line, 10);
1113
1114        // Get by custom anchor
1115        let heading = file_index.get_heading_by_anchor("install");
1116        assert!(heading.is_some());
1117        assert_eq!(heading.unwrap().text, "Installation Guide");
1118
1119        // Get second heading
1120        let heading = file_index.get_heading_by_anchor("configuration");
1121        assert!(heading.is_some());
1122        assert_eq!(heading.unwrap().text, "Configuration");
1123        assert_eq!(heading.unwrap().line, 20);
1124
1125        // Non-existent
1126        assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
1127    }
1128
1129    #[test]
1130    fn test_anchor_lookup_many_headings() {
1131        // Test that O(1) lookup works with many headings
1132        let mut file_index = FileIndex::new();
1133
1134        // Add 100 headings
1135        for i in 0..100 {
1136            file_index.add_heading(HeadingIndex {
1137                text: format!("Heading {i}"),
1138                auto_anchor: format!("heading-{i}"),
1139                custom_anchor: Some(format!("h{i}")),
1140                line: i + 1,
1141            });
1142        }
1143
1144        // Verify all can be found
1145        for i in 0..100 {
1146            assert!(file_index.has_anchor(&format!("heading-{i}")));
1147            assert!(file_index.has_anchor(&format!("h{i}")));
1148
1149            let heading = file_index.get_heading_by_anchor(&format!("heading-{i}"));
1150            assert!(heading.is_some());
1151            assert_eq!(heading.unwrap().line, i + 1);
1152        }
1153    }
1154}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs