rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: bincode-serialized WorkspaceIndex]
19//! ```
20
21use serde::{Deserialize, Serialize};
22use std::collections::{HashMap, HashSet};
23use std::path::{Path, PathBuf};
24
25/// Magic bytes identifying a workspace index cache file
///
/// `save_to_cache` writes these four bytes at the start of the file and
/// `load_from_cache` discards any cache file that does not begin with them.
26#[cfg(feature = "native")]
27const CACHE_MAGIC: &[u8; 4] = b"RWSI";
28
29/// Cache format version - increment when WorkspaceIndex serialization changes
///
/// Stored as a little-endian `u32` directly after the magic bytes. When the
/// stored value differs from this constant, `load_from_cache` deletes the
/// cache file and returns `None`.
30#[cfg(feature = "native")]
31const CACHE_FORMAT_VERSION: u32 = 2;
32
33/// Cache file name within the version directory
///
/// Joined onto the `cache_dir` argument of `save_to_cache` / `load_from_cache`.
/// NOTE(review): presumably callers pass a version-specific directory - confirm.
34#[cfg(feature = "native")]
35const CACHE_FILE_NAME: &str = "workspace_index.bin";
36
37/// Workspace-wide index for cross-file analysis
38///
39/// Contains pre-extracted information from all markdown files in the workspace,
40/// enabling rules to validate cross-file references efficiently.
41#[derive(Debug, Default, Clone, Serialize, Deserialize)]
42pub struct WorkspaceIndex {
43    /// Map from file path to its extracted data
44    files: HashMap<PathBuf, FileIndex>,
45    /// Reverse dependency graph: target file → files that link to it
46    /// Used to efficiently re-lint dependent files when a target changes
47    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
48    /// Version counter for cache invalidation (incremented on any change)
49    version: u64,
50}
51
52/// Index data extracted from a single file
53#[derive(Debug, Clone, Default, Serialize, Deserialize)]
54pub struct FileIndex {
55    /// Headings in this file with their anchors
56    pub headings: Vec<HeadingIndex>,
57    /// Reference links in this file (for cross-file analysis)
58    pub reference_links: Vec<ReferenceLinkIndex>,
59    /// Cross-file links in this file (for MD051 cross-file validation)
60    pub cross_file_links: Vec<CrossFileLinkIndex>,
61    /// Defined reference IDs (e.g., from [ref]: url definitions)
62    /// Used to filter out reference links that have explicit definitions
63    pub defined_references: HashSet<String>,
64    /// Content hash for change detection
65    pub content_hash: String,
66    /// O(1) anchor lookup: lowercased anchor → heading index
67    /// Includes both auto-generated and custom anchors
68    anchor_to_heading: HashMap<String, usize>,
69}
70
71/// Information about a heading for cross-file lookup
72#[derive(Debug, Clone, Serialize, Deserialize)]
73pub struct HeadingIndex {
74    /// The heading text (e.g., "Installation Guide")
75    pub text: String,
76    /// Auto-generated anchor (e.g., "installation-guide")
77    pub auto_anchor: String,
78    /// Custom anchor if present (e.g., "install")
79    pub custom_anchor: Option<String>,
80    /// Line number (1-indexed)
81    pub line: usize,
82}
83
84/// Information about a reference link for cross-file analysis
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct ReferenceLinkIndex {
87    /// The reference ID (the part in [text][ref])
88    pub reference_id: String,
89    /// Line number (1-indexed)
90    pub line: usize,
91    /// Column number (1-indexed)
92    pub column: usize,
93}
94
95/// Information about a cross-file link for validation
96#[derive(Debug, Clone, Serialize, Deserialize)]
97pub struct CrossFileLinkIndex {
98    /// The target file path (relative, as it appears in the link)
99    pub target_path: String,
100    /// The fragment/anchor being linked to (without #)
101    pub fragment: String,
102    /// Line number (1-indexed)
103    pub line: usize,
104    /// Column number (1-indexed)
105    pub column: usize,
106}
107
108/// Information about a vulnerable anchor (heading without custom ID)
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct VulnerableAnchor {
111    /// File path where the heading is located
112    pub file: PathBuf,
113    /// Line number of the heading
114    pub line: usize,
115    /// The heading text
116    pub text: String,
117}
118
119impl WorkspaceIndex {
120    /// Create a new empty workspace index
121    pub fn new() -> Self {
122        Self::default()
123    }
124
125    /// Get the current version (for cache invalidation)
126    pub fn version(&self) -> u64 {
127        self.version
128    }
129
130    /// Get the number of indexed files
131    pub fn file_count(&self) -> usize {
132        self.files.len()
133    }
134
135    /// Check if a file is in the index
136    pub fn contains_file(&self, path: &Path) -> bool {
137        self.files.contains_key(path)
138    }
139
140    /// Get the index data for a specific file
141    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
142        self.files.get(path)
143    }
144
145    /// Insert or update a file's index data
146    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
147        self.files.insert(path, index);
148        self.version = self.version.wrapping_add(1);
149    }
150
151    /// Remove a file from the index
152    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
153        // Clean up reverse deps for this file
154        self.clear_reverse_deps_for(path);
155
156        let result = self.files.remove(path);
157        if result.is_some() {
158            self.version = self.version.wrapping_add(1);
159        }
160        result
161    }
162
163    /// Build a map of all "vulnerable" anchors across the workspace
164    ///
165    /// A vulnerable anchor is an auto-generated anchor for a heading that
166    /// does NOT have a custom anchor defined. These are problematic for
167    /// translated content because the anchor changes when the heading is translated.
168    ///
169    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
170    /// Multiple files can have headings with the same auto-generated anchor,
171    /// so we collect all occurrences.
172    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
173        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
174
175        for (file_path, file_index) in &self.files {
176            for heading in &file_index.headings {
177                // Only include headings WITHOUT custom anchors
178                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
179                    let anchor_key = heading.auto_anchor.to_lowercase();
180                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
181                        file: file_path.clone(),
182                        line: heading.line,
183                        text: heading.text.clone(),
184                    });
185                }
186            }
187        }
188
189        vulnerable
190    }
191
192    /// Get all headings across the workspace (for debugging/testing)
193    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
194        self.files
195            .iter()
196            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
197    }
198
199    /// Iterate over all files in the index
200    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
201        self.files.iter().map(|(p, i)| (p.as_path(), i))
202    }
203
204    /// Clear the entire index
205    pub fn clear(&mut self) {
206        self.files.clear();
207        self.reverse_deps.clear();
208        self.version = self.version.wrapping_add(1);
209    }
210
211    /// Update a file's index and maintain reverse dependencies
212    ///
213    /// This method:
214    /// 1. Removes this file as a source (dependent) from all reverse deps
215    /// 2. Inserts the new file index
216    /// 3. Builds new reverse deps from cross_file_links
217    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
218        // Remove this file as a source (dependent) from all target entries
219        // Note: We don't remove it as a target - other files may still link to it
220        self.clear_reverse_deps_as_source(path);
221
222        // Build new reverse deps from cross_file_links
223        for link in &index.cross_file_links {
224            let target = self.resolve_target_path(path, &link.target_path);
225            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
226        }
227
228        self.files.insert(path.to_path_buf(), index);
229        self.version = self.version.wrapping_add(1);
230    }
231
232    /// Get files that depend on (link to) the given file
233    ///
234    /// Returns a list of file paths that contain links targeting this file.
235    /// Used to re-lint dependent files when a target file changes.
236    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
237        self.reverse_deps
238            .get(path)
239            .map(|set| set.iter().cloned().collect())
240            .unwrap_or_default()
241    }
242
243    /// Check if a file needs re-indexing based on its content hash
244    ///
245    /// Returns `true` if the file is not in the index or has a different hash.
246    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
247        self.files
248            .get(path)
249            .map(|f| f.content_hash != current_hash)
250            .unwrap_or(true)
251    }
252
253    /// Retain only files that exist in the given set, removing deleted files
254    ///
255    /// This prunes stale entries from the cache for files that no longer exist.
256    /// Returns the number of files removed.
257    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
258        let before_count = self.files.len();
259
260        // Collect files to remove
261        let to_remove: Vec<PathBuf> = self
262            .files
263            .keys()
264            .filter(|path| !current_files.contains(*path))
265            .cloned()
266            .collect();
267
268        // Remove each file properly (clears reverse deps)
269        for path in &to_remove {
270            self.remove_file(path);
271        }
272
273        before_count - self.files.len()
274    }
275
276    /// Save the workspace index to a cache file
277    ///
278    /// Uses bincode for efficient binary serialization with:
279    /// - Magic header for file type validation
280    /// - Format version for compatibility detection
281    /// - Atomic writes (temp file + rename) to prevent corruption
282    #[cfg(feature = "native")]
283    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
284        use std::fs;
285        use std::io::Write;
286
287        // Ensure cache directory exists
288        fs::create_dir_all(cache_dir)?;
289
290        // Serialize the index data
291        let encoded = bincode::serialize(self)
292            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
293
294        // Build versioned cache file: [magic][version][data]
295        let mut cache_data = Vec::with_capacity(8 + encoded.len());
296        cache_data.extend_from_slice(CACHE_MAGIC);
297        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
298        cache_data.extend_from_slice(&encoded);
299
300        // Write atomically: write to temp file then rename
301        let final_path = cache_dir.join(CACHE_FILE_NAME);
302        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
303
304        // Write to temp file
305        {
306            let mut file = fs::File::create(&temp_path)?;
307            file.write_all(&cache_data)?;
308            file.sync_all()?;
309        }
310
311        // Atomic rename
312        fs::rename(&temp_path, &final_path)?;
313
314        log::debug!(
315            "Saved workspace index to cache: {} files, {} bytes (format v{})",
316            self.files.len(),
317            cache_data.len(),
318            CACHE_FORMAT_VERSION
319        );
320
321        Ok(())
322    }
323
324    /// Load the workspace index from a cache file
325    ///
326    /// Returns `None` if:
327    /// - Cache file doesn't exist
328    /// - Magic header doesn't match
329    /// - Format version is incompatible
330    /// - Data is corrupted
331    #[cfg(feature = "native")]
332    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
333        use std::fs;
334
335        let path = cache_dir.join(CACHE_FILE_NAME);
336        let data = fs::read(&path).ok()?;
337
338        // Validate header: need at least 8 bytes for magic + version
339        if data.len() < 8 {
340            log::warn!("Workspace index cache too small, discarding");
341            let _ = fs::remove_file(&path);
342            return None;
343        }
344
345        // Check magic header
346        if &data[0..4] != CACHE_MAGIC {
347            log::warn!("Workspace index cache has invalid magic header, discarding");
348            let _ = fs::remove_file(&path);
349            return None;
350        }
351
352        // Check format version
353        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
354        if version != CACHE_FORMAT_VERSION {
355            log::info!(
356                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
357            );
358            let _ = fs::remove_file(&path);
359            return None;
360        }
361
362        // Deserialize the index data
363        match bincode::deserialize(&data[8..]) {
364            Ok(index) => {
365                let index: Self = index;
366                log::debug!(
367                    "Loaded workspace index from cache: {} files (format v{})",
368                    index.files.len(),
369                    version
370                );
371                Some(index)
372            }
373            Err(e) => {
374                log::warn!("Failed to deserialize workspace index cache: {e}");
375                let _ = fs::remove_file(&path);
376                None
377            }
378        }
379    }
380
381    /// Remove a file as a source from all reverse dependency entries
382    ///
383    /// This removes the file from being listed as a dependent in all target entries.
384    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
385    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
386        for deps in self.reverse_deps.values_mut() {
387            deps.remove(path);
388        }
389        // Clean up empty entries
390        self.reverse_deps.retain(|_, deps| !deps.is_empty());
391    }
392
393    /// Remove a file completely from reverse dependency tracking
394    ///
395    /// Removes the file as both a source (dependent) and as a target.
396    /// Used when deleting a file from the index.
397    fn clear_reverse_deps_for(&mut self, path: &Path) {
398        // Remove as source (dependent)
399        self.clear_reverse_deps_as_source(path);
400
401        // Also remove as target
402        self.reverse_deps.remove(path);
403    }
404
405    /// Resolve a relative path from a source file to an absolute target path
406    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
407        // Get the directory containing the source file
408        let source_dir = source_file.parent().unwrap_or(Path::new(""));
409
410        // Join with the relative target and normalize
411        let target = source_dir.join(relative_target);
412
413        // Normalize the path (handle .., ., etc.)
414        Self::normalize_path(&target)
415    }
416
417    /// Normalize a path by resolving . and .. components
418    fn normalize_path(path: &Path) -> PathBuf {
419        let mut components = Vec::new();
420
421        for component in path.components() {
422            match component {
423                std::path::Component::ParentDir => {
424                    // Go up one level if possible
425                    if !components.is_empty() {
426                        components.pop();
427                    }
428                }
429                std::path::Component::CurDir => {
430                    // Skip current directory markers
431                }
432                _ => {
433                    components.push(component);
434                }
435            }
436        }
437
438        components.iter().collect()
439    }
440}
441
442impl FileIndex {
443    /// Create a new empty file index
444    pub fn new() -> Self {
445        Self::default()
446    }
447
448    /// Create a file index with the given content hash
449    pub fn with_hash(content_hash: String) -> Self {
450        Self {
451            content_hash,
452            ..Default::default()
453        }
454    }
455
456    /// Add a heading to the index
457    ///
458    /// Also updates the anchor lookup map for O(1) anchor queries
459    pub fn add_heading(&mut self, heading: HeadingIndex) {
460        let index = self.headings.len();
461
462        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
463        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
464
465        // Add custom anchor if present
466        if let Some(ref custom) = heading.custom_anchor {
467            self.anchor_to_heading.insert(custom.to_lowercase(), index);
468        }
469
470        self.headings.push(heading);
471    }
472
473    /// Check if an anchor exists in this file (O(1) lookup)
474    ///
475    /// Returns true if the anchor matches either an auto-generated or custom anchor.
476    /// Matching is case-insensitive.
477    pub fn has_anchor(&self, anchor: &str) -> bool {
478        self.anchor_to_heading.contains_key(&anchor.to_lowercase())
479    }
480
481    /// Get the heading index for an anchor (O(1) lookup)
482    ///
483    /// Returns the index into `self.headings` if found.
484    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
485        self.anchor_to_heading
486            .get(&anchor.to_lowercase())
487            .and_then(|&idx| self.headings.get(idx))
488    }
489
490    /// Add a reference link to the index
491    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
492        self.reference_links.push(link);
493    }
494
495    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line, column)
496    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
497        // Deduplicate: multiple rules may contribute the same link
498        let is_duplicate = self.cross_file_links.iter().any(|existing| {
499            existing.target_path == link.target_path
500                && existing.fragment == link.fragment
501                && existing.line == link.line
502                && existing.column == link.column
503        });
504        if !is_duplicate {
505            self.cross_file_links.push(link);
506        }
507    }
508
509    /// Add a defined reference ID (e.g., from [ref]: url)
510    pub fn add_defined_reference(&mut self, ref_id: String) {
511        self.defined_references.insert(ref_id);
512    }
513
514    /// Check if a reference ID has an explicit definition
515    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
516        self.defined_references.contains(ref_id)
517    }
518
519    /// Check if the content hash matches
520    pub fn hash_matches(&self, hash: &str) -> bool {
521        self.content_hash == hash
522    }
523
524    /// Get the number of headings
525    pub fn heading_count(&self) -> usize {
526        self.headings.len()
527    }
528
529    /// Get the number of reference links
530    pub fn reference_link_count(&self) -> usize {
531        self.reference_links.len()
532    }
533}
534
535#[cfg(test)]
536mod tests {
537    use super::*;
538
539    #[test]
540    fn test_workspace_index_basic() {
541        let mut index = WorkspaceIndex::new();
542        assert_eq!(index.file_count(), 0);
543        assert_eq!(index.version(), 0);
544
545        let mut file_index = FileIndex::with_hash("abc123".to_string());
546        file_index.add_heading(HeadingIndex {
547            text: "Installation".to_string(),
548            auto_anchor: "installation".to_string(),
549            custom_anchor: None,
550            line: 1,
551        });
552
553        index.insert_file(PathBuf::from("docs/install.md"), file_index);
554        assert_eq!(index.file_count(), 1);
555        assert_eq!(index.version(), 1);
556
557        assert!(index.contains_file(Path::new("docs/install.md")));
558        assert!(!index.contains_file(Path::new("docs/other.md")));
559    }
560
561    #[test]
562    fn test_vulnerable_anchors() {
563        let mut index = WorkspaceIndex::new();
564
565        // File 1: heading without custom anchor (vulnerable)
566        let mut file1 = FileIndex::new();
567        file1.add_heading(HeadingIndex {
568            text: "Getting Started".to_string(),
569            auto_anchor: "getting-started".to_string(),
570            custom_anchor: None,
571            line: 1,
572        });
573        index.insert_file(PathBuf::from("docs/guide.md"), file1);
574
575        // File 2: heading with custom anchor (not vulnerable)
576        let mut file2 = FileIndex::new();
577        file2.add_heading(HeadingIndex {
578            text: "Installation".to_string(),
579            auto_anchor: "installation".to_string(),
580            custom_anchor: Some("install".to_string()),
581            line: 1,
582        });
583        index.insert_file(PathBuf::from("docs/install.md"), file2);
584
585        let vulnerable = index.get_vulnerable_anchors();
586        assert_eq!(vulnerable.len(), 1);
587        assert!(vulnerable.contains_key("getting-started"));
588        assert!(!vulnerable.contains_key("installation"));
589
590        let anchors = vulnerable.get("getting-started").unwrap();
591        assert_eq!(anchors.len(), 1);
592        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
593        assert_eq!(anchors[0].text, "Getting Started");
594    }
595
596    #[test]
597    fn test_vulnerable_anchors_multiple_files_same_anchor() {
598        // Multiple files can have headings with the same auto-generated anchor
599        // get_vulnerable_anchors() should collect all of them
600        let mut index = WorkspaceIndex::new();
601
602        // File 1: has "Installation" heading (vulnerable)
603        let mut file1 = FileIndex::new();
604        file1.add_heading(HeadingIndex {
605            text: "Installation".to_string(),
606            auto_anchor: "installation".to_string(),
607            custom_anchor: None,
608            line: 1,
609        });
610        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
611
612        // File 2: also has "Installation" heading with same anchor (vulnerable)
613        let mut file2 = FileIndex::new();
614        file2.add_heading(HeadingIndex {
615            text: "Installation".to_string(),
616            auto_anchor: "installation".to_string(),
617            custom_anchor: None,
618            line: 5,
619        });
620        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
621
622        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
623        let mut file3 = FileIndex::new();
624        file3.add_heading(HeadingIndex {
625            text: "Installation".to_string(),
626            auto_anchor: "installation".to_string(),
627            custom_anchor: Some("install".to_string()),
628            line: 10,
629        });
630        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
631
632        let vulnerable = index.get_vulnerable_anchors();
633        assert_eq!(vulnerable.len(), 1); // One unique anchor
634        assert!(vulnerable.contains_key("installation"));
635
636        let anchors = vulnerable.get("installation").unwrap();
637        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
638        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
639
640        // Verify both files are represented
641        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
642        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
643        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
644    }
645
646    #[test]
647    fn test_file_index_hash() {
648        let index = FileIndex::with_hash("hash123".to_string());
649        assert!(index.hash_matches("hash123"));
650        assert!(!index.hash_matches("other"));
651    }
652
653    #[test]
654    fn test_version_increment() {
655        let mut index = WorkspaceIndex::new();
656        assert_eq!(index.version(), 0);
657
658        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
659        assert_eq!(index.version(), 1);
660
661        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
662        assert_eq!(index.version(), 2);
663
664        index.remove_file(Path::new("a.md"));
665        assert_eq!(index.version(), 3);
666
667        // Removing non-existent file doesn't increment
668        index.remove_file(Path::new("nonexistent.md"));
669        assert_eq!(index.version(), 3);
670    }
671
672    #[test]
673    fn test_reverse_deps_basic() {
674        let mut index = WorkspaceIndex::new();
675
676        // File A links to file B
677        let mut file_a = FileIndex::new();
678        file_a.add_cross_file_link(CrossFileLinkIndex {
679            target_path: "b.md".to_string(),
680            fragment: "section".to_string(),
681            line: 10,
682            column: 5,
683        });
684        index.update_file(Path::new("docs/a.md"), file_a);
685
686        // Check that B has A as a dependent
687        let dependents = index.get_dependents(Path::new("docs/b.md"));
688        assert_eq!(dependents.len(), 1);
689        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
690
691        // A has no dependents
692        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
693        assert!(a_dependents.is_empty());
694    }
695
696    #[test]
697    fn test_reverse_deps_multiple() {
698        let mut index = WorkspaceIndex::new();
699
700        // Files A and C both link to B
701        let mut file_a = FileIndex::new();
702        file_a.add_cross_file_link(CrossFileLinkIndex {
703            target_path: "../b.md".to_string(),
704            fragment: "".to_string(),
705            line: 1,
706            column: 1,
707        });
708        index.update_file(Path::new("docs/sub/a.md"), file_a);
709
710        let mut file_c = FileIndex::new();
711        file_c.add_cross_file_link(CrossFileLinkIndex {
712            target_path: "b.md".to_string(),
713            fragment: "".to_string(),
714            line: 1,
715            column: 1,
716        });
717        index.update_file(Path::new("docs/c.md"), file_c);
718
719        // B should have both A and C as dependents
720        let dependents = index.get_dependents(Path::new("docs/b.md"));
721        assert_eq!(dependents.len(), 2);
722        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
723        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
724    }
725
726    #[test]
727    fn test_reverse_deps_update_clears_old() {
728        let mut index = WorkspaceIndex::new();
729
730        // File A initially links to B
731        let mut file_a = FileIndex::new();
732        file_a.add_cross_file_link(CrossFileLinkIndex {
733            target_path: "b.md".to_string(),
734            fragment: "".to_string(),
735            line: 1,
736            column: 1,
737        });
738        index.update_file(Path::new("docs/a.md"), file_a);
739
740        // Verify B has A as dependent
741        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
742
743        // Update A to link to C instead of B
744        let mut file_a_updated = FileIndex::new();
745        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
746            target_path: "c.md".to_string(),
747            fragment: "".to_string(),
748            line: 1,
749            column: 1,
750        });
751        index.update_file(Path::new("docs/a.md"), file_a_updated);
752
753        // B should no longer have A as dependent
754        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
755
756        // C should now have A as dependent
757        let c_deps = index.get_dependents(Path::new("docs/c.md"));
758        assert_eq!(c_deps.len(), 1);
759        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
760    }
761
762    #[test]
763    fn test_reverse_deps_remove_file() {
764        let mut index = WorkspaceIndex::new();
765
766        // File A links to B
767        let mut file_a = FileIndex::new();
768        file_a.add_cross_file_link(CrossFileLinkIndex {
769            target_path: "b.md".to_string(),
770            fragment: "".to_string(),
771            line: 1,
772            column: 1,
773        });
774        index.update_file(Path::new("docs/a.md"), file_a);
775
776        // Verify B has A as dependent
777        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
778
779        // Remove file A
780        index.remove_file(Path::new("docs/a.md"));
781
782        // B should no longer have any dependents
783        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
784    }
785
786    #[test]
787    fn test_normalize_path() {
788        // Test .. handling
789        let path = Path::new("docs/sub/../other.md");
790        let normalized = WorkspaceIndex::normalize_path(path);
791        assert_eq!(normalized, PathBuf::from("docs/other.md"));
792
793        // Test . handling
794        let path2 = Path::new("docs/./other.md");
795        let normalized2 = WorkspaceIndex::normalize_path(path2);
796        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
797
798        // Test multiple ..
799        let path3 = Path::new("a/b/c/../../d.md");
800        let normalized3 = WorkspaceIndex::normalize_path(path3);
801        assert_eq!(normalized3, PathBuf::from("a/d.md"));
802    }
803
804    #[test]
805    fn test_clear_clears_reverse_deps() {
806        let mut index = WorkspaceIndex::new();
807
808        // File A links to B
809        let mut file_a = FileIndex::new();
810        file_a.add_cross_file_link(CrossFileLinkIndex {
811            target_path: "b.md".to_string(),
812            fragment: "".to_string(),
813            line: 1,
814            column: 1,
815        });
816        index.update_file(Path::new("docs/a.md"), file_a);
817
818        // Verify B has A as dependent
819        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
820
821        // Clear the index
822        index.clear();
823
824        // Both files and reverse deps should be cleared
825        assert_eq!(index.file_count(), 0);
826        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
827    }
828
/// Checks `is_file_stale` for an unknown path, a matching hash, and a differing hash.
#[test]
fn test_is_file_stale() {
    let tracked = Path::new("docs/test.md");
    let mut workspace = WorkspaceIndex::new();

    // A path that was never indexed is expected to be reported as stale.
    assert!(workspace.is_file_stale(Path::new("nonexistent.md"), "hash123"));

    // Index a file whose stored content hash is "hash123".
    workspace.insert_file(tracked.to_path_buf(), FileIndex::with_hash(String::from("hash123")));

    // Matching hash: fresh. Any other hash: stale.
    assert!(!workspace.is_file_stale(tracked, "hash123"));
    assert!(workspace.is_file_stale(tracked, "different_hash"));
}
846
/// Saves a populated index with `save_to_cache`, reloads it with `load_from_cache`,
/// and checks that files, headings, cross-file links and reverse dependencies
/// all survive the round trip.
#[cfg(feature = "native")]
#[test]
fn test_cache_roundtrip() {
    use std::fs;

    // The scratch directory is keyed by process id: a fixed name inside the shared
    // system temp dir can be clobbered by a concurrent test process, or be left
    // behind (and unwritable) by another user on the same machine.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_roundtrip_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    // Build an index with two files: file1 has a heading and a link to other.md,
    // other.md has a single heading.
    let mut index = WorkspaceIndex::new();

    let mut file1 = FileIndex::with_hash("abc123".to_string());
    file1.add_heading(HeadingIndex {
        text: "Test Heading".to_string(),
        auto_anchor: "test-heading".to_string(),
        custom_anchor: Some("test".to_string()),
        line: 1,
    });
    file1.add_cross_file_link(CrossFileLinkIndex {
        target_path: "./other.md".to_string(),
        fragment: "section".to_string(),
        line: 5,
        column: 3,
    });
    index.update_file(Path::new("docs/file1.md"), file1);

    let mut file2 = FileIndex::with_hash("def456".to_string());
    file2.add_heading(HeadingIndex {
        text: "Another Heading".to_string(),
        auto_anchor: "another-heading".to_string(),
        custom_anchor: None,
        line: 1,
    });
    index.update_file(Path::new("docs/other.md"), file2);

    // Persist the index and make sure the cache file was actually written.
    index.save_to_cache(&temp_dir).expect("Failed to save cache");
    assert!(temp_dir.join("workspace_index.bin").exists());

    // Read it back.
    let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");

    // Both files must be present after loading.
    assert_eq!(loaded.file_count(), 2);
    assert!(loaded.contains_file(Path::new("docs/file1.md")));
    assert!(loaded.contains_file(Path::new("docs/other.md")));

    // file1's hash, heading and link must match what was stored.
    let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
    assert_eq!(file1_loaded.content_hash, "abc123");
    assert_eq!(file1_loaded.headings.len(), 1);
    assert_eq!(file1_loaded.headings[0].text, "Test Heading");
    assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
    assert_eq!(file1_loaded.cross_file_links.len(), 1);
    assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");

    // The reverse dependency other.md <- file1.md must also have been persisted.
    let dependents = loaded.get_dependents(Path::new("docs/other.md"));
    assert_eq!(dependents.len(), 1);
    assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&temp_dir);
}
915
/// Checks that `load_from_cache` yields `None` when the cache directory does not exist.
#[cfg(feature = "native")]
#[test]
fn test_cache_missing_file() {
    let cache_dir = std::env::temp_dir().join("rumdl_test_cache_missing");

    // Make sure the directory is absent; the result is ignored because the
    // directory usually does not exist in the first place.
    let _ = std::fs::remove_dir_all(&cache_dir);

    assert!(WorkspaceIndex::load_from_cache(&cache_dir).is_none());
}
926
/// Checks that a cache file too short to hold the header makes `load_from_cache`
/// return `None`, and that the bad file is gone afterwards.
#[cfg(feature = "native")]
#[test]
fn test_cache_corrupted_file() {
    use std::fs;

    // The scratch directory is keyed by process id: a fixed name inside the shared
    // system temp dir can be clobbered by a concurrent test process, or be left
    // behind (and unwritable) by another user on the same machine.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_corrupted_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    // Three bytes: shorter than the 4-byte magic + 4-byte version header.
    fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();

    // Loading must fail softly ...
    let result = WorkspaceIndex::load_from_cache(&temp_dir);
    assert!(result.is_none());

    // ... and the corrupted file must have been removed.
    assert!(!temp_dir.join("workspace_index.bin").exists());

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&temp_dir);
}
949
/// Checks that a cache file whose first four bytes are not the expected magic makes
/// `load_from_cache` return `None`, and that the file is gone afterwards.
#[cfg(feature = "native")]
#[test]
fn test_cache_invalid_magic() {
    use std::fs;

    // The scratch directory is keyed by process id: a fixed name inside the shared
    // system temp dir can be clobbered by a concurrent test process, or be left
    // behind (and unwritable) by another user on the same machine.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_invalid_magic_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    // Header with the wrong magic, followed by a version field and filler bytes.
    let mut data = Vec::new();
    data.extend_from_slice(b"XXXX"); // Wrong magic
    data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
    data.extend_from_slice(&[0; 100]); // Some garbage data
    fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();

    // Loading must be rejected ...
    let result = WorkspaceIndex::load_from_cache(&temp_dir);
    assert!(result.is_none());

    // ... and the offending file must have been removed.
    assert!(!temp_dir.join("workspace_index.bin").exists());

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&temp_dir);
}
976
/// Checks that a cache file with the right magic but an unexpected format version
/// makes `load_from_cache` return `None`, and that the file is gone afterwards.
#[cfg(feature = "native")]
#[test]
fn test_cache_version_mismatch() {
    use std::fs;

    // The scratch directory is keyed by process id: a fixed name inside the shared
    // system temp dir can be clobbered by a concurrent test process, or be left
    // behind (and unwritable) by another user on the same machine.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_version_mismatch_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    // Valid magic, but a version number (999) that differs from the current one.
    let mut data = Vec::new();
    data.extend_from_slice(b"RWSI"); // Correct magic
    data.extend_from_slice(&999u32.to_le_bytes()); // Future version
    data.extend_from_slice(&[0; 100]); // Some garbage data
    fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();

    // Loading must be rejected ...
    let result = WorkspaceIndex::load_from_cache(&temp_dir);
    assert!(result.is_none());

    // ... and the file must have been removed so the next run rebuilds the cache.
    assert!(!temp_dir.join("workspace_index.bin").exists());

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&temp_dir);
}
1003
/// Checks that `save_to_cache` leaves exactly one entry in the cache directory,
/// the final cache file, with no leftover temporary files.
#[cfg(feature = "native")]
#[test]
fn test_cache_atomic_write() {
    use std::fs;

    // The scratch directory is keyed by process id: a fixed name inside the shared
    // system temp dir can be clobbered by a concurrent test process, or be left
    // behind (and unwritable) by another user on the same machine. This matters
    // here in particular because the test counts directory entries.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_atomic_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    let index = WorkspaceIndex::new();
    index.save_to_cache(&temp_dir).expect("Failed to save");

    // Only the final cache file should exist. Counting the iterator directly
    // avoids building a throwaway Vec of directory entries.
    let entry_count = fs::read_dir(&temp_dir).unwrap().count();
    assert_eq!(entry_count, 1);
    assert!(temp_dir.join("workspace_index.bin").exists());

    // Best-effort cleanup of the scratch directory.
    let _ = fs::remove_dir_all(&temp_dir);
}
1025
/// Checks that `has_anchor` finds a heading via its auto-generated anchor,
/// whatever the letter case of the query.
#[test]
fn test_has_anchor_auto_generated() {
    let mut headings = FileIndex::new();
    headings.add_heading(HeadingIndex {
        text: String::from("Installation Guide"),
        auto_anchor: String::from("installation-guide"),
        custom_anchor: None,
        line: 1,
    });

    // The exact anchor and two differently-cased spellings must all resolve.
    for anchor in ["installation-guide", "Installation-Guide", "INSTALLATION-GUIDE"] {
        assert!(headings.has_anchor(anchor), "anchor {anchor:?} should be found");
    }

    // An anchor that was never registered must not resolve.
    assert!(!headings.has_anchor("nonexistent"));
}
1046
/// Checks that `has_anchor` accepts both the auto-generated and the custom anchor
/// of a heading that defines a custom one.
#[test]
fn test_has_anchor_custom() {
    let heading = HeadingIndex {
        text: String::from("Installation Guide"),
        auto_anchor: String::from("installation-guide"),
        custom_anchor: Some(String::from("install")),
        line: 1,
    };

    let mut headings = FileIndex::new();
    headings.add_heading(heading);

    // The auto-generated anchor keeps working alongside the custom one.
    assert!(headings.has_anchor("installation-guide"));

    // The custom anchor resolves too, including with different letter case.
    assert!(headings.has_anchor("install"));
    assert!(headings.has_anchor("Install"));

    // Unknown anchors are rejected.
    assert!(!headings.has_anchor("nonexistent"));
}
1067
/// Checks that `get_heading_by_anchor` returns the matching heading for auto and
/// custom anchors, and `None` for an unknown anchor.
#[test]
fn test_get_heading_by_anchor() {
    let mut headings = FileIndex::new();
    headings.add_heading(HeadingIndex {
        text: String::from("Installation Guide"),
        auto_anchor: String::from("installation-guide"),
        custom_anchor: Some(String::from("install")),
        line: 10,
    });
    headings.add_heading(HeadingIndex {
        text: String::from("Configuration"),
        auto_anchor: String::from("configuration"),
        custom_anchor: None,
        line: 20,
    });

    // Lookup through the auto-generated anchor of the first heading.
    let by_auto = headings.get_heading_by_anchor("installation-guide");
    assert_eq!(
        by_auto.map(|h| (h.text.as_str(), h.line)),
        Some(("Installation Guide", 10))
    );

    // Lookup through the custom anchor lands on the same heading.
    let by_custom = headings.get_heading_by_anchor("install");
    assert_eq!(by_custom.map(|h| h.text.as_str()), Some("Installation Guide"));

    // Lookup of the second heading.
    let second = headings.get_heading_by_anchor("configuration");
    assert_eq!(
        second.map(|h| (h.text.as_str(), h.line)),
        Some(("Configuration", 20))
    );

    // Unknown anchors produce no heading.
    assert!(headings.get_heading_by_anchor("nonexistent").is_none());
}
1104
/// Exercises anchor lookup on a file index holding 100 headings, each with both
/// an auto-generated and a custom anchor.
#[test]
fn test_anchor_lookup_many_headings() {
    let mut headings = FileIndex::new();

    // Register headings 0..100; heading `n` sits on line `n + 1`.
    for n in 0..100 {
        headings.add_heading(HeadingIndex {
            text: format!("Heading {n}"),
            auto_anchor: format!("heading-{n}"),
            custom_anchor: Some(format!("h{n}")),
            line: n + 1,
        });
    }

    // Every heading must be reachable through both of its anchors, and the
    // heading found via the auto anchor must report the expected line.
    for n in 0..100 {
        let auto_anchor = format!("heading-{n}");
        let custom_anchor = format!("h{n}");

        assert!(headings.has_anchor(&auto_anchor));
        assert!(headings.has_anchor(&custom_anchor));

        let found = headings.get_heading_by_anchor(&auto_anchor);
        assert_eq!(found.map(|h| h.line), Some(n + 1));
    }
}
1130}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs