rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: bincode-serialized WorkspaceIndex]
19//! ```
20
21use serde::{Deserialize, Serialize};
22use std::collections::{HashMap, HashSet};
23use std::path::{Path, PathBuf};
24
/// Four-byte signature placed at the very start of a workspace index cache file
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// On-disk format revision; bump it whenever the serialized layout of
/// `WorkspaceIndex` changes so that older cache files are rejected
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 4;

/// File name of the cache, joined onto the cache directory when saving/loading
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
36
37/// Workspace-wide index for cross-file analysis
38///
39/// Contains pre-extracted information from all markdown files in the workspace,
40/// enabling rules to validate cross-file references efficiently.
41#[derive(Debug, Default, Clone, Serialize, Deserialize)]
42pub struct WorkspaceIndex {
43    /// Map from file path to its extracted data
44    files: HashMap<PathBuf, FileIndex>,
45    /// Reverse dependency graph: target file → files that link to it
46    /// Used to efficiently re-lint dependent files when a target changes
47    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
48    /// Version counter for cache invalidation (incremented on any change)
49    version: u64,
50}
51
52/// Index data extracted from a single file
53#[derive(Debug, Clone, Default, Serialize, Deserialize)]
54pub struct FileIndex {
55    /// Headings in this file with their anchors
56    pub headings: Vec<HeadingIndex>,
57    /// Reference links in this file (for cross-file analysis)
58    pub reference_links: Vec<ReferenceLinkIndex>,
59    /// Cross-file links in this file (for MD051 cross-file validation)
60    pub cross_file_links: Vec<CrossFileLinkIndex>,
61    /// Defined reference IDs (e.g., from [ref]: url definitions)
62    /// Used to filter out reference links that have explicit definitions
63    pub defined_references: HashSet<String>,
64    /// Content hash for change detection
65    pub content_hash: String,
66    /// O(1) anchor lookup: lowercased anchor → heading index
67    /// Includes both auto-generated and custom anchors
68    anchor_to_heading: HashMap<String, usize>,
69    /// HTML anchors defined via <a id="..."> or <element id="..."> tags
70    /// Stored lowercase for case-insensitive matching
71    html_anchors: HashSet<String>,
72    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list)
73    /// Can appear on any element, not just headings
74    /// Stored lowercase for case-insensitive matching
75    attribute_anchors: HashSet<String>,
76    /// Rules disabled for the entire file (from inline comments)
77    /// Used by cross-file rules to respect inline disable directives
78    pub file_disabled_rules: HashSet<String>,
79    /// Rules disabled at specific lines (line number -> set of rule names)
80    /// Merges both persistent disables and line-specific disables
81    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
82}
83
84/// Information about a heading for cross-file lookup
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct HeadingIndex {
87    /// The heading text (e.g., "Installation Guide")
88    pub text: String,
89    /// Auto-generated anchor (e.g., "installation-guide")
90    pub auto_anchor: String,
91    /// Custom anchor if present (e.g., "install")
92    pub custom_anchor: Option<String>,
93    /// Line number (1-indexed)
94    pub line: usize,
95}
96
97/// Information about a reference link for cross-file analysis
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct ReferenceLinkIndex {
100    /// The reference ID (the part in [text][ref])
101    pub reference_id: String,
102    /// Line number (1-indexed)
103    pub line: usize,
104    /// Column number (1-indexed)
105    pub column: usize,
106}
107
108/// Information about a cross-file link for validation
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct CrossFileLinkIndex {
111    /// The target file path (relative, as it appears in the link)
112    pub target_path: String,
113    /// The fragment/anchor being linked to (without #)
114    pub fragment: String,
115    /// Line number (1-indexed)
116    pub line: usize,
117    /// Column number (1-indexed)
118    pub column: usize,
119}
120
121/// Information about a vulnerable anchor (heading without custom ID)
122#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct VulnerableAnchor {
124    /// File path where the heading is located
125    pub file: PathBuf,
126    /// Line number of the heading
127    pub line: usize,
128    /// The heading text
129    pub text: String,
130}
131
132impl WorkspaceIndex {
133    /// Create a new empty workspace index
134    pub fn new() -> Self {
135        Self::default()
136    }
137
138    /// Get the current version (for cache invalidation)
139    pub fn version(&self) -> u64 {
140        self.version
141    }
142
143    /// Get the number of indexed files
144    pub fn file_count(&self) -> usize {
145        self.files.len()
146    }
147
148    /// Check if a file is in the index
149    pub fn contains_file(&self, path: &Path) -> bool {
150        self.files.contains_key(path)
151    }
152
153    /// Get the index data for a specific file
154    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
155        self.files.get(path)
156    }
157
158    /// Insert or update a file's index data
159    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
160        self.files.insert(path, index);
161        self.version = self.version.wrapping_add(1);
162    }
163
164    /// Remove a file from the index
165    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
166        // Clean up reverse deps for this file
167        self.clear_reverse_deps_for(path);
168
169        let result = self.files.remove(path);
170        if result.is_some() {
171            self.version = self.version.wrapping_add(1);
172        }
173        result
174    }
175
176    /// Build a map of all "vulnerable" anchors across the workspace
177    ///
178    /// A vulnerable anchor is an auto-generated anchor for a heading that
179    /// does NOT have a custom anchor defined. These are problematic for
180    /// translated content because the anchor changes when the heading is translated.
181    ///
182    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
183    /// Multiple files can have headings with the same auto-generated anchor,
184    /// so we collect all occurrences.
185    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
186        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
187
188        for (file_path, file_index) in &self.files {
189            for heading in &file_index.headings {
190                // Only include headings WITHOUT custom anchors
191                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
192                    let anchor_key = heading.auto_anchor.to_lowercase();
193                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
194                        file: file_path.clone(),
195                        line: heading.line,
196                        text: heading.text.clone(),
197                    });
198                }
199            }
200        }
201
202        vulnerable
203    }
204
205    /// Get all headings across the workspace (for debugging/testing)
206    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
207        self.files
208            .iter()
209            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
210    }
211
212    /// Iterate over all files in the index
213    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
214        self.files.iter().map(|(p, i)| (p.as_path(), i))
215    }
216
217    /// Clear the entire index
218    pub fn clear(&mut self) {
219        self.files.clear();
220        self.reverse_deps.clear();
221        self.version = self.version.wrapping_add(1);
222    }
223
224    /// Update a file's index and maintain reverse dependencies
225    ///
226    /// This method:
227    /// 1. Removes this file as a source (dependent) from all reverse deps
228    /// 2. Inserts the new file index
229    /// 3. Builds new reverse deps from cross_file_links
230    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
231        // Remove this file as a source (dependent) from all target entries
232        // Note: We don't remove it as a target - other files may still link to it
233        self.clear_reverse_deps_as_source(path);
234
235        // Build new reverse deps from cross_file_links
236        for link in &index.cross_file_links {
237            let target = self.resolve_target_path(path, &link.target_path);
238            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
239        }
240
241        self.files.insert(path.to_path_buf(), index);
242        self.version = self.version.wrapping_add(1);
243    }
244
245    /// Get files that depend on (link to) the given file
246    ///
247    /// Returns a list of file paths that contain links targeting this file.
248    /// Used to re-lint dependent files when a target file changes.
249    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
250        self.reverse_deps
251            .get(path)
252            .map(|set| set.iter().cloned().collect())
253            .unwrap_or_default()
254    }
255
256    /// Check if a file needs re-indexing based on its content hash
257    ///
258    /// Returns `true` if the file is not in the index or has a different hash.
259    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
260        self.files
261            .get(path)
262            .map(|f| f.content_hash != current_hash)
263            .unwrap_or(true)
264    }
265
266    /// Retain only files that exist in the given set, removing deleted files
267    ///
268    /// This prunes stale entries from the cache for files that no longer exist.
269    /// Returns the number of files removed.
270    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
271        let before_count = self.files.len();
272
273        // Collect files to remove
274        let to_remove: Vec<PathBuf> = self
275            .files
276            .keys()
277            .filter(|path| !current_files.contains(*path))
278            .cloned()
279            .collect();
280
281        // Remove each file properly (clears reverse deps)
282        for path in &to_remove {
283            self.remove_file(path);
284        }
285
286        before_count - self.files.len()
287    }
288
289    /// Save the workspace index to a cache file
290    ///
291    /// Uses bincode for efficient binary serialization with:
292    /// - Magic header for file type validation
293    /// - Format version for compatibility detection
294    /// - Atomic writes (temp file + rename) to prevent corruption
295    #[cfg(feature = "native")]
296    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
297        use std::fs;
298        use std::io::Write;
299
300        // Ensure cache directory exists
301        fs::create_dir_all(cache_dir)?;
302
303        // Serialize the index data using bincode 2.x serde compatibility
304        let encoded = bincode::serde::encode_to_vec(self, bincode::config::standard())
305            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
306
307        // Build versioned cache file: [magic][version][data]
308        let mut cache_data = Vec::with_capacity(8 + encoded.len());
309        cache_data.extend_from_slice(CACHE_MAGIC);
310        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
311        cache_data.extend_from_slice(&encoded);
312
313        // Write atomically: write to temp file then rename
314        let final_path = cache_dir.join(CACHE_FILE_NAME);
315        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
316
317        // Write to temp file
318        {
319            let mut file = fs::File::create(&temp_path)?;
320            file.write_all(&cache_data)?;
321            file.sync_all()?;
322        }
323
324        // Atomic rename
325        fs::rename(&temp_path, &final_path)?;
326
327        log::debug!(
328            "Saved workspace index to cache: {} files, {} bytes (format v{})",
329            self.files.len(),
330            cache_data.len(),
331            CACHE_FORMAT_VERSION
332        );
333
334        Ok(())
335    }
336
337    /// Load the workspace index from a cache file
338    ///
339    /// Returns `None` if:
340    /// - Cache file doesn't exist
341    /// - Magic header doesn't match
342    /// - Format version is incompatible
343    /// - Data is corrupted
344    #[cfg(feature = "native")]
345    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
346        use std::fs;
347
348        let path = cache_dir.join(CACHE_FILE_NAME);
349        let data = fs::read(&path).ok()?;
350
351        // Validate header: need at least 8 bytes for magic + version
352        if data.len() < 8 {
353            log::warn!("Workspace index cache too small, discarding");
354            let _ = fs::remove_file(&path);
355            return None;
356        }
357
358        // Check magic header
359        if &data[0..4] != CACHE_MAGIC {
360            log::warn!("Workspace index cache has invalid magic header, discarding");
361            let _ = fs::remove_file(&path);
362            return None;
363        }
364
365        // Check format version
366        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
367        if version != CACHE_FORMAT_VERSION {
368            log::info!(
369                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
370            );
371            let _ = fs::remove_file(&path);
372            return None;
373        }
374
375        // Deserialize the index data using bincode 2.x serde compatibility
376        match bincode::serde::decode_from_slice(&data[8..], bincode::config::standard()) {
377            Ok((index, _bytes_read)) => {
378                let index: Self = index;
379                log::debug!(
380                    "Loaded workspace index from cache: {} files (format v{})",
381                    index.files.len(),
382                    version
383                );
384                Some(index)
385            }
386            Err(e) => {
387                log::warn!("Failed to deserialize workspace index cache: {e}");
388                let _ = fs::remove_file(&path);
389                None
390            }
391        }
392    }
393
394    /// Remove a file as a source from all reverse dependency entries
395    ///
396    /// This removes the file from being listed as a dependent in all target entries.
397    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
398    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
399        for deps in self.reverse_deps.values_mut() {
400            deps.remove(path);
401        }
402        // Clean up empty entries
403        self.reverse_deps.retain(|_, deps| !deps.is_empty());
404    }
405
406    /// Remove a file completely from reverse dependency tracking
407    ///
408    /// Removes the file as both a source (dependent) and as a target.
409    /// Used when deleting a file from the index.
410    fn clear_reverse_deps_for(&mut self, path: &Path) {
411        // Remove as source (dependent)
412        self.clear_reverse_deps_as_source(path);
413
414        // Also remove as target
415        self.reverse_deps.remove(path);
416    }
417
418    /// Resolve a relative path from a source file to an absolute target path
419    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
420        // Get the directory containing the source file
421        let source_dir = source_file.parent().unwrap_or(Path::new(""));
422
423        // Join with the relative target and normalize
424        let target = source_dir.join(relative_target);
425
426        // Normalize the path (handle .., ., etc.)
427        Self::normalize_path(&target)
428    }
429
430    /// Normalize a path by resolving . and .. components
431    fn normalize_path(path: &Path) -> PathBuf {
432        let mut components = Vec::new();
433
434        for component in path.components() {
435            match component {
436                std::path::Component::ParentDir => {
437                    // Go up one level if possible
438                    if !components.is_empty() {
439                        components.pop();
440                    }
441                }
442                std::path::Component::CurDir => {
443                    // Skip current directory markers
444                }
445                _ => {
446                    components.push(component);
447                }
448            }
449        }
450
451        components.iter().collect()
452    }
453}
454
455impl FileIndex {
456    /// Create a new empty file index
457    pub fn new() -> Self {
458        Self::default()
459    }
460
461    /// Create a file index with the given content hash
462    pub fn with_hash(content_hash: String) -> Self {
463        Self {
464            content_hash,
465            ..Default::default()
466        }
467    }
468
469    /// Add a heading to the index
470    ///
471    /// Also updates the anchor lookup map for O(1) anchor queries
472    pub fn add_heading(&mut self, heading: HeadingIndex) {
473        let index = self.headings.len();
474
475        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
476        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
477
478        // Add custom anchor if present
479        if let Some(ref custom) = heading.custom_anchor {
480            self.anchor_to_heading.insert(custom.to_lowercase(), index);
481        }
482
483        self.headings.push(heading);
484    }
485
486    /// Check if an anchor exists in this file (O(1) lookup)
487    ///
488    /// Returns true if the anchor matches any of:
489    /// - Auto-generated heading anchors
490    /// - Custom heading anchors (from {#id} syntax on headings)
491    /// - HTML anchors (from <a id="..."> or <element id="...">)
492    /// - Attribute anchors (from { #id } syntax on non-heading elements)
493    ///
494    /// Matching is case-insensitive.
495    pub fn has_anchor(&self, anchor: &str) -> bool {
496        let lower = anchor.to_lowercase();
497        self.anchor_to_heading.contains_key(&lower)
498            || self.html_anchors.contains(&lower)
499            || self.attribute_anchors.contains(&lower)
500    }
501
502    /// Add an HTML anchor (from <a id="..."> or <element id="..."> tags)
503    pub fn add_html_anchor(&mut self, anchor: String) {
504        if !anchor.is_empty() {
505            self.html_anchors.insert(anchor.to_lowercase());
506        }
507    }
508
509    /// Add an attribute anchor (from { #id } syntax on non-heading elements)
510    pub fn add_attribute_anchor(&mut self, anchor: String) {
511        if !anchor.is_empty() {
512            self.attribute_anchors.insert(anchor.to_lowercase());
513        }
514    }
515
516    /// Get the heading index for an anchor (O(1) lookup)
517    ///
518    /// Returns the index into `self.headings` if found.
519    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
520        self.anchor_to_heading
521            .get(&anchor.to_lowercase())
522            .and_then(|&idx| self.headings.get(idx))
523    }
524
525    /// Add a reference link to the index
526    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
527        self.reference_links.push(link);
528    }
529
530    /// Check if a rule is disabled at a specific line
531    ///
532    /// Used by cross-file rules to respect inline disable directives.
533    /// Checks both file-wide disables and line-specific disables.
534    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
535        // Check file-wide disables (highest priority)
536        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
537            return true;
538        }
539
540        // Check line-specific disables
541        if let Some(rules) = self.line_disabled_rules.get(&line) {
542            return rules.contains("*") || rules.contains(rule_name);
543        }
544
545        false
546    }
547
548    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line, column)
549    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
550        // Deduplicate: multiple rules may contribute the same link
551        let is_duplicate = self.cross_file_links.iter().any(|existing| {
552            existing.target_path == link.target_path
553                && existing.fragment == link.fragment
554                && existing.line == link.line
555                && existing.column == link.column
556        });
557        if !is_duplicate {
558            self.cross_file_links.push(link);
559        }
560    }
561
562    /// Add a defined reference ID (e.g., from [ref]: url)
563    pub fn add_defined_reference(&mut self, ref_id: String) {
564        self.defined_references.insert(ref_id);
565    }
566
567    /// Check if a reference ID has an explicit definition
568    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
569        self.defined_references.contains(ref_id)
570    }
571
572    /// Check if the content hash matches
573    pub fn hash_matches(&self, hash: &str) -> bool {
574        self.content_hash == hash
575    }
576
577    /// Get the number of headings
578    pub fn heading_count(&self) -> usize {
579        self.headings.len()
580    }
581
582    /// Get the number of reference links
583    pub fn reference_link_count(&self) -> usize {
584        self.reference_links.len()
585    }
586}
587
588#[cfg(test)]
589mod tests {
590    use super::*;
591
592    #[test]
593    fn test_workspace_index_basic() {
594        let mut index = WorkspaceIndex::new();
595        assert_eq!(index.file_count(), 0);
596        assert_eq!(index.version(), 0);
597
598        let mut file_index = FileIndex::with_hash("abc123".to_string());
599        file_index.add_heading(HeadingIndex {
600            text: "Installation".to_string(),
601            auto_anchor: "installation".to_string(),
602            custom_anchor: None,
603            line: 1,
604        });
605
606        index.insert_file(PathBuf::from("docs/install.md"), file_index);
607        assert_eq!(index.file_count(), 1);
608        assert_eq!(index.version(), 1);
609
610        assert!(index.contains_file(Path::new("docs/install.md")));
611        assert!(!index.contains_file(Path::new("docs/other.md")));
612    }
613
614    #[test]
615    fn test_vulnerable_anchors() {
616        let mut index = WorkspaceIndex::new();
617
618        // File 1: heading without custom anchor (vulnerable)
619        let mut file1 = FileIndex::new();
620        file1.add_heading(HeadingIndex {
621            text: "Getting Started".to_string(),
622            auto_anchor: "getting-started".to_string(),
623            custom_anchor: None,
624            line: 1,
625        });
626        index.insert_file(PathBuf::from("docs/guide.md"), file1);
627
628        // File 2: heading with custom anchor (not vulnerable)
629        let mut file2 = FileIndex::new();
630        file2.add_heading(HeadingIndex {
631            text: "Installation".to_string(),
632            auto_anchor: "installation".to_string(),
633            custom_anchor: Some("install".to_string()),
634            line: 1,
635        });
636        index.insert_file(PathBuf::from("docs/install.md"), file2);
637
638        let vulnerable = index.get_vulnerable_anchors();
639        assert_eq!(vulnerable.len(), 1);
640        assert!(vulnerable.contains_key("getting-started"));
641        assert!(!vulnerable.contains_key("installation"));
642
643        let anchors = vulnerable.get("getting-started").unwrap();
644        assert_eq!(anchors.len(), 1);
645        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
646        assert_eq!(anchors[0].text, "Getting Started");
647    }
648
649    #[test]
650    fn test_vulnerable_anchors_multiple_files_same_anchor() {
651        // Multiple files can have headings with the same auto-generated anchor
652        // get_vulnerable_anchors() should collect all of them
653        let mut index = WorkspaceIndex::new();
654
655        // File 1: has "Installation" heading (vulnerable)
656        let mut file1 = FileIndex::new();
657        file1.add_heading(HeadingIndex {
658            text: "Installation".to_string(),
659            auto_anchor: "installation".to_string(),
660            custom_anchor: None,
661            line: 1,
662        });
663        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
664
665        // File 2: also has "Installation" heading with same anchor (vulnerable)
666        let mut file2 = FileIndex::new();
667        file2.add_heading(HeadingIndex {
668            text: "Installation".to_string(),
669            auto_anchor: "installation".to_string(),
670            custom_anchor: None,
671            line: 5,
672        });
673        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
674
675        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
676        let mut file3 = FileIndex::new();
677        file3.add_heading(HeadingIndex {
678            text: "Installation".to_string(),
679            auto_anchor: "installation".to_string(),
680            custom_anchor: Some("install".to_string()),
681            line: 10,
682        });
683        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
684
685        let vulnerable = index.get_vulnerable_anchors();
686        assert_eq!(vulnerable.len(), 1); // One unique anchor
687        assert!(vulnerable.contains_key("installation"));
688
689        let anchors = vulnerable.get("installation").unwrap();
690        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
691        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
692
693        // Verify both files are represented
694        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
695        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
696        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
697    }
698
699    #[test]
700    fn test_file_index_hash() {
701        let index = FileIndex::with_hash("hash123".to_string());
702        assert!(index.hash_matches("hash123"));
703        assert!(!index.hash_matches("other"));
704    }
705
706    #[test]
707    fn test_version_increment() {
708        let mut index = WorkspaceIndex::new();
709        assert_eq!(index.version(), 0);
710
711        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
712        assert_eq!(index.version(), 1);
713
714        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
715        assert_eq!(index.version(), 2);
716
717        index.remove_file(Path::new("a.md"));
718        assert_eq!(index.version(), 3);
719
720        // Removing non-existent file doesn't increment
721        index.remove_file(Path::new("nonexistent.md"));
722        assert_eq!(index.version(), 3);
723    }
724
725    #[test]
726    fn test_reverse_deps_basic() {
727        let mut index = WorkspaceIndex::new();
728
729        // File A links to file B
730        let mut file_a = FileIndex::new();
731        file_a.add_cross_file_link(CrossFileLinkIndex {
732            target_path: "b.md".to_string(),
733            fragment: "section".to_string(),
734            line: 10,
735            column: 5,
736        });
737        index.update_file(Path::new("docs/a.md"), file_a);
738
739        // Check that B has A as a dependent
740        let dependents = index.get_dependents(Path::new("docs/b.md"));
741        assert_eq!(dependents.len(), 1);
742        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
743
744        // A has no dependents
745        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
746        assert!(a_dependents.is_empty());
747    }
748
749    #[test]
750    fn test_reverse_deps_multiple() {
751        let mut index = WorkspaceIndex::new();
752
753        // Files A and C both link to B
754        let mut file_a = FileIndex::new();
755        file_a.add_cross_file_link(CrossFileLinkIndex {
756            target_path: "../b.md".to_string(),
757            fragment: "".to_string(),
758            line: 1,
759            column: 1,
760        });
761        index.update_file(Path::new("docs/sub/a.md"), file_a);
762
763        let mut file_c = FileIndex::new();
764        file_c.add_cross_file_link(CrossFileLinkIndex {
765            target_path: "b.md".to_string(),
766            fragment: "".to_string(),
767            line: 1,
768            column: 1,
769        });
770        index.update_file(Path::new("docs/c.md"), file_c);
771
772        // B should have both A and C as dependents
773        let dependents = index.get_dependents(Path::new("docs/b.md"));
774        assert_eq!(dependents.len(), 2);
775        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
776        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
777    }
778
779    #[test]
780    fn test_reverse_deps_update_clears_old() {
781        let mut index = WorkspaceIndex::new();
782
783        // File A initially links to B
784        let mut file_a = FileIndex::new();
785        file_a.add_cross_file_link(CrossFileLinkIndex {
786            target_path: "b.md".to_string(),
787            fragment: "".to_string(),
788            line: 1,
789            column: 1,
790        });
791        index.update_file(Path::new("docs/a.md"), file_a);
792
793        // Verify B has A as dependent
794        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
795
796        // Update A to link to C instead of B
797        let mut file_a_updated = FileIndex::new();
798        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
799            target_path: "c.md".to_string(),
800            fragment: "".to_string(),
801            line: 1,
802            column: 1,
803        });
804        index.update_file(Path::new("docs/a.md"), file_a_updated);
805
806        // B should no longer have A as dependent
807        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
808
809        // C should now have A as dependent
810        let c_deps = index.get_dependents(Path::new("docs/c.md"));
811        assert_eq!(c_deps.len(), 1);
812        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
813    }
814
815    #[test]
816    fn test_reverse_deps_remove_file() {
817        let mut index = WorkspaceIndex::new();
818
819        // File A links to B
820        let mut file_a = FileIndex::new();
821        file_a.add_cross_file_link(CrossFileLinkIndex {
822            target_path: "b.md".to_string(),
823            fragment: "".to_string(),
824            line: 1,
825            column: 1,
826        });
827        index.update_file(Path::new("docs/a.md"), file_a);
828
829        // Verify B has A as dependent
830        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
831
832        // Remove file A
833        index.remove_file(Path::new("docs/a.md"));
834
835        // B should no longer have any dependents
836        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
837    }
838
839    #[test]
840    fn test_normalize_path() {
841        // Test .. handling
842        let path = Path::new("docs/sub/../other.md");
843        let normalized = WorkspaceIndex::normalize_path(path);
844        assert_eq!(normalized, PathBuf::from("docs/other.md"));
845
846        // Test . handling
847        let path2 = Path::new("docs/./other.md");
848        let normalized2 = WorkspaceIndex::normalize_path(path2);
849        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
850
851        // Test multiple ..
852        let path3 = Path::new("a/b/c/../../d.md");
853        let normalized3 = WorkspaceIndex::normalize_path(path3);
854        assert_eq!(normalized3, PathBuf::from("a/d.md"));
855    }
856
857    #[test]
858    fn test_clear_clears_reverse_deps() {
859        let mut index = WorkspaceIndex::new();
860
861        // File A links to B
862        let mut file_a = FileIndex::new();
863        file_a.add_cross_file_link(CrossFileLinkIndex {
864            target_path: "b.md".to_string(),
865            fragment: "".to_string(),
866            line: 1,
867            column: 1,
868        });
869        index.update_file(Path::new("docs/a.md"), file_a);
870
871        // Verify B has A as dependent
872        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
873
874        // Clear the index
875        index.clear();
876
877        // Both files and reverse deps should be cleared
878        assert_eq!(index.file_count(), 0);
879        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
880    }
881
882    #[test]
883    fn test_is_file_stale() {
884        let mut index = WorkspaceIndex::new();
885
886        // Non-existent file is always stale
887        assert!(index.is_file_stale(Path::new("nonexistent.md"), "hash123"));
888
889        // Add a file with known hash
890        let file_index = FileIndex::with_hash("hash123".to_string());
891        index.insert_file(PathBuf::from("docs/test.md"), file_index);
892
893        // Same hash means not stale
894        assert!(!index.is_file_stale(Path::new("docs/test.md"), "hash123"));
895
896        // Different hash means stale
897        assert!(index.is_file_stale(Path::new("docs/test.md"), "different_hash"));
898    }
899
900    #[cfg(feature = "native")]
901    #[test]
902    fn test_cache_roundtrip() {
903        use std::fs;
904
905        // Create a temp directory
906        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_roundtrip");
907        let _ = fs::remove_dir_all(&temp_dir);
908        fs::create_dir_all(&temp_dir).unwrap();
909
910        // Create an index with some data
911        let mut index = WorkspaceIndex::new();
912
913        let mut file1 = FileIndex::with_hash("abc123".to_string());
914        file1.add_heading(HeadingIndex {
915            text: "Test Heading".to_string(),
916            auto_anchor: "test-heading".to_string(),
917            custom_anchor: Some("test".to_string()),
918            line: 1,
919        });
920        file1.add_cross_file_link(CrossFileLinkIndex {
921            target_path: "./other.md".to_string(),
922            fragment: "section".to_string(),
923            line: 5,
924            column: 3,
925        });
926        index.update_file(Path::new("docs/file1.md"), file1);
927
928        let mut file2 = FileIndex::with_hash("def456".to_string());
929        file2.add_heading(HeadingIndex {
930            text: "Another Heading".to_string(),
931            auto_anchor: "another-heading".to_string(),
932            custom_anchor: None,
933            line: 1,
934        });
935        index.update_file(Path::new("docs/other.md"), file2);
936
937        // Save to cache
938        index.save_to_cache(&temp_dir).expect("Failed to save cache");
939
940        // Verify cache file exists
941        assert!(temp_dir.join("workspace_index.bin").exists());
942
943        // Load from cache
944        let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");
945
946        // Verify data matches
947        assert_eq!(loaded.file_count(), 2);
948        assert!(loaded.contains_file(Path::new("docs/file1.md")));
949        assert!(loaded.contains_file(Path::new("docs/other.md")));
950
951        // Check file1 details
952        let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
953        assert_eq!(file1_loaded.content_hash, "abc123");
954        assert_eq!(file1_loaded.headings.len(), 1);
955        assert_eq!(file1_loaded.headings[0].text, "Test Heading");
956        assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
957        assert_eq!(file1_loaded.cross_file_links.len(), 1);
958        assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");
959
960        // Check reverse deps were serialized correctly
961        let dependents = loaded.get_dependents(Path::new("docs/other.md"));
962        assert_eq!(dependents.len(), 1);
963        assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));
964
965        // Clean up
966        let _ = fs::remove_dir_all(&temp_dir);
967    }
968
969    #[cfg(feature = "native")]
970    #[test]
971    fn test_cache_missing_file() {
972        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
973        let _ = std::fs::remove_dir_all(&temp_dir);
974
975        // Should return None for non-existent cache
976        let result = WorkspaceIndex::load_from_cache(&temp_dir);
977        assert!(result.is_none());
978    }
979
980    #[cfg(feature = "native")]
981    #[test]
982    fn test_cache_corrupted_file() {
983        use std::fs;
984
985        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_corrupted");
986        let _ = fs::remove_dir_all(&temp_dir);
987        fs::create_dir_all(&temp_dir).unwrap();
988
989        // Write corrupted data (too small for header)
990        fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();
991
992        // Should return None for corrupted cache (and remove the file)
993        let result = WorkspaceIndex::load_from_cache(&temp_dir);
994        assert!(result.is_none());
995
996        // Corrupted file should be removed
997        assert!(!temp_dir.join("workspace_index.bin").exists());
998
999        // Clean up
1000        let _ = fs::remove_dir_all(&temp_dir);
1001    }
1002
1003    #[cfg(feature = "native")]
1004    #[test]
1005    fn test_cache_invalid_magic() {
1006        use std::fs;
1007
1008        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_invalid_magic");
1009        let _ = fs::remove_dir_all(&temp_dir);
1010        fs::create_dir_all(&temp_dir).unwrap();
1011
1012        // Write data with wrong magic header
1013        let mut data = Vec::new();
1014        data.extend_from_slice(b"XXXX"); // Wrong magic
1015        data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
1016        data.extend_from_slice(&[0; 100]); // Some garbage data
1017        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1018
1019        // Should return None for invalid magic
1020        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1021        assert!(result.is_none());
1022
1023        // File should be removed
1024        assert!(!temp_dir.join("workspace_index.bin").exists());
1025
1026        // Clean up
1027        let _ = fs::remove_dir_all(&temp_dir);
1028    }
1029
1030    #[cfg(feature = "native")]
1031    #[test]
1032    fn test_cache_version_mismatch() {
1033        use std::fs;
1034
1035        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_version_mismatch");
1036        let _ = fs::remove_dir_all(&temp_dir);
1037        fs::create_dir_all(&temp_dir).unwrap();
1038
1039        // Write data with correct magic but wrong version
1040        let mut data = Vec::new();
1041        data.extend_from_slice(b"RWSI"); // Correct magic
1042        data.extend_from_slice(&999u32.to_le_bytes()); // Future version
1043        data.extend_from_slice(&[0; 100]); // Some garbage data
1044        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1045
1046        // Should return None for version mismatch
1047        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1048        assert!(result.is_none());
1049
1050        // File should be removed to trigger rebuild
1051        assert!(!temp_dir.join("workspace_index.bin").exists());
1052
1053        // Clean up
1054        let _ = fs::remove_dir_all(&temp_dir);
1055    }
1056
1057    #[cfg(feature = "native")]
1058    #[test]
1059    fn test_cache_atomic_write() {
1060        use std::fs;
1061
1062        // Test that atomic writes work (no temp files left behind)
1063        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_atomic");
1064        let _ = fs::remove_dir_all(&temp_dir);
1065        fs::create_dir_all(&temp_dir).unwrap();
1066
1067        let index = WorkspaceIndex::new();
1068        index.save_to_cache(&temp_dir).expect("Failed to save");
1069
1070        // Only the final cache file should exist, no temp files
1071        let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
1072        assert_eq!(entries.len(), 1);
1073        assert!(temp_dir.join("workspace_index.bin").exists());
1074
1075        // Clean up
1076        let _ = fs::remove_dir_all(&temp_dir);
1077    }
1078
1079    #[test]
1080    fn test_has_anchor_auto_generated() {
1081        let mut file_index = FileIndex::new();
1082        file_index.add_heading(HeadingIndex {
1083            text: "Installation Guide".to_string(),
1084            auto_anchor: "installation-guide".to_string(),
1085            custom_anchor: None,
1086            line: 1,
1087        });
1088
1089        // Should find by auto-generated anchor
1090        assert!(file_index.has_anchor("installation-guide"));
1091
1092        // Case-insensitive matching
1093        assert!(file_index.has_anchor("Installation-Guide"));
1094        assert!(file_index.has_anchor("INSTALLATION-GUIDE"));
1095
1096        // Should not find non-existent anchor
1097        assert!(!file_index.has_anchor("nonexistent"));
1098    }
1099
1100    #[test]
1101    fn test_has_anchor_custom() {
1102        let mut file_index = FileIndex::new();
1103        file_index.add_heading(HeadingIndex {
1104            text: "Installation Guide".to_string(),
1105            auto_anchor: "installation-guide".to_string(),
1106            custom_anchor: Some("install".to_string()),
1107            line: 1,
1108        });
1109
1110        // Should find by auto-generated anchor
1111        assert!(file_index.has_anchor("installation-guide"));
1112
1113        // Should also find by custom anchor
1114        assert!(file_index.has_anchor("install"));
1115        assert!(file_index.has_anchor("Install")); // case-insensitive
1116
1117        // Should not find non-existent anchor
1118        assert!(!file_index.has_anchor("nonexistent"));
1119    }
1120
1121    #[test]
1122    fn test_get_heading_by_anchor() {
1123        let mut file_index = FileIndex::new();
1124        file_index.add_heading(HeadingIndex {
1125            text: "Installation Guide".to_string(),
1126            auto_anchor: "installation-guide".to_string(),
1127            custom_anchor: Some("install".to_string()),
1128            line: 10,
1129        });
1130        file_index.add_heading(HeadingIndex {
1131            text: "Configuration".to_string(),
1132            auto_anchor: "configuration".to_string(),
1133            custom_anchor: None,
1134            line: 20,
1135        });
1136
1137        // Get by auto anchor
1138        let heading = file_index.get_heading_by_anchor("installation-guide");
1139        assert!(heading.is_some());
1140        assert_eq!(heading.unwrap().text, "Installation Guide");
1141        assert_eq!(heading.unwrap().line, 10);
1142
1143        // Get by custom anchor
1144        let heading = file_index.get_heading_by_anchor("install");
1145        assert!(heading.is_some());
1146        assert_eq!(heading.unwrap().text, "Installation Guide");
1147
1148        // Get second heading
1149        let heading = file_index.get_heading_by_anchor("configuration");
1150        assert!(heading.is_some());
1151        assert_eq!(heading.unwrap().text, "Configuration");
1152        assert_eq!(heading.unwrap().line, 20);
1153
1154        // Non-existent
1155        assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
1156    }
1157
1158    #[test]
1159    fn test_anchor_lookup_many_headings() {
1160        // Test that O(1) lookup works with many headings
1161        let mut file_index = FileIndex::new();
1162
1163        // Add 100 headings
1164        for i in 0..100 {
1165            file_index.add_heading(HeadingIndex {
1166                text: format!("Heading {i}"),
1167                auto_anchor: format!("heading-{i}"),
1168                custom_anchor: Some(format!("h{i}")),
1169                line: i + 1,
1170            });
1171        }
1172
1173        // Verify all can be found
1174        for i in 0..100 {
1175            assert!(file_index.has_anchor(&format!("heading-{i}")));
1176            assert!(file_index.has_anchor(&format!("h{i}")));
1177
1178            let heading = file_index.get_heading_by_anchor(&format!("heading-{i}"));
1179            assert!(heading.is_some());
1180            assert_eq!(heading.unwrap().line, i + 1);
1181        }
1182    }
1183}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs