rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: postcard-serialized WorkspaceIndex]
19//! ```
20
21use serde::{Deserialize, Serialize};
22use std::collections::{HashMap, HashSet};
23use std::path::{Path, PathBuf};
24
/// Four-byte signature written at the very start of a workspace index cache
/// file ("RWSI"); a file that does not begin with it is rejected on load.
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// Revision of the on-disk cache layout, stored as a little-endian `u32`
/// right after the magic bytes. Bump it whenever the serialized form of
/// `WorkspaceIndex` changes so caches written by older builds are rebuilt.
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 5;

/// Name of the cache file created inside the cache directory
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
36
37/// Workspace-wide index for cross-file analysis
38///
39/// Contains pre-extracted information from all markdown files in the workspace,
40/// enabling rules to validate cross-file references efficiently.
41#[derive(Debug, Default, Clone, Serialize, Deserialize)]
42pub struct WorkspaceIndex {
43    /// Map from file path to its extracted data
44    files: HashMap<PathBuf, FileIndex>,
45    /// Reverse dependency graph: target file → files that link to it
46    /// Used to efficiently re-lint dependent files when a target changes
47    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
48    /// Version counter for cache invalidation (incremented on any change)
49    version: u64,
50}
51
52/// Index data extracted from a single file
53#[derive(Debug, Clone, Default, Serialize, Deserialize)]
54pub struct FileIndex {
55    /// Headings in this file with their anchors
56    pub headings: Vec<HeadingIndex>,
57    /// Reference links in this file (for cross-file analysis)
58    pub reference_links: Vec<ReferenceLinkIndex>,
59    /// Cross-file links in this file (for MD051 cross-file validation)
60    pub cross_file_links: Vec<CrossFileLinkIndex>,
61    /// Defined reference IDs (e.g., from [ref]: url definitions)
62    /// Used to filter out reference links that have explicit definitions
63    pub defined_references: HashSet<String>,
64    /// Content hash for change detection
65    pub content_hash: String,
66    /// O(1) anchor lookup: lowercased anchor → heading index
67    /// Includes both auto-generated and custom anchors
68    anchor_to_heading: HashMap<String, usize>,
69    /// HTML anchors defined via <a id="..."> or <element id="..."> tags
70    /// Stored lowercase for case-insensitive matching
71    html_anchors: HashSet<String>,
72    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list)
73    /// Can appear on any element, not just headings
74    /// Stored lowercase for case-insensitive matching
75    attribute_anchors: HashSet<String>,
76    /// Rules disabled for the entire file (from inline comments)
77    /// Used by cross-file rules to respect inline disable directives
78    pub file_disabled_rules: HashSet<String>,
79    /// Rules disabled at specific lines (line number -> set of rule names)
80    /// Merges both persistent disables and line-specific disables
81    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
82}
83
84/// Information about a heading for cross-file lookup
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct HeadingIndex {
87    /// The heading text (e.g., "Installation Guide")
88    pub text: String,
89    /// Auto-generated anchor (e.g., "installation-guide")
90    pub auto_anchor: String,
91    /// Custom anchor if present (e.g., "install")
92    pub custom_anchor: Option<String>,
93    /// Line number (1-indexed)
94    pub line: usize,
95}
96
97/// Information about a reference link for cross-file analysis
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct ReferenceLinkIndex {
100    /// The reference ID (the part in [text][ref])
101    pub reference_id: String,
102    /// Line number (1-indexed)
103    pub line: usize,
104    /// Column number (1-indexed)
105    pub column: usize,
106}
107
108/// Information about a cross-file link for validation
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct CrossFileLinkIndex {
111    /// The target file path (relative, as it appears in the link)
112    pub target_path: String,
113    /// The fragment/anchor being linked to (without #)
114    pub fragment: String,
115    /// Line number (1-indexed)
116    pub line: usize,
117    /// Column number (1-indexed)
118    pub column: usize,
119}
120
121/// Information about a vulnerable anchor (heading without custom ID)
122#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct VulnerableAnchor {
124    /// File path where the heading is located
125    pub file: PathBuf,
126    /// Line number of the heading
127    pub line: usize,
128    /// The heading text
129    pub text: String,
130}
131
132impl WorkspaceIndex {
133    /// Create a new empty workspace index
134    pub fn new() -> Self {
135        Self::default()
136    }
137
138    /// Get the current version (for cache invalidation)
139    pub fn version(&self) -> u64 {
140        self.version
141    }
142
143    /// Get the number of indexed files
144    pub fn file_count(&self) -> usize {
145        self.files.len()
146    }
147
148    /// Check if a file is in the index
149    pub fn contains_file(&self, path: &Path) -> bool {
150        self.files.contains_key(path)
151    }
152
153    /// Get the index data for a specific file
154    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
155        self.files.get(path)
156    }
157
158    /// Insert or update a file's index data
159    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
160        self.files.insert(path, index);
161        self.version = self.version.wrapping_add(1);
162    }
163
164    /// Remove a file from the index
165    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
166        // Clean up reverse deps for this file
167        self.clear_reverse_deps_for(path);
168
169        let result = self.files.remove(path);
170        if result.is_some() {
171            self.version = self.version.wrapping_add(1);
172        }
173        result
174    }
175
176    /// Build a map of all "vulnerable" anchors across the workspace
177    ///
178    /// A vulnerable anchor is an auto-generated anchor for a heading that
179    /// does NOT have a custom anchor defined. These are problematic for
180    /// translated content because the anchor changes when the heading is translated.
181    ///
182    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
183    /// Multiple files can have headings with the same auto-generated anchor,
184    /// so we collect all occurrences.
185    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
186        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
187
188        for (file_path, file_index) in &self.files {
189            for heading in &file_index.headings {
190                // Only include headings WITHOUT custom anchors
191                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
192                    let anchor_key = heading.auto_anchor.to_lowercase();
193                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
194                        file: file_path.clone(),
195                        line: heading.line,
196                        text: heading.text.clone(),
197                    });
198                }
199            }
200        }
201
202        vulnerable
203    }
204
205    /// Get all headings across the workspace (for debugging/testing)
206    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
207        self.files
208            .iter()
209            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
210    }
211
212    /// Iterate over all files in the index
213    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
214        self.files.iter().map(|(p, i)| (p.as_path(), i))
215    }
216
217    /// Clear the entire index
218    pub fn clear(&mut self) {
219        self.files.clear();
220        self.reverse_deps.clear();
221        self.version = self.version.wrapping_add(1);
222    }
223
224    /// Update a file's index and maintain reverse dependencies
225    ///
226    /// This method:
227    /// 1. Removes this file as a source (dependent) from all reverse deps
228    /// 2. Inserts the new file index
229    /// 3. Builds new reverse deps from cross_file_links
230    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
231        // Remove this file as a source (dependent) from all target entries
232        // Note: We don't remove it as a target - other files may still link to it
233        self.clear_reverse_deps_as_source(path);
234
235        // Build new reverse deps from cross_file_links
236        for link in &index.cross_file_links {
237            let target = self.resolve_target_path(path, &link.target_path);
238            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
239        }
240
241        self.files.insert(path.to_path_buf(), index);
242        self.version = self.version.wrapping_add(1);
243    }
244
245    /// Get files that depend on (link to) the given file
246    ///
247    /// Returns a list of file paths that contain links targeting this file.
248    /// Used to re-lint dependent files when a target file changes.
249    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
250        self.reverse_deps
251            .get(path)
252            .map(|set| set.iter().cloned().collect())
253            .unwrap_or_default()
254    }
255
256    /// Check if a file needs re-indexing based on its content hash
257    ///
258    /// Returns `true` if the file is not in the index or has a different hash.
259    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
260        self.files
261            .get(path)
262            .map(|f| f.content_hash != current_hash)
263            .unwrap_or(true)
264    }
265
266    /// Retain only files that exist in the given set, removing deleted files
267    ///
268    /// This prunes stale entries from the cache for files that no longer exist.
269    /// Returns the number of files removed.
270    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
271        let before_count = self.files.len();
272
273        // Collect files to remove
274        let to_remove: Vec<PathBuf> = self
275            .files
276            .keys()
277            .filter(|path| !current_files.contains(*path))
278            .cloned()
279            .collect();
280
281        // Remove each file properly (clears reverse deps)
282        for path in &to_remove {
283            self.remove_file(path);
284        }
285
286        before_count - self.files.len()
287    }
288
289    /// Save the workspace index to a cache file
290    ///
291    /// Uses postcard for efficient binary serialization with:
292    /// - Magic header for file type validation
293    /// - Format version for compatibility detection
294    /// - Atomic writes (temp file + rename) to prevent corruption
295    #[cfg(feature = "native")]
296    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
297        use std::fs;
298        use std::io::Write;
299
300        // Ensure cache directory exists
301        fs::create_dir_all(cache_dir)?;
302
303        // Serialize the index data using postcard
304        let encoded = postcard::to_allocvec(self)
305            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
306
307        // Build versioned cache file: [magic][version][data]
308        let mut cache_data = Vec::with_capacity(8 + encoded.len());
309        cache_data.extend_from_slice(CACHE_MAGIC);
310        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
311        cache_data.extend_from_slice(&encoded);
312
313        // Write atomically: write to temp file then rename
314        let final_path = cache_dir.join(CACHE_FILE_NAME);
315        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
316
317        // Write to temp file
318        {
319            let mut file = fs::File::create(&temp_path)?;
320            file.write_all(&cache_data)?;
321            file.sync_all()?;
322        }
323
324        // Atomic rename
325        fs::rename(&temp_path, &final_path)?;
326
327        log::debug!(
328            "Saved workspace index to cache: {} files, {} bytes (format v{})",
329            self.files.len(),
330            cache_data.len(),
331            CACHE_FORMAT_VERSION
332        );
333
334        Ok(())
335    }
336
337    /// Load the workspace index from a cache file
338    ///
339    /// Returns `None` if:
340    /// - Cache file doesn't exist
341    /// - Magic header doesn't match
342    /// - Format version is incompatible
343    /// - Data is corrupted
344    #[cfg(feature = "native")]
345    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
346        use std::fs;
347
348        let path = cache_dir.join(CACHE_FILE_NAME);
349        let data = fs::read(&path).ok()?;
350
351        // Validate header: need at least 8 bytes for magic + version
352        if data.len() < 8 {
353            log::warn!("Workspace index cache too small, discarding");
354            let _ = fs::remove_file(&path);
355            return None;
356        }
357
358        // Check magic header
359        if &data[0..4] != CACHE_MAGIC {
360            log::warn!("Workspace index cache has invalid magic header, discarding");
361            let _ = fs::remove_file(&path);
362            return None;
363        }
364
365        // Check format version
366        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
367        if version != CACHE_FORMAT_VERSION {
368            log::info!(
369                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
370            );
371            let _ = fs::remove_file(&path);
372            return None;
373        }
374
375        // Deserialize the index data using postcard
376        match postcard::from_bytes::<Self>(&data[8..]) {
377            Ok(index) => {
378                log::debug!(
379                    "Loaded workspace index from cache: {} files (format v{})",
380                    index.files.len(),
381                    version
382                );
383                Some(index)
384            }
385            Err(e) => {
386                log::warn!("Failed to deserialize workspace index cache: {e}");
387                let _ = fs::remove_file(&path);
388                None
389            }
390        }
391    }
392
393    /// Remove a file as a source from all reverse dependency entries
394    ///
395    /// This removes the file from being listed as a dependent in all target entries.
396    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
397    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
398        for deps in self.reverse_deps.values_mut() {
399            deps.remove(path);
400        }
401        // Clean up empty entries
402        self.reverse_deps.retain(|_, deps| !deps.is_empty());
403    }
404
405    /// Remove a file completely from reverse dependency tracking
406    ///
407    /// Removes the file as both a source (dependent) and as a target.
408    /// Used when deleting a file from the index.
409    fn clear_reverse_deps_for(&mut self, path: &Path) {
410        // Remove as source (dependent)
411        self.clear_reverse_deps_as_source(path);
412
413        // Also remove as target
414        self.reverse_deps.remove(path);
415    }
416
417    /// Resolve a relative path from a source file to an absolute target path
418    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
419        // Get the directory containing the source file
420        let source_dir = source_file.parent().unwrap_or(Path::new(""));
421
422        // Join with the relative target and normalize
423        let target = source_dir.join(relative_target);
424
425        // Normalize the path (handle .., ., etc.)
426        Self::normalize_path(&target)
427    }
428
429    /// Normalize a path by resolving . and .. components
430    fn normalize_path(path: &Path) -> PathBuf {
431        let mut components = Vec::new();
432
433        for component in path.components() {
434            match component {
435                std::path::Component::ParentDir => {
436                    // Go up one level if possible
437                    if !components.is_empty() {
438                        components.pop();
439                    }
440                }
441                std::path::Component::CurDir => {
442                    // Skip current directory markers
443                }
444                _ => {
445                    components.push(component);
446                }
447            }
448        }
449
450        components.iter().collect()
451    }
452}
453
454impl FileIndex {
455    /// Create a new empty file index
456    pub fn new() -> Self {
457        Self::default()
458    }
459
460    /// Create a file index with the given content hash
461    pub fn with_hash(content_hash: String) -> Self {
462        Self {
463            content_hash,
464            ..Default::default()
465        }
466    }
467
468    /// Add a heading to the index
469    ///
470    /// Also updates the anchor lookup map for O(1) anchor queries
471    pub fn add_heading(&mut self, heading: HeadingIndex) {
472        let index = self.headings.len();
473
474        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
475        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
476
477        // Add custom anchor if present
478        if let Some(ref custom) = heading.custom_anchor {
479            self.anchor_to_heading.insert(custom.to_lowercase(), index);
480        }
481
482        self.headings.push(heading);
483    }
484
485    /// Check if an anchor exists in this file (O(1) lookup)
486    ///
487    /// Returns true if the anchor matches any of:
488    /// - Auto-generated heading anchors
489    /// - Custom heading anchors (from {#id} syntax on headings)
490    /// - HTML anchors (from <a id="..."> or <element id="...">)
491    /// - Attribute anchors (from { #id } syntax on non-heading elements)
492    ///
493    /// Matching is case-insensitive.
494    pub fn has_anchor(&self, anchor: &str) -> bool {
495        let lower = anchor.to_lowercase();
496        self.anchor_to_heading.contains_key(&lower)
497            || self.html_anchors.contains(&lower)
498            || self.attribute_anchors.contains(&lower)
499    }
500
501    /// Add an HTML anchor (from <a id="..."> or <element id="..."> tags)
502    pub fn add_html_anchor(&mut self, anchor: String) {
503        if !anchor.is_empty() {
504            self.html_anchors.insert(anchor.to_lowercase());
505        }
506    }
507
508    /// Add an attribute anchor (from { #id } syntax on non-heading elements)
509    pub fn add_attribute_anchor(&mut self, anchor: String) {
510        if !anchor.is_empty() {
511            self.attribute_anchors.insert(anchor.to_lowercase());
512        }
513    }
514
515    /// Get the heading index for an anchor (O(1) lookup)
516    ///
517    /// Returns the index into `self.headings` if found.
518    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
519        self.anchor_to_heading
520            .get(&anchor.to_lowercase())
521            .and_then(|&idx| self.headings.get(idx))
522    }
523
524    /// Add a reference link to the index
525    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
526        self.reference_links.push(link);
527    }
528
529    /// Check if a rule is disabled at a specific line
530    ///
531    /// Used by cross-file rules to respect inline disable directives.
532    /// Checks both file-wide disables and line-specific disables.
533    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
534        // Check file-wide disables (highest priority)
535        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
536            return true;
537        }
538
539        // Check line-specific disables
540        if let Some(rules) = self.line_disabled_rules.get(&line) {
541            return rules.contains("*") || rules.contains(rule_name);
542        }
543
544        false
545    }
546
547    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line, column)
548    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
549        // Deduplicate: multiple rules may contribute the same link
550        let is_duplicate = self.cross_file_links.iter().any(|existing| {
551            existing.target_path == link.target_path
552                && existing.fragment == link.fragment
553                && existing.line == link.line
554                && existing.column == link.column
555        });
556        if !is_duplicate {
557            self.cross_file_links.push(link);
558        }
559    }
560
561    /// Add a defined reference ID (e.g., from [ref]: url)
562    pub fn add_defined_reference(&mut self, ref_id: String) {
563        self.defined_references.insert(ref_id);
564    }
565
566    /// Check if a reference ID has an explicit definition
567    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
568        self.defined_references.contains(ref_id)
569    }
570
571    /// Check if the content hash matches
572    pub fn hash_matches(&self, hash: &str) -> bool {
573        self.content_hash == hash
574    }
575
576    /// Get the number of headings
577    pub fn heading_count(&self) -> usize {
578        self.headings.len()
579    }
580
581    /// Get the number of reference links
582    pub fn reference_link_count(&self) -> usize {
583        self.reference_links.len()
584    }
585}
586
587#[cfg(test)]
588mod tests {
589    use super::*;
590
591    #[test]
592    fn test_workspace_index_basic() {
593        let mut index = WorkspaceIndex::new();
594        assert_eq!(index.file_count(), 0);
595        assert_eq!(index.version(), 0);
596
597        let mut file_index = FileIndex::with_hash("abc123".to_string());
598        file_index.add_heading(HeadingIndex {
599            text: "Installation".to_string(),
600            auto_anchor: "installation".to_string(),
601            custom_anchor: None,
602            line: 1,
603        });
604
605        index.insert_file(PathBuf::from("docs/install.md"), file_index);
606        assert_eq!(index.file_count(), 1);
607        assert_eq!(index.version(), 1);
608
609        assert!(index.contains_file(Path::new("docs/install.md")));
610        assert!(!index.contains_file(Path::new("docs/other.md")));
611    }
612
613    #[test]
614    fn test_vulnerable_anchors() {
615        let mut index = WorkspaceIndex::new();
616
617        // File 1: heading without custom anchor (vulnerable)
618        let mut file1 = FileIndex::new();
619        file1.add_heading(HeadingIndex {
620            text: "Getting Started".to_string(),
621            auto_anchor: "getting-started".to_string(),
622            custom_anchor: None,
623            line: 1,
624        });
625        index.insert_file(PathBuf::from("docs/guide.md"), file1);
626
627        // File 2: heading with custom anchor (not vulnerable)
628        let mut file2 = FileIndex::new();
629        file2.add_heading(HeadingIndex {
630            text: "Installation".to_string(),
631            auto_anchor: "installation".to_string(),
632            custom_anchor: Some("install".to_string()),
633            line: 1,
634        });
635        index.insert_file(PathBuf::from("docs/install.md"), file2);
636
637        let vulnerable = index.get_vulnerable_anchors();
638        assert_eq!(vulnerable.len(), 1);
639        assert!(vulnerable.contains_key("getting-started"));
640        assert!(!vulnerable.contains_key("installation"));
641
642        let anchors = vulnerable.get("getting-started").unwrap();
643        assert_eq!(anchors.len(), 1);
644        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
645        assert_eq!(anchors[0].text, "Getting Started");
646    }
647
648    #[test]
649    fn test_vulnerable_anchors_multiple_files_same_anchor() {
650        // Multiple files can have headings with the same auto-generated anchor
651        // get_vulnerable_anchors() should collect all of them
652        let mut index = WorkspaceIndex::new();
653
654        // File 1: has "Installation" heading (vulnerable)
655        let mut file1 = FileIndex::new();
656        file1.add_heading(HeadingIndex {
657            text: "Installation".to_string(),
658            auto_anchor: "installation".to_string(),
659            custom_anchor: None,
660            line: 1,
661        });
662        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
663
664        // File 2: also has "Installation" heading with same anchor (vulnerable)
665        let mut file2 = FileIndex::new();
666        file2.add_heading(HeadingIndex {
667            text: "Installation".to_string(),
668            auto_anchor: "installation".to_string(),
669            custom_anchor: None,
670            line: 5,
671        });
672        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
673
674        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
675        let mut file3 = FileIndex::new();
676        file3.add_heading(HeadingIndex {
677            text: "Installation".to_string(),
678            auto_anchor: "installation".to_string(),
679            custom_anchor: Some("install".to_string()),
680            line: 10,
681        });
682        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
683
684        let vulnerable = index.get_vulnerable_anchors();
685        assert_eq!(vulnerable.len(), 1); // One unique anchor
686        assert!(vulnerable.contains_key("installation"));
687
688        let anchors = vulnerable.get("installation").unwrap();
689        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
690        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
691
692        // Verify both files are represented
693        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
694        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
695        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
696    }
697
698    #[test]
699    fn test_file_index_hash() {
700        let index = FileIndex::with_hash("hash123".to_string());
701        assert!(index.hash_matches("hash123"));
702        assert!(!index.hash_matches("other"));
703    }
704
705    #[test]
706    fn test_version_increment() {
707        let mut index = WorkspaceIndex::new();
708        assert_eq!(index.version(), 0);
709
710        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
711        assert_eq!(index.version(), 1);
712
713        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
714        assert_eq!(index.version(), 2);
715
716        index.remove_file(Path::new("a.md"));
717        assert_eq!(index.version(), 3);
718
719        // Removing non-existent file doesn't increment
720        index.remove_file(Path::new("nonexistent.md"));
721        assert_eq!(index.version(), 3);
722    }
723
724    #[test]
725    fn test_reverse_deps_basic() {
726        let mut index = WorkspaceIndex::new();
727
728        // File A links to file B
729        let mut file_a = FileIndex::new();
730        file_a.add_cross_file_link(CrossFileLinkIndex {
731            target_path: "b.md".to_string(),
732            fragment: "section".to_string(),
733            line: 10,
734            column: 5,
735        });
736        index.update_file(Path::new("docs/a.md"), file_a);
737
738        // Check that B has A as a dependent
739        let dependents = index.get_dependents(Path::new("docs/b.md"));
740        assert_eq!(dependents.len(), 1);
741        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
742
743        // A has no dependents
744        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
745        assert!(a_dependents.is_empty());
746    }
747
748    #[test]
749    fn test_reverse_deps_multiple() {
750        let mut index = WorkspaceIndex::new();
751
752        // Files A and C both link to B
753        let mut file_a = FileIndex::new();
754        file_a.add_cross_file_link(CrossFileLinkIndex {
755            target_path: "../b.md".to_string(),
756            fragment: "".to_string(),
757            line: 1,
758            column: 1,
759        });
760        index.update_file(Path::new("docs/sub/a.md"), file_a);
761
762        let mut file_c = FileIndex::new();
763        file_c.add_cross_file_link(CrossFileLinkIndex {
764            target_path: "b.md".to_string(),
765            fragment: "".to_string(),
766            line: 1,
767            column: 1,
768        });
769        index.update_file(Path::new("docs/c.md"), file_c);
770
771        // B should have both A and C as dependents
772        let dependents = index.get_dependents(Path::new("docs/b.md"));
773        assert_eq!(dependents.len(), 2);
774        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
775        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
776    }
777
778    #[test]
779    fn test_reverse_deps_update_clears_old() {
780        let mut index = WorkspaceIndex::new();
781
782        // File A initially links to B
783        let mut file_a = FileIndex::new();
784        file_a.add_cross_file_link(CrossFileLinkIndex {
785            target_path: "b.md".to_string(),
786            fragment: "".to_string(),
787            line: 1,
788            column: 1,
789        });
790        index.update_file(Path::new("docs/a.md"), file_a);
791
792        // Verify B has A as dependent
793        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
794
795        // Update A to link to C instead of B
796        let mut file_a_updated = FileIndex::new();
797        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
798            target_path: "c.md".to_string(),
799            fragment: "".to_string(),
800            line: 1,
801            column: 1,
802        });
803        index.update_file(Path::new("docs/a.md"), file_a_updated);
804
805        // B should no longer have A as dependent
806        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
807
808        // C should now have A as dependent
809        let c_deps = index.get_dependents(Path::new("docs/c.md"));
810        assert_eq!(c_deps.len(), 1);
811        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
812    }
813
/// Removing a file from the index must also remove it from its targets' dependents.
#[test]
fn test_reverse_deps_remove_file() {
    let source = Path::new("docs/a.md");
    let target = Path::new("docs/b.md");
    let mut index = WorkspaceIndex::new();

    // Index A with a single link to B.
    let link_to_b = CrossFileLinkIndex {
        target_path: String::from("b.md"),
        fragment: String::new(),
        line: 1,
        column: 1,
    };
    let mut linking_file = FileIndex::new();
    linking_file.add_cross_file_link(link_to_b);
    index.update_file(source, linking_file);

    // Precondition: B currently has A as its one dependent.
    assert_eq!(index.get_dependents(target).len(), 1);

    // Dropping A must leave B without dependents.
    index.remove_file(source);
    assert!(index.get_dependents(target).is_empty());
}
837
/// `normalize_path` resolves `.` and `..` components.
#[test]
fn test_normalize_path() {
    // (input, expected) pairs: a single `..`, a `.`, and two consecutive `..`.
    let cases = [
        ("docs/sub/../other.md", "docs/other.md"),
        ("docs/./other.md", "docs/other.md"),
        ("a/b/c/../../d.md", "a/d.md"),
    ];

    for (raw, expected) in cases {
        let normalized = WorkspaceIndex::normalize_path(Path::new(raw));
        assert_eq!(normalized, PathBuf::from(expected));
    }
}
855
/// `clear` must empty both the file map and the reverse-dependency data.
#[test]
fn test_clear_clears_reverse_deps() {
    let target = Path::new("docs/b.md");
    let mut index = WorkspaceIndex::new();

    // Index A with one link to B so there is a reverse edge to wipe.
    let mut linking_file = FileIndex::new();
    linking_file.add_cross_file_link(CrossFileLinkIndex {
        target_path: String::from("b.md"),
        fragment: String::new(),
        line: 1,
        column: 1,
    });
    index.update_file(Path::new("docs/a.md"), linking_file);

    // Precondition: the edge A -> B is visible through B's dependents.
    assert_eq!(index.get_dependents(target).len(), 1);

    index.clear();

    // Afterwards no files remain and B has no dependents.
    assert_eq!(index.file_count(), 0);
    assert!(index.get_dependents(target).is_empty());
}
880
/// `is_file_stale` reports unknown files and hash mismatches as stale.
#[test]
fn test_is_file_stale() {
    let mut index = WorkspaceIndex::new();

    // A path that was never indexed counts as stale.
    let unknown = Path::new("nonexistent.md");
    assert!(index.is_file_stale(unknown, "hash123"));

    // Register a file whose stored content hash is "hash123".
    let tracked = Path::new("docs/test.md");
    index.insert_file(PathBuf::from("docs/test.md"), FileIndex::with_hash("hash123".to_string()));

    // Matching hash: fresh. Any other hash: stale.
    assert!(!index.is_file_stale(tracked, "hash123"));
    assert!(index.is_file_stale(tracked, "different_hash"));
}
898
/// Saves a populated index to the on-disk cache and checks that loading it back
/// restores files, headings, cross-file links and reverse dependencies.
#[cfg(feature = "native")]
#[test]
fn test_cache_roundtrip() {
    use std::fs;

    // The system temp dir is shared, so a fixed directory name would let a second,
    // concurrently running instance of this test delete or overwrite our cache file.
    // Suffixing the process id keeps each run isolated.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_roundtrip_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    // Create an index with some data
    let mut index = WorkspaceIndex::new();

    // file1: one heading with a custom anchor and one link to ./other.md#section
    let mut file1 = FileIndex::with_hash("abc123".to_string());
    file1.add_heading(HeadingIndex {
        text: "Test Heading".to_string(),
        auto_anchor: "test-heading".to_string(),
        custom_anchor: Some("test".to_string()),
        line: 1,
    });
    file1.add_cross_file_link(CrossFileLinkIndex {
        target_path: "./other.md".to_string(),
        fragment: "section".to_string(),
        line: 5,
        column: 3,
    });
    index.update_file(Path::new("docs/file1.md"), file1);

    // file2: the link target, with a single heading and no custom anchor
    let mut file2 = FileIndex::with_hash("def456".to_string());
    file2.add_heading(HeadingIndex {
        text: "Another Heading".to_string(),
        auto_anchor: "another-heading".to_string(),
        custom_anchor: None,
        line: 1,
    });
    index.update_file(Path::new("docs/other.md"), file2);

    // Save to cache
    index.save_to_cache(&temp_dir).expect("Failed to save cache");

    // Verify cache file exists
    assert!(temp_dir.join("workspace_index.bin").exists());

    // Load from cache
    let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");

    // Verify data matches
    assert_eq!(loaded.file_count(), 2);
    assert!(loaded.contains_file(Path::new("docs/file1.md")));
    assert!(loaded.contains_file(Path::new("docs/other.md")));

    // Check file1 details
    let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
    assert_eq!(file1_loaded.content_hash, "abc123");
    assert_eq!(file1_loaded.headings.len(), 1);
    assert_eq!(file1_loaded.headings[0].text, "Test Heading");
    assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
    assert_eq!(file1_loaded.cross_file_links.len(), 1);
    assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");

    // Check reverse deps were serialized correctly
    let dependents = loaded.get_dependents(Path::new("docs/other.md"));
    assert_eq!(dependents.len(), 1);
    assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));

    // Clean up (best effort; a failure here must not fail the test)
    let _ = fs::remove_dir_all(&temp_dir);
}
967
/// Loading from a directory that holds no cache file yields `None`.
#[cfg(feature = "native")]
#[test]
fn test_cache_missing_file() {
    // Make sure nothing is left at this location from an earlier run.
    let cache_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
    let _ = std::fs::remove_dir_all(&cache_dir);

    // With no cache on disk there is nothing to load.
    assert!(WorkspaceIndex::load_from_cache(&cache_dir).is_none());
}
978
/// A cache file too short to contain the header must be rejected and deleted.
#[cfg(feature = "native")]
#[test]
fn test_cache_corrupted_file() {
    use std::fs;

    // The system temp dir is shared; a fixed directory name would let a concurrent run
    // of this same test recreate the cache file between our load and the "file was
    // removed" assertion. Suffixing the process id keeps each run isolated.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_corrupted_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();
    let cache_file = temp_dir.join("workspace_index.bin");

    // Write corrupted data: three bytes cannot hold the 4-byte magic plus 4-byte version
    fs::write(&cache_file, b"bad").unwrap();

    // Should return None for corrupted cache (and remove the file)
    let result = WorkspaceIndex::load_from_cache(&temp_dir);
    assert!(result.is_none());

    // Corrupted file should be removed
    assert!(!cache_file.exists());

    // Clean up (best effort)
    let _ = fs::remove_dir_all(&temp_dir);
}
1001
/// A cache file whose first four bytes are not the expected magic must be rejected
/// and deleted.
#[cfg(feature = "native")]
#[test]
fn test_cache_invalid_magic() {
    use std::fs;

    // The system temp dir is shared; a fixed directory name would let a concurrent run
    // of this same test rewrite the cache file while we are asserting on it.
    // Suffixing the process id keeps each run isolated.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_invalid_magic_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();
    let cache_file = temp_dir.join("workspace_index.bin");

    // Write data with wrong magic header
    let mut data = Vec::new();
    data.extend_from_slice(b"XXXX"); // Wrong magic
    data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
    data.extend_from_slice(&[0; 100]); // Some garbage data
    fs::write(&cache_file, &data).unwrap();

    // Should return None for invalid magic
    let result = WorkspaceIndex::load_from_cache(&temp_dir);
    assert!(result.is_none());

    // File should be removed
    assert!(!cache_file.exists());

    // Clean up (best effort)
    let _ = fs::remove_dir_all(&temp_dir);
}
1028
/// A cache file with the right magic but a different format version must be rejected
/// and deleted so the index gets rebuilt.
#[cfg(feature = "native")]
#[test]
fn test_cache_version_mismatch() {
    use std::fs;

    // The system temp dir is shared; a fixed directory name would let a concurrent run
    // of this same test rewrite the cache file while we are asserting on it.
    // Suffixing the process id keeps each run isolated.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_version_mismatch_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();
    let cache_file = temp_dir.join("workspace_index.bin");

    // Write data with correct magic but wrong version
    let mut data = Vec::new();
    data.extend_from_slice(b"RWSI"); // Correct magic
    data.extend_from_slice(&999u32.to_le_bytes()); // Future version
    data.extend_from_slice(&[0; 100]); // Some garbage data
    fs::write(&cache_file, &data).unwrap();

    // Should return None for version mismatch
    let result = WorkspaceIndex::load_from_cache(&temp_dir);
    assert!(result.is_none());

    // File should be removed to trigger rebuild
    assert!(!cache_file.exists());

    // Clean up (best effort)
    let _ = fs::remove_dir_all(&temp_dir);
}
1055
/// Saving the cache must leave exactly one file behind: the final cache file, with no
/// leftover temporary files.
#[cfg(feature = "native")]
#[test]
fn test_cache_atomic_write() {
    use std::fs;

    // This test counts directory entries, so the directory must be ours alone. The
    // system temp dir is shared; with a fixed name a concurrent run of this same test
    // could have its own in-flight file in here and break the count. Suffixing the
    // process id keeps each run isolated.
    let temp_dir = std::env::temp_dir().join(format!("rumdl_test_cache_atomic_{}", std::process::id()));
    let _ = fs::remove_dir_all(&temp_dir);
    fs::create_dir_all(&temp_dir).unwrap();

    let index = WorkspaceIndex::new();
    index.save_to_cache(&temp_dir).expect("Failed to save");

    // Only the final cache file should exist, no temp files
    let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
    assert_eq!(entries.len(), 1);
    assert!(temp_dir.join("workspace_index.bin").exists());

    // Clean up (best effort)
    let _ = fs::remove_dir_all(&temp_dir);
}
1077
/// `has_anchor` finds a heading by its auto-generated anchor, ignoring ASCII case.
#[test]
fn test_has_anchor_auto_generated() {
    let heading = HeadingIndex {
        text: String::from("Installation Guide"),
        auto_anchor: String::from("installation-guide"),
        custom_anchor: None,
        line: 1,
    };
    let mut file_index = FileIndex::new();
    file_index.add_heading(heading);

    // The exact anchor and two differently-cased spellings must all be found.
    for anchor in ["installation-guide", "Installation-Guide", "INSTALLATION-GUIDE"] {
        assert!(file_index.has_anchor(anchor));
    }

    // An anchor that no heading provides must not be found.
    assert!(!file_index.has_anchor("nonexistent"));
}
1098
/// `has_anchor` accepts both the auto-generated and the custom anchor of a heading.
#[test]
fn test_has_anchor_custom() {
    let heading = HeadingIndex {
        text: String::from("Installation Guide"),
        auto_anchor: String::from("installation-guide"),
        custom_anchor: Some(String::from("install")),
        line: 1,
    };
    let mut file_index = FileIndex::new();
    file_index.add_heading(heading);

    // Auto anchor, custom anchor, and the custom anchor in different case.
    for anchor in ["installation-guide", "install", "Install"] {
        assert!(file_index.has_anchor(anchor));
    }

    // An anchor that no heading provides must not be found.
    assert!(!file_index.has_anchor("nonexistent"));
}
1119
/// `get_heading_by_anchor` returns the matching heading for auto and custom anchors,
/// and `None` for an unknown anchor.
#[test]
fn test_get_heading_by_anchor() {
    let mut file_index = FileIndex::new();
    let headings = [
        HeadingIndex {
            text: String::from("Installation Guide"),
            auto_anchor: String::from("installation-guide"),
            custom_anchor: Some(String::from("install")),
            line: 10,
        },
        HeadingIndex {
            text: String::from("Configuration"),
            auto_anchor: String::from("configuration"),
            custom_anchor: None,
            line: 20,
        },
    ];
    for heading in headings {
        file_index.add_heading(heading);
    }

    // Lookup through the auto-generated anchor of the first heading.
    let by_auto = file_index
        .get_heading_by_anchor("installation-guide")
        .expect("auto anchor should resolve");
    assert_eq!(by_auto.text, "Installation Guide");
    assert_eq!(by_auto.line, 10);

    // Lookup through the custom anchor resolves to the same heading text.
    let by_custom = file_index
        .get_heading_by_anchor("install")
        .expect("custom anchor should resolve");
    assert_eq!(by_custom.text, "Installation Guide");

    // Lookup of the second heading.
    let second = file_index
        .get_heading_by_anchor("configuration")
        .expect("second heading should resolve");
    assert_eq!(second.text, "Configuration");
    assert_eq!(second.line, 20);

    // An unknown anchor yields nothing.
    assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
}
1156
/// Anchor lookups keep resolving correctly when a file holds many headings.
#[test]
fn test_anchor_lookup_many_headings() {
    let mut file_index = FileIndex::new();

    // Register 100 headings; heading `i` sits on line `i + 1` and carries both an
    // auto anchor (`heading-i`) and a custom anchor (`hi`).
    for i in 0..100 {
        let heading = HeadingIndex {
            text: format!("Heading {i}"),
            auto_anchor: format!("heading-{i}"),
            custom_anchor: Some(format!("h{i}")),
            line: i + 1,
        };
        file_index.add_heading(heading);
    }

    // Every anchor must be present, and the auto anchor must map back to its line.
    for i in 0..100 {
        let auto_anchor = format!("heading-{i}");
        let custom_anchor = format!("h{i}");

        assert!(file_index.has_anchor(&auto_anchor));
        assert!(file_index.has_anchor(&custom_anchor));

        let found_line = file_index.get_heading_by_anchor(&auto_anchor).map(|h| h.line);
        assert_eq!(found_line, Some(i + 1));
    }
}
1182}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs