rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: postcard-serialized WorkspaceIndex]
19//! ```
20
21use serde::{Deserialize, Serialize};
22use std::collections::{HashMap, HashSet};
23use std::path::{Path, PathBuf};
24
25/// Magic bytes identifying a workspace index cache file.
/// "RWSI" stands for "Rumdl Workspace Index"; these four bytes open every cache
/// file and are compared against the file's first four bytes on load.
26#[cfg(feature = "native")]
27const CACHE_MAGIC: &[u8; 4] = b"RWSI";
28
29/// Cache format version - increment when WorkspaceIndex serialization changes.
/// Written as a little-endian `u32` right after the magic bytes; a cache file
/// carrying any other value is deleted and ignored on load.
30#[cfg(feature = "native")]
31const CACHE_FORMAT_VERSION: u32 = 5;
32
33/// File name of the cache; it is joined onto the cache directory handed to
/// `save_to_cache` / `load_from_cache`.
34#[cfg(feature = "native")]
35const CACHE_FILE_NAME: &str = "workspace_index.bin";
36
37/// Workspace-wide index for cross-file analysis
38///
39/// Contains pre-extracted information from all markdown files in the workspace,
40/// enabling rules to validate cross-file references efficiently.
///
/// The whole struct is what gets serialized into the on-disk cache, so
/// `CACHE_FORMAT_VERSION` is meant to be bumped whenever its serialized form changes.
41#[derive(Debug, Default, Clone, Serialize, Deserialize)]
42pub struct WorkspaceIndex {
43    /// Map from file path to its extracted data
44    files: HashMap<PathBuf, FileIndex>,
45    /// Reverse dependency graph: target file → files that link to it
46    /// Used to efficiently re-lint dependent files when a target changes
    /// Keys are link targets resolved against the linking file's directory and
    /// lexically normalized; a key may name a file that is not present in `files`.
47    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
48    /// Version counter for cache invalidation (incremented on any change)
    /// Advanced with wrapping arithmetic, so compare values for equality only.
49    version: u64,
50}
51
52/// Index data extracted from a single file
///
/// The three anchor collections are private and only filled through
/// `add_heading`, `add_html_anchor` and `add_attribute_anchor`, which lowercase
/// their input; `has_anchor` lowercases the query to match.
53#[derive(Debug, Clone, Default, Serialize, Deserialize)]
54pub struct FileIndex {
55    /// Headings in this file with their anchors
    /// NOTE: pushing here directly bypasses the anchor lookup map; use `add_heading`.
56    pub headings: Vec<HeadingIndex>,
57    /// Reference links in this file (for cross-file analysis)
58    pub reference_links: Vec<ReferenceLinkIndex>,
59    /// Cross-file links in this file (for MD051 cross-file validation)
    /// `add_cross_file_link` keeps at most one entry per (target_path, fragment, line).
60    pub cross_file_links: Vec<CrossFileLinkIndex>,
61    /// Defined reference IDs (e.g., from [ref]: url definitions)
62    /// Used to filter out reference links that have explicit definitions
63    pub defined_references: HashSet<String>,
64    /// Content hash for change detection
65    pub content_hash: String,
66    /// O(1) anchor lookup: lowercased anchor → heading index
67    /// Includes both auto-generated and custom anchors
    /// The value is a position in `headings`.
68    anchor_to_heading: HashMap<String, usize>,
69    /// HTML anchors defined via <a id="..."> or <element id="..."> tags
70    /// Stored lowercase for case-insensitive matching
71    html_anchors: HashSet<String>,
72    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list)
73    /// Can appear on any element, not just headings
74    /// Stored lowercase for case-insensitive matching
75    attribute_anchors: HashSet<String>,
76    /// Rules disabled for the entire file (from inline comments)
77    /// Used by cross-file rules to respect inline disable directives
    /// The entry "*" is treated as "every rule" by `is_rule_disabled_at_line`.
78    pub file_disabled_rules: HashSet<String>,
79    /// Rules disabled at specific lines (line number -> set of rule names)
80    /// Merges both persistent disables and line-specific disables
    /// The entry "*" is treated as "every rule" here as well.
81    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
82}
83
84/// Information about a heading for cross-file lookup
///
/// Both anchors are stored as given; `FileIndex::add_heading` lowercases them
/// only for its lookup map.
85#[derive(Debug, Clone, Serialize, Deserialize)]
86pub struct HeadingIndex {
87    /// The heading text (e.g., "Installation Guide")
88    pub text: String,
89    /// Auto-generated anchor (e.g., "installation-guide")
90    pub auto_anchor: String,
91    /// Custom anchor if present (e.g., "install")
    /// A heading with `None` here is what `get_vulnerable_anchors` reports.
92    pub custom_anchor: Option<String>,
93    /// Line number (1-indexed)
94    pub line: usize,
95}
96
97/// Information about a reference link for cross-file analysis
///
/// Records where a `[text][ref]` style link occurs and which reference it names.
98#[derive(Debug, Clone, Serialize, Deserialize)]
99pub struct ReferenceLinkIndex {
100    /// The reference ID (the part in [text][ref])
101    pub reference_id: String,
102    /// Line number (1-indexed)
103    pub line: usize,
104    /// Column number (1-indexed)
105    pub column: usize,
106}
107
108/// Information about a cross-file link for validation
///
/// `WorkspaceIndex::update_file` resolves `target_path` against the directory of
/// the file that contains the link when it builds the reverse dependency graph.
109#[derive(Debug, Clone, Serialize, Deserialize)]
110pub struct CrossFileLinkIndex {
111    /// The target file path (relative, as it appears in the link)
112    pub target_path: String,
113    /// The fragment/anchor being linked to (without #)
    /// May be an empty string (the tests in this file use "" for links with no fragment).
114    pub fragment: String,
115    /// Line number (1-indexed)
116    pub line: usize,
117    /// Column number (1-indexed)
    /// Not part of the duplicate check in `FileIndex::add_cross_file_link`.
118    pub column: usize,
119}
120
121/// Information about a vulnerable anchor (heading without custom ID)
///
/// Produced by `WorkspaceIndex::get_vulnerable_anchors`, one per qualifying heading.
122#[derive(Debug, Clone, Serialize, Deserialize)]
123pub struct VulnerableAnchor {
124    /// File path where the heading is located
    /// (the key under which the file is stored in the workspace index)
125    pub file: PathBuf,
126    /// Line number of the heading
    /// (copied from `HeadingIndex::line`, which is documented as 1-indexed)
127    pub line: usize,
128    /// The heading text
129    pub text: String,
130}
131
132impl WorkspaceIndex {
133    /// Create a new empty workspace index
    ///
    /// Same as `WorkspaceIndex::default()`: no files, no reverse dependencies,
    /// and a version counter of 0.
134    pub fn new() -> Self {
135        Self::default()
136    }
137
138    /// Get the current version (for cache invalidation)
    ///
    /// The counter is advanced with `wrapping_add(1)` by `insert_file`,
    /// `update_file` and `clear`, and by `remove_file` when it actually removes
    /// an entry. Because it can wrap, compare values for equality, not ordering.
139    pub fn version(&self) -> u64 {
140        self.version
141    }
142
143    /// Get the number of indexed files (entries in the path → `FileIndex` map)
144    pub fn file_count(&self) -> usize {
145        self.files.len()
146    }
147
148    /// Check if a file is in the index
    ///
    /// This is an exact key match on `path`; no path normalization happens here.
149    pub fn contains_file(&self, path: &Path) -> bool {
150        self.files.contains_key(path)
151    }
152
153    /// Get the index data for a specific file
    ///
    /// Returns `None` when `path` is not a key of the index (exact match, no
    /// normalization).
154    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
155        self.files.get(path)
156    }
157
158    /// Insert or update a file's index data
    ///
    /// Replaces any existing entry for `path` and always bumps the version
    /// counter. Unlike `update_file`, this does not touch the reverse
    /// dependency graph.
159    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
160        self.files.insert(path, index);
161        self.version = self.version.wrapping_add(1);
162    }
163
164    /// Remove a file from the index
165    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
166        // Clean up reverse deps for this file
167        self.clear_reverse_deps_for(path);
168
169        let result = self.files.remove(path);
170        if result.is_some() {
171            self.version = self.version.wrapping_add(1);
172        }
173        result
174    }
175
176    /// Build a map of all "vulnerable" anchors across the workspace
177    ///
178    /// A vulnerable anchor is an auto-generated anchor for a heading that
179    /// does NOT have a custom anchor defined. These are problematic for
180    /// translated content because the anchor changes when the heading is translated.
181    ///
182    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
183    /// Multiple files can have headings with the same auto-generated anchor,
184    /// so we collect all occurrences.
185    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
186        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
187
188        for (file_path, file_index) in &self.files {
189            for heading in &file_index.headings {
190                // Only include headings WITHOUT custom anchors
191                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
192                    let anchor_key = heading.auto_anchor.to_lowercase();
193                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
194                        file: file_path.clone(),
195                        line: heading.line,
196                        text: heading.text.clone(),
197                    });
198                }
199            }
200        }
201
202        vulnerable
203    }
204
205    /// Get all headings across the workspace (for debugging/testing)
206    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
207        self.files
208            .iter()
209            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
210    }
211
212    /// Iterate over all files in the index
    ///
    /// Yields `(path, data)` pairs in the iteration order of the underlying
    /// `HashMap`, i.e. in no particular order.
213    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
214        self.files.iter().map(|(p, i)| (p.as_path(), i))
215    }
216
217    /// Clear the entire index
    ///
    /// Empties both the file map and the reverse dependency graph. The version
    /// counter is not reset; it is bumped like for any other change.
218    pub fn clear(&mut self) {
219        self.files.clear();
220        self.reverse_deps.clear();
221        self.version = self.version.wrapping_add(1);
222    }
223
224    /// Update a file's index and maintain reverse dependencies
225    ///
226    /// This method:
227    /// 1. Removes this file as a source (dependent) from all reverse deps
228    /// 2. Inserts the new file index
229    /// 3. Builds new reverse deps from cross_file_links
230    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
231        // Remove this file as a source (dependent) from all target entries
232        // Note: We don't remove it as a target - other files may still link to it
233        self.clear_reverse_deps_as_source(path);
234
235        // Build new reverse deps from cross_file_links
236        for link in &index.cross_file_links {
237            let target = self.resolve_target_path(path, &link.target_path);
238            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
239        }
240
241        self.files.insert(path.to_path_buf(), index);
242        self.version = self.version.wrapping_add(1);
243    }
244
245    /// Get files that depend on (link to) the given file
246    ///
247    /// Returns a list of file paths that contain links targeting this file.
248    /// Used to re-lint dependent files when a target file changes.
249    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
250        self.reverse_deps
251            .get(path)
252            .map(|set| set.iter().cloned().collect())
253            .unwrap_or_default()
254    }
255
256    /// Check if a file needs re-indexing based on its content hash
257    ///
258    /// Returns `true` if the file is not in the index or has a different hash.
259    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
260        self.files
261            .get(path)
262            .map(|f| f.content_hash != current_hash)
263            .unwrap_or(true)
264    }
265
266    /// Retain only files that exist in the given set, removing deleted files
267    ///
268    /// This prunes stale entries from the cache for files that no longer exist.
269    /// Returns the number of files removed.
270    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
271        let before_count = self.files.len();
272
273        // Collect files to remove
274        let to_remove: Vec<PathBuf> = self
275            .files
276            .keys()
277            .filter(|path| !current_files.contains(*path))
278            .cloned()
279            .collect();
280
281        // Remove each file properly (clears reverse deps)
282        for path in &to_remove {
283            self.remove_file(path);
284        }
285
286        before_count - self.files.len()
287    }
288
289    /// Save the workspace index to a cache file
290    ///
291    /// Uses postcard for efficient binary serialization with:
292    /// - Magic header for file type validation
293    /// - Format version for compatibility detection
294    /// - Atomic writes (temp file + rename) to prevent corruption
295    #[cfg(feature = "native")]
296    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
297        use std::fs;
298        use std::io::Write;
299
300        // Ensure cache directory exists
301        fs::create_dir_all(cache_dir)?;
302
303        // Serialize the index data using postcard
304        let encoded = postcard::to_allocvec(self)
305            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
306
307        // Build versioned cache file: [magic][version][data]
308        let mut cache_data = Vec::with_capacity(8 + encoded.len());
309        cache_data.extend_from_slice(CACHE_MAGIC);
310        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
311        cache_data.extend_from_slice(&encoded);
312
313        // Write atomically: write to temp file then rename
314        let final_path = cache_dir.join(CACHE_FILE_NAME);
315        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
316
317        // Write to temp file
318        {
319            let mut file = fs::File::create(&temp_path)?;
320            file.write_all(&cache_data)?;
321            file.sync_all()?;
322        }
323
324        // Atomic rename
325        fs::rename(&temp_path, &final_path)?;
326
327        log::debug!(
328            "Saved workspace index to cache: {} files, {} bytes (format v{})",
329            self.files.len(),
330            cache_data.len(),
331            CACHE_FORMAT_VERSION
332        );
333
334        Ok(())
335    }
336
337    /// Load the workspace index from a cache file
338    ///
339    /// Returns `None` if:
340    /// - Cache file doesn't exist
341    /// - Magic header doesn't match
342    /// - Format version is incompatible
343    /// - Data is corrupted
344    #[cfg(feature = "native")]
345    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
346        use std::fs;
347
348        let path = cache_dir.join(CACHE_FILE_NAME);
349        let data = fs::read(&path).ok()?;
350
351        // Validate header: need at least 8 bytes for magic + version
352        if data.len() < 8 {
353            log::warn!("Workspace index cache too small, discarding");
354            let _ = fs::remove_file(&path);
355            return None;
356        }
357
358        // Check magic header
359        if &data[0..4] != CACHE_MAGIC {
360            log::warn!("Workspace index cache has invalid magic header, discarding");
361            let _ = fs::remove_file(&path);
362            return None;
363        }
364
365        // Check format version
366        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
367        if version != CACHE_FORMAT_VERSION {
368            log::info!(
369                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
370            );
371            let _ = fs::remove_file(&path);
372            return None;
373        }
374
375        // Deserialize the index data using postcard
376        match postcard::from_bytes::<Self>(&data[8..]) {
377            Ok(index) => {
378                log::debug!(
379                    "Loaded workspace index from cache: {} files (format v{})",
380                    index.files.len(),
381                    version
382                );
383                Some(index)
384            }
385            Err(e) => {
386                log::warn!("Failed to deserialize workspace index cache: {e}");
387                let _ = fs::remove_file(&path);
388                None
389            }
390        }
391    }
392
393    /// Remove a file as a source from all reverse dependency entries
394    ///
395    /// This removes the file from being listed as a dependent in all target entries.
396    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
397    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
398        for deps in self.reverse_deps.values_mut() {
399            deps.remove(path);
400        }
401        // Clean up empty entries
402        self.reverse_deps.retain(|_, deps| !deps.is_empty());
403    }
404
405    /// Remove a file completely from reverse dependency tracking
406    ///
407    /// Removes the file as both a source (dependent) and as a target.
408    /// Used when deleting a file from the index.
    /// Dropping the target entry also forgets which files linked to `path`.
409    fn clear_reverse_deps_for(&mut self, path: &Path) {
410        // Remove as source (dependent): takes `path` out of every dependent set
411        self.clear_reverse_deps_as_source(path);
412
413        // Also remove as target: drops the entry keyed by `path`, if any
414        self.reverse_deps.remove(path);
415    }
416
417    /// Resolve a relative path from a source file to an absolute target path
418    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
419        // Get the directory containing the source file
420        let source_dir = source_file.parent().unwrap_or(Path::new(""));
421
422        // Join with the relative target and normalize
423        let target = source_dir.join(relative_target);
424
425        // Normalize the path (handle .., ., etc.)
426        Self::normalize_path(&target)
427    }
428
429    /// Normalize a path by resolving . and .. components
430    fn normalize_path(path: &Path) -> PathBuf {
431        let mut components = Vec::new();
432
433        for component in path.components() {
434            match component {
435                std::path::Component::ParentDir => {
436                    // Go up one level if possible
437                    if !components.is_empty() {
438                        components.pop();
439                    }
440                }
441                std::path::Component::CurDir => {
442                    // Skip current directory markers
443                }
444                _ => {
445                    components.push(component);
446                }
447            }
448        }
449
450        components.iter().collect()
451    }
452}
453
454impl FileIndex {
455    /// Create a new empty file index
    ///
    /// Same as `FileIndex::default()`: every collection is empty and the
    /// content hash is the empty string.
456    pub fn new() -> Self {
457        Self::default()
458    }
459
460    /// Create a file index with the given content hash
    ///
    /// All other fields start out empty (their `Default` values).
461    pub fn with_hash(content_hash: String) -> Self {
462        Self {
463            content_hash,
464            ..Default::default()
465        }
466    }
467
468    /// Add a heading to the index
469    ///
470    /// Also updates the anchor lookup map for O(1) anchor queries
471    pub fn add_heading(&mut self, heading: HeadingIndex) {
472        let index = self.headings.len();
473
474        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
475        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
476
477        // Add custom anchor if present
478        if let Some(ref custom) = heading.custom_anchor {
479            self.anchor_to_heading.insert(custom.to_lowercase(), index);
480        }
481
482        self.headings.push(heading);
483    }
484
485    /// Check if an anchor exists in this file (O(1) lookup)
486    ///
487    /// Returns true if the anchor matches any of:
488    /// - Auto-generated heading anchors
489    /// - Custom heading anchors (from {#id} syntax on headings)
490    /// - HTML anchors (from <a id="..."> or <element id="...">)
491    /// - Attribute anchors (from { #id } syntax on non-heading elements)
492    ///
493    /// Matching is case-insensitive.
494    pub fn has_anchor(&self, anchor: &str) -> bool {
495        let lower = anchor.to_lowercase();
496        self.anchor_to_heading.contains_key(&lower)
497            || self.html_anchors.contains(&lower)
498            || self.attribute_anchors.contains(&lower)
499    }
500
501    /// Add an HTML anchor (from <a id="..."> or <element id="..."> tags)
502    pub fn add_html_anchor(&mut self, anchor: String) {
503        if !anchor.is_empty() {
504            self.html_anchors.insert(anchor.to_lowercase());
505        }
506    }
507
508    /// Add an attribute anchor (from { #id } syntax on non-heading elements)
509    pub fn add_attribute_anchor(&mut self, anchor: String) {
510        if !anchor.is_empty() {
511            self.attribute_anchors.insert(anchor.to_lowercase());
512        }
513    }
514
515    /// Get the heading index for an anchor (O(1) lookup)
516    ///
517    /// Returns the index into `self.headings` if found.
518    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
519        self.anchor_to_heading
520            .get(&anchor.to_lowercase())
521            .and_then(|&idx| self.headings.get(idx))
522    }
523
524    /// Add a reference link to the index
    ///
    /// Links are appended as-is; unlike `add_cross_file_link`, no
    /// de-duplication is performed.
525    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
526        self.reference_links.push(link);
527    }
528
529    /// Check if a rule is disabled at a specific line
530    ///
531    /// Used by cross-file rules to respect inline disable directives.
532    /// Checks both file-wide disables and line-specific disables.
533    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
534        // Check file-wide disables (highest priority)
535        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
536            return true;
537        }
538
539        // Check line-specific disables
540        if let Some(rules) = self.line_disabled_rules.get(&line) {
541            return rules.contains("*") || rules.contains(rule_name);
542        }
543
544        false
545    }
546
547    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line)
548    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
549        // Deduplicate: multiple rules may contribute the same link with different columns
550        // (e.g., MD051 uses link start, MD057 uses URL start)
551        let is_duplicate = self.cross_file_links.iter().any(|existing| {
552            existing.target_path == link.target_path && existing.fragment == link.fragment && existing.line == link.line
553        });
554        if !is_duplicate {
555            self.cross_file_links.push(link);
556        }
557    }
558
559    /// Add a defined reference ID (e.g., from [ref]: url)
    ///
    /// The ID is stored exactly as given (no case folding or trimming here).
560    pub fn add_defined_reference(&mut self, ref_id: String) {
561        self.defined_references.insert(ref_id);
562    }
563
564    /// Check if a reference ID has an explicit definition
    ///
    /// This is an exact, case-sensitive set lookup.
    /// NOTE(review): presumably callers normalize IDs before storing and
    /// querying them - confirm against the code that fills `defined_references`.
565    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
566        self.defined_references.contains(ref_id)
567    }
568
569    /// Check if the content hash matches
    ///
    /// Plain string equality between the stored hash and `hash`.
570    pub fn hash_matches(&self, hash: &str) -> bool {
571        self.content_hash == hash
572    }
573
574    /// Get the number of headings (length of `headings`)
575    pub fn heading_count(&self) -> usize {
576        self.headings.len()
577    }
578
579    /// Get the number of reference links (length of `reference_links`)
580    pub fn reference_link_count(&self) -> usize {
581        self.reference_links.len()
582    }
583}
584
585#[cfg(test)]
586mod tests {
587    use super::*;
588
589    #[test]
590    fn test_workspace_index_basic() {
591        let mut index = WorkspaceIndex::new();
592        assert_eq!(index.file_count(), 0);
593        assert_eq!(index.version(), 0);
594
595        let mut file_index = FileIndex::with_hash("abc123".to_string());
596        file_index.add_heading(HeadingIndex {
597            text: "Installation".to_string(),
598            auto_anchor: "installation".to_string(),
599            custom_anchor: None,
600            line: 1,
601        });
602
603        index.insert_file(PathBuf::from("docs/install.md"), file_index);
604        assert_eq!(index.file_count(), 1);
605        assert_eq!(index.version(), 1);
606
607        assert!(index.contains_file(Path::new("docs/install.md")));
608        assert!(!index.contains_file(Path::new("docs/other.md")));
609    }
610
611    #[test]
612    fn test_vulnerable_anchors() {
613        let mut index = WorkspaceIndex::new();
614
615        // File 1: heading without custom anchor (vulnerable)
616        let mut file1 = FileIndex::new();
617        file1.add_heading(HeadingIndex {
618            text: "Getting Started".to_string(),
619            auto_anchor: "getting-started".to_string(),
620            custom_anchor: None,
621            line: 1,
622        });
623        index.insert_file(PathBuf::from("docs/guide.md"), file1);
624
625        // File 2: heading with custom anchor (not vulnerable)
626        let mut file2 = FileIndex::new();
627        file2.add_heading(HeadingIndex {
628            text: "Installation".to_string(),
629            auto_anchor: "installation".to_string(),
630            custom_anchor: Some("install".to_string()),
631            line: 1,
632        });
633        index.insert_file(PathBuf::from("docs/install.md"), file2);
634
635        let vulnerable = index.get_vulnerable_anchors();
636        assert_eq!(vulnerable.len(), 1);
637        assert!(vulnerable.contains_key("getting-started"));
638        assert!(!vulnerable.contains_key("installation"));
639
640        let anchors = vulnerable.get("getting-started").unwrap();
641        assert_eq!(anchors.len(), 1);
642        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
643        assert_eq!(anchors[0].text, "Getting Started");
644    }
645
646    #[test]
647    fn test_vulnerable_anchors_multiple_files_same_anchor() {
648        // Multiple files can have headings with the same auto-generated anchor
649        // get_vulnerable_anchors() should collect all of them
650        let mut index = WorkspaceIndex::new();
651
652        // File 1: has "Installation" heading (vulnerable)
653        let mut file1 = FileIndex::new();
654        file1.add_heading(HeadingIndex {
655            text: "Installation".to_string(),
656            auto_anchor: "installation".to_string(),
657            custom_anchor: None,
658            line: 1,
659        });
660        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
661
662        // File 2: also has "Installation" heading with same anchor (vulnerable)
663        let mut file2 = FileIndex::new();
664        file2.add_heading(HeadingIndex {
665            text: "Installation".to_string(),
666            auto_anchor: "installation".to_string(),
667            custom_anchor: None,
668            line: 5,
669        });
670        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
671
672        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
673        let mut file3 = FileIndex::new();
674        file3.add_heading(HeadingIndex {
675            text: "Installation".to_string(),
676            auto_anchor: "installation".to_string(),
677            custom_anchor: Some("install".to_string()),
678            line: 10,
679        });
680        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
681
682        let vulnerable = index.get_vulnerable_anchors();
683        assert_eq!(vulnerable.len(), 1); // One unique anchor
684        assert!(vulnerable.contains_key("installation"));
685
686        let anchors = vulnerable.get("installation").unwrap();
687        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
688        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
689
690        // Verify both files are represented
691        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
692        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
693        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
694    }
695
696    #[test]
697    fn test_file_index_hash() {
698        let index = FileIndex::with_hash("hash123".to_string());
699        assert!(index.hash_matches("hash123"));
700        assert!(!index.hash_matches("other"));
701    }
702
703    #[test]
704    fn test_version_increment() {
705        let mut index = WorkspaceIndex::new();
706        assert_eq!(index.version(), 0);
707
708        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
709        assert_eq!(index.version(), 1);
710
711        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
712        assert_eq!(index.version(), 2);
713
714        index.remove_file(Path::new("a.md"));
715        assert_eq!(index.version(), 3);
716
717        // Removing non-existent file doesn't increment
718        index.remove_file(Path::new("nonexistent.md"));
719        assert_eq!(index.version(), 3);
720    }
721
722    #[test]
723    fn test_reverse_deps_basic() {
724        let mut index = WorkspaceIndex::new();
725
726        // File A links to file B
727        let mut file_a = FileIndex::new();
728        file_a.add_cross_file_link(CrossFileLinkIndex {
729            target_path: "b.md".to_string(),
730            fragment: "section".to_string(),
731            line: 10,
732            column: 5,
733        });
734        index.update_file(Path::new("docs/a.md"), file_a);
735
736        // Check that B has A as a dependent
737        let dependents = index.get_dependents(Path::new("docs/b.md"));
738        assert_eq!(dependents.len(), 1);
739        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
740
741        // A has no dependents
742        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
743        assert!(a_dependents.is_empty());
744    }
745
746    #[test]
747    fn test_reverse_deps_multiple() {
748        let mut index = WorkspaceIndex::new();
749
750        // Files A and C both link to B
751        let mut file_a = FileIndex::new();
752        file_a.add_cross_file_link(CrossFileLinkIndex {
753            target_path: "../b.md".to_string(),
754            fragment: "".to_string(),
755            line: 1,
756            column: 1,
757        });
758        index.update_file(Path::new("docs/sub/a.md"), file_a);
759
760        let mut file_c = FileIndex::new();
761        file_c.add_cross_file_link(CrossFileLinkIndex {
762            target_path: "b.md".to_string(),
763            fragment: "".to_string(),
764            line: 1,
765            column: 1,
766        });
767        index.update_file(Path::new("docs/c.md"), file_c);
768
769        // B should have both A and C as dependents
770        let dependents = index.get_dependents(Path::new("docs/b.md"));
771        assert_eq!(dependents.len(), 2);
772        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
773        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
774    }
775
776    #[test]
777    fn test_reverse_deps_update_clears_old() {
778        let mut index = WorkspaceIndex::new();
779
780        // File A initially links to B
781        let mut file_a = FileIndex::new();
782        file_a.add_cross_file_link(CrossFileLinkIndex {
783            target_path: "b.md".to_string(),
784            fragment: "".to_string(),
785            line: 1,
786            column: 1,
787        });
788        index.update_file(Path::new("docs/a.md"), file_a);
789
790        // Verify B has A as dependent
791        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
792
793        // Update A to link to C instead of B
794        let mut file_a_updated = FileIndex::new();
795        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
796            target_path: "c.md".to_string(),
797            fragment: "".to_string(),
798            line: 1,
799            column: 1,
800        });
801        index.update_file(Path::new("docs/a.md"), file_a_updated);
802
803        // B should no longer have A as dependent
804        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
805
806        // C should now have A as dependent
807        let c_deps = index.get_dependents(Path::new("docs/c.md"));
808        assert_eq!(c_deps.len(), 1);
809        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
810    }
811
812    #[test]
813    fn test_reverse_deps_remove_file() {
814        let mut index = WorkspaceIndex::new();
815
816        // File A links to B
817        let mut file_a = FileIndex::new();
818        file_a.add_cross_file_link(CrossFileLinkIndex {
819            target_path: "b.md".to_string(),
820            fragment: "".to_string(),
821            line: 1,
822            column: 1,
823        });
824        index.update_file(Path::new("docs/a.md"), file_a);
825
826        // Verify B has A as dependent
827        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
828
829        // Remove file A
830        index.remove_file(Path::new("docs/a.md"));
831
832        // B should no longer have any dependents
833        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
834    }
835
836    #[test]
837    fn test_normalize_path() {
838        // Test .. handling
839        let path = Path::new("docs/sub/../other.md");
840        let normalized = WorkspaceIndex::normalize_path(path);
841        assert_eq!(normalized, PathBuf::from("docs/other.md"));
842
843        // Test . handling
844        let path2 = Path::new("docs/./other.md");
845        let normalized2 = WorkspaceIndex::normalize_path(path2);
846        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
847
848        // Test multiple ..
849        let path3 = Path::new("a/b/c/../../d.md");
850        let normalized3 = WorkspaceIndex::normalize_path(path3);
851        assert_eq!(normalized3, PathBuf::from("a/d.md"));
852    }
853
854    #[test]
855    fn test_clear_clears_reverse_deps() {
856        let mut index = WorkspaceIndex::new();
857
858        // File A links to B
859        let mut file_a = FileIndex::new();
860        file_a.add_cross_file_link(CrossFileLinkIndex {
861            target_path: "b.md".to_string(),
862            fragment: "".to_string(),
863            line: 1,
864            column: 1,
865        });
866        index.update_file(Path::new("docs/a.md"), file_a);
867
868        // Verify B has A as dependent
869        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
870
871        // Clear the index
872        index.clear();
873
874        // Both files and reverse deps should be cleared
875        assert_eq!(index.file_count(), 0);
876        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
877    }
878
879    #[test]
880    fn test_is_file_stale() {
881        let mut index = WorkspaceIndex::new();
882
883        // Non-existent file is always stale
884        assert!(index.is_file_stale(Path::new("nonexistent.md"), "hash123"));
885
886        // Add a file with known hash
887        let file_index = FileIndex::with_hash("hash123".to_string());
888        index.insert_file(PathBuf::from("docs/test.md"), file_index);
889
890        // Same hash means not stale
891        assert!(!index.is_file_stale(Path::new("docs/test.md"), "hash123"));
892
893        // Different hash means stale
894        assert!(index.is_file_stale(Path::new("docs/test.md"), "different_hash"));
895    }
896
897    #[cfg(feature = "native")]
898    #[test]
899    fn test_cache_roundtrip() {
900        use std::fs;
901
902        // Create a temp directory
903        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_roundtrip");
904        let _ = fs::remove_dir_all(&temp_dir);
905        fs::create_dir_all(&temp_dir).unwrap();
906
907        // Create an index with some data
908        let mut index = WorkspaceIndex::new();
909
910        let mut file1 = FileIndex::with_hash("abc123".to_string());
911        file1.add_heading(HeadingIndex {
912            text: "Test Heading".to_string(),
913            auto_anchor: "test-heading".to_string(),
914            custom_anchor: Some("test".to_string()),
915            line: 1,
916        });
917        file1.add_cross_file_link(CrossFileLinkIndex {
918            target_path: "./other.md".to_string(),
919            fragment: "section".to_string(),
920            line: 5,
921            column: 3,
922        });
923        index.update_file(Path::new("docs/file1.md"), file1);
924
925        let mut file2 = FileIndex::with_hash("def456".to_string());
926        file2.add_heading(HeadingIndex {
927            text: "Another Heading".to_string(),
928            auto_anchor: "another-heading".to_string(),
929            custom_anchor: None,
930            line: 1,
931        });
932        index.update_file(Path::new("docs/other.md"), file2);
933
934        // Save to cache
935        index.save_to_cache(&temp_dir).expect("Failed to save cache");
936
937        // Verify cache file exists
938        assert!(temp_dir.join("workspace_index.bin").exists());
939
940        // Load from cache
941        let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");
942
943        // Verify data matches
944        assert_eq!(loaded.file_count(), 2);
945        assert!(loaded.contains_file(Path::new("docs/file1.md")));
946        assert!(loaded.contains_file(Path::new("docs/other.md")));
947
948        // Check file1 details
949        let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
950        assert_eq!(file1_loaded.content_hash, "abc123");
951        assert_eq!(file1_loaded.headings.len(), 1);
952        assert_eq!(file1_loaded.headings[0].text, "Test Heading");
953        assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
954        assert_eq!(file1_loaded.cross_file_links.len(), 1);
955        assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");
956
957        // Check reverse deps were serialized correctly
958        let dependents = loaded.get_dependents(Path::new("docs/other.md"));
959        assert_eq!(dependents.len(), 1);
960        assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));
961
962        // Clean up
963        let _ = fs::remove_dir_all(&temp_dir);
964    }
965
966    #[cfg(feature = "native")]
967    #[test]
968    fn test_cache_missing_file() {
969        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
970        let _ = std::fs::remove_dir_all(&temp_dir);
971
972        // Should return None for non-existent cache
973        let result = WorkspaceIndex::load_from_cache(&temp_dir);
974        assert!(result.is_none());
975    }
976
977    #[cfg(feature = "native")]
978    #[test]
979    fn test_cache_corrupted_file() {
980        use std::fs;
981
982        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_corrupted");
983        let _ = fs::remove_dir_all(&temp_dir);
984        fs::create_dir_all(&temp_dir).unwrap();
985
986        // Write corrupted data (too small for header)
987        fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();
988
989        // Should return None for corrupted cache (and remove the file)
990        let result = WorkspaceIndex::load_from_cache(&temp_dir);
991        assert!(result.is_none());
992
993        // Corrupted file should be removed
994        assert!(!temp_dir.join("workspace_index.bin").exists());
995
996        // Clean up
997        let _ = fs::remove_dir_all(&temp_dir);
998    }
999
1000    #[cfg(feature = "native")]
1001    #[test]
1002    fn test_cache_invalid_magic() {
1003        use std::fs;
1004
1005        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_invalid_magic");
1006        let _ = fs::remove_dir_all(&temp_dir);
1007        fs::create_dir_all(&temp_dir).unwrap();
1008
1009        // Write data with wrong magic header
1010        let mut data = Vec::new();
1011        data.extend_from_slice(b"XXXX"); // Wrong magic
1012        data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
1013        data.extend_from_slice(&[0; 100]); // Some garbage data
1014        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1015
1016        // Should return None for invalid magic
1017        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1018        assert!(result.is_none());
1019
1020        // File should be removed
1021        assert!(!temp_dir.join("workspace_index.bin").exists());
1022
1023        // Clean up
1024        let _ = fs::remove_dir_all(&temp_dir);
1025    }
1026
1027    #[cfg(feature = "native")]
1028    #[test]
1029    fn test_cache_version_mismatch() {
1030        use std::fs;
1031
1032        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_version_mismatch");
1033        let _ = fs::remove_dir_all(&temp_dir);
1034        fs::create_dir_all(&temp_dir).unwrap();
1035
1036        // Write data with correct magic but wrong version
1037        let mut data = Vec::new();
1038        data.extend_from_slice(b"RWSI"); // Correct magic
1039        data.extend_from_slice(&999u32.to_le_bytes()); // Future version
1040        data.extend_from_slice(&[0; 100]); // Some garbage data
1041        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1042
1043        // Should return None for version mismatch
1044        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1045        assert!(result.is_none());
1046
1047        // File should be removed to trigger rebuild
1048        assert!(!temp_dir.join("workspace_index.bin").exists());
1049
1050        // Clean up
1051        let _ = fs::remove_dir_all(&temp_dir);
1052    }
1053
1054    #[cfg(feature = "native")]
1055    #[test]
1056    fn test_cache_atomic_write() {
1057        use std::fs;
1058
1059        // Test that atomic writes work (no temp files left behind)
1060        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_atomic");
1061        let _ = fs::remove_dir_all(&temp_dir);
1062        fs::create_dir_all(&temp_dir).unwrap();
1063
1064        let index = WorkspaceIndex::new();
1065        index.save_to_cache(&temp_dir).expect("Failed to save");
1066
1067        // Only the final cache file should exist, no temp files
1068        let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
1069        assert_eq!(entries.len(), 1);
1070        assert!(temp_dir.join("workspace_index.bin").exists());
1071
1072        // Clean up
1073        let _ = fs::remove_dir_all(&temp_dir);
1074    }
1075
1076    #[test]
1077    fn test_has_anchor_auto_generated() {
1078        let mut file_index = FileIndex::new();
1079        file_index.add_heading(HeadingIndex {
1080            text: "Installation Guide".to_string(),
1081            auto_anchor: "installation-guide".to_string(),
1082            custom_anchor: None,
1083            line: 1,
1084        });
1085
1086        // Should find by auto-generated anchor
1087        assert!(file_index.has_anchor("installation-guide"));
1088
1089        // Case-insensitive matching
1090        assert!(file_index.has_anchor("Installation-Guide"));
1091        assert!(file_index.has_anchor("INSTALLATION-GUIDE"));
1092
1093        // Should not find non-existent anchor
1094        assert!(!file_index.has_anchor("nonexistent"));
1095    }
1096
1097    #[test]
1098    fn test_has_anchor_custom() {
1099        let mut file_index = FileIndex::new();
1100        file_index.add_heading(HeadingIndex {
1101            text: "Installation Guide".to_string(),
1102            auto_anchor: "installation-guide".to_string(),
1103            custom_anchor: Some("install".to_string()),
1104            line: 1,
1105        });
1106
1107        // Should find by auto-generated anchor
1108        assert!(file_index.has_anchor("installation-guide"));
1109
1110        // Should also find by custom anchor
1111        assert!(file_index.has_anchor("install"));
1112        assert!(file_index.has_anchor("Install")); // case-insensitive
1113
1114        // Should not find non-existent anchor
1115        assert!(!file_index.has_anchor("nonexistent"));
1116    }
1117
1118    #[test]
1119    fn test_get_heading_by_anchor() {
1120        let mut file_index = FileIndex::new();
1121        file_index.add_heading(HeadingIndex {
1122            text: "Installation Guide".to_string(),
1123            auto_anchor: "installation-guide".to_string(),
1124            custom_anchor: Some("install".to_string()),
1125            line: 10,
1126        });
1127        file_index.add_heading(HeadingIndex {
1128            text: "Configuration".to_string(),
1129            auto_anchor: "configuration".to_string(),
1130            custom_anchor: None,
1131            line: 20,
1132        });
1133
1134        // Get by auto anchor
1135        let heading = file_index.get_heading_by_anchor("installation-guide");
1136        assert!(heading.is_some());
1137        assert_eq!(heading.unwrap().text, "Installation Guide");
1138        assert_eq!(heading.unwrap().line, 10);
1139
1140        // Get by custom anchor
1141        let heading = file_index.get_heading_by_anchor("install");
1142        assert!(heading.is_some());
1143        assert_eq!(heading.unwrap().text, "Installation Guide");
1144
1145        // Get second heading
1146        let heading = file_index.get_heading_by_anchor("configuration");
1147        assert!(heading.is_some());
1148        assert_eq!(heading.unwrap().text, "Configuration");
1149        assert_eq!(heading.unwrap().line, 20);
1150
1151        // Non-existent
1152        assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
1153    }
1154
1155    #[test]
1156    fn test_anchor_lookup_many_headings() {
1157        // Test that O(1) lookup works with many headings
1158        let mut file_index = FileIndex::new();
1159
1160        // Add 100 headings
1161        for i in 0..100 {
1162            file_index.add_heading(HeadingIndex {
1163                text: format!("Heading {i}"),
1164                auto_anchor: format!("heading-{i}"),
1165                custom_anchor: Some(format!("h{i}")),
1166                line: i + 1,
1167            });
1168        }
1169
1170        // Verify all can be found
1171        for i in 0..100 {
1172            assert!(file_index.has_anchor(&format!("heading-{i}")));
1173            assert!(file_index.has_anchor(&format!("h{i}")));
1174
1175            let heading = file_index.get_heading_by_anchor(&format!("heading-{i}"));
1176            assert!(heading.is_some());
1177            assert_eq!(heading.unwrap().line, i + 1);
1178        }
1179    }
1180}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs