Skip to main content

rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: postcard-serialized WorkspaceIndex]
19//! ```
20
21use regex::Regex;
22use serde::{Deserialize, Serialize};
23use std::collections::{HashMap, HashSet};
24use std::path::{Path, PathBuf};
25use std::sync::LazyLock;
26
27use crate::lint_context::LintContext;
28
29// =============================================================================
30// URL Decoding Helper
31// =============================================================================
32
/// Map an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its numeric value (0-15).
///
/// Returns `None` for every byte that is not a hex digit.
fn hex_digit_to_value(c: u8) -> Option<u8> {
    // `to_digit(16)` accepts exactly the ASCII hex digits in either case. Bytes
    // >= 0x80 become Latin-1 code points here, none of which are digits.
    char::from(c)
        .to_digit(16)
        .and_then(|value| u8::try_from(value).ok())
}
42
43/// URL-decode a string, handling percent-encoded characters.
44/// Returns the decoded string, or the original if decoding fails.
45/// Used for matching URL-encoded CJK fragments against raw anchors.
46fn url_decode(s: &str) -> String {
47    // Fast path: no percent signs means no encoding
48    if !s.contains('%') {
49        return s.to_string();
50    }
51
52    let bytes = s.as_bytes();
53    let mut result = Vec::with_capacity(bytes.len());
54    let mut i = 0;
55
56    while i < bytes.len() {
57        if bytes[i] == b'%' && i + 2 < bytes.len() {
58            // Try to parse the two hex digits following %
59            let hex1 = bytes[i + 1];
60            let hex2 = bytes[i + 2];
61            if let (Some(d1), Some(d2)) = (hex_digit_to_value(hex1), hex_digit_to_value(hex2)) {
62                result.push(d1 * 16 + d2);
63                i += 3;
64                continue;
65            }
66        }
67        result.push(bytes[i]);
68        i += 1;
69    }
70
71    // Convert to UTF-8, falling back to original if invalid
72    String::from_utf8(result).unwrap_or_else(|_| s.to_string())
73}
74
75// =============================================================================
76// Shared cross-file link extraction utilities
77//
78// These regexes and helpers are the canonical implementation for extracting
79// cross-file links. Both MD057 and LSP use this shared code path for correct
80// position tracking.
81// =============================================================================
82
/// Regex to match the start of a link or image: an optional `!` followed by
/// `[text]`, where `text` contains no `]`.
static LINK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!?\[[^\]]*\]").unwrap());

/// Regex to extract the URL from an angle-bracketed markdown link
/// Format: `](<URL>)` or `](<URL> "title")`
///
/// Capture group 1 is the text between `<` and `>`. Group 2 is an optional
/// `#fragment` that directly follows the closing `>`.
static URL_EXTRACT_ANGLE_BRACKET_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\]\(\s*<([^>]+)>(#[^\)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to extract the URL from a normal markdown link (without angle brackets)
/// Format: `](URL)` or `](URL "title")`
///
/// Capture group 1 is the URL up to the first `>`, `)`, `#` or whitespace.
/// Group 2 is an optional `#fragment` that directly follows it.
static URL_EXTRACT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"]\(\s*([^>)\s#]+)(#[^)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to detect URLs with explicit schemes
///
/// Matches at the start of the string only: `scheme://`, `scheme:` (for
/// example `mailto:`), or a bare `www.` prefix.
pub(crate) static PROTOCOL_DOMAIN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^([a-zA-Z][a-zA-Z0-9+.-]*://|[a-zA-Z][a-zA-Z0-9+.-]*:|www\.)").unwrap());
99
/// Supported markdown file extensions
///
/// Each entry is lowercase and includes the leading dot, because
/// `is_markdown_file` tests them as suffixes of the lowercased path.
const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdx",
    ".mkd",
    ".mkdn",
    ".mdown",
    ".mdwn",
    ".qmd",
    ".rmd",
];
112
113/// Check if a path has a markdown extension (case-insensitive)
114#[inline]
115fn is_markdown_file(path: &str) -> bool {
116    let path_lower = path.to_lowercase();
117    MARKDOWN_EXTENSIONS.iter().any(|ext| path_lower.ends_with(ext))
118}
119
/// Return the part of `url` that precedes its query string or fragment.
///
/// The result is a prefix slice ending just before the first `?` or `#`,
/// whichever comes first; if neither occurs the whole input is returned.
fn strip_query_and_fragment(url: &str) -> &str {
    match url.find(|c: char| matches!(c, '?' | '#')) {
        Some(cut) => &url[..cut],
        None => url,
    }
}
133
/// Markdown file links extracted from a document, split by how they resolve.
///
/// Linting rules only understand `relative` links (resolved against the source
/// file's directory). `root_relative` links (leading `/`) are an LSP concept
/// resolved against the configured content roots, so they are kept separate to
/// avoid changing linting behavior.
///
/// Produced by `extract_cross_file_links`; the `Default` value has both
/// buckets empty.
#[derive(Debug, Default)]
pub struct ExtractedCrossFileLinks {
    /// Links resolved relative to the source file's directory.
    pub relative: Vec<CrossFileLinkIndex>,
    /// Root-relative links. `target_path` has the leading `/` stripped so it can
    /// be joined directly to a content root. Parent-traversal and
    /// protocol-relative (`//host`) links are excluded.
    pub root_relative: Vec<CrossFileLinkIndex>,
}
149
150/// Extract cross-file links from content using correct regex-based position tracking.
151///
152/// This is the canonical implementation used by both MD057 and LSP to ensure
153/// consistent and correct column positions for diagnostic reporting.
154///
155/// Returns one `CrossFileLinkIndex` per markdown file link, split into directory
156/// relative links and root-relative links (see `ExtractedCrossFileLinks`).
157pub fn extract_cross_file_links(ctx: &LintContext) -> ExtractedCrossFileLinks {
158    let content = ctx.content;
159
160    // Early returns for performance
161    if content.is_empty() || !content.contains("](") {
162        return ExtractedCrossFileLinks::default();
163    }
164
165    let mut links = ExtractedCrossFileLinks::default();
166    let lines: Vec<&str> = content.lines().collect();
167    let line_index = &ctx.line_index;
168
169    // Track which lines we've already processed to avoid duplicates
170    // (ctx.links may have multiple entries for the same line)
171    let mut processed_lines = HashSet::new();
172
173    for link in &ctx.links {
174        let line_idx = link.line - 1;
175        if line_idx >= lines.len() {
176            continue;
177        }
178
179        // Skip if we've already processed this line
180        if !processed_lines.insert(line_idx) {
181            continue;
182        }
183
184        let line = lines[line_idx];
185        if !line.contains("](") {
186            continue;
187        }
188
189        // Find all links in this line
190        for link_match in LINK_START_REGEX.find_iter(line) {
191            let start_pos = link_match.start();
192            let end_pos = link_match.end();
193
194            // Calculate absolute position for code span detection
195            let line_start_byte = line_index.get_line_start_byte(line_idx + 1).unwrap_or(0);
196            let absolute_start_pos = line_start_byte + start_pos;
197
198            // Skip if in code span
199            if ctx.is_in_code_span_byte(absolute_start_pos) {
200                continue;
201            }
202
203            // Extract the URL (group 1) and fragment (group 2)
204            // Try angle-bracket regex first (handles URLs with parens)
205            let caps_result = URL_EXTRACT_ANGLE_BRACKET_REGEX
206                .captures_at(line, end_pos - 1)
207                .or_else(|| URL_EXTRACT_REGEX.captures_at(line, end_pos - 1));
208
209            if let Some(caps) = caps_result
210                && let Some(url_group) = caps.get(1)
211            {
212                let file_path = url_group.as_str().trim();
213
214                // Root-relative links (leading `/`) resolve against content roots
215                // in the LSP, not the source directory, so they are captured in a
216                // separate bucket. Protocol-relative (`//host`) and parent-traversal
217                // links are excluded so they cannot escape a content root.
218                if let Some(rel) = file_path.strip_prefix('/') {
219                    if !rel.starts_with('/')
220                        && !Path::new(rel)
221                            .components()
222                            .any(|c| matches!(c, std::path::Component::ParentDir))
223                    {
224                        let stripped = strip_query_and_fragment(rel);
225                        if is_markdown_file(stripped) {
226                            let fragment = caps.get(2).map_or("", |m| m.as_str().trim_start_matches('#'));
227                            links.root_relative.push(CrossFileLinkIndex {
228                                target_path: stripped.to_string(),
229                                fragment: fragment.to_string(),
230                                line: link.line,
231                                column: url_group.start() + 1,
232                            });
233                        }
234                    }
235                    continue;
236                }
237
238                // Skip empty, external, template variables, framework aliases,
239                // fragment-only URLs, or rustdoc intra-doc links
240                if file_path.is_empty()
241                    || PROTOCOL_DOMAIN_REGEX.is_match(file_path)
242                    || file_path.starts_with("www.")
243                    || file_path.starts_with('#')
244                    || file_path.starts_with("{{")
245                    || file_path.starts_with("{%")
246                    || file_path.starts_with('~')
247                    || file_path.starts_with('@')
248                    || (file_path.starts_with('`') && file_path.ends_with('`'))
249                {
250                    continue;
251                }
252
253                // Strip query parameters before indexing
254                let file_path = strip_query_and_fragment(file_path);
255
256                // Get fragment from capture group 2 (includes # prefix)
257                let fragment = caps.get(2).map_or("", |m| m.as_str().trim_start_matches('#'));
258
259                // Only index markdown file links for cross-file validation
260                if is_markdown_file(file_path) {
261                    links.relative.push(CrossFileLinkIndex {
262                        target_path: file_path.to_string(),
263                        fragment: fragment.to_string(),
264                        line: link.line,
265                        column: url_group.start() + 1,
266                    });
267                }
268            }
269        }
270    }
271
272    links
273}
274
/// Magic bytes identifying a workspace index cache file
///
/// Written as the first four bytes by `save_to_cache` and checked by
/// `load_from_cache` before anything else is parsed.
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// Cache format version - increment when WorkspaceIndex serialization changes
/// or when the meaning of persisted fields changes such that older caches are
/// no longer correct. Version 8 forces a rebuild so the new `root_relative_links`
/// field is populated; earlier caches lack it, leaving find-references unable to
/// discover root-relative (`/path`) links until a rescan.
///
/// Stored little-endian directly after the magic bytes. A cache whose version
/// differs from this value is deleted on load and rebuilt.
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 8;

/// Cache file name within the version directory
///
/// Joined onto the `cache_dir` passed to `save_to_cache` / `load_from_cache`.
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
290
/// Workspace-wide index for cross-file analysis
///
/// Contains pre-extracted information from all markdown files in the workspace,
/// enabling rules to validate cross-file references efficiently.
///
/// The whole struct is what gets serialized into the on-disk cache.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct WorkspaceIndex {
    /// Map from file path to its extracted data
    files: HashMap<PathBuf, FileIndex>,
    /// Reverse dependency graph: target file → files that link to it
    /// Used to efficiently re-lint dependent files when a target changes.
    /// Keys are the normalized link targets computed by `update_file`.
    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
    /// Version counter for cache invalidation (incremented on any change)
    /// The increment wraps around on overflow.
    version: u64,
}
305
/// Index data extracted from a single file
///
/// The private anchor collections are maintained in pairs: a lowercased copy
/// for case-insensitive lookup and an exact copy for case-sensitive lookup.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FileIndex {
    /// Headings in this file with their anchors
    pub headings: Vec<HeadingIndex>,
    /// Reference links in this file (for cross-file analysis)
    pub reference_links: Vec<ReferenceLinkIndex>,
    /// Cross-file links in this file (for MD051 cross-file validation)
    /// `WorkspaceIndex::update_file` also derives reverse dependencies from these.
    pub cross_file_links: Vec<CrossFileLinkIndex>,
    /// Root-relative links (leading `/`) in this file. Resolved against the
    /// configured content roots by the LSP for go-to-definition, hover, and
    /// find-references. `target_path` has the leading `/` stripped. Linting does
    /// not use these, so they never affect diagnostics.
    #[serde(default)]
    pub root_relative_links: Vec<CrossFileLinkIndex>,
    /// Defined reference IDs (e.g., from `[ref]: url` definitions)
    /// Used to filter out reference links that have explicit definitions
    pub defined_references: HashSet<String>,
    /// Content hash for change detection
    /// Compared against the current hash by `WorkspaceIndex::is_file_stale`.
    pub content_hash: String,
    /// O(1) anchor lookup: lowercased anchor → heading index
    /// Includes both auto-generated and custom anchors.
    /// The value is a position in `headings`.
    anchor_to_heading: HashMap<String, usize>,
    /// O(1) anchor lookup with original case preserved → heading index.
    /// Used for `ignore_case = false` (markdownlint strict parity). Skipped at
    /// query time when the lowercase map is sufficient.
    #[serde(default)]
    anchor_to_heading_exact: HashMap<String, usize>,
    /// HTML anchors defined via `<a id="...">` or `<element id="...">` tags.
    /// Stored lowercase for case-insensitive matching.
    html_anchors: HashSet<String>,
    /// HTML anchors with original case preserved.
    /// Used for `ignore_case = false` (markdownlint strict parity).
    #[serde(default)]
    html_anchors_exact: HashSet<String>,
    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list).
    /// Can appear on any element, not just headings.
    /// Stored lowercase for case-insensitive matching.
    attribute_anchors: HashSet<String>,
    /// Attribute anchors with original case preserved.
    /// Used for `ignore_case = false` (markdownlint strict parity).
    #[serde(default)]
    attribute_anchors_exact: HashSet<String>,
    /// Rules disabled for the entire file (from inline comments)
    /// Used by cross-file rules to respect inline disable directives
    pub file_disabled_rules: HashSet<String>,
    /// Persistent disable/enable state transitions, sorted by line number.
    /// Each entry: (line, disabled_rules, enabled_rules). Use binary search to query.
    pub persistent_transitions: Vec<(usize, HashSet<String>, HashSet<String>)>,
    /// Rules disabled at specific lines via disable-line / disable-next-line
    /// Keyed by line number.
    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
}
358
/// Information about a heading for cross-file lookup
///
/// `FileIndex::add_heading` registers both `auto_anchor` and `custom_anchor`
/// (when present) as lookup keys for the heading.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingIndex {
    /// The heading text (e.g., "Installation Guide")
    pub text: String,
    /// Auto-generated anchor (e.g., "installation-guide")
    pub auto_anchor: String,
    /// Custom anchor if present (e.g., "install")
    pub custom_anchor: Option<String>,
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a Setext-style heading (underlined with = or -)
    #[serde(default)]
    pub is_setext: bool,
}
374
/// Information about a reference link for cross-file analysis
///
/// Describes one use of a reference (`[text][ref]`), not its definition.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceLinkIndex {
    /// The reference ID (the part in `[text][ref]`)
    pub reference_id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    pub column: usize,
}
385
/// Information about a cross-file link for validation
///
/// Built by `extract_cross_file_links` for links whose target has a markdown
/// file extension.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossFileLinkIndex {
    /// The target file path as written in the link, with any query string and
    /// fragment removed. For root-relative links the leading `/` is removed too.
    pub target_path: String,
    /// The fragment/anchor being linked to (without #)
    /// Empty when the link has no fragment.
    pub fragment: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    /// This is the byte offset of the URL within its line, plus one.
    pub column: usize,
}
398
/// Information about a vulnerable anchor (heading without custom ID)
///
/// Produced by `WorkspaceIndex::get_vulnerable_anchors`, one per heading that
/// has a non-empty auto-generated anchor and no custom anchor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VulnerableAnchor {
    /// File path where the heading is located
    pub file: PathBuf,
    /// Line number of the heading
    pub line: usize,
    /// The heading text
    pub text: String,
}
409
410impl WorkspaceIndex {
411    /// Create a new empty workspace index
412    pub fn new() -> Self {
413        Self::default()
414    }
415
416    /// Get the current version (for cache invalidation)
417    pub fn version(&self) -> u64 {
418        self.version
419    }
420
421    /// Get the number of indexed files
422    pub fn file_count(&self) -> usize {
423        self.files.len()
424    }
425
426    /// Check if a file is in the index
427    pub fn contains_file(&self, path: &Path) -> bool {
428        self.files.contains_key(path)
429    }
430
431    /// Get the index data for a specific file
432    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
433        self.files.get(path)
434    }
435
436    /// Insert or update a file's index data
437    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
438        self.files.insert(path, index);
439        self.version = self.version.wrapping_add(1);
440    }
441
442    /// Remove a file from the index
443    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
444        // Clean up reverse deps for this file
445        self.clear_reverse_deps_for(path);
446
447        let result = self.files.remove(path);
448        if result.is_some() {
449            self.version = self.version.wrapping_add(1);
450        }
451        result
452    }
453
454    /// Build a map of all "vulnerable" anchors across the workspace
455    ///
456    /// A vulnerable anchor is an auto-generated anchor for a heading that
457    /// does NOT have a custom anchor defined. These are problematic for
458    /// translated content because the anchor changes when the heading is translated.
459    ///
460    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
461    /// Multiple files can have headings with the same auto-generated anchor,
462    /// so we collect all occurrences.
463    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
464        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
465
466        for (file_path, file_index) in &self.files {
467            for heading in &file_index.headings {
468                // Only include headings WITHOUT custom anchors
469                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
470                    let anchor_key = heading.auto_anchor.to_lowercase();
471                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
472                        file: file_path.clone(),
473                        line: heading.line,
474                        text: heading.text.clone(),
475                    });
476                }
477            }
478        }
479
480        vulnerable
481    }
482
483    /// Get all headings across the workspace (for debugging/testing)
484    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
485        self.files
486            .iter()
487            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
488    }
489
490    /// Iterate over all files in the index
491    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
492        self.files.iter().map(|(p, i)| (p.as_path(), i))
493    }
494
495    /// Clear the entire index
496    pub fn clear(&mut self) {
497        self.files.clear();
498        self.reverse_deps.clear();
499        self.version = self.version.wrapping_add(1);
500    }
501
502    /// Update a file's index and maintain reverse dependencies
503    ///
504    /// This method:
505    /// 1. Removes this file as a source (dependent) from all reverse deps
506    /// 2. Inserts the new file index
507    /// 3. Builds new reverse deps from cross_file_links
508    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
509        // Remove this file as a source (dependent) from all target entries
510        // Note: We don't remove it as a target - other files may still link to it
511        self.clear_reverse_deps_as_source(path);
512
513        // Build new reverse deps from cross_file_links
514        for link in &index.cross_file_links {
515            let target = self.resolve_target_path(path, &link.target_path);
516            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
517        }
518
519        self.files.insert(path.to_path_buf(), index);
520        self.version = self.version.wrapping_add(1);
521    }
522
523    /// Get files that depend on (link to) the given file
524    ///
525    /// Returns a list of file paths that contain links targeting this file.
526    /// Used to re-lint dependent files when a target file changes.
527    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
528        self.reverse_deps
529            .get(path)
530            .map(|set| set.iter().cloned().collect())
531            .unwrap_or_default()
532    }
533
534    /// Check if a file needs re-indexing based on its content hash
535    ///
536    /// Returns `true` if the file is not in the index or has a different hash.
537    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
538        self.files.get(path).is_none_or(|f| f.content_hash != current_hash)
539    }
540
541    /// Retain only files that exist in the given set, removing deleted files
542    ///
543    /// This prunes stale entries from the cache for files that no longer exist.
544    /// Returns the number of files removed.
545    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
546        let before_count = self.files.len();
547
548        // Collect files to remove
549        let to_remove: Vec<PathBuf> = self
550            .files
551            .keys()
552            .filter(|path| !current_files.contains(*path))
553            .cloned()
554            .collect();
555
556        // Remove each file properly (clears reverse deps)
557        for path in &to_remove {
558            self.remove_file(path);
559        }
560
561        before_count - self.files.len()
562    }
563
564    /// Save the workspace index to a cache file
565    ///
566    /// Uses postcard for efficient binary serialization with:
567    /// - Magic header for file type validation
568    /// - Format version for compatibility detection
569    /// - Atomic writes (temp file + rename) to prevent corruption
570    #[cfg(feature = "native")]
571    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
572        use std::fs;
573        use std::io::Write;
574
575        // Ensure cache directory exists
576        fs::create_dir_all(cache_dir)?;
577
578        // Serialize the index data using postcard
579        let encoded = postcard::to_allocvec(self)
580            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
581
582        // Build versioned cache file: [magic][version][data]
583        let mut cache_data = Vec::with_capacity(8 + encoded.len());
584        cache_data.extend_from_slice(CACHE_MAGIC);
585        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
586        cache_data.extend_from_slice(&encoded);
587
588        // Write atomically: write to temp file then rename
589        let final_path = cache_dir.join(CACHE_FILE_NAME);
590        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
591
592        // Write to temp file
593        {
594            let mut file = fs::File::create(&temp_path)?;
595            file.write_all(&cache_data)?;
596            file.sync_all()?;
597        }
598
599        // Atomic rename
600        fs::rename(&temp_path, &final_path)?;
601
602        log::debug!(
603            "Saved workspace index to cache: {} files, {} bytes (format v{})",
604            self.files.len(),
605            cache_data.len(),
606            CACHE_FORMAT_VERSION
607        );
608
609        Ok(())
610    }
611
612    /// Load the workspace index from a cache file
613    ///
614    /// Returns `None` if:
615    /// - Cache file doesn't exist
616    /// - Magic header doesn't match
617    /// - Format version is incompatible
618    /// - Data is corrupted
619    #[cfg(feature = "native")]
620    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
621        use std::fs;
622
623        let path = cache_dir.join(CACHE_FILE_NAME);
624        let data = fs::read(&path).ok()?;
625
626        // Validate header: need at least 8 bytes for magic + version
627        if data.len() < 8 {
628            log::warn!("Workspace index cache too small, discarding");
629            let _ = fs::remove_file(&path);
630            return None;
631        }
632
633        // Check magic header
634        if &data[0..4] != CACHE_MAGIC {
635            log::warn!("Workspace index cache has invalid magic header, discarding");
636            let _ = fs::remove_file(&path);
637            return None;
638        }
639
640        // Check format version
641        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
642        if version != CACHE_FORMAT_VERSION {
643            log::info!(
644                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
645            );
646            let _ = fs::remove_file(&path);
647            return None;
648        }
649
650        // Deserialize the index data using postcard
651        match postcard::from_bytes::<Self>(&data[8..]) {
652            Ok(index) => {
653                log::debug!(
654                    "Loaded workspace index from cache: {} files (format v{})",
655                    index.files.len(),
656                    version
657                );
658                Some(index)
659            }
660            Err(e) => {
661                log::warn!("Failed to deserialize workspace index cache: {e}");
662                let _ = fs::remove_file(&path);
663                None
664            }
665        }
666    }
667
668    /// Remove a file as a source from all reverse dependency entries
669    ///
670    /// This removes the file from being listed as a dependent in all target entries.
671    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
672    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
673        for deps in self.reverse_deps.values_mut() {
674            deps.remove(path);
675        }
676        // Clean up empty entries
677        self.reverse_deps.retain(|_, deps| !deps.is_empty());
678    }
679
680    /// Remove a file completely from reverse dependency tracking
681    ///
682    /// Removes the file as both a source (dependent) and as a target.
683    /// Used when deleting a file from the index.
684    fn clear_reverse_deps_for(&mut self, path: &Path) {
685        // Remove as source (dependent)
686        self.clear_reverse_deps_as_source(path);
687
688        // Also remove as target
689        self.reverse_deps.remove(path);
690    }
691
692    /// Resolve a relative path from a source file to an absolute target path
693    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
694        // Get the directory containing the source file
695        let source_dir = source_file.parent().unwrap_or(Path::new(""));
696
697        // Join with the relative target and normalize
698        let target = source_dir.join(relative_target);
699
700        // Normalize the path (handle .., ., etc.)
701        Self::normalize_path(&target)
702    }
703
704    /// Normalize a path by resolving . and .. components
705    fn normalize_path(path: &Path) -> PathBuf {
706        let mut components = Vec::new();
707
708        for component in path.components() {
709            match component {
710                std::path::Component::ParentDir => {
711                    // Go up one level if possible
712                    if !components.is_empty() {
713                        components.pop();
714                    }
715                }
716                std::path::Component::CurDir => {
717                    // Skip current directory markers
718                }
719                _ => {
720                    components.push(component);
721                }
722            }
723        }
724
725        components.iter().collect()
726    }
727}
728
729impl FileIndex {
730    /// Create a new empty file index
731    pub fn new() -> Self {
732        Self::default()
733    }
734
735    /// Create a file index with the given content hash
736    pub fn with_hash(content_hash: String) -> Self {
737        Self {
738            content_hash,
739            ..Default::default()
740        }
741    }
742
743    /// Add a heading to the index
744    ///
745    /// Also updates the anchor lookup maps for O(1) anchor queries. Both
746    /// lowercased (for `ignore_case = true`) and case-preserving (for
747    /// `ignore_case = false`) maps are populated.
748    pub fn add_heading(&mut self, heading: HeadingIndex) {
749        let index = self.headings.len();
750
751        // Auto-generated anchor — slugs are already lowercase, but we still
752        // populate both maps so query-time dispatch is uniform.
753        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
754        self.anchor_to_heading_exact.insert(heading.auto_anchor.clone(), index);
755
756        // Custom anchor preserves original case as written by the author.
757        if let Some(ref custom) = heading.custom_anchor {
758            self.anchor_to_heading.insert(custom.to_lowercase(), index);
759            self.anchor_to_heading_exact.insert(custom.clone(), index);
760        }
761
762        self.headings.push(heading);
763    }
764
765    /// Add an alternative anchor that resolves to an existing heading.
766    /// Used for platform-specific anchor conventions (e.g., Python-Markdown `_N` dedup).
767    pub fn add_anchor_alias(&mut self, anchor: &str, heading_index: usize) {
768        if heading_index < self.headings.len() {
769            self.anchor_to_heading.insert(anchor.to_lowercase(), heading_index);
770            self.anchor_to_heading_exact.insert(anchor.to_string(), heading_index);
771        }
772    }
773
774    /// Check if an anchor exists in this file (O(1) lookup)
775    ///
776    /// Returns true if the anchor matches any of:
777    /// - Auto-generated heading anchors
778    /// - Custom heading anchors (from {#id} syntax on headings)
779    /// - HTML anchors (from `<a id="...">` or `<element id="...">`)
780    /// - Attribute anchors (from { #id } syntax on non-heading elements)
781    ///
782    /// Matching is case-insensitive. URL-encoded anchors (e.g., CJK characters
783    /// like `%E6%97%A5%E6%9C%AC%E8%AA%9E` for `日本語`) are decoded before matching.
784    pub fn has_anchor(&self, anchor: &str) -> bool {
785        self.has_anchor_with_case(anchor, true)
786    }
787
788    /// Check if an anchor exists in this file, with explicit case sensitivity.
789    ///
790    /// When `ignore_case` is `true`, behaves identically to [`has_anchor`] —
791    /// inputs are lowercased and matched against the lowercase storage.
792    /// When `false`, the input is compared as-is against parallel
793    /// case-preserving storage, matching markdownlint's strict behavior for
794    /// generated heading slugs, custom heading IDs, HTML anchors, and
795    /// attribute anchors.
796    pub fn has_anchor_with_case(&self, anchor: &str, ignore_case: bool) -> bool {
797        if self.lookup_anchor(anchor, ignore_case) {
798            return true;
799        }
800
801        // Slow path: if anchor contains percent-encoding, try decoded version
802        if anchor.contains('%') {
803            let decoded = url_decode(anchor);
804            if decoded != anchor {
805                return self.lookup_anchor(&decoded, ignore_case);
806            }
807        }
808
809        false
810    }
811
812    /// Direct anchor lookup, dispatching to the lowercase or exact-case
813    /// storage based on `ignore_case`.
814    fn lookup_anchor(&self, anchor: &str, ignore_case: bool) -> bool {
815        if ignore_case {
816            let lower = anchor.to_lowercase();
817            self.anchor_to_heading.contains_key(&lower)
818                || self.html_anchors.contains(&lower)
819                || self.attribute_anchors.contains(&lower)
820        } else {
821            self.anchor_to_heading_exact.contains_key(anchor)
822                || self.html_anchors_exact.contains(anchor)
823                || self.attribute_anchors_exact.contains(anchor)
824        }
825    }
826
827    /// Add an HTML anchor (from `<a id="...">` or `<element id="...">` tags).
828    /// Populates both lowercase (case-insensitive) and case-preserving sets.
829    pub fn add_html_anchor(&mut self, anchor: &str) {
830        if !anchor.is_empty() {
831            self.html_anchors.insert(anchor.to_lowercase());
832            self.html_anchors_exact.insert(anchor.to_string());
833        }
834    }
835
836    /// Add an attribute anchor (from { #id } syntax on non-heading elements).
837    /// Populates both lowercase (case-insensitive) and case-preserving sets.
838    pub fn add_attribute_anchor(&mut self, anchor: &str) {
839        if !anchor.is_empty() {
840            self.attribute_anchors.insert(anchor.to_lowercase());
841            self.attribute_anchors_exact.insert(anchor.to_string());
842        }
843    }
844
845    /// Get the heading index for an anchor (O(1) lookup)
846    ///
847    /// Returns the index into `self.headings` if found.
848    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
849        self.anchor_to_heading
850            .get(&anchor.to_lowercase())
851            .and_then(|&idx| self.headings.get(idx))
852    }
853
854    /// Add a reference link to the index
855    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
856        self.reference_links.push(link);
857    }
858
859    /// Check if a rule is disabled at a specific line
860    ///
861    /// Used by cross-file rules to respect inline disable directives.
862    /// Checks both file-wide disables and line-specific disables.
863    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
864        // Check file-wide disables (highest priority)
865        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
866            return true;
867        }
868
869        // Check line-specific disables (disable-line / disable-next-line)
870        if let Some(rules) = self.line_disabled_rules.get(&line)
871            && (rules.contains("*") || rules.contains(rule_name))
872        {
873            return true;
874        }
875
876        // Check persistent disable/enable transitions via binary search
877        if !self.persistent_transitions.is_empty() {
878            let idx = match self.persistent_transitions.binary_search_by_key(&line, |t| t.0) {
879                Ok(i) => Some(i),
880                Err(i) => {
881                    if i > 0 {
882                        Some(i - 1)
883                    } else {
884                        None
885                    }
886                }
887            };
888            if let Some(i) = idx {
889                let (_, ref disabled, ref enabled) = self.persistent_transitions[i];
890                if disabled.contains("*") {
891                    return !enabled.contains(rule_name);
892                }
893                return disabled.contains(rule_name);
894            }
895        }
896
897        false
898    }
899
900    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line)
901    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
902        // Deduplicate: multiple rules may contribute the same link with different columns
903        // (e.g., MD051 uses link start, MD057 uses URL start)
904        let is_duplicate = self.cross_file_links.iter().any(|existing| {
905            existing.target_path == link.target_path && existing.fragment == link.fragment && existing.line == link.line
906        });
907        if !is_duplicate {
908            self.cross_file_links.push(link);
909        }
910    }
911
912    /// Add a root-relative link to the index (deduplicates by target_path, fragment, line)
913    pub fn add_root_relative_link(&mut self, link: CrossFileLinkIndex) {
914        let is_duplicate = self.root_relative_links.iter().any(|existing| {
915            existing.target_path == link.target_path && existing.fragment == link.fragment && existing.line == link.line
916        });
917        if !is_duplicate {
918            self.root_relative_links.push(link);
919        }
920    }
921
922    /// Add a defined reference ID (e.g., from `[ref]: url`)
923    pub fn add_defined_reference(&mut self, ref_id: String) {
924        self.defined_references.insert(ref_id);
925    }
926
927    /// Check if a reference ID has an explicit definition
928    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
929        self.defined_references.contains(ref_id)
930    }
931
932    /// Check if the content hash matches
933    pub fn hash_matches(&self, hash: &str) -> bool {
934        self.content_hash == hash
935    }
936
937    /// Get the number of headings
938    pub fn heading_count(&self) -> usize {
939        self.headings.len()
940    }
941
942    /// Get the number of reference links
943    pub fn reference_link_count(&self) -> usize {
944        self.reference_links.len()
945    }
946}
947
948#[cfg(test)]
949mod tests {
950    use super::*;
951
952    #[test]
953    fn test_workspace_index_basic() {
954        let mut index = WorkspaceIndex::new();
955        assert_eq!(index.file_count(), 0);
956        assert_eq!(index.version(), 0);
957
958        let mut file_index = FileIndex::with_hash("abc123".to_string());
959        file_index.add_heading(HeadingIndex {
960            text: "Installation".to_string(),
961            auto_anchor: "installation".to_string(),
962            custom_anchor: None,
963            line: 1,
964            is_setext: false,
965        });
966
967        index.insert_file(PathBuf::from("docs/install.md"), file_index);
968        assert_eq!(index.file_count(), 1);
969        assert_eq!(index.version(), 1);
970
971        assert!(index.contains_file(Path::new("docs/install.md")));
972        assert!(!index.contains_file(Path::new("docs/other.md")));
973    }
974
975    #[test]
976    fn test_vulnerable_anchors() {
977        let mut index = WorkspaceIndex::new();
978
979        // File 1: heading without custom anchor (vulnerable)
980        let mut file1 = FileIndex::new();
981        file1.add_heading(HeadingIndex {
982            text: "Getting Started".to_string(),
983            auto_anchor: "getting-started".to_string(),
984            custom_anchor: None,
985            line: 1,
986            is_setext: false,
987        });
988        index.insert_file(PathBuf::from("docs/guide.md"), file1);
989
990        // File 2: heading with custom anchor (not vulnerable)
991        let mut file2 = FileIndex::new();
992        file2.add_heading(HeadingIndex {
993            text: "Installation".to_string(),
994            auto_anchor: "installation".to_string(),
995            custom_anchor: Some("install".to_string()),
996            line: 1,
997            is_setext: false,
998        });
999        index.insert_file(PathBuf::from("docs/install.md"), file2);
1000
1001        let vulnerable = index.get_vulnerable_anchors();
1002        assert_eq!(vulnerable.len(), 1);
1003        assert!(vulnerable.contains_key("getting-started"));
1004        assert!(!vulnerable.contains_key("installation"));
1005
1006        let anchors = vulnerable.get("getting-started").unwrap();
1007        assert_eq!(anchors.len(), 1);
1008        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
1009        assert_eq!(anchors[0].text, "Getting Started");
1010    }
1011
1012    #[test]
1013    fn test_vulnerable_anchors_multiple_files_same_anchor() {
1014        // Multiple files can have headings with the same auto-generated anchor
1015        // get_vulnerable_anchors() should collect all of them
1016        let mut index = WorkspaceIndex::new();
1017
1018        // File 1: has "Installation" heading (vulnerable)
1019        let mut file1 = FileIndex::new();
1020        file1.add_heading(HeadingIndex {
1021            text: "Installation".to_string(),
1022            auto_anchor: "installation".to_string(),
1023            custom_anchor: None,
1024            line: 1,
1025            is_setext: false,
1026        });
1027        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
1028
1029        // File 2: also has "Installation" heading with same anchor (vulnerable)
1030        let mut file2 = FileIndex::new();
1031        file2.add_heading(HeadingIndex {
1032            text: "Installation".to_string(),
1033            auto_anchor: "installation".to_string(),
1034            custom_anchor: None,
1035            line: 5,
1036            is_setext: false,
1037        });
1038        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
1039
1040        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
1041        let mut file3 = FileIndex::new();
1042        file3.add_heading(HeadingIndex {
1043            text: "Installation".to_string(),
1044            auto_anchor: "installation".to_string(),
1045            custom_anchor: Some("install".to_string()),
1046            line: 10,
1047            is_setext: false,
1048        });
1049        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
1050
1051        let vulnerable = index.get_vulnerable_anchors();
1052        assert_eq!(vulnerable.len(), 1); // One unique anchor
1053        assert!(vulnerable.contains_key("installation"));
1054
1055        let anchors = vulnerable.get("installation").unwrap();
1056        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
1057        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
1058
1059        // Verify both files are represented
1060        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
1061        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
1062        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
1063    }
1064
1065    #[test]
1066    fn test_file_index_hash() {
1067        let index = FileIndex::with_hash("hash123".to_string());
1068        assert!(index.hash_matches("hash123"));
1069        assert!(!index.hash_matches("other"));
1070    }
1071
1072    #[test]
1073    fn test_version_increment() {
1074        let mut index = WorkspaceIndex::new();
1075        assert_eq!(index.version(), 0);
1076
1077        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
1078        assert_eq!(index.version(), 1);
1079
1080        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
1081        assert_eq!(index.version(), 2);
1082
1083        index.remove_file(Path::new("a.md"));
1084        assert_eq!(index.version(), 3);
1085
1086        // Removing non-existent file doesn't increment
1087        index.remove_file(Path::new("nonexistent.md"));
1088        assert_eq!(index.version(), 3);
1089    }
1090
1091    #[test]
1092    fn test_reverse_deps_basic() {
1093        let mut index = WorkspaceIndex::new();
1094
1095        // File A links to file B
1096        let mut file_a = FileIndex::new();
1097        file_a.add_cross_file_link(CrossFileLinkIndex {
1098            target_path: "b.md".to_string(),
1099            fragment: "section".to_string(),
1100            line: 10,
1101            column: 5,
1102        });
1103        index.update_file(Path::new("docs/a.md"), file_a);
1104
1105        // Check that B has A as a dependent
1106        let dependents = index.get_dependents(Path::new("docs/b.md"));
1107        assert_eq!(dependents.len(), 1);
1108        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
1109
1110        // A has no dependents
1111        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
1112        assert!(a_dependents.is_empty());
1113    }
1114
1115    #[test]
1116    fn test_reverse_deps_multiple() {
1117        let mut index = WorkspaceIndex::new();
1118
1119        // Files A and C both link to B
1120        let mut file_a = FileIndex::new();
1121        file_a.add_cross_file_link(CrossFileLinkIndex {
1122            target_path: "../b.md".to_string(),
1123            fragment: "".to_string(),
1124            line: 1,
1125            column: 1,
1126        });
1127        index.update_file(Path::new("docs/sub/a.md"), file_a);
1128
1129        let mut file_c = FileIndex::new();
1130        file_c.add_cross_file_link(CrossFileLinkIndex {
1131            target_path: "b.md".to_string(),
1132            fragment: "".to_string(),
1133            line: 1,
1134            column: 1,
1135        });
1136        index.update_file(Path::new("docs/c.md"), file_c);
1137
1138        // B should have both A and C as dependents
1139        let dependents = index.get_dependents(Path::new("docs/b.md"));
1140        assert_eq!(dependents.len(), 2);
1141        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
1142        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
1143    }
1144
1145    #[test]
1146    fn test_reverse_deps_update_clears_old() {
1147        let mut index = WorkspaceIndex::new();
1148
1149        // File A initially links to B
1150        let mut file_a = FileIndex::new();
1151        file_a.add_cross_file_link(CrossFileLinkIndex {
1152            target_path: "b.md".to_string(),
1153            fragment: "".to_string(),
1154            line: 1,
1155            column: 1,
1156        });
1157        index.update_file(Path::new("docs/a.md"), file_a);
1158
1159        // Verify B has A as dependent
1160        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1161
1162        // Update A to link to C instead of B
1163        let mut file_a_updated = FileIndex::new();
1164        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
1165            target_path: "c.md".to_string(),
1166            fragment: "".to_string(),
1167            line: 1,
1168            column: 1,
1169        });
1170        index.update_file(Path::new("docs/a.md"), file_a_updated);
1171
1172        // B should no longer have A as dependent
1173        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1174
1175        // C should now have A as dependent
1176        let c_deps = index.get_dependents(Path::new("docs/c.md"));
1177        assert_eq!(c_deps.len(), 1);
1178        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
1179    }
1180
1181    #[test]
1182    fn test_reverse_deps_remove_file() {
1183        let mut index = WorkspaceIndex::new();
1184
1185        // File A links to B
1186        let mut file_a = FileIndex::new();
1187        file_a.add_cross_file_link(CrossFileLinkIndex {
1188            target_path: "b.md".to_string(),
1189            fragment: "".to_string(),
1190            line: 1,
1191            column: 1,
1192        });
1193        index.update_file(Path::new("docs/a.md"), file_a);
1194
1195        // Verify B has A as dependent
1196        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1197
1198        // Remove file A
1199        index.remove_file(Path::new("docs/a.md"));
1200
1201        // B should no longer have any dependents
1202        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1203    }
1204
1205    #[test]
1206    fn test_normalize_path() {
1207        // Test .. handling
1208        let path = Path::new("docs/sub/../other.md");
1209        let normalized = WorkspaceIndex::normalize_path(path);
1210        assert_eq!(normalized, PathBuf::from("docs/other.md"));
1211
1212        // Test . handling
1213        let path2 = Path::new("docs/./other.md");
1214        let normalized2 = WorkspaceIndex::normalize_path(path2);
1215        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
1216
1217        // Test multiple ..
1218        let path3 = Path::new("a/b/c/../../d.md");
1219        let normalized3 = WorkspaceIndex::normalize_path(path3);
1220        assert_eq!(normalized3, PathBuf::from("a/d.md"));
1221    }
1222
1223    #[test]
1224    fn test_clear_clears_reverse_deps() {
1225        let mut index = WorkspaceIndex::new();
1226
1227        // File A links to B
1228        let mut file_a = FileIndex::new();
1229        file_a.add_cross_file_link(CrossFileLinkIndex {
1230            target_path: "b.md".to_string(),
1231            fragment: "".to_string(),
1232            line: 1,
1233            column: 1,
1234        });
1235        index.update_file(Path::new("docs/a.md"), file_a);
1236
1237        // Verify B has A as dependent
1238        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1239
1240        // Clear the index
1241        index.clear();
1242
1243        // Both files and reverse deps should be cleared
1244        assert_eq!(index.file_count(), 0);
1245        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1246    }
1247
1248    #[test]
1249    fn test_is_file_stale() {
1250        let mut index = WorkspaceIndex::new();
1251
1252        // Non-existent file is always stale
1253        assert!(index.is_file_stale(Path::new("nonexistent.md"), "hash123"));
1254
1255        // Add a file with known hash
1256        let file_index = FileIndex::with_hash("hash123".to_string());
1257        index.insert_file(PathBuf::from("docs/test.md"), file_index);
1258
1259        // Same hash means not stale
1260        assert!(!index.is_file_stale(Path::new("docs/test.md"), "hash123"));
1261
1262        // Different hash means stale
1263        assert!(index.is_file_stale(Path::new("docs/test.md"), "different_hash"));
1264    }
1265
1266    #[cfg(feature = "native")]
1267    #[test]
1268    fn test_cache_roundtrip() {
1269        use std::fs;
1270
1271        // Create a temp directory
1272        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_roundtrip");
1273        let _ = fs::remove_dir_all(&temp_dir);
1274        fs::create_dir_all(&temp_dir).unwrap();
1275
1276        // Create an index with some data
1277        let mut index = WorkspaceIndex::new();
1278
1279        let mut file1 = FileIndex::with_hash("abc123".to_string());
1280        file1.add_heading(HeadingIndex {
1281            text: "Test Heading".to_string(),
1282            auto_anchor: "test-heading".to_string(),
1283            custom_anchor: Some("test".to_string()),
1284            line: 1,
1285            is_setext: false,
1286        });
1287        file1.add_cross_file_link(CrossFileLinkIndex {
1288            target_path: "./other.md".to_string(),
1289            fragment: "section".to_string(),
1290            line: 5,
1291            column: 3,
1292        });
1293        index.update_file(Path::new("docs/file1.md"), file1);
1294
1295        let mut file2 = FileIndex::with_hash("def456".to_string());
1296        file2.add_heading(HeadingIndex {
1297            text: "Another Heading".to_string(),
1298            auto_anchor: "another-heading".to_string(),
1299            custom_anchor: None,
1300            line: 1,
1301            is_setext: false,
1302        });
1303        index.update_file(Path::new("docs/other.md"), file2);
1304
1305        // Save to cache
1306        index.save_to_cache(&temp_dir).expect("Failed to save cache");
1307
1308        // Verify cache file exists
1309        assert!(temp_dir.join("workspace_index.bin").exists());
1310
1311        // Load from cache
1312        let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");
1313
1314        // Verify data matches
1315        assert_eq!(loaded.file_count(), 2);
1316        assert!(loaded.contains_file(Path::new("docs/file1.md")));
1317        assert!(loaded.contains_file(Path::new("docs/other.md")));
1318
1319        // Check file1 details
1320        let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
1321        assert_eq!(file1_loaded.content_hash, "abc123");
1322        assert_eq!(file1_loaded.headings.len(), 1);
1323        assert_eq!(file1_loaded.headings[0].text, "Test Heading");
1324        assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
1325        assert_eq!(file1_loaded.cross_file_links.len(), 1);
1326        assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");
1327
1328        // Check reverse deps were serialized correctly
1329        let dependents = loaded.get_dependents(Path::new("docs/other.md"));
1330        assert_eq!(dependents.len(), 1);
1331        assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));
1332
1333        // Clean up
1334        let _ = fs::remove_dir_all(&temp_dir);
1335    }
1336
1337    #[cfg(feature = "native")]
1338    #[test]
1339    fn test_cache_missing_file() {
1340        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
1341        let _ = std::fs::remove_dir_all(&temp_dir);
1342
1343        // Should return None for non-existent cache
1344        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1345        assert!(result.is_none());
1346    }
1347
1348    #[cfg(feature = "native")]
1349    #[test]
1350    fn test_cache_corrupted_file() {
1351        use std::fs;
1352
1353        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_corrupted");
1354        let _ = fs::remove_dir_all(&temp_dir);
1355        fs::create_dir_all(&temp_dir).unwrap();
1356
1357        // Write corrupted data (too small for header)
1358        fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();
1359
1360        // Should return None for corrupted cache (and remove the file)
1361        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1362        assert!(result.is_none());
1363
1364        // Corrupted file should be removed
1365        assert!(!temp_dir.join("workspace_index.bin").exists());
1366
1367        // Clean up
1368        let _ = fs::remove_dir_all(&temp_dir);
1369    }
1370
1371    #[cfg(feature = "native")]
1372    #[test]
1373    fn test_cache_invalid_magic() {
1374        use std::fs;
1375
1376        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_invalid_magic");
1377        let _ = fs::remove_dir_all(&temp_dir);
1378        fs::create_dir_all(&temp_dir).unwrap();
1379
1380        // Write data with wrong magic header
1381        let mut data = Vec::new();
1382        data.extend_from_slice(b"XXXX"); // Wrong magic
1383        data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
1384        data.extend_from_slice(&[0; 100]); // Some garbage data
1385        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1386
1387        // Should return None for invalid magic
1388        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1389        assert!(result.is_none());
1390
1391        // File should be removed
1392        assert!(!temp_dir.join("workspace_index.bin").exists());
1393
1394        // Clean up
1395        let _ = fs::remove_dir_all(&temp_dir);
1396    }
1397
1398    #[cfg(feature = "native")]
1399    #[test]
1400    fn test_cache_version_mismatch() {
1401        use std::fs;
1402
1403        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_version_mismatch");
1404        let _ = fs::remove_dir_all(&temp_dir);
1405        fs::create_dir_all(&temp_dir).unwrap();
1406
1407        // Write data with correct magic but wrong version
1408        let mut data = Vec::new();
1409        data.extend_from_slice(b"RWSI"); // Correct magic
1410        data.extend_from_slice(&999u32.to_le_bytes()); // Future version
1411        data.extend_from_slice(&[0; 100]); // Some garbage data
1412        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1413
1414        // Should return None for version mismatch
1415        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1416        assert!(result.is_none());
1417
1418        // File should be removed to trigger rebuild
1419        assert!(!temp_dir.join("workspace_index.bin").exists());
1420
1421        // Clean up
1422        let _ = fs::remove_dir_all(&temp_dir);
1423    }
1424
1425    #[cfg(feature = "native")]
1426    #[test]
1427    fn test_cache_atomic_write() {
1428        use std::fs;
1429
1430        // Test that atomic writes work (no temp files left behind)
1431        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_atomic");
1432        let _ = fs::remove_dir_all(&temp_dir);
1433        fs::create_dir_all(&temp_dir).unwrap();
1434
1435        let index = WorkspaceIndex::new();
1436        index.save_to_cache(&temp_dir).expect("Failed to save");
1437
1438        // Only the final cache file should exist, no temp files
1439        let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
1440        assert_eq!(entries.len(), 1);
1441        assert!(temp_dir.join("workspace_index.bin").exists());
1442
1443        // Clean up
1444        let _ = fs::remove_dir_all(&temp_dir);
1445    }
1446
1447    #[test]
1448    fn test_has_anchor_auto_generated() {
1449        let mut file_index = FileIndex::new();
1450        file_index.add_heading(HeadingIndex {
1451            text: "Installation Guide".to_string(),
1452            auto_anchor: "installation-guide".to_string(),
1453            custom_anchor: None,
1454            line: 1,
1455            is_setext: false,
1456        });
1457
1458        // Should find by auto-generated anchor
1459        assert!(file_index.has_anchor("installation-guide"));
1460
1461        // Case-insensitive matching
1462        assert!(file_index.has_anchor("Installation-Guide"));
1463        assert!(file_index.has_anchor("INSTALLATION-GUIDE"));
1464
1465        // Should not find non-existent anchor
1466        assert!(!file_index.has_anchor("nonexistent"));
1467    }
1468
1469    #[test]
1470    fn test_has_anchor_custom() {
1471        let mut file_index = FileIndex::new();
1472        file_index.add_heading(HeadingIndex {
1473            text: "Installation Guide".to_string(),
1474            auto_anchor: "installation-guide".to_string(),
1475            custom_anchor: Some("install".to_string()),
1476            line: 1,
1477            is_setext: false,
1478        });
1479
1480        // Should find by auto-generated anchor
1481        assert!(file_index.has_anchor("installation-guide"));
1482
1483        // Should also find by custom anchor
1484        assert!(file_index.has_anchor("install"));
1485        assert!(file_index.has_anchor("Install")); // case-insensitive
1486
1487        // Should not find non-existent anchor
1488        assert!(!file_index.has_anchor("nonexistent"));
1489    }
1490
1491    #[test]
1492    fn test_get_heading_by_anchor() {
1493        let mut file_index = FileIndex::new();
1494        file_index.add_heading(HeadingIndex {
1495            text: "Installation Guide".to_string(),
1496            auto_anchor: "installation-guide".to_string(),
1497            custom_anchor: Some("install".to_string()),
1498            line: 10,
1499            is_setext: false,
1500        });
1501        file_index.add_heading(HeadingIndex {
1502            text: "Configuration".to_string(),
1503            auto_anchor: "configuration".to_string(),
1504            custom_anchor: None,
1505            line: 20,
1506            is_setext: false,
1507        });
1508
1509        // Get by auto anchor
1510        let heading = file_index.get_heading_by_anchor("installation-guide");
1511        assert!(heading.is_some());
1512        assert_eq!(heading.unwrap().text, "Installation Guide");
1513        assert_eq!(heading.unwrap().line, 10);
1514
1515        // Get by custom anchor
1516        let heading = file_index.get_heading_by_anchor("install");
1517        assert!(heading.is_some());
1518        assert_eq!(heading.unwrap().text, "Installation Guide");
1519
1520        // Get second heading
1521        let heading = file_index.get_heading_by_anchor("configuration");
1522        assert!(heading.is_some());
1523        assert_eq!(heading.unwrap().text, "Configuration");
1524        assert_eq!(heading.unwrap().line, 20);
1525
1526        // Non-existent
1527        assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
1528    }
1529
1530    #[test]
1531    fn test_anchor_lookup_many_headings() {
1532        // Test that O(1) lookup works with many headings
1533        let mut file_index = FileIndex::new();
1534
1535        // Add 100 headings
1536        for i in 0..100 {
1537            file_index.add_heading(HeadingIndex {
1538                text: format!("Heading {i}"),
1539                auto_anchor: format!("heading-{i}"),
1540                custom_anchor: Some(format!("h{i}")),
1541                line: i + 1,
1542                is_setext: false,
1543            });
1544        }
1545
1546        // Verify all can be found
1547        for i in 0..100 {
1548            assert!(file_index.has_anchor(&format!("heading-{i}")));
1549            assert!(file_index.has_anchor(&format!("h{i}")));
1550
1551            let heading = file_index.get_heading_by_anchor(&format!("heading-{i}"));
1552            assert!(heading.is_some());
1553            assert_eq!(heading.unwrap().line, i + 1);
1554        }
1555    }
1556
1557    // =============================================================================
1558    // Tests for extract_cross_file_links utility
1559    // =============================================================================
1560
1561    #[test]
1562    fn test_extract_cross_file_links_basic() {
1563        use crate::config::MarkdownFlavor;
1564
1565        let content = "# Test\n\nSee [link](./other.md) for info.\n";
1566        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1567        let links = extract_cross_file_links(&ctx).relative;
1568
1569        assert_eq!(links.len(), 1);
1570        assert_eq!(links[0].target_path, "./other.md");
1571        assert_eq!(links[0].fragment, "");
1572        assert_eq!(links[0].line, 3);
1573        // "See [link](" = 11 chars, so column 12 is where "./other.md" starts
1574        assert_eq!(links[0].column, 12);
1575    }
1576
1577    #[test]
1578    fn test_extract_cross_file_links_with_fragment() {
1579        use crate::config::MarkdownFlavor;
1580
1581        let content = "Check [guide](./guide.md#install) here.\n";
1582        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1583        let links = extract_cross_file_links(&ctx).relative;
1584
1585        assert_eq!(links.len(), 1);
1586        assert_eq!(links[0].target_path, "./guide.md");
1587        assert_eq!(links[0].fragment, "install");
1588        assert_eq!(links[0].line, 1);
1589        // "Check [guide](" = 14 chars, so column 15 is where "./guide.md" starts
1590        assert_eq!(links[0].column, 15);
1591    }
1592
1593    #[test]
1594    fn test_extract_cross_file_links_multiple_on_same_line() {
1595        use crate::config::MarkdownFlavor;
1596
1597        let content = "See [a](a.md) and [b](b.md) here.\n";
1598        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1599        let links = extract_cross_file_links(&ctx).relative;
1600
1601        assert_eq!(links.len(), 2);
1602
1603        assert_eq!(links[0].target_path, "a.md");
1604        assert_eq!(links[0].line, 1);
1605        // "See [a](" = 8 chars, so column 9
1606        assert_eq!(links[0].column, 9);
1607
1608        assert_eq!(links[1].target_path, "b.md");
1609        assert_eq!(links[1].line, 1);
1610        // "See [a](a.md) and [b](" = 22 chars, so column 23
1611        assert_eq!(links[1].column, 23);
1612    }
1613
1614    #[test]
1615    fn test_extract_cross_file_links_angle_brackets() {
1616        use crate::config::MarkdownFlavor;
1617
1618        let content = "See [link](<path/with (parens).md>) here.\n";
1619        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1620        let links = extract_cross_file_links(&ctx).relative;
1621
1622        assert_eq!(links.len(), 1);
1623        assert_eq!(links[0].target_path, "path/with (parens).md");
1624        assert_eq!(links[0].line, 1);
1625        // "See [link](<" = 12 chars, so column 13
1626        assert_eq!(links[0].column, 13);
1627    }
1628
1629    #[test]
1630    fn test_extract_cross_file_links_skips_external() {
1631        use crate::config::MarkdownFlavor;
1632
1633        let content = r#"
1634[external](https://example.com)
1635[mailto](mailto:test@example.com)
1636[local](./local.md)
1637[fragment](#section)
1638[absolute](/docs/page.md)
1639"#;
1640        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1641        let extracted = extract_cross_file_links(&ctx);
1642
1643        // Only the local markdown link is a directory-relative link.
1644        assert_eq!(extracted.relative.len(), 1);
1645        assert_eq!(extracted.relative[0].target_path, "./local.md");
1646        // The root-relative link is captured separately, leading `/` stripped.
1647        assert_eq!(extracted.root_relative.len(), 1);
1648        assert_eq!(extracted.root_relative[0].target_path, "docs/page.md");
1649    }
1650
1651    #[test]
1652    fn test_extract_cross_file_links_root_relative() {
1653        use crate::config::MarkdownFlavor;
1654
1655        // Root-relative markdown links land in the root_relative bucket with the
1656        // leading `/` stripped; parent traversal and protocol-relative links are
1657        // excluded so they cannot escape a content root.
1658        let content = "[a](/guide.md#install)\n[b](/../escape.md)\n[c](//host/x.md)\n[d](/img/pic.png)\n";
1659        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1660        let extracted = extract_cross_file_links(&ctx);
1661
1662        assert!(extracted.relative.is_empty(), "no directory-relative links here");
1663        assert_eq!(
1664            extracted
1665                .root_relative
1666                .iter()
1667                .map(|l| (l.target_path.as_str(), l.fragment.as_str()))
1668                .collect::<Vec<_>>(),
1669            vec![("guide.md", "install")],
1670            "only the safe root-relative markdown link is captured"
1671        );
1672    }
1673
1674    #[test]
1675    fn test_extract_cross_file_links_skips_non_markdown() {
1676        use crate::config::MarkdownFlavor;
1677
1678        let content = r#"
1679[image](./photo.png)
1680[doc](./readme.md)
1681[pdf](./document.pdf)
1682"#;
1683        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1684        let links = extract_cross_file_links(&ctx).relative;
1685
1686        // Only markdown files are indexed for cross-file validation
1687        assert_eq!(links.len(), 1);
1688        assert_eq!(links[0].target_path, "./readme.md");
1689    }
1690
1691    #[test]
1692    fn test_extract_cross_file_links_skips_code_spans() {
1693        use crate::config::MarkdownFlavor;
1694
1695        let content = "Normal [link](./file.md) and `[code](./ignored.md)` here.\n";
1696        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1697        let links = extract_cross_file_links(&ctx).relative;
1698
1699        // Only the link outside code span should be extracted
1700        assert_eq!(links.len(), 1);
1701        assert_eq!(links[0].target_path, "./file.md");
1702    }
1703
1704    #[test]
1705    fn test_extract_cross_file_links_with_query_params() {
1706        use crate::config::MarkdownFlavor;
1707
1708        let content = "See [doc](./file.md?raw=true) here.\n";
1709        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1710        let links = extract_cross_file_links(&ctx).relative;
1711
1712        assert_eq!(links.len(), 1);
1713        // Query params should be stripped
1714        assert_eq!(links[0].target_path, "./file.md");
1715    }
1716
1717    #[test]
1718    fn test_extract_cross_file_links_empty_content() {
1719        use crate::config::MarkdownFlavor;
1720
1721        let content = "";
1722        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1723        let links = extract_cross_file_links(&ctx).relative;
1724
1725        assert!(links.is_empty());
1726    }
1727
1728    #[test]
1729    fn test_extract_cross_file_links_no_links() {
1730        use crate::config::MarkdownFlavor;
1731
1732        let content = "# Just a heading\n\nSome text without links.\n";
1733        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1734        let links = extract_cross_file_links(&ctx).relative;
1735
1736        assert!(links.is_empty());
1737    }
1738
1739    #[test]
1740    fn test_extract_cross_file_links_position_accuracy_issue_234() {
1741        // This test verifies the fix for GitHub issue #234
1742        // The LSP was reporting incorrect column positions for MD057 diagnostics
1743        use crate::config::MarkdownFlavor;
1744
1745        let content = r#"# Test Document
1746
1747Here is a [broken link](nonexistent-file.md) that should trigger MD057.
1748
1749And another [link](also-missing.md) on this line.
1750"#;
1751        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1752        let links = extract_cross_file_links(&ctx).relative;
1753
1754        assert_eq!(links.len(), 2);
1755
1756        // First link: "Here is a [broken link](" = 24 chars, column 25
1757        assert_eq!(links[0].target_path, "nonexistent-file.md");
1758        assert_eq!(links[0].line, 3);
1759        assert_eq!(links[0].column, 25);
1760
1761        // Second link: "And another [link](" = 19 chars, column 20
1762        assert_eq!(links[1].target_path, "also-missing.md");
1763        assert_eq!(links[1].line, 5);
1764        assert_eq!(links[1].column, 20);
1765    }
1766}