rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: postcard-serialized WorkspaceIndex]
19//! ```
20
21use regex::Regex;
22use serde::{Deserialize, Serialize};
23use std::collections::{HashMap, HashSet};
24use std::path::{Path, PathBuf};
25use std::sync::LazyLock;
26
27use crate::lint_context::LintContext;
28
29// =============================================================================
30// URL Decoding Helper
31// =============================================================================
32
/// Map one ASCII hexadecimal digit to the number it denotes.
///
/// Accepts `0-9`, `a-f` and `A-F`; every other byte yields `None`.
fn hex_digit_to_value(c: u8) -> Option<u8> {
    // Pick the byte value that has to be subtracted so the first digit of
    // each range lands on 0 (decimal digits) or 10 (letters).
    let base = match c {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(c - base)
}
42
43/// URL-decode a string, handling percent-encoded characters.
44/// Returns the decoded string, or the original if decoding fails.
45/// Used for matching URL-encoded CJK fragments against raw anchors.
46fn url_decode(s: &str) -> String {
47    // Fast path: no percent signs means no encoding
48    if !s.contains('%') {
49        return s.to_string();
50    }
51
52    let bytes = s.as_bytes();
53    let mut result = Vec::with_capacity(bytes.len());
54    let mut i = 0;
55
56    while i < bytes.len() {
57        if bytes[i] == b'%' && i + 2 < bytes.len() {
58            // Try to parse the two hex digits following %
59            let hex1 = bytes[i + 1];
60            let hex2 = bytes[i + 2];
61            if let (Some(d1), Some(d2)) = (hex_digit_to_value(hex1), hex_digit_to_value(hex2)) {
62                result.push(d1 * 16 + d2);
63                i += 3;
64                continue;
65            }
66        }
67        result.push(bytes[i]);
68        i += 1;
69    }
70
71    // Convert to UTF-8, falling back to original if invalid
72    String::from_utf8(result).unwrap_or_else(|_| s.to_string())
73}
74
75// =============================================================================
76// Shared cross-file link extraction utilities
77//
78// These regexes and helpers are the canonical implementation for extracting
79// cross-file links. Both MD057 and LSP use this shared code path for correct
80// position tracking.
81// =============================================================================
82
/// Regex to match the bracketed text part of a link or image.
///
/// Matches an optional leading `!`, an opening `[`, any run of characters
/// other than `]`, and the closing `]`. The match therefore always ends on a
/// `]`, which is where the URL regexes below start looking.
static LINK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!?\[[^\]]*\]").unwrap());

/// Regex to extract the URL from an angle-bracketed markdown link
/// Format: `](<URL>)` or `](<URL> "title")`
///
/// Capture group 1 is the text between `<` and `>`; capture group 2 is an
/// optional `#fragment` that directly follows the closing `>`.
static URL_EXTRACT_ANGLE_BRACKET_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\]\(\s*<([^>]+)>(#[^\)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to extract the URL from a normal markdown link (without angle brackets)
/// Format: `](URL)` or `](URL "title")`
///
/// Capture group 1 is the URL up to the first `>`, `)`, whitespace or `#`;
/// capture group 2 is the optional `#fragment` (including the `#`).
static URL_EXTRACT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"]\(\s*([^>)\s#]+)(#[^)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to detect URLs with explicit schemes
///
/// Anchored at the start of the input; matches a `scheme://` prefix, a bare
/// `scheme:` prefix, or a leading `www.`.
pub(crate) static PROTOCOL_DOMAIN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^([a-zA-Z][a-zA-Z0-9+.-]*://|[a-zA-Z][a-zA-Z0-9+.-]*:|www\.)").unwrap());
99
/// Supported markdown file extensions
///
/// Each entry is lowercase and includes the leading dot, so it can be used
/// directly as a suffix test against a lowercased path.
const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdx",
    ".mkd",
    ".mkdn",
    ".mdown",
    ".mdwn",
    ".qmd",
    ".rmd",
];
112
113/// Check if a path has a markdown extension (case-insensitive)
114#[inline]
115fn is_markdown_file(path: &str) -> bool {
116    let path_lower = path.to_lowercase();
117    MARKDOWN_EXTENSIONS.iter().any(|ext| path_lower.ends_with(ext))
118}
119
/// Cut a URL at its first `?` or `#`, whichever comes first.
///
/// Returns the leading path portion as a sub-slice of `url`; if neither
/// character occurs the whole input is returned.
fn strip_query_and_fragment(url: &str) -> &str {
    match url.find(['?', '#']) {
        Some(cut) => &url[..cut],
        None => url,
    }
}
133
134/// Extract cross-file links from content using correct regex-based position tracking.
135///
136/// This is the canonical implementation used by both MD057 and LSP to ensure
137/// consistent and correct column positions for diagnostic reporting.
138///
139/// Returns a vector of `CrossFileLinkIndex` entries, one for each markdown file
140/// link found in the content.
141pub fn extract_cross_file_links(ctx: &LintContext) -> Vec<CrossFileLinkIndex> {
142    let content = ctx.content;
143
144    // Early returns for performance
145    if content.is_empty() || !content.contains("](") {
146        return Vec::new();
147    }
148
149    let mut links = Vec::new();
150    let lines: Vec<&str> = content.lines().collect();
151    let line_index = &ctx.line_index;
152
153    // Track which lines we've already processed to avoid duplicates
154    // (ctx.links may have multiple entries for the same line)
155    let mut processed_lines = HashSet::new();
156
157    for link in &ctx.links {
158        let line_idx = link.line - 1;
159        if line_idx >= lines.len() {
160            continue;
161        }
162
163        // Skip if we've already processed this line
164        if !processed_lines.insert(line_idx) {
165            continue;
166        }
167
168        let line = lines[line_idx];
169        if !line.contains("](") {
170            continue;
171        }
172
173        // Find all links in this line
174        for link_match in LINK_START_REGEX.find_iter(line) {
175            let start_pos = link_match.start();
176            let end_pos = link_match.end();
177
178            // Calculate absolute position for code span detection
179            let line_start_byte = line_index.get_line_start_byte(line_idx + 1).unwrap_or(0);
180            let absolute_start_pos = line_start_byte + start_pos;
181
182            // Skip if in code span
183            if ctx.is_in_code_span_byte(absolute_start_pos) {
184                continue;
185            }
186
187            // Extract the URL (group 1) and fragment (group 2)
188            // Try angle-bracket regex first (handles URLs with parens)
189            let caps_result = URL_EXTRACT_ANGLE_BRACKET_REGEX
190                .captures_at(line, end_pos - 1)
191                .or_else(|| URL_EXTRACT_REGEX.captures_at(line, end_pos - 1));
192
193            if let Some(caps) = caps_result
194                && let Some(url_group) = caps.get(1)
195            {
196                let file_path = url_group.as_str().trim();
197
198                // Skip empty, external, template variables, absolute URL paths,
199                // framework aliases, fragment-only URLs, or rustdoc intra-doc links
200                if file_path.is_empty()
201                    || PROTOCOL_DOMAIN_REGEX.is_match(file_path)
202                    || file_path.starts_with("www.")
203                    || file_path.starts_with('#')
204                    || file_path.starts_with("{{")
205                    || file_path.starts_with("{%")
206                    || file_path.starts_with('/')
207                    || file_path.starts_with('~')
208                    || file_path.starts_with('@')
209                    || (file_path.starts_with('`') && file_path.ends_with('`'))
210                {
211                    continue;
212                }
213
214                // Strip query parameters before indexing
215                let file_path = strip_query_and_fragment(file_path);
216
217                // Get fragment from capture group 2 (includes # prefix)
218                let fragment = caps.get(2).map_or("", |m| m.as_str().trim_start_matches('#'));
219
220                // Only index markdown file links for cross-file validation
221                if is_markdown_file(file_path) {
222                    links.push(CrossFileLinkIndex {
223                        target_path: file_path.to_string(),
224                        fragment: fragment.to_string(),
225                        line: link.line,
226                        column: url_group.start() + 1,
227                    });
228                }
229            }
230        }
231    }
232
233    links
234}
235
/// Magic bytes identifying a workspace index cache file
///
/// Written as the first four bytes of the cache file and checked on load.
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// Cache format version - increment when WorkspaceIndex serialization changes
///
/// Stored little-endian in bytes 4..8 of the cache file; a cache whose stored
/// version differs from this value is discarded on load.
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 6;

/// Cache file name within the version directory
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
247
/// Workspace-wide index for cross-file analysis
///
/// Contains pre-extracted information from all markdown files in the workspace,
/// enabling rules to validate cross-file references efficiently.
///
/// The whole struct is serialized when the index is written to the on-disk
/// cache, so changing its fields requires bumping the cache format version.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct WorkspaceIndex {
    /// Map from file path to its extracted data
    files: HashMap<PathBuf, FileIndex>,
    /// Reverse dependency graph: target file → files that link to it
    /// Used to efficiently re-lint dependent files when a target changes
    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
    /// Version counter for cache invalidation (incremented on any change,
    /// wrapping around on overflow)
    version: u64,
}
262
/// Index data extracted from a single file
///
/// Anchors are stored twice: once lowercased for case-insensitive lookup and
/// once with their original case for strict lookup. The `add_*` methods keep
/// both copies in step.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FileIndex {
    /// Headings in this file with their anchors
    pub headings: Vec<HeadingIndex>,
    /// Reference links in this file (for cross-file analysis)
    pub reference_links: Vec<ReferenceLinkIndex>,
    /// Cross-file links in this file (for MD051 cross-file validation)
    pub cross_file_links: Vec<CrossFileLinkIndex>,
    /// Defined reference IDs (e.g., from `[ref]: url` definitions)
    /// Used to filter out reference links that have explicit definitions
    pub defined_references: HashSet<String>,
    /// Content hash for change detection
    pub content_hash: String,
    /// O(1) anchor lookup: lowercased anchor → position in `headings`
    /// Includes both auto-generated and custom anchors
    anchor_to_heading: HashMap<String, usize>,
    /// O(1) anchor lookup with original case preserved → position in `headings`.
    /// Used for `ignore_case = false` (markdownlint strict parity). Skipped at
    /// query time when the lowercase map is sufficient.
    #[serde(default)]
    anchor_to_heading_exact: HashMap<String, usize>,
    /// HTML anchors defined via `<a id="...">` or `<element id="...">` tags.
    /// Stored lowercase for case-insensitive matching.
    html_anchors: HashSet<String>,
    /// HTML anchors with original case preserved.
    /// Used for `ignore_case = false` (markdownlint strict parity).
    #[serde(default)]
    html_anchors_exact: HashSet<String>,
    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list).
    /// Can appear on any element, not just headings.
    /// Stored lowercase for case-insensitive matching.
    attribute_anchors: HashSet<String>,
    /// Attribute anchors with original case preserved.
    /// Used for `ignore_case = false` (markdownlint strict parity).
    #[serde(default)]
    attribute_anchors_exact: HashSet<String>,
    /// Rules disabled for the entire file (from inline comments)
    /// Used by cross-file rules to respect inline disable directives.
    /// The entry `"*"` disables every rule.
    pub file_disabled_rules: HashSet<String>,
    /// Persistent disable/enable state transitions, sorted by line number.
    /// Each entry: (line, disabled_rules, enabled_rules). Use binary search to query.
    pub persistent_transitions: Vec<(usize, HashSet<String>, HashSet<String>)>,
    /// Rules disabled at specific lines via disable-line / disable-next-line,
    /// keyed by line number. The entry `"*"` disables every rule on that line.
    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
}
309
/// Information about a heading for cross-file lookup
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingIndex {
    /// The heading text (e.g., "Installation Guide")
    pub text: String,
    /// Auto-generated anchor (e.g., "installation-guide")
    pub auto_anchor: String,
    /// Custom anchor if present (e.g., "install"); `None` when the heading
    /// has no explicit ID
    pub custom_anchor: Option<String>,
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a Setext-style heading (underlined with = or -)
    #[serde(default)]
    pub is_setext: bool,
}
325
/// Information about a reference link for cross-file analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceLinkIndex {
    /// The reference ID (the `ref` part in `[text][ref]`)
    pub reference_id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    pub column: usize,
}
336
/// Information about a cross-file link for validation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossFileLinkIndex {
    /// The target file path (relative, as it appears in the link, with any
    /// query string or fragment removed)
    pub target_path: String,
    /// The fragment/anchor being linked to (without #); empty when the link
    /// has no fragment
    pub fragment: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed) of the start of the URL within the line
    pub column: usize,
}
349
/// Information about a vulnerable anchor (heading without custom ID)
///
/// Such a heading can only be linked to through its auto-generated anchor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VulnerableAnchor {
    /// File path where the heading is located
    pub file: PathBuf,
    /// Line number of the heading
    pub line: usize,
    /// The heading text
    pub text: String,
}
360
361impl WorkspaceIndex {
362    /// Create a new empty workspace index
363    pub fn new() -> Self {
364        Self::default()
365    }
366
/// Get the current version (for cache invalidation)
///
/// The counter changes whenever the set of indexed files is modified, so
/// two equal readings mean no such modification happened in between
/// (barring a full wrap-around of the `u64`).
pub fn version(&self) -> u64 {
    self.version
}
371
/// Get the number of indexed files
pub fn file_count(&self) -> usize {
    self.files.len()
}
376
/// Check if a file is in the index
///
/// The lookup is by exact path; `path` is not normalized first.
pub fn contains_file(&self, path: &Path) -> bool {
    self.files.contains_key(path)
}
381
/// Get the index data for a specific file
///
/// Returns `None` when `path` has not been indexed. The lookup is by exact
/// path; `path` is not normalized first.
pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
    self.files.get(path)
}
386
/// Insert or update a file's index data
///
/// Replaces any existing entry for `path` and bumps the version counter.
/// The reverse dependency graph is left untouched; use `update_file` when
/// the file's outgoing links must be tracked as well.
pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
    self.files.insert(path, index);
    self.version = self.version.wrapping_add(1);
}
392
393    /// Remove a file from the index
394    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
395        // Clean up reverse deps for this file
396        self.clear_reverse_deps_for(path);
397
398        let result = self.files.remove(path);
399        if result.is_some() {
400            self.version = self.version.wrapping_add(1);
401        }
402        result
403    }
404
405    /// Build a map of all "vulnerable" anchors across the workspace
406    ///
407    /// A vulnerable anchor is an auto-generated anchor for a heading that
408    /// does NOT have a custom anchor defined. These are problematic for
409    /// translated content because the anchor changes when the heading is translated.
410    ///
411    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
412    /// Multiple files can have headings with the same auto-generated anchor,
413    /// so we collect all occurrences.
414    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
415        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
416
417        for (file_path, file_index) in &self.files {
418            for heading in &file_index.headings {
419                // Only include headings WITHOUT custom anchors
420                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
421                    let anchor_key = heading.auto_anchor.to_lowercase();
422                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
423                        file: file_path.clone(),
424                        line: heading.line,
425                        text: heading.text.clone(),
426                    });
427                }
428            }
429        }
430
431        vulnerable
432    }
433
434    /// Get all headings across the workspace (for debugging/testing)
435    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
436        self.files
437            .iter()
438            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
439    }
440
/// Iterate over all files in the index
///
/// Yields each indexed path together with its data. The order is that of
/// the underlying hash map and is therefore unspecified.
pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
    self.files.iter().map(|(p, i)| (p.as_path(), i))
}
445
/// Clear the entire index
///
/// Removes all files and all reverse dependency entries, then bumps the
/// version counter (the counter itself is not reset).
pub fn clear(&mut self) {
    self.files.clear();
    self.reverse_deps.clear();
    self.version = self.version.wrapping_add(1);
}
452
453    /// Update a file's index and maintain reverse dependencies
454    ///
455    /// This method:
456    /// 1. Removes this file as a source (dependent) from all reverse deps
457    /// 2. Inserts the new file index
458    /// 3. Builds new reverse deps from cross_file_links
459    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
460        // Remove this file as a source (dependent) from all target entries
461        // Note: We don't remove it as a target - other files may still link to it
462        self.clear_reverse_deps_as_source(path);
463
464        // Build new reverse deps from cross_file_links
465        for link in &index.cross_file_links {
466            let target = self.resolve_target_path(path, &link.target_path);
467            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
468        }
469
470        self.files.insert(path.to_path_buf(), index);
471        self.version = self.version.wrapping_add(1);
472    }
473
474    /// Get files that depend on (link to) the given file
475    ///
476    /// Returns a list of file paths that contain links targeting this file.
477    /// Used to re-lint dependent files when a target file changes.
478    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
479        self.reverse_deps
480            .get(path)
481            .map(|set| set.iter().cloned().collect())
482            .unwrap_or_default()
483    }
484
485    /// Check if a file needs re-indexing based on its content hash
486    ///
487    /// Returns `true` if the file is not in the index or has a different hash.
488    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
489        self.files.get(path).is_none_or(|f| f.content_hash != current_hash)
490    }
491
492    /// Retain only files that exist in the given set, removing deleted files
493    ///
494    /// This prunes stale entries from the cache for files that no longer exist.
495    /// Returns the number of files removed.
496    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
497        let before_count = self.files.len();
498
499        // Collect files to remove
500        let to_remove: Vec<PathBuf> = self
501            .files
502            .keys()
503            .filter(|path| !current_files.contains(*path))
504            .cloned()
505            .collect();
506
507        // Remove each file properly (clears reverse deps)
508        for path in &to_remove {
509            self.remove_file(path);
510        }
511
512        before_count - self.files.len()
513    }
514
/// Save the workspace index to a cache file
///
/// Uses postcard for efficient binary serialization with:
/// - Magic header for file type validation
/// - Format version for compatibility detection
/// - Atomic writes (temp file + rename) to prevent corruption
///
/// The temp file is named after the current process ID. If writing,
/// syncing or renaming fails, the temp file is removed (best effort) so
/// failed attempts do not accumulate in the cache directory.
///
/// # Errors
///
/// Returns an error if the cache directory cannot be created, if
/// serialization fails (reported as `InvalidData`), or if the file cannot
/// be written, synced or renamed into place.
#[cfg(feature = "native")]
pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
    use std::fs;
    use std::io::Write;

    // Ensure cache directory exists
    fs::create_dir_all(cache_dir)?;

    // Serialize the index data using postcard
    let encoded = postcard::to_allocvec(self)
        .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;

    // Build versioned cache file: [magic][version][data]
    let mut cache_data = Vec::with_capacity(8 + encoded.len());
    cache_data.extend_from_slice(CACHE_MAGIC);
    cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
    cache_data.extend_from_slice(&encoded);

    // Write atomically: write to temp file then rename
    let final_path = cache_dir.join(CACHE_FILE_NAME);
    let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));

    let write_and_publish = || -> std::io::Result<()> {
        // Inner scope closes the file handle before the rename.
        {
            let mut file = fs::File::create(&temp_path)?;
            file.write_all(&cache_data)?;
            file.sync_all()?;
        }

        // Atomic rename
        fs::rename(&temp_path, &final_path)
    };

    if let Err(err) = write_and_publish() {
        // Don't leave a partial temp file behind; the original error is what matters.
        let _ = fs::remove_file(&temp_path);
        return Err(err);
    }

    log::debug!(
        "Saved workspace index to cache: {} files, {} bytes (format v{})",
        self.files.len(),
        cache_data.len(),
        CACHE_FORMAT_VERSION
    );

    Ok(())
}
562
/// Read a previously saved workspace index back from `cache_dir`.
///
/// Returns `None` if:
/// - Cache file doesn't exist (or cannot be read)
/// - The file is shorter than the 8-byte header
/// - Magic header doesn't match
/// - Format version is incompatible
/// - Data is corrupted
///
/// In every case except a failed read, the unusable cache file is deleted
/// (best effort) so it gets rebuilt.
#[cfg(feature = "native")]
pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
    use std::fs;

    let path = cache_dir.join(CACHE_FILE_NAME);
    let data = fs::read(&path).ok()?;

    // Shared tail of every rejection path: delete the file, yield `None`.
    let discard = || -> Option<Self> {
        let _ = fs::remove_file(&path);
        None
    };

    // Header layout: 4 bytes magic followed by 4 bytes little-endian version.
    if data.len() < 8 {
        log::warn!("Workspace index cache too small, discarding");
        return discard();
    }
    let (header, payload) = data.split_at(8);
    let (magic, version_bytes) = header.split_at(4);

    if magic != CACHE_MAGIC {
        log::warn!("Workspace index cache has invalid magic header, discarding");
        return discard();
    }

    let version = u32::from_le_bytes([
        version_bytes[0],
        version_bytes[1],
        version_bytes[2],
        version_bytes[3],
    ]);
    if version != CACHE_FORMAT_VERSION {
        log::info!(
            "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
        );
        return discard();
    }

    // Everything after the header is the postcard-encoded index.
    let index = match postcard::from_bytes::<Self>(payload) {
        Ok(index) => index,
        Err(e) => {
            log::warn!("Failed to deserialize workspace index cache: {e}");
            return discard();
        }
    };

    log::debug!(
        "Loaded workspace index from cache: {} files (format v{})",
        index.files.len(),
        version
    );
    Some(index)
}
618
619    /// Remove a file as a source from all reverse dependency entries
620    ///
621    /// This removes the file from being listed as a dependent in all target entries.
622    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
623    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
624        for deps in self.reverse_deps.values_mut() {
625            deps.remove(path);
626        }
627        // Clean up empty entries
628        self.reverse_deps.retain(|_, deps| !deps.is_empty());
629    }
630
/// Remove a file completely from reverse dependency tracking
///
/// Removes the file as both a source (dependent) and as a target.
/// Used when deleting a file from the index. Note that dropping the target
/// entry also forgets which files link to `path`.
fn clear_reverse_deps_for(&mut self, path: &Path) {
    // Remove as source (dependent)
    self.clear_reverse_deps_as_source(path);

    // Also remove as target
    self.reverse_deps.remove(path);
}
642
643    /// Resolve a relative path from a source file to an absolute target path
644    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
645        // Get the directory containing the source file
646        let source_dir = source_file.parent().unwrap_or(Path::new(""));
647
648        // Join with the relative target and normalize
649        let target = source_dir.join(relative_target);
650
651        // Normalize the path (handle .., ., etc.)
652        Self::normalize_path(&target)
653    }
654
/// Normalize a path lexically by resolving `.` and `..` components.
///
/// The file system is not consulted, so symlinks are not followed.
///
/// - `.` components are dropped.
/// - `..` removes the preceding normal component.
/// - `..` directly below the root (or a Windows prefix) is ignored, because
///   there is nothing above the root; the path stays absolute.
/// - `..` that cannot be resolved in a relative path (nothing precedes it,
///   or only other `..` components do) is kept, so `../docs/a.md` stays
///   distinct from `docs/a.md`.
fn normalize_path(path: &Path) -> PathBuf {
    use std::path::Component;

    let mut normalized: Vec<Component<'_>> = Vec::new();

    for component in path.components() {
        match component {
            // Skip current directory markers
            Component::CurDir => {}
            Component::ParentDir => match normalized.last().copied() {
                // Go up one level
                Some(Component::Normal(_)) => {
                    normalized.pop();
                }
                // Already at the root: `..` is a no-op, never pop the root itself
                Some(Component::RootDir | Component::Prefix(_)) => {}
                // Nothing to cancel against: keep the `..`
                _ => normalized.push(component),
            },
            _ => normalized.push(component),
        }
    }

    normalized.iter().collect()
}
678}
679
680impl FileIndex {
681    /// Create a new empty file index
682    pub fn new() -> Self {
683        Self::default()
684    }
685
/// Create a file index with the given content hash
///
/// All other fields start out empty.
pub fn with_hash(content_hash: String) -> Self {
    Self {
        content_hash,
        ..Default::default()
    }
}
693
694    /// Add a heading to the index
695    ///
696    /// Also updates the anchor lookup maps for O(1) anchor queries. Both
697    /// lowercased (for `ignore_case = true`) and case-preserving (for
698    /// `ignore_case = false`) maps are populated.
699    pub fn add_heading(&mut self, heading: HeadingIndex) {
700        let index = self.headings.len();
701
702        // Auto-generated anchor — slugs are already lowercase, but we still
703        // populate both maps so query-time dispatch is uniform.
704        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
705        self.anchor_to_heading_exact.insert(heading.auto_anchor.clone(), index);
706
707        // Custom anchor preserves original case as written by the author.
708        if let Some(ref custom) = heading.custom_anchor {
709            self.anchor_to_heading.insert(custom.to_lowercase(), index);
710            self.anchor_to_heading_exact.insert(custom.clone(), index);
711        }
712
713        self.headings.push(heading);
714    }
715
716    /// Add an alternative anchor that resolves to an existing heading.
717    /// Used for platform-specific anchor conventions (e.g., Python-Markdown `_N` dedup).
718    pub fn add_anchor_alias(&mut self, anchor: &str, heading_index: usize) {
719        if heading_index < self.headings.len() {
720            self.anchor_to_heading.insert(anchor.to_lowercase(), heading_index);
721            self.anchor_to_heading_exact.insert(anchor.to_string(), heading_index);
722        }
723    }
724
/// Check if an anchor exists in this file (O(1) lookup)
///
/// Returns true if the anchor matches any of:
/// - Auto-generated heading anchors
/// - Custom heading anchors (from {#id} syntax on headings)
/// - HTML anchors (from `<a id="...">` or `<element id="...">`)
/// - Attribute anchors (from { #id } syntax on non-heading elements)
///
/// Matching is case-insensitive. If the anchor is not found as written and
/// contains percent-encoding (e.g., CJK characters like
/// `%E6%97%A5%E6%9C%AC%E8%AA%9E` for `日本語`), the decoded form is tried
/// as well.
pub fn has_anchor(&self, anchor: &str) -> bool {
    self.has_anchor_with_case(anchor, true)
}
738
739    /// Check if an anchor exists in this file, with explicit case sensitivity.
740    ///
741    /// When `ignore_case` is `true`, behaves identically to [`has_anchor`] —
742    /// inputs are lowercased and matched against the lowercase storage.
743    /// When `false`, the input is compared as-is against parallel
744    /// case-preserving storage, matching markdownlint's strict behavior for
745    /// generated heading slugs, custom heading IDs, HTML anchors, and
746    /// attribute anchors.
747    pub fn has_anchor_with_case(&self, anchor: &str, ignore_case: bool) -> bool {
748        if self.lookup_anchor(anchor, ignore_case) {
749            return true;
750        }
751
752        // Slow path: if anchor contains percent-encoding, try decoded version
753        if anchor.contains('%') {
754            let decoded = url_decode(anchor);
755            if decoded != anchor {
756                return self.lookup_anchor(&decoded, ignore_case);
757            }
758        }
759
760        false
761    }
762
763    /// Direct anchor lookup, dispatching to the lowercase or exact-case
764    /// storage based on `ignore_case`.
765    fn lookup_anchor(&self, anchor: &str, ignore_case: bool) -> bool {
766        if ignore_case {
767            let lower = anchor.to_lowercase();
768            self.anchor_to_heading.contains_key(&lower)
769                || self.html_anchors.contains(&lower)
770                || self.attribute_anchors.contains(&lower)
771        } else {
772            self.anchor_to_heading_exact.contains_key(anchor)
773                || self.html_anchors_exact.contains(anchor)
774                || self.attribute_anchors_exact.contains(anchor)
775        }
776    }
777
778    /// Add an HTML anchor (from `<a id="...">` or `<element id="...">` tags).
779    /// Populates both lowercase (case-insensitive) and case-preserving sets.
780    pub fn add_html_anchor(&mut self, anchor: &str) {
781        if !anchor.is_empty() {
782            self.html_anchors.insert(anchor.to_lowercase());
783            self.html_anchors_exact.insert(anchor.to_string());
784        }
785    }
786
787    /// Add an attribute anchor (from { #id } syntax on non-heading elements).
788    /// Populates both lowercase (case-insensitive) and case-preserving sets.
789    pub fn add_attribute_anchor(&mut self, anchor: &str) {
790        if !anchor.is_empty() {
791            self.attribute_anchors.insert(anchor.to_lowercase());
792            self.attribute_anchors_exact.insert(anchor.to_string());
793        }
794    }
795
796    /// Get the heading index for an anchor (O(1) lookup)
797    ///
798    /// Returns the index into `self.headings` if found.
799    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
800        self.anchor_to_heading
801            .get(&anchor.to_lowercase())
802            .and_then(|&idx| self.headings.get(idx))
803    }
804
    /// Add a reference link to the index.
    ///
    /// The link is appended to `reference_links` as-is; no deduplication is
    /// performed here.
    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
        self.reference_links.push(link);
    }
809
810    /// Check if a rule is disabled at a specific line
811    ///
812    /// Used by cross-file rules to respect inline disable directives.
813    /// Checks both file-wide disables and line-specific disables.
814    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
815        // Check file-wide disables (highest priority)
816        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
817            return true;
818        }
819
820        // Check line-specific disables (disable-line / disable-next-line)
821        if let Some(rules) = self.line_disabled_rules.get(&line)
822            && (rules.contains("*") || rules.contains(rule_name))
823        {
824            return true;
825        }
826
827        // Check persistent disable/enable transitions via binary search
828        if !self.persistent_transitions.is_empty() {
829            let idx = match self.persistent_transitions.binary_search_by_key(&line, |t| t.0) {
830                Ok(i) => Some(i),
831                Err(i) => {
832                    if i > 0 {
833                        Some(i - 1)
834                    } else {
835                        None
836                    }
837                }
838            };
839            if let Some(i) = idx {
840                let (_, ref disabled, ref enabled) = self.persistent_transitions[i];
841                if disabled.contains("*") {
842                    return !enabled.contains(rule_name);
843                }
844                return disabled.contains(rule_name);
845            }
846        }
847
848        false
849    }
850
851    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line)
852    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
853        // Deduplicate: multiple rules may contribute the same link with different columns
854        // (e.g., MD051 uses link start, MD057 uses URL start)
855        let is_duplicate = self.cross_file_links.iter().any(|existing| {
856            existing.target_path == link.target_path && existing.fragment == link.fragment && existing.line == link.line
857        });
858        if !is_duplicate {
859            self.cross_file_links.push(link);
860        }
861    }
862
    /// Add a defined reference ID (e.g., from `[ref]: url`).
    ///
    /// The ID is inserted into `defined_references` exactly as given; this
    /// method applies no case folding or other normalization.
    pub fn add_defined_reference(&mut self, ref_id: String) {
        self.defined_references.insert(ref_id);
    }
867
    /// Check if a reference ID has an explicit definition.
    ///
    /// Returns `true` when `ref_id` is present in `defined_references`. The
    /// comparison uses `ref_id` exactly as passed in.
    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
        self.defined_references.contains(ref_id)
    }
872
873    /// Check if the content hash matches
874    pub fn hash_matches(&self, hash: &str) -> bool {
875        self.content_hash == hash
876    }
877
    /// Get the number of headings.
    ///
    /// This is the length of `self.headings`.
    pub fn heading_count(&self) -> usize {
        self.headings.len()
    }
882
    /// Get the number of reference links.
    ///
    /// This is the length of `self.reference_links`.
    pub fn reference_link_count(&self) -> usize {
        self.reference_links.len()
    }
887}
888
889#[cfg(test)]
890mod tests {
891    use super::*;
892
893    #[test]
894    fn test_workspace_index_basic() {
895        let mut index = WorkspaceIndex::new();
896        assert_eq!(index.file_count(), 0);
897        assert_eq!(index.version(), 0);
898
899        let mut file_index = FileIndex::with_hash("abc123".to_string());
900        file_index.add_heading(HeadingIndex {
901            text: "Installation".to_string(),
902            auto_anchor: "installation".to_string(),
903            custom_anchor: None,
904            line: 1,
905            is_setext: false,
906        });
907
908        index.insert_file(PathBuf::from("docs/install.md"), file_index);
909        assert_eq!(index.file_count(), 1);
910        assert_eq!(index.version(), 1);
911
912        assert!(index.contains_file(Path::new("docs/install.md")));
913        assert!(!index.contains_file(Path::new("docs/other.md")));
914    }
915
916    #[test]
917    fn test_vulnerable_anchors() {
918        let mut index = WorkspaceIndex::new();
919
920        // File 1: heading without custom anchor (vulnerable)
921        let mut file1 = FileIndex::new();
922        file1.add_heading(HeadingIndex {
923            text: "Getting Started".to_string(),
924            auto_anchor: "getting-started".to_string(),
925            custom_anchor: None,
926            line: 1,
927            is_setext: false,
928        });
929        index.insert_file(PathBuf::from("docs/guide.md"), file1);
930
931        // File 2: heading with custom anchor (not vulnerable)
932        let mut file2 = FileIndex::new();
933        file2.add_heading(HeadingIndex {
934            text: "Installation".to_string(),
935            auto_anchor: "installation".to_string(),
936            custom_anchor: Some("install".to_string()),
937            line: 1,
938            is_setext: false,
939        });
940        index.insert_file(PathBuf::from("docs/install.md"), file2);
941
942        let vulnerable = index.get_vulnerable_anchors();
943        assert_eq!(vulnerable.len(), 1);
944        assert!(vulnerable.contains_key("getting-started"));
945        assert!(!vulnerable.contains_key("installation"));
946
947        let anchors = vulnerable.get("getting-started").unwrap();
948        assert_eq!(anchors.len(), 1);
949        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
950        assert_eq!(anchors[0].text, "Getting Started");
951    }
952
953    #[test]
954    fn test_vulnerable_anchors_multiple_files_same_anchor() {
955        // Multiple files can have headings with the same auto-generated anchor
956        // get_vulnerable_anchors() should collect all of them
957        let mut index = WorkspaceIndex::new();
958
959        // File 1: has "Installation" heading (vulnerable)
960        let mut file1 = FileIndex::new();
961        file1.add_heading(HeadingIndex {
962            text: "Installation".to_string(),
963            auto_anchor: "installation".to_string(),
964            custom_anchor: None,
965            line: 1,
966            is_setext: false,
967        });
968        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
969
970        // File 2: also has "Installation" heading with same anchor (vulnerable)
971        let mut file2 = FileIndex::new();
972        file2.add_heading(HeadingIndex {
973            text: "Installation".to_string(),
974            auto_anchor: "installation".to_string(),
975            custom_anchor: None,
976            line: 5,
977            is_setext: false,
978        });
979        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
980
981        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
982        let mut file3 = FileIndex::new();
983        file3.add_heading(HeadingIndex {
984            text: "Installation".to_string(),
985            auto_anchor: "installation".to_string(),
986            custom_anchor: Some("install".to_string()),
987            line: 10,
988            is_setext: false,
989        });
990        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
991
992        let vulnerable = index.get_vulnerable_anchors();
993        assert_eq!(vulnerable.len(), 1); // One unique anchor
994        assert!(vulnerable.contains_key("installation"));
995
996        let anchors = vulnerable.get("installation").unwrap();
997        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
998        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
999
1000        // Verify both files are represented
1001        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
1002        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
1003        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
1004    }
1005
1006    #[test]
1007    fn test_file_index_hash() {
1008        let index = FileIndex::with_hash("hash123".to_string());
1009        assert!(index.hash_matches("hash123"));
1010        assert!(!index.hash_matches("other"));
1011    }
1012
1013    #[test]
1014    fn test_version_increment() {
1015        let mut index = WorkspaceIndex::new();
1016        assert_eq!(index.version(), 0);
1017
1018        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
1019        assert_eq!(index.version(), 1);
1020
1021        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
1022        assert_eq!(index.version(), 2);
1023
1024        index.remove_file(Path::new("a.md"));
1025        assert_eq!(index.version(), 3);
1026
1027        // Removing non-existent file doesn't increment
1028        index.remove_file(Path::new("nonexistent.md"));
1029        assert_eq!(index.version(), 3);
1030    }
1031
1032    #[test]
1033    fn test_reverse_deps_basic() {
1034        let mut index = WorkspaceIndex::new();
1035
1036        // File A links to file B
1037        let mut file_a = FileIndex::new();
1038        file_a.add_cross_file_link(CrossFileLinkIndex {
1039            target_path: "b.md".to_string(),
1040            fragment: "section".to_string(),
1041            line: 10,
1042            column: 5,
1043        });
1044        index.update_file(Path::new("docs/a.md"), file_a);
1045
1046        // Check that B has A as a dependent
1047        let dependents = index.get_dependents(Path::new("docs/b.md"));
1048        assert_eq!(dependents.len(), 1);
1049        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
1050
1051        // A has no dependents
1052        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
1053        assert!(a_dependents.is_empty());
1054    }
1055
1056    #[test]
1057    fn test_reverse_deps_multiple() {
1058        let mut index = WorkspaceIndex::new();
1059
1060        // Files A and C both link to B
1061        let mut file_a = FileIndex::new();
1062        file_a.add_cross_file_link(CrossFileLinkIndex {
1063            target_path: "../b.md".to_string(),
1064            fragment: "".to_string(),
1065            line: 1,
1066            column: 1,
1067        });
1068        index.update_file(Path::new("docs/sub/a.md"), file_a);
1069
1070        let mut file_c = FileIndex::new();
1071        file_c.add_cross_file_link(CrossFileLinkIndex {
1072            target_path: "b.md".to_string(),
1073            fragment: "".to_string(),
1074            line: 1,
1075            column: 1,
1076        });
1077        index.update_file(Path::new("docs/c.md"), file_c);
1078
1079        // B should have both A and C as dependents
1080        let dependents = index.get_dependents(Path::new("docs/b.md"));
1081        assert_eq!(dependents.len(), 2);
1082        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
1083        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
1084    }
1085
1086    #[test]
1087    fn test_reverse_deps_update_clears_old() {
1088        let mut index = WorkspaceIndex::new();
1089
1090        // File A initially links to B
1091        let mut file_a = FileIndex::new();
1092        file_a.add_cross_file_link(CrossFileLinkIndex {
1093            target_path: "b.md".to_string(),
1094            fragment: "".to_string(),
1095            line: 1,
1096            column: 1,
1097        });
1098        index.update_file(Path::new("docs/a.md"), file_a);
1099
1100        // Verify B has A as dependent
1101        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1102
1103        // Update A to link to C instead of B
1104        let mut file_a_updated = FileIndex::new();
1105        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
1106            target_path: "c.md".to_string(),
1107            fragment: "".to_string(),
1108            line: 1,
1109            column: 1,
1110        });
1111        index.update_file(Path::new("docs/a.md"), file_a_updated);
1112
1113        // B should no longer have A as dependent
1114        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1115
1116        // C should now have A as dependent
1117        let c_deps = index.get_dependents(Path::new("docs/c.md"));
1118        assert_eq!(c_deps.len(), 1);
1119        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
1120    }
1121
1122    #[test]
1123    fn test_reverse_deps_remove_file() {
1124        let mut index = WorkspaceIndex::new();
1125
1126        // File A links to B
1127        let mut file_a = FileIndex::new();
1128        file_a.add_cross_file_link(CrossFileLinkIndex {
1129            target_path: "b.md".to_string(),
1130            fragment: "".to_string(),
1131            line: 1,
1132            column: 1,
1133        });
1134        index.update_file(Path::new("docs/a.md"), file_a);
1135
1136        // Verify B has A as dependent
1137        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1138
1139        // Remove file A
1140        index.remove_file(Path::new("docs/a.md"));
1141
1142        // B should no longer have any dependents
1143        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1144    }
1145
1146    #[test]
1147    fn test_normalize_path() {
1148        // Test .. handling
1149        let path = Path::new("docs/sub/../other.md");
1150        let normalized = WorkspaceIndex::normalize_path(path);
1151        assert_eq!(normalized, PathBuf::from("docs/other.md"));
1152
1153        // Test . handling
1154        let path2 = Path::new("docs/./other.md");
1155        let normalized2 = WorkspaceIndex::normalize_path(path2);
1156        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
1157
1158        // Test multiple ..
1159        let path3 = Path::new("a/b/c/../../d.md");
1160        let normalized3 = WorkspaceIndex::normalize_path(path3);
1161        assert_eq!(normalized3, PathBuf::from("a/d.md"));
1162    }
1163
1164    #[test]
1165    fn test_clear_clears_reverse_deps() {
1166        let mut index = WorkspaceIndex::new();
1167
1168        // File A links to B
1169        let mut file_a = FileIndex::new();
1170        file_a.add_cross_file_link(CrossFileLinkIndex {
1171            target_path: "b.md".to_string(),
1172            fragment: "".to_string(),
1173            line: 1,
1174            column: 1,
1175        });
1176        index.update_file(Path::new("docs/a.md"), file_a);
1177
1178        // Verify B has A as dependent
1179        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1180
1181        // Clear the index
1182        index.clear();
1183
1184        // Both files and reverse deps should be cleared
1185        assert_eq!(index.file_count(), 0);
1186        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1187    }
1188
1189    #[test]
1190    fn test_is_file_stale() {
1191        let mut index = WorkspaceIndex::new();
1192
1193        // Non-existent file is always stale
1194        assert!(index.is_file_stale(Path::new("nonexistent.md"), "hash123"));
1195
1196        // Add a file with known hash
1197        let file_index = FileIndex::with_hash("hash123".to_string());
1198        index.insert_file(PathBuf::from("docs/test.md"), file_index);
1199
1200        // Same hash means not stale
1201        assert!(!index.is_file_stale(Path::new("docs/test.md"), "hash123"));
1202
1203        // Different hash means stale
1204        assert!(index.is_file_stale(Path::new("docs/test.md"), "different_hash"));
1205    }
1206
1207    #[cfg(feature = "native")]
1208    #[test]
1209    fn test_cache_roundtrip() {
1210        use std::fs;
1211
1212        // Create a temp directory
1213        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_roundtrip");
1214        let _ = fs::remove_dir_all(&temp_dir);
1215        fs::create_dir_all(&temp_dir).unwrap();
1216
1217        // Create an index with some data
1218        let mut index = WorkspaceIndex::new();
1219
1220        let mut file1 = FileIndex::with_hash("abc123".to_string());
1221        file1.add_heading(HeadingIndex {
1222            text: "Test Heading".to_string(),
1223            auto_anchor: "test-heading".to_string(),
1224            custom_anchor: Some("test".to_string()),
1225            line: 1,
1226            is_setext: false,
1227        });
1228        file1.add_cross_file_link(CrossFileLinkIndex {
1229            target_path: "./other.md".to_string(),
1230            fragment: "section".to_string(),
1231            line: 5,
1232            column: 3,
1233        });
1234        index.update_file(Path::new("docs/file1.md"), file1);
1235
1236        let mut file2 = FileIndex::with_hash("def456".to_string());
1237        file2.add_heading(HeadingIndex {
1238            text: "Another Heading".to_string(),
1239            auto_anchor: "another-heading".to_string(),
1240            custom_anchor: None,
1241            line: 1,
1242            is_setext: false,
1243        });
1244        index.update_file(Path::new("docs/other.md"), file2);
1245
1246        // Save to cache
1247        index.save_to_cache(&temp_dir).expect("Failed to save cache");
1248
1249        // Verify cache file exists
1250        assert!(temp_dir.join("workspace_index.bin").exists());
1251
1252        // Load from cache
1253        let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");
1254
1255        // Verify data matches
1256        assert_eq!(loaded.file_count(), 2);
1257        assert!(loaded.contains_file(Path::new("docs/file1.md")));
1258        assert!(loaded.contains_file(Path::new("docs/other.md")));
1259
1260        // Check file1 details
1261        let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
1262        assert_eq!(file1_loaded.content_hash, "abc123");
1263        assert_eq!(file1_loaded.headings.len(), 1);
1264        assert_eq!(file1_loaded.headings[0].text, "Test Heading");
1265        assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
1266        assert_eq!(file1_loaded.cross_file_links.len(), 1);
1267        assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");
1268
1269        // Check reverse deps were serialized correctly
1270        let dependents = loaded.get_dependents(Path::new("docs/other.md"));
1271        assert_eq!(dependents.len(), 1);
1272        assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));
1273
1274        // Clean up
1275        let _ = fs::remove_dir_all(&temp_dir);
1276    }
1277
1278    #[cfg(feature = "native")]
1279    #[test]
1280    fn test_cache_missing_file() {
1281        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
1282        let _ = std::fs::remove_dir_all(&temp_dir);
1283
1284        // Should return None for non-existent cache
1285        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1286        assert!(result.is_none());
1287    }
1288
1289    #[cfg(feature = "native")]
1290    #[test]
1291    fn test_cache_corrupted_file() {
1292        use std::fs;
1293
1294        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_corrupted");
1295        let _ = fs::remove_dir_all(&temp_dir);
1296        fs::create_dir_all(&temp_dir).unwrap();
1297
1298        // Write corrupted data (too small for header)
1299        fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();
1300
1301        // Should return None for corrupted cache (and remove the file)
1302        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1303        assert!(result.is_none());
1304
1305        // Corrupted file should be removed
1306        assert!(!temp_dir.join("workspace_index.bin").exists());
1307
1308        // Clean up
1309        let _ = fs::remove_dir_all(&temp_dir);
1310    }
1311
1312    #[cfg(feature = "native")]
1313    #[test]
1314    fn test_cache_invalid_magic() {
1315        use std::fs;
1316
1317        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_invalid_magic");
1318        let _ = fs::remove_dir_all(&temp_dir);
1319        fs::create_dir_all(&temp_dir).unwrap();
1320
1321        // Write data with wrong magic header
1322        let mut data = Vec::new();
1323        data.extend_from_slice(b"XXXX"); // Wrong magic
1324        data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
1325        data.extend_from_slice(&[0; 100]); // Some garbage data
1326        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1327
1328        // Should return None for invalid magic
1329        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1330        assert!(result.is_none());
1331
1332        // File should be removed
1333        assert!(!temp_dir.join("workspace_index.bin").exists());
1334
1335        // Clean up
1336        let _ = fs::remove_dir_all(&temp_dir);
1337    }
1338
1339    #[cfg(feature = "native")]
1340    #[test]
1341    fn test_cache_version_mismatch() {
1342        use std::fs;
1343
1344        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_version_mismatch");
1345        let _ = fs::remove_dir_all(&temp_dir);
1346        fs::create_dir_all(&temp_dir).unwrap();
1347
1348        // Write data with correct magic but wrong version
1349        let mut data = Vec::new();
1350        data.extend_from_slice(b"RWSI"); // Correct magic
1351        data.extend_from_slice(&999u32.to_le_bytes()); // Future version
1352        data.extend_from_slice(&[0; 100]); // Some garbage data
1353        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1354
1355        // Should return None for version mismatch
1356        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1357        assert!(result.is_none());
1358
1359        // File should be removed to trigger rebuild
1360        assert!(!temp_dir.join("workspace_index.bin").exists());
1361
1362        // Clean up
1363        let _ = fs::remove_dir_all(&temp_dir);
1364    }
1365
1366    #[cfg(feature = "native")]
1367    #[test]
1368    fn test_cache_atomic_write() {
1369        use std::fs;
1370
1371        // Test that atomic writes work (no temp files left behind)
1372        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_atomic");
1373        let _ = fs::remove_dir_all(&temp_dir);
1374        fs::create_dir_all(&temp_dir).unwrap();
1375
1376        let index = WorkspaceIndex::new();
1377        index.save_to_cache(&temp_dir).expect("Failed to save");
1378
1379        // Only the final cache file should exist, no temp files
1380        let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
1381        assert_eq!(entries.len(), 1);
1382        assert!(temp_dir.join("workspace_index.bin").exists());
1383
1384        // Clean up
1385        let _ = fs::remove_dir_all(&temp_dir);
1386    }
1387
1388    #[test]
1389    fn test_has_anchor_auto_generated() {
1390        let mut file_index = FileIndex::new();
1391        file_index.add_heading(HeadingIndex {
1392            text: "Installation Guide".to_string(),
1393            auto_anchor: "installation-guide".to_string(),
1394            custom_anchor: None,
1395            line: 1,
1396            is_setext: false,
1397        });
1398
1399        // Should find by auto-generated anchor
1400        assert!(file_index.has_anchor("installation-guide"));
1401
1402        // Case-insensitive matching
1403        assert!(file_index.has_anchor("Installation-Guide"));
1404        assert!(file_index.has_anchor("INSTALLATION-GUIDE"));
1405
1406        // Should not find non-existent anchor
1407        assert!(!file_index.has_anchor("nonexistent"));
1408    }
1409
1410    #[test]
1411    fn test_has_anchor_custom() {
1412        let mut file_index = FileIndex::new();
1413        file_index.add_heading(HeadingIndex {
1414            text: "Installation Guide".to_string(),
1415            auto_anchor: "installation-guide".to_string(),
1416            custom_anchor: Some("install".to_string()),
1417            line: 1,
1418            is_setext: false,
1419        });
1420
1421        // Should find by auto-generated anchor
1422        assert!(file_index.has_anchor("installation-guide"));
1423
1424        // Should also find by custom anchor
1425        assert!(file_index.has_anchor("install"));
1426        assert!(file_index.has_anchor("Install")); // case-insensitive
1427
1428        // Should not find non-existent anchor
1429        assert!(!file_index.has_anchor("nonexistent"));
1430    }
1431
1432    #[test]
1433    fn test_get_heading_by_anchor() {
1434        let mut file_index = FileIndex::new();
1435        file_index.add_heading(HeadingIndex {
1436            text: "Installation Guide".to_string(),
1437            auto_anchor: "installation-guide".to_string(),
1438            custom_anchor: Some("install".to_string()),
1439            line: 10,
1440            is_setext: false,
1441        });
1442        file_index.add_heading(HeadingIndex {
1443            text: "Configuration".to_string(),
1444            auto_anchor: "configuration".to_string(),
1445            custom_anchor: None,
1446            line: 20,
1447            is_setext: false,
1448        });
1449
1450        // Get by auto anchor
1451        let heading = file_index.get_heading_by_anchor("installation-guide");
1452        assert!(heading.is_some());
1453        assert_eq!(heading.unwrap().text, "Installation Guide");
1454        assert_eq!(heading.unwrap().line, 10);
1455
1456        // Get by custom anchor
1457        let heading = file_index.get_heading_by_anchor("install");
1458        assert!(heading.is_some());
1459        assert_eq!(heading.unwrap().text, "Installation Guide");
1460
1461        // Get second heading
1462        let heading = file_index.get_heading_by_anchor("configuration");
1463        assert!(heading.is_some());
1464        assert_eq!(heading.unwrap().text, "Configuration");
1465        assert_eq!(heading.unwrap().line, 20);
1466
1467        // Non-existent
1468        assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
1469    }
1470
1471    #[test]
1472    fn test_anchor_lookup_many_headings() {
1473        // Test that O(1) lookup works with many headings
1474        let mut file_index = FileIndex::new();
1475
1476        // Add 100 headings
1477        for i in 0..100 {
1478            file_index.add_heading(HeadingIndex {
1479                text: format!("Heading {i}"),
1480                auto_anchor: format!("heading-{i}"),
1481                custom_anchor: Some(format!("h{i}")),
1482                line: i + 1,
1483                is_setext: false,
1484            });
1485        }
1486
1487        // Verify all can be found
1488        for i in 0..100 {
1489            assert!(file_index.has_anchor(&format!("heading-{i}")));
1490            assert!(file_index.has_anchor(&format!("h{i}")));
1491
1492            let heading = file_index.get_heading_by_anchor(&format!("heading-{i}"));
1493            assert!(heading.is_some());
1494            assert_eq!(heading.unwrap().line, i + 1);
1495        }
1496    }
1497
1498    // =============================================================================
1499    // Tests for extract_cross_file_links utility
1500    // =============================================================================
1501
1502    #[test]
1503    fn test_extract_cross_file_links_basic() {
1504        use crate::config::MarkdownFlavor;
1505
1506        let content = "# Test\n\nSee [link](./other.md) for info.\n";
1507        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1508        let links = extract_cross_file_links(&ctx);
1509
1510        assert_eq!(links.len(), 1);
1511        assert_eq!(links[0].target_path, "./other.md");
1512        assert_eq!(links[0].fragment, "");
1513        assert_eq!(links[0].line, 3);
1514        // "See [link](" = 11 chars, so column 12 is where "./other.md" starts
1515        assert_eq!(links[0].column, 12);
1516    }
1517
1518    #[test]
1519    fn test_extract_cross_file_links_with_fragment() {
1520        use crate::config::MarkdownFlavor;
1521
1522        let content = "Check [guide](./guide.md#install) here.\n";
1523        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1524        let links = extract_cross_file_links(&ctx);
1525
1526        assert_eq!(links.len(), 1);
1527        assert_eq!(links[0].target_path, "./guide.md");
1528        assert_eq!(links[0].fragment, "install");
1529        assert_eq!(links[0].line, 1);
1530        // "Check [guide](" = 14 chars, so column 15 is where "./guide.md" starts
1531        assert_eq!(links[0].column, 15);
1532    }
1533
1534    #[test]
1535    fn test_extract_cross_file_links_multiple_on_same_line() {
1536        use crate::config::MarkdownFlavor;
1537
1538        let content = "See [a](a.md) and [b](b.md) here.\n";
1539        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1540        let links = extract_cross_file_links(&ctx);
1541
1542        assert_eq!(links.len(), 2);
1543
1544        assert_eq!(links[0].target_path, "a.md");
1545        assert_eq!(links[0].line, 1);
1546        // "See [a](" = 8 chars, so column 9
1547        assert_eq!(links[0].column, 9);
1548
1549        assert_eq!(links[1].target_path, "b.md");
1550        assert_eq!(links[1].line, 1);
1551        // "See [a](a.md) and [b](" = 22 chars, so column 23
1552        assert_eq!(links[1].column, 23);
1553    }
1554
1555    #[test]
1556    fn test_extract_cross_file_links_angle_brackets() {
1557        use crate::config::MarkdownFlavor;
1558
1559        let content = "See [link](<path/with (parens).md>) here.\n";
1560        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1561        let links = extract_cross_file_links(&ctx);
1562
1563        assert_eq!(links.len(), 1);
1564        assert_eq!(links[0].target_path, "path/with (parens).md");
1565        assert_eq!(links[0].line, 1);
1566        // "See [link](<" = 12 chars, so column 13
1567        assert_eq!(links[0].column, 13);
1568    }
1569
1570    #[test]
1571    fn test_extract_cross_file_links_skips_external() {
1572        use crate::config::MarkdownFlavor;
1573
1574        let content = r#"
1575[external](https://example.com)
1576[mailto](mailto:test@example.com)
1577[local](./local.md)
1578[fragment](#section)
1579[absolute](/docs/page.md)
1580"#;
1581        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1582        let links = extract_cross_file_links(&ctx);
1583
1584        // Only the local markdown link should be extracted
1585        assert_eq!(links.len(), 1);
1586        assert_eq!(links[0].target_path, "./local.md");
1587    }
1588
1589    #[test]
1590    fn test_extract_cross_file_links_skips_non_markdown() {
1591        use crate::config::MarkdownFlavor;
1592
1593        let content = r#"
1594[image](./photo.png)
1595[doc](./readme.md)
1596[pdf](./document.pdf)
1597"#;
1598        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1599        let links = extract_cross_file_links(&ctx);
1600
1601        // Only markdown files are indexed for cross-file validation
1602        assert_eq!(links.len(), 1);
1603        assert_eq!(links[0].target_path, "./readme.md");
1604    }
1605
1606    #[test]
1607    fn test_extract_cross_file_links_skips_code_spans() {
1608        use crate::config::MarkdownFlavor;
1609
1610        let content = "Normal [link](./file.md) and `[code](./ignored.md)` here.\n";
1611        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1612        let links = extract_cross_file_links(&ctx);
1613
1614        // Only the link outside code span should be extracted
1615        assert_eq!(links.len(), 1);
1616        assert_eq!(links[0].target_path, "./file.md");
1617    }
1618
1619    #[test]
1620    fn test_extract_cross_file_links_with_query_params() {
1621        use crate::config::MarkdownFlavor;
1622
1623        let content = "See [doc](./file.md?raw=true) here.\n";
1624        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1625        let links = extract_cross_file_links(&ctx);
1626
1627        assert_eq!(links.len(), 1);
1628        // Query params should be stripped
1629        assert_eq!(links[0].target_path, "./file.md");
1630    }
1631
1632    #[test]
1633    fn test_extract_cross_file_links_empty_content() {
1634        use crate::config::MarkdownFlavor;
1635
1636        let content = "";
1637        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1638        let links = extract_cross_file_links(&ctx);
1639
1640        assert!(links.is_empty());
1641    }
1642
1643    #[test]
1644    fn test_extract_cross_file_links_no_links() {
1645        use crate::config::MarkdownFlavor;
1646
1647        let content = "# Just a heading\n\nSome text without links.\n";
1648        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1649        let links = extract_cross_file_links(&ctx);
1650
1651        assert!(links.is_empty());
1652    }
1653
1654    #[test]
1655    fn test_extract_cross_file_links_position_accuracy_issue_234() {
1656        // This test verifies the fix for GitHub issue #234
1657        // The LSP was reporting incorrect column positions for MD057 diagnostics
1658        use crate::config::MarkdownFlavor;
1659
1660        let content = r#"# Test Document
1661
1662Here is a [broken link](nonexistent-file.md) that should trigger MD057.
1663
1664And another [link](also-missing.md) on this line.
1665"#;
1666        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1667        let links = extract_cross_file_links(&ctx);
1668
1669        assert_eq!(links.len(), 2);
1670
1671        // First link: "Here is a [broken link](" = 24 chars, column 25
1672        assert_eq!(links[0].target_path, "nonexistent-file.md");
1673        assert_eq!(links[0].line, 3);
1674        assert_eq!(links[0].column, 25);
1675
1676        // Second link: "And another [link](" = 19 chars, column 20
1677        assert_eq!(links[1].target_path, "also-missing.md");
1678        assert_eq!(links[1].line, 5);
1679        assert_eq!(links[1].column, 20);
1680    }
1681}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs