rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: postcard-serialized WorkspaceIndex]
19//! ```
20
21use regex::Regex;
22use serde::{Deserialize, Serialize};
23use std::collections::{HashMap, HashSet};
24use std::path::{Path, PathBuf};
25use std::sync::LazyLock;
26
27use crate::lint_context::LintContext;
28
29// =============================================================================
30// URL Decoding Helper
31// =============================================================================
32
/// Map an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its value in `0..=15`.
///
/// Returns `None` for any byte that is not a hexadecimal digit.
fn hex_digit_to_value(c: u8) -> Option<u8> {
    // Each range shares a fixed offset between the byte and its numeric value.
    let offset = match c {
        b'0'..=b'9' => b'0',
        b'a'..=b'f' => b'a' - 10,
        b'A'..=b'F' => b'A' - 10,
        _ => return None,
    };
    Some(c - offset)
}
42
43/// URL-decode a string, handling percent-encoded characters.
44/// Returns the decoded string, or the original if decoding fails.
45/// Used for matching URL-encoded CJK fragments against raw anchors.
46fn url_decode(s: &str) -> String {
47    // Fast path: no percent signs means no encoding
48    if !s.contains('%') {
49        return s.to_string();
50    }
51
52    let bytes = s.as_bytes();
53    let mut result = Vec::with_capacity(bytes.len());
54    let mut i = 0;
55
56    while i < bytes.len() {
57        if bytes[i] == b'%' && i + 2 < bytes.len() {
58            // Try to parse the two hex digits following %
59            let hex1 = bytes[i + 1];
60            let hex2 = bytes[i + 2];
61            if let (Some(d1), Some(d2)) = (hex_digit_to_value(hex1), hex_digit_to_value(hex2)) {
62                result.push(d1 * 16 + d2);
63                i += 3;
64                continue;
65            }
66        }
67        result.push(bytes[i]);
68        i += 1;
69    }
70
71    // Convert to UTF-8, falling back to original if invalid
72    String::from_utf8(result).unwrap_or_else(|_| s.to_string())
73}
74
75// =============================================================================
76// Shared cross-file link extraction utilities
77//
78// These regexes and helpers are the canonical implementation for extracting
79// cross-file links. Both MD057 and LSP use this shared code path for correct
80// position tracking.
81// =============================================================================
82
/// Regex matching the bracketed text that opens a link or image: an optional
/// leading `!`, then `[`, any run of characters other than `]`, then `]`.
static LINK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!?\[[^\]]*\]").unwrap());

/// Regex to extract the URL from an angle-bracketed markdown link
/// Format: `](<URL>)` or `](<URL> "title")`
///
/// Capture group 1 is the text between `<` and `>`; capture group 2 is an
/// optional `#fragment` that appears *after* the closing `>`. Only
/// double-quoted titles are recognised.
static URL_EXTRACT_ANGLE_BRACKET_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\]\(\s*<([^>]+)>(#[^\)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to extract the URL from a normal markdown link (without angle brackets)
/// Format: `](URL)` or `](URL "title")`
///
/// Capture group 1 is the URL up to the first `>`, `)`, whitespace or `#`;
/// capture group 2 is the optional `#fragment` that follows it. Only
/// double-quoted titles are recognised.
static URL_EXTRACT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"]\(\s*([^>)\s#]+)(#[^)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to detect URLs with explicit schemes
///
/// Matches strings that start with `scheme://`, with `scheme:`, or with `www.`.
pub(crate) static PROTOCOL_DOMAIN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^([a-zA-Z][a-zA-Z0-9+.-]*://|[a-zA-Z][a-zA-Z0-9+.-]*:|www\.)").unwrap());
99
/// File-name suffixes (lowercase, including the dot) treated as markdown.
const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdx",
    ".mkd",
    ".mkdn",
    ".mdown",
    ".mdwn",
    ".qmd",
    ".rmd",
];

/// Report whether `path` ends with one of [`MARKDOWN_EXTENSIONS`].
///
/// The comparison is case-insensitive: the path is lowercased before the
/// suffix test.
#[inline]
fn is_markdown_file(path: &str) -> bool {
    let lowered = path.to_lowercase();
    for extension in MARKDOWN_EXTENSIONS {
        if lowered.ends_with(*extension) {
            return true;
        }
    }
    false
}
119
/// Return the part of `url` that precedes the first `?` or `#`.
///
/// The whole input is returned when it contains neither character.
fn strip_query_and_fragment(url: &str) -> &str {
    // The earliest of the two delimiters is where the path ends.
    let cut = url.find(|c: char| matches!(c, '?' | '#'));
    cut.map_or(url, |end| &url[..end])
}
133
134/// Extract cross-file links from content using correct regex-based position tracking.
135///
136/// This is the canonical implementation used by both MD057 and LSP to ensure
137/// consistent and correct column positions for diagnostic reporting.
138///
139/// Returns a vector of `CrossFileLinkIndex` entries, one for each markdown file
140/// link found in the content.
141pub fn extract_cross_file_links(ctx: &LintContext) -> Vec<CrossFileLinkIndex> {
142    let content = ctx.content;
143
144    // Early returns for performance
145    if content.is_empty() || !content.contains("](") {
146        return Vec::new();
147    }
148
149    let mut links = Vec::new();
150    let lines: Vec<&str> = content.lines().collect();
151    let line_index = &ctx.line_index;
152
153    // Track which lines we've already processed to avoid duplicates
154    // (ctx.links may have multiple entries for the same line)
155    let mut processed_lines = HashSet::new();
156
157    for link in &ctx.links {
158        let line_idx = link.line - 1;
159        if line_idx >= lines.len() {
160            continue;
161        }
162
163        // Skip if we've already processed this line
164        if !processed_lines.insert(line_idx) {
165            continue;
166        }
167
168        let line = lines[line_idx];
169        if !line.contains("](") {
170            continue;
171        }
172
173        // Find all links in this line
174        for link_match in LINK_START_REGEX.find_iter(line) {
175            let start_pos = link_match.start();
176            let end_pos = link_match.end();
177
178            // Calculate absolute position for code span detection
179            let line_start_byte = line_index.get_line_start_byte(line_idx + 1).unwrap_or(0);
180            let absolute_start_pos = line_start_byte + start_pos;
181
182            // Skip if in code span
183            if ctx.is_in_code_span_byte(absolute_start_pos) {
184                continue;
185            }
186
187            // Extract the URL (group 1) and fragment (group 2)
188            // Try angle-bracket regex first (handles URLs with parens)
189            let caps_result = URL_EXTRACT_ANGLE_BRACKET_REGEX
190                .captures_at(line, end_pos - 1)
191                .or_else(|| URL_EXTRACT_REGEX.captures_at(line, end_pos - 1));
192
193            if let Some(caps) = caps_result
194                && let Some(url_group) = caps.get(1)
195            {
196                let file_path = url_group.as_str().trim();
197
198                // Skip empty, external, template variables, absolute URL paths,
199                // framework aliases, fragment-only URLs, or rustdoc intra-doc links
200                if file_path.is_empty()
201                    || PROTOCOL_DOMAIN_REGEX.is_match(file_path)
202                    || file_path.starts_with("www.")
203                    || file_path.starts_with('#')
204                    || file_path.starts_with("{{")
205                    || file_path.starts_with("{%")
206                    || file_path.starts_with('/')
207                    || file_path.starts_with('~')
208                    || file_path.starts_with('@')
209                    || (file_path.starts_with('`') && file_path.ends_with('`'))
210                {
211                    continue;
212                }
213
214                // Strip query parameters before indexing
215                let file_path = strip_query_and_fragment(file_path);
216
217                // Get fragment from capture group 2 (includes # prefix)
218                let fragment = caps.get(2).map(|m| m.as_str().trim_start_matches('#')).unwrap_or("");
219
220                // Only index markdown file links for cross-file validation
221                if is_markdown_file(file_path) {
222                    links.push(CrossFileLinkIndex {
223                        target_path: file_path.to_string(),
224                        fragment: fragment.to_string(),
225                        line: link.line,
226                        column: url_group.start() + 1,
227                    });
228                }
229            }
230        }
231    }
232
233    links
234}
235
/// Magic bytes identifying a workspace index cache file
/// (the first four bytes of the file; "RWSI" = Rumdl Workspace Index).
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// Cache format version - increment when WorkspaceIndex serialization changes.
/// Stored little-endian right after the magic bytes; a cache file carrying a
/// different version is discarded on load.
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 5;

/// Cache file name within the version directory
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
247
/// Workspace-wide index for cross-file analysis
///
/// Contains pre-extracted information from all markdown files in the workspace,
/// enabling rules to validate cross-file references efficiently.
///
/// This type is what gets serialized into the on-disk cache, so a change to
/// its fields (or to the nested index types) calls for a bump of
/// `CACHE_FORMAT_VERSION`.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct WorkspaceIndex {
    /// Map from file path to its extracted data
    files: HashMap<PathBuf, FileIndex>,
    /// Reverse dependency graph: target file → files that link to it
    /// Used to efficiently re-lint dependent files when a target changes.
    /// Maintained by `update_file` / `remove_file`; `insert_file` does not touch it.
    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
    /// Version counter for cache invalidation (incremented on any change,
    /// wrapping on overflow)
    version: u64,
}
262
/// Index data extracted from a single file
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FileIndex {
    /// Headings in this file with their anchors
    /// (push through `add_heading` so the anchor lookup map stays in sync)
    pub headings: Vec<HeadingIndex>,
    /// Reference links in this file (for cross-file analysis)
    pub reference_links: Vec<ReferenceLinkIndex>,
    /// Cross-file links in this file (for MD051 cross-file validation)
    pub cross_file_links: Vec<CrossFileLinkIndex>,
    /// Defined reference IDs (e.g., from `[ref]: url` definitions)
    /// Used to filter out reference links that have explicit definitions
    pub defined_references: HashSet<String>,
    /// Content hash for change detection
    pub content_hash: String,
    /// O(1) anchor lookup: lowercased anchor → index into `headings`
    /// Includes auto-generated anchors, custom anchors and registered aliases
    anchor_to_heading: HashMap<String, usize>,
    /// HTML anchors defined via `<a id="...">` or `<element id="...">` tags
    /// Stored lowercase for case-insensitive matching
    html_anchors: HashSet<String>,
    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list)
    /// Can appear on any element, not just headings
    /// Stored lowercase for case-insensitive matching
    attribute_anchors: HashSet<String>,
    /// Rules disabled for the entire file (from inline comments)
    /// Used by cross-file rules to respect inline disable directives.
    /// The entry `"*"` disables every rule.
    pub file_disabled_rules: HashSet<String>,
    /// Persistent disable/enable state transitions, sorted by line number.
    /// Each entry: (line, disabled_rules, enabled_rules). Use binary search to query.
    pub persistent_transitions: Vec<(usize, HashSet<String>, HashSet<String>)>,
    /// Rules disabled at specific lines via disable-line / disable-next-line
    /// (keyed by line number; `"*"` disables every rule on that line)
    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
}
296
/// Information about a heading for cross-file lookup
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingIndex {
    /// The heading text (e.g., "Installation Guide")
    pub text: String,
    /// Auto-generated anchor (e.g., "installation-guide"); may be empty
    pub auto_anchor: String,
    /// Custom anchor if present (e.g., "install")
    pub custom_anchor: Option<String>,
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a Setext-style heading (underlined with = or -)
    /// Falls back to `false` when the field is absent during deserialization.
    #[serde(default)]
    pub is_setext: bool,
}
312
/// Information about a reference link for cross-file analysis
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceLinkIndex {
    /// The reference ID (the `ref` part in `[text][ref]`)
    pub reference_id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    pub column: usize,
}
323
/// Information about a cross-file link for validation
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossFileLinkIndex {
    /// The target file path (relative, as it appears in the link,
    /// without query string or fragment)
    pub target_path: String,
    /// The fragment/anchor being linked to (without #); empty when the link has none
    pub fragment: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    pub column: usize,
}
336
/// Information about a vulnerable anchor (heading without custom ID)
///
/// Produced by `WorkspaceIndex::get_vulnerable_anchors`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VulnerableAnchor {
    /// File path where the heading is located
    pub file: PathBuf,
    /// Line number of the heading
    pub line: usize,
    /// The heading text
    pub text: String,
}
347
348impl WorkspaceIndex {
349    /// Create a new empty workspace index
350    pub fn new() -> Self {
351        Self::default()
352    }
353
354    /// Get the current version (for cache invalidation)
355    pub fn version(&self) -> u64 {
356        self.version
357    }
358
359    /// Get the number of indexed files
360    pub fn file_count(&self) -> usize {
361        self.files.len()
362    }
363
364    /// Check if a file is in the index
365    pub fn contains_file(&self, path: &Path) -> bool {
366        self.files.contains_key(path)
367    }
368
369    /// Get the index data for a specific file
370    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
371        self.files.get(path)
372    }
373
374    /// Insert or update a file's index data
375    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
376        self.files.insert(path, index);
377        self.version = self.version.wrapping_add(1);
378    }
379
380    /// Remove a file from the index
381    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
382        // Clean up reverse deps for this file
383        self.clear_reverse_deps_for(path);
384
385        let result = self.files.remove(path);
386        if result.is_some() {
387            self.version = self.version.wrapping_add(1);
388        }
389        result
390    }
391
392    /// Build a map of all "vulnerable" anchors across the workspace
393    ///
394    /// A vulnerable anchor is an auto-generated anchor for a heading that
395    /// does NOT have a custom anchor defined. These are problematic for
396    /// translated content because the anchor changes when the heading is translated.
397    ///
398    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
399    /// Multiple files can have headings with the same auto-generated anchor,
400    /// so we collect all occurrences.
401    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
402        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
403
404        for (file_path, file_index) in &self.files {
405            for heading in &file_index.headings {
406                // Only include headings WITHOUT custom anchors
407                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
408                    let anchor_key = heading.auto_anchor.to_lowercase();
409                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
410                        file: file_path.clone(),
411                        line: heading.line,
412                        text: heading.text.clone(),
413                    });
414                }
415            }
416        }
417
418        vulnerable
419    }
420
421    /// Get all headings across the workspace (for debugging/testing)
422    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
423        self.files
424            .iter()
425            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
426    }
427
428    /// Iterate over all files in the index
429    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
430        self.files.iter().map(|(p, i)| (p.as_path(), i))
431    }
432
433    /// Clear the entire index
434    pub fn clear(&mut self) {
435        self.files.clear();
436        self.reverse_deps.clear();
437        self.version = self.version.wrapping_add(1);
438    }
439
440    /// Update a file's index and maintain reverse dependencies
441    ///
442    /// This method:
443    /// 1. Removes this file as a source (dependent) from all reverse deps
444    /// 2. Inserts the new file index
445    /// 3. Builds new reverse deps from cross_file_links
446    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
447        // Remove this file as a source (dependent) from all target entries
448        // Note: We don't remove it as a target - other files may still link to it
449        self.clear_reverse_deps_as_source(path);
450
451        // Build new reverse deps from cross_file_links
452        for link in &index.cross_file_links {
453            let target = self.resolve_target_path(path, &link.target_path);
454            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
455        }
456
457        self.files.insert(path.to_path_buf(), index);
458        self.version = self.version.wrapping_add(1);
459    }
460
461    /// Get files that depend on (link to) the given file
462    ///
463    /// Returns a list of file paths that contain links targeting this file.
464    /// Used to re-lint dependent files when a target file changes.
465    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
466        self.reverse_deps
467            .get(path)
468            .map(|set| set.iter().cloned().collect())
469            .unwrap_or_default()
470    }
471
472    /// Check if a file needs re-indexing based on its content hash
473    ///
474    /// Returns `true` if the file is not in the index or has a different hash.
475    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
476        self.files
477            .get(path)
478            .map(|f| f.content_hash != current_hash)
479            .unwrap_or(true)
480    }
481
482    /// Retain only files that exist in the given set, removing deleted files
483    ///
484    /// This prunes stale entries from the cache for files that no longer exist.
485    /// Returns the number of files removed.
486    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
487        let before_count = self.files.len();
488
489        // Collect files to remove
490        let to_remove: Vec<PathBuf> = self
491            .files
492            .keys()
493            .filter(|path| !current_files.contains(*path))
494            .cloned()
495            .collect();
496
497        // Remove each file properly (clears reverse deps)
498        for path in &to_remove {
499            self.remove_file(path);
500        }
501
502        before_count - self.files.len()
503    }
504
505    /// Save the workspace index to a cache file
506    ///
507    /// Uses postcard for efficient binary serialization with:
508    /// - Magic header for file type validation
509    /// - Format version for compatibility detection
510    /// - Atomic writes (temp file + rename) to prevent corruption
511    #[cfg(feature = "native")]
512    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
513        use std::fs;
514        use std::io::Write;
515
516        // Ensure cache directory exists
517        fs::create_dir_all(cache_dir)?;
518
519        // Serialize the index data using postcard
520        let encoded = postcard::to_allocvec(self)
521            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
522
523        // Build versioned cache file: [magic][version][data]
524        let mut cache_data = Vec::with_capacity(8 + encoded.len());
525        cache_data.extend_from_slice(CACHE_MAGIC);
526        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
527        cache_data.extend_from_slice(&encoded);
528
529        // Write atomically: write to temp file then rename
530        let final_path = cache_dir.join(CACHE_FILE_NAME);
531        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
532
533        // Write to temp file
534        {
535            let mut file = fs::File::create(&temp_path)?;
536            file.write_all(&cache_data)?;
537            file.sync_all()?;
538        }
539
540        // Atomic rename
541        fs::rename(&temp_path, &final_path)?;
542
543        log::debug!(
544            "Saved workspace index to cache: {} files, {} bytes (format v{})",
545            self.files.len(),
546            cache_data.len(),
547            CACHE_FORMAT_VERSION
548        );
549
550        Ok(())
551    }
552
553    /// Load the workspace index from a cache file
554    ///
555    /// Returns `None` if:
556    /// - Cache file doesn't exist
557    /// - Magic header doesn't match
558    /// - Format version is incompatible
559    /// - Data is corrupted
560    #[cfg(feature = "native")]
561    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
562        use std::fs;
563
564        let path = cache_dir.join(CACHE_FILE_NAME);
565        let data = fs::read(&path).ok()?;
566
567        // Validate header: need at least 8 bytes for magic + version
568        if data.len() < 8 {
569            log::warn!("Workspace index cache too small, discarding");
570            let _ = fs::remove_file(&path);
571            return None;
572        }
573
574        // Check magic header
575        if &data[0..4] != CACHE_MAGIC {
576            log::warn!("Workspace index cache has invalid magic header, discarding");
577            let _ = fs::remove_file(&path);
578            return None;
579        }
580
581        // Check format version
582        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
583        if version != CACHE_FORMAT_VERSION {
584            log::info!(
585                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
586            );
587            let _ = fs::remove_file(&path);
588            return None;
589        }
590
591        // Deserialize the index data using postcard
592        match postcard::from_bytes::<Self>(&data[8..]) {
593            Ok(index) => {
594                log::debug!(
595                    "Loaded workspace index from cache: {} files (format v{})",
596                    index.files.len(),
597                    version
598                );
599                Some(index)
600            }
601            Err(e) => {
602                log::warn!("Failed to deserialize workspace index cache: {e}");
603                let _ = fs::remove_file(&path);
604                None
605            }
606        }
607    }
608
609    /// Remove a file as a source from all reverse dependency entries
610    ///
611    /// This removes the file from being listed as a dependent in all target entries.
612    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
613    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
614        for deps in self.reverse_deps.values_mut() {
615            deps.remove(path);
616        }
617        // Clean up empty entries
618        self.reverse_deps.retain(|_, deps| !deps.is_empty());
619    }
620
621    /// Remove a file completely from reverse dependency tracking
622    ///
623    /// Removes the file as both a source (dependent) and as a target.
624    /// Used when deleting a file from the index.
625    fn clear_reverse_deps_for(&mut self, path: &Path) {
626        // Remove as source (dependent)
627        self.clear_reverse_deps_as_source(path);
628
629        // Also remove as target
630        self.reverse_deps.remove(path);
631    }
632
633    /// Resolve a relative path from a source file to an absolute target path
634    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
635        // Get the directory containing the source file
636        let source_dir = source_file.parent().unwrap_or(Path::new(""));
637
638        // Join with the relative target and normalize
639        let target = source_dir.join(relative_target);
640
641        // Normalize the path (handle .., ., etc.)
642        Self::normalize_path(&target)
643    }
644
645    /// Normalize a path by resolving . and .. components
646    fn normalize_path(path: &Path) -> PathBuf {
647        let mut components = Vec::new();
648
649        for component in path.components() {
650            match component {
651                std::path::Component::ParentDir => {
652                    // Go up one level if possible
653                    if !components.is_empty() {
654                        components.pop();
655                    }
656                }
657                std::path::Component::CurDir => {
658                    // Skip current directory markers
659                }
660                _ => {
661                    components.push(component);
662                }
663            }
664        }
665
666        components.iter().collect()
667    }
668}
669
670impl FileIndex {
671    /// Create a new empty file index
672    pub fn new() -> Self {
673        Self::default()
674    }
675
676    /// Create a file index with the given content hash
677    pub fn with_hash(content_hash: String) -> Self {
678        Self {
679            content_hash,
680            ..Default::default()
681        }
682    }
683
684    /// Add a heading to the index
685    ///
686    /// Also updates the anchor lookup map for O(1) anchor queries
687    pub fn add_heading(&mut self, heading: HeadingIndex) {
688        let index = self.headings.len();
689
690        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
691        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
692
693        // Add custom anchor if present
694        if let Some(ref custom) = heading.custom_anchor {
695            self.anchor_to_heading.insert(custom.to_lowercase(), index);
696        }
697
698        self.headings.push(heading);
699    }
700
701    /// Add an alternative anchor that resolves to an existing heading.
702    /// Used for platform-specific anchor conventions (e.g., Python-Markdown `_N` dedup).
703    pub fn add_anchor_alias(&mut self, anchor: String, heading_index: usize) {
704        if heading_index < self.headings.len() {
705            self.anchor_to_heading.insert(anchor.to_lowercase(), heading_index);
706        }
707    }
708
709    /// Check if an anchor exists in this file (O(1) lookup)
710    ///
711    /// Returns true if the anchor matches any of:
712    /// - Auto-generated heading anchors
713    /// - Custom heading anchors (from {#id} syntax on headings)
714    /// - HTML anchors (from `<a id="...">` or `<element id="...">`)
715    /// - Attribute anchors (from { #id } syntax on non-heading elements)
716    ///
717    /// Matching is case-insensitive. URL-encoded anchors (e.g., CJK characters
718    /// like `%E6%97%A5%E6%9C%AC%E8%AA%9E` for `日本語`) are decoded before matching.
719    pub fn has_anchor(&self, anchor: &str) -> bool {
720        let lower = anchor.to_lowercase();
721
722        // Fast path: try exact match first
723        if self.anchor_to_heading.contains_key(&lower)
724            || self.html_anchors.contains(&lower)
725            || self.attribute_anchors.contains(&lower)
726        {
727            return true;
728        }
729
730        // Slow path: if anchor contains percent-encoding, try decoded version
731        if anchor.contains('%') {
732            let decoded = url_decode(anchor).to_lowercase();
733            if decoded != lower {
734                return self.anchor_to_heading.contains_key(&decoded)
735                    || self.html_anchors.contains(&decoded)
736                    || self.attribute_anchors.contains(&decoded);
737            }
738        }
739
740        false
741    }
742
743    /// Add an HTML anchor (from `<a id="...">` or `<element id="...">` tags)
744    pub fn add_html_anchor(&mut self, anchor: String) {
745        if !anchor.is_empty() {
746            self.html_anchors.insert(anchor.to_lowercase());
747        }
748    }
749
750    /// Add an attribute anchor (from { #id } syntax on non-heading elements)
751    pub fn add_attribute_anchor(&mut self, anchor: String) {
752        if !anchor.is_empty() {
753            self.attribute_anchors.insert(anchor.to_lowercase());
754        }
755    }
756
757    /// Get the heading index for an anchor (O(1) lookup)
758    ///
759    /// Returns the index into `self.headings` if found.
760    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
761        self.anchor_to_heading
762            .get(&anchor.to_lowercase())
763            .and_then(|&idx| self.headings.get(idx))
764    }
765
766    /// Add a reference link to the index
767    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
768        self.reference_links.push(link);
769    }
770
771    /// Check if a rule is disabled at a specific line
772    ///
773    /// Used by cross-file rules to respect inline disable directives.
774    /// Checks both file-wide disables and line-specific disables.
775    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
776        // Check file-wide disables (highest priority)
777        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
778            return true;
779        }
780
781        // Check line-specific disables (disable-line / disable-next-line)
782        if let Some(rules) = self.line_disabled_rules.get(&line)
783            && (rules.contains("*") || rules.contains(rule_name))
784        {
785            return true;
786        }
787
788        // Check persistent disable/enable transitions via binary search
789        if !self.persistent_transitions.is_empty() {
790            let idx = match self.persistent_transitions.binary_search_by_key(&line, |t| t.0) {
791                Ok(i) => Some(i),
792                Err(i) => {
793                    if i > 0 {
794                        Some(i - 1)
795                    } else {
796                        None
797                    }
798                }
799            };
800            if let Some(i) = idx {
801                let (_, ref disabled, ref enabled) = self.persistent_transitions[i];
802                if disabled.contains("*") {
803                    return !enabled.contains(rule_name);
804                }
805                return disabled.contains(rule_name);
806            }
807        }
808
809        false
810    }
811
812    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line)
813    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
814        // Deduplicate: multiple rules may contribute the same link with different columns
815        // (e.g., MD051 uses link start, MD057 uses URL start)
816        let is_duplicate = self.cross_file_links.iter().any(|existing| {
817            existing.target_path == link.target_path && existing.fragment == link.fragment && existing.line == link.line
818        });
819        if !is_duplicate {
820            self.cross_file_links.push(link);
821        }
822    }
823
824    /// Add a defined reference ID (e.g., from `[ref]: url`)
825    pub fn add_defined_reference(&mut self, ref_id: String) {
826        self.defined_references.insert(ref_id);
827    }
828
829    /// Check if a reference ID has an explicit definition
830    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
831        self.defined_references.contains(ref_id)
832    }
833
834    /// Check if the content hash matches
835    pub fn hash_matches(&self, hash: &str) -> bool {
836        self.content_hash == hash
837    }
838
839    /// Get the number of headings
840    pub fn heading_count(&self) -> usize {
841        self.headings.len()
842    }
843
844    /// Get the number of reference links
845    pub fn reference_link_count(&self) -> usize {
846        self.reference_links.len()
847    }
848}
849
850#[cfg(test)]
851mod tests {
852    use super::*;
853
854    #[test]
855    fn test_workspace_index_basic() {
856        let mut index = WorkspaceIndex::new();
857        assert_eq!(index.file_count(), 0);
858        assert_eq!(index.version(), 0);
859
860        let mut file_index = FileIndex::with_hash("abc123".to_string());
861        file_index.add_heading(HeadingIndex {
862            text: "Installation".to_string(),
863            auto_anchor: "installation".to_string(),
864            custom_anchor: None,
865            line: 1,
866            is_setext: false,
867        });
868
869        index.insert_file(PathBuf::from("docs/install.md"), file_index);
870        assert_eq!(index.file_count(), 1);
871        assert_eq!(index.version(), 1);
872
873        assert!(index.contains_file(Path::new("docs/install.md")));
874        assert!(!index.contains_file(Path::new("docs/other.md")));
875    }
876
877    #[test]
878    fn test_vulnerable_anchors() {
879        let mut index = WorkspaceIndex::new();
880
881        // File 1: heading without custom anchor (vulnerable)
882        let mut file1 = FileIndex::new();
883        file1.add_heading(HeadingIndex {
884            text: "Getting Started".to_string(),
885            auto_anchor: "getting-started".to_string(),
886            custom_anchor: None,
887            line: 1,
888            is_setext: false,
889        });
890        index.insert_file(PathBuf::from("docs/guide.md"), file1);
891
892        // File 2: heading with custom anchor (not vulnerable)
893        let mut file2 = FileIndex::new();
894        file2.add_heading(HeadingIndex {
895            text: "Installation".to_string(),
896            auto_anchor: "installation".to_string(),
897            custom_anchor: Some("install".to_string()),
898            line: 1,
899            is_setext: false,
900        });
901        index.insert_file(PathBuf::from("docs/install.md"), file2);
902
903        let vulnerable = index.get_vulnerable_anchors();
904        assert_eq!(vulnerable.len(), 1);
905        assert!(vulnerable.contains_key("getting-started"));
906        assert!(!vulnerable.contains_key("installation"));
907
908        let anchors = vulnerable.get("getting-started").unwrap();
909        assert_eq!(anchors.len(), 1);
910        assert_eq!(anchors[0].file, PathBuf::from("docs/guide.md"));
911        assert_eq!(anchors[0].text, "Getting Started");
912    }
913
914    #[test]
915    fn test_vulnerable_anchors_multiple_files_same_anchor() {
916        // Multiple files can have headings with the same auto-generated anchor
917        // get_vulnerable_anchors() should collect all of them
918        let mut index = WorkspaceIndex::new();
919
920        // File 1: has "Installation" heading (vulnerable)
921        let mut file1 = FileIndex::new();
922        file1.add_heading(HeadingIndex {
923            text: "Installation".to_string(),
924            auto_anchor: "installation".to_string(),
925            custom_anchor: None,
926            line: 1,
927            is_setext: false,
928        });
929        index.insert_file(PathBuf::from("docs/en/guide.md"), file1);
930
931        // File 2: also has "Installation" heading with same anchor (vulnerable)
932        let mut file2 = FileIndex::new();
933        file2.add_heading(HeadingIndex {
934            text: "Installation".to_string(),
935            auto_anchor: "installation".to_string(),
936            custom_anchor: None,
937            line: 5,
938            is_setext: false,
939        });
940        index.insert_file(PathBuf::from("docs/fr/guide.md"), file2);
941
942        // File 3: has "Installation" but WITH custom anchor (not vulnerable)
943        let mut file3 = FileIndex::new();
944        file3.add_heading(HeadingIndex {
945            text: "Installation".to_string(),
946            auto_anchor: "installation".to_string(),
947            custom_anchor: Some("install".to_string()),
948            line: 10,
949            is_setext: false,
950        });
951        index.insert_file(PathBuf::from("docs/de/guide.md"), file3);
952
953        let vulnerable = index.get_vulnerable_anchors();
954        assert_eq!(vulnerable.len(), 1); // One unique anchor
955        assert!(vulnerable.contains_key("installation"));
956
957        let anchors = vulnerable.get("installation").unwrap();
958        // Should have 2 entries (en and fr), NOT 3 (de has custom anchor)
959        assert_eq!(anchors.len(), 2, "Should collect both vulnerable anchors");
960
961        // Verify both files are represented
962        let files: std::collections::HashSet<_> = anchors.iter().map(|a| &a.file).collect();
963        assert!(files.contains(&PathBuf::from("docs/en/guide.md")));
964        assert!(files.contains(&PathBuf::from("docs/fr/guide.md")));
965    }
966
967    #[test]
968    fn test_file_index_hash() {
969        let index = FileIndex::with_hash("hash123".to_string());
970        assert!(index.hash_matches("hash123"));
971        assert!(!index.hash_matches("other"));
972    }
973
974    #[test]
975    fn test_version_increment() {
976        let mut index = WorkspaceIndex::new();
977        assert_eq!(index.version(), 0);
978
979        index.insert_file(PathBuf::from("a.md"), FileIndex::new());
980        assert_eq!(index.version(), 1);
981
982        index.insert_file(PathBuf::from("b.md"), FileIndex::new());
983        assert_eq!(index.version(), 2);
984
985        index.remove_file(Path::new("a.md"));
986        assert_eq!(index.version(), 3);
987
988        // Removing non-existent file doesn't increment
989        index.remove_file(Path::new("nonexistent.md"));
990        assert_eq!(index.version(), 3);
991    }
992
993    #[test]
994    fn test_reverse_deps_basic() {
995        let mut index = WorkspaceIndex::new();
996
997        // File A links to file B
998        let mut file_a = FileIndex::new();
999        file_a.add_cross_file_link(CrossFileLinkIndex {
1000            target_path: "b.md".to_string(),
1001            fragment: "section".to_string(),
1002            line: 10,
1003            column: 5,
1004        });
1005        index.update_file(Path::new("docs/a.md"), file_a);
1006
1007        // Check that B has A as a dependent
1008        let dependents = index.get_dependents(Path::new("docs/b.md"));
1009        assert_eq!(dependents.len(), 1);
1010        assert_eq!(dependents[0], PathBuf::from("docs/a.md"));
1011
1012        // A has no dependents
1013        let a_dependents = index.get_dependents(Path::new("docs/a.md"));
1014        assert!(a_dependents.is_empty());
1015    }
1016
1017    #[test]
1018    fn test_reverse_deps_multiple() {
1019        let mut index = WorkspaceIndex::new();
1020
1021        // Files A and C both link to B
1022        let mut file_a = FileIndex::new();
1023        file_a.add_cross_file_link(CrossFileLinkIndex {
1024            target_path: "../b.md".to_string(),
1025            fragment: "".to_string(),
1026            line: 1,
1027            column: 1,
1028        });
1029        index.update_file(Path::new("docs/sub/a.md"), file_a);
1030
1031        let mut file_c = FileIndex::new();
1032        file_c.add_cross_file_link(CrossFileLinkIndex {
1033            target_path: "b.md".to_string(),
1034            fragment: "".to_string(),
1035            line: 1,
1036            column: 1,
1037        });
1038        index.update_file(Path::new("docs/c.md"), file_c);
1039
1040        // B should have both A and C as dependents
1041        let dependents = index.get_dependents(Path::new("docs/b.md"));
1042        assert_eq!(dependents.len(), 2);
1043        assert!(dependents.contains(&PathBuf::from("docs/sub/a.md")));
1044        assert!(dependents.contains(&PathBuf::from("docs/c.md")));
1045    }
1046
1047    #[test]
1048    fn test_reverse_deps_update_clears_old() {
1049        let mut index = WorkspaceIndex::new();
1050
1051        // File A initially links to B
1052        let mut file_a = FileIndex::new();
1053        file_a.add_cross_file_link(CrossFileLinkIndex {
1054            target_path: "b.md".to_string(),
1055            fragment: "".to_string(),
1056            line: 1,
1057            column: 1,
1058        });
1059        index.update_file(Path::new("docs/a.md"), file_a);
1060
1061        // Verify B has A as dependent
1062        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1063
1064        // Update A to link to C instead of B
1065        let mut file_a_updated = FileIndex::new();
1066        file_a_updated.add_cross_file_link(CrossFileLinkIndex {
1067            target_path: "c.md".to_string(),
1068            fragment: "".to_string(),
1069            line: 1,
1070            column: 1,
1071        });
1072        index.update_file(Path::new("docs/a.md"), file_a_updated);
1073
1074        // B should no longer have A as dependent
1075        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1076
1077        // C should now have A as dependent
1078        let c_deps = index.get_dependents(Path::new("docs/c.md"));
1079        assert_eq!(c_deps.len(), 1);
1080        assert_eq!(c_deps[0], PathBuf::from("docs/a.md"));
1081    }
1082
1083    #[test]
1084    fn test_reverse_deps_remove_file() {
1085        let mut index = WorkspaceIndex::new();
1086
1087        // File A links to B
1088        let mut file_a = FileIndex::new();
1089        file_a.add_cross_file_link(CrossFileLinkIndex {
1090            target_path: "b.md".to_string(),
1091            fragment: "".to_string(),
1092            line: 1,
1093            column: 1,
1094        });
1095        index.update_file(Path::new("docs/a.md"), file_a);
1096
1097        // Verify B has A as dependent
1098        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1099
1100        // Remove file A
1101        index.remove_file(Path::new("docs/a.md"));
1102
1103        // B should no longer have any dependents
1104        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1105    }
1106
1107    #[test]
1108    fn test_normalize_path() {
1109        // Test .. handling
1110        let path = Path::new("docs/sub/../other.md");
1111        let normalized = WorkspaceIndex::normalize_path(path);
1112        assert_eq!(normalized, PathBuf::from("docs/other.md"));
1113
1114        // Test . handling
1115        let path2 = Path::new("docs/./other.md");
1116        let normalized2 = WorkspaceIndex::normalize_path(path2);
1117        assert_eq!(normalized2, PathBuf::from("docs/other.md"));
1118
1119        // Test multiple ..
1120        let path3 = Path::new("a/b/c/../../d.md");
1121        let normalized3 = WorkspaceIndex::normalize_path(path3);
1122        assert_eq!(normalized3, PathBuf::from("a/d.md"));
1123    }
1124
1125    #[test]
1126    fn test_clear_clears_reverse_deps() {
1127        let mut index = WorkspaceIndex::new();
1128
1129        // File A links to B
1130        let mut file_a = FileIndex::new();
1131        file_a.add_cross_file_link(CrossFileLinkIndex {
1132            target_path: "b.md".to_string(),
1133            fragment: "".to_string(),
1134            line: 1,
1135            column: 1,
1136        });
1137        index.update_file(Path::new("docs/a.md"), file_a);
1138
1139        // Verify B has A as dependent
1140        assert_eq!(index.get_dependents(Path::new("docs/b.md")).len(), 1);
1141
1142        // Clear the index
1143        index.clear();
1144
1145        // Both files and reverse deps should be cleared
1146        assert_eq!(index.file_count(), 0);
1147        assert!(index.get_dependents(Path::new("docs/b.md")).is_empty());
1148    }
1149
1150    #[test]
1151    fn test_is_file_stale() {
1152        let mut index = WorkspaceIndex::new();
1153
1154        // Non-existent file is always stale
1155        assert!(index.is_file_stale(Path::new("nonexistent.md"), "hash123"));
1156
1157        // Add a file with known hash
1158        let file_index = FileIndex::with_hash("hash123".to_string());
1159        index.insert_file(PathBuf::from("docs/test.md"), file_index);
1160
1161        // Same hash means not stale
1162        assert!(!index.is_file_stale(Path::new("docs/test.md"), "hash123"));
1163
1164        // Different hash means stale
1165        assert!(index.is_file_stale(Path::new("docs/test.md"), "different_hash"));
1166    }
1167
1168    #[cfg(feature = "native")]
1169    #[test]
1170    fn test_cache_roundtrip() {
1171        use std::fs;
1172
1173        // Create a temp directory
1174        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_roundtrip");
1175        let _ = fs::remove_dir_all(&temp_dir);
1176        fs::create_dir_all(&temp_dir).unwrap();
1177
1178        // Create an index with some data
1179        let mut index = WorkspaceIndex::new();
1180
1181        let mut file1 = FileIndex::with_hash("abc123".to_string());
1182        file1.add_heading(HeadingIndex {
1183            text: "Test Heading".to_string(),
1184            auto_anchor: "test-heading".to_string(),
1185            custom_anchor: Some("test".to_string()),
1186            line: 1,
1187            is_setext: false,
1188        });
1189        file1.add_cross_file_link(CrossFileLinkIndex {
1190            target_path: "./other.md".to_string(),
1191            fragment: "section".to_string(),
1192            line: 5,
1193            column: 3,
1194        });
1195        index.update_file(Path::new("docs/file1.md"), file1);
1196
1197        let mut file2 = FileIndex::with_hash("def456".to_string());
1198        file2.add_heading(HeadingIndex {
1199            text: "Another Heading".to_string(),
1200            auto_anchor: "another-heading".to_string(),
1201            custom_anchor: None,
1202            line: 1,
1203            is_setext: false,
1204        });
1205        index.update_file(Path::new("docs/other.md"), file2);
1206
1207        // Save to cache
1208        index.save_to_cache(&temp_dir).expect("Failed to save cache");
1209
1210        // Verify cache file exists
1211        assert!(temp_dir.join("workspace_index.bin").exists());
1212
1213        // Load from cache
1214        let loaded = WorkspaceIndex::load_from_cache(&temp_dir).expect("Failed to load cache");
1215
1216        // Verify data matches
1217        assert_eq!(loaded.file_count(), 2);
1218        assert!(loaded.contains_file(Path::new("docs/file1.md")));
1219        assert!(loaded.contains_file(Path::new("docs/other.md")));
1220
1221        // Check file1 details
1222        let file1_loaded = loaded.get_file(Path::new("docs/file1.md")).unwrap();
1223        assert_eq!(file1_loaded.content_hash, "abc123");
1224        assert_eq!(file1_loaded.headings.len(), 1);
1225        assert_eq!(file1_loaded.headings[0].text, "Test Heading");
1226        assert_eq!(file1_loaded.headings[0].custom_anchor, Some("test".to_string()));
1227        assert_eq!(file1_loaded.cross_file_links.len(), 1);
1228        assert_eq!(file1_loaded.cross_file_links[0].target_path, "./other.md");
1229
1230        // Check reverse deps were serialized correctly
1231        let dependents = loaded.get_dependents(Path::new("docs/other.md"));
1232        assert_eq!(dependents.len(), 1);
1233        assert_eq!(dependents[0], PathBuf::from("docs/file1.md"));
1234
1235        // Clean up
1236        let _ = fs::remove_dir_all(&temp_dir);
1237    }
1238
1239    #[cfg(feature = "native")]
1240    #[test]
1241    fn test_cache_missing_file() {
1242        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_missing");
1243        let _ = std::fs::remove_dir_all(&temp_dir);
1244
1245        // Should return None for non-existent cache
1246        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1247        assert!(result.is_none());
1248    }
1249
1250    #[cfg(feature = "native")]
1251    #[test]
1252    fn test_cache_corrupted_file() {
1253        use std::fs;
1254
1255        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_corrupted");
1256        let _ = fs::remove_dir_all(&temp_dir);
1257        fs::create_dir_all(&temp_dir).unwrap();
1258
1259        // Write corrupted data (too small for header)
1260        fs::write(temp_dir.join("workspace_index.bin"), b"bad").unwrap();
1261
1262        // Should return None for corrupted cache (and remove the file)
1263        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1264        assert!(result.is_none());
1265
1266        // Corrupted file should be removed
1267        assert!(!temp_dir.join("workspace_index.bin").exists());
1268
1269        // Clean up
1270        let _ = fs::remove_dir_all(&temp_dir);
1271    }
1272
1273    #[cfg(feature = "native")]
1274    #[test]
1275    fn test_cache_invalid_magic() {
1276        use std::fs;
1277
1278        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_invalid_magic");
1279        let _ = fs::remove_dir_all(&temp_dir);
1280        fs::create_dir_all(&temp_dir).unwrap();
1281
1282        // Write data with wrong magic header
1283        let mut data = Vec::new();
1284        data.extend_from_slice(b"XXXX"); // Wrong magic
1285        data.extend_from_slice(&1u32.to_le_bytes()); // Version 1
1286        data.extend_from_slice(&[0; 100]); // Some garbage data
1287        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1288
1289        // Should return None for invalid magic
1290        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1291        assert!(result.is_none());
1292
1293        // File should be removed
1294        assert!(!temp_dir.join("workspace_index.bin").exists());
1295
1296        // Clean up
1297        let _ = fs::remove_dir_all(&temp_dir);
1298    }
1299
1300    #[cfg(feature = "native")]
1301    #[test]
1302    fn test_cache_version_mismatch() {
1303        use std::fs;
1304
1305        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_version_mismatch");
1306        let _ = fs::remove_dir_all(&temp_dir);
1307        fs::create_dir_all(&temp_dir).unwrap();
1308
1309        // Write data with correct magic but wrong version
1310        let mut data = Vec::new();
1311        data.extend_from_slice(b"RWSI"); // Correct magic
1312        data.extend_from_slice(&999u32.to_le_bytes()); // Future version
1313        data.extend_from_slice(&[0; 100]); // Some garbage data
1314        fs::write(temp_dir.join("workspace_index.bin"), &data).unwrap();
1315
1316        // Should return None for version mismatch
1317        let result = WorkspaceIndex::load_from_cache(&temp_dir);
1318        assert!(result.is_none());
1319
1320        // File should be removed to trigger rebuild
1321        assert!(!temp_dir.join("workspace_index.bin").exists());
1322
1323        // Clean up
1324        let _ = fs::remove_dir_all(&temp_dir);
1325    }
1326
1327    #[cfg(feature = "native")]
1328    #[test]
1329    fn test_cache_atomic_write() {
1330        use std::fs;
1331
1332        // Test that atomic writes work (no temp files left behind)
1333        let temp_dir = std::env::temp_dir().join("rumdl_test_cache_atomic");
1334        let _ = fs::remove_dir_all(&temp_dir);
1335        fs::create_dir_all(&temp_dir).unwrap();
1336
1337        let index = WorkspaceIndex::new();
1338        index.save_to_cache(&temp_dir).expect("Failed to save");
1339
1340        // Only the final cache file should exist, no temp files
1341        let entries: Vec<_> = fs::read_dir(&temp_dir).unwrap().collect();
1342        assert_eq!(entries.len(), 1);
1343        assert!(temp_dir.join("workspace_index.bin").exists());
1344
1345        // Clean up
1346        let _ = fs::remove_dir_all(&temp_dir);
1347    }
1348
1349    #[test]
1350    fn test_has_anchor_auto_generated() {
1351        let mut file_index = FileIndex::new();
1352        file_index.add_heading(HeadingIndex {
1353            text: "Installation Guide".to_string(),
1354            auto_anchor: "installation-guide".to_string(),
1355            custom_anchor: None,
1356            line: 1,
1357            is_setext: false,
1358        });
1359
1360        // Should find by auto-generated anchor
1361        assert!(file_index.has_anchor("installation-guide"));
1362
1363        // Case-insensitive matching
1364        assert!(file_index.has_anchor("Installation-Guide"));
1365        assert!(file_index.has_anchor("INSTALLATION-GUIDE"));
1366
1367        // Should not find non-existent anchor
1368        assert!(!file_index.has_anchor("nonexistent"));
1369    }
1370
1371    #[test]
1372    fn test_has_anchor_custom() {
1373        let mut file_index = FileIndex::new();
1374        file_index.add_heading(HeadingIndex {
1375            text: "Installation Guide".to_string(),
1376            auto_anchor: "installation-guide".to_string(),
1377            custom_anchor: Some("install".to_string()),
1378            line: 1,
1379            is_setext: false,
1380        });
1381
1382        // Should find by auto-generated anchor
1383        assert!(file_index.has_anchor("installation-guide"));
1384
1385        // Should also find by custom anchor
1386        assert!(file_index.has_anchor("install"));
1387        assert!(file_index.has_anchor("Install")); // case-insensitive
1388
1389        // Should not find non-existent anchor
1390        assert!(!file_index.has_anchor("nonexistent"));
1391    }
1392
1393    #[test]
1394    fn test_get_heading_by_anchor() {
1395        let mut file_index = FileIndex::new();
1396        file_index.add_heading(HeadingIndex {
1397            text: "Installation Guide".to_string(),
1398            auto_anchor: "installation-guide".to_string(),
1399            custom_anchor: Some("install".to_string()),
1400            line: 10,
1401            is_setext: false,
1402        });
1403        file_index.add_heading(HeadingIndex {
1404            text: "Configuration".to_string(),
1405            auto_anchor: "configuration".to_string(),
1406            custom_anchor: None,
1407            line: 20,
1408            is_setext: false,
1409        });
1410
1411        // Get by auto anchor
1412        let heading = file_index.get_heading_by_anchor("installation-guide");
1413        assert!(heading.is_some());
1414        assert_eq!(heading.unwrap().text, "Installation Guide");
1415        assert_eq!(heading.unwrap().line, 10);
1416
1417        // Get by custom anchor
1418        let heading = file_index.get_heading_by_anchor("install");
1419        assert!(heading.is_some());
1420        assert_eq!(heading.unwrap().text, "Installation Guide");
1421
1422        // Get second heading
1423        let heading = file_index.get_heading_by_anchor("configuration");
1424        assert!(heading.is_some());
1425        assert_eq!(heading.unwrap().text, "Configuration");
1426        assert_eq!(heading.unwrap().line, 20);
1427
1428        // Non-existent
1429        assert!(file_index.get_heading_by_anchor("nonexistent").is_none());
1430    }
1431
1432    #[test]
1433    fn test_anchor_lookup_many_headings() {
1434        // Test that O(1) lookup works with many headings
1435        let mut file_index = FileIndex::new();
1436
1437        // Add 100 headings
1438        for i in 0..100 {
1439            file_index.add_heading(HeadingIndex {
1440                text: format!("Heading {i}"),
1441                auto_anchor: format!("heading-{i}"),
1442                custom_anchor: Some(format!("h{i}")),
1443                line: i + 1,
1444                is_setext: false,
1445            });
1446        }
1447
1448        // Verify all can be found
1449        for i in 0..100 {
1450            assert!(file_index.has_anchor(&format!("heading-{i}")));
1451            assert!(file_index.has_anchor(&format!("h{i}")));
1452
1453            let heading = file_index.get_heading_by_anchor(&format!("heading-{i}"));
1454            assert!(heading.is_some());
1455            assert_eq!(heading.unwrap().line, i + 1);
1456        }
1457    }
1458
1459    // =============================================================================
1460    // Tests for extract_cross_file_links utility
1461    // =============================================================================
1462
1463    #[test]
1464    fn test_extract_cross_file_links_basic() {
1465        use crate::config::MarkdownFlavor;
1466
1467        let content = "# Test\n\nSee [link](./other.md) for info.\n";
1468        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1469        let links = extract_cross_file_links(&ctx);
1470
1471        assert_eq!(links.len(), 1);
1472        assert_eq!(links[0].target_path, "./other.md");
1473        assert_eq!(links[0].fragment, "");
1474        assert_eq!(links[0].line, 3);
1475        // "See [link](" = 11 chars, so column 12 is where "./other.md" starts
1476        assert_eq!(links[0].column, 12);
1477    }
1478
1479    #[test]
1480    fn test_extract_cross_file_links_with_fragment() {
1481        use crate::config::MarkdownFlavor;
1482
1483        let content = "Check [guide](./guide.md#install) here.\n";
1484        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1485        let links = extract_cross_file_links(&ctx);
1486
1487        assert_eq!(links.len(), 1);
1488        assert_eq!(links[0].target_path, "./guide.md");
1489        assert_eq!(links[0].fragment, "install");
1490        assert_eq!(links[0].line, 1);
1491        // "Check [guide](" = 14 chars, so column 15 is where "./guide.md" starts
1492        assert_eq!(links[0].column, 15);
1493    }
1494
1495    #[test]
1496    fn test_extract_cross_file_links_multiple_on_same_line() {
1497        use crate::config::MarkdownFlavor;
1498
1499        let content = "See [a](a.md) and [b](b.md) here.\n";
1500        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1501        let links = extract_cross_file_links(&ctx);
1502
1503        assert_eq!(links.len(), 2);
1504
1505        assert_eq!(links[0].target_path, "a.md");
1506        assert_eq!(links[0].line, 1);
1507        // "See [a](" = 8 chars, so column 9
1508        assert_eq!(links[0].column, 9);
1509
1510        assert_eq!(links[1].target_path, "b.md");
1511        assert_eq!(links[1].line, 1);
1512        // "See [a](a.md) and [b](" = 22 chars, so column 23
1513        assert_eq!(links[1].column, 23);
1514    }
1515
1516    #[test]
1517    fn test_extract_cross_file_links_angle_brackets() {
1518        use crate::config::MarkdownFlavor;
1519
1520        let content = "See [link](<path/with (parens).md>) here.\n";
1521        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1522        let links = extract_cross_file_links(&ctx);
1523
1524        assert_eq!(links.len(), 1);
1525        assert_eq!(links[0].target_path, "path/with (parens).md");
1526        assert_eq!(links[0].line, 1);
1527        // "See [link](<" = 12 chars, so column 13
1528        assert_eq!(links[0].column, 13);
1529    }
1530
1531    #[test]
1532    fn test_extract_cross_file_links_skips_external() {
1533        use crate::config::MarkdownFlavor;
1534
1535        let content = r#"
1536[external](https://example.com)
1537[mailto](mailto:test@example.com)
1538[local](./local.md)
1539[fragment](#section)
1540[absolute](/docs/page.md)
1541"#;
1542        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1543        let links = extract_cross_file_links(&ctx);
1544
1545        // Only the local markdown link should be extracted
1546        assert_eq!(links.len(), 1);
1547        assert_eq!(links[0].target_path, "./local.md");
1548    }
1549
1550    #[test]
1551    fn test_extract_cross_file_links_skips_non_markdown() {
1552        use crate::config::MarkdownFlavor;
1553
1554        let content = r#"
1555[image](./photo.png)
1556[doc](./readme.md)
1557[pdf](./document.pdf)
1558"#;
1559        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1560        let links = extract_cross_file_links(&ctx);
1561
1562        // Only markdown files are indexed for cross-file validation
1563        assert_eq!(links.len(), 1);
1564        assert_eq!(links[0].target_path, "./readme.md");
1565    }
1566
1567    #[test]
1568    fn test_extract_cross_file_links_skips_code_spans() {
1569        use crate::config::MarkdownFlavor;
1570
1571        let content = "Normal [link](./file.md) and `[code](./ignored.md)` here.\n";
1572        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1573        let links = extract_cross_file_links(&ctx);
1574
1575        // Only the link outside code span should be extracted
1576        assert_eq!(links.len(), 1);
1577        assert_eq!(links[0].target_path, "./file.md");
1578    }
1579
1580    #[test]
1581    fn test_extract_cross_file_links_with_query_params() {
1582        use crate::config::MarkdownFlavor;
1583
1584        let content = "See [doc](./file.md?raw=true) here.\n";
1585        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1586        let links = extract_cross_file_links(&ctx);
1587
1588        assert_eq!(links.len(), 1);
1589        // Query params should be stripped
1590        assert_eq!(links[0].target_path, "./file.md");
1591    }
1592
1593    #[test]
1594    fn test_extract_cross_file_links_empty_content() {
1595        use crate::config::MarkdownFlavor;
1596
1597        let content = "";
1598        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1599        let links = extract_cross_file_links(&ctx);
1600
1601        assert!(links.is_empty());
1602    }
1603
1604    #[test]
1605    fn test_extract_cross_file_links_no_links() {
1606        use crate::config::MarkdownFlavor;
1607
1608        let content = "# Just a heading\n\nSome text without links.\n";
1609        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1610        let links = extract_cross_file_links(&ctx);
1611
1612        assert!(links.is_empty());
1613    }
1614
1615    #[test]
1616    fn test_extract_cross_file_links_position_accuracy_issue_234() {
1617        // This test verifies the fix for GitHub issue #234
1618        // The LSP was reporting incorrect column positions for MD057 diagnostics
1619        use crate::config::MarkdownFlavor;
1620
1621        let content = r#"# Test Document
1622
1623Here is a [broken link](nonexistent-file.md) that should trigger MD057.
1624
1625And another [link](also-missing.md) on this line.
1626"#;
1627        let ctx = LintContext::new(content, MarkdownFlavor::default(), None);
1628        let links = extract_cross_file_links(&ctx);
1629
1630        assert_eq!(links.len(), 2);
1631
1632        // First link: "Here is a [broken link](" = 24 chars, column 25
1633        assert_eq!(links[0].target_path, "nonexistent-file.md");
1634        assert_eq!(links[0].line, 3);
1635        assert_eq!(links[0].column, 25);
1636
1637        // Second link: "And another [link](" = 19 chars, column 20
1638        assert_eq!(links[1].target_path, "also-missing.md");
1639        assert_eq!(links[1].line, 5);
1640        assert_eq!(links[1].column, 20);
1641    }
1642}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs