rumdl_lib/
workspace_index.rs

1//! Workspace-wide index for cross-file analysis
2//!
3//! This module provides infrastructure for rules that need to validate
4//! references across multiple files, such as MD051 which validates that
5//! cross-file link fragments point to valid headings.
6//!
7//! The index is built in parallel and designed for minimal memory overhead.
8//!
9//! ## Cache Format
10//!
11//! The workspace index can be persisted to disk for faster startup on
12//! repeated runs. The cache format includes a version header to detect
13//! incompatible format changes:
14//!
15//! ```text
16//! [4 bytes: magic "RWSI" - Rumdl Workspace Index]
17//! [4 bytes: format version (u32 little-endian)]
18//! [N bytes: postcard-serialized WorkspaceIndex]
19//! ```
20
21use regex::Regex;
22use serde::{Deserialize, Serialize};
23use std::collections::{HashMap, HashSet};
24use std::path::{Path, PathBuf};
25use std::sync::LazyLock;
26
27use crate::lint_context::LintContext;
28use crate::utils::element_cache::ElementCache;
29
30// =============================================================================
31// URL Decoding Helper
32// =============================================================================
33
/// Map an ASCII hexadecimal digit (`0-9`, `a-f`, `A-F`) to its value in `0..=15`.
///
/// Any other byte yields `None`.
fn hex_digit_to_value(c: u8) -> Option<u8> {
    // `to_digit(16)` accepts exactly the ASCII hex digits; the result is < 16,
    // so narrowing back to `u8` cannot truncate.
    char::from(c).to_digit(16).map(|digit| digit as u8)
}
43
44/// URL-decode a string, handling percent-encoded characters.
45/// Returns the decoded string, or the original if decoding fails.
46/// Used for matching URL-encoded CJK fragments against raw anchors.
47fn url_decode(s: &str) -> String {
48    // Fast path: no percent signs means no encoding
49    if !s.contains('%') {
50        return s.to_string();
51    }
52
53    let bytes = s.as_bytes();
54    let mut result = Vec::with_capacity(bytes.len());
55    let mut i = 0;
56
57    while i < bytes.len() {
58        if bytes[i] == b'%' && i + 2 < bytes.len() {
59            // Try to parse the two hex digits following %
60            let hex1 = bytes[i + 1];
61            let hex2 = bytes[i + 2];
62            if let (Some(d1), Some(d2)) = (hex_digit_to_value(hex1), hex_digit_to_value(hex2)) {
63                result.push(d1 * 16 + d2);
64                i += 3;
65                continue;
66            }
67        }
68        result.push(bytes[i]);
69        i += 1;
70    }
71
72    // Convert to UTF-8, falling back to original if invalid
73    String::from_utf8(result).unwrap_or_else(|_| s.to_string())
74}
75
76// =============================================================================
77// Shared cross-file link extraction utilities
78//
79// These regexes and helpers are the canonical implementation for extracting
80// cross-file links. Both MD057 and LSP use this shared code path for correct
81// position tracking.
82// =============================================================================
83
/// Regex to match the start of a link
///
/// Matches an optional leading `!`, then `[`, any run of characters other
/// than `]`, and the closing `]`. The match therefore ends right after the
/// first `]` that follows the opening bracket.
static LINK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!?\[[^\]]*\]").unwrap());

/// Regex to extract the URL from an angle-bracketed markdown link
/// Format: `](<URL>)` or `](<URL> "title")`
///
/// Capture group 1 is the text between `<` and `>`. Optional capture group 2
/// is a `#fragment` (including the `#`) that directly follows the closing `>`.
static URL_EXTRACT_ANGLE_BRACKET_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"\]\(\s*<([^>]+)>(#[^\)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to extract the URL from a normal markdown link (without angle brackets)
/// Format: `](URL)` or `](URL "title")`
///
/// Capture group 1 is the URL up to the first `#`, `)`, `>` or whitespace.
/// Optional capture group 2 is the `#fragment` (including the `#`).
static URL_EXTRACT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r#"]\(\s*([^>)\s#]+)(#[^)\s]*)?\s*(?:"[^"]*")?\s*\)"#).unwrap());

/// Regex to detect URLs with explicit schemes
///
/// Anchored at the start of the input. Matches a `scheme://` prefix, a bare
/// `scheme:` prefix (for example `mailto:`), or a leading `www.`.
pub(crate) static PROTOCOL_DOMAIN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^([a-zA-Z][a-zA-Z0-9+.-]*://|[a-zA-Z][a-zA-Z0-9+.-]*:|www\.)").unwrap());
100
/// File-name suffixes (lowercase, including the dot) treated as markdown.
const MARKDOWN_EXTENSIONS: &[&str] = &[
    ".md",
    ".markdown",
    ".mdx",
    ".mkd",
    ".mkdn",
    ".mdown",
    ".mdwn",
    ".qmd",
    ".rmd",
];

/// Report whether `path` ends with one of [`MARKDOWN_EXTENSIONS`].
///
/// The comparison is case-insensitive: the path is lowercased before the
/// suffix check.
#[inline]
fn is_markdown_file(path: &str) -> bool {
    let lowered = path.to_lowercase();
    for ext in MARKDOWN_EXTENSIONS {
        if lowered.ends_with(*ext) {
            return true;
        }
    }
    false
}
120
/// Return the part of `url` that precedes the first `?` or `#`.
///
/// When neither character occurs, the whole input is returned.
fn strip_query_and_fragment(url: &str) -> &str {
    // Whichever of `?` / `#` comes first marks the end of the path portion.
    match url.find(|c: char| c == '?' || c == '#') {
        Some(cut) => &url[..cut],
        None => url,
    }
}
134
135/// Extract cross-file links from content using correct regex-based position tracking.
136///
137/// This is the canonical implementation used by both MD057 and LSP to ensure
138/// consistent and correct column positions for diagnostic reporting.
139///
140/// Returns a vector of `CrossFileLinkIndex` entries, one for each markdown file
141/// link found in the content.
142pub fn extract_cross_file_links(ctx: &LintContext) -> Vec<CrossFileLinkIndex> {
143    let content = ctx.content;
144
145    // Early returns for performance
146    if content.is_empty() || !content.contains("](") {
147        return Vec::new();
148    }
149
150    let mut links = Vec::new();
151    let lines: Vec<&str> = content.lines().collect();
152    let element_cache = ElementCache::new(content);
153    let line_index = &ctx.line_index;
154
155    // Track which lines we've already processed to avoid duplicates
156    // (ctx.links may have multiple entries for the same line)
157    let mut processed_lines = HashSet::new();
158
159    for link in &ctx.links {
160        let line_idx = link.line - 1;
161        if line_idx >= lines.len() {
162            continue;
163        }
164
165        // Skip if we've already processed this line
166        if !processed_lines.insert(line_idx) {
167            continue;
168        }
169
170        let line = lines[line_idx];
171        if !line.contains("](") {
172            continue;
173        }
174
175        // Find all links in this line
176        for link_match in LINK_START_REGEX.find_iter(line) {
177            let start_pos = link_match.start();
178            let end_pos = link_match.end();
179
180            // Calculate absolute position for code span detection
181            let line_start_byte = line_index.get_line_start_byte(line_idx + 1).unwrap_or(0);
182            let absolute_start_pos = line_start_byte + start_pos;
183
184            // Skip if in code span
185            if element_cache.is_in_code_span(absolute_start_pos) {
186                continue;
187            }
188
189            // Extract the URL (group 1) and fragment (group 2)
190            // Try angle-bracket regex first (handles URLs with parens)
191            let caps_result = URL_EXTRACT_ANGLE_BRACKET_REGEX
192                .captures_at(line, end_pos - 1)
193                .or_else(|| URL_EXTRACT_REGEX.captures_at(line, end_pos - 1));
194
195            if let Some(caps) = caps_result
196                && let Some(url_group) = caps.get(1)
197            {
198                let file_path = url_group.as_str().trim();
199
200                // Skip empty, external, template variables, absolute URL paths,
201                // framework aliases, fragment-only URLs, or rustdoc intra-doc links
202                if file_path.is_empty()
203                    || PROTOCOL_DOMAIN_REGEX.is_match(file_path)
204                    || file_path.starts_with("www.")
205                    || file_path.starts_with('#')
206                    || file_path.starts_with("{{")
207                    || file_path.starts_with("{%")
208                    || file_path.starts_with('/')
209                    || file_path.starts_with('~')
210                    || file_path.starts_with('@')
211                    || (file_path.starts_with('`') && file_path.ends_with('`'))
212                {
213                    continue;
214                }
215
216                // Strip query parameters before indexing
217                let file_path = strip_query_and_fragment(file_path);
218
219                // Get fragment from capture group 2 (includes # prefix)
220                let fragment = caps.get(2).map(|m| m.as_str().trim_start_matches('#')).unwrap_or("");
221
222                // Only index markdown file links for cross-file validation
223                if is_markdown_file(file_path) {
224                    links.push(CrossFileLinkIndex {
225                        target_path: file_path.to_string(),
226                        fragment: fragment.to_string(),
227                        line: link.line,
228                        column: url_group.start() + 1,
229                    });
230                }
231            }
232        }
233    }
234
235    links
236}
237
/// Magic bytes identifying a workspace index cache file
///
/// Written as the first four bytes by `save_to_cache` and checked by
/// `load_from_cache`.
#[cfg(feature = "native")]
const CACHE_MAGIC: &[u8; 4] = b"RWSI";

/// Cache format version - increment when WorkspaceIndex serialization changes
///
/// Stored little-endian right after the magic bytes. `load_from_cache`
/// discards any cache file whose stored version differs from this value.
#[cfg(feature = "native")]
const CACHE_FORMAT_VERSION: u32 = 5;

/// Cache file name within the version directory
///
/// Joined onto the `cache_dir` argument of `save_to_cache` / `load_from_cache`.
#[cfg(feature = "native")]
const CACHE_FILE_NAME: &str = "workspace_index.bin";
249
/// Workspace-wide index for cross-file analysis
///
/// Contains pre-extracted information from all markdown files in the workspace,
/// enabling rules to validate cross-file references efficiently.
///
/// This struct is what `save_to_cache` serializes with postcard;
/// `CACHE_FORMAT_VERSION` is documented as needing an increment whenever that
/// serialization changes.
#[derive(Debug, Default, Clone, Serialize, Deserialize)]
pub struct WorkspaceIndex {
    /// Map from file path to its extracted data
    files: HashMap<PathBuf, FileIndex>,
    /// Reverse dependency graph: target file → files that link to it
    /// Used to efficiently re-lint dependent files when a target changes.
    /// Populated by `update_file` from each file's `cross_file_links`;
    /// `insert_file` does not touch it.
    reverse_deps: HashMap<PathBuf, HashSet<PathBuf>>,
    /// Version counter for cache invalidation (incremented on any change)
    /// Bumped with `wrapping_add(1)`, so it wraps around instead of overflowing.
    version: u64,
}
264
/// Index data extracted from a single file
///
/// The three anchor collections are private and are filled through
/// `add_heading`, `add_anchor_alias`, `add_html_anchor` and
/// `add_attribute_anchor`; `has_anchor` consults all of them.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct FileIndex {
    /// Headings in this file with their anchors
    pub headings: Vec<HeadingIndex>,
    /// Reference links in this file (for cross-file analysis)
    pub reference_links: Vec<ReferenceLinkIndex>,
    /// Cross-file links in this file (for MD051 cross-file validation)
    /// `WorkspaceIndex::update_file` also reads these to build reverse dependencies
    pub cross_file_links: Vec<CrossFileLinkIndex>,
    /// Defined reference IDs (e.g., from `[ref]: url` definitions)
    /// Used to filter out reference links that have explicit definitions
    pub defined_references: HashSet<String>,
    /// Content hash for change detection
    /// Compared as a plain string by `hash_matches` and `WorkspaceIndex::is_file_stale`
    pub content_hash: String,
    /// O(1) anchor lookup: lowercased anchor → heading index
    /// Includes both auto-generated and custom anchors
    /// The value is a position in `headings`
    anchor_to_heading: HashMap<String, usize>,
    /// HTML anchors defined via `<a id="...">` or `<element id="...">` tags
    /// Stored lowercase for case-insensitive matching
    html_anchors: HashSet<String>,
    /// Attribute anchors defined via { #id } syntax (kramdown/MkDocs attr_list)
    /// Can appear on any element, not just headings
    /// Stored lowercase for case-insensitive matching
    attribute_anchors: HashSet<String>,
    /// Rules disabled for the entire file (from inline comments)
    /// Used by cross-file rules to respect inline disable directives
    /// An entry of `*` is treated as "every rule" by `is_rule_disabled_at_line`
    pub file_disabled_rules: HashSet<String>,
    /// Persistent disable/enable state transitions, sorted by line number.
    /// Each entry: (line, disabled_rules, enabled_rules). Use binary search to query.
    /// `is_rule_disabled_at_line` evaluates the entry at, or closest before, the queried line.
    pub persistent_transitions: Vec<(usize, HashSet<String>, HashSet<String>)>,
    /// Rules disabled at specific lines via disable-line / disable-next-line
    /// Keyed by the line number that `is_rule_disabled_at_line` is queried with
    pub line_disabled_rules: HashMap<usize, HashSet<String>>,
}
298
/// Information about a heading for cross-file lookup
///
/// `FileIndex::add_heading` registers both `auto_anchor` and `custom_anchor`
/// (lowercased) in the file's anchor lookup map.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct HeadingIndex {
    /// The heading text (e.g., "Installation Guide")
    pub text: String,
    /// Auto-generated anchor (e.g., "installation-guide")
    /// May be empty; `WorkspaceIndex::get_vulnerable_anchors` skips empty ones
    pub auto_anchor: String,
    /// Custom anchor if present (e.g., "install")
    pub custom_anchor: Option<String>,
    /// Line number (1-indexed)
    pub line: usize,
    /// Whether this is a Setext-style heading (underlined with = or -)
    /// `#[serde(default)]` supplies `false` where a deserializer reports the field as missing
    #[serde(default)]
    pub is_setext: bool,
}
314
/// Information about a reference link for cross-file analysis
///
/// Entries are appended to `FileIndex::reference_links` through
/// `FileIndex::add_reference_link`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ReferenceLinkIndex {
    /// The reference ID (the part in `[text][ref]`)
    pub reference_id: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    pub column: usize,
}
325
/// Information about a cross-file link for validation
///
/// `FileIndex::add_cross_file_link` treats two entries as duplicates when
/// `target_path`, `fragment` and `line` are all equal; `column` is ignored
/// for that comparison.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrossFileLinkIndex {
    /// The target file path (relative, as it appears in the link)
    pub target_path: String,
    /// The fragment/anchor being linked to (without #)
    /// Empty when the link has no fragment
    pub fragment: String,
    /// Line number (1-indexed)
    pub line: usize,
    /// Column number (1-indexed)
    /// `extract_cross_file_links` sets this to the URL's byte offset within the line, plus one
    pub column: usize,
}
338
/// Information about a vulnerable anchor (heading without custom ID)
///
/// Produced by `WorkspaceIndex::get_vulnerable_anchors`, which groups these
/// records by the lowercased auto-generated anchor.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct VulnerableAnchor {
    /// File path where the heading is located
    /// (the key under which the file is stored in the workspace index)
    pub file: PathBuf,
    /// Line number of the heading
    pub line: usize,
    /// The heading text
    pub text: String,
}
349
350impl WorkspaceIndex {
351    /// Create a new empty workspace index
352    pub fn new() -> Self {
353        Self::default()
354    }
355
356    /// Get the current version (for cache invalidation)
357    pub fn version(&self) -> u64 {
358        self.version
359    }
360
361    /// Get the number of indexed files
362    pub fn file_count(&self) -> usize {
363        self.files.len()
364    }
365
366    /// Check if a file is in the index
367    pub fn contains_file(&self, path: &Path) -> bool {
368        self.files.contains_key(path)
369    }
370
371    /// Get the index data for a specific file
372    pub fn get_file(&self, path: &Path) -> Option<&FileIndex> {
373        self.files.get(path)
374    }
375
376    /// Insert or update a file's index data
377    pub fn insert_file(&mut self, path: PathBuf, index: FileIndex) {
378        self.files.insert(path, index);
379        self.version = self.version.wrapping_add(1);
380    }
381
382    /// Remove a file from the index
383    pub fn remove_file(&mut self, path: &Path) -> Option<FileIndex> {
384        // Clean up reverse deps for this file
385        self.clear_reverse_deps_for(path);
386
387        let result = self.files.remove(path);
388        if result.is_some() {
389            self.version = self.version.wrapping_add(1);
390        }
391        result
392    }
393
394    /// Build a map of all "vulnerable" anchors across the workspace
395    ///
396    /// A vulnerable anchor is an auto-generated anchor for a heading that
397    /// does NOT have a custom anchor defined. These are problematic for
398    /// translated content because the anchor changes when the heading is translated.
399    ///
400    /// Returns: Map from lowercase anchor → Vec of VulnerableAnchor info
401    /// Multiple files can have headings with the same auto-generated anchor,
402    /// so we collect all occurrences.
403    pub fn get_vulnerable_anchors(&self) -> HashMap<String, Vec<VulnerableAnchor>> {
404        let mut vulnerable: HashMap<String, Vec<VulnerableAnchor>> = HashMap::new();
405
406        for (file_path, file_index) in &self.files {
407            for heading in &file_index.headings {
408                // Only include headings WITHOUT custom anchors
409                if heading.custom_anchor.is_none() && !heading.auto_anchor.is_empty() {
410                    let anchor_key = heading.auto_anchor.to_lowercase();
411                    vulnerable.entry(anchor_key).or_default().push(VulnerableAnchor {
412                        file: file_path.clone(),
413                        line: heading.line,
414                        text: heading.text.clone(),
415                    });
416                }
417            }
418        }
419
420        vulnerable
421    }
422
423    /// Get all headings across the workspace (for debugging/testing)
424    pub fn all_headings(&self) -> impl Iterator<Item = (&Path, &HeadingIndex)> {
425        self.files
426            .iter()
427            .flat_map(|(path, index)| index.headings.iter().map(move |h| (path.as_path(), h)))
428    }
429
430    /// Iterate over all files in the index
431    pub fn files(&self) -> impl Iterator<Item = (&Path, &FileIndex)> {
432        self.files.iter().map(|(p, i)| (p.as_path(), i))
433    }
434
435    /// Clear the entire index
436    pub fn clear(&mut self) {
437        self.files.clear();
438        self.reverse_deps.clear();
439        self.version = self.version.wrapping_add(1);
440    }
441
442    /// Update a file's index and maintain reverse dependencies
443    ///
444    /// This method:
445    /// 1. Removes this file as a source (dependent) from all reverse deps
446    /// 2. Inserts the new file index
447    /// 3. Builds new reverse deps from cross_file_links
448    pub fn update_file(&mut self, path: &Path, index: FileIndex) {
449        // Remove this file as a source (dependent) from all target entries
450        // Note: We don't remove it as a target - other files may still link to it
451        self.clear_reverse_deps_as_source(path);
452
453        // Build new reverse deps from cross_file_links
454        for link in &index.cross_file_links {
455            let target = self.resolve_target_path(path, &link.target_path);
456            self.reverse_deps.entry(target).or_default().insert(path.to_path_buf());
457        }
458
459        self.files.insert(path.to_path_buf(), index);
460        self.version = self.version.wrapping_add(1);
461    }
462
463    /// Get files that depend on (link to) the given file
464    ///
465    /// Returns a list of file paths that contain links targeting this file.
466    /// Used to re-lint dependent files when a target file changes.
467    pub fn get_dependents(&self, path: &Path) -> Vec<PathBuf> {
468        self.reverse_deps
469            .get(path)
470            .map(|set| set.iter().cloned().collect())
471            .unwrap_or_default()
472    }
473
474    /// Check if a file needs re-indexing based on its content hash
475    ///
476    /// Returns `true` if the file is not in the index or has a different hash.
477    pub fn is_file_stale(&self, path: &Path, current_hash: &str) -> bool {
478        self.files
479            .get(path)
480            .map(|f| f.content_hash != current_hash)
481            .unwrap_or(true)
482    }
483
484    /// Retain only files that exist in the given set, removing deleted files
485    ///
486    /// This prunes stale entries from the cache for files that no longer exist.
487    /// Returns the number of files removed.
488    pub fn retain_only(&mut self, current_files: &std::collections::HashSet<PathBuf>) -> usize {
489        let before_count = self.files.len();
490
491        // Collect files to remove
492        let to_remove: Vec<PathBuf> = self
493            .files
494            .keys()
495            .filter(|path| !current_files.contains(*path))
496            .cloned()
497            .collect();
498
499        // Remove each file properly (clears reverse deps)
500        for path in &to_remove {
501            self.remove_file(path);
502        }
503
504        before_count - self.files.len()
505    }
506
507    /// Save the workspace index to a cache file
508    ///
509    /// Uses postcard for efficient binary serialization with:
510    /// - Magic header for file type validation
511    /// - Format version for compatibility detection
512    /// - Atomic writes (temp file + rename) to prevent corruption
513    #[cfg(feature = "native")]
514    pub fn save_to_cache(&self, cache_dir: &Path) -> std::io::Result<()> {
515        use std::fs;
516        use std::io::Write;
517
518        // Ensure cache directory exists
519        fs::create_dir_all(cache_dir)?;
520
521        // Serialize the index data using postcard
522        let encoded = postcard::to_allocvec(self)
523            .map_err(|e| std::io::Error::new(std::io::ErrorKind::InvalidData, e.to_string()))?;
524
525        // Build versioned cache file: [magic][version][data]
526        let mut cache_data = Vec::with_capacity(8 + encoded.len());
527        cache_data.extend_from_slice(CACHE_MAGIC);
528        cache_data.extend_from_slice(&CACHE_FORMAT_VERSION.to_le_bytes());
529        cache_data.extend_from_slice(&encoded);
530
531        // Write atomically: write to temp file then rename
532        let final_path = cache_dir.join(CACHE_FILE_NAME);
533        let temp_path = cache_dir.join(format!("{}.tmp.{}", CACHE_FILE_NAME, std::process::id()));
534
535        // Write to temp file
536        {
537            let mut file = fs::File::create(&temp_path)?;
538            file.write_all(&cache_data)?;
539            file.sync_all()?;
540        }
541
542        // Atomic rename
543        fs::rename(&temp_path, &final_path)?;
544
545        log::debug!(
546            "Saved workspace index to cache: {} files, {} bytes (format v{})",
547            self.files.len(),
548            cache_data.len(),
549            CACHE_FORMAT_VERSION
550        );
551
552        Ok(())
553    }
554
555    /// Load the workspace index from a cache file
556    ///
557    /// Returns `None` if:
558    /// - Cache file doesn't exist
559    /// - Magic header doesn't match
560    /// - Format version is incompatible
561    /// - Data is corrupted
562    #[cfg(feature = "native")]
563    pub fn load_from_cache(cache_dir: &Path) -> Option<Self> {
564        use std::fs;
565
566        let path = cache_dir.join(CACHE_FILE_NAME);
567        let data = fs::read(&path).ok()?;
568
569        // Validate header: need at least 8 bytes for magic + version
570        if data.len() < 8 {
571            log::warn!("Workspace index cache too small, discarding");
572            let _ = fs::remove_file(&path);
573            return None;
574        }
575
576        // Check magic header
577        if &data[0..4] != CACHE_MAGIC {
578            log::warn!("Workspace index cache has invalid magic header, discarding");
579            let _ = fs::remove_file(&path);
580            return None;
581        }
582
583        // Check format version
584        let version = u32::from_le_bytes([data[4], data[5], data[6], data[7]]);
585        if version != CACHE_FORMAT_VERSION {
586            log::info!(
587                "Workspace index cache format version mismatch (got {version}, expected {CACHE_FORMAT_VERSION}), rebuilding"
588            );
589            let _ = fs::remove_file(&path);
590            return None;
591        }
592
593        // Deserialize the index data using postcard
594        match postcard::from_bytes::<Self>(&data[8..]) {
595            Ok(index) => {
596                log::debug!(
597                    "Loaded workspace index from cache: {} files (format v{})",
598                    index.files.len(),
599                    version
600                );
601                Some(index)
602            }
603            Err(e) => {
604                log::warn!("Failed to deserialize workspace index cache: {e}");
605                let _ = fs::remove_file(&path);
606                None
607            }
608        }
609    }
610
611    /// Remove a file as a source from all reverse dependency entries
612    ///
613    /// This removes the file from being listed as a dependent in all target entries.
614    /// Used when updating a file (we need to remove old outgoing links before adding new ones).
615    fn clear_reverse_deps_as_source(&mut self, path: &Path) {
616        for deps in self.reverse_deps.values_mut() {
617            deps.remove(path);
618        }
619        // Clean up empty entries
620        self.reverse_deps.retain(|_, deps| !deps.is_empty());
621    }
622
623    /// Remove a file completely from reverse dependency tracking
624    ///
625    /// Removes the file as both a source (dependent) and as a target.
626    /// Used when deleting a file from the index.
627    fn clear_reverse_deps_for(&mut self, path: &Path) {
628        // Remove as source (dependent)
629        self.clear_reverse_deps_as_source(path);
630
631        // Also remove as target
632        self.reverse_deps.remove(path);
633    }
634
635    /// Resolve a relative path from a source file to an absolute target path
636    fn resolve_target_path(&self, source_file: &Path, relative_target: &str) -> PathBuf {
637        // Get the directory containing the source file
638        let source_dir = source_file.parent().unwrap_or(Path::new(""));
639
640        // Join with the relative target and normalize
641        let target = source_dir.join(relative_target);
642
643        // Normalize the path (handle .., ., etc.)
644        Self::normalize_path(&target)
645    }
646
647    /// Normalize a path by resolving . and .. components
648    fn normalize_path(path: &Path) -> PathBuf {
649        let mut components = Vec::new();
650
651        for component in path.components() {
652            match component {
653                std::path::Component::ParentDir => {
654                    // Go up one level if possible
655                    if !components.is_empty() {
656                        components.pop();
657                    }
658                }
659                std::path::Component::CurDir => {
660                    // Skip current directory markers
661                }
662                _ => {
663                    components.push(component);
664                }
665            }
666        }
667
668        components.iter().collect()
669    }
670}
671
672impl FileIndex {
673    /// Create a new empty file index
674    pub fn new() -> Self {
675        Self::default()
676    }
677
678    /// Create a file index with the given content hash
679    pub fn with_hash(content_hash: String) -> Self {
680        Self {
681            content_hash,
682            ..Default::default()
683        }
684    }
685
686    /// Add a heading to the index
687    ///
688    /// Also updates the anchor lookup map for O(1) anchor queries
689    pub fn add_heading(&mut self, heading: HeadingIndex) {
690        let index = self.headings.len();
691
692        // Add auto-generated anchor to lookup map (lowercased for case-insensitive matching)
693        self.anchor_to_heading.insert(heading.auto_anchor.to_lowercase(), index);
694
695        // Add custom anchor if present
696        if let Some(ref custom) = heading.custom_anchor {
697            self.anchor_to_heading.insert(custom.to_lowercase(), index);
698        }
699
700        self.headings.push(heading);
701    }
702
703    /// Add an alternative anchor that resolves to an existing heading.
704    /// Used for platform-specific anchor conventions (e.g., Python-Markdown `_N` dedup).
705    pub fn add_anchor_alias(&mut self, anchor: String, heading_index: usize) {
706        if heading_index < self.headings.len() {
707            self.anchor_to_heading.insert(anchor.to_lowercase(), heading_index);
708        }
709    }
710
711    /// Check if an anchor exists in this file (O(1) lookup)
712    ///
713    /// Returns true if the anchor matches any of:
714    /// - Auto-generated heading anchors
715    /// - Custom heading anchors (from {#id} syntax on headings)
716    /// - HTML anchors (from `<a id="...">` or `<element id="...">`)
717    /// - Attribute anchors (from { #id } syntax on non-heading elements)
718    ///
719    /// Matching is case-insensitive. URL-encoded anchors (e.g., CJK characters
720    /// like `%E6%97%A5%E6%9C%AC%E8%AA%9E` for `日本語`) are decoded before matching.
721    pub fn has_anchor(&self, anchor: &str) -> bool {
722        let lower = anchor.to_lowercase();
723
724        // Fast path: try exact match first
725        if self.anchor_to_heading.contains_key(&lower)
726            || self.html_anchors.contains(&lower)
727            || self.attribute_anchors.contains(&lower)
728        {
729            return true;
730        }
731
732        // Slow path: if anchor contains percent-encoding, try decoded version
733        if anchor.contains('%') {
734            let decoded = url_decode(anchor).to_lowercase();
735            if decoded != lower {
736                return self.anchor_to_heading.contains_key(&decoded)
737                    || self.html_anchors.contains(&decoded)
738                    || self.attribute_anchors.contains(&decoded);
739            }
740        }
741
742        false
743    }
744
745    /// Add an HTML anchor (from `<a id="...">` or `<element id="...">` tags)
746    pub fn add_html_anchor(&mut self, anchor: String) {
747        if !anchor.is_empty() {
748            self.html_anchors.insert(anchor.to_lowercase());
749        }
750    }
751
752    /// Add an attribute anchor (from { #id } syntax on non-heading elements)
753    pub fn add_attribute_anchor(&mut self, anchor: String) {
754        if !anchor.is_empty() {
755            self.attribute_anchors.insert(anchor.to_lowercase());
756        }
757    }
758
759    /// Get the heading index for an anchor (O(1) lookup)
760    ///
761    /// Returns the index into `self.headings` if found.
762    pub fn get_heading_by_anchor(&self, anchor: &str) -> Option<&HeadingIndex> {
763        self.anchor_to_heading
764            .get(&anchor.to_lowercase())
765            .and_then(|&idx| self.headings.get(idx))
766    }
767
768    /// Add a reference link to the index
769    pub fn add_reference_link(&mut self, link: ReferenceLinkIndex) {
770        self.reference_links.push(link);
771    }
772
773    /// Check if a rule is disabled at a specific line
774    ///
775    /// Used by cross-file rules to respect inline disable directives.
776    /// Checks both file-wide disables and line-specific disables.
777    pub fn is_rule_disabled_at_line(&self, rule_name: &str, line: usize) -> bool {
778        // Check file-wide disables (highest priority)
779        if self.file_disabled_rules.contains("*") || self.file_disabled_rules.contains(rule_name) {
780            return true;
781        }
782
783        // Check line-specific disables (disable-line / disable-next-line)
784        if let Some(rules) = self.line_disabled_rules.get(&line)
785            && (rules.contains("*") || rules.contains(rule_name))
786        {
787            return true;
788        }
789
790        // Check persistent disable/enable transitions via binary search
791        if !self.persistent_transitions.is_empty() {
792            let idx = match self.persistent_transitions.binary_search_by_key(&line, |t| t.0) {
793                Ok(i) => Some(i),
794                Err(i) => {
795                    if i > 0 {
796                        Some(i - 1)
797                    } else {
798                        None
799                    }
800                }
801            };
802            if let Some(i) = idx {
803                let (_, ref disabled, ref enabled) = self.persistent_transitions[i];
804                if disabled.contains("*") {
805                    return !enabled.contains(rule_name);
806                }
807                return disabled.contains(rule_name);
808            }
809        }
810
811        false
812    }
813
814    /// Add a cross-file link to the index (deduplicates by target_path, fragment, line)
815    pub fn add_cross_file_link(&mut self, link: CrossFileLinkIndex) {
816        // Deduplicate: multiple rules may contribute the same link with different columns
817        // (e.g., MD051 uses link start, MD057 uses URL start)
818        let is_duplicate = self.cross_file_links.iter().any(|existing| {
819            existing.target_path == link.target_path && existing.fragment == link.fragment && existing.line == link.line
820        });
821        if !is_duplicate {
822            self.cross_file_links.push(link);
823        }
824    }
825
826    /// Add a defined reference ID (e.g., from `[ref]: url`)
827    pub fn add_defined_reference(&mut self, ref_id: String) {
828        self.defined_references.insert(ref_id);
829    }
830
831    /// Check if a reference ID has an explicit definition
832    pub fn has_defined_reference(&self, ref_id: &str) -> bool {
833        self.defined_references.contains(ref_id)
834    }
835
836    /// Check if the content hash matches
837    pub fn hash_matches(&self, hash: &str) -> bool {
838        self.content_hash == hash
839    }
840
841    /// Get the number of headings
842    pub fn heading_count(&self) -> usize {
843        self.headings.len()
844    }
845
846    /// Get the number of reference links
847    pub fn reference_link_count(&self) -> usize {
848        self.reference_links.len()
849    }
850}
851
#[cfg(test)]
mod tests {
    use super::*;
    use crate::config::MarkdownFlavor;
    #[cfg(feature = "native")]
    use std::fs;

    // =============================================================================
    // Shared fixtures
    // =============================================================================

    /// File name the cache tests expect inside a cache directory.
    #[cfg(feature = "native")]
    const CACHE_FILE_NAME: &str = "workspace_index.bin";

    /// Build a non-setext heading on line 1.
    ///
    /// Tests that need another line override it with struct-update syntax.
    fn make_heading(text: &str, auto_anchor: &str, custom_anchor: Option<&str>) -> HeadingIndex {
        HeadingIndex {
            text: text.to_string(),
            auto_anchor: auto_anchor.to_string(),
            custom_anchor: custom_anchor.map(str::to_string),
            line: 1,
            is_setext: false,
        }
    }

    /// Build a fragment-less link to `target_path` at line 1, column 1.
    fn bare_link(target_path: &str) -> CrossFileLinkIndex {
        CrossFileLinkIndex {
            target_path: target_path.to_string(),
            fragment: String::new(),
            line: 1,
            column: 1,
        }
    }

    /// Build a file index whose only content is a single heading.
    fn file_with_heading(heading: HeadingIndex) -> FileIndex {
        let mut file = FileIndex::new();
        file.add_heading(heading);
        file
    }

    /// Build a file index whose only content is one bare link to `target_path`.
    fn file_linking_to(target_path: &str) -> FileIndex {
        let mut file = FileIndex::new();
        file.add_cross_file_link(bare_link(target_path));
        file
    }

    /// Path of a per-test directory under the system temp dir, with any
    /// leftovers from a previous run removed. The directory is NOT created.
    #[cfg(feature = "native")]
    fn wiped_temp_path(name: &str) -> PathBuf {
        let dir = std::env::temp_dir().join(name);
        let _ = fs::remove_dir_all(&dir);
        dir
    }

    /// Like [`wiped_temp_path`], but the (empty) directory is created.
    #[cfg(feature = "native")]
    fn fresh_temp_dir(name: &str) -> PathBuf {
        let dir = wiped_temp_path(name);
        fs::create_dir_all(&dir).unwrap();
        dir
    }

    /// Cache bytes made of a header (`magic` + little-endian `version`)
    /// followed by 100 zero bytes of garbage payload.
    #[cfg(feature = "native")]
    fn padded_cache_bytes(magic: &[u8; 4], version: u32) -> Vec<u8> {
        let mut bytes = Vec::new();
        bytes.extend_from_slice(magic);
        bytes.extend_from_slice(&version.to_le_bytes());
        bytes.resize(bytes.len() + 100, 0);
        bytes
    }

    // =============================================================================
    // WorkspaceIndex / FileIndex basics
    // =============================================================================

    #[test]
    fn test_workspace_index_basic() {
        let mut workspace = WorkspaceIndex::new();
        assert_eq!(workspace.file_count(), 0);
        assert_eq!(workspace.version(), 0);

        let mut install_doc = FileIndex::with_hash("abc123".to_string());
        install_doc.add_heading(make_heading("Installation", "installation", None));
        workspace.insert_file(PathBuf::from("docs/install.md"), install_doc);

        assert_eq!(workspace.file_count(), 1);
        assert_eq!(workspace.version(), 1);

        assert!(workspace.contains_file(Path::new("docs/install.md")));
        assert!(!workspace.contains_file(Path::new("docs/other.md")));
    }

    #[test]
    fn test_vulnerable_anchors() {
        let mut workspace = WorkspaceIndex::new();

        // Heading without a custom anchor: vulnerable
        workspace.insert_file(
            PathBuf::from("docs/guide.md"),
            file_with_heading(make_heading("Getting Started", "getting-started", None)),
        );

        // Heading with a custom anchor: not vulnerable
        workspace.insert_file(
            PathBuf::from("docs/install.md"),
            file_with_heading(make_heading("Installation", "installation", Some("install"))),
        );

        let vulnerable = workspace.get_vulnerable_anchors();
        assert_eq!(vulnerable.len(), 1);
        assert!(vulnerable.contains_key("getting-started"));
        assert!(!vulnerable.contains_key("installation"));

        let hits = vulnerable.get("getting-started").unwrap();
        assert_eq!(hits.len(), 1);
        assert_eq!(hits[0].file, PathBuf::from("docs/guide.md"));
        assert_eq!(hits[0].text, "Getting Started");
    }

    #[test]
    fn test_vulnerable_anchors_multiple_files_same_anchor() {
        // Several files may carry a heading with the same auto-generated anchor;
        // get_vulnerable_anchors() is expected to report every one of them.
        let mut workspace = WorkspaceIndex::new();

        // (path, custom anchor, heading line): en and fr are vulnerable,
        // de is protected by its custom anchor.
        let fixtures: [(&str, Option<&str>, _); 3] = [
            ("docs/en/guide.md", None, 1),
            ("docs/fr/guide.md", None, 5),
            ("docs/de/guide.md", Some("install"), 10),
        ];
        for (path, custom_anchor, line) in fixtures {
            let heading = HeadingIndex {
                line,
                ..make_heading("Installation", "installation", custom_anchor)
            };
            workspace.insert_file(PathBuf::from(path), file_with_heading(heading));
        }

        let vulnerable = workspace.get_vulnerable_anchors();
        assert_eq!(vulnerable.len(), 1); // One unique anchor
        assert!(vulnerable.contains_key("installation"));

        let hits = vulnerable.get("installation").unwrap();
        // Two entries (en and fr), NOT three (de has a custom anchor)
        assert_eq!(hits.len(), 2, "Should collect both vulnerable anchors");

        // Both vulnerable files must be represented
        assert!(hits.iter().any(|hit| hit.file == PathBuf::from("docs/en/guide.md")));
        assert!(hits.iter().any(|hit| hit.file == PathBuf::from("docs/fr/guide.md")));
    }

    #[test]
    fn test_file_index_hash() {
        let file = FileIndex::with_hash("hash123".to_string());
        assert!(file.hash_matches("hash123"));
        assert!(!file.hash_matches("other"));
    }

    #[test]
    fn test_version_increment() {
        let mut workspace = WorkspaceIndex::new();
        assert_eq!(workspace.version(), 0);

        workspace.insert_file(PathBuf::from("a.md"), FileIndex::new());
        assert_eq!(workspace.version(), 1);

        workspace.insert_file(PathBuf::from("b.md"), FileIndex::new());
        assert_eq!(workspace.version(), 2);

        workspace.remove_file(Path::new("a.md"));
        assert_eq!(workspace.version(), 3);

        // Removing a file that is not indexed leaves the version alone
        workspace.remove_file(Path::new("nonexistent.md"));
        assert_eq!(workspace.version(), 3);
    }

    // =============================================================================
    // Reverse dependency tracking
    // =============================================================================

    #[test]
    fn test_reverse_deps_basic() {
        let mut workspace = WorkspaceIndex::new();

        // A links to B
        let mut file_a = FileIndex::new();
        file_a.add_cross_file_link(CrossFileLinkIndex {
            target_path: "b.md".to_string(),
            fragment: "section".to_string(),
            line: 10,
            column: 5,
        });
        workspace.update_file(Path::new("docs/a.md"), file_a);

        // B therefore has A as its only dependent
        let b_dependents = workspace.get_dependents(Path::new("docs/b.md"));
        assert_eq!(b_dependents.len(), 1);
        assert_eq!(b_dependents[0], PathBuf::from("docs/a.md"));

        // Nothing links to A
        assert!(workspace.get_dependents(Path::new("docs/a.md")).is_empty());
    }

    #[test]
    fn test_reverse_deps_multiple() {
        let mut workspace = WorkspaceIndex::new();

        // A (from a subdirectory) and C both link to B
        workspace.update_file(Path::new("docs/sub/a.md"), file_linking_to("../b.md"));
        workspace.update_file(Path::new("docs/c.md"), file_linking_to("b.md"));

        let b_dependents = workspace.get_dependents(Path::new("docs/b.md"));
        assert_eq!(b_dependents.len(), 2);
        assert!(b_dependents.contains(&PathBuf::from("docs/sub/a.md")));
        assert!(b_dependents.contains(&PathBuf::from("docs/c.md")));
    }

    #[test]
    fn test_reverse_deps_update_clears_old() {
        let mut workspace = WorkspaceIndex::new();

        // Initially A links to B
        workspace.update_file(Path::new("docs/a.md"), file_linking_to("b.md"));
        assert_eq!(workspace.get_dependents(Path::new("docs/b.md")).len(), 1);

        // Re-index A so that it links to C instead
        workspace.update_file(Path::new("docs/a.md"), file_linking_to("c.md"));

        // The stale edge to B is gone
        assert!(workspace.get_dependents(Path::new("docs/b.md")).is_empty());

        // ...and C picked up A as a dependent
        let c_dependents = workspace.get_dependents(Path::new("docs/c.md"));
        assert_eq!(c_dependents.len(), 1);
        assert_eq!(c_dependents[0], PathBuf::from("docs/a.md"));
    }

    #[test]
    fn test_reverse_deps_remove_file() {
        let mut workspace = WorkspaceIndex::new();

        // A links to B
        workspace.update_file(Path::new("docs/a.md"), file_linking_to("b.md"));
        assert_eq!(workspace.get_dependents(Path::new("docs/b.md")).len(), 1);

        // Dropping A must drop its outgoing edge as well
        workspace.remove_file(Path::new("docs/a.md"));
        assert!(workspace.get_dependents(Path::new("docs/b.md")).is_empty());
    }

    #[test]
    fn test_normalize_path() {
        // (input, expected): `..` handling, `.` handling, repeated `..`
        let cases = [
            ("docs/sub/../other.md", "docs/other.md"),
            ("docs/./other.md", "docs/other.md"),
            ("a/b/c/../../d.md", "a/d.md"),
        ];
        for (input, expected) in cases {
            let normalized = WorkspaceIndex::normalize_path(Path::new(input));
            assert_eq!(normalized, PathBuf::from(expected));
        }
    }

    #[test]
    fn test_clear_clears_reverse_deps() {
        let mut workspace = WorkspaceIndex::new();

        // A links to B
        workspace.update_file(Path::new("docs/a.md"), file_linking_to("b.md"));
        assert_eq!(workspace.get_dependents(Path::new("docs/b.md")).len(), 1);

        workspace.clear();

        // Both the files and the reverse dependency edges are gone
        assert_eq!(workspace.file_count(), 0);
        assert!(workspace.get_dependents(Path::new("docs/b.md")).is_empty());
    }

    #[test]
    fn test_is_file_stale() {
        let mut workspace = WorkspaceIndex::new();

        // A file that was never indexed is always stale
        assert!(workspace.is_file_stale(Path::new("nonexistent.md"), "hash123"));

        workspace.insert_file(PathBuf::from("docs/test.md"), FileIndex::with_hash("hash123".to_string()));

        // Matching hash: fresh
        assert!(!workspace.is_file_stale(Path::new("docs/test.md"), "hash123"));

        // Any other hash: stale
        assert!(workspace.is_file_stale(Path::new("docs/test.md"), "different_hash"));
    }

    // =============================================================================
    // On-disk cache
    // =============================================================================

    #[cfg(feature = "native")]
    #[test]
    fn test_cache_roundtrip() {
        let cache_dir = fresh_temp_dir("rumdl_test_cache_roundtrip");

        // Populate an index with two files, the first linking to the second
        let mut original = WorkspaceIndex::new();

        let mut linking_file = FileIndex::with_hash("abc123".to_string());
        linking_file.add_heading(make_heading("Test Heading", "test-heading", Some("test")));
        linking_file.add_cross_file_link(CrossFileLinkIndex {
            target_path: "./other.md".to_string(),
            fragment: "section".to_string(),
            line: 5,
            column: 3,
        });
        original.update_file(Path::new("docs/file1.md"), linking_file);

        let mut target_file = FileIndex::with_hash("def456".to_string());
        target_file.add_heading(make_heading("Another Heading", "another-heading", None));
        original.update_file(Path::new("docs/other.md"), target_file);

        // Persist, then make sure the cache file really landed on disk
        original.save_to_cache(&cache_dir).expect("Failed to save cache");
        assert!(cache_dir.join(CACHE_FILE_NAME).exists());

        let restored = WorkspaceIndex::load_from_cache(&cache_dir).expect("Failed to load cache");

        // Same set of files
        assert_eq!(restored.file_count(), 2);
        assert!(restored.contains_file(Path::new("docs/file1.md")));
        assert!(restored.contains_file(Path::new("docs/other.md")));

        // Per-file details survived the round trip
        let restored_file1 = restored.get_file(Path::new("docs/file1.md")).unwrap();
        assert_eq!(restored_file1.content_hash, "abc123");
        assert_eq!(restored_file1.headings.len(), 1);
        assert_eq!(restored_file1.headings[0].text, "Test Heading");
        assert_eq!(restored_file1.headings[0].custom_anchor, Some("test".to_string()));
        assert_eq!(restored_file1.cross_file_links.len(), 1);
        assert_eq!(restored_file1.cross_file_links[0].target_path, "./other.md");

        // Reverse dependencies survived as well
        let other_dependents = restored.get_dependents(Path::new("docs/other.md"));
        assert_eq!(other_dependents.len(), 1);
        assert_eq!(other_dependents[0], PathBuf::from("docs/file1.md"));

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[cfg(feature = "native")]
    #[test]
    fn test_cache_missing_file() {
        // The directory is intentionally left non-existent
        let cache_dir = wiped_temp_path("rumdl_test_cache_missing");

        assert!(WorkspaceIndex::load_from_cache(&cache_dir).is_none());
    }

    #[cfg(feature = "native")]
    #[test]
    fn test_cache_corrupted_file() {
        let cache_dir = fresh_temp_dir("rumdl_test_cache_corrupted");
        let cache_file = cache_dir.join(CACHE_FILE_NAME);

        // Too short to even contain the header
        fs::write(&cache_file, b"bad").unwrap();

        // Loading fails...
        assert!(WorkspaceIndex::load_from_cache(&cache_dir).is_none());

        // ...and the corrupted file is removed
        assert!(!cache_file.exists());

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[cfg(feature = "native")]
    #[test]
    fn test_cache_invalid_magic() {
        let cache_dir = fresh_temp_dir("rumdl_test_cache_invalid_magic");
        let cache_file = cache_dir.join(CACHE_FILE_NAME);

        // Wrong magic, version 1, garbage payload
        fs::write(&cache_file, padded_cache_bytes(b"XXXX", 1)).unwrap();

        // Loading fails...
        assert!(WorkspaceIndex::load_from_cache(&cache_dir).is_none());

        // ...and the file is removed
        assert!(!cache_file.exists());

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[cfg(feature = "native")]
    #[test]
    fn test_cache_version_mismatch() {
        let cache_dir = fresh_temp_dir("rumdl_test_cache_version_mismatch");
        let cache_file = cache_dir.join(CACHE_FILE_NAME);

        // Correct magic, but a (future) version the loader must reject
        fs::write(&cache_file, padded_cache_bytes(b"RWSI", 999)).unwrap();

        // Loading fails...
        assert!(WorkspaceIndex::load_from_cache(&cache_dir).is_none());

        // ...and the file is removed so that a rebuild is triggered
        assert!(!cache_file.exists());

        let _ = fs::remove_dir_all(&cache_dir);
    }

    #[cfg(feature = "native")]
    #[test]
    fn test_cache_atomic_write() {
        // Atomic writes must not leave temporary files behind
        let cache_dir = fresh_temp_dir("rumdl_test_cache_atomic");

        WorkspaceIndex::new().save_to_cache(&cache_dir).expect("Failed to save");

        // Exactly one entry in the directory: the final cache file
        let entry_count = fs::read_dir(&cache_dir).unwrap().count();
        assert_eq!(entry_count, 1);
        assert!(cache_dir.join(CACHE_FILE_NAME).exists());

        let _ = fs::remove_dir_all(&cache_dir);
    }

    // =============================================================================
    // Anchor lookup
    // =============================================================================

    #[test]
    fn test_has_anchor_auto_generated() {
        let file = file_with_heading(make_heading("Installation Guide", "installation-guide", None));

        // Found via the auto-generated anchor
        assert!(file.has_anchor("installation-guide"));

        // Matching ignores case
        assert!(file.has_anchor("Installation-Guide"));
        assert!(file.has_anchor("INSTALLATION-GUIDE"));

        // Unknown anchors are rejected
        assert!(!file.has_anchor("nonexistent"));
    }

    #[test]
    fn test_has_anchor_custom() {
        let file = file_with_heading(make_heading(
            "Installation Guide",
            "installation-guide",
            Some("install"),
        ));

        // The auto-generated anchor still works
        assert!(file.has_anchor("installation-guide"));

        // So does the custom one
        assert!(file.has_anchor("install"));
        assert!(file.has_anchor("Install")); // case-insensitive

        // Unknown anchors are rejected
        assert!(!file.has_anchor("nonexistent"));
    }

    #[test]
    fn test_get_heading_by_anchor() {
        let mut file = FileIndex::new();
        file.add_heading(HeadingIndex {
            line: 10,
            ..make_heading("Installation Guide", "installation-guide", Some("install"))
        });
        file.add_heading(HeadingIndex {
            line: 20,
            ..make_heading("Configuration", "configuration", None)
        });

        // Lookup by auto anchor
        let by_auto = file.get_heading_by_anchor("installation-guide").unwrap();
        assert_eq!(by_auto.text, "Installation Guide");
        assert_eq!(by_auto.line, 10);

        // Lookup by custom anchor
        let by_custom = file.get_heading_by_anchor("install").unwrap();
        assert_eq!(by_custom.text, "Installation Guide");

        // Second heading
        let second = file.get_heading_by_anchor("configuration").unwrap();
        assert_eq!(second.text, "Configuration");
        assert_eq!(second.line, 20);

        // Unknown anchor
        assert!(file.get_heading_by_anchor("nonexistent").is_none());
    }

    #[test]
    fn test_anchor_lookup_many_headings() {
        // Lookup must keep working when a file has many headings
        let mut file = FileIndex::new();

        // 100 headings, each with both an auto and a custom anchor
        for n in 0..100 {
            file.add_heading(HeadingIndex {
                line: n + 1,
                ..make_heading(
                    &format!("Heading {n}"),
                    &format!("heading-{n}"),
                    Some(&format!("h{n}")),
                )
            });
        }

        // Every one of them must be reachable
        for n in 0..100 {
            let auto_anchor = format!("heading-{n}");
            assert!(file.has_anchor(&auto_anchor));
            assert!(file.has_anchor(&format!("h{n}")));

            let found = file.get_heading_by_anchor(&auto_anchor).unwrap();
            assert_eq!(found.line, n + 1);
        }
    }

    // =============================================================================
    // Tests for extract_cross_file_links utility
    // =============================================================================

    #[test]
    fn test_extract_cross_file_links_basic() {
        let markdown = "# Test\n\nSee [link](./other.md) for info.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert_eq!(link.target_path, "./other.md");
        assert_eq!(link.fragment, "");
        assert_eq!(link.line, 3);
        // "See [link](" = 11 chars, so column 12 is where "./other.md" starts
        assert_eq!(link.column, 12);
    }

    #[test]
    fn test_extract_cross_file_links_with_fragment() {
        let markdown = "Check [guide](./guide.md#install) here.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert_eq!(link.target_path, "./guide.md");
        assert_eq!(link.fragment, "install");
        assert_eq!(link.line, 1);
        // "Check [guide](" = 14 chars, so column 15 is where "./guide.md" starts
        assert_eq!(link.column, 15);
    }

    #[test]
    fn test_extract_cross_file_links_multiple_on_same_line() {
        let markdown = "See [a](a.md) and [b](b.md) here.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert_eq!(found.len(), 2);

        let first = &found[0];
        assert_eq!(first.target_path, "a.md");
        assert_eq!(first.line, 1);
        // "See [a](" = 8 chars, so column 9
        assert_eq!(first.column, 9);

        let second = &found[1];
        assert_eq!(second.target_path, "b.md");
        assert_eq!(second.line, 1);
        // "See [a](a.md) and [b](" = 22 chars, so column 23
        assert_eq!(second.column, 23);
    }

    #[test]
    fn test_extract_cross_file_links_angle_brackets() {
        let markdown = "See [link](<path/with (parens).md>) here.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert_eq!(found.len(), 1);
        let link = &found[0];
        assert_eq!(link.target_path, "path/with (parens).md");
        assert_eq!(link.line, 1);
        // "See [link](<" = 12 chars, so column 13
        assert_eq!(link.column, 13);
    }

    #[test]
    fn test_extract_cross_file_links_skips_external() {
        let markdown = r#"
[external](https://example.com)
[mailto](mailto:test@example.com)
[local](./local.md)
[fragment](#section)
[absolute](/docs/page.md)
"#;
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        // Only the local markdown link is extracted
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].target_path, "./local.md");
    }

    #[test]
    fn test_extract_cross_file_links_skips_non_markdown() {
        let markdown = r#"
[image](./photo.png)
[doc](./readme.md)
[pdf](./document.pdf)
"#;
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        // Only markdown targets are indexed for cross-file validation
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].target_path, "./readme.md");
    }

    #[test]
    fn test_extract_cross_file_links_skips_code_spans() {
        let markdown = "Normal [link](./file.md) and `[code](./ignored.md)` here.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        // The link inside the code span is ignored
        assert_eq!(found.len(), 1);
        assert_eq!(found[0].target_path, "./file.md");
    }

    #[test]
    fn test_extract_cross_file_links_with_query_params() {
        let markdown = "See [doc](./file.md?raw=true) here.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert_eq!(found.len(), 1);
        // The query string is stripped from the target
        assert_eq!(found[0].target_path, "./file.md");
    }

    #[test]
    fn test_extract_cross_file_links_empty_content() {
        let ctx = LintContext::new("", MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert!(found.is_empty());
    }

    #[test]
    fn test_extract_cross_file_links_no_links() {
        let markdown = "# Just a heading\n\nSome text without links.\n";
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert!(found.is_empty());
    }

    #[test]
    fn test_extract_cross_file_links_position_accuracy_issue_234() {
        // Regression test for GitHub issue #234: the LSP reported incorrect
        // column positions for MD057 diagnostics.
        let markdown = r#"# Test Document

Here is a [broken link](nonexistent-file.md) that should trigger MD057.

And another [link](also-missing.md) on this line.
"#;
        let ctx = LintContext::new(markdown, MarkdownFlavor::default(), None);
        let found = extract_cross_file_links(&ctx);

        assert_eq!(found.len(), 2);

        // First link: "Here is a [broken link](" = 24 chars, column 25
        let first = &found[0];
        assert_eq!(first.target_path, "nonexistent-file.md");
        assert_eq!(first.line, 3);
        assert_eq!(first.column, 25);

        // Second link: "And another [link](" = 19 chars, column 20
        let second = &found[1];
        assert_eq!(second.target_path, "also-missing.md");
        assert_eq!(second.line, 5);
        assert_eq!(second.column, 20);
    }
}
rumdl_lib/workspace_index.rs

rumdl_lib/
workspace_index.rs