rumdl_lib/rules/
md057_existing_relative_links.rs

1//!
2//! Rule MD057: Existing relative links
3//!
4//! See [docs/md057.md](../../docs/md057.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::document_structure::{DocumentStructure, DocumentStructureExtensions};
8use crate::utils::element_cache::ElementCache;
9use lazy_static::lazy_static;
10use regex::Regex;
11use std::collections::HashMap;
12use std::env;
13use std::path::{Path, PathBuf};
14use std::sync::{Arc, Mutex};
15
16mod md057_config;
17use md057_config::MD057Config;
18
19// Thread-safe cache for file existence checks to avoid redundant filesystem operations
20lazy_static! {
21    static ref FILE_EXISTENCE_CACHE: Arc<Mutex<HashMap<PathBuf, bool>>> = Arc::new(Mutex::new(HashMap::new()));
22}
23
24// Reset the file existence cache (typically between rule runs)
25fn reset_file_existence_cache() {
26    let mut cache = FILE_EXISTENCE_CACHE.lock().unwrap();
27    cache.clear();
28}
29
30// Check if a file exists with caching
31fn file_exists_with_cache(path: &Path) -> bool {
32    let mut cache = FILE_EXISTENCE_CACHE.lock().unwrap();
33    *cache.entry(path.to_path_buf()).or_insert_with(|| path.exists())
34}
35
36lazy_static! {
37    // Regex to match the start of a link - simplified for performance
38    static ref LINK_START_REGEX: Regex =
39        Regex::new(r"!?\[[^\]]*\]").unwrap();
40
41    /// Regex to extract the URL from a markdown link
42    /// Format: `](URL)` or `](URL "title")`
43    static ref URL_EXTRACT_REGEX: Regex =
44        Regex::new("\\]\\(\\s*<?([^>\\)\\s#]+)(#[^)\\s]*)?\\s*(?:\"[^\"]*\")?\\s*>?\\s*\\)").unwrap();
45
46    /// Regex to detect code fence blocks
47    static ref CODE_FENCE_REGEX: Regex =
48        Regex::new(r"^( {0,3})(`{3,}|~{3,})").unwrap();
49
50    /// Regex to detect protocol and domain for external links
51    static ref PROTOCOL_DOMAIN_REGEX: Regex =
52        Regex::new(r"^(https?://|ftp://|mailto:|www\.)").unwrap();
53
54    /// Regex to detect media file types
55    static ref MEDIA_FILE_REGEX: Regex =
56        Regex::new(r"\.(jpg|jpeg|png|gif|bmp|svg|webp|tiff|mp3|mp4|avi|mov|webm|wav|ogg|pdf)$").unwrap();
57
58    /// Regex to detect fragment-only links
59    static ref FRAGMENT_ONLY_REGEX: Regex =
60        Regex::new(r"^#").unwrap();
61
62    // Current working directory
63    static ref CURRENT_DIR: PathBuf = env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
64}
65
/// Rule MD057: Existing relative links should point to valid files or directories.
///
/// The rule resolves each relative link destination against `base_path` and
/// reports destinations that are not found on disk.
#[derive(Debug, Default, Clone)]
pub struct MD057ExistingRelativeLinks {
    /// Base directory for resolving relative links.
    ///
    /// Starts out as `None` (the derived `Default`) and is filled in by
    /// `with_path`. Because the value sits behind an `Arc`, instances produced
    /// by the derived `Clone` share the same underlying base path.
    base_path: Arc<Mutex<Option<PathBuf>>>,
    /// Rule configuration (e.g. whether links to media files are skipped)
    config: MD057Config,
}
74
75impl MD057ExistingRelativeLinks {
76    /// Create a new instance with default settings
77    pub fn new() -> Self {
78        Self::default()
79    }
80
81    /// Set the base path for resolving relative links
82    pub fn with_path<P: AsRef<Path>>(self, path: P) -> Self {
83        let path = path.as_ref();
84        let dir_path = if path.is_file() {
85            path.parent().map(|p| p.to_path_buf())
86        } else {
87            Some(path.to_path_buf())
88        };
89
90        *self.base_path.lock().unwrap() = dir_path;
91        self
92    }
93
94    /// Configure whether to skip checking media files
95    pub fn with_skip_media_files(mut self, skip_media_files: bool) -> Self {
96        self.config.skip_media_files = skip_media_files;
97        self
98    }
99
100    pub fn from_config_struct(config: MD057Config) -> Self {
101        Self {
102            base_path: Arc::new(Mutex::new(None)),
103            config,
104        }
105    }
106
107    /// Check if a URL is external (optimized version)
108    #[inline]
109    fn is_external_url(&self, url: &str) -> bool {
110        if url.is_empty() {
111            return false;
112        }
113
114        // Quick checks for common external URL patterns
115        if PROTOCOL_DOMAIN_REGEX.is_match(url) || url.starts_with("www.") {
116            return true;
117        }
118
119        // More restrictive domain check using a simpler pattern
120        if !self.is_media_file(url) && url.ends_with(".com") {
121            return true;
122        }
123
124        // Absolute paths within the site are not external
125        if url.starts_with('/') {
126            return false;
127        }
128
129        // All other cases (relative paths, etc.) are not external
130        false
131    }
132
133    /// Check if the URL is a fragment-only link (internal document link)
134    #[inline]
135    fn is_fragment_only_link(&self, url: &str) -> bool {
136        url.starts_with('#')
137    }
138
139    /// Check if the URL has a media file extension (optimized with early returns)
140    #[inline]
141    fn is_media_file(&self, url: &str) -> bool {
142        // Quick check before using regex
143        if !url.contains('.') {
144            return false;
145        }
146        MEDIA_FILE_REGEX.is_match(url)
147    }
148
149    /// Determine if we should skip checking this media file
150    #[inline]
151    fn should_skip_media_file(&self, url: &str) -> bool {
152        self.config.skip_media_files && self.is_media_file(url)
153    }
154
155    /// Resolve a relative link against the base path
156    fn resolve_link_path(&self, link: &str) -> Option<PathBuf> {
157        self.base_path
158            .lock()
159            .unwrap()
160            .as_ref()
161            .map(|base_path| base_path.join(link))
162    }
163
164    /// Process a single link and check if it exists
165    fn process_link(&self, url: &str, line_num: usize, column: usize, warnings: &mut Vec<LintWarning>) {
166        // Skip empty URLs
167        if url.is_empty() {
168            return;
169        }
170
171        // Skip external URLs and fragment-only links (optimized order)
172        if self.is_external_url(url) || self.is_fragment_only_link(url) {
173            return;
174        }
175
176        // Skip media files if configured to do so
177        if self.should_skip_media_file(url) {
178            return;
179        }
180
181        // Resolve the relative link against the base path
182        if let Some(resolved_path) = self.resolve_link_path(url) {
183            // Check if the file exists (with caching to avoid filesystem calls)
184            if !file_exists_with_cache(&resolved_path) {
185                warnings.push(LintWarning {
186                    rule_name: Some(self.name()),
187                    line: line_num,
188                    column,
189                    end_line: line_num,
190                    end_column: column + url.len(),
191                    message: format!("Relative link '{url}' does not exist"),
192                    severity: Severity::Warning,
193                    fix: None, // No automatic fix for missing files
194                });
195            }
196        }
197    }
198}
199
200impl Rule for MD057ExistingRelativeLinks {
201    fn name(&self) -> &'static str {
202        "MD057"
203    }
204
205    fn description(&self) -> &'static str {
206        "Relative links should point to existing files"
207    }
208
209    fn category(&self) -> RuleCategory {
210        RuleCategory::Link
211    }
212
213    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
214        let content = ctx.content;
215        content.is_empty() || !content.contains('[') || !content.contains("](")
216    }
217
218    /// Optimized implementation using document structure
219    fn check_with_structure(
220        &self,
221        ctx: &crate::lint_context::LintContext,
222        structure: &DocumentStructure,
223    ) -> LintResult {
224        let content = ctx.content;
225
226        // Early returns for performance
227        if content.is_empty() || !content.contains('[') {
228            return Ok(Vec::new());
229        }
230
231        // Quick check for any potential links before expensive operations
232        if !content.contains("](") {
233            return Ok(Vec::new());
234        }
235
236        // Reset the file existence cache for a fresh run
237        reset_file_existence_cache();
238
239        let mut warnings = Vec::new();
240
241        // Cache base path lookup to avoid repeated mutex operations
242        let base_path = {
243            let base_path_guard = self.base_path.lock().unwrap();
244            if base_path_guard.is_some() {
245                base_path_guard.clone()
246            } else {
247                // Try to determine the base path from the file being processed (cached)
248                static CACHED_FILE_PATH: std::sync::OnceLock<Option<PathBuf>> = std::sync::OnceLock::new();
249                CACHED_FILE_PATH
250                    .get_or_init(|| {
251                        if let Ok(file_path) = env::var("RUMDL_FILE_PATH") {
252                            let path = Path::new(&file_path);
253                            if path.exists() {
254                                path.parent()
255                                    .map(|p| p.to_path_buf())
256                                    .or_else(|| Some(CURRENT_DIR.clone()))
257                            } else {
258                                Some(CURRENT_DIR.clone())
259                            }
260                        } else {
261                            Some(CURRENT_DIR.clone())
262                        }
263                    })
264                    .clone()
265            }
266        };
267
268        // If we still don't have a base path, we can't validate relative links
269        if base_path.is_none() {
270            return Ok(warnings);
271        }
272
273        // Use DocumentStructure links instead of expensive regex parsing
274        if !structure.links.is_empty() {
275            // Pre-compute line positions for efficient absolute position calculation
276            let mut line_positions = Vec::new();
277            let mut pos = 0;
278            line_positions.push(0);
279            for ch in content.chars() {
280                pos += ch.len_utf8();
281                if ch == '\n' {
282                    line_positions.push(pos);
283                }
284            }
285
286            // Create element cache once for all links
287            let element_cache = ElementCache::new(content);
288
289            // Pre-collect lines to avoid repeated line iteration
290            let lines: Vec<&str> = content.lines().collect();
291
292            for link in &structure.links {
293                let line_idx = link.line - 1;
294                if line_idx >= lines.len() {
295                    continue;
296                }
297
298                let line = lines[line_idx];
299
300                // Quick check for link pattern in this line
301                if !line.contains("](") {
302                    continue;
303                }
304
305                // Find all links in this line using optimized regex
306                for link_match in LINK_START_REGEX.find_iter(line) {
307                    let start_pos = link_match.start();
308                    let end_pos = link_match.end();
309
310                    // Calculate absolute position efficiently using pre-computed positions
311                    let absolute_start_pos = if line_idx < line_positions.len() {
312                        line_positions[line_idx] + start_pos
313                    } else {
314                        // Fallback for edge cases
315                        content.lines().take(line_idx).map(|l| l.len() + 1).sum::<usize>() + start_pos
316                    };
317
318                    // Skip if this link is in a code span
319                    if element_cache.is_in_code_span(absolute_start_pos) {
320                        continue;
321                    }
322
323                    // Find the URL part after the link text
324                    if let Some(caps) = URL_EXTRACT_REGEX.captures_at(line, end_pos - 1)
325                        && let Some(url_group) = caps.get(1)
326                    {
327                        let url = url_group.as_str().trim();
328
329                        // Calculate column position
330                        let column = start_pos + 1;
331
332                        // Process and validate the link
333                        self.process_link(url, link.line, column, &mut warnings);
334                    }
335                }
336            }
337        }
338
339        Ok(warnings)
340    }
341
342    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
343        let content = ctx.content;
344        // If document structure is available, use the optimized version
345        let structure = DocumentStructure::new(content);
346        self.check_with_structure(ctx, &structure)
347
348        // The code below is now unreachable because we always use the document structure
349    }
350
351    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
352        Ok(ctx.content.to_string())
353    }
354
355    fn as_any(&self) -> &dyn std::any::Any {
356        self
357    }
358
359    fn default_config_section(&self) -> Option<(String, toml::Value)> {
360        let json_value = serde_json::to_value(&self.config).ok()?;
361        Some((
362            self.name().to_string(),
363            crate::rule_config_serde::json_to_toml_value(&json_value)?,
364        ))
365    }
366
367    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
368    where
369        Self: Sized,
370    {
371        let rule_config = crate::rule_config_serde::load_rule_config::<MD057Config>(config);
372        Box::new(Self::from_config_struct(rule_config))
373    }
374}
375
376impl DocumentStructureExtensions for MD057ExistingRelativeLinks {
377    fn has_relevant_elements(
378        &self,
379        _ctx: &crate::lint_context::LintContext,
380        _doc_structure: &DocumentStructure,
381    ) -> bool {
382        true
383    }
384}
385
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Lint `content` through the rule's `check` entry point.
    fn run_check(rule: &MD057ExistingRelativeLinks, content: &str) -> LintResult {
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx)
    }

    /// Create `exists.md` inside `dir` and return its path.
    fn create_exists_md(dir: &Path) -> PathBuf {
        let file_path = dir.join("exists.md");
        fs::write(&file_path, b"# Test File").unwrap();
        file_path
    }

    #[test]
    fn test_external_urls() {
        let rule = MD057ExistingRelativeLinks::new();

        let external = [
            "https://example.com",
            "http://example.com",
            "ftp://example.com",
            "www.example.com",
            "example.com",
        ];
        for url in external {
            assert!(rule.is_external_url(url));
        }

        let local = ["./relative/path.md", "relative/path.md", "../parent/path.md"];
        for url in local {
            assert!(!rule.is_external_url(url));
        }
    }

    #[test]
    fn test_media_files() {
        // The skip assertions further down rely on the default configuration
        // having skip_media_files enabled.
        let rule_default = MD057ExistingRelativeLinks::new();

        // Media file identification
        let media = [
            ("image.jpg", "image.jpg should be identified as a media file"),
            ("video.mp4", "video.mp4 should be identified as a media file"),
            ("document.pdf", "document.pdf should be identified as a media file"),
            (
                "path/to/audio.mp3",
                "path/to/audio.mp3 should be identified as a media file",
            ),
        ];
        for (name, failure) in media {
            assert!(rule_default.is_media_file(name), "{}", failure);
        }

        let non_media = [
            ("document.md", "document.md should not be identified as a media file"),
            ("code.rs", "code.rs should not be identified as a media file"),
        ];
        for (name, failure) in non_media {
            assert!(!rule_default.is_media_file(name), "{}", failure);
        }

        // Skipping with the default settings
        assert!(
            rule_default.should_skip_media_file("image.jpg"),
            "image.jpg should be skipped with default settings"
        );
        assert!(
            !rule_default.should_skip_media_file("document.md"),
            "document.md should not be skipped"
        );

        // Skipping switched off explicitly
        let rule_no_skip = MD057ExistingRelativeLinks::new().with_skip_media_files(false);
        assert!(
            !rule_no_skip.should_skip_media_file("image.jpg"),
            "image.jpg should not be skipped when skip_media_files is false"
        );
    }

    #[test]
    fn test_no_warnings_without_base_path() {
        let rule = MD057ExistingRelativeLinks::new();

        let result = run_check(&rule, "[Link](missing.md)").unwrap();

        assert!(result.is_empty(), "Should have no warnings without base path");
    }

    #[test]
    fn test_existing_and_missing_links() {
        // Temporary directory holding a single real file
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();
        let exists_path = create_exists_md(base_path);
        assert!(exists_path.exists(), "exists.md should exist for this test");

        // One existing, one missing, one external and one media link
        let content = r#"
# Test Document

[Valid Link](exists.md)
[Invalid Link](missing.md)
[External Link](https://example.com)
[Media Link](image.jpg)
        "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        // Only missing.md is reported; the media file is not
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1);
        assert!(result[0].message.contains("missing.md"));

        // The structure-based entry point must agree
        let structure = DocumentStructure::new(content);
        let result_with_structure = rule.check_with_structure(&ctx, &structure).unwrap();
        assert_eq!(result.len(), result_with_structure.len());
        assert!(result_with_structure[0].message.contains("missing.md"));
    }

    #[test]
    fn test_angle_bracket_links() {
        // Temporary directory holding a single real file
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();
        create_exists_md(base_path);

        // Destinations wrapped in angle brackets
        let content = r#"
# Test Document

[Valid Link](<exists.md>)
[Invalid Link](<missing.md>)
[External Link](<https://example.com>)
    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result = run_check(&rule, content).unwrap();

        // Only missing.md is reported
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should mention missing.md"
        );
    }

    #[test]
    fn test_media_file_handling() {
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();

        // Precondition: the linked image is really absent
        assert!(
            !base_path.join("image.jpg").exists(),
            "Test precondition failed: image.jpg should not exist"
        );

        let content = "[Media Link](image.jpg)";

        // Default configuration: media links are not checked
        let rule_skip_media = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result_skip = run_check(&rule_skip_media, content).unwrap();
        assert_eq!(
            result_skip.len(),
            0,
            "Should have no warnings when skip_media_files is true"
        );

        // skip_media_files = false: the missing image is reported
        let rule_check_all = MD057ExistingRelativeLinks::new()
            .with_path(base_path)
            .with_skip_media_files(false);
        let result_all = run_check(&rule_check_all, content).unwrap();
        assert_eq!(
            result_all.len(),
            1,
            "Should have one warning when skip_media_files is false"
        );
        assert!(
            result_all[0].message.contains("image.jpg"),
            "Warning should mention image.jpg"
        );
    }

    #[test]
    fn test_code_span_detection() {
        let temp_dir = tempdir().unwrap();
        let rule = MD057ExistingRelativeLinks::new().with_path(temp_dir.path());

        // One real link and one link-like text inside a code span
        let content = "This is a [link](nonexistent.md) and `[not a link](not-checked.md)` in code.";
        let structure = DocumentStructure::new(content);
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        let result = rule.check_with_structure(&ctx, &structure).unwrap();

        // Only the real link is flagged
        assert_eq!(result.len(), 1, "Should only flag the real link");
        assert!(result[0].message.contains("nonexistent.md"));
    }

    #[test]
    fn test_inline_code_spans() {
        let temp_dir = tempdir().unwrap();

        // A normal link plus two link-like texts inside inline code
        let content = r#"
# Test Document

This is a normal link: [Link](missing.md)

This is a code span with a link: `[Link](another-missing.md)`

Some more text with `inline code [Link](yet-another-missing.md) embedded`.

    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(temp_dir.path());
        let result = run_check(&rule, content).unwrap();

        // Only the normal link is reported
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should be for missing.md"
        );

        let mentions = |needle: &str| result.iter().any(|w| w.message.contains(needle));
        assert!(
            !mentions("another-missing.md"),
            "Should not warn about link in code span"
        );
        assert!(
            !mentions("yet-another-missing.md"),
            "Should not warn about link in inline code"
        );
    }
}