rumdl_lib/rules/
md057_existing_relative_links.rs

1//!
2//! Rule MD057: Existing relative links
3//!
4//! See [docs/md057.md](../../docs/md057.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::element_cache::ElementCache;
8use regex::Regex;
9use std::collections::HashMap;
10use std::env;
11use std::path::{Path, PathBuf};
12use std::sync::LazyLock;
13use std::sync::{Arc, Mutex};
14
15mod md057_config;
16use md057_config::MD057Config;
17
// Process-wide, mutex-guarded cache for file existence checks, keyed by the resolved
// path, to avoid redundant filesystem operations. Lazily created on first use.
static FILE_EXISTENCE_CACHE: LazyLock<Arc<Mutex<HashMap<PathBuf, bool>>>> =
    LazyLock::new(|| Arc::new(Mutex::new(HashMap::new())));
21
22// Reset the file existence cache (typically between rule runs)
23fn reset_file_existence_cache() {
24    let mut cache = FILE_EXISTENCE_CACHE
25        .lock()
26        .expect("File existence cache mutex poisoned");
27    cache.clear();
28}
29
30// Check if a file exists with caching
31fn file_exists_with_cache(path: &Path) -> bool {
32    let mut cache = FILE_EXISTENCE_CACHE
33        .lock()
34        .expect("File existence cache mutex poisoned");
35    *cache.entry(path.to_path_buf()).or_insert_with(|| path.exists())
36}
37
// Regex to match the start of a link - simplified for performance.
// Matches an optional `!` followed by `[...]` whose text contains no `]`; it does not
// verify that a `(` follows, so callers must locate the URL part themselves.
static LINK_START_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!?\[[^\]]*\]").unwrap());
40
/// Regex to extract the URL from a markdown link
/// Format: `](URL)` or `](URL "title")`; the URL may optionally be wrapped in `<...>`.
///
/// Capture group 1 is the URL up to (not including) the first `#`, whitespace, `>` or `)`.
/// Capture group 2 is the optional `#fragment` that follows it.
static URL_EXTRACT_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new("\\]\\(\\s*<?([^>\\)\\s#]+)(#[^)\\s]*)?\\s*(?:\"[^\"]*\")?\\s*>?\\s*\\)").unwrap());
45
/// Regex to detect protocol and domain for external links.
///
/// Matches strings that begin with `http://`, `https://`, `ftp://`, `mailto:` or `www.`.
/// The pattern carries no case-insensitivity flag, so only these lowercase spellings match.
static PROTOCOL_DOMAIN_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"^(https?://|ftp://|mailto:|www\.)").unwrap());
49
/// Regex to detect media file types.
///
/// Matches strings that end in one of the listed image, audio, video or PDF extensions.
/// NOTE(review): the pattern is case-sensitive, so an upper-case extension such as `.PNG`
/// is not treated as media - confirm whether that is intended.
static MEDIA_FILE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\.(jpg|jpeg|png|gif|bmp|svg|webp|tiff|mp3|mp4|avi|mov|webm|wav|ogg|pdf)$").unwrap());
53
// Current working directory, captured once on first use.
// Falls back to "." when the working directory cannot be determined.
static CURRENT_DIR: LazyLock<PathBuf> = LazyLock::new(|| env::current_dir().unwrap_or_else(|_| PathBuf::from(".")));
56
/// Rule MD057: Existing relative links should point to valid files or directories.
///
/// `Clone` is derived and `base_path` sits behind an `Arc`, so a clone shares the same
/// base-path slot as the value it was cloned from rather than owning an independent copy.
#[derive(Debug, Default, Clone)]
pub struct MD057ExistingRelativeLinks {
    /// Base directory for resolving relative links; `None` until `with_path` sets it
    base_path: Arc<Mutex<Option<PathBuf>>>,
    /// Configuration (e.g. whether media files are skipped)
    config: MD057Config,
}
65
66impl MD057ExistingRelativeLinks {
67    /// Create a new instance with default settings
68    pub fn new() -> Self {
69        Self::default()
70    }
71
72    /// Set the base path for resolving relative links
73    pub fn with_path<P: AsRef<Path>>(self, path: P) -> Self {
74        let path = path.as_ref();
75        let dir_path = if path.is_file() {
76            path.parent().map(|p| p.to_path_buf())
77        } else {
78            Some(path.to_path_buf())
79        };
80
81        *self.base_path.lock().expect("Base path mutex poisoned") = dir_path;
82        self
83    }
84
85    /// Configure whether to skip checking media files
86    pub fn with_skip_media_files(mut self, skip_media_files: bool) -> Self {
87        self.config.skip_media_files = skip_media_files;
88        self
89    }
90
91    pub fn from_config_struct(config: MD057Config) -> Self {
92        Self {
93            base_path: Arc::new(Mutex::new(None)),
94            config,
95        }
96    }
97
98    /// Check if a URL is external (optimized version)
99    #[inline]
100    fn is_external_url(&self, url: &str) -> bool {
101        if url.is_empty() {
102            return false;
103        }
104
105        // Quick checks for common external URL patterns
106        if PROTOCOL_DOMAIN_REGEX.is_match(url) || url.starts_with("www.") {
107            return true;
108        }
109
110        // More restrictive domain check using a simpler pattern
111        if !self.is_media_file(url) && url.ends_with(".com") {
112            return true;
113        }
114
115        // Absolute paths within the site are not external
116        if url.starts_with('/') {
117            return false;
118        }
119
120        // All other cases (relative paths, etc.) are not external
121        false
122    }
123
124    /// Check if the URL is a fragment-only link (internal document link)
125    #[inline]
126    fn is_fragment_only_link(&self, url: &str) -> bool {
127        url.starts_with('#')
128    }
129
130    /// Check if the URL has a media file extension (optimized with early returns)
131    #[inline]
132    fn is_media_file(&self, url: &str) -> bool {
133        // Quick check before using regex
134        if !url.contains('.') {
135            return false;
136        }
137        MEDIA_FILE_REGEX.is_match(url)
138    }
139
140    /// Determine if we should skip checking this media file
141    #[inline]
142    fn should_skip_media_file(&self, url: &str) -> bool {
143        self.config.skip_media_files && self.is_media_file(url)
144    }
145
146    /// Resolve a relative link against the base path
147    fn resolve_link_path(&self, link: &str) -> Option<PathBuf> {
148        self.base_path
149            .lock()
150            .unwrap()
151            .as_ref()
152            .map(|base_path| base_path.join(link))
153    }
154
155    /// Process a single link and check if it exists
156    fn process_link(&self, url: &str, line_num: usize, column: usize, warnings: &mut Vec<LintWarning>) {
157        // Skip empty URLs
158        if url.is_empty() {
159            return;
160        }
161
162        // Skip external URLs and fragment-only links (optimized order)
163        if self.is_external_url(url) || self.is_fragment_only_link(url) {
164            return;
165        }
166
167        // Skip media files if configured to do so
168        if self.should_skip_media_file(url) {
169            return;
170        }
171
172        // Resolve the relative link against the base path
173        if let Some(resolved_path) = self.resolve_link_path(url) {
174            // Check if the file exists (with caching to avoid filesystem calls)
175            if !file_exists_with_cache(&resolved_path) {
176                warnings.push(LintWarning {
177                    rule_name: Some(self.name().to_string()),
178                    line: line_num,
179                    column,
180                    end_line: line_num,
181                    end_column: column + url.len(),
182                    message: format!("Relative link '{url}' does not exist"),
183                    severity: Severity::Warning,
184                    fix: None, // No automatic fix for missing files
185                });
186            }
187        }
188    }
189}
190
191impl Rule for MD057ExistingRelativeLinks {
192    fn name(&self) -> &'static str {
193        "MD057"
194    }
195
196    fn description(&self) -> &'static str {
197        "Relative links should point to existing files"
198    }
199
200    fn category(&self) -> RuleCategory {
201        RuleCategory::Link
202    }
203
204    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
205        ctx.content.is_empty() || !ctx.likely_has_links_or_images()
206    }
207
208    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
209        let content = ctx.content;
210
211        // Early returns for performance
212        if content.is_empty() || !content.contains('[') {
213            return Ok(Vec::new());
214        }
215
216        // Quick check for any potential links before expensive operations
217        if !content.contains("](") {
218            return Ok(Vec::new());
219        }
220
221        // Reset the file existence cache for a fresh run
222        reset_file_existence_cache();
223
224        let mut warnings = Vec::new();
225
226        // Cache base path lookup to avoid repeated mutex operations
227        let base_path = {
228            let base_path_guard = self.base_path.lock().expect("Base path mutex poisoned");
229            if base_path_guard.is_some() {
230                base_path_guard.clone()
231            } else {
232                // Try to determine the base path from the file being processed (cached)
233                static CACHED_FILE_PATH: std::sync::OnceLock<Option<PathBuf>> = std::sync::OnceLock::new();
234                CACHED_FILE_PATH
235                    .get_or_init(|| {
236                        if let Ok(file_path) = env::var("RUMDL_FILE_PATH") {
237                            let path = Path::new(&file_path);
238                            if path.exists() {
239                                path.parent()
240                                    .map(|p| p.to_path_buf())
241                                    .or_else(|| Some(CURRENT_DIR.clone()))
242                            } else {
243                                Some(CURRENT_DIR.clone())
244                            }
245                        } else {
246                            Some(CURRENT_DIR.clone())
247                        }
248                    })
249                    .clone()
250            }
251        };
252
253        // If we still don't have a base path, we can't validate relative links
254        if base_path.is_none() {
255            return Ok(warnings);
256        }
257
258        // Use LintContext links instead of expensive regex parsing
259        if !ctx.links.is_empty() {
260            // Use LineIndex for correct position calculation across all line ending types
261            let line_index = &ctx.line_index;
262
263            // Create element cache once for all links
264            let element_cache = ElementCache::new(content);
265
266            // Pre-collect lines to avoid repeated line iteration
267            let lines: Vec<&str> = content.lines().collect();
268
269            for link in &ctx.links {
270                let line_idx = link.line - 1;
271                if line_idx >= lines.len() {
272                    continue;
273                }
274
275                let line = lines[line_idx];
276
277                // Quick check for link pattern in this line
278                if !line.contains("](") {
279                    continue;
280                }
281
282                // Find all links in this line using optimized regex
283                for link_match in LINK_START_REGEX.find_iter(line) {
284                    let start_pos = link_match.start();
285                    let end_pos = link_match.end();
286
287                    // Calculate absolute position using LineIndex
288                    let line_start_byte = line_index.get_line_start_byte(line_idx + 1).unwrap_or(0);
289                    let absolute_start_pos = line_start_byte + start_pos;
290
291                    // Skip if this link is in a code span
292                    if element_cache.is_in_code_span(absolute_start_pos) {
293                        continue;
294                    }
295
296                    // Find the URL part after the link text
297                    if let Some(caps) = URL_EXTRACT_REGEX.captures_at(line, end_pos - 1)
298                        && let Some(url_group) = caps.get(1)
299                    {
300                        let url = url_group.as_str().trim();
301
302                        // Calculate column position
303                        let column = start_pos + 1;
304
305                        // Process and validate the link
306                        self.process_link(url, link.line, column, &mut warnings);
307                    }
308                }
309            }
310        }
311
312        Ok(warnings)
313    }
314
315    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
316        Ok(ctx.content.to_string())
317    }
318
319    fn as_any(&self) -> &dyn std::any::Any {
320        self
321    }
322
323    fn default_config_section(&self) -> Option<(String, toml::Value)> {
324        let json_value = serde_json::to_value(&self.config).ok()?;
325        Some((
326            self.name().to_string(),
327            crate::rule_config_serde::json_to_toml_value(&json_value)?,
328        ))
329    }
330
331    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
332    where
333        Self: Sized,
334    {
335        let rule_config = crate::rule_config_serde::load_rule_config::<MD057Config>(config);
336        Box::new(Self::from_config_struct(rule_config))
337    }
338}
339
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::tempdir;

    /// Lints `content` with `rule` using the standard Markdown flavor and returns the warnings.
    fn lint(rule: &MD057ExistingRelativeLinks, content: &str) -> Vec<LintWarning> {
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx).unwrap()
    }

    #[test]
    fn test_external_urls() {
        let rule = MD057ExistingRelativeLinks::new();

        let external = [
            "https://example.com",
            "http://example.com",
            "ftp://example.com",
            "www.example.com",
            "example.com",
        ];
        for url in external {
            assert!(rule.is_external_url(url), "{url} should be external");
        }

        let relative = ["./relative/path.md", "relative/path.md", "../parent/path.md"];
        for url in relative {
            assert!(!rule.is_external_url(url), "{url} should not be external");
        }
    }

    #[test]
    fn test_media_files() {
        // Rule with default settings (media files are skipped by default)
        let rule_default = MD057ExistingRelativeLinks::new();

        // Media file identification
        assert!(
            rule_default.is_media_file("image.jpg"),
            "image.jpg should be identified as a media file"
        );
        assert!(
            rule_default.is_media_file("video.mp4"),
            "video.mp4 should be identified as a media file"
        );
        assert!(
            rule_default.is_media_file("document.pdf"),
            "document.pdf should be identified as a media file"
        );
        assert!(
            rule_default.is_media_file("path/to/audio.mp3"),
            "path/to/audio.mp3 should be identified as a media file"
        );

        // Non-media extensions
        assert!(
            !rule_default.is_media_file("document.md"),
            "document.md should not be identified as a media file"
        );
        assert!(
            !rule_default.is_media_file("code.rs"),
            "code.rs should not be identified as a media file"
        );

        // Skipping behaviour with default settings
        assert!(
            rule_default.should_skip_media_file("image.jpg"),
            "image.jpg should be skipped with default settings"
        );
        assert!(
            !rule_default.should_skip_media_file("document.md"),
            "document.md should not be skipped"
        );

        // Skipping behaviour once media checking is switched on
        let rule_no_skip = MD057ExistingRelativeLinks::new().with_skip_media_files(false);
        assert!(
            !rule_no_skip.should_skip_media_file("image.jpg"),
            "image.jpg should not be skipped when skip_media_files is false"
        );
    }

    #[test]
    fn test_no_warnings_without_base_path() {
        let rule = MD057ExistingRelativeLinks::new();

        let result = lint(&rule, "[Link](missing.md)");
        assert!(result.is_empty(), "Should have no warnings without base path");
    }

    #[test]
    fn test_existing_and_missing_links() {
        // Scratch directory holding a single real file
        let sandbox = tempdir().unwrap();
        let root = sandbox.path();

        let exists_path = root.join("exists.md");
        fs::write(&exists_path, b"# Test File").unwrap();
        assert!(exists_path.exists(), "exists.md should exist for this test");

        // Document mixing an existing, a missing, an external and a media link
        let content = r#"
# Test Document

[Valid Link](exists.md)
[Invalid Link](missing.md)
[External Link](https://example.com)
[Media Link](image.jpg)
        "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(root);
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        // Only missing.md is reported; the media link is skipped
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1);
        assert!(result[0].message.contains("missing.md"));

        // A second run over the same context gives the same outcome
        let result_with_structure = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), result_with_structure.len());
        assert!(result_with_structure[0].message.contains("missing.md"));
    }

    #[test]
    fn test_angle_bracket_links() {
        // Scratch directory holding a single real file
        let sandbox = tempdir().unwrap();
        let root = sandbox.path();
        fs::write(root.join("exists.md"), b"# Test File").unwrap();

        // Links whose destinations are wrapped in angle brackets
        let content = r#"
# Test Document

[Valid Link](<exists.md>)
[Invalid Link](<missing.md>)
[External Link](<https://example.com>)
    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(root);
        let result = lint(&rule, content);

        // Only missing.md is reported
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should mention missing.md"
        );
    }

    #[test]
    fn test_media_file_handling() {
        let sandbox = tempdir().unwrap();
        let root = sandbox.path();

        // Precondition: the media file must be absent from the scratch directory
        assert!(
            !root.join("image.jpg").exists(),
            "Test precondition failed: image.jpg should not exist"
        );

        let content = "[Media Link](image.jpg)";

        // Default configuration: media links are not checked
        let rule_skip_media = MD057ExistingRelativeLinks::new().with_path(root);
        let result_skip = lint(&rule_skip_media, content);
        assert_eq!(
            result_skip.len(),
            0,
            "Should have no warnings when skip_media_files is true"
        );

        // Media checking enabled: the missing file is reported
        let rule_check_all = MD057ExistingRelativeLinks::new()
            .with_path(root)
            .with_skip_media_files(false);
        let result_all = lint(&rule_check_all, content);
        assert_eq!(
            result_all.len(),
            1,
            "Should have one warning when skip_media_files is false"
        );
        assert!(
            result_all[0].message.contains("image.jpg"),
            "Warning should mention image.jpg"
        );
    }

    #[test]
    fn test_code_span_detection() {
        let sandbox = tempdir().unwrap();
        let rule = MD057ExistingRelativeLinks::new().with_path(sandbox.path());

        // One real link plus one link-shaped string inside a code span
        let content = "This is a [link](nonexistent.md) and `[not a link](not-checked.md)` in code.";
        let result = lint(&rule, content);

        // Only the real link is flagged
        assert_eq!(result.len(), 1, "Should only flag the real link");
        assert!(result[0].message.contains("nonexistent.md"));
    }

    #[test]
    fn test_inline_code_spans() {
        let sandbox = tempdir().unwrap();

        // Links both outside and inside inline code spans
        let content = r#"
# Test Document

This is a normal link: [Link](missing.md)

This is a code span with a link: `[Link](another-missing.md)`

Some more text with `inline code [Link](yet-another-missing.md) embedded`.

    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(sandbox.path());
        let result = lint(&rule, content);

        // Only the normal link is reported; links inside code spans are ignored
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should be for missing.md"
        );
        assert!(
            !result.iter().any(|w| w.message.contains("another-missing.md")),
            "Should not warn about link in code span"
        );
        assert!(
            !result.iter().any(|w| w.message.contains("yet-another-missing.md")),
            "Should not warn about link in inline code"
        );
    }
}