rumdl_lib/rules/
md057_existing_relative_links.rs

1//!
2//! Rule MD057: Existing relative links
3//!
4//! See [docs/md057.md](../../docs/md057.md) for full documentation, configuration, and examples.
5
6use crate::rule::{LintError, LintResult, LintWarning, Rule, RuleCategory, Severity};
7use crate::utils::element_cache::ElementCache;
8use lazy_static::lazy_static;
9use regex::Regex;
10use std::collections::HashMap;
11use std::env;
12use std::path::{Path, PathBuf};
13use std::sync::{Arc, Mutex};
14
15mod md057_config;
16use md057_config::MD057Config;
17
/// Process-wide memo of filesystem existence checks, keyed by the resolved path.
///
/// A `static` is already shared by every thread, so the map only needs a `Mutex`
/// for interior mutability; wrapping it in an `Arc` adds nothing. The map is
/// created lazily on first access.
static FILE_EXISTENCE_CACHE: std::sync::LazyLock<Mutex<HashMap<PathBuf, bool>>> =
    std::sync::LazyLock::new(|| Mutex::new(HashMap::new()));
22
23// Reset the file existence cache (typically between rule runs)
24fn reset_file_existence_cache() {
25    let mut cache = FILE_EXISTENCE_CACHE.lock().unwrap();
26    cache.clear();
27}
28
29// Check if a file exists with caching
30fn file_exists_with_cache(path: &Path) -> bool {
31    let mut cache = FILE_EXISTENCE_CACHE.lock().unwrap();
32    *cache.entry(path.to_path_buf()).or_insert_with(|| path.exists())
33}
34
35lazy_static! {
36    // Regex to match the start of a link - simplified for performance
37    static ref LINK_START_REGEX: Regex =
38        Regex::new(r"!?\[[^\]]*\]").unwrap();
39
40    /// Regex to extract the URL from a markdown link
41    /// Format: `](URL)` or `](URL "title")`
42    static ref URL_EXTRACT_REGEX: Regex =
43        Regex::new("\\]\\(\\s*<?([^>\\)\\s#]+)(#[^)\\s]*)?\\s*(?:\"[^\"]*\")?\\s*>?\\s*\\)").unwrap();
44
45    /// Regex to detect code fence blocks
46    static ref CODE_FENCE_REGEX: Regex =
47        Regex::new(r"^( {0,3})(`{3,}|~{3,})").unwrap();
48
49    /// Regex to detect protocol and domain for external links
50    static ref PROTOCOL_DOMAIN_REGEX: Regex =
51        Regex::new(r"^(https?://|ftp://|mailto:|www\.)").unwrap();
52
53    /// Regex to detect media file types
54    static ref MEDIA_FILE_REGEX: Regex =
55        Regex::new(r"\.(jpg|jpeg|png|gif|bmp|svg|webp|tiff|mp3|mp4|avi|mov|webm|wav|ogg|pdf)$").unwrap();
56
57    /// Regex to detect fragment-only links
58    static ref FRAGMENT_ONLY_REGEX: Regex =
59        Regex::new(r"^#").unwrap();
60
61    // Current working directory
62    static ref CURRENT_DIR: PathBuf = env::current_dir().unwrap_or_else(|_| PathBuf::from("."));
63}
64
65/// Rule MD057: Existing relative links should point to valid files or directories.
66#[derive(Debug, Default, Clone)]
67pub struct MD057ExistingRelativeLinks {
68    /// Base directory for resolving relative links
69    base_path: Arc<Mutex<Option<PathBuf>>>,
70    /// Configuration
71    config: MD057Config,
72}
73
74impl MD057ExistingRelativeLinks {
75    /// Create a new instance with default settings
76    pub fn new() -> Self {
77        Self::default()
78    }
79
80    /// Set the base path for resolving relative links
81    pub fn with_path<P: AsRef<Path>>(self, path: P) -> Self {
82        let path = path.as_ref();
83        let dir_path = if path.is_file() {
84            path.parent().map(|p| p.to_path_buf())
85        } else {
86            Some(path.to_path_buf())
87        };
88
89        *self.base_path.lock().unwrap() = dir_path;
90        self
91    }
92
93    /// Configure whether to skip checking media files
94    pub fn with_skip_media_files(mut self, skip_media_files: bool) -> Self {
95        self.config.skip_media_files = skip_media_files;
96        self
97    }
98
99    pub fn from_config_struct(config: MD057Config) -> Self {
100        Self {
101            base_path: Arc::new(Mutex::new(None)),
102            config,
103        }
104    }
105
106    /// Check if a URL is external (optimized version)
107    #[inline]
108    fn is_external_url(&self, url: &str) -> bool {
109        if url.is_empty() {
110            return false;
111        }
112
113        // Quick checks for common external URL patterns
114        if PROTOCOL_DOMAIN_REGEX.is_match(url) || url.starts_with("www.") {
115            return true;
116        }
117
118        // More restrictive domain check using a simpler pattern
119        if !self.is_media_file(url) && url.ends_with(".com") {
120            return true;
121        }
122
123        // Absolute paths within the site are not external
124        if url.starts_with('/') {
125            return false;
126        }
127
128        // All other cases (relative paths, etc.) are not external
129        false
130    }
131
132    /// Check if the URL is a fragment-only link (internal document link)
133    #[inline]
134    fn is_fragment_only_link(&self, url: &str) -> bool {
135        url.starts_with('#')
136    }
137
138    /// Check if the URL has a media file extension (optimized with early returns)
139    #[inline]
140    fn is_media_file(&self, url: &str) -> bool {
141        // Quick check before using regex
142        if !url.contains('.') {
143            return false;
144        }
145        MEDIA_FILE_REGEX.is_match(url)
146    }
147
148    /// Determine if we should skip checking this media file
149    #[inline]
150    fn should_skip_media_file(&self, url: &str) -> bool {
151        self.config.skip_media_files && self.is_media_file(url)
152    }
153
154    /// Resolve a relative link against the base path
155    fn resolve_link_path(&self, link: &str) -> Option<PathBuf> {
156        self.base_path
157            .lock()
158            .unwrap()
159            .as_ref()
160            .map(|base_path| base_path.join(link))
161    }
162
163    /// Process a single link and check if it exists
164    fn process_link(&self, url: &str, line_num: usize, column: usize, warnings: &mut Vec<LintWarning>) {
165        // Skip empty URLs
166        if url.is_empty() {
167            return;
168        }
169
170        // Skip external URLs and fragment-only links (optimized order)
171        if self.is_external_url(url) || self.is_fragment_only_link(url) {
172            return;
173        }
174
175        // Skip media files if configured to do so
176        if self.should_skip_media_file(url) {
177            return;
178        }
179
180        // Resolve the relative link against the base path
181        if let Some(resolved_path) = self.resolve_link_path(url) {
182            // Check if the file exists (with caching to avoid filesystem calls)
183            if !file_exists_with_cache(&resolved_path) {
184                warnings.push(LintWarning {
185                    rule_name: Some(self.name()),
186                    line: line_num,
187                    column,
188                    end_line: line_num,
189                    end_column: column + url.len(),
190                    message: format!("Relative link '{url}' does not exist"),
191                    severity: Severity::Warning,
192                    fix: None, // No automatic fix for missing files
193                });
194            }
195        }
196    }
197}
198
199impl Rule for MD057ExistingRelativeLinks {
200    fn name(&self) -> &'static str {
201        "MD057"
202    }
203
204    fn description(&self) -> &'static str {
205        "Relative links should point to existing files"
206    }
207
208    fn category(&self) -> RuleCategory {
209        RuleCategory::Link
210    }
211
212    fn should_skip(&self, ctx: &crate::lint_context::LintContext) -> bool {
213        let content = ctx.content;
214        content.is_empty() || !content.contains('[') || !content.contains("](")
215    }
216
217    fn check(&self, ctx: &crate::lint_context::LintContext) -> LintResult {
218        let content = ctx.content;
219
220        // Early returns for performance
221        if content.is_empty() || !content.contains('[') {
222            return Ok(Vec::new());
223        }
224
225        // Quick check for any potential links before expensive operations
226        if !content.contains("](") {
227            return Ok(Vec::new());
228        }
229
230        // Reset the file existence cache for a fresh run
231        reset_file_existence_cache();
232
233        let mut warnings = Vec::new();
234
235        // Cache base path lookup to avoid repeated mutex operations
236        let base_path = {
237            let base_path_guard = self.base_path.lock().unwrap();
238            if base_path_guard.is_some() {
239                base_path_guard.clone()
240            } else {
241                // Try to determine the base path from the file being processed (cached)
242                static CACHED_FILE_PATH: std::sync::OnceLock<Option<PathBuf>> = std::sync::OnceLock::new();
243                CACHED_FILE_PATH
244                    .get_or_init(|| {
245                        if let Ok(file_path) = env::var("RUMDL_FILE_PATH") {
246                            let path = Path::new(&file_path);
247                            if path.exists() {
248                                path.parent()
249                                    .map(|p| p.to_path_buf())
250                                    .or_else(|| Some(CURRENT_DIR.clone()))
251                            } else {
252                                Some(CURRENT_DIR.clone())
253                            }
254                        } else {
255                            Some(CURRENT_DIR.clone())
256                        }
257                    })
258                    .clone()
259            }
260        };
261
262        // If we still don't have a base path, we can't validate relative links
263        if base_path.is_none() {
264            return Ok(warnings);
265        }
266
267        // Use LintContext links instead of expensive regex parsing
268        if !ctx.links.is_empty() {
269            // Pre-compute line positions for efficient absolute position calculation
270            let mut line_positions = Vec::new();
271            let mut pos = 0;
272            line_positions.push(0);
273            for ch in content.chars() {
274                pos += ch.len_utf8();
275                if ch == '\n' {
276                    line_positions.push(pos);
277                }
278            }
279
280            // Create element cache once for all links
281            let element_cache = ElementCache::new(content);
282
283            // Pre-collect lines to avoid repeated line iteration
284            let lines: Vec<&str> = content.lines().collect();
285
286            for link in &ctx.links {
287                let line_idx = link.line - 1;
288                if line_idx >= lines.len() {
289                    continue;
290                }
291
292                let line = lines[line_idx];
293
294                // Quick check for link pattern in this line
295                if !line.contains("](") {
296                    continue;
297                }
298
299                // Find all links in this line using optimized regex
300                for link_match in LINK_START_REGEX.find_iter(line) {
301                    let start_pos = link_match.start();
302                    let end_pos = link_match.end();
303
304                    // Calculate absolute position efficiently using pre-computed positions
305                    let absolute_start_pos = if line_idx < line_positions.len() {
306                        line_positions[line_idx] + start_pos
307                    } else {
308                        // Fallback for edge cases
309                        content.lines().take(line_idx).map(|l| l.len() + 1).sum::<usize>() + start_pos
310                    };
311
312                    // Skip if this link is in a code span
313                    if element_cache.is_in_code_span(absolute_start_pos) {
314                        continue;
315                    }
316
317                    // Find the URL part after the link text
318                    if let Some(caps) = URL_EXTRACT_REGEX.captures_at(line, end_pos - 1)
319                        && let Some(url_group) = caps.get(1)
320                    {
321                        let url = url_group.as_str().trim();
322
323                        // Calculate column position
324                        let column = start_pos + 1;
325
326                        // Process and validate the link
327                        self.process_link(url, link.line, column, &mut warnings);
328                    }
329                }
330            }
331        }
332
333        Ok(warnings)
334    }
335
336    fn fix(&self, ctx: &crate::lint_context::LintContext) -> Result<String, LintError> {
337        Ok(ctx.content.to_string())
338    }
339
340    fn as_any(&self) -> &dyn std::any::Any {
341        self
342    }
343
344    fn default_config_section(&self) -> Option<(String, toml::Value)> {
345        let json_value = serde_json::to_value(&self.config).ok()?;
346        Some((
347            self.name().to_string(),
348            crate::rule_config_serde::json_to_toml_value(&json_value)?,
349        ))
350    }
351
352    fn from_config(config: &crate::config::Config) -> Box<dyn Rule>
353    where
354        Self: Sized,
355    {
356        let rule_config = crate::rule_config_serde::load_rule_config::<MD057Config>(config);
357        Box::new(Self::from_config_struct(rule_config))
358    }
359}
360
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Lint `content` with the standard flavor and return the rule's warnings.
    fn run_check(rule: &MD057ExistingRelativeLinks, content: &str) -> Vec<crate::rule::LintWarning> {
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);
        rule.check(&ctx).unwrap()
    }

    /// Create `name` inside `dir` with a small Markdown body and return its path.
    fn create_markdown_file(dir: &std::path::Path, name: &str) -> std::path::PathBuf {
        let target = dir.join(name);
        File::create(&target).unwrap().write_all(b"# Test File").unwrap();
        target
    }

    #[test]
    fn test_external_urls() {
        let rule = MD057ExistingRelativeLinks::new();

        let external = [
            "https://example.com",
            "http://example.com",
            "ftp://example.com",
            "www.example.com",
            "example.com",
        ];
        for url in external {
            assert!(rule.is_external_url(url));
        }

        let relative = ["./relative/path.md", "relative/path.md", "../parent/path.md"];
        for url in relative {
            assert!(!rule.is_external_url(url));
        }
    }

    #[test]
    fn test_media_files() {
        // Rule built with the default configuration.
        let rule_default = MD057ExistingRelativeLinks::new();

        // Names that must be recognised as media.
        for name in ["image.jpg", "video.mp4", "document.pdf", "path/to/audio.mp3"] {
            assert!(
                rule_default.is_media_file(name),
                "{name} should be identified as a media file"
            );
        }

        // Names that must not be recognised as media.
        for name in ["document.md", "code.rs"] {
            assert!(
                !rule_default.is_media_file(name),
                "{name} should not be identified as a media file"
            );
        }

        // Skipping behaviour with the default configuration.
        assert!(
            rule_default.should_skip_media_file("image.jpg"),
            "image.jpg should be skipped with default settings"
        );
        assert!(
            !rule_default.should_skip_media_file("document.md"),
            "document.md should not be skipped"
        );

        // Skipping disabled explicitly.
        let rule_no_skip = MD057ExistingRelativeLinks::new().with_skip_media_files(false);
        assert!(
            !rule_no_skip.should_skip_media_file("image.jpg"),
            "image.jpg should not be skipped when skip_media_files is false"
        );
    }

    #[test]
    fn test_no_warnings_without_base_path() {
        let rule = MD057ExistingRelativeLinks::new();

        let result = run_check(&rule, "[Link](missing.md)");

        assert!(result.is_empty(), "Should have no warnings without base path");
    }

    #[test]
    fn test_existing_and_missing_links() {
        // Scratch directory holding one real file.
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();
        let exists_path = create_markdown_file(base_path, "exists.md");
        assert!(exists_path.exists(), "exists.md should exist for this test");

        // One valid, one missing, one external and one media link.
        let content = r#"
# Test Document

[Valid Link](exists.md)
[Invalid Link](missing.md)
[External Link](https://example.com)
[Media Link](image.jpg)
        "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let ctx = crate::lint_context::LintContext::new(content, crate::config::MarkdownFlavor::Standard);

        // Only missing.md is reported; the media link is not.
        let result = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), 1);
        assert!(result[0].message.contains("missing.md"));

        // A second run over the same context gives the same outcome.
        let result_with_structure = rule.check(&ctx).unwrap();
        assert_eq!(result.len(), result_with_structure.len());
        assert!(result_with_structure[0].message.contains("missing.md"));
    }

    #[test]
    fn test_angle_bracket_links() {
        // Scratch directory holding one real file.
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();
        create_markdown_file(base_path, "exists.md");

        // Destinations wrapped in angle brackets.
        let content = r#"
# Test Document

[Valid Link](<exists.md>)
[Invalid Link](<missing.md>)
[External Link](<https://example.com>)
    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result = run_check(&rule, content);

        // Only missing.md is reported.
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should mention missing.md"
        );
    }

    #[test]
    fn test_media_file_handling() {
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();

        // Precondition: the linked image is absent from the scratch directory.
        assert!(
            !base_path.join("image.jpg").exists(),
            "Test precondition failed: image.jpg should not exist"
        );

        let content = "[Media Link](image.jpg)";

        // Default configuration: the media link is not reported.
        let rule_skip_media = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result_skip = run_check(&rule_skip_media, content);
        assert_eq!(
            result_skip.len(),
            0,
            "Should have no warnings when skip_media_files is true"
        );

        // Media skipping disabled: the missing image is reported.
        let rule_check_all = MD057ExistingRelativeLinks::new()
            .with_path(base_path)
            .with_skip_media_files(false);
        let result_all = run_check(&rule_check_all, content);
        assert_eq!(
            result_all.len(),
            1,
            "Should have one warning when skip_media_files is false"
        );
        assert!(
            result_all[0].message.contains("image.jpg"),
            "Warning should mention image.jpg"
        );
    }

    #[test]
    fn test_code_span_detection() {
        let temp_dir = tempdir().unwrap();
        let rule = MD057ExistingRelativeLinks::new().with_path(temp_dir.path());

        // One real link and one link-shaped text inside a code span.
        let content = "This is a [link](nonexistent.md) and `[not a link](not-checked.md)` in code.";
        let result = run_check(&rule, content);

        // Only the real link is reported.
        assert_eq!(result.len(), 1, "Should only flag the real link");
        assert!(result[0].message.contains("nonexistent.md"));
    }

    #[test]
    fn test_inline_code_spans() {
        let temp_dir = tempdir().unwrap();
        let base_path = temp_dir.path();

        // A normal link plus link-shaped text inside inline code spans.
        let content = r#"
# Test Document

This is a normal link: [Link](missing.md)

This is a code span with a link: `[Link](another-missing.md)`

Some more text with `inline code [Link](yet-another-missing.md) embedded`.

    "#;

        let rule = MD057ExistingRelativeLinks::new().with_path(base_path);
        let result = run_check(&rule, content);

        // Only the normal link is reported.
        assert_eq!(result.len(), 1, "Should have exactly one warning");
        assert!(
            result[0].message.contains("missing.md"),
            "Warning should be for missing.md"
        );
        let mentions = |needle: &str| result.iter().any(|w| w.message.contains(needle));
        assert!(
            !mentions("another-missing.md"),
            "Should not warn about link in code span"
        );
        assert!(
            !mentions("yet-another-missing.md"),
            "Should not warn about link in inline code"
        );
    }
}