mdbook_lint_core/rules/
mdbook005.rs

1//! MDBOOK005: Detect orphaned markdown files not referenced in SUMMARY.md
2//!
3//! This rule finds markdown files in the project that are not referenced in SUMMARY.md.
4//! Orphaned files can indicate incomplete documentation or forgotten content.
5
6use crate::rule::{Rule, RuleCategory, RuleMetadata};
7use crate::{
8    Document,
9    violation::{Severity, Violation},
10};
11use std::collections::HashSet;
12use std::path::{Path, PathBuf};
13use std::{fs, io};
14
/// MDBOOK005: Detect orphaned markdown files not referenced in SUMMARY.md
///
/// This rule checks for markdown files in the project that are not referenced
/// in SUMMARY.md. Such files are "orphaned" and won't be included in the
/// generated book, which may indicate:
/// - Incomplete documentation structure
/// - Forgotten content that should be added to the book
/// - Old files that should be removed
///
/// The rule:
/// - Only runs on SUMMARY.md files
/// - Parses all chapter references in SUMMARY.md
/// - Scans for `.md` and `.markdown` files below the SUMMARY.md directory
/// - Reports files that exist but aren't referenced
/// - Ignores common files like README.md by default
/// - Supports additional ignored file names (exact file-name match,
///   case-insensitive; these are names, not glob patterns)
#[derive(Debug, Clone)]
pub struct MDBOOK005 {
    /// Lower-cased file names (no directory component) that are never
    /// reported as orphans. Lookups lower-case the candidate file name first,
    /// so every entry stored here must already be lower-case.
    ignored_files: HashSet<String>,
}
35
36impl Default for MDBOOK005 {
37    fn default() -> Self {
38        let mut ignored_files = HashSet::new();
39        // Common files that are typically not in SUMMARY.md
40        ignored_files.insert("readme.md".to_string());
41        ignored_files.insert("contributing.md".to_string());
42        ignored_files.insert("license.md".to_string());
43        ignored_files.insert("changelog.md".to_string());
44        ignored_files.insert("summary.md".to_string()); // Don't report SUMMARY.md itself
45
46        Self { ignored_files }
47    }
48}
49
50impl MDBOOK005 {
51    /// Create a new instance with custom ignored files (in addition to defaults)
52    pub fn with_ignored_files(additional_ignored: Vec<String>) -> Self {
53        let mut instance = Self::default();
54        for file in additional_ignored {
55            instance.ignored_files.insert(file.to_lowercase());
56        }
57        instance
58    }
59
60    /// Add a file to the ignore list
61    pub fn ignore_file(&mut self, filename: &str) {
62        self.ignored_files.insert(filename.to_lowercase());
63    }
64}
65
66impl Rule for MDBOOK005 {
67    fn id(&self) -> &'static str {
68        "MDBOOK005"
69    }
70
71    fn name(&self) -> &'static str {
72        "orphaned-files"
73    }
74
75    fn description(&self) -> &'static str {
76        "Detect orphaned markdown files not referenced in SUMMARY.md"
77    }
78
79    fn metadata(&self) -> RuleMetadata {
80        RuleMetadata::stable(RuleCategory::MdBook).introduced_in("mdbook-lint v0.2.0")
81    }
82
83    fn check_with_ast<'a>(
84        &self,
85        document: &Document,
86        _ast: Option<&'a comrak::nodes::AstNode<'a>>,
87    ) -> crate::error::Result<Vec<Violation>> {
88        let mut violations = Vec::new();
89
90        // Only check SUMMARY.md files
91        if !is_summary_file(document) {
92            return Ok(violations);
93        }
94
95        // Find the project root (directory containing SUMMARY.md)
96        let project_root = if document.path.is_absolute() {
97            document.path.parent().unwrap_or(Path::new("."))
98        } else {
99            // If path is relative, use current directory
100            Path::new(".")
101        };
102
103        // Parse referenced files from SUMMARY.md
104        let referenced_files = match self.parse_referenced_files(document) {
105            Ok(files) => files,
106            Err(_) => {
107                // If we can't parse SUMMARY.md, we can't check for orphans
108                return Ok(violations);
109            }
110        };
111
112        // Find all markdown files in the project
113        let all_markdown_files = match self.find_markdown_files(project_root) {
114            Ok(files) => files,
115            Err(_) => {
116                // If we can't scan the directory, we can't check for orphans
117                return Ok(violations);
118            }
119        };
120
121        // Find orphaned files
122        let orphaned_files = self.find_orphaned_files(&referenced_files, &all_markdown_files);
123
124        // Create violations for each orphaned file
125        for orphaned_file in orphaned_files {
126            let relative_path = orphaned_file
127                .strip_prefix(project_root)
128                .unwrap_or(orphaned_file.as_path())
129                .to_string_lossy()
130                .replace('\\', "/") // Ensure consistent forward slashes for cross-platform compatibility
131                .to_string();
132
133            violations.push(self.create_violation(
134                format!("Orphaned file '{relative_path}' is not referenced in SUMMARY.md"),
135                1, // Report on line 1 of SUMMARY.md since it's a structural issue
136                1,
137                Severity::Warning,
138            ));
139        }
140
141        Ok(violations)
142    }
143}
144
145impl MDBOOK005 {
146    /// Parse all file paths referenced in SUMMARY.md
147    fn parse_referenced_files(
148        &self,
149        document: &Document,
150    ) -> Result<HashSet<PathBuf>, Box<dyn std::error::Error>> {
151        let mut referenced = HashSet::new();
152        let project_root = document.path.parent().unwrap_or(Path::new("."));
153
154        for line in &document.lines {
155            if let Some(path) = self.extract_file_path(line) {
156                // Resolve path relative to SUMMARY.md location
157                let absolute_path = project_root.join(&path);
158                if let Ok(canonical) = absolute_path.canonicalize() {
159                    referenced.insert(canonical);
160                } else {
161                    // If canonicalize fails, use the resolved path
162                    referenced.insert(absolute_path);
163                }
164            }
165        }
166
167        Ok(referenced)
168    }
169
170    /// Extract file path from a SUMMARY.md line if present
171    fn extract_file_path(&self, line: &str) -> Option<String> {
172        // Look for markdown link syntax: [title](path)
173        if let Some(start) = line.find("](") {
174            let after_bracket = &line[start + 2..];
175            if let Some(end) = after_bracket.find(')') {
176                let path = &after_bracket[..end];
177
178                // Skip empty paths (draft chapters) and external URLs
179                if path.is_empty() || path.starts_with("http://") || path.starts_with("https://") {
180                    return None;
181                }
182
183                // Remove anchor fragments
184                let path_without_anchor = path.split('#').next().unwrap_or(path);
185
186                // Only include markdown files
187                if path_without_anchor.ends_with(".md")
188                    || path_without_anchor.ends_with(".markdown")
189                {
190                    return Some(path_without_anchor.to_string());
191                }
192            }
193        }
194
195        None
196    }
197
198    /// Find all markdown files in the project directory
199    fn find_markdown_files(&self, project_root: &Path) -> io::Result<HashSet<PathBuf>> {
200        let mut markdown_files = HashSet::new();
201        scan_directory_recursive(project_root, &mut markdown_files)?;
202        Ok(markdown_files)
203    }
204
205    /// Find files that exist but are not referenced
206    fn find_orphaned_files(
207        &self,
208        referenced: &HashSet<PathBuf>,
209        all_files: &HashSet<PathBuf>,
210    ) -> Vec<PathBuf> {
211        all_files
212            .iter()
213            .filter(|&file| {
214                // Skip if file is referenced in SUMMARY.md
215                if referenced.contains(file) {
216                    return false;
217                }
218
219                // Skip files in our ignore list
220                if let Some(filename) = file.file_name().and_then(|n| n.to_str())
221                    && self.ignored_files.contains(&filename.to_lowercase())
222                {
223                    return false;
224                }
225
226                true
227            })
228            .cloned()
229            .collect()
230    }
231}
232
233/// Check if the document represents a SUMMARY.md file
234fn is_summary_file(document: &Document) -> bool {
235    document
236        .path
237        .file_name()
238        .and_then(|name| name.to_str())
239        .map(|name| name.eq_ignore_ascii_case("summary.md"))
240        .unwrap_or(false)
241}
242
/// Recursively collects the `.md` / `.markdown` files below `dir` into
/// `markdown_files`.
///
/// Files are stored canonicalized when possible and as the plain joined path
/// otherwise. Directories named `target`, `node_modules`, `.git`, `.svn` or
/// `.hg` are not entered. Each directory is visited at most once, identified
/// by its canonical path, so directory symlinks that point back into the tree
/// cannot cause repeated or runaway traversal.
///
/// # Errors
/// Returns the first I/O error raised while reading a directory or one of
/// its entries; files collected up to that point remain in `markdown_files`.
fn scan_directory_recursive(dir: &Path, markdown_files: &mut HashSet<PathBuf>) -> io::Result<()> {
    let mut visited_dirs = HashSet::new();
    scan_directory_tracked(dir, markdown_files, &mut visited_dirs)
}

/// Worker for [`scan_directory_recursive`] that records the directories it
/// has already entered in `visited_dirs`.
fn scan_directory_tracked(
    dir: &Path,
    markdown_files: &mut HashSet<PathBuf>,
    visited_dirs: &mut HashSet<PathBuf>,
) -> io::Result<()> {
    // The canonical path is the directory's identity: a symlink leading back
    // to an already-scanned directory resolves to the same key and is skipped.
    let identity = dir.canonicalize().unwrap_or_else(|_| dir.to_path_buf());
    if !visited_dirs.insert(identity) {
        return Ok(());
    }

    for entry in fs::read_dir(dir)? {
        let path = entry?.path();

        if path.is_dir() {
            // Skip common directories that shouldn't be scanned
            let skipped = path
                .file_name()
                .and_then(|name| name.to_str())
                .is_some_and(|name| {
                    matches!(name, "target" | "node_modules" | ".git" | ".svn" | ".hg")
                });
            if !skipped {
                scan_directory_tracked(&path, markdown_files, visited_dirs)?;
            }
        } else if path
            .extension()
            .and_then(|ext| ext.to_str())
            .is_some_and(|ext| matches!(ext, "md" | "markdown"))
        {
            let key = path.canonicalize().unwrap_or(path);
            markdown_files.insert(key);
        }
    }

    Ok(())
}
276
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Writes `content` to `file_path` (creating missing parent directories)
    /// and returns a `Document` built from the same content and path.
    fn create_test_document(content: &str, file_path: &Path) -> crate::error::Result<Document> {
        if let Some(parent_dir) = file_path.parent() {
            fs::create_dir_all(parent_dir)?;
        }
        fs::write(file_path, content)?;
        Document::new(content.to_string(), file_path.to_path_buf())
    }

    /// All files on disk are listed in SUMMARY.md, so nothing is reported.
    #[test]
    fn test_mdbook005_no_orphans() -> crate::error::Result<()> {
        let book = TempDir::new()?;
        let src = book.path();

        let summary = "# Summary\n\n[Introduction](intro.md)\n- [Chapter 1](chapter1.md)\n- [Chapter 2](chapter2.md)\n";
        let doc = create_test_document(summary, &src.join("SUMMARY.md"))?;

        // Referenced chapters
        for (file, body) in [
            ("intro.md", "# Intro"),
            ("chapter1.md", "# Chapter 1"),
            ("chapter2.md", "# Chapter 2"),
        ] {
            create_test_document(body, &src.join(file))?;
        }

        let found = MDBOOK005::default().check(&doc)?;

        assert_eq!(
            found.len(),
            0,
            "Should have no violations when all files are referenced"
        );
        Ok(())
    }

    /// Files missing from SUMMARY.md are each reported once.
    #[test]
    fn test_mdbook005_detect_orphans() -> crate::error::Result<()> {
        let book = TempDir::new()?;
        let src = book.path();

        let summary = "# Summary\n\n[Introduction](intro.md)\n- [Chapter 1](chapter1.md)\n";
        let doc = create_test_document(summary, &src.join("SUMMARY.md"))?;

        // Two referenced files followed by two orphans
        for (file, body) in [
            ("intro.md", "# Intro"),
            ("chapter1.md", "# Chapter 1"),
            ("orphan.md", "# Orphan"),
            ("another.md", "# Another"),
        ] {
            create_test_document(body, &src.join(file))?;
        }

        let found = MDBOOK005::default().check(&doc)?;

        assert_eq!(found.len(), 2, "Should detect 2 orphaned files");

        let mentions = |needle: &str| found.iter().any(|v| v.message.contains(needle));
        assert!(mentions("orphan.md"));
        assert!(mentions("another.md"));

        Ok(())
    }

    /// README/CONTRIBUTING/LICENSE are on the default ignore list.
    #[test]
    fn test_mdbook005_ignore_common_files() -> crate::error::Result<()> {
        let book = TempDir::new()?;
        let src = book.path();

        let summary = "# Summary\n\n- [Chapter 1](chapter1.md)\n";
        let doc = create_test_document(summary, &src.join("SUMMARY.md"))?;

        create_test_document("# Chapter 1", &src.join("chapter1.md"))?;

        // Files that should be ignored by default
        for (file, body) in [
            ("README.md", "# README"),
            ("CONTRIBUTING.md", "# Contributing"),
            ("LICENSE.md", "# License"),
        ] {
            create_test_document(body, &src.join(file))?;
        }

        let found = MDBOOK005::default().check(&doc)?;

        assert_eq!(found.len(), 0, "Should ignore common files like README.md");
        Ok(())
    }

    /// Orphans inside subdirectories are found and reported with their
    /// directory component.
    #[test]
    fn test_mdbook005_nested_directories() -> crate::error::Result<()> {
        let book = TempDir::new()?;
        let src = book.path();

        let summary = "# Summary\n\n- [Chapter 1](guide/chapter1.md)\n";
        let doc = create_test_document(summary, &src.join("SUMMARY.md"))?;

        // One referenced and one orphaned file in the same nested directory
        create_test_document("# Chapter 1", &src.join("guide/chapter1.md"))?;
        create_test_document("# Orphan", &src.join("guide/orphan.md"))?;

        let found = MDBOOK005::default().check(&doc)?;

        assert_eq!(
            found.len(),
            1,
            "Should detect orphaned files in subdirectories"
        );
        assert!(found[0].message.contains("guide/orphan.md"));
        Ok(())
    }

    /// Draft chapters (empty link target) neither hide orphans nor cause
    /// reports of their own.
    #[test]
    fn test_mdbook005_draft_chapters() -> crate::error::Result<()> {
        let book = TempDir::new()?;
        let src = book.path();

        let summary = "# Summary\n\n- [Chapter 1](chapter1.md)\n- [Draft Chapter]()\n";
        let doc = create_test_document(summary, &src.join("SUMMARY.md"))?;

        create_test_document("# Chapter 1", &src.join("chapter1.md"))?;
        create_test_document("# Orphan", &src.join("orphan.md"))?;

        let found = MDBOOK005::default().check(&doc)?;

        // Should still detect the orphan, but not complain about the draft
        assert_eq!(found.len(), 1);
        assert!(found[0].message.contains("orphan.md"));
        Ok(())
    }

    /// The rule is a no-op for documents that are not SUMMARY.md.
    #[test]
    fn test_mdbook005_non_summary_files() -> crate::error::Result<()> {
        let book = TempDir::new()?;

        let readme_path = book.path().join("README.md");
        let doc = create_test_document("# Regular File", &readme_path)?;

        let found = MDBOOK005::default().check(&doc)?;

        assert_eq!(found.len(), 0, "Should not run on non-SUMMARY.md files");
        Ok(())
    }

    /// Table of SUMMARY.md lines and the path expected to be extracted.
    #[test]
    fn test_extract_file_path() {
        let rule = MDBOOK005::default();

        let cases: [(&str, Option<&str>); 8] = [
            // Valid paths
            ("- [Chapter](chapter.md)", Some("chapter.md")),
            ("[Intro](intro.md)", Some("intro.md")),
            ("    - [Nested](sub/nested.md)", Some("sub/nested.md")),
            // Paths with anchors
            ("- [Link](file.md#section)", Some("file.md")),
            // Invalid or ignored paths
            ("- [Draft]()", None),
            ("- [External](https://example.com)", None),
            ("- [Non-MD](image.png)", None),
            ("Regular text", None),
        ];

        for (line, expected) in cases {
            assert_eq!(
                rule.extract_file_path(line),
                expected.map(str::to_string),
                "line: {line:?}"
            );
        }
    }

    /// Names passed to `with_ignored_files` are excluded from the report.
    #[test]
    fn test_custom_ignored_files() -> crate::error::Result<()> {
        let book = TempDir::new()?;
        let src = book.path();

        let summary = "# Summary\n\n- [Chapter 1](chapter1.md)\n";
        let doc = create_test_document(summary, &src.join("SUMMARY.md"))?;

        for (file, body) in [
            ("chapter1.md", "# Chapter 1"),
            ("custom.md", "# Custom"),
            ("orphan.md", "# Orphan"),
        ] {
            create_test_document(body, &src.join(file))?;
        }

        // Rule that additionally ignores custom.md
        let rule = MDBOOK005::with_ignored_files(vec!["custom.md".to_string()]);
        let found = rule.check(&doc)?;

        // Should only report orphan.md, not custom.md
        assert_eq!(found.len(), 1);
        assert!(found[0].message.contains("orphan.md"));
        assert!(!found[0].message.contains("custom.md"));
        Ok(())
    }
}
523        Ok(())
524    }
525}