use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::time::Instant;
use ignore::WalkBuilder;
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
use crate::markdown::heading_to_anchor;
/// Options that control a link-check run.
#[derive(Debug, Clone, Default)]
pub struct CheckOpts {
/// Request checking of external links. `check_dir` currently only prints a
/// "not yet implemented" note when this is set; HTTP(S) links are skipped
/// regardless of the value.
pub check_external: bool,
}
/// A single link that failed validation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BrokenLink {
/// 1-based line number on which the link starts.
pub line: u32,
/// Human-readable description of the failure, e.g. `missing file <target>`.
pub reason: String,
/// The link destination exactly as it was extracted from the markdown.
pub raw_target: String,
}
/// All broken links found in one markdown file.
#[derive(Debug, Clone)]
pub struct FileReport {
/// Path of the file; `check_dir` stores it relative to the scanned root
/// whenever the root is a prefix of the file's path.
pub path: PathBuf,
/// Broken links found in the file, in the order they were detected.
pub broken: Vec<BrokenLink>,
}
/// Aggregated result of checking a directory tree.
#[derive(Debug)]
pub struct CheckReport {
/// One entry per file that has at least one broken link, sorted by path.
pub files: Vec<FileReport>,
/// Number of `.md` files discovered under the root.
pub files_scanned: usize,
/// Total number of broken links across all files.
pub broken_count: usize,
/// Wall-clock time the check took.
pub elapsed: std::time::Duration,
}
impl CheckReport {
pub fn is_clean(&self) -> bool {
self.broken_count == 0
}
pub fn print(&self, root: &Path) {
println!("Checking links in {} ...\n", root.display());
for file_report in &self.files {
println!("{}:", file_report.path.display());
for broken in &file_report.broken {
println!(
" line {}: {} [{}]",
broken.line, broken.reason, broken.raw_target
);
}
println!();
}
let secs = self.elapsed.as_secs_f64();
if self.broken_count == 0 {
println!(
"All links OK. Scanned {} file(s) in {:.2}s.",
self.files_scanned, secs
);
} else {
let file_count = self.files.len();
println!(
"{} broken link(s) across {} file(s) ({} .md files scanned in {:.2}s).",
self.broken_count, file_count, self.files_scanned, secs
);
}
}
}
pub fn check_dir(root: &Path, opts: &CheckOpts) -> CheckReport {
let started = Instant::now();
if opts.check_external {
eprintln!("note: external link checking is not yet implemented; skipping HTTP(S) links.");
}
let md_paths = collect_md_files(root);
let files_scanned = md_paths.len();
let anchor_index: HashMap<PathBuf, HashSet<String>> = md_paths
.iter()
.map(|p| {
let anchors = parse_anchors_from_file(p);
(p.clone(), anchors)
})
.collect();
let mut file_reports: Vec<FileReport> = Vec::new();
let mut total_broken = 0usize;
for abs_path in &md_paths {
let content = match std::fs::read_to_string(abs_path) {
Ok(c) => c,
Err(_) => continue,
};
let broken = validate_links(abs_path, &content, &anchor_index);
if !broken.is_empty() {
total_broken += broken.len();
let rel_path = abs_path
.strip_prefix(root)
.unwrap_or(abs_path)
.to_path_buf();
file_reports.push(FileReport {
path: rel_path,
broken,
});
}
}
file_reports.sort_by(|a, b| a.path.cmp(&b.path));
CheckReport {
files: file_reports,
files_scanned,
broken_count: total_broken,
elapsed: started.elapsed(),
}
}
/// Walks `root` with `WalkBuilder` and returns, in sorted order, every regular
/// file whose extension is exactly `md`. Entries the walker fails to yield are
/// dropped.
fn collect_md_files(root: &Path) -> Vec<PathBuf> {
    let mut found: Vec<PathBuf> = WalkBuilder::new(root)
        .build()
        .flatten()
        .map(|entry| entry.into_path())
        .filter(|candidate| {
            candidate.is_file() && candidate.extension().is_some_and(|ext| ext == "md")
        })
        .collect();
    found.sort();
    found
}
/// Reads `path` and returns the heading anchors it defines; an unreadable file
/// yields an empty set.
fn parse_anchors_from_file(path: &Path) -> HashSet<String> {
    std::fs::read_to_string(path)
        .map(|text| parse_anchors(&text))
        .unwrap_or_default()
}
/// Collects the anchor slug of every heading in `content`.
///
/// The text of a heading is the concatenation of its `Text` and `Code`
/// events; the slug is produced by `heading_to_anchor`.
fn parse_anchors(content: &str) -> HashSet<String> {
    let md_options = Options::ENABLE_TABLES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS
        | Options::ENABLE_MATH;

    let mut slugs = HashSet::new();
    // `Some(buffer)` while inside a heading, `None` otherwise.
    let mut open_heading: Option<String> = None;

    for event in Parser::new_ext(content, md_options) {
        match event {
            Event::Start(Tag::Heading { .. }) => {
                open_heading = Some(String::new());
            }
            Event::End(TagEnd::Heading(_)) => {
                if let Some(title) = open_heading.take() {
                    slugs.insert(heading_to_anchor(&title));
                }
            }
            Event::Text(piece) | Event::Code(piece) => {
                if let Some(buffer) = open_heading.as_mut() {
                    buffer.push_str(&piece);
                }
            }
            _ => {}
        }
    }
    slugs
}
/// How a link destination should be validated.
#[derive(Debug)]
enum LinkKind {
    /// `#fragment`: an anchor inside the file that contains the link.
    SameFileAnchor(String),
    /// A path to another file, joined onto the linking file's directory.
    CrossFile(PathBuf),
    /// A path to another file plus the fragment to look up inside it.
    CrossFileAnchor(PathBuf, String),
    /// An `http:`/`https:` URL or a protocol-relative `//host/...` reference.
    External,
    /// Any other URI (`mailto:`, `tel:`, `ftp://`, ...).
    Ignored,
}

/// Returns the URI scheme of `url` (the part before the first `:`) if it
/// matches the RFC 3986 grammar `ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )`.
///
/// Single-letter candidates are rejected: `C:\docs\a.md` is far more likely a
/// Windows drive path than a URI.
fn uri_scheme(url: &str) -> Option<&str> {
    let (candidate, _) = url.split_once(':')?;
    let mut chars = candidate.chars();
    let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    let rest_is_valid = chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'));
    (starts_with_letter && rest_is_valid && candidate.len() > 1).then_some(candidate)
}

/// Value of an ASCII hex digit, or `None` for a missing or non-hex byte.
fn hex_value(byte: Option<&u8>) -> Option<u8> {
    byte.and_then(|b| (*b as char).to_digit(16)).map(|d| d as u8)
}

/// Decodes `%XX` escapes in `input`. Malformed escapes are kept verbatim, and
/// the original text is returned unchanged if decoding would not be UTF-8.
fn percent_decode(input: &str) -> String {
    let bytes = input.as_bytes();
    let mut decoded = Vec::with_capacity(bytes.len());
    let mut i = 0;
    while i < bytes.len() {
        if bytes[i] == b'%' {
            if let (Some(hi), Some(lo)) = (hex_value(bytes.get(i + 1)), hex_value(bytes.get(i + 2)))
            {
                decoded.push((hi << 4) | lo);
                i += 3;
                continue;
            }
        }
        decoded.push(bytes[i]);
        i += 1;
    }
    String::from_utf8(decoded).unwrap_or_else(|_| input.to_owned())
}

/// Classifies a markdown link destination.
///
/// * `#frag` is a same-file anchor.
/// * `http:`/`https:` (any letter case) and `//host/...` are external.
/// * Any other URI scheme, or anything containing `://`, is ignored.
/// * Everything else is a file path relative to `file_dir`, optionally
///   followed by `#fragment`. The path part is percent-decoded so that
///   `my%20file.md` resolves to `my file.md`; the fragment is passed through
///   as written.
fn classify_url(url: &str, file_dir: &Path) -> LinkKind {
    if let Some(fragment) = url.strip_prefix('#') {
        return LinkKind::SameFileAnchor(fragment.to_string());
    }
    // Network-path reference: inherits the scheme of the hosting page.
    if url.starts_with("//") {
        return LinkKind::External;
    }
    if let Some(scheme) = uri_scheme(url) {
        let is_http = scheme.eq_ignore_ascii_case("http") || scheme.eq_ignore_ascii_case("https");
        return if is_http {
            LinkKind::External
        } else {
            LinkKind::Ignored
        };
    }
    if url.contains("://") {
        return LinkKind::Ignored;
    }

    let (path_part, fragment) = match url.split_once('#') {
        Some((path, frag)) => (path, Some(frag)),
        None => (url, None),
    };
    let target = file_dir.join(percent_decode(path_part));
    match fragment {
        Some(frag) => LinkKind::CrossFileAnchor(target, frag.to_string()),
        None => LinkKind::CrossFile(target),
    }
}
// A link as extracted from markdown, before classification.
struct RawLink {
// Destination URL exactly as the markdown parser reported it.
url: String,
// 1-based line number on which the link starts.
line: u32,
}
/// Returns every markdown link in `content`, in the order the links close,
/// with the 1-based line on which each link starts.
fn extract_links(content: &str) -> Vec<RawLink> {
    let line_starts = build_line_starts(content);
    let md_options = Options::ENABLE_TABLES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS
        | Options::ENABLE_MATH;

    let mut found = Vec::new();
    // The link whose start tag has been seen but whose end tag has not.
    let mut pending: Option<RawLink> = None;

    for (event, span) in Parser::new_ext(content, md_options).into_offset_iter() {
        match event {
            Event::End(TagEnd::Link) => found.extend(pending.take()),
            Event::Start(Tag::Link { dest_url, .. }) => {
                pending = Some(RawLink {
                    line: byte_offset_to_line(span.start, &line_starts),
                    url: dest_url.into_string(),
                });
            }
            _ => {}
        }
    }
    found
}
/// Returns the byte offset at which each line of `content` begins: offset 0,
/// then the position just past every `\n`.
fn build_line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(content.match_indices('\n').map(|(newline_at, _)| newline_at + 1))
        .collect()
}
/// Maps a byte `offset` to a 1-based line number, given the sorted byte
/// offsets at which lines start. An empty table maps everything to line 1.
fn byte_offset_to_line(offset: usize, line_starts: &[usize]) -> u32 {
    // Number of lines that start at or before `offset`.
    let starts_at_or_before = line_starts.partition_point(|&start| start <= offset);
    match starts_at_or_before {
        0 => 1,
        count => (count - 1) as u32 + 1,
    }
}
/// Validates every link in `content` (the text of the file at `abs_path`) and
/// returns the ones that are broken, in document order.
///
/// * Same-file anchors are checked against the headings of `content`.
/// * Cross-file links must point at an existing path.
/// * Cross-file anchors must additionally name a heading of the target; the
///   target's anchors come from `anchor_index`, or are parsed from disk when
///   the resolved path is not an index key.
/// * An empty fragment (`#` or `file.md#`) refers to the top of the document
///   and is accepted without a heading lookup.
/// * External and ignored links are never reported.
fn validate_links(
    abs_path: &Path,
    content: &str,
    anchor_index: &HashMap<PathBuf, HashSet<String>>,
) -> Vec<BrokenLink> {
    let file_dir = abs_path.parent().unwrap_or(Path::new("."));
    let self_anchors = parse_anchors(content);
    let mut broken = Vec::new();

    for raw in extract_links(content) {
        // `Some(label)` describes the failure; `None` means the link is fine.
        let problem = match classify_url(&raw.url, file_dir) {
            LinkKind::SameFileAnchor(anchor) => {
                let found = anchor.is_empty() || self_anchors.contains(&anchor);
                (!found).then_some("broken anchor")
            }
            LinkKind::CrossFile(target) => {
                (!normalize_path(&target).exists()).then_some("missing file")
            }
            LinkKind::CrossFileAnchor(target, anchor) => {
                let resolved = normalize_path(&target);
                if !resolved.exists() {
                    Some("missing file")
                } else {
                    // Borrow the indexed set instead of cloning it per link.
                    let found = anchor.is_empty()
                        || match anchor_index.get(&resolved) {
                            Some(known) => known.contains(&anchor),
                            None => parse_anchors_from_file(&resolved).contains(&anchor),
                        };
                    (!found).then_some("broken cross-file anchor")
                }
            }
            LinkKind::External | LinkKind::Ignored => None,
        };

        if let Some(label) = problem {
            broken.push(BrokenLink {
                line: raw.line,
                reason: format!("{} {}", label, raw.url),
                raw_target: raw.url,
            });
        }
    }
    broken
}
/// Lexically normalizes `path`: drops `.` components and resolves `..`
/// against the preceding normal component, without touching the filesystem
/// (symlinks are not followed).
///
/// * Leading `..` components that cannot be resolved are kept, so
///   `../../x` stays `../../x`.
/// * `..` directly under the root is dropped (`/..` is `/`).
/// * A path that cancels out completely (e.g. `docs/..`) becomes `.` rather
///   than the empty path, for which `exists()` would always be false.
fn normalize_path(path: &Path) -> PathBuf {
    use std::path::Component;

    let mut out = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                // `PathBuf::pop` would also remove a trailing `..`, so look at
                // what the last component is before deciding.
                let tail = out.components().next_back();
                let can_pop = matches!(tail, Some(Component::Normal(_)));
                let at_root = matches!(tail, Some(Component::RootDir));
                if can_pop {
                    out.pop();
                } else if !at_root {
                    out.push(component);
                }
            }
            other => out.push(other),
        }
    }
    if out.as_os_str().is_empty() {
        out.push(".");
    }
    out
}
// Unit and end-to-end tests. The `check_dir` tests run against real files
// written into a fresh temporary directory.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use tempfile::TempDir;
// Writes each `(relative path, content)` pair under a new temp dir, creating
// parent directories as needed. The `TempDir` guard is returned alongside the
// root path so the caller can keep it in scope while the test uses the files.
fn make_temp_dir(files: &[(&str, &str)]) -> (TempDir, PathBuf) {
let dir = tempfile::tempdir().expect("failed to create tempdir");
let root = dir.path().to_path_buf();
for (name, content) in files {
let path = root.join(name);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).expect("failed to create subdir");
}
fs::write(&path, content).expect("failed to write test file");
}
(dir, root)
}
// Headings become slugs; the second assertion expects "API v2.0" to map to
// "api-v20".
#[test]
fn parse_anchors_extracts_heading_slugs() {
let content = "# Hello World\n\n## API v2.0\n\nsome text\n";
let anchors = parse_anchors(content);
assert!(anchors.contains("hello-world"), "expected 'hello-world'");
assert!(anchors.contains("api-v20"), "expected 'api-v20'");
}
// A `#fragment` link that matches a heading in the same file is accepted.
#[test]
fn valid_internal_anchor_passes() {
let (_dir, root) = make_temp_dir(&[("doc.md", "# Title\n\n[link](#title)\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 0, "expected no broken links");
}
// A `#fragment` link with no matching heading is reported with its raw target.
#[test]
fn broken_internal_anchor_reported() {
let (_dir, root) = make_temp_dir(&[("doc.md", "# Title\n\n[link](#nonexistent)\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
assert_eq!(report.files[0].broken[0].raw_target, "#nonexistent");
}
// A relative link to an existing sibling file is accepted.
#[test]
fn valid_cross_file_link_passes() {
let (_dir, root) = make_temp_dir(&[("a.md", "[link](./b.md)\n"), ("b.md", "# B file\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 0, "expected no broken links");
}
// A relative link to a file that does not exist is reported.
#[test]
fn missing_file_reported() {
let (_dir, root) = make_temp_dir(&[("a.md", "[link](./nonexistent.md)\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
assert_eq!(report.files[0].broken[0].raw_target, "./nonexistent.md");
}
// `file#fragment` is accepted when the target file defines that heading.
#[test]
fn cross_file_with_valid_anchor_passes() {
let (_dir, root) = make_temp_dir(&[
("a.md", "[link](./b.md#real-section)\n"),
("b.md", "# Real Section\n\nsome content.\n"),
]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 0, "expected no broken links");
}
// `file#fragment` is reported when the file exists but the heading does not.
#[test]
fn cross_file_with_bad_anchor_reported() {
let (_dir, root) = make_temp_dir(&[
("a.md", "[link](./b.md#fake)\n"),
("b.md", "# Real Section\n\nsome content.\n"),
]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
assert!(
report.files[0].broken[0].raw_target.contains("#fake"),
"raw_target should contain #fake"
);
}
// HTTP(S) links are not validated, so they never count as broken.
#[test]
fn external_link_skipped_silently_when_check_external_off() {
let (_dir, root) = make_temp_dir(&[("doc.md", "[link](https://example.com)\n")]);
let report = check_dir(
&root,
&CheckOpts {
check_external: false,
},
);
assert_eq!(report.broken_count, 0, "external links must be skipped");
}
// `..` cancels the preceding directory component.
#[test]
fn normalize_path_resolves_parent_components() {
let p = PathBuf::from("/tmp/docs/../other.md");
assert_eq!(normalize_path(&p), PathBuf::from("/tmp/other.md"));
}
// Offsets 0..=3 lie on line 1 (byte 3 is its '\n'); offsets 4..=7 lie on
// line 2 (byte 7 is its '\n').
#[test]
fn byte_offset_to_line_maps_correctly() {
let content = "abc\ndef\n";
let starts = build_line_starts(content);
assert_eq!(byte_offset_to_line(0, &starts), 1);
assert_eq!(byte_offset_to_line(3, &starts), 1); assert_eq!(byte_offset_to_line(4, &starts), 2); assert_eq!(byte_offset_to_line(7, &starts), 2);
}
}