use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::mpsc;
use std::time::{Duration, Instant};
use ignore::WalkBuilder;
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
use crate::markdown::heading_to_anchor;
/// Options that control a link-check run.
#[derive(Debug, Clone)]
pub struct CheckOpts {
    /// When `true`, `http://` and `https://` links are requested over the network.
    pub check_external: bool,
    /// Timeout for each external request, in seconds.
    pub external_timeout_secs: u64,
}

impl Default for CheckOpts {
    /// External checking disabled; ten-second timeout for when it is enabled.
    fn default() -> Self {
        CheckOpts {
            external_timeout_secs: 10,
            check_external: false,
        }
    }
}
/// One link that failed validation.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct BrokenLink {
/// 1-based line number on which the link starts in its source file.
pub line: u32,
/// Human-readable explanation of why the link is considered broken.
pub reason: String,
/// The link destination exactly as the Markdown parser reported it.
pub raw_target: String,
}
/// All broken links found in a single file.
#[derive(Debug, Clone)]
pub struct FileReport {
/// Path of the file; `check_dir` stores it relative to the scanned root
/// when the root is a prefix of the file's path.
pub path: PathBuf,
/// Broken links in this file; `check_dir` sorts them by line number.
pub broken: Vec<BrokenLink>,
}
/// Result of checking a directory tree.
#[derive(Debug)]
pub struct CheckReport {
/// Reports for the files that contain at least one broken link.
pub files: Vec<FileReport>,
/// Number of `.md` files discovered under the root.
pub files_scanned: usize,
/// Total number of broken links across all files.
pub broken_count: usize,
/// Wall-clock time the check took.
pub elapsed: std::time::Duration,
}
impl CheckReport {
pub fn is_clean(&self) -> bool {
self.broken_count == 0
}
pub fn print(&self, root: &Path) {
println!("Checking links in {} ...\n", root.display());
for file_report in &self.files {
println!("{}:", file_report.path.display());
for broken in &file_report.broken {
println!(
" line {}: {} [{}]",
broken.line, broken.reason, broken.raw_target
);
}
println!();
}
let secs = self.elapsed.as_secs_f64();
if self.broken_count == 0 {
println!(
"All links OK. Scanned {} file(s) in {:.2}s.",
self.files_scanned, secs
);
} else {
let file_count = self.files.len();
println!(
"{} broken link(s) across {} file(s) ({} .md files scanned in {:.2}s).",
self.broken_count, file_count, self.files_scanned, secs
);
}
}
}
pub fn check_dir(root: &Path, opts: &CheckOpts) -> CheckReport {
let started = Instant::now();
let md_paths = collect_md_files(root);
let files_scanned = md_paths.len();
let anchor_index: HashMap<PathBuf, HashSet<String>> = md_paths
.iter()
.map(|p| {
let anchors = parse_anchors_from_file(p);
(p.clone(), anchors)
})
.collect();
let mut external_map: HashMap<String, Vec<(PathBuf, u32)>> = HashMap::new();
if opts.check_external {
for abs_path in &md_paths {
let content = match std::fs::read_to_string(abs_path) {
Ok(c) => c,
Err(_) => continue,
};
for raw in extract_links(&content) {
if matches!(classify_url(&raw.url, abs_path.parent().unwrap_or(Path::new("."))), LinkKind::External) {
external_map
.entry(raw.url)
.or_default()
.push((abs_path.clone(), raw.line));
}
}
}
let ext_count = external_map.len();
if ext_count > 0 {
println!(
"Checking {} external link(s)... (this may take a few seconds)",
ext_count
);
}
}
let external_results: HashMap<String, Option<String>> = if opts.check_external && !external_map.is_empty() {
check_external_links(external_map.keys().cloned().collect(), opts)
} else {
HashMap::new()
};
let mut file_reports: Vec<FileReport> = Vec::new();
let mut total_broken = 0usize;
for abs_path in &md_paths {
let content = match std::fs::read_to_string(abs_path) {
Ok(c) => c,
Err(_) => continue,
};
let mut broken = validate_links(abs_path, &content, &anchor_index);
if opts.check_external {
for (url, occurrences) in &external_map {
let broken_reason = external_results
.get(url)
.and_then(|r| r.as_ref());
if let Some(reason) = broken_reason {
for (path, line) in occurrences {
if path == abs_path {
broken.push(BrokenLink {
line: *line,
reason: format!("{} [external]", reason),
raw_target: url.clone(),
});
}
}
}
}
}
broken.sort_by_key(|b| b.line);
if !broken.is_empty() {
total_broken += broken.len();
let rel_path = abs_path
.strip_prefix(root)
.unwrap_or(abs_path)
.to_path_buf();
file_reports.push(FileReport {
path: rel_path,
broken,
});
}
}
file_reports.sort_by(|a, b| a.path.cmp(&b.path));
CheckReport {
files: file_reports,
files_scanned,
broken_count: total_broken,
elapsed: started.elapsed(),
}
}
/// Result of probing one external URL.
#[derive(Debug)]
enum ExternalOutcome {
/// The request completed without an error.
Ok,
/// The request failed; the payload is a human-readable reason.
Broken(String),
}
/// Probes each URL in `urls` with a HEAD request and returns, per URL,
/// `None` when it is reachable or `Some(reason)` when it is broken.
///
/// URLs are processed in batches of at most `MAX_WORKERS` threads; a batch is
/// fully joined before the next one starts.
fn check_external_links(
    urls: Vec<String>,
    opts: &CheckOpts,
) -> HashMap<String, Option<String>> {
    const MAX_WORKERS: usize = 10;
    const MAX_REDIRECTS: u32 = 5;

    let timeout = Duration::from_secs(opts.external_timeout_secs);
    let (sender, receiver) = mpsc::channel::<(String, ExternalOutcome)>();

    std::thread::scope(|scope| {
        for batch in urls.chunks(MAX_WORKERS) {
            let mut workers = Vec::with_capacity(batch.len());
            for url in batch {
                let sender = sender.clone();
                let url = url.clone();
                workers.push(scope.spawn(move || {
                    let outcome = head_request(&url, timeout, MAX_REDIRECTS);
                    // The receiver outlives the scope, so a send error is not expected.
                    let _ = sender.send((url, outcome));
                }));
            }
            for worker in workers {
                let _ = worker.join();
            }
        }
    });
    // Drop the last sender so draining the receiver below terminates.
    drop(sender);

    receiver
        .into_iter()
        .map(|(url, outcome)| {
            let failure = match outcome {
                ExternalOutcome::Ok => None,
                ExternalOutcome::Broken(reason) => Some(reason),
            };
            (url, failure)
        })
        .collect()
}
fn head_request(url: &str, timeout: Duration, max_redirects: u32) -> ExternalOutcome {
use ureq::config::Config;
let agent: ureq::Agent = Config::builder()
.timeout_global(Some(timeout))
.max_redirects(max_redirects)
.build()
.into();
match agent.head(url).call() {
Ok(_response) => {
ExternalOutcome::Ok
}
Err(ureq::Error::StatusCode(code)) => {
let reason = http_status_reason(code);
ExternalOutcome::Broken(reason)
}
Err(ureq::Error::Timeout(_)) => {
ExternalOutcome::Broken("connection timeout".to_string())
}
Err(ureq::Error::HostNotFound) => {
ExternalOutcome::Broken("host not found (DNS failure)".to_string())
}
Err(ureq::Error::Io(e)) => {
ExternalOutcome::Broken(format!("connection error: {}", e))
}
Err(e) => ExternalOutcome::Broken(format!("request error: {}", e)),
}
}
/// Formats an HTTP status code as `"<code> <label>"`, using the generic
/// label `HTTP error` for codes without a dedicated entry.
fn http_status_reason(code: u16) -> String {
    const KNOWN_LABELS: &[(u16, &str)] = &[
        (400, "Bad Request"),
        (401, "Unauthorized"),
        (403, "Forbidden"),
        (404, "Not Found"),
        (405, "Method Not Allowed"),
        (408, "Request Timeout"),
        (410, "Gone"),
        (429, "Too Many Requests"),
        (500, "Internal Server Error"),
        (502, "Bad Gateway"),
        (503, "Service Unavailable"),
        (504, "Gateway Timeout"),
    ];

    let label = KNOWN_LABELS
        .iter()
        .find(|entry| entry.0 == code)
        .map_or("HTTP error", |entry| entry.1);
    format!("{} {}", code, label)
}
/// Walks `root` with `ignore::WalkBuilder` and returns every regular file
/// whose extension is exactly `md`, sorted by path. Entries the walker
/// reports as errors are skipped.
fn collect_md_files(root: &Path) -> Vec<PathBuf> {
    let mut found: Vec<PathBuf> = WalkBuilder::new(root)
        .build()
        .flatten()
        .map(|entry| entry.into_path())
        .filter(|candidate| {
            candidate.is_file() && candidate.extension().is_some_and(|ext| ext == "md")
        })
        .collect();
    found.sort();
    found
}
/// Reads `path` and returns the heading anchors it defines; an unreadable
/// file yields an empty set.
fn parse_anchors_from_file(path: &Path) -> HashSet<String> {
    std::fs::read_to_string(path)
        .map(|text| parse_anchors(&text))
        .unwrap_or_default()
}
/// Collects the anchor slug of every heading in `content`.
///
/// A heading's text is the concatenation of its `Text` and `Code` events,
/// converted to a slug by `heading_to_anchor`.
// NOTE(review): headings with identical text map to a single slug here; if
// the renderer disambiguates duplicates (e.g. `-1` suffixes), links to those
// would be reported as broken — confirm against `crate::markdown`.
fn parse_anchors(content: &str) -> HashSet<String> {
    let opts = Options::ENABLE_TABLES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS
        | Options::ENABLE_MATH;

    let mut anchors = HashSet::new();
    // `Some(buffer)` while inside a heading, `None` otherwise.
    let mut pending: Option<String> = None;

    for event in Parser::new_ext(content, opts) {
        match event {
            Event::Start(Tag::Heading { .. }) => {
                pending = Some(String::new());
            }
            Event::End(TagEnd::Heading(_)) => {
                if let Some(text) = pending.take() {
                    anchors.insert(heading_to_anchor(&text));
                }
            }
            Event::Text(fragment) | Event::Code(fragment) => {
                if let Some(buffer) = pending.as_mut() {
                    buffer.push_str(&fragment);
                }
            }
            _ => {}
        }
    }
    anchors
}
/// How a link destination should be validated.
#[derive(Debug)]
enum LinkKind {
    /// `#fragment` pointing into the linking file; holds the fragment without `#`.
    SameFileAnchor(String),
    /// A path to another file, joined onto the linking file's directory.
    CrossFile(PathBuf),
    /// A path plus fragment: the joined path and the fragment without `#`.
    CrossFileAnchor(PathBuf, String),
    /// An `http://` or `https://` URL.
    External,
    /// A destination that is not checked (other URI schemes, network paths).
    Ignored,
}

/// Returns `true` when `url` begins with an RFC 3986 scheme
/// (`ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) ":"`).
///
/// One-letter prefixes are rejected so that a Windows drive letter such as
/// `C:\docs` is not mistaken for a scheme.
fn has_uri_scheme(url: &str) -> bool {
    let Some((scheme, _rest)) = url.split_once(':') else {
        return false;
    };
    let mut chars = scheme.chars();
    let starts_with_letter = chars.next().is_some_and(|c| c.is_ascii_alphabetic());
    scheme.len() >= 2
        && starts_with_letter
        && chars.all(|c| c.is_ascii_alphanumeric() || matches!(c, '+' | '-' | '.'))
}

/// Classifies the link destination `url` found in a file located in `file_dir`.
///
/// * `#frag` is a same-file anchor.
/// * `http://` / `https://` is external.
/// * Any other URI scheme (`mailto:`, `tel:`, `data:`, `ftp://`, ...) and
///   protocol-relative `//host/...` references are ignored; they do not name
///   files on disk, so treating them as paths would report them as missing.
/// * Everything else is a path relative to `file_dir`, optionally followed by
///   `#fragment`.
fn classify_url(url: &str, file_dir: &Path) -> LinkKind {
    if let Some(fragment) = url.strip_prefix('#') {
        return LinkKind::SameFileAnchor(fragment.to_string());
    }
    if url.starts_with("http://") || url.starts_with("https://") {
        return LinkKind::External;
    }
    // `contains("://")` is kept so everything ignored before is still ignored.
    if url.contains("://") || url.starts_with("//") || has_uri_scheme(url) {
        return LinkKind::Ignored;
    }

    match url.split_once('#') {
        Some((path_part, fragment)) => {
            LinkKind::CrossFileAnchor(file_dir.join(path_part), fragment.to_string())
        }
        None => LinkKind::CrossFile(file_dir.join(url)),
    }
}
/// A link as found in the Markdown source, before classification.
struct RawLink {
/// Destination URL as reported by the parser.
url: String,
/// 1-based line on which the link starts.
line: u32,
}
/// Returns every Markdown link in `content` together with the 1-based line on
/// which it starts, in document order. Only `Tag::Link` events are collected.
fn extract_links(content: &str) -> Vec<RawLink> {
    let opts = Options::ENABLE_TABLES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS
        | Options::ENABLE_MATH;
    let line_starts = build_line_starts(content);

    let mut links = Vec::new();
    // The link whose start tag has been seen but whose end tag has not.
    let mut pending: Option<RawLink> = None;

    for (event, span) in Parser::new_ext(content, opts).into_offset_iter() {
        match event {
            Event::Start(Tag::Link { dest_url, .. }) => {
                pending = Some(RawLink {
                    url: dest_url.into_string(),
                    line: byte_offset_to_line(span.start, &line_starts),
                });
            }
            Event::End(TagEnd::Link) => links.extend(pending.take()),
            _ => {}
        }
    }
    links
}
/// Returns the byte offset at which each line of `content` begins: offset 0,
/// followed by the offset just past every `\n`.
fn build_line_starts(content: &str) -> Vec<usize> {
    std::iter::once(0)
        .chain(content.match_indices('\n').map(|(newline_at, _)| newline_at + 1))
        .collect()
}
/// Maps a byte `offset` to a 1-based line number, given the sorted byte
/// offsets at which lines start. Returns 1 when `line_starts` is empty.
fn byte_offset_to_line(offset: usize, line_starts: &[usize]) -> u32 {
    // Number of line starts located at or before `offset`.
    let starts_at_or_before = match line_starts.binary_search(&offset) {
        Ok(exact) => exact + 1,
        Err(insert_at) => insert_at,
    };
    let zero_based = starts_at_or_before.saturating_sub(1) as u32;
    zero_based + 1
}
/// Validates the internal links of one file and returns the broken ones in
/// document order.
///
/// * `abs_path` - path of the file being checked; relative links resolve
///   against its parent directory.
/// * `content` - the file's Markdown text.
/// * `anchor_index` - pre-parsed heading anchors keyed by file path. A target
///   missing from the index is parsed on demand.
///
/// An empty fragment (`#` or `file.md#`) addresses the top of the document
/// and is therefore never reported as a broken anchor. External and ignored
/// links are not examined here.
fn validate_links(
    abs_path: &Path,
    content: &str,
    anchor_index: &HashMap<PathBuf, HashSet<String>>,
) -> Vec<BrokenLink> {
    let file_dir = abs_path.parent().unwrap_or(Path::new("."));
    let self_anchors = parse_anchors(content);
    let mut broken = Vec::new();

    for raw in extract_links(content) {
        // `None` means the link is fine; `Some(reason)` describes the failure.
        let reason = match classify_url(&raw.url, file_dir) {
            LinkKind::SameFileAnchor(anchor) => {
                if anchor.is_empty() || self_anchors.contains(&anchor) {
                    None
                } else {
                    Some(format!("broken anchor {}", raw.url))
                }
            }
            LinkKind::CrossFile(target) => {
                if normalize_path(&target).exists() {
                    None
                } else {
                    Some(format!("missing file {}", raw.url))
                }
            }
            LinkKind::CrossFileAnchor(target, anchor) => {
                let resolved = normalize_path(&target);
                if !resolved.exists() {
                    Some(format!("missing file {}", raw.url))
                } else if anchor.is_empty() {
                    None
                } else {
                    // Borrow the indexed set when available instead of cloning
                    // it for every link; parse the target only on a miss.
                    let parsed_on_demand;
                    let anchors = match anchor_index.get(&resolved) {
                        Some(indexed) => indexed,
                        None => {
                            parsed_on_demand = parse_anchors_from_file(&resolved);
                            &parsed_on_demand
                        }
                    };
                    if anchors.contains(&anchor) {
                        None
                    } else {
                        Some(format!("broken cross-file anchor {}", raw.url))
                    }
                }
            }
            LinkKind::External | LinkKind::Ignored => None,
        };

        if let Some(reason) = reason {
            broken.push(BrokenLink {
                line: raw.line,
                reason,
                raw_target: raw.url,
            });
        }
    }
    broken
}
/// Lexically normalizes `path`: removes `.` components and resolves `..`
/// against the preceding component, without touching the filesystem.
///
/// * `..` removes the previous component only when that component is a
///   normal name. Leading `..` components of a relative path are preserved
///   (`../../a` stays `../../a`).
/// * `..` directly under the root is dropped (`/..` becomes `/`).
/// * A path that normalizes to nothing becomes `.` so that it still refers
///   to the current directory rather than to the empty path.
fn normalize_path(path: &Path) -> PathBuf {
    use std::path::Component;

    let mut out = PathBuf::new();
    for component in path.components() {
        match component {
            Component::CurDir => {}
            Component::ParentDir => {
                let tail_is_name =
                    matches!(out.components().next_back(), Some(Component::Normal(_)));
                let tail_is_root =
                    matches!(out.components().next_back(), Some(Component::RootDir));
                if tail_is_name {
                    out.pop();
                } else if !tail_is_root {
                    // Nothing to cancel against (empty, `..`, or a bare
                    // prefix): the `..` must be kept. `PathBuf::pop` would
                    // wrongly succeed on a trailing `..`.
                    out.push(component);
                }
            }
            other => out.push(other),
        }
    }
    if out.as_os_str().is_empty() {
        out.push(".");
    }
    out
}
// Unit tests for the link checker. Filesystem cases run against a temporary
// directory; external-link cases talk to a throwaway TCP listener on
// 127.0.0.1 that answers a single request.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
use std::net::TcpListener;
use std::io::{Read, Write};
use tempfile::TempDir;
// Creates a temporary directory containing `files` (relative name, content),
// creating parent directories as needed. The returned `TempDir` must be kept
// alive for as long as the files are used.
fn make_temp_dir(files: &[(&str, &str)]) -> (TempDir, PathBuf) {
let dir = tempfile::tempdir().expect("failed to create tempdir");
let root = dir.path().to_path_buf();
for (name, content) in files {
let path = root.join(name);
if let Some(parent) = path.parent() {
fs::create_dir_all(parent).expect("failed to create subdir");
}
fs::write(&path, content).expect("failed to write test file");
}
(dir, root)
}
// Binds a listener on 127.0.0.1 with port 0 and returns it together with
// the matching `http://127.0.0.1:<port>` base URL.
fn bind_mock_server() -> (TcpListener, String) {
let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
let addr = listener.local_addr().expect("local_addr");
let base_url = format!("http://127.0.0.1:{}", addr.port());
(listener, base_url)
}
// Spawns a thread that accepts one connection, reads the request once, and
// replies with `status_line` and an empty body.
fn serve_once(listener: TcpListener, status_line: &'static str) {
std::thread::spawn(move || {
if let Ok((mut stream, _)) = listener.accept() {
let mut buf = [0u8; 4096];
let _ = stream.read(&mut buf);
let response = format!(
"{}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n",
status_line
);
let _ = stream.write_all(response.as_bytes());
}
});
}
// Spawns a thread that accepts one connection and replies with a 301
// redirect to `location`.
fn serve_redirect(listener: TcpListener, location: String) {
std::thread::spawn(move || {
if let Ok((mut stream, _)) = listener.accept() {
let mut buf = [0u8; 4096];
let _ = stream.read(&mut buf);
let response = format!(
"HTTP/1.1 301 Moved Permanently\r\nLocation: {}\r\nContent-Length: 0\r\nConnection: close\r\n\r\n",
location
);
let _ = stream.write_all(response.as_bytes());
}
});
}
// Headings are converted to slugs by `heading_to_anchor`.
#[test]
fn parse_anchors_extracts_heading_slugs() {
let content = "# Hello World\n\n## API v2.0\n\nsome text\n";
let anchors = parse_anchors(content);
assert!(anchors.contains("hello-world"), "expected 'hello-world'");
assert!(anchors.contains("api-v20"), "expected 'api-v20'");
}
// A same-file anchor that matches a heading is accepted.
#[test]
fn valid_internal_anchor_passes() {
let (_dir, root) = make_temp_dir(&[("doc.md", "# Title\n\n[link](#title)\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 0, "expected no broken links");
}
// A same-file anchor without a matching heading is reported.
#[test]
fn broken_internal_anchor_reported() {
let (_dir, root) = make_temp_dir(&[("doc.md", "# Title\n\n[link](#nonexistent)\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
assert_eq!(report.files[0].broken[0].raw_target, "#nonexistent");
}
// A relative link to an existing file is accepted.
#[test]
fn valid_cross_file_link_passes() {
let (_dir, root) = make_temp_dir(&[("a.md", "[link](./b.md)\n"), ("b.md", "# B file\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 0, "expected no broken links");
}
// A relative link to a file that does not exist is reported.
#[test]
fn missing_file_reported() {
let (_dir, root) = make_temp_dir(&[("a.md", "[link](./nonexistent.md)\n")]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
assert_eq!(report.files[0].broken[0].raw_target, "./nonexistent.md");
}
// A cross-file link whose fragment matches a heading in the target passes.
#[test]
fn cross_file_with_valid_anchor_passes() {
let (_dir, root) = make_temp_dir(&[
("a.md", "[link](./b.md#real-section)\n"),
("b.md", "# Real Section\n\nsome content.\n"),
]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 0, "expected no broken links");
}
// A cross-file link to an existing file but an unknown fragment is reported.
#[test]
fn cross_file_with_bad_anchor_reported() {
let (_dir, root) = make_temp_dir(&[
("a.md", "[link](./b.md#fake)\n"),
("b.md", "# Real Section\n\nsome content.\n"),
]);
let report = check_dir(&root, &CheckOpts::default());
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
assert!(
report.files[0].broken[0].raw_target.contains("#fake"),
"raw_target should contain #fake"
);
}
// With `check_external` off, external links are neither requested nor reported.
#[test]
fn external_link_skipped_silently_when_check_external_off() {
let (_dir, root) = make_temp_dir(&[("doc.md", "[link](https://example.com)\n")]);
let report = check_dir(
&root,
&CheckOpts {
check_external: false,
..CheckOpts::default()
},
);
assert_eq!(report.broken_count, 0, "external links must be skipped");
}
// A 200 response is treated as a working link.
#[test]
fn external_link_with_2xx_passes() {
let (listener, base_url) = bind_mock_server();
serve_once(listener, "HTTP/1.1 200 OK");
let outcome = head_request(&base_url, Duration::from_secs(5), 5);
assert!(
matches!(outcome, ExternalOutcome::Ok),
"200 OK should pass"
);
}
// A 404 response is reported and the reason carries the status code.
#[test]
fn external_link_with_4xx_reported() {
let (listener, base_url) = bind_mock_server();
serve_once(listener, "HTTP/1.1 404 Not Found");
let outcome = head_request(&base_url, Duration::from_secs(5), 5);
match outcome {
ExternalOutcome::Broken(reason) => {
assert!(
reason.contains("404"),
"expected 404 in reason, got: {reason}"
);
}
ExternalOutcome::Ok => panic!("404 response should be reported as broken"),
}
}
// A 500 response is reported and the reason carries the status code.
#[test]
fn external_link_with_5xx_reported() {
let (listener, base_url) = bind_mock_server();
serve_once(listener, "HTTP/1.1 500 Internal Server Error");
let outcome = head_request(&base_url, Duration::from_secs(5), 5);
match outcome {
ExternalOutcome::Broken(reason) => {
assert!(
reason.contains("500"),
"expected 500 in reason, got: {reason}"
);
}
ExternalOutcome::Ok => panic!("500 response should be reported as broken"),
}
}
// A 301 pointing at a server that answers 200 counts as a working link.
#[test]
fn external_link_redirect_followed() {
let (listener_dest, dest_url) = bind_mock_server();
serve_once(listener_dest, "HTTP/1.1 200 OK");
let (listener_src, src_url) = bind_mock_server();
serve_redirect(listener_src, dest_url);
let outcome = head_request(&src_url, Duration::from_secs(5), 5);
assert!(
matches!(outcome, ExternalOutcome::Ok),
"redirect chain ending in 200 should pass"
);
}
// A host under the reserved `.invalid` TLD must be reported as broken.
#[test]
fn external_link_dns_failure_reported() {
let url = "http://this-will-never-resolve.invalid/path";
let outcome = head_request(url, Duration::from_secs(5), 5);
assert!(
matches!(outcome, ExternalOutcome::Broken(_)),
"DNS failure should be reported as broken"
);
}
// The listener is dropped before the request so the connection is refused.
#[test]
fn external_link_connection_error_reported() {
let listener = TcpListener::bind("127.0.0.1:0").expect("bind");
let port = listener.local_addr().expect("addr").port();
drop(listener);
let url = format!("http://127.0.0.1:{}/", port);
let outcome = head_request(&url, Duration::from_secs(5), 5);
assert!(
matches!(outcome, ExternalOutcome::Broken(_)),
"connection refused should be reported as broken"
);
}
// End to end: a 404 external link shows up in the report tagged `[external]`.
#[test]
fn check_dir_reports_external_broken_link() {
let (listener, base_url) = bind_mock_server();
serve_once(listener, "HTTP/1.1 404 Not Found");
let content = format!("[broken external]({})\n", base_url);
let (_dir, root) = make_temp_dir(&[("doc.md", &content)]);
let report = check_dir(
&root,
&CheckOpts {
check_external: true,
external_timeout_secs: 5,
},
);
assert_eq!(report.broken_count, 1, "expected exactly one broken link");
let broken = &report.files[0].broken[0];
assert!(
broken.reason.contains("404"),
"reason should mention 404: {}",
broken.reason
);
assert!(
broken.reason.contains("[external]"),
"reason should be tagged [external]: {}",
broken.reason
);
}
// End to end: a 200 external link produces no findings.
#[test]
fn check_dir_passes_external_2xx_link() {
let (listener, base_url) = bind_mock_server();
serve_once(listener, "HTTP/1.1 200 OK");
let content = format!("[valid external]({})\n", base_url);
let (_dir, root) = make_temp_dir(&[("doc.md", &content)]);
let report = check_dir(
&root,
&CheckOpts {
check_external: true,
external_timeout_secs: 5,
},
);
assert_eq!(report.broken_count, 0, "200 external link should pass");
}
// `..` cancels the preceding directory component.
#[test]
fn normalize_path_resolves_parent_components() {
let p = PathBuf::from("/tmp/docs/../other.md");
assert_eq!(normalize_path(&p), PathBuf::from("/tmp/other.md"));
}
// Offsets 0..=3 fall on line 1 (the newline at offset 3 belongs to line 1);
// offsets 4..=7 fall on line 2.
#[test]
fn byte_offset_to_line_maps_correctly() {
let content = "abc\ndef\n";
let starts = build_line_starts(content);
assert_eq!(byte_offset_to_line(0, &starts), 1);
assert_eq!(byte_offset_to_line(3, &starts), 1); assert_eq!(byte_offset_to_line(4, &starts), 2); assert_eq!(byte_offset_to_line(7, &starts), 2);
}
}