1#![forbid(unsafe_code)]
32#![forbid(unused_must_use)]
33
34use colored::Colorize;
35use log::{debug, error, info, trace, warn};
36use pulldown_cmark::{BrokenLink, Event, LinkType, Options, Parser, Tag, TagEnd};
37use regex::Regex;
38use std::collections::HashMap;
39use std::path::{Component, Path, PathBuf};
40use std::sync::LazyLock;
41
42static EMAIL_REGEX: LazyLock<Regex> = LazyLock::new(|| {
43 Regex::new("\
44 (?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|\"\
45 (?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21\\x23-\\x5b\\x5d-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])*\")@\
46 (?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\\[\
47 (?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:\
48 (?:[\\x01-\\x08\\x0b\\x0c\\x0e-\\x1f\\x21-\\x5a\\x53-\\x7f]|\\\\[\\x01-\\x09\\x0b\\x0c\\x0e-\\x7f])+)\\])"
49 ).unwrap()
50});
51
/// Lexically normalizes `path` and returns it as a (lossily converted) string.
///
/// The file system is never consulted:
/// * `.` components are dropped;
/// * `..` removes the preceding normal component when there is one; otherwise
///   it is kept for relative paths and discarded for absolute ones (there is
///   nothing above the root);
/// * prefixes, the root directory and normal components are kept as they are.
///
/// A non-empty path that collapses to nothing (for instance `a/..` or `./`)
/// designates the current directory and is returned as `.` rather than as an
/// empty string, because an empty path cannot be opened or canonicalized.
/// An empty input is returned as an empty string.
fn simplify_path(path: &Path) -> String {
    let mut out: Vec<Component> = vec![];

    for comp in path.components() {
        match comp {
            Component::Prefix(_) | Component::RootDir | Component::Normal(_) => out.push(comp),

            Component::CurDir => {}

            Component::ParentDir => {
                if let Some(Component::Normal(_)) = out.last() {
                    out.pop();
                } else if path.is_relative() {
                    out.push(Component::ParentDir)
                }
            }
        }
    }

    // Everything cancelled out: the path points at the current directory.
    if out.is_empty() && !path.as_os_str().is_empty() {
        return Component::CurDir
            .as_os_str()
            .to_string_lossy()
            .into_owned();
    }

    out.iter()
        .collect::<PathBuf>()
        .to_string_lossy()
        .into_owned()
}
82
/// Turns a heading's text into the slug used to link to it.
///
/// Each space becomes a `-`; ASCII letters and digits are kept (letters in
/// lowercase) along with `-` and `_`; every other character is discarded.
pub fn slugify(header: &str) -> String {
    let mut slug = String::with_capacity(header.len());

    for ch in header.chars() {
        match ch {
            ' ' => slug.push('-'),
            '-' | '_' => slug.push(ch),
            // Only ASCII survives, so ASCII lowercasing is all that is needed.
            _ if ch.is_ascii_alphanumeric() => slug.push(ch.to_ascii_lowercase()),
            _ => {}
        }
    }

    slug
}
102
103pub fn generate_slugs(path: &Path) -> Result<Vec<String>, String> {
107 let canon = simplify_path(path);
109
110 debug!("Generating slugs for file: {}", canon);
111
112 let content = std::fs::read_to_string(path)
114 .map_err(|err| format!("Failed to read file at '{}': {}", canon.green(), err))?;
115
116 trace!(
117 "In '{}': just read file, which is {} bytes long.",
118 canon,
119 content.len()
120 );
121
122 let mut headers = vec![];
124
125 let mut header_counts = HashMap::<String, usize>::new();
127
128 let mut header: Option<String> = None;
131
132 let parser = Parser::new_ext(&content, Options::all());
134
135 for (event, range) in parser.into_offset_iter() {
136 macro_rules! format_msg {
137 ($($param: expr),*) => {{
138 let line = content.chars().take(range.start).filter(|c| *c == '\n').count();
140 format!("In '{}', line {}: {}", canon.green(), (line + 1).to_string().bright_magenta(), format!($($param),*))
141 }}
142 }
143
144 if let Some(ref mut header_str) = header {
146 match event {
147 Event::End(TagEnd::Heading { .. }) => {
149 let slug = slugify(header_str);
151 debug!("{}", format_msg!("found header: #{}", slug));
152
153 if header_str.trim().is_empty() {
155 warn!(
157 "{}",
158 format_msg!("heading was not directly followed by a title")
159 );
160 trace!("Faulty event: {:?}", event);
161 }
162
163 let duplicates = header_counts
165 .entry(slug.clone())
166 .and_modify(|d| *d += 1)
167 .or_insert(0);
168
169 if *duplicates > 0 {
171 headers.push(format!("{}-{}", slug, duplicates));
172 } else {
173 headers.push(slug);
174 }
175
176 header = None;
178 }
179
180 Event::Start(_)
181 | Event::End(_)
182 | Event::SoftBreak
183 | Event::HardBreak
184 | Event::Rule
185 | Event::TaskListMarker(_)
186 | Event::InlineMath(_)
187 | Event::DisplayMath(_)
188 | Event::InlineHtml(_) => {}
189
190 Event::Text(text)
191 | Event::Code(text)
192 | Event::Html(text)
193 | Event::FootnoteReference(text) => header_str.push_str(&text),
194 }
195 }
196 else if let Event::Start(Tag::Heading { .. }) = event {
198 header = Some(String::new())
200 }
201 }
202
203 Ok(headers)
205}
206
/// Switches controlling which kinds of links are validated.
///
/// The default value has every switch turned off.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct CheckerOptions {
    /// When set, the `#fragment` part of a link is not checked against the
    /// headings of the target file.
    pub ignore_header_links: bool,
    /// When set, a link whose target exists but is not a regular file is
    /// reported as an error.
    pub disallow_dir_links: bool,
}
213
/// Failure reported by [`check_broken_links`].
#[derive(Debug)]
pub enum CheckerError {
    /// A file or directory could not be read; the payload is the formatted
    /// error message.
    Io(String),
    /// Everything could be read but at least one invalid link was found.
    BrokenLinks(Vec<DetectedBrokenLink>),
}

/// Cache of the heading slugs already computed for a file, keyed by the
/// file's canonicalized path.
pub type FileLinksCache = HashMap<PathBuf, Vec<String>>;

/// A single invalid link found while analyzing a Markdown file.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DetectedBrokenLink {
    /// File containing the link.
    pub file: PathBuf,
    /// 1-based line at which the link starts.
    pub line: usize,
    /// Human-readable description of the problem.
    pub error: String,
}
229
230pub fn check_broken_links(
244 path: &Path,
245 options: CheckerOptions,
246 links_cache: &mut FileLinksCache,
247) -> Result<(), CheckerError> {
248 let errors = if path.is_dir() {
250 check_broken_links_in_dir(path, &options, links_cache).map_err(CheckerError::Io)?
251 } else {
252 check_file_broken_links(path, &options, links_cache).map_err(CheckerError::Io)?
253 };
254
255 if errors.is_empty() {
256 Ok(())
257 } else {
258 Err(CheckerError::BrokenLinks(errors))
259 }
260}
261
262pub fn check_broken_links_in_dir(
263 path: &Path,
264 options: &CheckerOptions,
265 links_cache: &mut FileLinksCache,
266) -> Result<Vec<DetectedBrokenLink>, String> {
267 let canon = simplify_path(path);
269
270 debug!("Analyzing directory: {}", canon);
271
272 let dir_iter = path.read_dir().map_err(|err| {
273 format!(
274 "Failed to read input directory at '{}': {}",
275 canon.green(),
276 err
277 )
278 })?;
279
280 let mut errors = vec![];
281
282 for item in dir_iter {
283 let item = item.map_err(|err| {
284 format!(
285 "Failed to get item from directory at '{}': {}",
286 canon.green(),
287 err
288 )
289 })?;
290 let path = item.path();
291 let file_type = item.file_type().map_err(|err| {
292 format!(
293 "Failed to read file type of item at '{}': {}",
294 canon.green(),
295 err
296 )
297 })?;
298
299 if file_type.is_dir() {
300 errors.append(&mut check_broken_links_in_dir(&path, options, links_cache)?);
302 } else if file_type.is_file() {
303 if let Some(ext) = path.extension() {
305 if let Some(ext) = ext.to_str() {
306 if ext.to_ascii_lowercase() == "md" {
307 errors.append(&mut check_file_broken_links(&path, options, links_cache)?);
309 }
310 }
311 }
312 } else {
313 warn!(
314 "Item at path '{}' is neither a file nor a directory so it will be ignored",
315 canon
316 );
317 }
318 }
319
320 Ok(errors)
321}
322
323pub fn check_file_broken_links(
324 path: &Path,
325 options: &CheckerOptions,
326 links_cache: &mut FileLinksCache,
327) -> Result<Vec<DetectedBrokenLink>, String> {
328 let canon = simplify_path(path);
330
331 info!("Analyzing: {}", canon);
332
333 let CheckerOptions {
334 ignore_header_links,
335 disallow_dir_links,
336 } = &options;
337
338 let mut errors = vec![];
339
340 let content = std::fs::read_to_string(path)
341 .map_err(|err| format!("Failed to read file at '{}': {}", canon.green(), err))?;
342
343 trace!(
344 "In '{}': just read file, which is {} bytes long.",
345 canon,
346 content.len()
347 );
348
349 let mut handle_broken_links = |link: BrokenLink| {
351 error!(
352 "In '{}': Missing target for link '{}'",
353 canon.green(),
354 link.reference.yellow()
355 );
356
357 None
358 };
359
360 let parser = Parser::new_with_broken_link_callback(
362 &content,
363 Options::all(),
364 Some(&mut handle_broken_links),
365 );
366
367 for (event, range) in parser.into_offset_iter() {
368 macro_rules! make_err {
369 ($($param: expr),*) => {{
370 let line = content.chars().take(range.start).filter(|c| *c == '\n').count();
372 DetectedBrokenLink { file: path.to_path_buf(), line: line + 1, error: format!($($param),*) }
373 }}
374 }
375
376 if let Event::Start(Tag::Link {
378 link_type: LinkType::Inline,
379 dest_url,
380 title: _,
381 id: _,
382 }) = event
383 {
384 let (target, header): (String, Option<String>) =
386 match dest_url.chars().position(|c| c == '#') {
387 Some(index) => (
388 dest_url.chars().take(index).collect(),
389 Some(dest_url.chars().skip(index + 1).collect()),
390 ),
391 None => (dest_url.into_string(), None),
392 };
393
394 if target.starts_with("http://")
396 || target.starts_with("https://")
397 || target.starts_with("ftp://")
398 {
399 trace!("found link to URL: {target}");
400 continue;
401 }
402
403 if EMAIL_REGEX.is_match(&target) {
404 trace!("found link to e-mail addres: {target}");
405 continue;
406 }
407
408 let target = if !target.is_empty() {
409 path.parent().unwrap().join(Path::new(&target))
410 } else {
411 path.to_owned()
412 };
413
414 let target_canon = simplify_path(&target);
415
416 match std::fs::canonicalize(&target_canon) {
417 Ok(path) => {
418 if *disallow_dir_links && !path.is_file() {
419 errors.push(make_err!("invalid link found: path '{}' is a directory but only file links are allowed", target_canon.blue()));
420 continue;
421 }
422 }
423
424 Err(_) => {
425 errors.push(make_err!(
426 "broken link found: path '{}' does not exist",
427 target_canon.green()
428 ));
429 continue;
430 }
431 }
432
433 trace!("valid link found: {}", target_canon);
434
435 if !ignore_header_links {
437 if let Some(header) = header {
439 if !target.is_file() {
441 errors.push(make_err!(
442 "invalid header link found: path '{}' exists but is not a file",
443 target_canon.green()
444 ));
445 } else {
446 debug!(
447 "now checking link '{}' from file '{}'",
448 header, target_canon
449 );
450
451 let unified_target = target.canonicalize().unwrap();
454
455 if !links_cache.contains_key(&unified_target) {
457 links_cache.insert(
459 unified_target.clone(),
460 generate_slugs(&target).map_err(|err| {
463 format!(
464 "failed to generate slugs for file '{}': {}",
465 target_canon.green(),
466 err
467 )
468 })?,
469 );
470 }
471
472 let slugs = links_cache.get(&unified_target).unwrap();
474
475 if !slugs.contains(&header) {
477 errors.push(make_err!(
478 "broken link found: header '{}' not found in '{}'",
479 header.yellow(),
480 target_canon.green()
481 ));
482 } else {
483 trace!("valid header link found: {}", header);
484 }
485 }
486 }
487 }
488 }
489 }
490
491 Ok(errors)
492}