use glob::glob;
use pulldown_cmark::{Event, Options, Parser};
use serde::Serialize;
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use thiserror::Error;
/// Errors that can be produced while checking Markdown files.
///
/// The `Display` text of each variant comes from its `#[error(...)]`
/// attribute, and `#[from]` generates the matching `From` conversion so the
/// wrapped error can be propagated with `?`.
#[derive(Debug, Error)]
pub enum CheckError {
/// An underlying I/O operation failed; wraps the originating `std::io::Error`.
#[error("IO error: {0}")]
Io(#[from] std::io::Error),
/// A byte buffer could not be converted into a UTF-8 `String`; wraps the
/// originating `FromUtf8Error`.
#[error("Invalid UTF-8 in file: {0}")]
Utf8Error(#[from] std::string::FromUtf8Error),
}
/// Options that control how Markdown files are discovered and checked.
#[derive(Debug, Clone, PartialEq)] pub struct CheckConfig {
/// When `true`, directories handed to `check_files` are searched
/// recursively instead of only one level deep.
pub recursive: bool,
/// Requested report format. Not read by the checking code in this section;
/// presumably consumed by whatever renders the results (confirm with caller).
pub output_format: OutputFormat,
/// NOTE(review): not read by the checking code in this section; the name
/// suggests warnings should be treated more severely, but confirm with caller.
pub strict: bool,
/// NOTE(review): not read by the checking code in this section; the name
/// suggests warnings should be suppressed in the report, but confirm with caller.
pub ignore_warnings: bool,
}
/// Report formats selectable through `CheckConfig::output_format`.
///
/// The rendering itself is not part of this section, so the descriptions
/// below are inferred from the variant names only.
#[derive(Debug, Clone, PartialEq)] pub enum OutputFormat {
/// Presumably plain text intended for people reading a terminal.
Human,
/// Presumably machine-readable JSON (the result types derive `Serialize`).
Json,
}
/// A single finding (error or warning) reported for a Markdown file.
#[derive(Debug, Serialize)]
pub struct Issue {
/// 1-based line number the finding refers to. `check_file` uses `0` when
/// the file could not be read at all.
pub line: usize,
/// Description of the problem.
pub message: String,
/// The text of the offending line when the check recorded it, else `None`.
pub context: Option<String>,
}
/// Everything that was found while checking one file.
#[derive(Debug, Serialize)]
pub struct CheckResult {
/// Path of the file that was checked.
pub file_path: PathBuf,
/// Findings classified as errors (for example undefined link references or
/// a file that could not be read).
pub errors: Vec<Issue>,
/// Findings classified as warnings (for example indentation, bare URLs or
/// header level jumps).
pub warnings: Vec<Issue>,
}
pub fn check_files(paths: &[PathBuf], config: &CheckConfig) -> Vec<CheckResult> {
let mut all_files = Vec::new();
for path in paths {
if path.is_dir() {
if config.recursive {
let pattern = path.join("**/*.md").to_string_lossy().to_string();
if let Ok(entries) = glob(&pattern) {
for entry in entries.flatten() {
all_files.push(entry);
}
}
} else {
if let Ok(entries) = fs::read_dir(path) {
for entry in entries.flatten() {
let file_path = entry.path();
if file_path
.extension()
.map_or(false, |ext| ext == "md" || ext == "markdown")
{
all_files.push(file_path);
}
}
}
}
} else if path.is_file() {
all_files.push(path.clone());
}
}
all_files
.iter()
.map(|file_path| check_file(file_path, config))
.collect()
}
/// Runs every check against a single Markdown file.
///
/// The file is read as UTF-8 text. If that fails, the returned result holds
/// exactly one error with `line: 0` describing the read failure and no
/// warnings. Otherwise the text is parsed with tables, footnotes,
/// strikethrough and task lists enabled, and the structure, link-reference,
/// list and header checks append their findings to the result.
///
/// `_config` is accepted for interface symmetry but is currently unused.
pub fn check_file(file_path: &Path, _config: &CheckConfig) -> CheckResult {
    let content = match fs::read_to_string(file_path) {
        Ok(text) => text,
        Err(read_err) => {
            // Nothing else can be checked without the file contents.
            return CheckResult {
                file_path: file_path.to_path_buf(),
                errors: vec![Issue {
                    line: 0,
                    message: format!("Failed to read file: {}", read_err),
                    context: None,
                }],
                warnings: Vec::new(),
            };
        }
    };

    let options = Options::ENABLE_TABLES
        | Options::ENABLE_FOOTNOTES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS;
    let events: Vec<Event> = Parser::new_ext(&content, options).collect();
    let lines: Vec<&str> = content.lines().collect();

    let mut errors = Vec::new();
    let mut warnings = Vec::new();
    check_markdown_structure(&lines, &events, &mut errors, &mut warnings);
    check_link_references(&lines, &mut errors);
    check_list_consistency(&lines, &mut warnings);
    check_header_consistency(&lines, &mut warnings);

    CheckResult {
        file_path: file_path.to_path_buf(),
        errors,
        warnings,
    }
}
/// Line-level and event-level structural checks.
///
/// Pass 1 walks `lines`, skipping ``` fence lines and everything between
/// them, and appends to `warnings`:
/// * non-blank lines whose leading whitespace has an odd byte length;
/// * lines for which `contains_bare_url` returns true.
///
/// Pass 2 walks `events`, matching each `End` against the most recent
/// unmatched `Start`; whatever is left unmatched is appended to `errors`.
/// The events carry no positions here, so every such error is attributed to
/// the number of the last line.
/// NOTE(review): whether the parser can ever yield an unmatched `Start` is
/// not visible from this function — confirm before relying on this check.
fn check_markdown_structure(
    lines: &[&str],
    events: &[Event],
    errors: &mut Vec<Issue>,
    warnings: &mut Vec<Issue>,
) {
    let mut inside_fence = false;
    for (index, raw) in lines.iter().enumerate() {
        let number = index + 1;
        let stripped = raw.trim();

        if stripped.starts_with("```") {
            inside_fence = !inside_fence;
            continue;
        }
        if inside_fence {
            continue;
        }

        // A width that is a multiple of neither 2 nor 4 is simply an odd width.
        let indent_width = raw.len() - raw.trim_start().len();
        if !stripped.is_empty() && indent_width % 2 == 1 {
            warnings.push(Issue {
                line: number,
                message: "Inconsistent indentation (prefer 2 or 4 spaces)".to_string(),
                context: Some(raw.to_string()),
            });
        }

        if contains_bare_url(stripped) {
            warnings.push(Issue {
                line: number,
                message: "Bare URL detected (consider using link syntax)".to_string(),
                context: Some(raw.to_string()),
            });
        }
    }

    // No per-event position is available, so the final line number is used.
    let last_line = lines.len();
    let mut unmatched = Vec::new();
    for event in events {
        match event {
            Event::Start(tag) => unmatched.push((tag.clone(), last_line)),
            Event::End(tag) => {
                let closes_top = unmatched.last().map_or(false, |(open, _)| open == tag);
                if closes_top {
                    unmatched.pop();
                }
            }
            _ => {}
        }
    }

    errors.extend(unmatched.into_iter().map(|(tag, line)| Issue {
        line,
        message: format!("Unclosed {:?} element", tag),
        context: None,
    }));
}
/// Reports reference-style links (`[text][label]` / `![alt][label]`) whose
/// label has no `[label]: target` definition anywhere in `lines`.
///
/// * Labels are compared case-insensitively (both sides are lower-cased).
/// * Every reference on a line is examined, not only the first one.
/// * Lines inside ``` fenced code blocks are ignored, matching the structure
///   check, so example snippets do not produce false errors.
/// * Errors are appended to `errors` in document order; the same label is
///   reported at most once per line.
fn check_link_references(lines: &[&str], errors: &mut Vec<Issue>) {
    let ref_def_regex = regex::Regex::new(r"^\[([^\]]+)\]:\s*(.+)$")
        .expect("hard-coded reference-definition regex is valid");
    let link_ref_regex = regex::Regex::new(r"!?\[([^\]]*)\]\[([^\]]+)\]")
        .expect("hard-coded reference-link regex is valid");

    let mut defined_references = HashSet::new();
    // A Vec (not a set) keeps usages in document order for stable reports.
    let mut used_references: Vec<(String, usize)> = Vec::new();
    let mut in_code_block = false;

    for (index, line) in lines.iter().enumerate() {
        if line.trim().starts_with("```") {
            in_code_block = !in_code_block;
            continue;
        }
        if in_code_block {
            continue;
        }

        if let Some(caps) = ref_def_regex.captures(line) {
            if let Some(label) = caps.get(1) {
                defined_references.insert(label.as_str().to_lowercase());
            }
        }
        for caps in link_ref_regex.captures_iter(line) {
            if let Some(label) = caps.get(2) {
                used_references.push((label.as_str().to_lowercase(), index + 1));
            }
        }
    }

    // Definitions may appear after their uses, so resolve only once the
    // whole document has been scanned.
    let mut reported = HashSet::new();
    for (reference, line) in used_references {
        if defined_references.contains(&reference) {
            continue;
        }
        if reported.insert((reference.clone(), line)) {
            errors.push(Issue {
                line,
                message: format!("Undefined link reference '[{}]'", reference),
                context: None,
            });
        }
    }
}
/// Warns when consecutive list items are indented differently.
///
/// A line counts as a list item when, after its leading whitespace, it starts
/// with a bullet (`-`, `*`, `+`) or an ordered marker (digits followed by `.`
/// or `)`), and that marker is followed by whitespace or the end of the line.
/// This keeps emphasis (`**bold**`), rules (`---`) and prose such as
/// `3.14 is pi` from being mistaken for list items.
///
/// Blank lines do not interrupt a list; any other non-item line ends it, so
/// the next item starts a fresh list and is not compared to the previous one.
fn check_list_consistency(lines: &[&str], warnings: &mut Vec<Issue>) {
    // Indentation of the previous item of the current list, if any.
    let mut previous_indent: Option<usize> = None;

    for (index, line) in lines.iter().enumerate() {
        let trimmed = line.trim_start();
        if is_list_item_start(trimmed) {
            let indent = line.len() - trimmed.len();
            if previous_indent.map_or(false, |expected| expected != indent) {
                warnings.push(Issue {
                    line: index + 1,
                    message: "Inconsistent list indentation".to_string(),
                    context: Some(line.to_string()),
                });
            }
            previous_indent = Some(indent);
        } else if !trimmed.is_empty() {
            previous_indent = None;
        }
    }
}

/// Returns true when `trimmed` (leading whitespace already removed) begins
/// with a bullet or ordered list marker followed by whitespace or nothing.
fn is_list_item_start(trimmed: &str) -> bool {
    let after_marker = match trimmed.strip_prefix(&['-', '*', '+'][..]) {
        Some(rest) => rest,
        None => {
            let digits = trimmed.bytes().take_while(u8::is_ascii_digit).count();
            if digits == 0 {
                return false;
            }
            // ASCII digits are one byte each, so `digits` is a char boundary.
            match trimmed[digits..].strip_prefix(&['.', ')'][..]) {
                Some(rest) => rest,
                None => return false,
            }
        }
    };
    after_marker.is_empty() || after_marker.starts_with(char::is_whitespace)
}
/// Warns when a header is more than one level deeper than the header before
/// it (for example `#` followed directly by `###`).
///
/// The level before the first header is taken to be 0, so a document whose
/// first header is `##` or deeper is reported as well. Lines inside ```
/// fenced code blocks are skipped so that `#` comments in code samples are
/// not mistaken for headers.
fn check_header_consistency(lines: &[&str], warnings: &mut Vec<Issue>) {
    let mut previous_level: u32 = 0;
    let mut in_code_block = false;

    for (index, line) in lines.iter().enumerate() {
        if line.trim().starts_with("```") {
            in_code_block = !in_code_block;
            continue;
        }
        if in_code_block {
            continue;
        }

        if let Some(level) = detect_header_level(line) {
            if level > previous_level + 1 {
                warnings.push(Issue {
                    line: index + 1,
                    message: format!("Header level jump from {} to {}", previous_level, level),
                    context: Some(line.to_string()),
                });
            }
            previous_level = level;
        }
    }
}
/// Returns the ATX header level (1–6) of `line`, or `None` if the line is not
/// a header.
///
/// Surrounding whitespace is ignored. The line must start with one to six `#`
/// characters, and that run must be followed by whitespace or be the whole
/// line; `#hashtag`, `#5` and runs of seven or more `#` are not headers.
fn detect_header_level(line: &str) -> Option<u32> {
    let trimmed = line.trim();
    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes < 1 || hashes > 6 {
        return None;
    }

    // `#` is a single byte, so `hashes` is always a valid char boundary.
    let rest = &trimmed[hashes..];
    if rest.is_empty() || rest.starts_with(char::is_whitespace) {
        // Lossless: `hashes` was just checked to be in 1..=6.
        Some(hashes as u32)
    } else {
        None
    }
}
/// Heuristically decides whether `text` contains a URL that is not written
/// with link syntax.
///
/// Returns `true` when `text` contains one of the prefixes `http://`,
/// `https://`, `www.`, `ftp://` or `mailto:` outside an autolink
/// (`<https://example.com>`). Any line containing `[` is assumed to use
/// link, image or reference syntax already and is never reported.
fn contains_bare_url(text: &str) -> bool {
    const URL_PREFIXES: [&str; 5] = ["http://", "https://", "www.", "ftp://", "mailto:"];

    if text.contains('[') {
        return false;
    }

    // An occurrence is inside an autolink when an unbroken, whitespace-free
    // run connects it to a `<` on the left and to a `>` on the right.
    let inside_autolink = |start: usize| -> bool {
        let opened = text[..start].rfind('<').map_or(false, |open| {
            !text[open + 1..start].contains(|c: char| c.is_whitespace() || c == '>')
        });
        let closed = text[start..]
            .chars()
            .find(|c| c.is_whitespace() || *c == '>')
            == Some('>');
        opened && closed
    };

    URL_PREFIXES.iter().any(|prefix| {
        text.match_indices(*prefix)
            .any(|(start, _)| !inside_autolink(start))
    })
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Non-recursive, human-readable, non-strict configuration used by every test.
    fn baseline_config() -> CheckConfig {
        CheckConfig {
            recursive: false,
            output_format: OutputFormat::Human,
            strict: false,
            ignore_warnings: false,
        }
    }

    /// Writes `lines` (each terminated by a newline) to `test.md` in a fresh
    /// temporary directory and returns the result of checking that file.
    fn check_lines(lines: &[&str]) -> CheckResult {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.md");
        let mut file = File::create(&file_path).unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        check_file(&file_path, &baseline_config())
    }

    #[test]
    fn test_check_valid_markdown() {
        let result = check_lines(&[
            "# Valid Header",
            "",
            "This is a paragraph with a [link](http://example.com).",
        ]);
        assert!(result.errors.is_empty());
    }

    #[test]
    fn test_check_invalid_reference() {
        let result = check_lines(&["This has an [undefined link][missing]."]);
        assert!(!result.errors.is_empty());
    }
}