mdcheck 0.1.0

A linter/validator for Markdown files that enforces the CommonMark specification
Documentation
use glob::glob;
use pulldown_cmark::{Event, Options, Parser};
use serde::Serialize;
use std::collections::HashSet;
use std::fs;
use std::path::{Path, PathBuf};
use thiserror::Error;

/// Error type for failures while checking files.
///
/// NOTE(review): no function in the visible part of this file constructs or
/// returns this type (`check_file` reports read failures as an [`Issue`]
/// instead) — confirm where it is used before extending it.
#[derive(Debug, Error)]
pub enum CheckError {
    /// An I/O operation failed; converted from `std::io::Error` via `#[from]`.
    #[error("IO error: {0}")]
    Io(#[from] std::io::Error),
    /// Bytes were not valid UTF-8; converted from `FromUtf8Error` via `#[from]`.
    #[error("Invalid UTF-8 in file: {0}")]
    Utf8Error(#[from] std::string::FromUtf8Error),
}

/// Options controlling which files are checked and how results are handled.
#[derive(Debug, Clone, PartialEq)] // Added PartialEq
pub struct CheckConfig {
    /// When `true`, `check_files` descends into subdirectories of each
    /// directory argument; otherwise only the directory's direct entries are
    /// considered.
    pub recursive: bool,
    /// Requested output format.
    /// NOTE(review): not read anywhere in the visible code — presumably
    /// consumed by the reporting layer; confirm.
    pub output_format: OutputFormat,
    /// NOTE(review): not read anywhere in the visible code — semantics
    /// (e.g. treating warnings as errors) must be confirmed with the caller.
    pub strict: bool,
    /// NOTE(review): not read anywhere in the visible code; `check_file`
    /// always collects warnings regardless of this flag.
    pub ignore_warnings: bool,
}

/// Output formats selectable through [`CheckConfig::output_format`].
///
/// NOTE(review): the rendering code is not part of the visible file; the
/// variant descriptions below are inferred from the names only.
#[derive(Debug, Clone, PartialEq)] // Added PartialEq
pub enum OutputFormat {
    /// Presumably plain text intended for a person reading a terminal.
    Human,
    /// Presumably JSON; `Issue` and `CheckResult` derive `Serialize`.
    Json,
}

/// A single finding (error or warning) reported for a file.
#[derive(Debug, Serialize)]
pub struct Issue {
    /// 1-based line number the finding refers to; `0` is used when the file
    /// itself could not be read.
    pub line: usize,
    /// Description of the problem.
    pub message: String,
    /// The offending source line, when the check has one to show.
    pub context: Option<String>,
}

/// Everything found while checking one file.
#[derive(Debug, Serialize)]
pub struct CheckResult {
    /// Path of the file that was checked.
    pub file_path: PathBuf,
    /// Findings reported as errors (including a failure to read the file).
    pub errors: Vec<Issue>,
    /// Findings reported as warnings.
    pub warnings: Vec<Issue>,
}

pub fn check_files(paths: &[PathBuf], config: &CheckConfig) -> Vec<CheckResult> {
    let mut all_files = Vec::new();

    for path in paths {
        if path.is_dir() {
            if config.recursive {
                let pattern = path.join("**/*.md").to_string_lossy().to_string();
                if let Ok(entries) = glob(&pattern) {
                    for entry in entries.flatten() {
                        all_files.push(entry);
                    }
                }
            } else {
                if let Ok(entries) = fs::read_dir(path) {
                    for entry in entries.flatten() {
                        let file_path = entry.path();
                        if file_path
                            .extension()
                            .map_or(false, |ext| ext == "md" || ext == "markdown")
                        {
                            all_files.push(file_path);
                        }
                    }
                }
            }
        } else if path.is_file() {
            all_files.push(path.clone());
        }
    }

    all_files
        .iter()
        .map(|file_path| check_file(file_path, config))
        .collect()
}

/// Runs every check against one Markdown file and gathers the findings.
///
/// If the file cannot be read, the result carries a single error on line `0`
/// describing the failure and no warnings. `_config` is accepted for
/// interface symmetry with `check_files` but is not consulted here.
pub fn check_file(file_path: &Path, _config: &CheckConfig) -> CheckResult {
    let content = match fs::read_to_string(file_path) {
        Ok(text) => text,
        Err(e) => {
            return CheckResult {
                file_path: file_path.to_path_buf(),
                errors: vec![Issue {
                    line: 0,
                    message: format!("Failed to read file: {}", e),
                    context: None,
                }],
                warnings: Vec::new(),
            };
        }
    };

    // Parse with the extensions the checker understands.
    let options = Options::ENABLE_TABLES
        | Options::ENABLE_FOOTNOTES
        | Options::ENABLE_STRIKETHROUGH
        | Options::ENABLE_TASKLISTS;
    let events: Vec<Event> = Parser::new_ext(&content, options).collect();
    let lines: Vec<&str> = content.lines().collect();

    let mut errors = Vec::new();
    let mut warnings = Vec::new();

    // Check for CommonMark compliance issues
    check_markdown_structure(&lines, &events, &mut errors, &mut warnings);
    check_link_references(&lines, &mut errors);
    check_list_consistency(&lines, &mut warnings);
    check_header_consistency(&lines, &mut warnings);

    CheckResult {
        file_path: file_path.to_path_buf(),
        errors,
        warnings,
    }
}

/// If `trimmed` starts with a code fence (a run of at least three backticks
/// or three tildes), returns the fence character and the length of the run.
fn fence_marker(trimmed: &str) -> Option<(char, usize)> {
    let marker = trimmed.chars().next()?;
    if marker != '`' && marker != '~' {
        return None;
    }
    let run = trimmed.chars().take_while(|&c| c == marker).count();
    if run >= 3 {
        Some((marker, run))
    } else {
        None
    }
}

/// Line-based structural checks plus a balance check over the parser events.
///
/// * `lines`  – the file split into lines; used for the indentation and
///   bare-URL warnings. Lines inside fenced code blocks are skipped.
/// * `events` – the parser events for the same file; every `Start` that is
///   not matched by the corresponding `End` is reported as an error.
/// * `errors` / `warnings` – sinks the findings are appended to.
///
/// Fenced code blocks may be opened with backticks or tildes. A block is only
/// closed by a fence of the same character that is at least as long as the
/// opening one and carries no trailing text, so fence-like lines inside a
/// block do not end it early.
fn check_markdown_structure(
    lines: &[&str],
    events: &[Event],
    errors: &mut Vec<Issue>,
    warnings: &mut Vec<Issue>,
) {
    // `Some((marker, run))` while inside a fenced block opened by that fence.
    let mut open_fence: Option<(char, usize)> = None;

    for (index, line) in lines.iter().enumerate() {
        let line_num = index + 1;
        let trimmed = line.trim();

        // Check for code block boundaries
        if let Some((marker, run)) = fence_marker(trimmed) {
            match open_fence {
                None => open_fence = Some((marker, run)),
                Some((open_marker, open_run)) => {
                    // The fence characters are ASCII, so `run` is also a
                    // valid byte offset into `trimmed`.
                    let closes = marker == open_marker
                        && run >= open_run
                        && trimmed[run..].is_empty();
                    if closes {
                        open_fence = None;
                    }
                }
            }
            continue;
        }

        if open_fence.is_some() {
            continue; // Skip validation inside code blocks
        }

        // Check for inconsistent indentation: an odd width is neither a
        // multiple of 2 nor of 4 (and is necessarily non-zero).
        if !trimmed.is_empty() {
            let leading_spaces = line.len() - line.trim_start().len();
            if leading_spaces % 2 != 0 {
                warnings.push(Issue {
                    line: line_num,
                    message: "Inconsistent indentation (prefer 2 or 4 spaces)".to_string(),
                    context: Some(line.to_string()),
                });
            }
        }

        // Check for bare URLs without link syntax
        if contains_bare_url(trimmed) {
            warnings.push(Issue {
                line: line_num,
                message: "Bare URL detected (consider using link syntax)".to_string(),
                context: Some(line.to_string()),
            });
        }
    }

    // Check for unclosed elements. The events carry no source positions, so
    // anything left open is reported against the last line of the file.
    let eof_line = lines.len();
    let mut open_elements = Vec::new();
    for event in events {
        match event {
            Event::Start(tag) => open_elements.push(tag),
            Event::End(tag) => {
                if open_elements.last().map_or(false, |last| *last == tag) {
                    open_elements.pop();
                }
            }
            _ => {}
        }
    }

    for tag in open_elements {
        errors.push(Issue {
            line: eof_line,
            message: format!("Unclosed {:?} element", tag),
            context: None,
        });
    }
}

/// Reports reference-style links and images whose label is never defined.
///
/// Every `[text][label]` / `![alt][label]` occurrence on every line is
/// collected (a line may contain several), and each label is compared
/// case-insensitively against the `[label]: target` definitions found at the
/// start of a line anywhere in the file. One error is pushed per distinct
/// `(line, label)` pair, ordered by line number and then by label so that the
/// output is stable between runs.
fn check_link_references(lines: &[&str], errors: &mut Vec<Issue>) {
    // Use regex crate for pattern matching
    let ref_def_regex = regex::Regex::new(r"^\[([^\]]+)\]:\s*(.+)$")
        .expect("hard-coded reference-definition regex is valid");
    let link_ref_regex = regex::Regex::new(r"!?\[([^\]]*)\]\[([^\]]+)\]")
        .expect("hard-coded reference-link regex is valid");

    let mut defined_references = HashSet::new();
    let mut link_references: Vec<(usize, String)> = Vec::new();

    for (index, line) in lines.iter().enumerate() {
        let line_num = index + 1;

        // Check for link reference definitions
        if let Some(caps) = ref_def_regex.captures(line) {
            if let Some(reference) = caps.get(1) {
                defined_references.insert(reference.as_str().to_lowercase());
            }
        }

        // Collect every reference-style link or image on the line, not just
        // the first one.
        for caps in link_ref_regex.captures_iter(line) {
            if let Some(reference) = caps.get(2) {
                link_references.push((line_num, reference.as_str().to_lowercase()));
            }
        }
    }

    // Same label used twice on one line is reported once.
    link_references.sort();
    link_references.dedup();

    // Report undefined references
    for (line, reference) in link_references {
        if !defined_references.contains(&reference) {
            errors.push(Issue {
                line,
                message: format!("Undefined link reference '[{}]'", reference),
                context: None,
            });
        }
    }
}

/// Returns `true` when `trimmed` is a thematic break: three or more of the
/// same `-`, `*` or `_` character, optionally separated by whitespace.
fn is_thematic_break(trimmed: &str) -> bool {
    let mut marks = trimmed.chars().filter(|c| !c.is_whitespace());
    let first = match marks.next() {
        Some(c) if c == '-' || c == '*' || c == '_' => c,
        _ => return false,
    };
    let mut count = 1;
    for c in marks {
        if c != first {
            return false;
        }
        count += 1;
    }
    count >= 3
}

/// Returns `true` when `trimmed` (a line without its leading whitespace)
/// opens a list item: a `-`, `*` or `+` bullet, or one to nine digits
/// followed by `.` or `)`, in either case followed by whitespace or the end
/// of the line. Thematic breaks such as `- - -` are not list items.
fn is_list_item_start(trimmed: &str) -> bool {
    if is_thematic_break(trimmed) {
        return false;
    }

    let after_bullet = trimmed.strip_prefix(|c: char| c == '-' || c == '*' || c == '+');
    let after_marker = after_bullet.or_else(|| {
        let digits = trimmed.bytes().take_while(|b| b.is_ascii_digit()).count();
        if (1..=9).contains(&digits) {
            // Digits are ASCII, so `digits` is a valid byte offset.
            trimmed[digits..].strip_prefix(|c: char| c == '.' || c == ')')
        } else {
            None
        }
    });

    after_marker.map_or(false, |rest| {
        rest.is_empty() || rest.starts_with(char::is_whitespace)
    })
}

/// Warns when a list item is indented differently from the item before it.
///
/// Blank lines do not end a list; any other non-item line does. Only genuine
/// list markers count as items, so emphasis (`**bold**`), decimals
/// (`3.14 ...`) and thematic breaks no longer produce spurious warnings.
///
/// * `lines`    – the file split into lines.
/// * `warnings` – sink the findings are appended to; `line` is 1-based and
///   `context` holds the offending line.
fn check_list_consistency(lines: &[&str], warnings: &mut Vec<Issue>) {
    // Indentation of the previous item while inside a list, else `None`.
    let mut previous_indent: Option<usize> = None;

    for (index, line) in lines.iter().enumerate() {
        let trimmed = line.trim_start();

        if is_list_item_start(trimmed) {
            let indent = line.len() - trimmed.len();
            if previous_indent.map_or(false, |previous| previous != indent) {
                warnings.push(Issue {
                    line: index + 1,
                    message: "Inconsistent list indentation".to_string(),
                    context: Some(line.to_string()),
                });
            }
            previous_indent = Some(indent);
        } else if !trimmed.is_empty() {
            previous_indent = None;
        }
    }
}

/// Warns whenever a header is more than one level deeper than the header
/// before it.
///
/// The level before the first header counts as `0`, so a file whose first
/// header is deeper than level 1 is reported too.
fn check_header_consistency(lines: &[&str], warnings: &mut Vec<Issue>) {
    // Pair every header line with its 1-based line number and level.
    let headers = lines.iter().enumerate().filter_map(|(index, line)| {
        detect_header_level(line).map(|level| (index + 1, *line, level))
    });

    let mut previous_level = 0;
    for (line_num, text, level) in headers {
        let skipped_a_level = level > previous_level + 1;
        if skipped_a_level {
            warnings.push(Issue {
                line: line_num,
                message: format!("Header level jump from {} to {}", previous_level, level),
                context: Some(text.to_string()),
            });
        }
        previous_level = level;
    }
}

/// Returns the level (1–6) of an ATX-style header such as `## Header`, or
/// `None` when `line` is not one.
///
/// Surrounding whitespace is ignored. The run of `#` characters must be one
/// to six long and must be followed by a space, a tab, or the end of the
/// line, so text such as `#hashtag` or `#!/bin/sh` is not a header.
fn detect_header_level(line: &str) -> Option<u32> {
    let trimmed = line.trim();

    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return None;
    }

    // `#` is ASCII, so `hashes` is a valid byte offset into `trimmed`.
    let rest = &trimmed[hashes..];
    if rest.is_empty() || rest.starts_with(|c: char| c == ' ' || c == '\t') {
        // `hashes` is at most 6 here, so the cast cannot truncate.
        Some(hashes as u32)
    } else {
        None
    }
}

/// Returns `true` when `text` contains a URL that is not written with link
/// syntax.
///
/// A line containing `[` anywhere is assumed to already use link or image
/// syntax and is never reported. URLs wrapped in angle brackets
/// (`<https://example.com>`) are autolinks and are not reported either; an
/// opening `<` only counts when it is directly attached to the URL and the
/// URL is terminated by `>` before any whitespace.
fn contains_bare_url(text: &str) -> bool {
    const URL_PREFIXES: [&str; 5] = ["http://", "https://", "www.", "ftp://", "mailto:"];

    // Not already in a link (this also covers `![...]` images).
    if text.contains('[') {
        return false;
    }

    // True when the URL fragment starting at byte `start` sits inside `<...>`.
    let inside_autolink = |start: usize| -> bool {
        let opened = match text[..start].rfind('<') {
            Some(open) => !text[open..start].contains(|c: char| c == '>' || c.is_whitespace()),
            None => false,
        };
        let tail = &text[start..];
        let closed = tail
            .find(|c: char| c == '>' || c.is_whitespace())
            .map_or(false, |end| tail[end..].starts_with('>'));
        opened && closed
    };

    URL_PREFIXES.iter().any(|prefix| {
        text.match_indices(prefix)
            .any(|(start, _)| !inside_autolink(start))
    })
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Writes `lines` (one per line) to `test.md` in a fresh temporary
    /// directory and returns the result of checking that file with the
    /// default, non-strict configuration.
    fn check_lines(lines: &[&str]) -> CheckResult {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.md");
        let mut file = File::create(&file_path).unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }

        let config = CheckConfig {
            recursive: false,
            output_format: OutputFormat::Human,
            strict: false,
            ignore_warnings: false,
        };

        check_file(&file_path, &config)
    }

    #[test]
    fn test_check_valid_markdown() {
        let result = check_lines(&[
            "# Valid Header",
            "",
            "This is a paragraph with a [link](http://example.com).",
        ]);
        assert!(result.errors.is_empty());
    }

    #[test]
    fn test_check_invalid_reference() {
        let result = check_lines(&["This has an [undefined link][missing]."]);
        assert!(!result.errors.is_empty());
    }
}