mdcheck/
lib.rs

1use glob::glob;
2use pulldown_cmark::{Event, Options, Parser};
3use serde::Serialize;
4use std::collections::HashSet;
5use std::fs;
6use std::path::{Path, PathBuf};
7use thiserror::Error;
8
9#[derive(Debug, Error)]
10pub enum CheckError {
11    #[error("IO error: {0}")]
12    Io(#[from] std::io::Error),
13    #[error("Invalid UTF-8 in file: {0}")]
14    Utf8Error(#[from] std::string::FromUtf8Error),
15}
16
17#[derive(Debug, Clone, PartialEq)] // Added PartialEq
18pub struct CheckConfig {
19    pub recursive: bool,
20    pub output_format: OutputFormat,
21    pub strict: bool,
22    pub ignore_warnings: bool,
23}
24
/// Report format selected through [`CheckConfig::output_format`].
///
/// The enum carries no data, so it is `Copy`: values can be passed around and
/// compared without cloning or borrowing.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum OutputFormat {
    /// Output intended for people (the exact layout is decided by the caller).
    Human,
    /// Output intended for machines, as JSON.
    Json,
}
30
31#[derive(Debug, Serialize)]
32pub struct Issue {
33    pub line: usize,
34    pub message: String,
35    pub context: Option<String>,
36}
37
38#[derive(Debug, Serialize)]
39pub struct CheckResult {
40    pub file_path: PathBuf,
41    pub errors: Vec<Issue>,
42    pub warnings: Vec<Issue>,
43}
44
45pub fn check_files(paths: &[PathBuf], config: &CheckConfig) -> Vec<CheckResult> {
46    let mut all_files = Vec::new();
47
48    for path in paths {
49        if path.is_dir() {
50            if config.recursive {
51                let pattern = path.join("**/*.md").to_string_lossy().to_string();
52                if let Ok(entries) = glob(&pattern) {
53                    for entry in entries.flatten() {
54                        all_files.push(entry);
55                    }
56                }
57            } else {
58                if let Ok(entries) = fs::read_dir(path) {
59                    for entry in entries.flatten() {
60                        let file_path = entry.path();
61                        if file_path
62                            .extension()
63                            .map_or(false, |ext| ext == "md" || ext == "markdown")
64                        {
65                            all_files.push(file_path);
66                        }
67                    }
68                }
69            }
70        } else if path.is_file() {
71            all_files.push(path.clone());
72        }
73    }
74
75    all_files
76        .iter()
77        .map(|file_path| check_file(file_path, config))
78        .collect()
79}
80
81pub fn check_file(file_path: &Path, _config: &CheckConfig) -> CheckResult {
82    let mut errors = Vec::new();
83    let mut warnings = Vec::new();
84
85    let content = match fs::read_to_string(file_path) {
86        Ok(content) => content,
87        Err(e) => {
88            errors.push(Issue {
89                line: 0,
90                message: format!("Failed to read file: {}", e),
91                context: None,
92            });
93            return CheckResult {
94                file_path: file_path.to_path_buf(),
95                errors,
96                warnings,
97            };
98        }
99    };
100
101    let lines: Vec<&str> = content.lines().collect();
102    let mut options = Options::empty();
103    options.insert(Options::ENABLE_TABLES);
104    options.insert(Options::ENABLE_FOOTNOTES);
105    options.insert(Options::ENABLE_STRIKETHROUGH);
106    options.insert(Options::ENABLE_TASKLISTS);
107
108    let parser = Parser::new_ext(&content, options);
109    let events: Vec<Event> = parser.collect();
110
111    // Check for CommonMark compliance issues
112    check_markdown_structure(&lines, &events, &mut errors, &mut warnings);
113    check_link_references(&lines, &mut errors);
114    check_list_consistency(&lines, &mut warnings);
115    check_header_consistency(&lines, &mut warnings);
116
117    CheckResult {
118        file_path: file_path.to_path_buf(),
119        errors,
120        warnings,
121    }
122}
123
124fn check_markdown_structure(
125    lines: &[&str],
126    events: &[Event],
127    errors: &mut Vec<Issue>,
128    warnings: &mut Vec<Issue>,
129) {
130    let mut in_code_block = false;
131    let mut line_num = 0;
132
133    for line in lines {
134        line_num += 1;
135        let trimmed = line.trim();
136
137        // Check for code block boundaries
138        if trimmed.starts_with("```") {
139            in_code_block = !in_code_block;
140            continue;
141        }
142
143        if in_code_block {
144            continue; // Skip validation inside code blocks
145        }
146
147        // Check for inconsistent indentation
148        if !trimmed.is_empty() {
149            let leading_spaces = line.len() - line.trim_start().len();
150            if leading_spaces % 4 != 0 && leading_spaces % 2 != 0 && leading_spaces > 0 {
151                warnings.push(Issue {
152                    line: line_num,
153                    message: "Inconsistent indentation (prefer 2 or 4 spaces)".to_string(),
154                    context: Some(line.to_string()),
155                });
156            }
157        }
158
159        // Check for bare URLs without link syntax
160        if contains_bare_url(trimmed) {
161            warnings.push(Issue {
162                line: line_num,
163                message: "Bare URL detected (consider using link syntax)".to_string(),
164                context: Some(line.to_string()),
165            });
166        }
167    }
168
169    // Check for unclosed elements
170    let mut open_elements = Vec::new();
171    for event in events {
172        match event {
173            Event::Start(tag) => {
174                open_elements.push((tag.clone(), line_num));
175            }
176            Event::End(tag) => {
177                if let Some((last_tag, _)) = open_elements.last() {
178                    if last_tag == tag {
179                        open_elements.pop();
180                    }
181                }
182            }
183            _ => {}
184        }
185    }
186
187    for (tag, line) in open_elements {
188        errors.push(Issue {
189            line,
190            message: format!("Unclosed {:?} element", tag),
191            context: None,
192        });
193    }
194}
195
196fn check_link_references(lines: &[&str], errors: &mut Vec<Issue>) {
197    let mut link_references = HashSet::new();
198    let mut defined_references = HashSet::new();
199    let mut line_num = 0;
200
201    // Use regex crate for pattern matching
202    let ref_def_regex = regex::Regex::new(r"^\[([^\]]+)\]:\s*(.+)$").unwrap();
203    let link_ref_regex = regex::Regex::new(r"!?\[([^\]]*)\]\[([^\]]+)\]").unwrap();
204
205    for line in lines {
206        line_num += 1;
207
208        // Check for link reference definitions
209        if let Some(caps) = ref_def_regex.captures(line) {
210            if let Some(reference) = caps.get(1) {
211                defined_references.insert(reference.as_str().to_lowercase());
212            }
213        }
214
215        // Check for inline links and images that might need reference definitions
216        if let Some(caps) = link_ref_regex.captures(line) {
217            if let Some(reference) = caps.get(2) {
218                link_references.insert((reference.as_str().to_lowercase(), line_num));
219            }
220        }
221    }
222
223    // Report undefined references
224    for (reference, line) in link_references {
225        if !defined_references.contains(&reference) {
226            errors.push(Issue {
227                line,
228                message: format!("Undefined link reference '[{}]'", reference),
229                context: None,
230            });
231        }
232    }
233}
234
235fn check_list_consistency(lines: &[&str], warnings: &mut Vec<Issue>) {
236    let mut in_list = false;
237    let mut list_indent = 0;
238    let mut line_num = 0;
239
240    for line in lines {
241        line_num += 1;
242        let trimmed = line.trim_start();
243        let indent = line.len() - trimmed.len();
244
245        if trimmed.starts_with('-') || trimmed.starts_with('*') || trimmed.starts_with('+') {
246            if in_list && indent != list_indent {
247                warnings.push(Issue {
248                    line: line_num,
249                    message: "Inconsistent list indentation".to_string(),
250                    context: Some(line.to_string()),
251                });
252            }
253            in_list = true;
254            list_indent = indent;
255        } else if trimmed.starts_with(|c: char| c.is_ascii_digit()) && trimmed.contains('.') {
256            // Ordered list item
257            if in_list && indent != list_indent {
258                warnings.push(Issue {
259                    line: line_num,
260                    message: "Inconsistent list indentation".to_string(),
261                    context: Some(line.to_string()),
262                });
263            }
264            in_list = true;
265            list_indent = indent;
266        } else if !trimmed.is_empty() && !line.trim().is_empty() {
267            in_list = false;
268        }
269    }
270}
271
272fn check_header_consistency(lines: &[&str], warnings: &mut Vec<Issue>) {
273    let mut previous_level = 0;
274    let mut line_num = 0;
275
276    for line in lines {
277        line_num += 1;
278
279        if let Some(level) = detect_header_level(line) {
280            if level > previous_level + 1 {
281                warnings.push(Issue {
282                    line: line_num,
283                    message: format!("Header level jump from {} to {}", previous_level, level),
284                    context: Some(line.to_string()),
285                });
286            }
287            previous_level = level;
288        }
289    }
290}
291
/// Returns the level (1–6) of an ATX heading such as `## Title`.
///
/// Surrounding whitespace is ignored. The run of `#` characters must be one
/// to six long and must be followed by a space, a tab or the end of the line,
/// as CommonMark requires; `#hashtag` and `####### seven` are therefore not
/// headings. Setext (underlined) headings are not detected.
fn detect_header_level(line: &str) -> Option<u32> {
    let trimmed = line.trim();

    let hashes = trimmed.bytes().take_while(|&b| b == b'#').count();
    if hashes == 0 || hashes > 6 {
        return None;
    }

    // `#` is a single byte, so `hashes` is a valid byte offset into `trimmed`.
    let rest = &trimmed[hashes..];
    if rest.is_empty() || rest.starts_with(' ') || rest.starts_with('\t') {
        // `hashes` is between 1 and 6 here, so the cast cannot truncate.
        Some(hashes as u32)
    } else {
        None
    }
}
305
/// Heuristic: does `text` mention a URL without using Markdown link syntax?
///
/// Any `[` in the text is taken as evidence of link or image syntax and
/// suppresses the report (this also covers image syntax, which always
/// contains `[`). Otherwise the text is searched for a few well-known URL
/// prefixes.
fn contains_bare_url(text: &str) -> bool {
    const URL_MARKERS: [&str; 5] = ["http://", "https://", "www.", "ftp://", "mailto:"];

    if text.contains('[') {
        return false;
    }

    URL_MARKERS.iter().any(|marker| text.contains(*marker))
}
313
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs::File;
    use std::io::Write;
    use tempfile::tempdir;

    /// Configuration used by every test: non-recursive, human output, and no
    /// strictness or warning suppression.
    fn plain_config() -> CheckConfig {
        CheckConfig {
            recursive: false,
            output_format: OutputFormat::Human,
            strict: false,
            ignore_warnings: false,
        }
    }

    /// Writes `lines` (each terminated by a newline) to `test.md` inside a
    /// fresh temporary directory. The directory guard is returned alongside
    /// the path so the file outlives the call.
    fn markdown_fixture(lines: &[&str]) -> (tempfile::TempDir, std::path::PathBuf) {
        let dir = tempdir().unwrap();
        let file_path = dir.path().join("test.md");
        let mut file = File::create(&file_path).unwrap();
        for line in lines {
            writeln!(file, "{}", line).unwrap();
        }
        (dir, file_path)
    }

    #[test]
    fn test_check_valid_markdown() {
        let (_dir, file_path) = markdown_fixture(&[
            "# Valid Header",
            "",
            "This is a paragraph with a [link](http://example.com).",
        ]);

        let result = check_file(&file_path, &plain_config());
        assert!(result.errors.is_empty());
    }

    #[test]
    fn test_check_invalid_reference() {
        let (_dir, file_path) = markdown_fixture(&["This has an [undefined link][missing]."]);

        let result = check_file(&file_path, &plain_config());
        assert!(!result.errors.is_empty());
    }
}