mdbook_lint_core/
document.rs

1use crate::error::{MdBookLintError, Result};
2use comrak::nodes::{AstNode, NodeValue};
3use comrak::{Arena, ComrakOptions, parse_document};
4use std::path::PathBuf;
5
/// A markdown source file loaded for linting.
///
/// Holds the raw text alongside a per-line copy so that rules can work
/// either on the whole content or one line at a time.
#[derive(Debug)]
pub struct Document {
    /// Markdown text exactly as it was handed to the constructor
    pub content: String,
    /// Filesystem path the content is associated with
    pub path: PathBuf,
    /// `content` broken into individual lines for line-oriented rules
    pub lines: Vec<String>,
}
16
17impl Document {
18    /// Parse a markdown document from content and path
19    pub fn new(content: String, path: PathBuf) -> Result<Self> {
20        // Allow empty documents for edge case handling
21        // Some rules need to handle empty files correctly
22
23        // Split content into lines for line-based rules
24        let lines: Vec<String> = content.lines().map(|s| s.to_owned()).collect();
25
26        Ok(Document {
27            content,
28            path,
29            lines,
30        })
31    }
32
33    /// Parse the content into a comrak AST
34    pub fn parse_ast<'a>(&self, arena: &'a Arena<AstNode<'a>>) -> &'a AstNode<'a> {
35        // Configure comrak options for position tracking and compatibility
36        let mut options = ComrakOptions::default();
37        options.extension.strikethrough = true;
38        options.extension.tagfilter = false;
39        options.extension.table = true;
40        options.extension.autolink = true;
41        options.extension.tasklist = true;
42        options.extension.superscript = false;
43        options.extension.header_ids = None;
44        options.extension.footnotes = true;
45        options.extension.description_lists = true;
46        options.extension.front_matter_delimiter = Some("---".to_owned());
47        options.parse.smart = false;
48        options.parse.default_info_string = None;
49        options.parse.relaxed_tasklist_matching = false;
50        options.parse.relaxed_autolinks = false;
51
52        parse_document(arena, &self.content, &options)
53    }
54
55    /// Parse AST with error context
56    pub fn parse_ast_with_context<'a>(
57        &self,
58        arena: &'a Arena<AstNode<'a>>,
59    ) -> Result<&'a AstNode<'a>> {
60        // For now, comrak parsing doesn't typically fail, but we can add validation
61        let ast = self.parse_ast(arena);
62
63        // Basic validation that we got a valid AST
64        if ast.children().count() == 0 && !self.content.trim().is_empty() {
65            return Err(MdBookLintError::Document(
66                "Failed to parse document AST - no content nodes found".to_string(),
67            ));
68        }
69
70        Ok(ast)
71    }
72
73    /// Get the line number (1-based) for a given byte offset
74    pub fn line_number_at_offset(&self, offset: usize) -> usize {
75        let mut current_offset = 0;
76        for (line_idx, line) in self.lines.iter().enumerate() {
77            if current_offset + line.len() >= offset {
78                return line_idx + 1; // 1-based line numbers
79            }
80            current_offset += line.len() + 1; // +1 for newline
81        }
82        self.lines.len() // Return last line if offset is at end
83    }
84
85    /// Get the column number (1-based) for a given byte offset
86    pub fn column_number_at_offset(&self, offset: usize) -> usize {
87        let mut current_offset = 0;
88        for line in &self.lines {
89            if current_offset + line.len() >= offset {
90                return offset - current_offset + 1; // 1-based column numbers
91            }
92            current_offset += line.len() + 1; // +1 for newline
93        }
94        1 // Default to column 1
95    }
96
97    /// Get all heading nodes from the AST
98    pub fn headings<'a>(&self, ast: &'a AstNode<'a>) -> Vec<&'a AstNode<'a>> {
99        let mut headings = Vec::new();
100        self.collect_headings(ast, &mut headings);
101        headings
102    }
103
104    /// Get all heading nodes with error context
105    pub fn headings_with_context<'a>(&self, ast: &'a AstNode<'a>) -> Result<Vec<&'a AstNode<'a>>> {
106        let headings = self.headings(ast);
107        Ok(headings)
108    }
109
110    /// Get all code block nodes from the AST
111    pub fn code_blocks<'a>(&self, ast: &'a AstNode<'a>) -> Vec<&'a AstNode<'a>> {
112        let mut code_blocks = Vec::new();
113        self.collect_code_blocks(ast, &mut code_blocks);
114        code_blocks
115    }
116
117    /// Get all code block nodes with error context
118    pub fn code_blocks_with_context<'a>(
119        &self,
120        ast: &'a AstNode<'a>,
121    ) -> Result<Vec<&'a AstNode<'a>>> {
122        let code_blocks = self.code_blocks(ast);
123        Ok(code_blocks)
124    }
125
126    /// Recursively collect heading nodes
127    #[allow(clippy::only_used_in_recursion)]
128    fn collect_headings<'a>(&self, node: &'a AstNode<'a>, result: &mut Vec<&'a AstNode<'a>>) {
129        if let NodeValue::Heading(..) = &node.data.borrow().value {
130            result.push(node)
131        }
132
133        // Recursively check children
134        for child in node.children() {
135            self.collect_headings(child, result);
136        }
137    }
138
139    /// Recursively collect code block nodes
140    #[allow(clippy::only_used_in_recursion)]
141    fn collect_code_blocks<'a>(&self, node: &'a AstNode<'a>, result: &mut Vec<&'a AstNode<'a>>) {
142        if let NodeValue::CodeBlock(..) = &node.data.borrow().value {
143            result.push(node)
144        }
145
146        // Recursively check children
147        for child in node.children() {
148            self.collect_code_blocks(child, result);
149        }
150    }
151
152    /// Get the heading level for a heading node
153    pub fn heading_level<'a>(node: &'a AstNode<'a>) -> Option<u32> {
154        match &node.data.borrow().value {
155            NodeValue::Heading(heading) => Some(heading.level.into()),
156            _ => None,
157        }
158    }
159
160    /// Get the text content of a node
161    pub fn node_text<'a>(&self, node: &'a AstNode<'a>) -> String {
162        let mut text = String::new();
163        self.collect_text(node, &mut text);
164        text
165    }
166
167    /// Recursively collect text from a node and its children
168    #[allow(clippy::only_used_in_recursion)]
169    fn collect_text<'a>(&self, node: &'a AstNode<'a>, text: &mut String) {
170        match &node.data.borrow().value {
171            NodeValue::Text(t) => text.push_str(t),
172            NodeValue::Code(code) => text.push_str(&code.literal),
173            _ => {
174                for child in node.children() {
175                    self.collect_text(child, text);
176                }
177            }
178        }
179    }
180
181    /// Get the source position of a node
182    pub fn node_position<'a>(&self, node: &'a AstNode<'a>) -> Option<(usize, usize)> {
183        let sourcepos = node.data.borrow().sourcepos;
184        if sourcepos.start.line > 0 {
185            Some((sourcepos.start.line, sourcepos.start.column))
186        } else {
187            None
188        }
189    }
190}
191
#[cfg(test)]
mod tests {
    use super::*;
    use comrak::Arena;
    use std::path::PathBuf;

    /// Build a document from `text` stored under `file`, panicking on failure.
    fn doc_from(text: &str, file: &str) -> Document {
        Document::new(text.to_string(), PathBuf::from(file)).expect("Failed to create document")
    }

    // Content is split into the expected lines.
    #[test]
    fn test_document_creation() {
        let doc = doc_from("# Test\n\nThis is a test.", "test.md");

        assert_eq!(doc.lines.len(), 3);
        assert_eq!(doc.lines, ["# Test", "", "This is a test."]);
    }

    // An empty string is a valid document with no lines.
    #[test]
    fn test_empty_document_allowed() {
        let outcome = Document::new(String::new(), PathBuf::from("empty.md"));
        assert!(outcome.is_ok());

        let document = outcome.unwrap();
        assert_eq!(document.content, "");
        assert_eq!(document.lines.len(), 0);
        assert_eq!(document.path, PathBuf::from("empty.md"));
    }

    // Whitespace-only content is accepted and keeps its lines.
    #[test]
    fn test_whitespace_only_document_allowed() {
        let blank = "   \n  \n  ";
        let outcome = Document::new(blank.to_string(), PathBuf::from("whitespace.md"));
        assert!(outcome.is_ok());

        let document = outcome.unwrap();
        assert_eq!(document.content, blank);
        assert_eq!(document.lines.len(), 3);
        assert_eq!(document.path, PathBuf::from("whitespace.md"));
    }

    // Offsets at the start of each line map to that line.
    #[test]
    fn test_line_number_calculation() {
        let doc = doc_from("Line 1\nLine 2\nLine 3", "test.md");

        // (byte offset, expected 1-based line)
        for (offset, expected) in [(0, 1), (7, 2), (14, 3)] {
            assert_eq!(doc.line_number_at_offset(offset), expected);
        }
    }

    // Headings are found in order with their levels.
    #[test]
    fn test_heading_extraction() {
        let doc = doc_from("# H1\n## H2\n### H3\nText", "test.md");
        let arena = Arena::new();
        let root = doc.parse_ast(&arena);
        let found = doc.headings(root);

        assert_eq!(found.len(), 3);

        let levels: Vec<Option<u32>> = found
            .iter()
            .map(|&heading| Document::heading_level(heading))
            .collect();
        assert_eq!(levels, [Some(1), Some(2), Some(3)]);
    }

    // Non-ASCII content survives construction untouched.
    #[test]
    fn test_document_with_unicode() {
        let doc = doc_from(
            "# 标题\n\n这是一个测试。\n\n```rust\nfn main() {\n    println!(\"Hello, 世界!\");\n}\n```",
            "unicode.md",
        );

        for needle in ["标题", "世界"] {
            assert!(doc.content.contains(needle));
        }
        assert_eq!(doc.lines.len(), 9);
    }

    // A very long line is kept as a single line.
    #[test]
    fn test_document_with_very_long_lines() {
        let long_line = "a".repeat(10000);
        let text = format!("# Test\n\n{long_line}\n\nEnd");

        let doc = doc_from(&text, "long.md");
        assert_eq!(doc.lines.len(), 5);
        assert_eq!(doc.lines[2].len(), 10000);
    }

    // Both `\r\n` and `\n` terminators are stripped from the lines.
    #[test]
    fn test_document_with_mixed_line_endings() {
        let doc = doc_from("Line 1\r\nLine 2\nLine 3\r\nLine 4", "mixed.md");

        assert_eq!(doc.lines.len(), 4);
        assert_eq!(doc.lines, ["Line 1", "Line 2", "Line 3", "Line 4"]);
    }

    // Newline-only content yields one empty line per newline.
    #[test]
    fn test_document_with_only_newlines() {
        let doc = doc_from("\n\n\n\n", "newlines.md");

        assert_eq!(doc.lines.len(), 4);
        assert!(doc.lines.iter().all(|line| line == ""));
    }

    // The root node reports a 1-based position.
    #[test]
    fn test_node_position_edge_cases() {
        let doc = doc_from("# Test\n\n```rust\ncode\n```\n\n- Item", "test.md");
        let arena = Arena::new();
        let root = doc.parse_ast(&arena);

        let position = doc.node_position(root);
        assert!(position.is_some());

        let (line, col) = position.unwrap();
        assert!(line >= 1);
        assert!(col >= 1);
    }

    // Fenced and indented code blocks are both collected.
    #[test]
    fn test_code_blocks_extraction() {
        let markdown = r#"# Test

```rust
fn main() {}
```

Some text.

```bash
echo "hello"
```

    // Indented code block
    let x = 5;

```
No language
```
"#;

        let doc = doc_from(markdown, "test.md");
        let arena = Arena::new();
        let root = doc.parse_ast(&arena);
        let blocks = doc.code_blocks(root);

        assert!(blocks.len() >= 3);
    }

    // A document with several link styles parses and still exposes its heading.
    #[test]
    fn test_links_extraction() {
        let markdown = r#"# Test

[Link 1](http://example.com)

[Link 2](./relative.md)

<https://autolink.com>

[Reference link][ref]

[ref]: http://reference.com
"#;

        let doc = doc_from(markdown, "test.md");
        let arena = Arena::new();
        let root = doc.parse_ast(&arena);

        assert!(!doc.content.is_empty());
        assert!(!doc.lines.is_empty());

        let has_heading = root.descendants().any(|node| {
            matches!(
                node.data.borrow().value,
                comrak::nodes::NodeValue::Heading(_)
            )
        });
        assert!(has_heading, "Expected to find heading in parsed AST");
    }

    // Offsets past the end and at line starts resolve to sensible lines.
    #[test]
    fn test_line_number_at_offset_edge_cases() {
        let doc = doc_from("a\nb\nc", "test.md");

        // Offset beyond the content
        assert!(doc.line_number_at_offset(100) >= 1);

        // (byte offset, expected 1-based line) at the first byte of each line
        for (offset, expected) in [(0, 1), (2, 2), (4, 3)] {
            assert_eq!(doc.line_number_at_offset(offset), expected);
        }
    }

    // Content using extension syntax parses into a non-trivial tree.
    #[test]
    fn test_ast_parsing_with_extensions() {
        let markdown = r#"# Test

| Table | Header |
|-------|--------|
| Cell  | Data   |

~~Strikethrough~~

- [x] Task done
- [ ] Task pending

^Super^script

[^footnote]: Footnote content

Front matter:
---
title: Test
---
"#;

        let doc = doc_from(markdown, "test.md");
        let arena = Arena::new();
        let root = doc.parse_ast(&arena);

        for needle in [
            "~~Strikethrough~~",
            "| Table | Header |",
            "- [x] Task done",
            "title: Test",
        ] {
            assert!(doc.content.contains(needle));
        }

        let node_count = root.descendants().count();
        assert!(
            node_count > 5,
            "Expected AST to contain multiple nodes, got {node_count}"
        );
    }

    // An empty path is stored as given.
    #[test]
    fn test_empty_path() {
        let doc = Document::new("# Test".to_string(), PathBuf::new())
            .expect("Failed to create document");
        assert_eq!(doc.path, PathBuf::new());
    }

    // A document mixing many constructs still yields headings and code blocks.
    #[test]
    fn test_complex_nested_structure() {
        let markdown = r#"# Main Title

## Section 1

### Subsection

Some text with **bold** and *italic*.

> Blockquote with `code` inside.
>
> > Nested blockquote.

1. Ordered list
   - Nested unordered
   - Another item
2. Second ordered item

```rust
// Code with comments
fn complex_function() {
    println!("Complex: {}", "test");
}
```

Final paragraph.
"#;

        let doc = doc_from(markdown, "complex.md");
        let arena = Arena::new();
        let root = doc.parse_ast(&arena);

        assert!(doc.headings(root).len() >= 3);
        assert!(!doc.code_blocks(root).is_empty());

        for needle in [
            "# Main Title",
            "## Section 1",
            "### Subsection",
            "```rust",
            "Final paragraph",
        ] {
            assert!(doc.content.contains(needle));
        }

        assert!(
            doc.lines.len() > 10,
            "Expected multiple lines in complex document"
        );
    }
}