Skip to main content

mdlint/markdown/
parser.rs

1use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
2use std::collections::HashSet;
3use std::ops::Range;
4
/// A borrowed view over a Markdown document.
///
/// Holds the original text together with a per-line index so callers can move between
/// byte offsets and 1-indexed line numbers.
#[derive(Debug, Clone)]
pub struct MarkdownParser<'a> {
    /// The complete, unmodified source text.
    content: &'a str,
    /// Slices of `content`, one per line, without their line terminators.
    lines: Vec<&'a str>,
}
9
10impl<'a> MarkdownParser<'a> {
11    pub fn new(content: &'a str) -> Self {
12        let lines = content.lines().collect();
13        Self { content, lines }
14    }
15
16    pub fn content(&self) -> &'a str {
17        self.content
18    }
19
20    pub fn lines(&self) -> &[&'a str] {
21        &self.lines
22    }
23
24    pub fn line_count(&self) -> usize {
25        self.lines.len()
26    }
27
28    pub fn get_line(&self, line_num: usize) -> Option<&'a str> {
29        if line_num > 0 && line_num <= self.lines.len() {
30            Some(self.lines[line_num - 1])
31        } else {
32            None
33        }
34    }
35
36    pub fn parse(&self) -> impl Iterator<Item = Event<'a>> + 'a {
37        Parser::new_ext(self.content, Self::options())
38    }
39
40    pub fn parse_with_offsets(&self) -> impl Iterator<Item = (Event<'a>, Range<usize>)> {
41        Parser::new_ext(self.content, Self::options()).into_offset_iter()
42    }
43
44    fn options() -> Options {
45        let mut options = Options::empty();
46        options.insert(Options::ENABLE_TABLES);
47        options.insert(Options::ENABLE_FOOTNOTES);
48        options.insert(Options::ENABLE_STRIKETHROUGH);
49        options.insert(Options::ENABLE_TASKLISTS);
50        options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
51        options
52    }
53
54    pub fn offset_to_line(&self, offset: usize) -> usize {
55        self.offset_to_position(offset).0
56    }
57
58    pub fn offset_to_position(&self, offset: usize) -> (usize, usize) {
59        let mut current_offset = 0;
60        for (line_num, line) in self.lines.iter().enumerate() {
61            let line_len = line.len() + 1;
62            if offset < current_offset + line_len {
63                let column = offset - current_offset + 1;
64                return (line_num + 1, column);
65            }
66            current_offset += line_len;
67        }
68        (self.lines.len(), 1)
69    }
70
71    /// Returns a set of line numbers that are inside code blocks or inline code.
72    /// This is useful for rules that should ignore code content.
73    ///
74    /// Note: For inline code, this marks the entire line as code. For more precise
75    /// detection, use `get_code_ranges()` instead.
76    pub fn get_code_line_numbers(&self) -> HashSet<usize> {
77        let mut code_lines = HashSet::new();
78        let mut in_code_block = false;
79
80        for (event, range) in self.parse_with_offsets() {
81            match event {
82                Event::Start(Tag::CodeBlock(_)) => {
83                    in_code_block = true;
84                    // Add all lines in this code block
85                    let start_line = self.offset_to_line(range.start);
86                    let end_line = self.offset_to_line(range.end);
87                    for line in start_line..=end_line {
88                        code_lines.insert(line);
89                    }
90                }
91                Event::End(TagEnd::CodeBlock) => {
92                    in_code_block = false;
93                }
94                Event::Code(_) => {
95                    // For inline code, we mark the whole line as code
96                    // This is conservative but simpler than tracking ranges
97                    let start_line = self.offset_to_line(range.start);
98                    let end_line = self.offset_to_line(range.end);
99                    for line in start_line..=end_line {
100                        code_lines.insert(line);
101                    }
102                }
103                _ => {
104                    // If we're in a code block, mark these lines too
105                    if in_code_block {
106                        let start_line = self.offset_to_line(range.start);
107                        let end_line = self.offset_to_line(range.end);
108                        for line in start_line..=end_line {
109                            code_lines.insert(line);
110                        }
111                    }
112                }
113            }
114        }
115
116        code_lines
117    }
118
119    /// Returns a set of line numbers that are inside code BLOCKS only (not inline code).
120    /// This is useful for rules that need to check list markers, URLs, etc. that might
121    /// legitimately appear on lines with inline code.
122    /// Lines are 1-indexed to match violation reporting.
123    pub fn get_code_block_line_numbers(&self) -> HashSet<usize> {
124        let mut code_lines = HashSet::new();
125        let mut in_code_block = false;
126
127        for (event, range) in self.parse_with_offsets() {
128            match event {
129                Event::Start(Tag::CodeBlock(_)) => {
130                    in_code_block = true;
131                    // Add all lines in this code block
132                    let start_line = self.offset_to_line(range.start);
133                    let end_line = self.offset_to_line(range.end);
134                    for line in start_line..=end_line {
135                        code_lines.insert(line);
136                    }
137                }
138                Event::End(TagEnd::CodeBlock) => {
139                    in_code_block = false;
140                }
141                _ => {
142                    // If we're in a code block, mark these lines too
143                    if in_code_block {
144                        let start_line = self.offset_to_line(range.start);
145                        let end_line = self.offset_to_line(range.end);
146                        for line in start_line..=end_line {
147                            code_lines.insert(line);
148                        }
149                    }
150                }
151            }
152        }
153
154        code_lines
155    }
156
157    /// Returns a vector of byte ranges that are inside code (blocks or inline).
158    /// This is more precise than `get_code_line_numbers()` for inline code.
159    pub fn get_code_ranges(&self) -> Vec<Range<usize>> {
160        let mut code_ranges = Vec::new();
161        let mut in_code_block = false;
162        let mut code_block_start = 0;
163
164        for (event, range) in self.parse_with_offsets() {
165            match event {
166                Event::Start(Tag::CodeBlock(_)) => {
167                    in_code_block = true;
168                    code_block_start = range.start;
169                }
170                Event::End(TagEnd::CodeBlock) => {
171                    if in_code_block {
172                        code_ranges.push(code_block_start..range.end);
173                        in_code_block = false;
174                    }
175                }
176                Event::Code(_) => {
177                    // Inline code span - add its byte range
178                    code_ranges.push(range);
179                }
180                _ => {}
181            }
182        }
183
184        code_ranges
185    }
186
187    /// Converts a byte offset within a line to an absolute byte offset in the content.
188    /// line_num is 1-indexed, byte_offset_in_line is 0-indexed from start of line.
189    pub fn line_offset_to_absolute(&self, line_num: usize, byte_offset_in_line: usize) -> usize {
190        let mut current_offset = 0;
191        for (i, line) in self.lines.iter().enumerate() {
192            if i + 1 == line_num {
193                return current_offset + byte_offset_in_line;
194            }
195            current_offset += line.len() + 1; // +1 for newline
196        }
197        current_offset
198    }
199
200    pub fn is_heading(&self, event: &Event) -> bool {
201        matches!(event, Event::Start(Tag::Heading { .. }))
202    }
203
204    pub fn is_code_block(&self, event: &Event) -> bool {
205        matches!(event, Event::Start(Tag::CodeBlock(_)))
206    }
207
208    pub fn is_list(&self, event: &Event) -> bool {
209        matches!(event, Event::Start(Tag::List(_)))
210    }
211}
212
#[cfg(test)]
mod tests {
    use super::*;

    /// The parser keeps the original text and counts blank lines as lines.
    #[test]
    fn test_basic_parsing() {
        let content = "# Heading\n\nSome **bold** text.";
        let parser = MarkdownParser::new(content);

        assert_eq!(parser.content(), content);
        assert_eq!(parser.line_count(), 3);
    }

    /// `get_line` is 1-indexed and rejects both 0 and past-the-end numbers.
    #[test]
    fn test_get_line() {
        let content = "Line 1\nLine 2\nLine 3";
        let parser = MarkdownParser::new(content);

        assert_eq!(parser.get_line(1), Some("Line 1"));
        assert_eq!(parser.get_line(2), Some("Line 2"));
        assert_eq!(parser.get_line(3), Some("Line 3"));
        assert_eq!(parser.get_line(0), None);
        assert_eq!(parser.get_line(4), None);
    }

    #[test]
    fn test_offset_to_line() {
        let content = "Line 1\nLine 2\nLine 3";
        let parser = MarkdownParser::new(content);

        assert_eq!(parser.offset_to_line(0), 1);
        assert_eq!(parser.offset_to_line(3), 1);
        assert_eq!(parser.offset_to_line(7), 2);
        assert_eq!(parser.offset_to_line(14), 3);
    }

    #[test]
    fn test_offset_to_position() {
        let content = "Line 1\nLine 2\nLine 3";
        let parser = MarkdownParser::new(content);

        assert_eq!(parser.offset_to_position(0), (1, 1));
        assert_eq!(parser.offset_to_position(3), (1, 4));
        assert_eq!(parser.offset_to_position(7), (2, 1));
    }

    /// Line-relative offsets map back to absolute byte offsets in the content.
    #[test]
    fn test_line_offset_to_absolute() {
        let content = "Line 1\nLine 2\nLine 3";
        let parser = MarkdownParser::new(content);

        assert_eq!(parser.line_offset_to_absolute(1, 0), 0);
        assert_eq!(parser.line_offset_to_absolute(2, 3), 10);
        assert_eq!(parser.line_offset_to_absolute(3, 0), 14);
    }

    #[test]
    fn test_parse_events() {
        let content = "# Heading";
        let parser = MarkdownParser::new(content);

        let events: Vec<_> = parser.parse().collect();
        assert!(!events.is_empty());
        assert!(parser.is_heading(&events[0]));
    }

    #[test]
    fn test_parse_with_offsets() {
        let content = "# Heading\n\nParagraph";
        let parser = MarkdownParser::new(content);

        let events: Vec<_> = parser.parse_with_offsets().collect();
        assert!(!events.is_empty());
    }

    #[test]
    fn test_event_type_checks() {
        let content = "# Heading\n\n```rust\ncode\n```\n\n- item";
        let parser = MarkdownParser::new(content);

        let events: Vec<_> = parser.parse().collect();

        assert!(events.iter().any(|e| parser.is_heading(e)));
        assert!(events.iter().any(|e| parser.is_code_block(e)));
        assert!(events.iter().any(|e| parser.is_list(e)));
    }

    #[test]
    fn test_code_line_numbers_fenced() {
        let content = "Normal text\n\n```sql\nSELECT * FROM table_name\nWHERE user_id = 123\n```\n\nMore text";
        let parser = MarkdownParser::new(content);
        let code_lines = parser.get_code_line_numbers();

        // Lines 3-6 should be marked as code (the ``` markers and content)
        let expected_code = [
            (3, "Line 3 (opening ```) should be code"),
            (4, "Line 4 (code content) should be code"),
            (5, "Line 5 (code content) should be code"),
            (6, "Line 6 (closing ```) should be code"),
        ];
        for (line, message) in expected_code {
            assert!(code_lines.contains(&line), "{}", message);
        }

        // Other lines should not be marked
        let expected_plain = [
            (1, "Line 1 should not be code"),
            (2, "Line 2 should not be code"),
            (8, "Line 8 should not be code"),
        ];
        for (line, message) in expected_plain {
            assert!(!code_lines.contains(&line), "{}", message);
        }
    }

    #[test]
    fn test_code_line_numbers_inline() {
        let content = "This is `inline_code_with_underscores` in text";
        let parser = MarkdownParser::new(content);
        let code_lines = parser.get_code_line_numbers();

        // Line 1 should be marked because it contains inline code
        assert!(
            code_lines.contains(&1),
            "Line with inline code should be marked"
        );
    }

    #[test]
    fn test_code_line_numbers_mixed() {
        let content =
            "Normal text\n\nText with `inline_code` here\n\n```\nCode block\n```\n\nFinal text";
        let parser = MarkdownParser::new(content);
        let code_lines = parser.get_code_line_numbers();

        // Line 3 has inline code
        assert!(
            code_lines.contains(&3),
            "Line with inline code should be marked"
        );

        // Lines 5-7 are in code block
        for line in 5..=7 {
            assert!(code_lines.contains(&line), "Code block line should be marked");
        }

        // Lines 1, 2, 9 are normal text
        assert!(
            !code_lines.contains(&1),
            "Normal text line should not be marked"
        );
        assert!(!code_lines.contains(&2), "Empty line should not be marked");
        assert!(
            !code_lines.contains(&9),
            "Normal text line should not be marked"
        );
    }

    /// The block-only variant must not mark lines that merely contain inline code.
    #[test]
    fn test_code_block_line_numbers_ignore_inline() {
        let content = "Text with `inline_code` here\n\n```\nCode block\n```\n\nFinal text";
        let parser = MarkdownParser::new(content);
        let code_lines = parser.get_code_block_line_numbers();

        assert!(
            !code_lines.contains(&1),
            "Line with only inline code should not be marked"
        );
        for line in 3..=5 {
            assert!(code_lines.contains(&line), "Code block line should be marked");
        }
        assert!(
            !code_lines.contains(&7),
            "Normal text line should not be marked"
        );
    }

    /// Byte ranges are reported in document order, one per inline span or code block.
    #[test]
    fn test_code_ranges() {
        let content = "Use `foo` here\n\n```\nbar\n```\n";
        let parser = MarkdownParser::new(content);
        let ranges = parser.get_code_ranges();

        assert_eq!(ranges.len(), 2);
        assert!(content[ranges[0].clone()].contains("foo"));
        assert!(content[ranges[1].clone()].contains("bar"));
    }
}