Skip to main content

mdlint/markdown/
parser.rs

1use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
2use std::collections::HashSet;
3use std::ops::Range;
4
/// A Markdown document together with lookup tables that are computed once at
/// construction time.
///
/// The struct borrows the source text for `'a`; it never copies the document.
/// Line numbers stored in the tables are 1-indexed, byte offsets are 0-indexed.
#[derive(Debug, Clone)]
pub struct MarkdownParser<'a> {
    /// The complete, unmodified source text.
    content: &'a str,
    /// The source split with `str::lines` (line terminators stripped).
    lines: Vec<&'a str>,
    /// Byte offset of the start of each line (0-indexed).
    /// Enables O(log n) offset → (line, column) lookup via binary search.
    line_offsets: Vec<usize>,
    /// Lines (1-indexed) that fall inside a fenced/indented code block.
    code_block_lines: HashSet<usize>,
    /// Lines (1-indexed) inside any code (blocks + inline spans).
    code_lines: HashSet<usize>,
    /// Byte ranges of all code blocks and inline code spans.
    code_ranges: Vec<Range<usize>>,
}
18
19impl<'a> MarkdownParser<'a> {
20    pub fn new(content: &'a str) -> Self {
21        let lines: Vec<&'a str> = content.lines().collect();
22        let line_offsets = build_line_offsets(content);
23        let (code_block_lines, code_lines, code_ranges) = build_code_info(content, &line_offsets);
24        Self {
25            content,
26            lines,
27            line_offsets,
28            code_block_lines,
29            code_lines,
30            code_ranges,
31        }
32    }
33
34    pub fn content(&self) -> &'a str {
35        self.content
36    }
37
38    pub fn lines(&self) -> &[&'a str] {
39        &self.lines
40    }
41
42    pub fn line_count(&self) -> usize {
43        self.lines.len()
44    }
45
46    pub fn get_line(&self, line_num: usize) -> Option<&'a str> {
47        if line_num > 0 && line_num <= self.lines.len() {
48            Some(self.lines[line_num - 1])
49        } else {
50            None
51        }
52    }
53
54    pub fn parse(&self) -> impl Iterator<Item = Event<'a>> + 'a {
55        Parser::new_ext(self.content, mk_options())
56    }
57
58    pub fn parse_with_offsets(&self) -> impl Iterator<Item = (Event<'a>, Range<usize>)> {
59        Parser::new_ext(self.content, mk_options()).into_offset_iter()
60    }
61
62    pub fn offset_to_line(&self, offset: usize) -> usize {
63        self.offset_to_position(offset).0
64    }
65
66    pub fn offset_to_position(&self, offset: usize) -> (usize, usize) {
67        // partition_point returns the count of elements for which the predicate holds —
68        // i.e. the index of the first line whose start offset exceeds `offset`.
69        let i = self.line_offsets.partition_point(|&start| start <= offset);
70        if i == 0 {
71            return (1, 1);
72        }
73        let line_idx = i - 1; // 0-indexed
74        let column = offset - self.line_offsets[line_idx] + 1;
75        (line_idx + 1, column) // 1-indexed
76    }
77
78    /// Returns the 1-indexed line numbers inside code blocks or inline code.
79    /// Result is precomputed in `new()` — O(1) to access.
80    pub fn get_code_line_numbers(&self) -> &HashSet<usize> {
81        &self.code_lines
82    }
83
84    /// Returns the 1-indexed line numbers inside code blocks only (not inline spans).
85    /// Result is precomputed in `new()` — O(1) to access.
86    pub fn get_code_block_line_numbers(&self) -> &HashSet<usize> {
87        &self.code_block_lines
88    }
89
90    /// Returns byte ranges (into the original content) for all code blocks and
91    /// inline code spans. Result is precomputed in `new()` — O(1) to access.
92    pub fn get_code_ranges(&self) -> &[Range<usize>] {
93        &self.code_ranges
94    }
95
96    /// Converts a (1-indexed) line number and 0-indexed byte offset within that
97    /// line to an absolute byte offset in the content.
98    pub fn line_offset_to_absolute(&self, line_num: usize, byte_offset_in_line: usize) -> usize {
99        if line_num == 0 || line_num > self.line_offsets.len() {
100            return self.content.len();
101        }
102        self.line_offsets[line_num - 1] + byte_offset_in_line
103    }
104
105    pub fn is_heading(&self, event: &Event) -> bool {
106        matches!(event, Event::Start(Tag::Heading { .. }))
107    }
108
109    pub fn is_code_block(&self, event: &Event) -> bool {
110        matches!(event, Event::Start(Tag::CodeBlock(_)))
111    }
112
113    pub fn is_list(&self, event: &Event) -> bool {
114        matches!(event, Event::Start(Tag::List(_)))
115    }
116}
117
118fn mk_options() -> Options {
119    let mut options = Options::empty();
120    options.insert(Options::ENABLE_TABLES);
121    options.insert(Options::ENABLE_FOOTNOTES);
122    options.insert(Options::ENABLE_STRIKETHROUGH);
123    options.insert(Options::ENABLE_TASKLISTS);
124    options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
125    options
126}
127
/// Computes where every line of `content` begins, as byte offsets: element `i`
/// is the start of line `i + 1`, and the first element is always 0.
///
/// Lines are delimited by `\n` only, so a CRLF pair simply leaves the `\r` at
/// the end of the preceding line. A newline that is the very last byte does
/// not open another (empty) line.
fn build_line_offsets(content: &str) -> Vec<usize> {
    let total_len = content.len();
    let starts_after_newlines = content
        .match_indices('\n')
        .map(|(newline_pos, _)| newline_pos + 1)
        // Skip the position after a trailing newline: no line begins there.
        .filter(|&line_start| line_start < total_len);

    std::iter::once(0).chain(starts_after_newlines).collect()
}
143
/// Finds the 1-indexed line that contains byte `offset`, given the sorted
/// table of line start offsets. Uses a binary search, so the cost is
/// logarithmic in the number of lines. Never returns less than 1.
fn line_from_offset(offset: usize, line_offsets: &[usize]) -> usize {
    // The number of lines starting at or before `offset` is exactly the
    // 1-indexed number of the line containing it.
    match line_offsets.partition_point(|&line_start| line_start <= offset) {
        0 => 1,
        lines_started => lines_started,
    }
}
150
151/// Single parse pass that builds all three code-location caches simultaneously.
152/// Called once in `MarkdownParser::new()`.
153fn build_code_info(
154    content: &str,
155    line_offsets: &[usize],
156) -> (HashSet<usize>, HashSet<usize>, Vec<Range<usize>>) {
157    let mut code_block_lines: HashSet<usize> = HashSet::new();
158    let mut code_lines: HashSet<usize> = HashSet::new();
159    let mut code_ranges: Vec<Range<usize>> = Vec::new();
160
161    let mut in_code_block = false;
162    let mut code_block_start = 0usize;
163
164    for (event, range) in Parser::new_ext(content, mk_options()).into_offset_iter() {
165        match event {
166            Event::Start(Tag::CodeBlock(_)) => {
167                in_code_block = true;
168                code_block_start = range.start;
169                let start_line = line_from_offset(range.start, line_offsets);
170                let end_line = line_from_offset(range.end, line_offsets);
171                for line in start_line..=end_line {
172                    code_block_lines.insert(line);
173                    code_lines.insert(line);
174                }
175            }
176            Event::End(TagEnd::CodeBlock) => {
177                if in_code_block {
178                    code_ranges.push(code_block_start..range.end);
179                    in_code_block = false;
180                }
181            }
182            Event::Code(_) => {
183                // Inline code span
184                code_ranges.push(range.clone());
185                let start_line = line_from_offset(range.start, line_offsets);
186                let end_line = line_from_offset(range.end, line_offsets);
187                for line in start_line..=end_line {
188                    code_lines.insert(line);
189                }
190            }
191            _ => {
192                if in_code_block {
193                    let start_line = line_from_offset(range.start, line_offsets);
194                    let end_line = line_from_offset(range.end, line_offsets);
195                    for line in start_line..=end_line {
196                        code_block_lines.insert(line);
197                        code_lines.insert(line);
198                    }
199                }
200            }
201        }
202    }
203
204    (code_block_lines, code_lines, code_ranges)
205}
206
#[cfg(test)]
mod tests {
    use super::*;

    /// Construction keeps the content and counts its lines.
    #[test]
    fn test_basic_parsing() {
        let source = "# Heading\n\nSome **bold** text.";
        let md = MarkdownParser::new(source);

        assert_eq!(md.content(), source);
        assert_eq!(md.line_count(), 3);
    }

    /// `get_line` is 1-indexed and rejects out-of-range numbers.
    #[test]
    fn test_get_line() {
        let md = MarkdownParser::new("Line 1\nLine 2\nLine 3");

        let expectations = [
            (1, Some("Line 1")),
            (2, Some("Line 2")),
            (3, Some("Line 3")),
            (0, None),
            (4, None),
        ];
        for &(line_num, expected) in &expectations {
            assert_eq!(md.get_line(line_num), expected);
        }
    }

    /// Byte offsets map to the line that contains them.
    #[test]
    fn test_offset_to_line() {
        let md = MarkdownParser::new("Line 1\nLine 2\nLine 3");

        for &(offset, line) in &[(0, 1), (3, 1), (7, 2), (14, 3)] {
            assert_eq!(md.offset_to_line(offset), line);
        }
    }

    /// Byte offsets map to 1-indexed (line, column) pairs.
    #[test]
    fn test_offset_to_position() {
        let md = MarkdownParser::new("Line 1\nLine 2\nLine 3");

        for &(offset, position) in &[(0, (1, 1)), (3, (1, 4)), (7, (2, 1))] {
            assert_eq!(md.offset_to_position(offset), position);
        }
    }

    /// The first event of a heading-only document is the heading start.
    #[test]
    fn test_parse_events() {
        let md = MarkdownParser::new("# Heading");

        let stream: Vec<_> = md.parse().collect();
        assert!(!stream.is_empty());
        assert!(md.is_heading(&stream[0]));
    }

    /// The offset-carrying event stream yields events.
    #[test]
    fn test_parse_with_offsets() {
        let md = MarkdownParser::new("# Heading\n\nParagraph");

        let stream: Vec<_> = md.parse_with_offsets().collect();
        assert!(!stream.is_empty());
    }

    /// Each event classifier recognises its construct somewhere in the stream.
    #[test]
    fn test_event_type_checks() {
        let md = MarkdownParser::new("# Heading\n\n```rust\ncode\n```\n\n- item");

        let stream: Vec<_> = md.parse().collect();

        assert!(stream.iter().any(|ev| md.is_heading(ev)));
        assert!(stream.iter().any(|ev| md.is_code_block(ev)));
        assert!(stream.iter().any(|ev| md.is_list(ev)));
    }

    /// A fenced block marks its fences and body, and nothing around it.
    #[test]
    fn test_code_line_numbers_fenced() {
        let source = "Normal text\n\n```sql\nSELECT * FROM table_name\nWHERE user_id = 123\n```\n\nMore text";
        let md = MarkdownParser::new(source);
        let marked = md.get_code_line_numbers();

        // Lines 3-6 should be marked as code (the ``` markers and content)
        let inside = [
            (3, "Line 3 (opening ```) should be code"),
            (4, "Line 4 (code content) should be code"),
            (5, "Line 5 (code content) should be code"),
            (6, "Line 6 (closing ```) should be code"),
        ];
        for &(line, why) in &inside {
            assert!(marked.contains(&line), "{}", why);
        }

        // Other lines should not be marked
        let outside = [
            (1, "Line 1 should not be code"),
            (2, "Line 2 should not be code"),
            (8, "Line 8 should not be code"),
        ];
        for &(line, why) in &outside {
            assert!(!marked.contains(&line), "{}", why);
        }
    }

    /// A line holding an inline code span counts as a code line.
    #[test]
    fn test_code_line_numbers_inline() {
        let md = MarkdownParser::new("This is `inline_code_with_underscores` in text");
        let marked = md.get_code_line_numbers();

        // Line 1 should be marked because it contains inline code
        assert!(marked.contains(&1), "Line with inline code should be marked");
    }

    /// Inline spans and fenced blocks are both tracked in one document.
    #[test]
    fn test_code_line_numbers_mixed() {
        let source =
            "Normal text\n\nText with `inline_code` here\n\n```\nCode block\n```\n\nFinal text";
        let md = MarkdownParser::new(source);
        let marked = md.get_code_line_numbers();

        // Line 3 has inline code; lines 5-7 are in the code block
        let inside = [
            (3, "Line with inline code should be marked"),
            (5, "Code block line should be marked"),
            (6, "Code block line should be marked"),
            (7, "Code block line should be marked"),
        ];
        for &(line, why) in &inside {
            assert!(marked.contains(&line), "{}", why);
        }

        // Lines 1, 2, 9 are normal text
        let outside = [
            (1, "Normal text line should not be marked"),
            (2, "Empty line should not be marked"),
            (9, "Normal text line should not be marked"),
        ];
        for &(line, why) in &outside {
            assert!(!marked.contains(&line), "{}", why);
        }
    }

    /// The line-start table for LF, CRLF, single-line, empty and
    /// newline-terminated input.
    #[test]
    fn test_build_line_offsets() {
        let cases: [(&str, Vec<usize>); 5] = [
            // LF line endings
            ("abc\ndef\nghi", vec![0, 4, 8]),
            // CRLF line endings
            ("abc\r\ndef\r\nghi", vec![0, 5, 10]),
            // Single line (no newline)
            ("abc", vec![0]),
            // Empty content
            ("", vec![0]),
            // Trailing newline does not add a spurious extra entry
            ("abc\n", vec![0]),
        ];
        for (input, expected) in &cases {
            assert_eq!(&build_line_offsets(input), expected);
        }
    }

    /// Positions stay correct when lines end in CRLF.
    #[test]
    fn test_offset_to_position_crlf() {
        // CRLF: "abc\r\ndef" — 'a'=0,'b'=1,'c'=2,'\r'=3,'\n'=4,'d'=5,'e'=6,'f'=7
        let md = MarkdownParser::new("abc\r\ndef");

        for &(offset, position) in &[(0, (1, 1)), (2, (1, 3)), (5, (2, 1)), (7, (2, 3))] {
            assert_eq!(md.offset_to_position(offset), position);
        }
    }
}