1use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
2use std::collections::HashSet;
3use std::ops::Range;
4
/// A borrowed Markdown document together with indexes derived from it.
///
/// The source text is borrowed for `'a`. The line table and the code-location
/// indexes are filled in once, when the value is constructed.
#[derive(Debug, Clone)]
pub struct MarkdownParser<'a> {
    /// The complete source text.
    content: &'a str,
    /// The source split into lines, in document order.
    lines: Vec<&'a str>,
    /// Byte offset at which each line starts; the first entry is 0.
    line_offsets: Vec<usize>,
    /// 1-based numbers of the lines that belong to code blocks.
    code_block_lines: HashSet<usize>,
    /// 1-based numbers of the lines that contain any code, i.e. code-block
    /// lines plus lines holding an inline code span.
    code_lines: HashSet<usize>,
    /// Byte ranges of every code block and inline code span.
    code_ranges: Vec<Range<usize>>,
}
18
19impl<'a> MarkdownParser<'a> {
20 pub fn new(content: &'a str) -> Self {
21 let lines: Vec<&'a str> = content.lines().collect();
22 let line_offsets = build_line_offsets(content);
23 let (code_block_lines, code_lines, code_ranges) = build_code_info(content, &line_offsets);
24 Self {
25 content,
26 lines,
27 line_offsets,
28 code_block_lines,
29 code_lines,
30 code_ranges,
31 }
32 }
33
34 pub fn content(&self) -> &'a str {
35 self.content
36 }
37
38 pub fn lines(&self) -> &[&'a str] {
39 &self.lines
40 }
41
42 pub fn line_count(&self) -> usize {
43 self.lines.len()
44 }
45
46 pub fn get_line(&self, line_num: usize) -> Option<&'a str> {
47 if line_num > 0 && line_num <= self.lines.len() {
48 Some(self.lines[line_num - 1])
49 } else {
50 None
51 }
52 }
53
54 pub fn parse(&self) -> impl Iterator<Item = Event<'a>> + 'a {
55 Parser::new_ext(self.content, mk_options())
56 }
57
58 pub fn parse_with_offsets(&self) -> impl Iterator<Item = (Event<'a>, Range<usize>)> {
59 Parser::new_ext(self.content, mk_options()).into_offset_iter()
60 }
61
62 pub fn offset_to_line(&self, offset: usize) -> usize {
63 self.offset_to_position(offset).0
64 }
65
66 pub fn offset_to_position(&self, offset: usize) -> (usize, usize) {
67 let i = self.line_offsets.partition_point(|&start| start <= offset);
70 if i == 0 {
71 return (1, 1);
72 }
73 let line_idx = i - 1; let column = offset - self.line_offsets[line_idx] + 1;
75 (line_idx + 1, column) }
77
78 pub fn get_code_line_numbers(&self) -> &HashSet<usize> {
81 &self.code_lines
82 }
83
84 pub fn get_code_block_line_numbers(&self) -> &HashSet<usize> {
87 &self.code_block_lines
88 }
89
90 pub fn get_code_ranges(&self) -> &[Range<usize>] {
93 &self.code_ranges
94 }
95
96 pub fn line_offset_to_absolute(&self, line_num: usize, byte_offset_in_line: usize) -> usize {
99 if line_num == 0 || line_num > self.line_offsets.len() {
100 return self.content.len();
101 }
102 self.line_offsets[line_num - 1] + byte_offset_in_line
103 }
104
105 pub fn is_heading(&self, event: &Event) -> bool {
106 matches!(event, Event::Start(Tag::Heading { .. }))
107 }
108
109 pub fn is_code_block(&self, event: &Event) -> bool {
110 matches!(event, Event::Start(Tag::CodeBlock(_)))
111 }
112
113 pub fn is_list(&self, event: &Event) -> bool {
114 matches!(event, Event::Start(Tag::List(_)))
115 }
116}
117
118fn mk_options() -> Options {
119 let mut options = Options::empty();
120 options.insert(Options::ENABLE_TABLES);
121 options.insert(Options::ENABLE_FOOTNOTES);
122 options.insert(Options::ENABLE_STRIKETHROUGH);
123 options.insert(Options::ENABLE_TASKLISTS);
124 options.insert(Options::ENABLE_HEADING_ATTRIBUTES);
125 options
126}
127
/// Returns the byte offset at which each line of `content` starts.
///
/// The first entry is always 0. Each `\n` that is followed by at least one
/// more byte contributes the offset just past it, so a trailing newline does
/// not open an extra, empty line. A `\r\n` terminator needs no special
/// handling because only the `\n` byte is inspected.
fn build_line_offsets(content: &str) -> Vec<usize> {
    let total_len = content.len();

    let later_line_starts = content
        .match_indices('\n')
        .map(|(newline_at, _)| newline_at + 1)
        // A newline that ends the text starts no further line.
        .filter(|&line_start| line_start < total_len);

    std::iter::once(0).chain(later_line_starts).collect()
}
143
/// Returns the 1-based number of the line that contains byte `offset`.
///
/// `line_offsets` holds the ascending start offset of every line. The result
/// is the count of lines starting at or before `offset`, and never less
/// than 1.
fn line_from_offset(offset: usize, line_offsets: &[usize]) -> usize {
    let lines_started = line_offsets.partition_point(|&line_start| line_start <= offset);
    if lines_started == 0 {
        1
    } else {
        lines_started
    }
}
150
151fn build_code_info(
154 content: &str,
155 line_offsets: &[usize],
156) -> (HashSet<usize>, HashSet<usize>, Vec<Range<usize>>) {
157 let mut code_block_lines: HashSet<usize> = HashSet::new();
158 let mut code_lines: HashSet<usize> = HashSet::new();
159 let mut code_ranges: Vec<Range<usize>> = Vec::new();
160
161 let mut in_code_block = false;
162 let mut code_block_start = 0usize;
163
164 for (event, range) in Parser::new_ext(content, mk_options()).into_offset_iter() {
165 match event {
166 Event::Start(Tag::CodeBlock(_)) => {
167 in_code_block = true;
168 code_block_start = range.start;
169 let start_line = line_from_offset(range.start, line_offsets);
170 let end_line = line_from_offset(range.end, line_offsets);
171 for line in start_line..=end_line {
172 code_block_lines.insert(line);
173 code_lines.insert(line);
174 }
175 }
176 Event::End(TagEnd::CodeBlock) => {
177 if in_code_block {
178 code_ranges.push(code_block_start..range.end);
179 in_code_block = false;
180 }
181 }
182 Event::Code(_) => {
183 code_ranges.push(range.clone());
185 let start_line = line_from_offset(range.start, line_offsets);
186 let end_line = line_from_offset(range.end, line_offsets);
187 for line in start_line..=end_line {
188 code_lines.insert(line);
189 }
190 }
191 _ => {
192 if in_code_block {
193 let start_line = line_from_offset(range.start, line_offsets);
194 let end_line = line_from_offset(range.end, line_offsets);
195 for line in start_line..=end_line {
196 code_block_lines.insert(line);
197 code_lines.insert(line);
198 }
199 }
200 }
201 }
202 }
203
204 (code_block_lines, code_lines, code_ranges)
205}
206
#[cfg(test)]
mod tests {
    use super::*;

    // Three-line sample shared by the line and offset lookups.
    const THREE_LINES: &str = "Line 1\nLine 2\nLine 3";

    #[test]
    fn test_basic_parsing() {
        let source = "# Heading\n\nSome **bold** text.";
        let parser = MarkdownParser::new(source);

        assert_eq!(parser.content(), source);
        assert_eq!(parser.line_count(), 3);
    }

    #[test]
    fn test_get_line() {
        let parser = MarkdownParser::new(THREE_LINES);

        let expectations = [
            (1, Some("Line 1")),
            (2, Some("Line 2")),
            (3, Some("Line 3")),
            (0, None),
            (4, None),
        ];
        for (line_num, expected) in expectations {
            assert_eq!(parser.get_line(line_num), expected);
        }
    }

    #[test]
    fn test_offset_to_line() {
        let parser = MarkdownParser::new(THREE_LINES);

        for (offset, expected_line) in [(0, 1), (3, 1), (7, 2), (14, 3)] {
            assert_eq!(parser.offset_to_line(offset), expected_line);
        }
    }

    #[test]
    fn test_offset_to_position() {
        let parser = MarkdownParser::new(THREE_LINES);

        for (offset, expected) in [(0, (1, 1)), (3, (1, 4)), (7, (2, 1))] {
            assert_eq!(parser.offset_to_position(offset), expected);
        }
    }

    #[test]
    fn test_parse_events() {
        let parser = MarkdownParser::new("# Heading");

        let events: Vec<_> = parser.parse().collect();
        assert!(!events.is_empty());
        assert!(matches!(events.first(), Some(first) if parser.is_heading(first)));
    }

    #[test]
    fn test_parse_with_offsets() {
        let parser = MarkdownParser::new("# Heading\n\nParagraph");

        // At least one event must be produced.
        assert!(parser.parse_with_offsets().next().is_some());
    }

    #[test]
    fn test_event_type_checks() {
        let parser = MarkdownParser::new("# Heading\n\n```rust\ncode\n```\n\n- item");
        let events: Vec<_> = parser.parse().collect();

        assert!(events.iter().any(|event| parser.is_heading(event)));
        assert!(events.iter().any(|event| parser.is_code_block(event)));
        assert!(events.iter().any(|event| parser.is_list(event)));
    }

    #[test]
    fn test_code_line_numbers_fenced() {
        let source =
            "Normal text\n\n```sql\nSELECT * FROM table_name\nWHERE user_id = 123\n```\n\nMore text";
        let parser = MarkdownParser::new(source);
        let code_lines = parser.get_code_line_numbers();

        let marked = [
            (3, "Line 3 (opening ```) should be code"),
            (4, "Line 4 (code content) should be code"),
            (5, "Line 5 (code content) should be code"),
            (6, "Line 6 (closing ```) should be code"),
        ];
        for (line, message) in marked {
            assert!(code_lines.contains(&line), "{}", message);
        }

        let unmarked = [
            (1, "Line 1 should not be code"),
            (2, "Line 2 should not be code"),
            (8, "Line 8 should not be code"),
        ];
        for (line, message) in unmarked {
            assert!(!code_lines.contains(&line), "{}", message);
        }
    }

    #[test]
    fn test_code_line_numbers_inline() {
        let parser = MarkdownParser::new("This is `inline_code_with_underscores` in text");

        assert!(
            parser.get_code_line_numbers().contains(&1),
            "Line with inline code should be marked"
        );
    }

    #[test]
    fn test_code_line_numbers_mixed() {
        let source =
            "Normal text\n\nText with `inline_code` here\n\n```\nCode block\n```\n\nFinal text";
        let parser = MarkdownParser::new(source);
        let code_lines = parser.get_code_line_numbers();

        let marked = [
            (3, "Line with inline code should be marked"),
            (5, "Code block line should be marked"),
            (6, "Code block line should be marked"),
            (7, "Code block line should be marked"),
        ];
        for (line, message) in marked {
            assert!(code_lines.contains(&line), "{}", message);
        }

        let unmarked = [
            (1, "Normal text line should not be marked"),
            (2, "Empty line should not be marked"),
            (9, "Normal text line should not be marked"),
        ];
        for (line, message) in unmarked {
            assert!(!code_lines.contains(&line), "{}", message);
        }
    }

    #[test]
    fn test_build_line_offsets() {
        let cases = [
            ("abc\ndef\nghi", vec![0, 4, 8]),
            ("abc\r\ndef\r\nghi", vec![0, 5, 10]),
            ("abc", vec![0]),
            ("", vec![0]),
            ("abc\n", vec![0]),
        ];
        for (input, expected) in cases {
            assert_eq!(build_line_offsets(input), expected);
        }
    }

    #[test]
    fn test_offset_to_position_crlf() {
        let parser = MarkdownParser::new("abc\r\ndef");

        for (offset, expected) in [(0, (1, 1)), (2, (1, 3)), (5, (2, 1)), (7, (2, 3))] {
            assert_eq!(parser.offset_to_position(offset), expected);
        }
    }
}