1use regex::Regex;
2use std::collections::HashSet;
3use std::sync::LazyLock;
4
/// Kind of Markdown construct described by a [`MarkdownElement`].
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the kind can be
/// used directly as a key in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementType {
    /// Fenced code block.
    CodeBlock,
    /// Inline code span.
    CodeSpan,
    /// ATX or setext heading.
    Heading,
    /// Single list item line (ordered or unordered).
    List,
    /// Front matter section at the top of the document.
    FrontMatter,
}
14
/// Whether a detected element is well-formed Markdown.
///
/// `Eq` and `Hash` are derived in addition to `PartialEq` so the value can be
/// used directly as a key in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementQuality {
    /// The element follows the expected syntax.
    Valid,
    /// The element was recognised but its syntax is defective
    /// (for example an unclosed fence or a missing space after a marker).
    Malformed,
}
21
22#[derive(Debug, Clone)]
24pub struct MarkdownElement {
25 pub element_type: ElementType,
26 pub start_line: usize,
27 pub end_line: usize,
28 pub text: String,
29 pub metadata: Option<String>, pub quality: ElementQuality, }
32
33static CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
35
36static ATX_HEADING: LazyLock<Regex> =
38 LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s*)([^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap());
39static ATX_HEADING_NO_SPACE: LazyLock<Regex> =
40 LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})([^#\s][^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap());
41static SETEXT_HEADING_1: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(=+)(\s*)$").unwrap());
42static SETEXT_HEADING_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(-+)(\s*)$").unwrap());
43
44static UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
46static ORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(\d+\.)(\s+)").unwrap());
47
48static MALFORMED_UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])([^\s])").unwrap());
50static MALFORMED_ORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(\d+\.)([^\s])").unwrap());
51static MALFORMED_ORDERED_LIST_WRONG_MARKER: LazyLock<Regex> =
52 LazyLock::new(|| Regex::new(r"^(\s*)(\d+[)\]])(\s*)").unwrap());
53
54static EMPTY_UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])\s*$").unwrap());
56
57static FRONT_MATTER_DELIMITER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\s*$").unwrap());
59
/// Namespace type for the Markdown detection helpers; it holds no state and
/// all of its functions are associated functions.
pub struct MarkdownElements;
62
63impl MarkdownElements {
64 pub fn detect_code_blocks(content: &str) -> Vec<MarkdownElement> {
66 let mut blocks = Vec::new();
67 let mut in_code_block = false;
68 let mut block_start = 0;
69 let mut language = String::new();
70 let mut fence_type = String::new();
71
72 for (i, line) in content.lines().enumerate() {
73 if let Some(captures) = CODE_BLOCK_START.captures(line) {
74 if !in_code_block {
75 block_start = i;
76 in_code_block = true;
77 fence_type = captures.get(2).unwrap().as_str().to_string();
78 language = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
79 } else if line.trim().starts_with(&fence_type) {
80 blocks.push(MarkdownElement {
82 element_type: ElementType::CodeBlock,
83 start_line: block_start,
84 end_line: i,
85 text: content
86 .lines()
87 .skip(block_start)
88 .take(i - block_start + 1)
89 .collect::<Vec<&str>>()
90 .join("\n"),
91 metadata: Some(language.clone()),
92 quality: ElementQuality::Valid,
93 });
94 in_code_block = false;
95 language = String::new();
96 }
97 }
98 }
99
100 if in_code_block {
102 let line_count = content.lines().count();
103 blocks.push(MarkdownElement {
104 element_type: ElementType::CodeBlock,
105 start_line: block_start,
106 end_line: line_count - 1,
107 text: content.lines().skip(block_start).collect::<Vec<&str>>().join("\n"),
108 metadata: Some(language),
109 quality: ElementQuality::Malformed, });
111 }
112
113 blocks
114 }
115
116 pub fn detect_code_block_lines(content: &str) -> HashSet<usize> {
118 let code_blocks = Self::detect_code_blocks(content);
119 let mut lines = HashSet::new();
120
121 for block in code_blocks {
122 for i in block.start_line..=block.end_line {
123 lines.insert(i);
124 }
125 }
126
127 lines
128 }
129
130 pub fn is_in_code_span(line: &str, position: usize) -> bool {
132 let mut in_code_span = false;
133 let mut code_start = 0;
134
135 for (pos, c) in line.char_indices() {
136 if c == '`' {
137 if !in_code_span {
138 in_code_span = true;
139 code_start = pos;
140 } else {
141 if position >= code_start && position <= pos {
143 return true;
144 }
145 in_code_span = false;
146 }
147 }
148
149 if pos > position && !in_code_span {
151 return false;
152 }
153 }
154
155 in_code_span && position >= code_start
157 }
158
159 pub fn detect_headings(content: &str) -> Vec<MarkdownElement> {
161 let mut headings = Vec::new();
162 let lines: Vec<&str> = content.lines().collect();
163 let code_block_lines = Self::detect_code_block_lines(content);
164
165 let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
167 (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
168 } else {
169 HashSet::new()
170 };
171
172 for (i, line) in lines.iter().enumerate() {
174 if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
176 continue;
177 }
178
179 if let Some(captures) = ATX_HEADING.captures(line) {
181 let hashes = captures.get(2).unwrap().as_str();
182 let level = hashes.len().to_string();
183 let text = captures.get(4).map_or("", |m| m.as_str()).trim().to_string();
184 let spaces_after_hash = captures.get(3).map_or("", |m| m.as_str()).len();
185
186 let quality = if spaces_after_hash > 0 || (text.is_empty() && (hashes.len() == 1 || hashes.len() == 6))
189 {
190 ElementQuality::Valid
191 } else {
192 ElementQuality::Malformed
193 };
194
195 headings.push(MarkdownElement {
196 element_type: ElementType::Heading,
197 start_line: i,
198 end_line: i,
199 text,
200 metadata: Some(level),
201 quality,
202 });
203
204 continue;
205 }
206
207 if let Some(captures) = ATX_HEADING_NO_SPACE.captures(line) {
209 let hashes = captures.get(2).unwrap().as_str();
210 let level = hashes.len().to_string();
211 let text = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
212
213 headings.push(MarkdownElement {
214 element_type: ElementType::Heading,
215 start_line: i,
216 end_line: i,
217 text,
218 metadata: Some(level),
219 quality: ElementQuality::Malformed, });
221
222 continue;
223 }
224
225 if i + 1 < lines.len() {
227 let next_line = lines[i + 1];
228
229 if SETEXT_HEADING_1.is_match(next_line) {
230 headings.push(MarkdownElement {
231 element_type: ElementType::Heading,
232 start_line: i,
233 end_line: i + 1,
234 text: line.trim().to_string(),
235 metadata: Some("1".to_string()), quality: ElementQuality::Valid,
237 });
238
239 continue;
240 }
241
242 if SETEXT_HEADING_2.is_match(next_line) {
243 headings.push(MarkdownElement {
244 element_type: ElementType::Heading,
245 start_line: i,
246 end_line: i + 1,
247 text: line.trim().to_string(),
248 metadata: Some("2".to_string()), quality: ElementQuality::Valid,
250 });
251
252 continue;
253 }
254 }
255 }
256
257 headings
258 }
259
260 pub fn get_heading_level(element: &MarkdownElement) -> Option<u32> {
262 if element.element_type != ElementType::Heading {
263 return None;
264 }
265
266 element.metadata.as_ref().and_then(|level| level.parse::<u32>().ok())
267 }
268
269 pub fn detect_lists(content: &str) -> Vec<MarkdownElement> {
271 let mut lists = Vec::new();
272 let lines: Vec<&str> = content.lines().collect();
273 let code_block_lines = Self::detect_code_block_lines(content);
274
275 let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
277 (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
278 } else {
279 HashSet::new()
280 };
281
282 static HORIZONTAL_RULE: LazyLock<Regex> =
284 LazyLock::new(|| Regex::new(r"^(\s*)(-{3,}|\*{3,}|_{3,})(\s*)$").unwrap());
285
286 for (i, line) in lines.iter().enumerate() {
287 if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
289 continue;
290 }
291
292 if HORIZONTAL_RULE.is_match(line) {
294 continue;
295 }
296
297 if let Some(_captures) = UNORDERED_LIST.captures(line) {
299 let marker = if line.trim_start().starts_with('*') {
300 "asterisk"
301 } else if line.trim_start().starts_with('+') {
302 "plus"
303 } else {
304 "minus"
305 };
306
307 lists.push(MarkdownElement {
308 element_type: ElementType::List,
309 start_line: i,
310 end_line: i,
311 text: line.trim().to_string(),
312 metadata: Some(marker.to_string()),
313 quality: ElementQuality::Valid,
314 });
315
316 continue;
317 }
318
319 if let Some(_captures) = EMPTY_UNORDERED_LIST.captures(line) {
321 if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
323 continue;
324 }
325
326 let marker = if line.trim_start().starts_with('*') {
327 "asterisk"
328 } else if line.trim_start().starts_with('+') {
329 "plus"
330 } else {
331 "minus"
332 };
333
334 lists.push(MarkdownElement {
335 element_type: ElementType::List,
336 start_line: i,
337 end_line: i,
338 text: String::new(), metadata: Some(marker.to_string()),
340 quality: ElementQuality::Valid,
341 });
342
343 continue;
344 }
345
346 if let Some(_captures) = MALFORMED_UNORDERED_LIST.captures(line) {
348 if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
350 continue;
351 }
352
353 let marker = if line.trim_start().starts_with('*') {
354 "asterisk:no_space"
355 } else if line.trim_start().starts_with('+') {
356 "plus:no_space"
357 } else {
358 "minus:no_space"
359 };
360
361 lists.push(MarkdownElement {
362 element_type: ElementType::List,
363 start_line: i,
364 end_line: i,
365 text: line.trim().to_string(),
366 metadata: Some(marker.to_string()),
367 quality: ElementQuality::Malformed,
368 });
369
370 continue;
371 }
372
373 if let Some(_captures) = ORDERED_LIST.captures(line) {
375 lists.push(MarkdownElement {
376 element_type: ElementType::List,
377 start_line: i,
378 end_line: i,
379 text: line.trim().to_string(),
380 metadata: Some("ordered".to_string()),
381 quality: ElementQuality::Valid,
382 });
383
384 continue;
385 }
386
387 if let Some(_captures) = MALFORMED_ORDERED_LIST.captures(line) {
389 lists.push(MarkdownElement {
390 element_type: ElementType::List,
391 start_line: i,
392 end_line: i,
393 text: line.trim().to_string(),
394 metadata: Some("ordered:no_space".to_string()),
395 quality: ElementQuality::Malformed,
396 });
397
398 continue;
399 }
400
401 if let Some(_captures) = MALFORMED_ORDERED_LIST_WRONG_MARKER.captures(line) {
403 lists.push(MarkdownElement {
404 element_type: ElementType::List,
405 start_line: i,
406 end_line: i,
407 text: line.trim().to_string(),
408 metadata: Some("ordered:wrong_marker".to_string()),
409 quality: ElementQuality::Malformed,
410 });
411 }
412 }
413
414 lists
415 }
416
417 pub fn detect_front_matter(content: &str) -> Option<MarkdownElement> {
419 let lines: Vec<&str> = content.lines().collect();
420
421 if lines.is_empty() || !FRONT_MATTER_DELIMITER.is_match(lines[0]) {
422 return None;
423 }
424
425 for (i, line) in lines.iter().enumerate().skip(1) {
427 if FRONT_MATTER_DELIMITER.is_match(line) {
428 return Some(MarkdownElement {
429 element_type: ElementType::FrontMatter,
430 start_line: 0,
431 end_line: i,
432 text: lines[0..=i].join("\n"),
433 metadata: None,
434 quality: ElementQuality::Valid,
435 });
436 }
437 }
438
439 None
441 }
442
443 pub fn heading_to_fragment(text: &str) -> String {
445 let text_no_html = regex::Regex::new(r"<[^>]*>").unwrap().replace_all(text, "");
447
448 let text_lower = text_no_html.trim().to_lowercase();
450
451 let text_with_hyphens = text_lower
453 .chars()
454 .map(|c| if c.is_alphanumeric() { c } else { '-' })
455 .collect::<String>();
456
457 let text_clean = text_with_hyphens
459 .split('-')
460 .filter(|s| !s.is_empty())
461 .collect::<Vec<_>>()
462 .join("-");
463
464 text_clean.trim_matches('-').to_string()
466 }
467
468 pub fn is_line_in_code_block(content: &str, line_number: usize) -> bool {
470 let code_block_lines = Self::detect_code_block_lines(content);
471 code_block_lines.contains(&line_number)
472 }
473
474 pub fn get_element_line_indices(element: &MarkdownElement) -> Vec<usize> {
476 (element.start_line..=element.end_line).collect()
477 }
478}
479
#[cfg(test)]
mod tests {
    use super::*;

    /// A closed fence yields one block spanning both fence lines.
    #[test]
    fn test_detect_code_blocks() {
        let found = MarkdownElements::detect_code_blocks("# Heading\n```js\nlet x = 1;\n```\nText");

        assert_eq!(found.len(), 1);
        let block = &found[0];
        assert_eq!(block.element_type, ElementType::CodeBlock);
        assert_eq!((block.start_line, block.end_line), (1, 3));
        assert_eq!(block.metadata.as_deref(), Some("js"));
    }

    /// Offsets before, inside and after a code span.
    #[test]
    fn test_is_in_code_span() {
        let line = "Text with `code` and more";

        for (position, expected) in [(0, false), (11, true), (20, false)] {
            assert_eq!(
                MarkdownElements::is_in_code_span(line, position),
                expected,
                "position {position}"
            );
        }
    }

    /// Two ATX headings followed by one setext heading.
    #[test]
    fn test_detect_headings() {
        let headings =
            MarkdownElements::detect_headings("# Heading 1\n## Heading 2\nText\nHeading 3\n===");

        let levels: Vec<Option<u32>> = headings
            .iter()
            .map(MarkdownElements::get_heading_level)
            .collect();
        assert_eq!(levels, vec![Some(1), Some(2), Some(1)]);
    }

    /// Each bullet style and an ordered item report their marker kind.
    #[test]
    fn test_detect_lists() {
        let lists = MarkdownElements::detect_lists("- Item 1\n* Item 2\n+ Item 3\n1. Item 4");

        assert_eq!(lists.len(), 4);
        let markers: Vec<Option<&str>> = lists.iter().map(|item| item.metadata.as_deref()).collect();
        assert_eq!(
            markers,
            vec![Some("minus"), Some("asterisk"), Some("plus"), Some("ordered")]
        );
    }

    /// Front matter ends on the second delimiter line.
    #[test]
    fn test_detect_front_matter() {
        let front_matter =
            MarkdownElements::detect_front_matter("---\ntitle: Test\n---\n# Content");

        assert_eq!(front_matter.map(|fm| fm.end_line), Some(2));
    }

    /// Punctuation and whitespace collapse into single hyphens.
    #[test]
    fn test_heading_to_fragment() {
        let cases = [
            ("Hello World!", "hello-world"),
            ("Complex: (Header) 123", "complex-header-123"),
        ];

        for (heading, fragment) in cases {
            assert_eq!(MarkdownElements::heading_to_fragment(heading), fragment);
        }
    }

    /// Fence lines and the lines between them are inside the block.
    #[test]
    fn test_is_line_in_code_block() {
        let content = "Text\n```\nCode\n```\nMore text";

        for (line_number, expected) in [false, true, true, true, false].into_iter().enumerate() {
            assert_eq!(
                MarkdownElements::is_line_in_code_block(content, line_number),
                expected,
                "line {line_number}"
            );
        }
    }
}