1use lazy_static::lazy_static;
2use regex::Regex;
3use std::collections::HashSet;
4
/// The kind of Markdown construct a [`MarkdownElement`] describes.
///
/// `Eq` and `Hash` are derived so the kind can be used as a `HashSet` /
/// `HashMap` key (for example to bucket elements by type).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementType {
    /// A fenced code block.
    CodeBlock,
    /// An inline code span.
    CodeSpan,
    /// An ATX (`#`) or setext (underlined) heading.
    Heading,
    /// A list item.
    List,
    /// A front matter section.
    FrontMatter,
}
14
/// Whether a detected element is well formed.
///
/// `Eq` and `Hash` are derived so the value can be used as a `HashSet` /
/// `HashMap` key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementQuality {
    /// The element follows the expected syntax.
    Valid,
    /// The element was recognised but its syntax is defective.
    Malformed,
}
21
22#[derive(Debug, Clone)]
24pub struct MarkdownElement {
25 pub element_type: ElementType,
26 pub start_line: usize,
27 pub end_line: usize,
28 pub text: String,
29 pub metadata: Option<String>, pub quality: ElementQuality, }
32
33lazy_static! {
34 static ref CODE_BLOCK_START: Regex = Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap();
36 static ref CODE_BLOCK_END: Regex = Regex::new(r"^(\s*)(```|~~~)\s*$").unwrap();
37 static ref CODE_SPAN_PATTERN: Regex = Regex::new(r"`+").unwrap();
38
39 static ref ATX_HEADING: Regex = Regex::new(r"^(\s*)(#{1,6})(\s*)([^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap();
41 static ref ATX_HEADING_NO_SPACE: Regex = Regex::new(r"^(\s*)(#{1,6})([^#\s][^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap();
42 static ref SETEXT_HEADING_1: Regex = Regex::new(r"^(\s*)(=+)(\s*)$").unwrap();
43 static ref SETEXT_HEADING_2: Regex = Regex::new(r"^(\s*)(-+)(\s*)$").unwrap();
44
45 static ref UNORDERED_LIST: Regex = Regex::new(r"^(\s*)([*+-])(\s+)").unwrap();
47 static ref ORDERED_LIST: Regex = Regex::new(r"^(\s*)(\d+\.)(\s+)").unwrap();
48
49 static ref MALFORMED_UNORDERED_LIST: Regex = Regex::new(r"^(\s*)([*+-])([^\s])").unwrap();
51 static ref MALFORMED_ORDERED_LIST: Regex = Regex::new(r"^(\s*)(\d+\.)([^\s])").unwrap();
52 static ref MALFORMED_ORDERED_LIST_WRONG_MARKER: Regex = Regex::new(r"^(\s*)(\d+[)\]])(\s*)").unwrap();
53
54 static ref EMPTY_UNORDERED_LIST: Regex = Regex::new(r"^(\s*)([*+-])\s*$").unwrap();
56
57 static ref FRONT_MATTER_DELIMITER: Regex = Regex::new(r"^---\s*$").unwrap();
59}
60
/// Stateless namespace for the Markdown element detection functions.
///
/// The type carries no data; the common traits are derived so it behaves like
/// any other plain value (printable, copyable, default-constructible).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct MarkdownElements;
63
64impl MarkdownElements {
65 pub fn detect_code_blocks(content: &str) -> Vec<MarkdownElement> {
67 let mut blocks = Vec::new();
68 let mut in_code_block = false;
69 let mut block_start = 0;
70 let mut language = String::new();
71 let mut fence_type = String::new();
72
73 for (i, line) in content.lines().enumerate() {
74 if let Some(captures) = CODE_BLOCK_START.captures(line) {
75 if !in_code_block {
76 block_start = i;
77 in_code_block = true;
78 fence_type = captures.get(2).unwrap().as_str().to_string();
79 language = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
80 } else if line.trim().starts_with(&fence_type) {
81 blocks.push(MarkdownElement {
83 element_type: ElementType::CodeBlock,
84 start_line: block_start,
85 end_line: i,
86 text: content
87 .lines()
88 .skip(block_start)
89 .take(i - block_start + 1)
90 .collect::<Vec<&str>>()
91 .join("\n"),
92 metadata: Some(language.clone()),
93 quality: ElementQuality::Valid,
94 });
95 in_code_block = false;
96 language = String::new();
97 }
98 }
99 }
100
101 if in_code_block {
103 let line_count = content.lines().count();
104 blocks.push(MarkdownElement {
105 element_type: ElementType::CodeBlock,
106 start_line: block_start,
107 end_line: line_count - 1,
108 text: content.lines().skip(block_start).collect::<Vec<&str>>().join("\n"),
109 metadata: Some(language),
110 quality: ElementQuality::Malformed, });
112 }
113
114 blocks
115 }
116
117 pub fn detect_code_block_lines(content: &str) -> HashSet<usize> {
119 let code_blocks = Self::detect_code_blocks(content);
120 let mut lines = HashSet::new();
121
122 for block in code_blocks {
123 for i in block.start_line..=block.end_line {
124 lines.insert(i);
125 }
126 }
127
128 lines
129 }
130
131 pub fn is_in_code_span(line: &str, position: usize) -> bool {
133 let mut in_code_span = false;
134 let mut code_start = 0;
135
136 for (pos, c) in line.char_indices() {
137 if c == '`' {
138 if !in_code_span {
139 in_code_span = true;
140 code_start = pos;
141 } else {
142 if position >= code_start && position <= pos {
144 return true;
145 }
146 in_code_span = false;
147 }
148 }
149
150 if pos > position && !in_code_span {
152 return false;
153 }
154 }
155
156 in_code_span && position >= code_start
158 }
159
160 pub fn detect_headings(content: &str) -> Vec<MarkdownElement> {
162 let mut headings = Vec::new();
163 let lines: Vec<&str> = content.lines().collect();
164 let code_block_lines = Self::detect_code_block_lines(content);
165
166 let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
168 (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
169 } else {
170 HashSet::new()
171 };
172
173 for (i, line) in lines.iter().enumerate() {
175 if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
177 continue;
178 }
179
180 if let Some(captures) = ATX_HEADING.captures(line) {
182 let hashes = captures.get(2).unwrap().as_str();
183 let level = hashes.len().to_string();
184 let text = captures.get(4).map_or("", |m| m.as_str()).trim().to_string();
185 let spaces_after_hash = captures.get(3).map_or("", |m| m.as_str()).len();
186
187 let quality = if spaces_after_hash > 0 || (text.is_empty() && (hashes.len() == 1 || hashes.len() == 6))
190 {
191 ElementQuality::Valid
192 } else {
193 ElementQuality::Malformed
194 };
195
196 headings.push(MarkdownElement {
197 element_type: ElementType::Heading,
198 start_line: i,
199 end_line: i,
200 text,
201 metadata: Some(level),
202 quality,
203 });
204
205 continue;
206 }
207
208 if let Some(captures) = ATX_HEADING_NO_SPACE.captures(line) {
210 let hashes = captures.get(2).unwrap().as_str();
211 let level = hashes.len().to_string();
212 let text = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
213
214 headings.push(MarkdownElement {
215 element_type: ElementType::Heading,
216 start_line: i,
217 end_line: i,
218 text,
219 metadata: Some(level),
220 quality: ElementQuality::Malformed, });
222
223 continue;
224 }
225
226 if i + 1 < lines.len() {
228 let next_line = lines[i + 1];
229
230 if SETEXT_HEADING_1.is_match(next_line) {
231 headings.push(MarkdownElement {
232 element_type: ElementType::Heading,
233 start_line: i,
234 end_line: i + 1,
235 text: line.trim().to_string(),
236 metadata: Some("1".to_string()), quality: ElementQuality::Valid,
238 });
239
240 continue;
241 }
242
243 if SETEXT_HEADING_2.is_match(next_line) {
244 headings.push(MarkdownElement {
245 element_type: ElementType::Heading,
246 start_line: i,
247 end_line: i + 1,
248 text: line.trim().to_string(),
249 metadata: Some("2".to_string()), quality: ElementQuality::Valid,
251 });
252
253 continue;
254 }
255 }
256 }
257
258 headings
259 }
260
261 pub fn get_heading_level(element: &MarkdownElement) -> Option<u32> {
263 if element.element_type != ElementType::Heading {
264 return None;
265 }
266
267 element.metadata.as_ref().and_then(|level| level.parse::<u32>().ok())
268 }
269
270 pub fn detect_lists(content: &str) -> Vec<MarkdownElement> {
272 let mut lists = Vec::new();
273 let lines: Vec<&str> = content.lines().collect();
274 let code_block_lines = Self::detect_code_block_lines(content);
275
276 let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
278 (frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
279 } else {
280 HashSet::new()
281 };
282
283 lazy_static! {
285 static ref HORIZONTAL_RULE: Regex = Regex::new(r"^(\s*)(-{3,}|\*{3,}|_{3,})(\s*)$").unwrap();
286 }
287
288 for (i, line) in lines.iter().enumerate() {
289 if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
291 continue;
292 }
293
294 if HORIZONTAL_RULE.is_match(line) {
296 continue;
297 }
298
299 if let Some(_captures) = UNORDERED_LIST.captures(line) {
301 let marker = if line.trim_start().starts_with('*') {
302 "asterisk"
303 } else if line.trim_start().starts_with('+') {
304 "plus"
305 } else {
306 "minus"
307 };
308
309 lists.push(MarkdownElement {
310 element_type: ElementType::List,
311 start_line: i,
312 end_line: i,
313 text: line.trim().to_string(),
314 metadata: Some(marker.to_string()),
315 quality: ElementQuality::Valid,
316 });
317
318 continue;
319 }
320
321 if let Some(_captures) = EMPTY_UNORDERED_LIST.captures(line) {
323 if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
325 continue;
326 }
327
328 let marker = if line.trim_start().starts_with('*') {
329 "asterisk"
330 } else if line.trim_start().starts_with('+') {
331 "plus"
332 } else {
333 "minus"
334 };
335
336 lists.push(MarkdownElement {
337 element_type: ElementType::List,
338 start_line: i,
339 end_line: i,
340 text: String::new(), metadata: Some(marker.to_string()),
342 quality: ElementQuality::Valid,
343 });
344
345 continue;
346 }
347
348 if let Some(_captures) = MALFORMED_UNORDERED_LIST.captures(line) {
350 if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
352 continue;
353 }
354
355 let marker = if line.trim_start().starts_with('*') {
356 "asterisk:no_space"
357 } else if line.trim_start().starts_with('+') {
358 "plus:no_space"
359 } else {
360 "minus:no_space"
361 };
362
363 lists.push(MarkdownElement {
364 element_type: ElementType::List,
365 start_line: i,
366 end_line: i,
367 text: line.trim().to_string(),
368 metadata: Some(marker.to_string()),
369 quality: ElementQuality::Malformed,
370 });
371
372 continue;
373 }
374
375 if let Some(_captures) = ORDERED_LIST.captures(line) {
377 lists.push(MarkdownElement {
378 element_type: ElementType::List,
379 start_line: i,
380 end_line: i,
381 text: line.trim().to_string(),
382 metadata: Some("ordered".to_string()),
383 quality: ElementQuality::Valid,
384 });
385
386 continue;
387 }
388
389 if let Some(_captures) = MALFORMED_ORDERED_LIST.captures(line) {
391 lists.push(MarkdownElement {
392 element_type: ElementType::List,
393 start_line: i,
394 end_line: i,
395 text: line.trim().to_string(),
396 metadata: Some("ordered:no_space".to_string()),
397 quality: ElementQuality::Malformed,
398 });
399
400 continue;
401 }
402
403 if let Some(_captures) = MALFORMED_ORDERED_LIST_WRONG_MARKER.captures(line) {
405 lists.push(MarkdownElement {
406 element_type: ElementType::List,
407 start_line: i,
408 end_line: i,
409 text: line.trim().to_string(),
410 metadata: Some("ordered:wrong_marker".to_string()),
411 quality: ElementQuality::Malformed,
412 });
413 }
414 }
415
416 lists
417 }
418
419 pub fn detect_front_matter(content: &str) -> Option<MarkdownElement> {
421 let lines: Vec<&str> = content.lines().collect();
422
423 if lines.is_empty() || !FRONT_MATTER_DELIMITER.is_match(lines[0]) {
424 return None;
425 }
426
427 for (i, line) in lines.iter().enumerate().skip(1) {
429 if FRONT_MATTER_DELIMITER.is_match(line) {
430 return Some(MarkdownElement {
431 element_type: ElementType::FrontMatter,
432 start_line: 0,
433 end_line: i,
434 text: lines[0..=i].join("\n"),
435 metadata: None,
436 quality: ElementQuality::Valid,
437 });
438 }
439 }
440
441 None
443 }
444
445 pub fn heading_to_fragment(text: &str) -> String {
447 let text_no_html = regex::Regex::new(r"<[^>]*>").unwrap().replace_all(text, "");
449
450 let text_lower = text_no_html.trim().to_lowercase();
452
453 let text_with_hyphens = text_lower
455 .chars()
456 .map(|c| if c.is_alphanumeric() { c } else { '-' })
457 .collect::<String>();
458
459 let text_clean = text_with_hyphens
461 .split('-')
462 .filter(|s| !s.is_empty())
463 .collect::<Vec<_>>()
464 .join("-");
465
466 text_clean.trim_matches('-').to_string()
468 }
469
470 pub fn is_line_in_code_block(content: &str, line_number: usize) -> bool {
472 let code_block_lines = Self::detect_code_block_lines(content);
473 code_block_lines.contains(&line_number)
474 }
475
476 pub fn get_element_line_indices(element: &MarkdownElement) -> Vec<usize> {
478 (element.start_line..=element.end_line).collect()
479 }
480}
481
#[cfg(test)]
mod tests {
    use super::*;

    // A single closed fence is reported with its line span and info string.
    #[test]
    fn test_detect_code_blocks() {
        let blocks =
            MarkdownElements::detect_code_blocks("# Heading\n```js\nlet x = 1;\n```\nText");

        assert_eq!(blocks.len(), 1);
        let block = &blocks[0];
        assert_eq!(block.element_type, ElementType::CodeBlock);
        assert_eq!((block.start_line, block.end_line), (1, 3));
        assert_eq!(block.metadata.as_deref(), Some("js"));
    }

    // Offsets before, inside and after a closed span.
    #[test]
    fn test_is_in_code_span() {
        let line = "Text with `code` and more";
        let cases = [(0, false), (11, true), (20, false)];

        for &(position, expected) in cases.iter() {
            assert_eq!(
                MarkdownElements::is_in_code_span(line, position),
                expected,
                "position {}",
                position
            );
        }
    }

    // Two ATX headings plus one setext heading, in document order.
    #[test]
    fn test_detect_headings() {
        let headings =
            MarkdownElements::detect_headings("# Heading 1\n## Heading 2\nText\nHeading 3\n===");

        let levels: Vec<Option<u32>> = headings
            .iter()
            .map(MarkdownElements::get_heading_level)
            .collect();
        assert_eq!(levels, vec![Some(1), Some(2), Some(1)]);
    }

    // Each marker style maps to its own metadata name.
    #[test]
    fn test_detect_lists() {
        let lists = MarkdownElements::detect_lists("- Item 1\n* Item 2\n+ Item 3\n1. Item 4");

        let markers: Vec<Option<&str>> = lists
            .iter()
            .map(|item| item.metadata.as_deref())
            .collect();
        assert_eq!(
            markers,
            vec![Some("minus"), Some("asterisk"), Some("plus"), Some("ordered")]
        );
    }

    // Front matter ends on the second delimiter line.
    #[test]
    fn test_detect_front_matter() {
        let end_line = MarkdownElements::detect_front_matter("---\ntitle: Test\n---\n# Content")
            .map(|front_matter| front_matter.end_line);

        assert_eq!(end_line, Some(2));
    }

    // Punctuation and spacing collapse into single hyphens.
    #[test]
    fn test_heading_to_fragment() {
        let cases = [
            ("Hello World!", "hello-world"),
            ("Complex: (Header) 123", "complex-header-123"),
        ];

        for &(heading, fragment) in cases.iter() {
            assert_eq!(MarkdownElements::heading_to_fragment(heading), fragment);
        }
    }

    // Fence lines and the lines between them are inside; neighbours are not.
    #[test]
    fn test_is_line_in_code_block() {
        let content = "Text\n```\nCode\n```\nMore text";
        let expected = [false, true, true, true, false];

        for (line_number, &inside) in expected.iter().enumerate() {
            assert_eq!(
                MarkdownElements::is_line_in_code_block(content, line_number),
                inside,
                "line {}",
                line_number
            );
        }
    }
}