use regex::Regex;
use std::collections::HashSet;
use std::sync::LazyLock;
/// The kind of Markdown construct a [`MarkdownElement`] describes.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so values can be used
/// as keys in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementType {
    /// A fenced code block.
    CodeBlock,
    /// An inline code span.
    CodeSpan,
    /// An ATX (`#`) or setext (underlined) heading.
    Heading,
    /// A single list item line.
    List,
    /// A leading front matter block delimited by `---` lines.
    FrontMatter,
}
/// Whether a detected element follows the expected Markdown syntax.
///
/// Derives `Eq` and `Hash` in addition to `PartialEq` so values can be used
/// as keys in hash-based collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum ElementQuality {
    /// The element is well-formed.
    Valid,
    /// The element was recognised but is syntactically off, for example a
    /// fence that is never closed or a marker with no following space.
    Malformed,
}
/// One Markdown construct located in a document.
///
/// Line numbers are zero-based indices into `content.lines()` and the
/// `start_line..=end_line` range is inclusive.
#[derive(Debug, Clone)]
pub struct MarkdownElement {
/// What kind of construct this is.
pub element_type: ElementType,
/// Zero-based index of the first line the element occupies.
pub start_line: usize,
/// Zero-based index of the last line the element occupies (inclusive).
pub end_line: usize,
/// Element text: the full block for code blocks and front matter, the
/// trimmed heading text for headings, the trimmed line for list items.
pub text: String,
/// Kind-specific detail stored as a string: the fence info string for code
/// blocks, the level ("1".."6") for headings, the marker name for list items
/// (e.g. "minus", "ordered", "asterisk:no_space"); `None` for front matter.
// `quality` (declared on the same line below) records whether the element is
// well-formed (`Valid`) or recognised but syntactically off (`Malformed`).
pub metadata: Option<String>, pub quality: ElementQuality, }
/// Fence line: optional leading whitespace, then ``` or ~~~, then the rest of
/// the line (group 3, used as the info string).
static CODE_BLOCK_START: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(```|~~~)(.*)$").unwrap());
/// ATX heading: optional indent, 1-6 `#`, optional whitespace (group 3), text
/// containing no `#` (group 4), and an optional closing run of `#` preceded by
/// whitespace. Because the whitespace after the hashes is optional, this also
/// matches headings that are missing the space.
static ATX_HEADING: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})(\s*)([^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap());
/// ATX heading whose text (group 3) starts immediately after the hashes with a
/// character that is neither `#` nor whitespace.
static ATX_HEADING_NO_SPACE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(\s*)(#{1,6})([^#\s][^#\n]*?)(?:\s+(#{1,6}))?\s*$").unwrap());
/// Setext level-1 underline: a line made only of `=`, optionally surrounded by whitespace.
static SETEXT_HEADING_1: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(=+)(\s*)$").unwrap());
/// Setext level-2 underline: a line made only of `-`, optionally surrounded by whitespace.
static SETEXT_HEADING_2: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(-+)(\s*)$").unwrap());
/// Unordered list item: optional indent, one of `*`, `+`, `-`, then at least one whitespace character.
static UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])(\s+)").unwrap());
/// Ordered list item: optional indent, digits followed by `.`, then at least one whitespace character.
static ORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(\d+\.)(\s+)").unwrap());
/// Unordered marker immediately followed by a non-whitespace character (missing space).
static MALFORMED_UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])([^\s])").unwrap());
/// Ordered marker (`digits.`) immediately followed by a non-whitespace character (missing space).
static MALFORMED_ORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)(\d+\.)([^\s])").unwrap());
/// Digits followed by `)` or `]` instead of `.`.
static MALFORMED_ORDERED_LIST_WRONG_MARKER: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(\s*)(\d+[)\]])(\s*)").unwrap());
/// Unordered marker with nothing but optional whitespace after it.
static EMPTY_UNORDERED_LIST: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^(\s*)([*+-])\s*$").unwrap());
/// Front matter delimiter: `---` starting at column 0 with optional trailing whitespace.
static FRONT_MATTER_DELIMITER: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"^---\s*$").unwrap());
pub struct MarkdownElements;
impl MarkdownElements {
pub fn detect_code_blocks(content: &str) -> Vec<MarkdownElement> {
let mut blocks = Vec::new();
let mut in_code_block = false;
let mut block_start = 0;
let mut language = String::new();
let mut fence_type = String::new();
for (i, line) in content.lines().enumerate() {
if let Some(captures) = CODE_BLOCK_START.captures(line) {
if !in_code_block {
block_start = i;
in_code_block = true;
fence_type = captures.get(2).unwrap().as_str().to_string();
language = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
} else if line.trim().starts_with(&fence_type) {
blocks.push(MarkdownElement {
element_type: ElementType::CodeBlock,
start_line: block_start,
end_line: i,
text: content
.lines()
.skip(block_start)
.take(i - block_start + 1)
.collect::<Vec<&str>>()
.join("\n"),
metadata: Some(language.clone()),
quality: ElementQuality::Valid,
});
in_code_block = false;
language = String::new();
}
}
}
if in_code_block {
let line_count = content.lines().count();
blocks.push(MarkdownElement {
element_type: ElementType::CodeBlock,
start_line: block_start,
end_line: line_count - 1,
text: content.lines().skip(block_start).collect::<Vec<&str>>().join("\n"),
metadata: Some(language),
quality: ElementQuality::Malformed, });
}
blocks
}
pub fn detect_code_block_lines(content: &str) -> HashSet<usize> {
let code_blocks = Self::detect_code_blocks(content);
let mut lines = HashSet::new();
for block in code_blocks {
for i in block.start_line..=block.end_line {
lines.insert(i);
}
}
lines
}
pub fn is_in_code_span(line: &str, position: usize) -> bool {
let mut in_code_span = false;
let mut code_start = 0;
for (pos, c) in line.char_indices() {
if c == '`' {
if !in_code_span {
in_code_span = true;
code_start = pos;
} else {
if position >= code_start && position <= pos {
return true;
}
in_code_span = false;
}
}
if pos > position && !in_code_span {
return false;
}
}
in_code_span && position >= code_start
}
pub fn detect_headings(content: &str) -> Vec<MarkdownElement> {
let mut headings = Vec::new();
let lines: Vec<&str> = content.lines().collect();
let code_block_lines = Self::detect_code_block_lines(content);
let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
(frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
} else {
HashSet::new()
};
for (i, line) in lines.iter().enumerate() {
if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
continue;
}
if let Some(captures) = ATX_HEADING.captures(line) {
let hashes = captures.get(2).unwrap().as_str();
let level = hashes.len().to_string();
let text = captures.get(4).map_or("", |m| m.as_str()).trim().to_string();
let spaces_after_hash = captures.get(3).map_or("", |m| m.as_str()).len();
let quality = if spaces_after_hash > 0 || (text.is_empty() && (hashes.len() == 1 || hashes.len() == 6))
{
ElementQuality::Valid
} else {
ElementQuality::Malformed
};
headings.push(MarkdownElement {
element_type: ElementType::Heading,
start_line: i,
end_line: i,
text,
metadata: Some(level),
quality,
});
continue;
}
if let Some(captures) = ATX_HEADING_NO_SPACE.captures(line) {
let hashes = captures.get(2).unwrap().as_str();
let level = hashes.len().to_string();
let text = captures.get(3).map_or("", |m| m.as_str()).trim().to_string();
headings.push(MarkdownElement {
element_type: ElementType::Heading,
start_line: i,
end_line: i,
text,
metadata: Some(level),
quality: ElementQuality::Malformed, });
continue;
}
if i + 1 < lines.len() {
let next_line = lines[i + 1];
if SETEXT_HEADING_1.is_match(next_line) {
headings.push(MarkdownElement {
element_type: ElementType::Heading,
start_line: i,
end_line: i + 1,
text: line.trim().to_string(),
metadata: Some("1".to_string()), quality: ElementQuality::Valid,
});
continue;
}
if SETEXT_HEADING_2.is_match(next_line) {
headings.push(MarkdownElement {
element_type: ElementType::Heading,
start_line: i,
end_line: i + 1,
text: line.trim().to_string(),
metadata: Some("2".to_string()), quality: ElementQuality::Valid,
});
continue;
}
}
}
headings
}
pub fn get_heading_level(element: &MarkdownElement) -> Option<u32> {
if element.element_type != ElementType::Heading {
return None;
}
element.metadata.as_ref().and_then(|level| level.parse::<u32>().ok())
}
pub fn detect_lists(content: &str) -> Vec<MarkdownElement> {
let mut lists = Vec::new();
let lines: Vec<&str> = content.lines().collect();
let code_block_lines = Self::detect_code_block_lines(content);
let frontmatter_lines = if let Some(frontmatter) = Self::detect_front_matter(content) {
(frontmatter.start_line..=frontmatter.end_line).collect::<HashSet<usize>>()
} else {
HashSet::new()
};
static HORIZONTAL_RULE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(\s*)(-{3,}|\*{3,}|_{3,})(\s*)$").unwrap());
for (i, line) in lines.iter().enumerate() {
if code_block_lines.contains(&i) || frontmatter_lines.contains(&i) {
continue;
}
if HORIZONTAL_RULE.is_match(line) {
continue;
}
if let Some(_captures) = UNORDERED_LIST.captures(line) {
let marker = if line.trim_start().starts_with('*') {
"asterisk"
} else if line.trim_start().starts_with('+') {
"plus"
} else {
"minus"
};
lists.push(MarkdownElement {
element_type: ElementType::List,
start_line: i,
end_line: i,
text: line.trim().to_string(),
metadata: Some(marker.to_string()),
quality: ElementQuality::Valid,
});
continue;
}
if let Some(_captures) = EMPTY_UNORDERED_LIST.captures(line) {
if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
continue;
}
let marker = if line.trim_start().starts_with('*') {
"asterisk"
} else if line.trim_start().starts_with('+') {
"plus"
} else {
"minus"
};
lists.push(MarkdownElement {
element_type: ElementType::List,
start_line: i,
end_line: i,
text: String::new(), metadata: Some(marker.to_string()),
quality: ElementQuality::Valid,
});
continue;
}
if let Some(_captures) = MALFORMED_UNORDERED_LIST.captures(line) {
if line.trim() == "---" || line.trim() == "***" || line.trim() == "___" {
continue;
}
let marker = if line.trim_start().starts_with('*') {
"asterisk:no_space"
} else if line.trim_start().starts_with('+') {
"plus:no_space"
} else {
"minus:no_space"
};
lists.push(MarkdownElement {
element_type: ElementType::List,
start_line: i,
end_line: i,
text: line.trim().to_string(),
metadata: Some(marker.to_string()),
quality: ElementQuality::Malformed,
});
continue;
}
if let Some(_captures) = ORDERED_LIST.captures(line) {
lists.push(MarkdownElement {
element_type: ElementType::List,
start_line: i,
end_line: i,
text: line.trim().to_string(),
metadata: Some("ordered".to_string()),
quality: ElementQuality::Valid,
});
continue;
}
if let Some(_captures) = MALFORMED_ORDERED_LIST.captures(line) {
lists.push(MarkdownElement {
element_type: ElementType::List,
start_line: i,
end_line: i,
text: line.trim().to_string(),
metadata: Some("ordered:no_space".to_string()),
quality: ElementQuality::Malformed,
});
continue;
}
if let Some(_captures) = MALFORMED_ORDERED_LIST_WRONG_MARKER.captures(line) {
lists.push(MarkdownElement {
element_type: ElementType::List,
start_line: i,
end_line: i,
text: line.trim().to_string(),
metadata: Some("ordered:wrong_marker".to_string()),
quality: ElementQuality::Malformed,
});
}
}
lists
}
pub fn detect_front_matter(content: &str) -> Option<MarkdownElement> {
let lines: Vec<&str> = content.lines().collect();
if lines.is_empty() || !FRONT_MATTER_DELIMITER.is_match(lines[0]) {
return None;
}
for (i, line) in lines.iter().enumerate().skip(1) {
if FRONT_MATTER_DELIMITER.is_match(line) {
return Some(MarkdownElement {
element_type: ElementType::FrontMatter,
start_line: 0,
end_line: i,
text: lines[0..=i].join("\n"),
metadata: None,
quality: ElementQuality::Valid,
});
}
}
None
}
pub fn heading_to_fragment(text: &str) -> String {
let text_no_html = regex::Regex::new(r"<[^>]*>").unwrap().replace_all(text, "");
let text_lower = text_no_html.trim().to_lowercase();
let text_with_hyphens = text_lower
.chars()
.map(|c| if c.is_alphanumeric() { c } else { '-' })
.collect::<String>();
let text_clean = text_with_hyphens
.split('-')
.filter(|s| !s.is_empty())
.collect::<Vec<_>>()
.join("-");
text_clean.trim_matches('-').to_string()
}
pub fn is_line_in_code_block(content: &str, line_number: usize) -> bool {
let code_block_lines = Self::detect_code_block_lines(content);
code_block_lines.contains(&line_number)
}
pub fn get_element_line_indices(element: &MarkdownElement) -> Vec<usize> {
(element.start_line..=element.end_line).collect()
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A closed fence yields one block spanning both fence lines, with the
    /// info string as metadata.
    #[test]
    fn test_detect_code_blocks() {
        let found = MarkdownElements::detect_code_blocks("# Heading\n```js\nlet x = 1;\n```\nText");
        assert_eq!(found.len(), 1);

        let block = &found[0];
        assert_eq!(block.element_type, ElementType::CodeBlock);
        assert_eq!((block.start_line, block.end_line), (1, 3));
        assert_eq!(block.metadata.as_deref(), Some("js"));
    }

    /// Offsets before, inside and after a backtick-delimited span.
    #[test]
    fn test_is_in_code_span() {
        let line = "Text with `code` and more";
        for (offset, expected) in [(0, false), (11, true), (20, false)] {
            assert_eq!(
                MarkdownElements::is_in_code_span(line, offset),
                expected,
                "offset {offset}"
            );
        }
    }

    /// Two ATX headings plus one setext heading, in document order.
    #[test]
    fn test_detect_headings() {
        let found = MarkdownElements::detect_headings("# Heading 1\n## Heading 2\nText\nHeading 3\n===");
        let levels: Vec<Option<u32>> = found.iter().map(MarkdownElements::get_heading_level).collect();
        assert_eq!(levels, vec![Some(1), Some(2), Some(1)]);
    }

    /// Each bullet style and an ordered item report their marker kind.
    #[test]
    fn test_detect_lists() {
        let found = MarkdownElements::detect_lists("- Item 1\n* Item 2\n+ Item 3\n1. Item 4");
        let markers: Vec<Option<&str>> = found.iter().map(|item| item.metadata.as_deref()).collect();
        assert_eq!(
            markers,
            vec![Some("minus"), Some("asterisk"), Some("plus"), Some("ordered")]
        );
    }

    /// Front matter ends on the closing delimiter line.
    #[test]
    fn test_detect_front_matter() {
        let found = MarkdownElements::detect_front_matter("---\ntitle: Test\n---\n# Content");
        assert_eq!(found.map(|front_matter| front_matter.end_line), Some(2));
    }

    /// Punctuation and spaces collapse into single hyphens.
    #[test]
    fn test_heading_to_fragment() {
        let cases = [
            ("Hello World!", "hello-world"),
            ("Complex: (Header) 123", "complex-header-123"),
        ];
        for (heading, fragment) in cases {
            assert_eq!(MarkdownElements::heading_to_fragment(heading), fragment);
        }
    }

    /// Fence lines and the code between them count as inside the block.
    #[test]
    fn test_is_line_in_code_block() {
        let content = "Text\n```\nCode\n```\nMore text";
        let inside: Vec<bool> = (0..5)
            .map(|line_number| MarkdownElements::is_line_in_code_block(content, line_number))
            .collect();
        assert_eq!(inside, vec![false, true, true, true, false]);
    }
}