use markdown::mdast::Html;
use markdown::unist::Point as UnistPoint;
use markdown::unist::Position as UnistPosition;
use crate::MdtResult;
use crate::config::CodeBlockFilter;
use crate::lexer::memstr;
use crate::lexer::tokenize;
use crate::parser::Block;
use crate::parser::ParseDiagnostic;
use crate::parser::build_blocks_from_groups_lenient;
use crate::parser::build_blocks_from_groups_with_diagnostics;
/// Parses `content` into directive blocks in lenient mode: malformed
/// directive groups are tolerated and no diagnostics are collected.
pub fn parse_source(content: &str) -> MdtResult<Vec<Block>> {
    let groups = tokenize(extract_html_comments(content))?;
    build_blocks_from_groups_lenient(&groups)
}
/// Parses `content` into directive blocks, also collecting parse diagnostics.
///
/// When the code-block `filter` is enabled, HTML comments whose starting
/// offset falls inside a skipped fenced code block are dropped before
/// tokenizing, so fenced examples don't produce spurious directives.
/// Comments without position info are always kept.
pub fn parse_source_with_diagnostics(
    content: &str,
    filter: &CodeBlockFilter,
) -> MdtResult<(Vec<Block>, Vec<ParseDiagnostic>)> {
    let mut html_nodes = extract_html_comments(content);

    if filter.is_enabled() {
        let skip_ranges = find_fenced_code_block_ranges(content, filter);
        html_nodes.retain(|node| {
            node.position.as_ref().map_or(true, |pos| {
                let off = pos.start.offset;
                // Keep the node only if it lies outside every skipped range.
                skip_ranges.iter().all(|r| off < r.start || off >= r.end)
            })
        });
    }

    let token_groups = tokenize(html_nodes)?;
    build_blocks_from_groups_with_diagnostics(&token_groups)
}
/// Precomputed byte offsets of every line start, used to translate a flat
/// byte offset into a 1-based (line, column) source position.
struct LineTable {
    line_starts: Vec<usize>,
}

impl LineTable {
    /// Scans `content` once, recording offset 0 plus the byte offset
    /// immediately following each `\n`.
    fn new(content: &str) -> Self {
        let mut line_starts = Vec::new();
        line_starts.push(0);
        for (idx, b) in content.bytes().enumerate() {
            if b == b'\n' {
                line_starts.push(idx + 1);
            }
        }
        Self { line_starts }
    }

    /// Maps a byte `offset` to a unist `Point` with 1-based line and column.
    /// NOTE(review): the column is byte-based (offset from line start + 1),
    /// so multibyte UTF-8 characters shift it relative to a char count.
    fn offset_to_point(&self, offset: usize) -> UnistPoint {
        // Index of the last line start <= offset. `line_starts` is strictly
        // increasing and begins with 0, so the subtraction never underflows
        // for valid offsets; saturating_sub is belt-and-braces.
        let line_idx = self
            .line_starts
            .partition_point(|&start| start <= offset)
            .saturating_sub(1);
        UnistPoint {
            line: line_idx + 1,
            column: offset - self.line_starts[line_idx] + 1,
            offset,
        }
    }
}
/// Scans `content` for HTML comments (`<!-- ... -->`) and returns each one as
/// an `Html` node carrying its byte-accurate source position.
///
/// Matching is purely lexical: each `<!--` is paired with the first `-->`
/// that follows it, and the scan resumes after that closer. A dangling
/// `<!--` with no closing marker ends the scan with no node emitted for it.
pub fn extract_html_comments(content: &str) -> Vec<Html> {
    const OPEN: &[u8] = b"<!--";
    const CLOSE: &[u8] = b"-->";

    let bytes = content.as_bytes();
    let line_table = LineTable::new(content);
    let mut nodes = Vec::new();
    let mut cursor = 0;

    while cursor < bytes.len() {
        // Next opener at or after the cursor.
        let start = match memstr(&bytes[cursor..], OPEN) {
            Some(rel) => cursor + rel,
            None => break,
        };
        let body_start = start + OPEN.len();
        if body_start >= bytes.len() {
            // Opener sits at the very end; no room for a closer.
            break;
        }
        // First closer after the opener; the comment spans through it.
        let end = match memstr(&bytes[body_start..], CLOSE) {
            Some(rel) => body_start + rel + CLOSE.len(),
            None => break,
        };
        nodes.push(Html {
            value: String::from_utf8_lossy(&bytes[start..end]).to_string(),
            position: Some(UnistPosition {
                start: line_table.offset_to_point(start),
                end: line_table.offset_to_point(end),
            }),
        });
        cursor = end;
    }
    nodes
}
/// Half-open byte range `[start, end)` of a fenced code block to skip;
/// callers exclude HTML comments whose start offset falls inside it.
struct CodeBlockRange {
    // Byte offset of the start of the opening-fence line.
    start: usize,
    // Byte offset of the end of the closing-fence line (its newline, or EOF).
    end: usize,
}
/// Line-comment markers stripped before fence detection, so fenced code
/// blocks embedded in doc comments (Rust `///`/`//!`, shell `#`, C-style
/// `*`, Lisp `;`, SQL/Lua `--`, ...) are still recognized.
/// Order matters: the first matching entry wins, so longer markers precede
/// markers they start with (e.g. `///!` before `//!` before `///`).
const COMMENT_PREFIXES: &[&str] = &[
    "///!", "//!", "///", "//", "##", "#", "* ", "**", "*", ";", "--",
];

/// Drops leading whitespace, then at most one comment marker and at most one
/// space after it, returning the remaining slice of `line`.
fn strip_comment_prefix(line: &str) -> &str {
    let trimmed = line.trim_start();
    COMMENT_PREFIXES
        .iter()
        .find_map(|prefix| trimmed.strip_prefix(prefix))
        .map(|rest| rest.strip_prefix(' ').unwrap_or(rest))
        .unwrap_or(trimmed)
}
/// Returns the byte ranges of fenced code blocks whose info string the
/// `filter` says to skip; comments inside these ranges are excluded upstream.
///
/// Lines are passed through `strip_comment_prefix` first, so fences inside
/// doc comments are also detected. Both ``` and ~~~ fences of length >= 3
/// are supported; a closing fence must use the same character, be at least
/// as long as the opener, and carry only trailing whitespace (CommonMark).
///
/// Fix: per CommonMark, a fence with no closing line extends to the end of
/// the document — a still-open skipped block is now flushed at end of input
/// instead of being silently dropped.
fn find_fenced_code_block_ranges(content: &str, filter: &CodeBlockFilter) -> Vec<CodeBlockRange> {
    let mut ranges = Vec::new();
    let mut in_code_block = false;
    let mut block_start = 0;
    let mut should_skip_current = false;
    let mut fence_char = '`';
    let mut fence_len = 0;
    let mut offset = 0;

    for line in content.split('\n') {
        let line_end = offset + line.len();
        let stripped = strip_comment_prefix(line);

        if in_code_block {
            let closing_fence_len = stripped.chars().take_while(|&c| c == fence_char).count();
            // Fence chars are single-byte ASCII, so the char count is a
            // valid byte index into `stripped`.
            let after_fence = &stripped[closing_fence_len..];
            if closing_fence_len >= fence_len && after_fence.trim().is_empty() {
                if should_skip_current {
                    ranges.push(CodeBlockRange {
                        start: block_start,
                        end: line_end,
                    });
                }
                in_code_block = false;
            }
        } else {
            let backtick_len = stripped.chars().take_while(|&c| c == '`').count();
            let tilde_len = stripped.chars().take_while(|&c| c == '~').count();
            let (fc, fl) = if backtick_len >= 3 {
                ('`', backtick_len)
            } else if tilde_len >= 3 {
                ('~', tilde_len)
            } else {
                // Not a fence line; advance past this line's newline.
                offset = line_end + 1;
                continue;
            };
            let info_string = stripped[fl..].trim();
            fence_char = fc;
            fence_len = fl;
            in_code_block = true;
            block_start = offset;
            should_skip_current = filter.should_skip(info_string);
        }
        offset = line_end + 1;
    }

    // An unclosed fence runs to the end of the document (CommonMark), so a
    // still-open skipped block must cover all remaining content.
    if in_code_block && should_skip_current {
        ranges.push(CodeBlockRange {
            start: block_start,
            end: content.len(),
        });
    }
    ranges
}