use crate::config::MarkdownFlavor;
use crate::utils::blockquote::parse_blockquote_prefix;
use crate::utils::calculate_indentation_width_default;
use crate::utils::code_block_utils::CodeBlockUtils;
use pulldown_cmark::{Event, Options, Parser, Tag, TagEnd};
use super::types::*;
use super::{ListItemMap, SkipByteRanges};
pub(super) fn compute_basic_line_info(
content: &str,
content_lines: &[&str],
line_offsets: &[usize],
code_blocks: &[(usize, usize)],
flavor: MarkdownFlavor,
skip_ranges: &SkipByteRanges<'_>,
front_matter_end: usize,
) -> (Vec<LineInfo>, Vec<EmphasisSpan>) {
let mut lines = Vec::with_capacity(content_lines.len());
let extension_line_map = if flavor.supports_kramdown_syntax() {
Some(compute_kramdown_extension_line_map(content_lines))
} else {
None
};
let filtered_code_blocks;
let code_blocks_for_map = if let Some(extension_line_map) = &extension_line_map {
filtered_code_blocks =
filter_code_blocks_starting_in_extension_blocks(content, line_offsets, code_blocks, extension_line_map);
filtered_code_blocks.as_slice()
} else {
code_blocks
};
let code_block_map = compute_code_block_line_map(content, line_offsets, code_blocks_for_map);
let math_block_map = compute_math_block_line_map(content_lines, &code_block_map);
let pulldown_content;
let pulldown_input = if let Some(extension_line_map) = &extension_line_map {
pulldown_content = mask_kramdown_extension_lines(content, line_offsets, extension_line_map);
pulldown_content.as_str()
} else {
content
};
let (list_item_map, emphasis_spans) = detect_list_items_and_emphasis_with_pulldown(
pulldown_input,
line_offsets,
flavor,
front_matter_end,
code_blocks_for_map,
);
for (i, line) in content_lines.iter().enumerate() {
let byte_offset = line_offsets.get(i).copied().unwrap_or(0);
let indent = line.len() - line.trim_start().len();
let visual_indent = calculate_indentation_width_default(line);
let blockquote_parse = parse_blockquote_prefix(line);
let is_blank = if let Some(parsed) = blockquote_parse {
parsed.content.trim().is_empty()
} else {
line.trim().is_empty()
};
let in_code_block = code_block_map.get(i).copied().unwrap_or(false);
let in_mkdocstrings =
crate::utils::mkdocstrings_refs::is_within_autodoc_block_ranges(skip_ranges.autodoc_ranges, byte_offset);
let line_end_offset = byte_offset + line.len();
let in_html_comment = crate::utils::skip_context::is_line_entirely_in_html_comment(
skip_ranges.html_comment_ranges,
byte_offset,
line_end_offset,
);
let list_item =
list_item_map
.get(&byte_offset)
.map(|(is_ordered, marker, marker_column, content_column, number)| {
Box::new(ListItemInfo {
marker: marker.clone(),
is_ordered: *is_ordered,
number: *number,
marker_column: *marker_column,
content_column: *content_column,
})
});
let in_front_matter = front_matter_end > 0 && i < front_matter_end;
let is_hr = !in_code_block && !in_front_matter && is_horizontal_rule_line(line);
let in_math_block = math_block_map.get(i).copied().unwrap_or(false);
let in_quarto_div = flavor == MarkdownFlavor::Quarto
&& crate::utils::quarto_divs::is_within_div_block_ranges(skip_ranges.quarto_div_ranges, byte_offset);
let is_div_marker = !in_code_block && !in_front_matter && line.trim_start().starts_with(":::");
let in_pymdown_block = flavor == MarkdownFlavor::MkDocs
&& crate::utils::pymdown_blocks::is_within_block_ranges(skip_ranges.pymdown_block_ranges, byte_offset);
lines.push(LineInfo {
byte_offset,
byte_len: line.len(),
indent,
visual_indent,
is_blank,
in_code_block,
in_front_matter,
in_html_block: false,
in_html_comment,
list_item,
heading: None,
blockquote: None,
in_mkdocstrings,
in_esm_block: false,
in_code_span_continuation: false,
is_horizontal_rule: is_hr,
in_math_block,
in_quarto_div,
is_div_marker,
in_jsx_expression: false,
in_mdx_comment: false,
in_admonition: false,
in_content_tab: false,
in_mkdocs_html_markdown: false,
in_definition_list: false,
in_obsidian_comment: false,
in_pymdown_block,
in_kramdown_extension_block: false,
is_kramdown_block_ial: false,
in_jsx_block: false,
});
}
(lines, emphasis_spans)
}
fn compute_kramdown_extension_line_map(content_lines: &[&str]) -> Vec<bool> {
use crate::utils::kramdown_utils;
let mut in_extension_block = false;
let mut extension_lines = vec![false; content_lines.len()];
for (i, line) in content_lines.iter().enumerate() {
let trimmed = line.trim();
if in_extension_block {
extension_lines[i] = true;
if kramdown_utils::is_kramdown_extension_close(trimmed) {
in_extension_block = false;
}
continue;
}
if kramdown_utils::is_kramdown_extension_self_closing(trimmed) {
extension_lines[i] = true;
continue;
}
if kramdown_utils::is_kramdown_extension_open(trimmed) {
extension_lines[i] = true;
in_extension_block = true;
}
}
extension_lines
}
fn mask_kramdown_extension_lines(content: &str, line_offsets: &[usize], extension_line_map: &[bool]) -> String {
let mut masked = content.as_bytes().to_vec();
for (line_idx, in_extension) in extension_line_map.iter().enumerate() {
if !in_extension {
continue;
}
let Some(start) = line_offsets.get(line_idx).copied() else {
continue;
};
let end = line_offsets
.get(line_idx + 1)
.copied()
.unwrap_or(content.len())
.min(content.len());
if start >= end || start >= masked.len() {
continue;
}
let end = end.min(masked.len());
for b in &mut masked[start..end] {
if *b != b'\n' && *b != b'\r' {
*b = b' ';
}
}
}
String::from_utf8(masked).expect("masked kramdown content should remain valid UTF-8")
}
fn filter_code_blocks_starting_in_extension_blocks(
content: &str,
line_offsets: &[usize],
code_blocks: &[(usize, usize)],
extension_line_map: &[bool],
) -> Vec<(usize, usize)> {
code_blocks
.iter()
.copied()
.filter(|(start, _end)| {
let safe_start = if *start > 0 && !content.is_char_boundary(*start) {
let mut boundary = *start;
while boundary > 0 && !content.is_char_boundary(boundary) {
boundary -= 1;
}
boundary
} else {
*start
};
let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
let first_line = first_line_after.saturating_sub(1);
!extension_line_map.get(first_line).copied().unwrap_or(false)
})
.collect()
}
pub(super) fn compute_code_block_line_map(
content: &str,
line_offsets: &[usize],
code_blocks: &[(usize, usize)],
) -> Vec<bool> {
let num_lines = line_offsets.len();
let mut in_code_block = vec![false; num_lines];
for &(start, end) in code_blocks {
let safe_start = if start > 0 && !content.is_char_boundary(start) {
let mut boundary = start;
while boundary > 0 && !content.is_char_boundary(boundary) {
boundary -= 1;
}
boundary
} else {
start
};
let safe_end = if end < content.len() && !content.is_char_boundary(end) {
let mut boundary = end;
while boundary < content.len() && !content.is_char_boundary(boundary) {
boundary += 1;
}
boundary
} else {
end.min(content.len())
};
let first_line_after = line_offsets.partition_point(|&offset| offset <= safe_start);
let first_line = first_line_after.saturating_sub(1);
let last_line = line_offsets.partition_point(|&offset| offset < safe_end);
for flag in in_code_block.iter_mut().take(last_line).skip(first_line) {
*flag = true;
}
}
in_code_block
}
pub(super) fn compute_math_block_line_map(content_lines: &[&str], code_block_map: &[bool]) -> Vec<bool> {
let num_lines = content_lines.len();
let mut in_math_block = vec![false; num_lines];
let mut inside_math = false;
for (i, line) in content_lines.iter().enumerate() {
if code_block_map.get(i).copied().unwrap_or(false) {
continue;
}
let trimmed = line.trim();
if trimmed == "$$" {
if inside_math {
in_math_block[i] = true;
inside_math = false;
} else {
in_math_block[i] = true;
inside_math = true;
}
} else if inside_math {
in_math_block[i] = true;
}
}
in_math_block
}
pub(super) fn detect_list_items_and_emphasis_with_pulldown(
content: &str,
line_offsets: &[usize],
flavor: MarkdownFlavor,
front_matter_end: usize,
code_blocks: &[(usize, usize)],
) -> (ListItemMap, Vec<EmphasisSpan>) {
use std::collections::HashMap;
let mut list_items = HashMap::new();
let mut emphasis_spans = Vec::with_capacity(content.matches('*').count() + content.matches('_').count() / 4);
let mut options = Options::empty();
options.insert(Options::ENABLE_TABLES);
options.insert(Options::ENABLE_FOOTNOTES);
options.insert(Options::ENABLE_STRIKETHROUGH);
options.insert(Options::ENABLE_TASKLISTS);
options.insert(Options::ENABLE_GFM);
let _ = flavor;
let parser = Parser::new_ext(content, options).into_offset_iter();
let mut list_depth: usize = 0;
let mut list_stack: Vec<bool> = Vec::new();
for (event, range) in parser {
match event {
Event::Start(Tag::Emphasis) | Event::Start(Tag::Strong) => {
let marker_count = if matches!(event, Event::Start(Tag::Strong)) {
2
} else {
1
};
let match_start = range.start;
let match_end = range.end;
if !CodeBlockUtils::is_in_code_block_or_span(code_blocks, match_start) {
let marker = content[match_start..].chars().next().unwrap_or('*');
if marker == '*' || marker == '_' {
let content_start = match_start + marker_count;
let content_end = if match_end >= marker_count {
match_end - marker_count
} else {
match_end
};
let content_part = if content_start < content_end && content_end <= content.len() {
&content[content_start..content_end]
} else {
""
};
let line_idx = match line_offsets.binary_search(&match_start) {
Ok(idx) => idx,
Err(idx) => idx.saturating_sub(1),
};
let line_num = line_idx + 1;
let line_start = line_offsets.get(line_idx).copied().unwrap_or(0);
let col_start = match_start - line_start;
let col_end = match_end - line_start;
emphasis_spans.push(EmphasisSpan {
line: line_num,
start_col: col_start,
end_col: col_end,
byte_offset: match_start,
byte_end: match_end,
marker,
marker_count,
content: content_part.to_string(),
});
}
}
}
Event::Start(Tag::List(start_number)) => {
list_depth += 1;
list_stack.push(start_number.is_some());
}
Event::End(TagEnd::List(_)) => {
list_depth = list_depth.saturating_sub(1);
list_stack.pop();
}
Event::Start(Tag::Item) if list_depth > 0 => {
let current_list_is_ordered = list_stack.last().copied().unwrap_or(false);
let item_start = range.start;
let mut line_idx = match line_offsets.binary_search(&item_start) {
Ok(idx) => idx,
Err(idx) => idx.saturating_sub(1),
};
if item_start < content.len() && content.as_bytes()[item_start] == b'\n' {
line_idx += 1;
}
if front_matter_end > 0 && line_idx < front_matter_end {
continue;
}
if line_idx < line_offsets.len() {
let line_start_byte = line_offsets[line_idx];
let line_end = line_offsets.get(line_idx + 1).copied().unwrap_or(content.len());
let line = &content[line_start_byte..line_end.min(content.len())];
let line = line
.strip_suffix('\n')
.or_else(|| line.strip_suffix("\r\n"))
.unwrap_or(line);
let blockquote_parse = parse_blockquote_prefix(line);
let (blockquote_prefix_len, line_to_parse) = if let Some(parsed) = blockquote_parse {
(parsed.prefix.len(), parsed.content)
} else {
(0, line)
};
if current_list_is_ordered {
if let Some((leading_spaces, number_str, delimiter, spacing, _content)) =
parse_ordered_list(line_to_parse)
{
let marker = format!("{number_str}{delimiter}");
let marker_column = blockquote_prefix_len + leading_spaces.len();
let content_column = marker_column + marker.len() + spacing.len();
let number = number_str.parse().ok();
list_items.entry(line_start_byte).or_insert((
true,
marker,
marker_column,
content_column,
number,
));
}
} else if let Some((leading_spaces, marker, spacing, _content)) =
parse_unordered_list(line_to_parse)
{
let marker_column = blockquote_prefix_len + leading_spaces.len();
let content_column = marker_column + 1 + spacing.len();
list_items.entry(line_start_byte).or_insert((
false,
marker.to_string(),
marker_column,
content_column,
None,
));
}
}
}
_ => {}
}
}
(list_items, emphasis_spans)
}
pub(super) fn compute_char_frequency(content: &str) -> CharFrequency {
let mut frequency = CharFrequency::default();
for ch in content.chars() {
match ch {
'#' => frequency.hash_count += 1,
'*' => frequency.asterisk_count += 1,
'_' => frequency.underscore_count += 1,
'-' => frequency.hyphen_count += 1,
'+' => frequency.plus_count += 1,
'>' => frequency.gt_count += 1,
'|' => frequency.pipe_count += 1,
'[' => frequency.bracket_count += 1,
'`' => frequency.backtick_count += 1,
'<' => frequency.lt_count += 1,
'!' => frequency.exclamation_count += 1,
'\n' => frequency.newline_count += 1,
_ => {}
}
}
frequency
}
#[inline]
pub(super) fn parse_unordered_list(line: &str) -> Option<(&str, char, &str, &str)> {
let bytes = line.as_bytes();
let mut i = 0;
while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
i += 1;
}
if i >= bytes.len() {
return None;
}
let marker = bytes[i] as char;
if marker != '-' && marker != '*' && marker != '+' {
return None;
}
let marker_pos = i;
i += 1;
let spacing_start = i;
while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
i += 1;
}
Some((&line[..marker_pos], marker, &line[spacing_start..i], &line[i..]))
}
#[inline]
pub(super) fn parse_ordered_list(line: &str) -> Option<(&str, &str, char, &str, &str)> {
let bytes = line.as_bytes();
let mut i = 0;
while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
i += 1;
}
let number_start = i;
while i < bytes.len() && bytes[i].is_ascii_digit() {
i += 1;
}
if i == number_start {
return None;
}
if i >= bytes.len() {
return None;
}
let delimiter = bytes[i] as char;
if delimiter != '.' && delimiter != ')' {
return None;
}
let delimiter_pos = i;
i += 1;
let spacing_start = i;
while i < bytes.len() && (bytes[i] == b' ' || bytes[i] == b'\t') {
i += 1;
}
Some((
&line[..number_start],
&line[number_start..delimiter_pos],
delimiter,
&line[spacing_start..i],
&line[i..],
))
}