use crate::consts::{TAG_H1, TAG_H2};
use crate::types::HTMLToMarkdownOptions;
/// Configuration for [`split_markdown`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct SplitterOptions {
    /// Tag ids of the header levels that act as chunk boundaries.
    ///
    /// A chunk is flushed when a header with one of these tag ids is met and a
    /// header of the same level has already been seen earlier in the document.
    pub headers_to_split_on: Vec<u8>,
    /// When `true`, size-based splitting is skipped and every non-blank line of
    /// each chunk is returned as its own chunk.
    pub return_each_line: bool,
    /// When `true`, header lines are removed from the chunk content.
    pub strip_headers: bool,
    /// Size threshold, in bytes, above which a size-based split is attempted.
    pub chunk_size: usize,
    /// Number of bytes from the end of a size-split chunk that are repeated at
    /// the start of the next one. Must be smaller than `chunk_size`.
    pub chunk_overlap: usize,
}
impl Default for SplitterOptions {
    /// Returns the default splitter configuration: five header tag ids starting
    /// at `TAG_H2`, headers stripped from content, 1000-byte chunks and a
    /// 200-byte overlap.
    fn default() -> Self {
        // Five consecutive tag ids beginning at `TAG_H2` (presumably h2 through h6).
        let headers_to_split_on: Vec<u8> = (0u8..5).map(|offset| TAG_H2 + offset).collect();
        Self {
            headers_to_split_on,
            return_each_line: false,
            strip_headers: true,
            chunk_size: 1000,
            chunk_overlap: 200,
        }
    }
}
/// One piece of a split Markdown document together with its metadata.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct MarkdownChunk {
    /// Text of the chunk.
    pub content: String,
    /// Information about where the chunk came from.
    pub metadata: ChunkMetadata,
}

/// Descriptive data attached to a [`MarkdownChunk`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkMetadata {
    /// Headers in effect for the chunk as `(name, text)` pairs, where the name
    /// has the form `"h<level>"`. `None` when no header has been recorded.
    pub headers: Option<Vec<(String, String)>>,
    /// Language tag taken from a fenced code block opening, if one was recorded
    /// for the chunk.
    pub code: Option<String>,
    /// Line range the chunk was taken from.
    pub loc: Option<ChunkLoc>,
}

/// Inclusive line range of a chunk; numbering starts at 1.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkLoc {
    /// First line of the range.
    pub from: usize,
    /// Last line of the range.
    pub to: usize,
}
/// Removes inline Markdown emphasis and code markers from `text`.
///
/// Every `*`, `_` and backtick character is dropped wherever it occurs
/// (including inside words), then surrounding whitespace is trimmed.
fn strip_markdown_formatting(text: &str) -> String {
    // Filtering on `char`s keeps the result valid UTF-8 without any `unsafe`.
    let stripped: String = text
        .chars()
        .filter(|c| !matches!(c, '*' | '_' | '`'))
        .collect();
    // Trim only after the markers are gone: removing them can expose whitespace
    // at either end (e.g. "* title *").
    stripped.trim().to_owned()
}
/// Parses an ATX-style header line.
///
/// A header is one to six `#` characters at the very start of the line,
/// followed by a single space and at least one more byte. Returns the number of
/// `#` characters and the text after that first space, or `None` otherwise.
#[inline]
fn detect_header(line: &str) -> Option<(u8, &str)> {
    let hashes = line.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return None;
    }
    // `#` is ASCII, so `hashes` is always a valid char boundary.
    let text = line[hashes..].strip_prefix(' ')?;
    if text.is_empty() {
        return None;
    }
    // `hashes` is at most 6 here, so the cast cannot truncate.
    Some((hashes as u8, text))
}
/// Returns `true` when `line` is exactly one of the three-character
/// horizontal-rule markers `---`, `***` or `___`.
#[inline]
fn is_hr(line: &str) -> bool {
    matches!(line, "---" | "***" | "___")
}
/// Counts the `\n` characters in `s`.
#[inline]
fn count_newlines(s: &str) -> usize {
    s.matches('\n').count()
}
/// Returns `true` when `detect_header` recognises `line` as a header.
#[inline]
fn is_header_line(line: &str) -> bool {
detect_header(line).is_some()
}
/// Counts non-overlapping runs of three backticks in `s`, scanning left to
/// right. The sequence is counted wherever it occurs, not only at line starts.
#[inline]
fn count_backtick_fences(s: &str) -> usize {
    s.matches("```").count()
}
/// Splits a Markdown document into chunks.
///
/// Leading whitespace is trimmed from `markdown`, then the text is scanned line
/// by line. Outside fenced code blocks a chunk boundary is placed:
///
/// * before a header whose tag id is listed in `opts.headers_to_split_on`, once
///   a header of that same level has already been seen;
/// * before a horizontal rule (`---`, `***`, `___`), except for the opening and
///   closing `---` lines of a leading frontmatter block;
/// * when the pending text grows beyond `opts.chunk_size` bytes (skipped when
///   `opts.return_each_line` is set). The split point is the last `"\n\n"`,
///   fence end, newline or space before the size limit that does not fall
///   inside an open code fence; `opts.chunk_overlap` bytes are then repeated at
///   the start of the next chunk.
///
/// Each chunk records the headers in effect, the language of the first fenced
/// code block opened while it was accumulated, and a line range. When
/// `opts.return_each_line` is set, every non-blank line of every chunk is
/// returned as a separate chunk instead.
///
/// Returns an empty vector for blank input.
///
/// # Panics
///
/// Panics if `opts.chunk_overlap` is not smaller than `opts.chunk_size`.
pub fn split_markdown(markdown: &str, opts: &SplitterOptions) -> Vec<MarkdownChunk> {
    assert!(opts.chunk_overlap < opts.chunk_size, "chunk_overlap must be less than chunk_size");

    let markdown = markdown.trim_start();
    if markdown.trim().is_empty() {
        return Vec::new();
    }

    let lines: Vec<&str> = markdown.split('\n').collect();

    // Byte offset at which each line starts (`+ 1` accounts for the '\n').
    let mut line_starts: Vec<usize> = Vec::with_capacity(lines.len());
    let mut pos = 0usize;
    for line in &lines {
        line_starts.push(pos);
        pos += line.len() + 1;
    }

    // Index of the closing `---` of a leading frontmatter block, if the document
    // opens with `---` and a matching line follows.
    let frontmatter_end: Option<usize> =
        if lines.first().map_or(false, |first| first.trim() == "---") {
            lines
                .iter()
                .skip(1)
                .position(|line| line.trim() == "---")
                .map(|offset| offset + 1)
        } else {
            None
        };

    let mut header_hierarchy: Vec<(u8, String)> = Vec::new();
    // Bit set of split-header levels encountered so far (bit n = TAG_H1 + n).
    let mut seen_split_headers: u8 = 0;
    let mut current_chunk_code_language = String::new();
    let mut in_code_block = false;
    let mut line_number: usize = 1;
    let mut last_chunk_end_position: usize = 0;
    let mut last_split_position: usize = 0;
    let mut chunks: Vec<MarkdownChunk> = Vec::new();

    // Emits the text between `last_chunk_end_position` and `end_position` as a
    // chunk (unless it is blank) and advances the bookkeeping state. All mutable
    // state is passed in explicitly so the closure only borrows `markdown`.
    //
    // NOTE(review): the early returns below do not advance `line_number`, and
    // after an overlap the next chunk re-reads lines that were already counted,
    // so `loc` can drift from the true line numbers. Kept as-is; confirm whether
    // this is intended.
    let flush_chunk = |chunks: &mut Vec<MarkdownChunk>,
                       last_chunk_end_position: &mut usize,
                       last_split_position: &mut usize,
                       line_number: &mut usize,
                       current_chunk_code_language: &mut String,
                       header_hierarchy: &[(u8, String)],
                       end_position: usize,
                       apply_overlap: bool,
                       strip_headers: bool,
                       chunk_overlap: usize| {
        let end_position = end_position.min(markdown.len());
        let start = (*last_chunk_end_position).min(end_position);
        let original_chunk_content = &markdown[start..end_position];
        if original_chunk_content.trim().is_empty() {
            *last_chunk_end_position = end_position;
            return;
        }

        let chunk_content = if strip_headers {
            let without_headers = original_chunk_content
                .split('\n')
                .filter(|line| !is_header_line(line))
                .collect::<Vec<&str>>()
                .join("\n");
            let trimmed = without_headers.trim();
            if trimmed.is_empty() {
                // The span held nothing but headers.
                *last_chunk_end_position = end_position;
                return;
            }
            trimmed.to_string()
        } else {
            original_chunk_content.trim_end().to_string()
        };

        let newline_count = count_newlines(original_chunk_content);

        let headers = if header_hierarchy.is_empty() {
            None
        } else {
            Some(
                header_hierarchy
                    .iter()
                    .map(|(tag_id, text)| {
                        let level = *tag_id - TAG_H1 + 1;
                        (format!("h{level}"), text.clone())
                    })
                    .collect(),
            )
        };

        // Hand the recorded language to this chunk and reset it for the next one.
        let code = if current_chunk_code_language.is_empty() {
            None
        } else {
            Some(std::mem::take(current_chunk_code_language))
        };

        chunks.push(MarkdownChunk {
            content: chunk_content,
            metadata: ChunkMetadata {
                headers,
                code,
                loc: Some(ChunkLoc {
                    from: *line_number,
                    to: *line_number + newline_count,
                }),
            },
        });

        *last_split_position = end_position;
        *last_chunk_end_position = if apply_overlap && chunk_overlap > 0 {
            // Never overlap the whole chunk, otherwise no progress would be made.
            let max_overlap = original_chunk_content.len().saturating_sub(1);
            let mut overlap_pos = end_position - chunk_overlap.min(max_overlap);
            // Move forward to a char boundary so later slicing cannot panic.
            while overlap_pos < markdown.len() && !markdown.is_char_boundary(overlap_pos) {
                overlap_pos += 1;
            }
            overlap_pos
        } else {
            end_position
        };
        *line_number += newline_count;
    };

    // Separators tried, in order of preference, for a size-based split.
    const SEPARATORS: [&str; 4] = ["\n\n", "```\n", "\n", " "];

    for (i, (&line, &line_pos)) in lines.iter().zip(&line_starts).enumerate() {
        // Only the two `---` delimiter lines are exempt from header/rule handling.
        let is_frontmatter = matches!(frontmatter_end, Some(end) if i == 0 || i == end);

        let fence_rest = line.strip_prefix("```");
        if let Some(rest) = fence_rest {
            if in_code_block {
                in_code_block = false;
            } else {
                in_code_block = true;
                let lang = rest.trim();
                // Only the first language seen for the pending chunk is kept.
                if !lang.is_empty() && current_chunk_code_language.is_empty() {
                    current_chunk_code_language = lang.to_string();
                }
            }
        }

        if !in_code_block && fence_rest.is_none() {
            let header_match = if is_frontmatter {
                None
            } else {
                detect_header(line)
            };

            if let Some((level, header_text)) = header_match {
                let tag_id = TAG_H1 + level - 1;
                let stripped_text = strip_markdown_formatting(header_text);

                if opts.headers_to_split_on.contains(&tag_id) {
                    let bit = 1u8 << (tag_id - TAG_H1);
                    if seen_split_headers & bit != 0 {
                        // A repeated level closes the chunk that precedes it.
                        flush_chunk(
                            &mut chunks,
                            &mut last_chunk_end_position,
                            &mut last_split_position,
                            &mut line_number,
                            &mut current_chunk_code_language,
                            &header_hierarchy,
                            line_pos,
                            false,
                            opts.strip_headers,
                            opts.chunk_overlap,
                        );
                        header_hierarchy.retain(|(k, _)| *k < tag_id);
                    }
                    seen_split_headers |= bit;
                }

                if let Some(entry) = header_hierarchy.iter_mut().find(|(k, _)| *k == tag_id) {
                    entry.1 = stripped_text;
                } else {
                    header_hierarchy.push((tag_id, stripped_text));
                }
            }

            if !is_frontmatter && header_match.is_none() && is_hr(line) {
                flush_chunk(
                    &mut chunks,
                    &mut last_chunk_end_position,
                    &mut last_split_position,
                    &mut line_number,
                    &mut current_chunk_code_language,
                    &header_hierarchy,
                    line_pos,
                    false,
                    opts.strip_headers,
                    opts.chunk_overlap,
                );
            }
        }

        if !opts.return_each_line {
            let line_end = (line_pos + line.len() + 1).min(markdown.len());
            let current_chunk_size = line_end.saturating_sub(last_chunk_end_position);

            if current_chunk_size > opts.chunk_size {
                let ideal_split_pos = last_chunk_end_position + opts.chunk_size;
                let current_md = &markdown[..line_end];

                // The search window is the same for every separator.
                let mut search_end = ideal_split_pos.min(current_md.len());
                while search_end < current_md.len() && !current_md.is_char_boundary(search_end) {
                    search_end += 1;
                }
                let search_region = &current_md[..search_end];

                let mut split_position: Option<usize> = None;
                for &sep in &SEPARATORS {
                    if let Some(idx) = search_region.rfind(sep) {
                        let candidate_split_pos = idx + sep.len();
                        // An odd fence count means the candidate lies inside an
                        // open code block; fall back to the next separator.
                        if count_backtick_fences(&current_md[..candidate_split_pos]) % 2 == 1 {
                            continue;
                        }
                        if candidate_split_pos > last_split_position {
                            split_position = Some(candidate_split_pos);
                            break;
                        }
                    }
                }

                // Without a usable separator past the current chunk start, cut at
                // the end of the current line.
                let final_split = match split_position {
                    Some(p) if p > last_chunk_end_position => p,
                    _ => line_end,
                };

                flush_chunk(
                    &mut chunks,
                    &mut last_chunk_end_position,
                    &mut last_split_position,
                    &mut line_number,
                    &mut current_chunk_code_language,
                    &header_hierarchy,
                    final_split,
                    true,
                    opts.strip_headers,
                    opts.chunk_overlap,
                );
            }
        }
    }

    // Emit whatever remains after the last boundary.
    flush_chunk(
        &mut chunks,
        &mut last_chunk_end_position,
        &mut last_split_position,
        &mut line_number,
        &mut current_chunk_code_language,
        &header_hierarchy,
        markdown.len(),
        false,
        opts.strip_headers,
        opts.chunk_overlap,
    );

    if !opts.return_each_line || chunks.is_empty() {
        return chunks;
    }

    // Expand every chunk into one chunk per non-blank line, numbering lines
    // relative to the start of the chunk they came from.
    let mut line_chunks: Vec<MarkdownChunk> = Vec::new();
    for chunk in chunks {
        let chunk_start_line = chunk.metadata.loc.as_ref().map_or(1, |loc| loc.from);
        for (offset, chunk_line) in chunk.content.split('\n').enumerate() {
            if chunk_line.trim().is_empty() {
                continue;
            }
            line_chunks.push(MarkdownChunk {
                content: chunk_line.to_string(),
                metadata: ChunkMetadata {
                    headers: chunk.metadata.headers.clone(),
                    code: chunk.metadata.code.clone(),
                    loc: Some(ChunkLoc {
                        from: chunk_start_line + offset,
                        to: chunk_start_line + offset,
                    }),
                },
            });
        }
    }
    line_chunks
}
/// Converts `html` to Markdown and splits the result into chunks.
///
/// The HTML is passed to `crate::html_to_markdown` together with `md_opts`;
/// the Markdown it returns is then handed to `split_markdown` with
/// `split_opts`, whose result is returned unchanged.
pub fn html_to_markdown_chunks(
html: &str,
md_opts: HTMLToMarkdownOptions,
split_opts: &SplitterOptions,
) -> Vec<MarkdownChunk> {
let markdown = crate::html_to_markdown(html, md_opts);
split_markdown(&markdown, split_opts)
}