use std::rc::Rc;
/// One piece of a Markdown document, as produced by `chunk_markdown`.
#[derive(Debug, Clone)]
pub struct Chunk {
/// Zero-based position of this chunk in the vector returned by
/// `chunk_markdown` (renumbered after empty chunks are dropped).
pub index: usize,
/// Chunk text with leading and trailing whitespace trimmed.
pub content: String,
/// The full heading line (including its `#` markers) of the section this
/// chunk was cut from, or `None` for text that precedes the first heading.
/// Chunks cut from the same section share one reference-counted string.
pub heading: Option<Rc<str>>,
}
pub fn chunk_markdown(text: &str, max_tokens: usize) -> Vec<Chunk> {
if text.trim().is_empty() {
return Vec::new();
}
let max_chars = max_tokens * 4;
let sections = split_on_headings(text);
let mut chunks = Vec::with_capacity(sections.len());
for (heading, body) in sections {
let heading: Option<Rc<str>> = heading.map(Rc::from);
let heading_prefix = heading.as_deref().map(|h| {
let mut prefix = String::with_capacity(h.len() + 1);
prefix.push_str(h);
prefix.push('\n');
prefix
});
let full_len = body.len() + heading_prefix.as_ref().map_or(0, String::len);
if full_len <= max_chars {
let content = if let Some(prefix) = heading_prefix.as_deref() {
let mut full = String::with_capacity(full_len);
full.push_str(prefix);
full.push_str(&body);
full.trim().to_string()
} else {
body.trim().to_string()
};
chunks.push(Chunk {
index: chunks.len(),
content,
heading: heading.clone(),
});
} else {
let paragraphs = split_on_blank_lines(&body);
let mut current = heading_prefix.clone().unwrap_or_default();
for para in paragraphs {
if current.len() + para.len() > max_chars && !current.trim().is_empty() {
chunks.push(Chunk {
index: chunks.len(),
content: current.trim().to_string(),
heading: heading.clone(),
});
reset_chunk_buffer(&mut current, heading_prefix.as_deref());
}
if para.len() > max_chars {
if !current.trim().is_empty() {
chunks.push(Chunk {
index: chunks.len(),
content: current.trim().to_string(),
heading: heading.clone(),
});
reset_chunk_buffer(&mut current, heading_prefix.as_deref());
}
for line_chunk in split_on_lines(¶, max_chars) {
chunks.push(Chunk {
index: chunks.len(),
content: line_chunk.trim().to_string(),
heading: heading.clone(),
});
}
} else {
current.push_str(¶);
current.push('\n');
}
}
if !current.trim().is_empty() {
chunks.push(Chunk {
index: chunks.len(),
content: current.trim().to_string(),
heading: heading.clone(),
});
}
}
}
chunks.retain(|c| !c.content.is_empty());
for (i, chunk) in chunks.iter_mut().enumerate() {
chunk.index = i;
}
chunks
}
/// Empties `current` and re-seeds it with `heading_prefix` when one is given.
///
/// The buffer's allocation is kept, so it can be reused for the next chunk.
fn reset_chunk_buffer(current: &mut String, heading_prefix: Option<&str>) {
    current.clear();
    current.push_str(heading_prefix.unwrap_or(""));
}
/// Reports whether `line` starts with one to six `#` characters followed
/// immediately by a single space.
///
/// No leading indentation is accepted, and a `#` run followed by anything
/// other than a space (including end of line or a tab) is not a heading.
fn is_atx_heading(line: &str) -> bool {
    let level = line.bytes().take_while(|&b| b == b'#').count();
    (1..=6).contains(&level) && line.as_bytes().get(level) == Some(&b' ')
}
/// Recognises a code-fence run at the start of `line`.
///
/// Returns the fence byte (`` ` `` or `~`), the length of the run, and the
/// text that follows the run, provided the line is indented by at most three
/// spaces and the run is at least three characters long.
fn fence_run(line: &str) -> Option<(u8, usize, &str)> {
    let stripped = line.trim_start_matches(' ');
    if line.len() - stripped.len() > 3 {
        return None;
    }
    let marker = *stripped.as_bytes().first()?;
    if marker != b'`' && marker != b'~' {
        return None;
    }
    let run = stripped.bytes().take_while(|&b| b == marker).count();
    if run >= 3 {
        // The run consists of ASCII bytes, so `run` is a char boundary.
        Some((marker, run, &stripped[run..]))
    } else {
        None
    }
}

/// Splits `text` into `(heading, body)` sections at ATX heading lines.
///
/// * `heading` is the complete heading line, or `None` for content that
///   appears before the first heading.
/// * `body` is every following non-heading line, each terminated by `'\n'`.
///
/// A leading section without a heading is dropped when its body is
/// whitespace-only; a heading with an empty body is kept.
///
/// Lines inside fenced code blocks (```` ``` ```` or `~~~`) are never treated
/// as headings, so `# comment` lines in shell or Python snippets stay part of
/// the surrounding section. An unclosed fence extends to the end of `text`.
fn split_on_headings(text: &str) -> Vec<(Option<String>, String)> {
    let mut sections = Vec::new();
    let mut current_heading: Option<String> = None;
    let mut current_body = String::new();
    // Fence byte and run length of the currently open code fence, if any.
    let mut open_fence: Option<(u8, usize)> = None;

    for line in text.lines() {
        let inside_fence = open_fence.is_some();

        if let Some((marker, run, rest)) = fence_run(line) {
            match open_fence {
                // A backtick fence may not have backticks in its info string;
                // this keeps inline code such as ```x``` from opening a fence.
                None => {
                    if marker != b'`' || !rest.contains('`') {
                        open_fence = Some((marker, run));
                    }
                }
                // A closing fence uses the same character, is at least as
                // long as the opener, and carries no trailing text.
                Some((open_marker, open_run)) => {
                    if marker == open_marker && run >= open_run && rest.trim().is_empty() {
                        open_fence = None;
                    }
                }
            }
        }

        if !inside_fence && is_atx_heading(line) {
            if !current_body.trim().is_empty() || current_heading.is_some() {
                sections.push((current_heading.take(), std::mem::take(&mut current_body)));
            }
            current_heading = Some(line.to_string());
        } else {
            current_body.push_str(line);
            current_body.push('\n');
        }
    }

    if !current_body.trim().is_empty() || current_heading.is_some() {
        sections.push((current_heading, current_body));
    }
    sections
}
/// Groups the lines of `text` into paragraphs separated by blank lines.
///
/// A line counts as blank when it is empty or whitespace-only. Each returned
/// paragraph holds its lines in order, every one terminated by `'\n'`; blank
/// lines themselves are discarded, so no paragraph is ever empty.
fn split_on_blank_lines(text: &str) -> Vec<String> {
    let mut paragraphs: Vec<String> = Vec::new();
    // True while the last element of `paragraphs` is still being extended.
    let mut in_paragraph = false;

    for line in text.lines() {
        if line.trim().is_empty() {
            in_paragraph = false;
            continue;
        }
        if !in_paragraph {
            paragraphs.push(String::new());
            in_paragraph = true;
        }
        if let Some(open) = paragraphs.last_mut() {
            open.push_str(line);
            open.push('\n');
        }
    }
    paragraphs
}
/// Packs the lines of `text` into pieces of about `max_chars` bytes each.
///
/// A `max_chars` of zero is treated as one. Lines are accumulated (each
/// followed by `'\n'`) until adding the next line plus its newline would pass
/// the limit, at which point the accumulated piece is emitted. A line that is
/// longer than the limit on its own is handed to `split_within_line`, after
/// first emitting whatever was accumulated before it.
fn split_on_lines(text: &str, max_chars: usize) -> Vec<String> {
    let limit = max_chars.max(1);
    let mut pieces = Vec::with_capacity(text.len() / limit + 1);
    let mut pending = String::new();

    for line in text.lines() {
        let oversized = line.len() > limit;
        let would_overflow = pending.len() + line.len() + 1 > limit;

        // Close the pending piece before a line that cannot join it.
        if !pending.is_empty() && (oversized || would_overflow) {
            pieces.push(std::mem::take(&mut pending));
        }

        if oversized {
            pieces.extend(split_within_line(line, limit));
        } else {
            pending.push_str(line);
            pending.push('\n');
        }
    }

    if !pending.is_empty() {
        pieces.push(pending);
    }
    pieces
}
/// Cuts a single `line` into pieces of at most `max_chars` bytes, each
/// returned with a trailing `'\n'` appended.
///
/// Cuts never fall inside a UTF-8 character: the cut point is moved back to
/// the nearest character boundary, or forward past one whole character when
/// `max_chars` is smaller than that character. Within the allowed window the
/// cut is placed just after the last space if there is one (the space stays
/// at the end of the piece); otherwise the window is cut at its full width.
/// A single space directly after the cut is skipped.
fn split_within_line(line: &str, max_chars: usize) -> Vec<String> {
    let raw = line.as_bytes();
    let total = line.len();
    let mut pieces = Vec::new();
    let mut cursor = 0;

    while cursor < total {
        // Whatever is left fits: emit it and finish.
        if total - cursor <= max_chars {
            let mut tail = String::from(&line[cursor..]);
            tail.push('\n');
            pieces.push(tail);
            break;
        }

        // Furthest character boundary no more than `max_chars` bytes ahead.
        let mut hard_stop = (cursor..=cursor + max_chars)
            .rev()
            .find(|&i| line.is_char_boundary(i))
            .unwrap_or(cursor);
        if hard_stop == cursor {
            // The window is narrower than the next character; take all of it.
            hard_stop = (cursor + 1..=total)
                .find(|&i| line.is_char_boundary(i))
                .unwrap_or(total);
        }

        // Prefer cutting right after the last space inside the window.
        let cut = raw[cursor..hard_stop]
            .iter()
            .rposition(|&b| b == b' ')
            .map_or(hard_stop, |offset| cursor + offset + 1);

        let mut piece = String::from(&line[cursor..cut]);
        piece.push('\n');
        pieces.push(piece);

        // Skip one separating space so the next piece does not start with it.
        cursor = if raw.get(cut) == Some(&b' ') { cut + 1 } else { cut };
    }
    pieces
}
// Unit tests live in the separate file named by `#[path]` below and are
// compiled only for test builds.
#[cfg(test)]
#[path = "semantic_tests.rs"]
mod tests;