use crate::document::{Chunk, Document};
pub trait Chunker: Send + Sync {
fn chunk(&self, document: &Document) -> Vec<Chunk>;
}
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
chunk_size: usize,
chunk_overlap: usize,
}
impl FixedSizeChunker {
pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
Self { chunk_size, chunk_overlap }
}
}
impl Chunker for FixedSizeChunker {
fn chunk(&self, document: &Document) -> Vec<Chunk> {
if document.text.is_empty() {
return Vec::new();
}
let text = &document.text;
let mut chunks = Vec::new();
let mut start = 0;
let mut chunk_index = 0;
while start < text.len() {
let end = (start + self.chunk_size).min(text.len());
let chunk_text = &text[start..end];
let mut metadata = document.metadata.clone();
metadata.insert("chunk_index".to_string(), chunk_index.to_string());
chunks.push(Chunk {
id: format!("{}_{chunk_index}", document.id),
text: chunk_text.to_string(),
embedding: Vec::new(),
metadata,
document_id: document.id.clone(),
});
chunk_index += 1;
let step = self.chunk_size.saturating_sub(self.chunk_overlap);
if step == 0 {
break;
}
start += step;
}
chunks
}
}
#[derive(Debug, Clone)]
pub struct RecursiveChunker {
chunk_size: usize,
chunk_overlap: usize,
}
impl RecursiveChunker {
pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
Self { chunk_size, chunk_overlap }
}
}
fn split_and_merge(
text: &str,
chunk_size: usize,
chunk_overlap: usize,
separators: &[&str],
) -> Vec<String> {
if text.len() <= chunk_size || separators.is_empty() {
return split_by_size(text, chunk_size, chunk_overlap);
}
let separator = separators[0];
let remaining_separators = &separators[1..];
let segments: Vec<&str> = if separator == " " {
text.split(' ').collect()
} else {
split_keeping_separator(text, separator)
};
let mut chunks = Vec::new();
let mut current = String::new();
for segment in segments {
if current.is_empty() {
current = segment.to_string();
} else if current.len() + segment.len() <= chunk_size {
current.push_str(segment);
} else {
if current.len() > chunk_size {
chunks.extend(split_and_merge(
¤t,
chunk_size,
chunk_overlap,
remaining_separators,
));
} else {
chunks.push(current);
}
current = segment.to_string();
}
}
if !current.is_empty() {
if current.len() > chunk_size {
chunks.extend(split_and_merge(
¤t,
chunk_size,
chunk_overlap,
remaining_separators,
));
} else {
chunks.push(current);
}
}
chunks
}
fn split_keeping_separator<'a>(text: &'a str, separator: &str) -> Vec<&'a str> {
let mut result = Vec::new();
let mut start = 0;
while let Some(pos) = text[start..].find(separator) {
let end = start + pos + separator.len();
result.push(&text[start..end]);
start = end;
}
if start < text.len() {
result.push(&text[start..]);
}
result
}
fn split_by_size(text: &str, chunk_size: usize, chunk_overlap: usize) -> Vec<String> {
if text.is_empty() {
return Vec::new();
}
let mut chunks = Vec::new();
let mut start = 0;
while start < text.len() {
let end = (start + chunk_size).min(text.len());
chunks.push(text[start..end].to_string());
let step = chunk_size.saturating_sub(chunk_overlap);
if step == 0 {
break;
}
start += step;
}
chunks
}
impl Chunker for RecursiveChunker {
fn chunk(&self, document: &Document) -> Vec<Chunk> {
if document.text.is_empty() {
return Vec::new();
}
let separators = ["\n\n", ". ", "! ", "? ", " "];
let raw_chunks =
split_and_merge(&document.text, self.chunk_size, self.chunk_overlap, &separators);
raw_chunks
.into_iter()
.enumerate()
.map(|(i, text)| {
let mut metadata = document.metadata.clone();
metadata.insert("chunk_index".to_string(), i.to_string());
Chunk {
id: format!("{}_{i}", document.id),
text,
embedding: Vec::new(),
metadata,
document_id: document.id.clone(),
}
})
.collect()
}
}
#[derive(Debug, Clone)]
pub struct MarkdownChunker {
chunk_size: usize,
chunk_overlap: usize,
}
impl MarkdownChunker {
pub fn new(chunk_size: usize, chunk_overlap: usize) -> Self {
Self { chunk_size, chunk_overlap }
}
}
struct MarkdownSection {
header_path: String,
text: String,
}
fn parse_markdown_sections(text: &str) -> Vec<MarkdownSection> {
let mut sections = Vec::new();
let mut headers: Vec<String> = Vec::new();
let mut current_body = String::new();
let mut current_header_path = String::new();
for line in text.lines() {
let trimmed = line.trim_start();
if trimmed.starts_with('#') {
if !current_body.is_empty() || !current_header_path.is_empty() {
sections.push(MarkdownSection {
header_path: current_header_path.clone(),
text: current_body.trim().to_string(),
});
current_body = String::new();
}
let level = trimmed.chars().take_while(|c| *c == '#').count();
let header_text = trimmed[level..].trim().to_string();
headers.truncate(level.saturating_sub(1));
headers.push(header_text);
current_header_path = headers.join(" > ");
} else {
if !current_body.is_empty() {
current_body.push('\n');
}
current_body.push_str(line);
}
}
if !current_body.is_empty() || !current_header_path.is_empty() {
sections.push(MarkdownSection {
header_path: current_header_path,
text: current_body.trim().to_string(),
});
}
sections
}
impl Chunker for MarkdownChunker {
fn chunk(&self, document: &Document) -> Vec<Chunk> {
if document.text.is_empty() {
return Vec::new();
}
let sections = parse_markdown_sections(&document.text);
let mut chunks = Vec::new();
let mut chunk_index = 0;
for section in sections {
let section_text = if section.header_path.is_empty() {
section.text.clone()
} else if section.text.is_empty() {
section.header_path.clone()
} else {
format!("{}\n{}", section.header_path, section.text)
};
if section_text.is_empty() {
continue;
}
let sub_chunks = if section_text.len() > self.chunk_size {
let separators = ["\n\n", ". ", "! ", "? ", " "];
split_and_merge(§ion_text, self.chunk_size, self.chunk_overlap, &separators)
} else {
vec![section_text]
};
for text in sub_chunks {
let mut metadata = document.metadata.clone();
metadata.insert("chunk_index".to_string(), chunk_index.to_string());
metadata.insert("header_path".to_string(), section.header_path.clone());
chunks.push(Chunk {
id: format!("{}_{chunk_index}", document.id),
text,
embedding: Vec::new(),
metadata,
document_id: document.id.clone(),
});
chunk_index += 1;
}
}
chunks
}
}