use crate::token_cleaner::count_tokens;
/// Tuning knobs for splitting text into token-bounded chunks.
///
/// Both values are measured in tokens as reported by `count_tokens`.
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// Token budget for a chunk; text whose token count does not exceed this
    /// is returned as a single chunk.
    pub chunk_size: usize,
    /// Token budget for the trailing lines of a chunk that are repeated at
    /// the start of the following chunk.
    pub overlap_size: usize,
}
impl Default for ChunkerConfig {
fn default() -> Self {
Self {
chunk_size: 800,
overlap_size: 100,
}
}
}
/// One piece of chunked text together with its position metadata.
#[derive(Debug, Clone)]
pub struct TextChunk {
/// The text carried by this chunk.
pub content: String,
/// Where the chunk begins in the source text. `chunk_text` stores a line
/// index here when it splits the input, and 0 when the whole input is
/// returned as a single chunk.
pub start_index: usize,
/// Where the chunk ends in the source text. `chunk_text` stores an
/// exclusive line index here when it splits the input, but the byte length
/// of the text (`text.len()`) when the whole input is returned as a single
/// chunk.
/// NOTE(review): the two cases use different units — confirm which one
/// callers expect.
pub end_index: usize,
/// Zero-based position of this chunk in the sequence returned by the chunker.
pub chunk_index: usize,
}
/// Splits `text` into line-aligned chunks bounded by `config.chunk_size` tokens.
///
/// Behavior:
/// - Whitespace-only input yields an empty vector.
/// - Input whose total token count fits in `config.chunk_size` is returned
///   unchanged as a single chunk.
/// - Otherwise the text is walked line by line (every line is re-terminated
///   with `\n`). Lines are accumulated until the next one would push the
///   chunk over `config.chunk_size`; the chunk is then emitted and the next
///   chunk is seeded with the trailing lines of the previous one, up to
///   `config.overlap_size` tokens.
/// - A single line that is larger than `config.chunk_size` is handed to
///   `split_large_line`; its pieces become chunks of their own and no overlap
///   is carried across it.
///
/// In the split path `start_index` / `end_index` are line indices
/// (`end_index` exclusive) and `start_index` includes the overlap lines the
/// chunk begins with. The overlap seed is shortened from the front when the
/// seed plus the incoming line would not fit into `config.chunk_size`.
///
/// NOTE(review): the single-chunk fast path reports `0..text.len()` (bytes)
/// while the split path reports line indices. This is kept as-is for backward
/// compatibility; confirm which unit callers rely on.
pub fn chunk_text(text: &str, config: &ChunkerConfig) -> Vec<TextChunk> {
    /// Renders `lines` back into text, terminating every line with `\n`.
    fn join_lines(lines: &[&str]) -> String {
        let mut joined = String::with_capacity(lines.iter().map(|l| l.len() + 1).sum());
        for line in lines {
            joined.push_str(line);
            joined.push('\n');
        }
        joined
    }

    if text.trim().is_empty() {
        return Vec::new();
    }

    // Fast path: everything fits into one chunk, return the text untouched.
    if count_tokens(text) <= config.chunk_size {
        return vec![TextChunk {
            content: text.to_string(),
            start_index: 0,
            end_index: text.len(),
            chunk_index: 0,
        }];
    }

    let lines: Vec<&str> = text.lines().collect();
    // Token count of every line seen so far (including its `\n`), so the
    // overlap trimming below does not have to re-tokenize lines.
    let mut line_token_counts: Vec<usize> = Vec::with_capacity(lines.len());
    let mut chunks: Vec<TextChunk> = Vec::new();

    let mut current_chunk = String::new();
    let mut current_tokens = 0usize;
    // Index of the first line contained in `current_chunk`.
    let mut start_line = 0usize;

    // The overlap window is always `lines[overlap_start..=last processed line]`;
    // `overlap_buffer` is that window rendered as text.
    let mut overlap_start = 0usize;
    let mut overlap_buffer = String::new();

    for (line_idx, line) in lines.iter().enumerate() {
        let line_with_newline = format!("{}\n", line);
        let line_tokens = count_tokens(&line_with_newline);
        line_token_counts.push(line_tokens);

        // A line that cannot fit into any chunk is split on its own.
        if line_tokens > config.chunk_size {
            if !current_chunk.is_empty() {
                let chunk_index = chunks.len();
                chunks.push(TextChunk {
                    content: std::mem::take(&mut current_chunk),
                    start_index: start_line,
                    end_index: line_idx,
                    chunk_index,
                });
            }
            for piece in split_large_line(line, config) {
                let chunk_index = chunks.len();
                chunks.push(TextChunk {
                    content: piece,
                    start_index: line_idx,
                    end_index: line_idx + 1,
                    chunk_index,
                });
            }
            // Start from a clean slate after the oversized line.
            current_chunk.clear();
            current_tokens = 0;
            start_line = line_idx + 1;
            overlap_start = line_idx + 1;
            overlap_buffer.clear();
            continue;
        }

        if current_tokens + line_tokens > config.chunk_size && !current_chunk.is_empty() {
            // Seed the next chunk with the overlap window, dropping leading
            // overlap lines until the seed and the incoming line fit together.
            let mut seed_start = overlap_start;
            let mut seed = overlap_buffer.clone();
            let mut seed_tokens = count_tokens(&seed);
            while seed_start < line_idx && seed_tokens + line_tokens > config.chunk_size {
                seed_start += 1;
                seed = join_lines(&lines[seed_start..line_idx]);
                seed_tokens = count_tokens(&seed);
            }

            let chunk_index = chunks.len();
            chunks.push(TextChunk {
                content: std::mem::replace(&mut current_chunk, seed),
                start_index: start_line,
                end_index: line_idx,
                chunk_index,
            });
            current_tokens = seed_tokens;
            // The new chunk's content starts at the first retained overlap line.
            start_line = seed_start;
        }

        current_chunk.push_str(&line_with_newline);
        current_tokens += line_tokens;

        // Slide the overlap window: append the line, then keep only as many
        // trailing lines as fit into the overlap budget.
        overlap_buffer.push_str(&line_with_newline);
        if count_tokens(&overlap_buffer) > config.overlap_size {
            let mut kept_tokens = 0usize;
            let mut new_start = line_idx + 1;
            while new_start > overlap_start {
                let candidate = line_token_counts[new_start - 1];
                if kept_tokens + candidate > config.overlap_size {
                    break;
                }
                kept_tokens += candidate;
                new_start -= 1;
            }
            overlap_start = new_start;
            overlap_buffer = join_lines(&lines[overlap_start..line_idx + 1]);
        }
    }

    if !current_chunk.is_empty() {
        let chunk_index = chunks.len();
        chunks.push(TextChunk {
            content: current_chunk,
            start_index: start_line,
            end_index: lines.len(),
            chunk_index,
        });
    }

    chunks
}
/// Splits a single oversized `line` into consecutive pieces that each aim to
/// stay within `config.chunk_size` tokens.
///
/// The initial window is `chunk_size * 4` characters (at least one). When a
/// window is still over the token budget it is halved repeatedly until it
/// fits or is down to a single character, so the loop always advances and
/// never slices past the end of the line. Pieces are cut on `char`
/// boundaries, and concatenating the returned pieces reproduces `line`.
///
/// Returns an empty vector for an empty `line`.
fn split_large_line(line: &str, config: &ChunkerConfig) -> Vec<String> {
    let chars: Vec<char> = line.chars().collect();
    // `.max(1)` guarantees progress even when `chunk_size` is 0.
    let max_window = config.chunk_size.saturating_mul(4).max(1);

    let mut pieces = Vec::new();
    let mut start = 0;
    while start < chars.len() {
        let mut len = max_window.min(chars.len() - start);
        let mut piece: String = chars[start..start + len].iter().collect();

        // Shrink the window while it is over budget; a single character is
        // emitted as-is because it cannot be split any further.
        while len > 1 && count_tokens(&piece) > config.chunk_size {
            len /= 2;
            piece = chars[start..start + len].iter().collect();
        }

        pieces.push(piece);
        start += len;
    }
    pieces
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty input produces no chunks.
    #[test]
    fn test_empty_text() {
        let chunks = chunk_text("", &ChunkerConfig::default());
        assert!(chunks.is_empty());
    }

    /// Text that fits the default budget comes back as one unchanged chunk.
    #[test]
    fn test_small_text() {
        let text = "Hello, world!";
        let chunks = chunk_text(text, &ChunkerConfig::default());
        assert_eq!(chunks.len(), 1);
        let only = &chunks[0];
        assert_eq!(only.content, text);
    }

    /// A long multi-line input is split into several sequentially numbered chunks.
    #[test]
    fn test_chunking_with_overlap() {
        let config = ChunkerConfig {
            chunk_size: 50,
            overlap_size: 10,
        };

        let mut text = String::new();
        for i in 0..100 {
            if i > 0 {
                text.push('\n');
            }
            text.push_str(&format!("Line {}", i));
        }

        let chunks = chunk_text(&text, &config);
        assert!(chunks.len() > 1);

        let reported: Vec<usize> = chunks.iter().map(|chunk| chunk.chunk_index).collect();
        let expected: Vec<usize> = (0..chunks.len()).collect();
        assert_eq!(reported, expected);
    }
}