/// A chunk of text together with the range of source lines it was built from.
///
/// Values of this type are produced by `TextProcessor::chunk_text`.
#[derive(Debug, Clone)]
pub struct TextChunkWithLines {
/// The chunk's text: the source lines it contains, joined with `'\n'`.
pub content: String,
/// 1-based number of the first source line covered by this chunk.
pub start_line: usize,
/// 1-based number of the last source line covered by this chunk (inclusive).
pub end_line: usize,
}
/// Stateless namespace for text-chunking helpers.
pub struct TextProcessor;

impl TextProcessor {
    /// Splits `content` into line-aligned chunks and records, for each chunk,
    /// the 1-based inclusive range of source lines it covers.
    ///
    /// # Parameters
    ///
    /// * `content` - text to split; lines are obtained with [`str::lines`].
    /// * `chunk_size` - used as two limits at once: the maximum number of
    ///   lines per chunk, and the byte budget for the chunk's text (each
    ///   line's `len()` plus one byte per `'\n'` separator). A chunk always
    ///   accepts its first non-empty line, even when that line alone exceeds
    ///   the budget. A value of `0` is treated as `1`.
    /// * `overlap` - number of trailing lines of a chunk that are repeated at
    ///   the start of the next one. The overlap is ignored for a given step
    ///   whenever applying it would not move the start position forward.
    ///
    /// # Returns
    ///
    /// The chunks in source order. Empty input yields an empty vector.
    ///
    /// Blank lines at the very start of a chunk are not written into
    /// `content` (they are still counted in the line range), and a chunk that
    /// would contain only blank lines is not emitted.
    pub fn chunk_text(content: &str, chunk_size: usize, overlap: usize) -> Vec<TextChunkWithLines> {
        let lines: Vec<&str> = content.lines().collect();
        let mut chunks = Vec::new();

        // With a zero budget the window below would be empty and the start
        // index would never advance, so the loop would spin forever.
        let chunk_size = chunk_size.max(1);

        let mut start_idx = 0;
        while start_idx < lines.len() {
            // Upper bound on this chunk by line count; the byte budget may
            // shrink it further inside the loop.
            let window_end = start_idx.saturating_add(chunk_size).min(lines.len());
            let mut end_idx = window_end;
            let mut current_content = String::new();
            let mut byte_count = 0;

            for (offset, line) in lines[start_idx..window_end].iter().enumerate() {
                // Stop before a line that would blow the byte budget, but
                // only once the chunk already holds some text.
                if !current_content.is_empty() && byte_count + line.len() + 1 > chunk_size {
                    end_idx = start_idx + offset;
                    break;
                }
                if !current_content.is_empty() {
                    current_content.push('\n');
                    byte_count += 1;
                }
                current_content.push_str(line);
                byte_count += line.len();
            }

            if !current_content.is_empty() {
                chunks.push(TextChunkWithLines {
                    content: current_content,
                    // Derive line numbers from the indices themselves so they
                    // stay correct when the next chunk steps back by `overlap`.
                    start_line: start_idx + 1,
                    end_line: end_idx,
                });
            }

            if end_idx >= lines.len() {
                break;
            }

            // Step back by `overlap` lines, unless that would fail to advance
            // past the current start (or underflow); then continue at
            // `end_idx` without overlap.
            start_idx = end_idx
                .checked_sub(overlap)
                .filter(|&candidate| candidate > start_idx)
                .unwrap_or(end_idx);
        }

        chunks
    }
}