use super::pack::TmpChunk;
use super::TokenCounter;
/// Prefixes every chunk after the first with a tail ("overlap") of the
/// preceding chunk's text, then recomputes that chunk's `tokens`.
///
/// For each `chunks[i]` with `i >= 1`, the tail is taken from
/// `chunks[i - 1].text` as it stands at that moment, i.e. after that chunk
/// has itself received its own overlap prefix. The candidate tail starts as
/// the last `max(overlap_tokens * 5, 8)` characters and is shortened from the
/// front until `tokenizer.count_tokens` reports at most `overlap_tokens`, or
/// until only a single character is left (a non-empty predecessor always
/// contributes at least one character).
///
/// # Parameters
/// - `chunks`: chunks to modify in place; `text` and `tokens` are updated.
/// - `overlap_tokens`: token budget for the prepended tail. `0` disables the
///   overlap and leaves `chunks` untouched.
/// - `tokenizer`: counter used both to size the tail and to recompute
///   `tokens` for the combined text.
///
/// A chunk whose predecessor has empty text is skipped entirely; its `tokens`
/// value is not recomputed.
pub fn add_overlap(chunks: &mut [TmpChunk], overlap_tokens: usize, tokenizer: &dyn TokenCounter) {
    // Heuristic used to size the first candidate tail: ~5 characters per
    // token, but never fewer than 8 characters.
    const CHARS_PER_TOKEN_ESTIMATE: usize = 5;
    const MIN_TAIL_CHARS: usize = 8;
    // Characters dropped per trimming step while the tail is still long.
    const COARSE_STEP_CHARS: usize = 5;
    // Once the tail is shorter than this many bytes, trim one char at a time.
    const FINE_STEP_BELOW_BYTES: usize = 32;

    if overlap_tokens == 0 {
        return;
    }

    let max_tail_chars = overlap_tokens
        .saturating_mul(CHARS_PER_TOKEN_ESTIMATE)
        .max(MIN_TAIL_CHARS);

    // Scratch buffer reused across chunks: byte offsets of the char
    // boundaries inside the candidate tail, in ascending order.
    let mut boundaries: Vec<usize> = Vec::new();

    for i in 1..chunks.len() {
        // Split so the predecessor can be read while the current chunk is
        // mutated, without copying the tail into a temporary `String`.
        let (done, pending) = chunks.split_at_mut(i);
        let (Some(prev), Some(current)) = (done.last(), pending.first_mut()) else {
            continue;
        };
        let prev_text = prev.text.as_str();
        if prev_text.is_empty() {
            continue;
        }

        // Only the last `max_tail_chars` boundaries are ever needed, so walk
        // the text from the back instead of indexing all of it.
        boundaries.clear();
        boundaries.extend(
            prev_text
                .char_indices()
                .rev()
                .take(max_tail_chars)
                .map(|(byte_idx, _)| byte_idx),
        );
        boundaries.reverse();

        let mut pos = 0usize;
        let mut step = COARSE_STEP_CHARS;
        let mut overlap = &prev_text[boundaries[pos]..];
        while tokenizer.count_tokens(overlap) > overlap_tokens {
            let remaining = boundaries.len() - pos;
            if remaining <= 1 {
                // Never trim the overlap down to nothing.
                break;
            }
            if step >= remaining {
                // The coarse step would overshoot the end of the text; fall
                // back to single-character trimming instead of giving up with
                // an overlap that is still over budget.
                step = 1;
            }
            pos += step;
            overlap = &prev_text[boundaries[pos]..];
            if overlap.len() < FINE_STEP_BELOW_BYTES {
                step = 1;
            }
        }

        current.text.insert_str(0, overlap);
        current.tokens = tokenizer.count_tokens(&current.text);
    }
}