use cqs::{Chunk, Embedder};
const WINDOW_OVERHEAD: usize = 32;
pub(crate) fn max_tokens_per_window(model_max_seq: usize) -> usize {
if model_max_seq == 0 {
480
} else {
model_max_seq.saturating_sub(WINDOW_OVERHEAD).max(128)
}
}
pub(crate) fn window_overlap_tokens(max_tokens: usize) -> usize {
if max_tokens < 4 {
return 0;
}
let overlap = 64.max(max_tokens / 8);
let ceiling = max_tokens / 2 - 1;
overlap.min(ceiling)
}
pub(crate) fn apply_windowing(chunks: Vec<Chunk>, embedder: &Embedder) -> Vec<Chunk> {
let _span = tracing::info_span!("apply_windowing", chunk_count = chunks.len()).entered();
let mut result = Vec::with_capacity(chunks.len());
for chunk in chunks {
let max_tokens = max_tokens_per_window(embedder.model_config().max_seq_length);
let overlap = window_overlap_tokens(max_tokens);
match embedder.split_into_windows(&chunk.content, max_tokens, overlap) {
Ok(windows) if windows.len() == 1 => {
result.push(chunk);
}
Ok(windows) => {
let parent_id = chunk.id.clone();
for (window_content, window_idx) in windows {
let window_hash = blake3::hash(window_content.as_bytes()).to_hex().to_string();
result.push(Chunk {
id: format!("{}:w{}", parent_id, window_idx),
file: chunk.file.clone(),
language: chunk.language,
chunk_type: chunk.chunk_type,
name: chunk.name.clone(),
signature: chunk.signature.clone(),
content: window_content,
doc: if window_idx == 0 {
chunk.doc.clone()
} else {
None
}, line_start: chunk.line_start,
line_end: chunk.line_end,
content_hash: window_hash,
parent_id: Some(parent_id.clone()),
window_idx: Some(window_idx),
parent_type_name: chunk.parent_type_name.clone(),
});
}
}
Err(e) => {
tracing::warn!(chunk_id = %chunk.id, error = %e, "Windowing failed, passing through");
result.push(chunk);
}
}
}
result
}