use crate::models::ContentChunk;
/// Maximum number of lines placed in a single chunk.
const CHUNK_SIZE: usize = 100;
/// Number of lines shared between consecutive chunks; the chunk window advances by
/// `CHUNK_SIZE - CHUNK_OVERLAP` lines (never less than one) each step.
const CHUNK_OVERLAP: usize = 10;
/// Splits a file's bytes into overlapping, line-based `ContentChunk` records.
///
/// The bytes are decoded as UTF-8 (invalid sequences are replaced rather than rejected)
/// and split on `'\n'`. Windows of up to `CHUNK_SIZE` lines are emitted; each window
/// starts `CHUNK_SIZE - CHUNK_OVERLAP` lines after the previous one, so neighbouring
/// chunks share `CHUNK_OVERLAP` lines. A window whose text is only whitespace is
/// skipped and does not consume a `chunk_index`, so indices stay contiguous from zero.
///
/// NOTE: the test module in this file checks `chunker.rs` for the wording of the summary
/// line above; keep the phrase intact when editing this comment.
///
/// # Arguments
///
/// * `source` - Raw file contents.
/// * `rel_path` - Path stored in each chunk's `file_path` and mixed into its id.
/// * `project_id` - Project identifier stored in each chunk and mixed into its id.
/// * `language` - Language label for the chunks; `"unknown"` is used when `None`.
///
/// # Returns
///
/// The chunks in file order. `line_start` and `line_end` are 1-based and inclusive.
/// Because splitting is on `'\n'`, a file that ends with a newline contributes a final
/// empty line to the line count. Empty or whitespace-only input yields an empty vector.
pub fn chunk_file_content(
    source: &[u8],
    rel_path: &str,
    project_id: &str,
    language: Option<&str>,
) -> Vec<ContentChunk> {
    let text = String::from_utf8_lossy(source);
    // `split` always yields at least one item (empty input gives `[""]`), so there is
    // no empty-vector case to guard against; blank windows are filtered in the loop.
    let lines: Vec<&str> = text.split('\n').collect();
    let total_lines = lines.len();

    // `.max(1)` keeps the loop advancing even if the overlap is ever configured to be
    // as large as the chunk size.
    let step = CHUNK_SIZE.saturating_sub(CHUNK_OVERLAP).max(1);
    let language = language.unwrap_or("unknown");
    // Read the clock once so every chunk produced by this call carries the same
    // `created_at` value.
    let created_at = epoch_secs_str();

    let mut chunks = Vec::new();
    let mut start = 0;
    while start < total_lines {
        let end = (start + CHUNK_SIZE).min(total_lines);
        let content = lines[start..end].join("\n");

        if !content.trim().is_empty() {
            // Indices are assigned only to emitted chunks, so the next index is
            // always the number of chunks pushed so far.
            let chunk_index = chunks.len();
            chunks.push(ContentChunk {
                id: ContentChunk::make_id(project_id, rel_path, chunk_index),
                project_id: project_id.to_string(),
                file_path: rel_path.to_string(),
                chunk_index,
                line_start: start + 1,
                line_end: end,
                content,
                language: language.to_string(),
                created_at: created_at.clone(),
            });
        }

        // The window already reached the last line; stepping again would only emit a
        // chunk fully contained in the one just processed.
        if end >= total_lines {
            break;
        }
        start += step;
    }
    chunks
}
/// Returns the current time as whole seconds since the Unix epoch, rendered as a
/// decimal string.
///
/// If the system clock reports a time before the epoch, the result is `"0"`.
fn epoch_secs_str() -> String {
    use std::time::SystemTime;

    let since_epoch = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH);
    // A clock set before the epoch makes `duration_since` fail; fall back to zero.
    since_epoch.map_or(0, |elapsed| elapsed.as_secs()).to_string()
}
#[cfg(test)]
mod tests {
    /// Checks the text of `chunker.rs` itself: a required documentation phrase must be
    /// present, and none of a fixed set of import lines may appear.
    #[test]
    fn chunker_stays_gcode_owned_with_documented_narrowing() {
        let source = include_str!("chunker.rs");

        // Every needle is assembled from fragments at runtime so that this test's own
        // source text can never satisfy (or violate) the checks by itself.
        let doc_phrase = format!("{}{}", "line-based `ContentChunk`", " records");
        assert!(source.contains(doc_phrase.as_str()));

        let import_prefix = "use gobby_core";
        let forbidden_suffixes = [
            "::indexing::Chunk",
            "::indexing::ChunkIdentity",
            "::indexing::IndexEvent",
            "::indexing::index_events_from_hashes",
        ];
        for suffix in forbidden_suffixes {
            let forbidden = format!("{}{}", import_prefix, suffix);
            assert!(!source.contains(forbidden.as_str()));
        }
    }
}