mod treesitter;
use crate::parsing::languages::language_for_extension;
/// Size target handed to both chunkers by `chunk_file`; `chunk_by_lines`
/// compares it against a running byte count.
const DEFAULT_CHUNK_SIZE: usize = 1500;
/// Upper bound, in bytes, on how far `add_overlap` may move a chunk's start
/// back into the preceding chunk.
const OVERLAP_BYTES: usize = 200;
/// One contiguous slice of a source file, as produced by `chunk_file`.
#[derive(Debug, Clone)]
pub struct Chunk {
/// The text of the chunk: `source[start_byte..end_byte]`.
pub content: String,
/// The `file_path` argument given to `chunk_file`, copied verbatim.
pub file_path: String,
/// 1-based number of the line on which `start_byte` falls.
pub start_line: usize,
/// 1-based line number at which the chunk ends, as computed by `chunk_file`.
pub end_line: usize,
/// Byte offset into the source where the chunk begins (inclusive).
pub start_byte: usize,
/// Byte offset into the source where the chunk ends (exclusive).
pub end_byte: usize,
/// Language name looked up from the file extension; `None` when no
/// extension was supplied or the lookup returned nothing.
pub language: Option<String>,
}
/// Splits `source` into overlapping [`Chunk`]s.
///
/// When `ext` maps to a known language the boundaries come from
/// `treesitter::chunk_with_treesitter`; otherwise the line-based fallback
/// `chunk_by_lines` is used. Every chunk after the first is then extended
/// backwards by up to `OVERLAP_BYTES` via `add_overlap`.
///
/// * `source` - full text of the file; an empty string yields no chunks.
/// * `file_path` - copied into every returned chunk.
/// * `ext` - file extension used for the language lookup, if any.
///
/// Line numbers are 1-based. `start_line` is the line holding the chunk's
/// first byte and `end_line` the line holding its last byte, so a chunk that
/// ends with a newline reports the line that newline terminates rather than
/// the following one.
///
/// Slicing assumes the chunkers return in-bounds offsets that fall on UTF-8
/// character boundaries; anything else would panic.
pub fn chunk_file(source: &str, file_path: &str, ext: Option<&str>) -> Vec<Chunk> {
    if source.is_empty() {
        return Vec::new();
    }

    let language = ext
        .and_then(|e| crate::parsing::languages::language_name_for_extension(e).map(String::from));

    let boundaries = match ext.and_then(language_for_extension) {
        Some(lang) => treesitter::chunk_with_treesitter(source, &lang, DEFAULT_CHUNK_SIZE),
        None => chunk_by_lines(source, DEFAULT_CHUNK_SIZE),
    };

    // Index every newline once so each line lookup is a binary search instead
    // of a rescan of the file from byte 0 for every chunk.
    let newline_offsets: Vec<usize> = source.match_indices('\n').map(|(at, _)| at).collect();
    // 1-based line of `byte`: newlines strictly before it, plus one.
    let line_of = |byte: usize| newline_offsets.partition_point(|&nl| nl < byte) + 1;

    add_overlap(&boundaries, source, OVERLAP_BYTES)
        .into_iter()
        .map(|(start_byte, end_byte)| {
            // `end_byte` is exclusive; the chunk's last byte sits one before
            // it. An empty chunk falls back to its start position.
            let last_byte = end_byte.saturating_sub(1).max(start_byte);
            Chunk {
                content: source[start_byte..end_byte].to_string(),
                file_path: file_path.to_string(),
                start_line: line_of(start_byte),
                end_line: line_of(last_byte),
                start_byte,
                end_byte,
                language: language.clone(),
            }
        })
        .collect()
}
/// Extends every chunk after the first backwards so that it overlaps the
/// tail of its predecessor.
///
/// * `boundaries` - `(start, end)` byte ranges into `source`, in chunk order.
/// * `source` - the text the ranges index into.
/// * `overlap` - maximum number of bytes a start may be moved back.
///
/// Returns a vector of the same length. The first range, and every `end`, is
/// left untouched. A moved start never goes before the previous chunk's start
/// and is snapped forward to the beginning of a line. The input is returned
/// unchanged when there are fewer than two ranges or `overlap` is zero.
///
/// A chunk's start is never moved *forward*: if the overlap window contains
/// no newline, the chunk keeps its original start so none of its content is
/// dropped.
fn add_overlap(boundaries: &[(usize, usize)], source: &str, overlap: usize) -> Vec<(usize, usize)> {
    if boundaries.len() <= 1 || overlap == 0 {
        return boundaries.to_vec();
    }

    let mut result = Vec::with_capacity(boundaries.len());
    result.push(boundaries[0]);
    for pair in boundaries.windows(2) {
        let (prev_start, _) = pair[0];
        let (orig_start, end) = pair[1];

        // Earliest byte this chunk may reach back to.
        let earliest = orig_start.saturating_sub(overlap).max(prev_start);
        let snapped = snap_to_newline(source, earliest);

        // `snap_to_newline` only moves forward, so with no newline in the
        // window it can land past `orig_start`; clamping keeps the chunk's
        // own content intact. The `end` clamp guards malformed ranges.
        let start = snapped.min(orig_start).min(end);
        result.push((start, end));
    }
    result
}

/// Returns the byte offset just past the first `'\n'` at or after `pos`.
///
/// `pos` is first moved forward to a character boundary. If no newline
/// follows, that boundary-adjusted position is returned instead.
fn snap_to_newline(source: &str, pos: usize) -> usize {
    let from = snap_to_char_boundary(source, pos);
    match source[from..].find('\n') {
        Some(rel) => from + rel + 1,
        None => from,
    }
}

/// Returns the first UTF-8 character boundary at or after `pos`, capped at
/// `source.len()`.
fn snap_to_char_boundary(source: &str, pos: usize) -> usize {
    let len = source.len();
    // `len` itself is always a boundary, so the search cannot come up empty.
    (pos.min(len)..=len)
        .find(|&p| source.is_char_boundary(p))
        .unwrap_or(len)
}
/// Line-based fallback chunker.
///
/// Cuts `source` immediately after a `'\n'` whenever the bytes gathered since
/// the previous cut (newline included) number at least `target_size`. Any
/// text left after the last cut becomes a final chunk.
///
/// Returns contiguous `(start, end)` byte ranges, `end` exclusive. An empty
/// `source` yields an empty vector.
fn chunk_by_lines(source: &str, target_size: usize) -> Vec<(usize, usize)> {
    let mut spans = Vec::new();
    let mut chunk_start = 0;

    for (newline_at, _) in source.match_indices('\n') {
        let chunk_end = newline_at + 1;
        // Only a newline may close a chunk, so lines are never split.
        if chunk_end - chunk_start >= target_size {
            spans.push((chunk_start, chunk_end));
            chunk_start = chunk_end;
        }
    }

    if chunk_start < source.len() {
        spans.push((chunk_start, source.len()));
    }
    spans
}
// Unit tests for the chunking pipeline; all of them go through `chunk_file`.
#[cfg(test)]
mod tests {
use super::*;
// Empty input must produce no chunks.
#[test]
fn empty_source() {
let chunks = chunk_file("", "test.rs", Some("rs"));
assert!(chunks.is_empty());
}
// A short Rust source is expected to come back as a single chunk that spans
// the whole file.
#[test]
fn small_file_single_chunk() {
let src = "fn main() {\n println!(\"hello\");\n}\n";
let chunks = chunk_file(src, "main.rs", Some("rs"));
assert_eq!(chunks.len(), 1);
assert_eq!(chunks[0].content, src);
assert_eq!(chunks[0].start_line, 1);
assert_eq!(chunks[0].start_byte, 0);
assert_eq!(chunks[0].end_byte, src.len());
}
// Thirty generated Python functions are expected to need more than one chunk.
// Adjacent chunks must touch or overlap, and together they must reach from
// byte 0 to the end of the source.
#[test]
fn large_file_multiple_chunks() {
let func = "def process(data):\n result = []\n for item in data:\n result.append(item * 2)\n return result\n\n";
let mut src = String::new();
for i in 0..30 {
src.push_str(&func.replace("process", &format!("process_{i}")));
}
let chunks = chunk_file(&src, "big.py", Some("py"));
assert!(
chunks.len() > 1,
"expected multiple chunks, got {}",
chunks.len()
);
// Each chunk must start no later than where its predecessor ended.
for window in chunks.windows(2) {
assert!(
window[1].start_byte <= window[0].end_byte,
"chunk {} should start at or before end of chunk {} (overlap expected)",
window[1].start_byte,
window[0].end_byte
);
}
assert_eq!(chunks.first().unwrap().start_byte, 0);
assert_eq!(chunks.last().unwrap().end_byte, src.len());
}
// With no extension there is no language lookup, so `chunk_file` takes the
// `chunk_by_lines` path.
#[test]
fn fallback_for_unsupported_language() {
let mut src = String::new();
for i in 0..200 {
src.push_str(&format!(
"line {i}: some content here that fills up space\n"
));
}
let chunks = chunk_file(&src, "data.txt", None);
assert!(chunks.len() > 1);
for window in chunks.windows(2) {
assert!(window[1].start_byte <= window[0].end_byte);
}
}
// The "rs" extension is expected to be reported as language "rust".
#[test]
fn chunk_preserves_language() {
let src = "fn f() {}\n";
let chunks = chunk_file(src, "lib.rs", Some("rs"));
assert_eq!(chunks[0].language.as_deref(), Some("rust"));
}
// NOTE(review): despite the name, this only checks that the first chunk
// starts on line 1; `end_line` and later chunks are not covered.
#[test]
fn chunk_line_numbers_correct() {
let src = "line1\nline2\nline3\nline4\nline5\n";
let chunks = chunk_file(src, "test.txt", None);
assert_eq!(chunks[0].start_line, 1);
}
// Builds two 40-statement Python functions and inspects the resulting chunks.
// NOTE(review): the assertion below cannot fail. `def_count >= 1` is only
// evaluated when the chunk already contains "def ", and the whole check is
// skipped when a single chunk is produced. As written, the test does not
// verify that functions are kept whole.
#[test]
fn no_function_split_in_python() {
let mut func1 = String::from("def alpha():\n");
for i in 0..40 {
func1.push_str(&format!(" x_{i} = {i}\n"));
}
func1.push('\n');
let mut func2 = String::from("def beta():\n");
for i in 0..40 {
func2.push_str(&format!(" y_{i} = {i}\n"));
}
func2.push('\n');
let src = format!("{func1}{func2}");
let chunks = chunk_file(&src, "test.py", Some("py"));
if chunks.len() > 1 {
for chunk in &chunks {
let has_def = chunk.content.contains("def ");
if has_def {
let def_count = chunk.content.matches("def ").count();
assert!(
def_count >= 1,
"chunk has partial function: {:?}",
&chunk.content[..80.min(chunk.content.len())]
);
}
}
}
}
}