use sha2::{Digest, Sha256};
/// One contiguous slice of a source file, as produced by the chunkers in this file.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Chunk {
    /// File label the chunk was cut from, copied verbatim from the caller's `file` argument.
    pub file: String,
    /// 1-based number of the first line contained in `text`.
    pub start_line: usize,
    /// 1-based number of the last line contained in `text` (inclusive).
    pub end_line: usize,
    /// The chunk's lines joined with `"\n"`, without a trailing newline.
    pub text: String,
    /// Lowercase hexadecimal SHA-256 of `text`, as returned by `hash_text`.
    pub content_hash: String,
}
/// Tuning knobs for the line-window splitting done by the chunkers in this file.
#[derive(Clone, Copy, Debug)]
pub struct ChunkOptions {
    /// Maximum number of lines in a single chunk. The chunkers treat `0` as `1`.
    pub max_lines: usize,
    /// Number of lines shared by two consecutive windows. The chunkers clamp
    /// this to `max_lines - 1` so that every window advances by at least one line.
    pub overlap: usize,
}

impl Default for ChunkOptions {
    /// Windows of 120 lines that overlap by 16 lines.
    fn default() -> Self {
        let (max_lines, overlap) = (120, 16);
        ChunkOptions { max_lines, overlap }
    }
}
/// Splits `text` into overlapping, fixed-size windows of lines.
///
/// Each window holds at most `opts.max_lines` lines (a value of `0` is treated
/// as `1`), and consecutive windows share `opts.overlap` lines (clamped to
/// `max_lines - 1`, so the window always moves forward). The reported
/// `start_line` / `end_line` are 1-based and inclusive, and the chunk text is
/// the window's lines joined with `"\n"`.
///
/// Windows whose text is only whitespace are dropped. Blank or empty `text`
/// yields an empty vector. `file` is copied into every chunk unchanged.
pub fn chunk_text(file: &str, text: &str, opts: &ChunkOptions) -> Vec<Chunk> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    let window = opts.max_lines.max(1);
    // Always >= 1 because the overlap is capped below the window size.
    let stride = window - opts.overlap.min(window - 1);

    let all_lines: Vec<&str> = text.lines().collect();
    let total = all_lines.len();

    let mut chunks = Vec::new();
    for first in (0..total).step_by(stride) {
        // `first` is a 0-based index; `past_last` is exclusive, which makes it
        // equal to the 1-based inclusive number of the window's last line.
        let past_last = (first + window).min(total);
        let body = all_lines[first..past_last].join("\n");
        if !body.trim().is_empty() {
            chunks.push(Chunk {
                file: file.to_string(),
                start_line: first + 1,
                end_line: past_last,
                content_hash: hash_text(&body),
                text: body,
            });
        }
        // Once a window reaches the final line, any further window would be
        // fully contained in this one.
        if past_last == total {
            break;
        }
    }
    chunks
}
/// Chunks `text`, choosing the splitter from the file name.
///
/// Names ending in `.rs` (case-sensitive) are first offered to `chunk_rust`;
/// if that returns `None`, or the name has any other ending, the input is
/// split by `chunk_text`. Blank or empty `text` yields an empty vector without
/// invoking either splitter.
pub fn chunk_file(file: &str, text: &str, opts: &ChunkOptions) -> Vec<Chunk> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    let syntax_aware = if file.ends_with(".rs") {
        chunk_rust(file, text, opts)
    } else {
        None
    };

    // The line-window splitter is only run when the Rust path produced nothing.
    syntax_aware.unwrap_or_else(|| chunk_text(file, text, opts))
}
/// Chunks Rust source along the boundaries of its top-level items.
///
/// Returns `None` when `text` does not parse as a Rust file. Otherwise the
/// file is walked from top to bottom: each top-level item's line range, and
/// each run of lines lying between items (or before the first / after the
/// last), is handed to `emit_range`, which windows it by `opts` and drops
/// whitespace-only pieces.
///
/// When two items touch the same line, the later one starts on the line after
/// the earlier one ended, so no line is emitted by two different ranges.
fn chunk_rust(file: &str, text: &str, opts: &ChunkOptions) -> Option<Vec<Chunk>> {
    use syn::spanned::Spanned;

    let parsed = syn::parse_file(text).ok()?;
    let lines: Vec<&str> = text.lines().collect();
    let total = lines.len();
    if total == 0 {
        return Some(Vec::new());
    }

    // 1-based inclusive (first, last) line range of every top-level item,
    // clamped into the file and discarded if the range is inverted.
    let mut item_ranges: Vec<(usize, usize)> = Vec::with_capacity(parsed.items.len());
    for item in &parsed.items {
        let span = item.span();
        let first = span.start().line.max(1);
        let last = span.end().line.clamp(1, total);
        if first <= last {
            item_ranges.push((first, last));
        }
    }
    item_ranges.sort_unstable();

    let mut chunks = Vec::new();
    // First line that has not yet been handed to `emit_range`.
    let mut next_line = 1usize;
    for (first, last) in item_ranges {
        if next_line < first {
            // Lines sitting between the previous item and this one.
            emit_range(file, &lines, next_line, first - 1, opts, &mut chunks);
        }
        // Skip whatever part of this item an earlier item already covered.
        let from = first.max(next_line);
        if from <= last {
            emit_range(file, &lines, from, last, opts, &mut chunks);
            next_line = last + 1;
        }
    }
    if next_line <= total {
        // Trailing lines after the last item.
        emit_range(file, &lines, next_line, total, opts, &mut chunks);
    }
    Some(chunks)
}
/// Appends line-window chunks covering lines `start1..=end1` of `lines` to `out`.
///
/// `start1` and `end1` are 1-based, inclusive line numbers. The range is cut
/// into windows of at most `opts.max_lines` lines (`0` is treated as `1`) that
/// overlap by `opts.overlap` lines (clamped to `max_lines - 1`). Windows whose
/// text is only whitespace are skipped. `file` is copied into every chunk.
///
/// Does nothing when the range is empty, inverted, or reaches outside `lines`.
fn emit_range(
    file: &str,
    lines: &[&str],
    start1: usize,
    end1: usize,
    opts: &ChunkOptions,
    out: &mut Vec<Chunk>,
) {
    if start1 < 1 || end1 > lines.len() || start1 > end1 {
        return;
    }
    let max_lines = opts.max_lines.max(1);
    let overlap = opts.overlap.min(max_lines - 1);
    let step = max_lines - overlap;
    let mut s = start1;
    loop {
        // `max_lines` can be as large as `usize::MAX` (an effectively unbounded
        // window), so `s + max_lines - 1` would overflow; saturate instead and
        // let `min(end1)` bring the result back into range.
        let e = s.saturating_add(max_lines - 1).min(end1);
        // `s` is 1-based and `e` is inclusive, hence the `s - 1 .. e` slice.
        let body = lines[(s - 1)..e].join("\n");
        if !body.trim().is_empty() {
            out.push(Chunk {
                file: file.to_string(),
                start_line: s,
                end_line: e,
                content_hash: hash_text(&body),
                text: body,
            });
        }
        if e >= end1 {
            break;
        }
        // Safe from overflow: reaching here means `s + max_lines - 1 < end1`.
        s += step;
    }
}
/// Returns the SHA-256 digest of `text` as a lowercase hexadecimal string.
///
/// The input is hashed as its UTF-8 bytes. Every digest byte is written as two
/// hex characters, high nibble first.
pub fn hash_text(text: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let digest = Sha256::digest(text.as_bytes());
    let mut hex = String::with_capacity(digest.len() * 2);
    hex.extend(
        digest
            .iter()
            .flat_map(|&byte| [byte >> 4, byte & 0x0f])
            .map(|nibble| char::from(HEX_DIGITS[usize::from(nibble)])),
    );
    hex
}
// Unit tests for the line-window splitter (`chunk_text`), the extension-based
// dispatcher (`chunk_file`) and the content hash stored on every chunk.
#[cfg(test)]
mod tests {
use super::*;
// Empty and whitespace-only input must both produce an empty vector.
#[test]
fn empty_input_yields_no_chunks() {
assert!(chunk_text("a.rs", "", &ChunkOptions::default()).is_empty());
assert!(chunk_text("a.rs", " \n\n\t\n", &ChunkOptions::default()).is_empty());
}
// Three lines fit inside the default 120-line window, so the whole input is a
// single chunk spanning lines 1-3. The hash is 64 hex characters long.
#[test]
fn small_file_is_one_whole_chunk() {
let src = "fn main() {\n println!(\"hi\");\n}\n";
let chunks = chunk_text("src/main.rs", src, &ChunkOptions::default());
assert_eq!(chunks.len(), 1);
let c = &chunks[0];
assert_eq!(c.file, "src/main.rs");
assert_eq!(c.start_line, 1);
assert_eq!(c.end_line, 3); assert!(c.text.contains("println!"));
assert_eq!(c.content_hash.len(), 64);
}
// With max_lines = 100 and overlap = 20 the window advances 80 lines at a
// time, so 300 lines give four windows starting at lines 1, 81, 161 and 241.
#[test]
fn large_file_splits_into_overlapping_windows() {
let src: String = (1..=300).map(|i| format!("line {i}\n")).collect();
let opts = ChunkOptions {
max_lines: 100,
overlap: 20,
};
let chunks = chunk_text("big.txt", &src, &opts);
assert_eq!(chunks.len(), 4);
assert_eq!(chunks[0].start_line, 1);
assert_eq!(chunks[0].end_line, 100);
assert_eq!(chunks[1].start_line, 81); assert_eq!(chunks[1].end_line, 180);
let last = chunks.last().unwrap();
assert_eq!(last.end_line, 300);
assert!(last.start_line <= 300);
}
// Every line from 1 to 250 must fall inside at least one window.
#[test]
fn windows_cover_every_line() {
let src: String = (1..=250).map(|i| format!("x{i}\n")).collect();
let opts = ChunkOptions {
max_lines: 60,
overlap: 10,
};
let chunks = chunk_text("f", &src, &opts);
// Index 0 stays unused because chunk line numbers are 1-based.
let mut covered = vec![false; 251];
for c in &chunks {
for l in c.start_line..=c.end_line {
covered[l] = true;
}
}
assert!((1..=250).all(|l| covered[l]), "every line covered");
}
// The hash is computed from the chunk text alone, so the same text under two
// different file labels hashes identically.
#[test]
fn identical_text_has_identical_hash() {
let a = chunk_text("a.rs", "same body here\nline two\n", &ChunkOptions::default());
let b = chunk_text("b.rs", "same body here\nline two\n", &ChunkOptions::default());
assert_eq!(a[0].content_hash, b[0].content_hash, "dedup key is content-only");
assert_ne!(a[0].file, b[0].file);
}
// A parseable `.rs` file is expected to yield one chunk per top-level item
// (the `use` and the two functions). The blank lines between items form
// whitespace-only ranges, which produce no chunk of their own.
#[test]
fn rust_file_chunks_on_item_boundaries() {
let src = "use std::fmt;\n\nfn a() {\n 1;\n}\n\nfn b() {\n 2;\n}\n";
let chunks = chunk_file("src/lib.rs", src, &ChunkOptions::default());
assert_eq!(chunks.len(), 3, "one chunk per item, not a single window");
assert!(chunks[0].text.contains("use std::fmt"));
assert!(chunks[1].text.contains("fn a") && !chunks[1].text.contains("fn b"));
assert_eq!(chunks[1].start_line, 3);
assert!(chunks[2].text.contains("fn b"));
}
// When the text does not parse as Rust, `chunk_file` falls back to the
// line-window splitter, which returns these two lines as one chunk.
#[test]
fn unparseable_rust_falls_back_to_line_window() {
let src = "fn (((\nnot rust at all ;;;\n";
let chunks = chunk_file("broken.rs", src, &ChunkOptions::default());
assert_eq!(chunks.len(), 1); assert!(chunks[0].text.contains("not rust"));
}
// For a name that does not end in `.rs`, `chunk_file` must return exactly
// what `chunk_text` returns (compared via the derived `PartialEq` on `Chunk`).
#[test]
fn non_rust_uses_line_window_unchanged() {
let src: String = (1..=300).map(|i| format!("line {i}\n")).collect();
let opts = ChunkOptions { max_lines: 100, overlap: 20 };
assert_eq!(
chunk_file("notes.md", &src, &opts),
chunk_text("notes.md", &src, &opts),
"non-.rs files chunk identically to the line-window splitter"
);
}
// A single 52-line function exceeds max_lines = 20, so the item itself has
// to be split into several windows.
#[test]
fn oversize_rust_item_is_windowed() {
let body: String = (0..50).map(|i| format!(" let x{i} = {i};\n")).collect();
let src = format!("fn big() {{\n{body}}}\n");
let opts = ChunkOptions { max_lines: 20, overlap: 4 };
let chunks = chunk_file("big.rs", &src, &opts);
assert!(chunks.len() > 1, "an item over max_lines is line-windowed");
}
// An overlap of at least max_lines is clamped to max_lines - 1, so the window
// still advances by one line and the loop terminates at the last line.
#[test]
fn bad_overlap_does_not_hang() {
let src: String = (1..=50).map(|i| format!("l{i}\n")).collect();
let opts = ChunkOptions {
max_lines: 10,
overlap: 999,
};
let chunks = chunk_text("f", &src, &opts);
assert!(!chunks.is_empty());
assert_eq!(chunks.last().unwrap().end_line, 50);
}
}