use sha2::{Digest, Sha256};
/// A contiguous window of lines taken from a single input text.
///
/// Two chunks compare equal only when every field matches, including `file`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// File name the window belongs to, exactly as handed to the chunker.
    pub file: String,
    /// 1-based number of the first line in the window.
    pub start_line: usize,
    /// 1-based number of the last line in the window (inclusive).
    pub end_line: usize,
    /// The window's lines joined with `\n`, with no trailing newline.
    pub text: String,
    /// Lowercase hex SHA-256 digest of `text`; independent of `file`.
    pub content_hash: String,
}
/// Tuning knobs for the line-window chunker.
#[derive(Debug, Clone, Copy)]
pub struct ChunkOptions {
    /// Upper bound on the number of lines in one chunk.
    /// `chunk_text` treats a value of 0 as 1.
    pub max_lines: usize,
    /// Number of lines consecutive chunks share.
    /// `chunk_text` caps this at `max_lines - 1` so every window advances.
    pub overlap: usize,
}
impl Default for ChunkOptions {
    /// Returns the stock configuration: 120-line windows sharing 16 lines.
    fn default() -> Self {
        const DEFAULT_MAX_LINES: usize = 120;
        const DEFAULT_OVERLAP: usize = 16;

        Self {
            max_lines: DEFAULT_MAX_LINES,
            overlap: DEFAULT_OVERLAP,
        }
    }
}
/// Splits `text` into overlapping windows of lines.
///
/// * `file` - name copied verbatim into every returned [`Chunk`].
/// * `text` - the content to split; it is broken up with [`str::lines`], so
///   both `\n` and `\r\n` terminators are dropped and each window is
///   re-joined with `\n`.
/// * `opts` - window size and overlap. `max_lines` of 0 is treated as 1 and
///   `overlap` is capped at `max_lines - 1`, so the window always advances by
///   at least one line.
///
/// Returns the windows in order of appearance. Line numbers are 1-based and
/// `end_line` is inclusive. Input that is empty or whitespace-only yields no
/// chunks, and any individual window containing only whitespace is skipped.
pub fn chunk_text(file: &str, text: &str, opts: &ChunkOptions) -> Vec<Chunk> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    // Sanitize the options: a stride of zero would never make progress.
    let window = opts.max_lines.max(1);
    let stride = window - opts.overlap.min(window - 1);

    let all_lines: Vec<&str> = text.lines().collect();
    let total = all_lines.len();
    let mut chunks = Vec::new();

    for first in (0..total).step_by(stride) {
        let past_last = total.min(first + window);
        let body = all_lines[first..past_last].join("\n");

        let is_blank = body.trim().is_empty();
        if !is_blank {
            // Hash before `body` is moved into the chunk.
            let content_hash = hash_text(&body);
            chunks.push(Chunk {
                file: file.to_string(),
                start_line: first + 1,
                end_line: past_last,
                text: body,
                content_hash,
            });
        }

        // Once a window reaches the final line, any later window would only
        // repeat lines that are already covered.
        if past_last == total {
            break;
        }
    }

    chunks
}
/// Returns the SHA-256 digest of `text`'s UTF-8 bytes as lowercase hex.
///
/// Every digest byte becomes two hex characters, high nibble first.
pub fn hash_text(text: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let digest = Sha256::digest(text.as_bytes());

    let mut hex = String::with_capacity(2 * digest.len());
    hex.extend(
        digest
            .iter()
            // Split each byte into its high and low nibble, in that order.
            .flat_map(|&byte| [byte >> 4, byte & 0x0f])
            .map(|nibble| char::from(HEX_DIGITS[usize::from(nibble)])),
    );
    hex
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty and whitespace-only input must not produce any chunk.
    #[test]
    fn empty_input_yields_no_chunks() {
        let opts = ChunkOptions::default();
        assert!(chunk_text("a.rs", "", &opts).is_empty());
        assert!(chunk_text("a.rs", " \n\n\t\n", &opts).is_empty());
    }

    /// A file shorter than one window comes back as a single chunk spanning
    /// all of its lines.
    #[test]
    fn small_file_is_one_whole_chunk() {
        let src = "fn main() {\n println!(\"hi\");\n}\n";
        let chunks = chunk_text("src/main.rs", src, &ChunkOptions::default());
        assert_eq!(chunks.len(), 1);

        let only = &chunks[0];
        assert_eq!(only.file, "src/main.rs");
        assert_eq!(only.start_line, 1);
        assert_eq!(only.end_line, 3);
        assert!(only.text.contains("println!"));
        // SHA-256 is 32 bytes, i.e. 64 hex characters.
        assert_eq!(only.content_hash.len(), 64);
    }

    /// 300 lines with 100-line windows and 20 lines of overlap advance by 80
    /// lines per window: 1-100, 81-180, 161-260, 241-300.
    #[test]
    fn large_file_splits_into_overlapping_windows() {
        let src: String = (1..=300).map(|i| format!("line {i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 100,
            overlap: 20,
        };
        let chunks = chunk_text("big.txt", &src, &opts);
        assert_eq!(chunks.len(), 4);

        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 100);
        assert_eq!(chunks[1].start_line, 81);
        assert_eq!(chunks[1].end_line, 180);
        assert_eq!(chunks[2].start_line, 161);
        assert_eq!(chunks[2].end_line, 260);

        // Pin the exact tail window rather than only bounding its start.
        let last = chunks.last().unwrap();
        assert_eq!(last.start_line, 241);
        assert_eq!(last.end_line, 300);
    }

    /// The union of all windows must include every input line.
    #[test]
    fn windows_cover_every_line() {
        let src: String = (1..=250).map(|i| format!("x{i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 60,
            overlap: 10,
        };
        let chunks = chunk_text("f", &src, &opts);

        // Index 0 is unused so line numbers can index directly.
        let mut covered = vec![false; 251];
        for chunk in &chunks {
            covered[chunk.start_line..=chunk.end_line].fill(true);
        }
        assert!(
            covered[1..=250].iter().all(|&seen| seen),
            "every line covered"
        );
    }

    /// The content hash depends on the chunk text only, not on the file name.
    #[test]
    fn identical_text_has_identical_hash() {
        let body = "same body here\nline two\n";
        let a = chunk_text("a.rs", body, &ChunkOptions::default());
        let b = chunk_text("b.rs", body, &ChunkOptions::default());
        assert_eq!(a[0].content_hash, b[0].content_hash, "dedup key is content-only");
        assert_ne!(a[0].file, b[0].file);
    }

    /// An overlap larger than the window must still make forward progress and
    /// reach the end of the input.
    #[test]
    fn bad_overlap_does_not_hang() {
        let src: String = (1..=50).map(|i| format!("l{i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 10,
            overlap: 999,
        };
        let chunks = chunk_text("f", &src, &opts);
        assert!(!chunks.is_empty());
        assert_eq!(chunks.last().unwrap().end_line, 50);
        // Each window must start strictly after the previous one.
        assert!(chunks
            .windows(2)
            .all(|pair| pair[0].start_line < pair[1].start_line));
    }
}