// nornir 0.4.3
//
// Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
// Documentation
//! Chunking — split source files (code or docs) into embeddable units.
//!
//! The semantic-search pipeline is: **chunk** → embed → store → search.
//! This module owns the first step and is deliberately:
//!
//! - **Dependency-free / language-agnostic.** A simple line-window splitter,
//!   not a per-language `syn` parser. v1 ("one chunk per function/file is
//!   fine") favours robustness over precise function boundaries; a syntactic
//!   chunker can refine this later without changing the [`Chunk`] shape.
//! - **Context-aware.** Windows stay well under the embedder's 8192-token
//!   context. Overlap preserves cross-boundary meaning.
//! - **Dedup-ready.** Each chunk carries a stable `content_hash` (SHA-256 of
//!   its text). Storing chunk vectors keyed by `content_hash` means an
//!   unchanged chunk is embedded + stored **once**, even across many git
//!   snapshots — the time-travel size win that compression can't give for raw
//!   `f32` embeddings.
//!
//! A [`Chunk`] is also exactly the `id → {file, span, excerpt}` map the
//! [`super::VectorIndex`] needs to turn a search hit back into a source
//! location.

use sha2::{Digest, Sha256};

/// A contiguous run of lines taken from one source file, ready to embed.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Chunk {
    /// Path of the originating file, relative to the repository.
    pub file: String,
    /// 1-based number of the first line in this chunk (inclusive).
    pub start_line: usize,
    /// 1-based number of the last line in this chunk (inclusive).
    pub end_line: usize,
    /// Chunk body: lines `start_line..=end_line` joined with `\n`.
    pub text: String,
    /// Hex-encoded SHA-256 of [`Self::text`]; a stable deduplication key.
    pub content_hash: String,
}

/// Windowing parameters used when splitting a file into chunks.
#[derive(Clone, Copy, Debug)]
pub struct ChunkOptions {
    /// Upper bound on the number of source lines in a single chunk.
    pub max_lines: usize,
    /// Number of lines shared by two consecutive windows, so that meaning
    /// straddling a window boundary is kept. Must be `< max_lines`.
    pub overlap: usize,
}

impl Default for ChunkOptions {
    /// Default windowing: 120-line chunks with 16 lines of overlap.
    fn default() -> Self {
        // Around 120 lines of typical code/prose sits comfortably inside an
        // 8192-token context.
        let max_lines = 120;
        // A small overlap keeps context that straddles a window boundary.
        let overlap = 16;
        Self { max_lines, overlap }
    }
}

/// Break `text` (the entire contents of `file`) into line-window chunks.
///
/// Behaviour:
///
/// - Input that is empty or only whitespace produces an empty vector.
/// - A file of at most `max_lines` lines comes back as one whole-file chunk.
/// - Longer files are cut into overlapping windows of `max_lines` lines; the
///   last window is clamped to the end of the file, so the tail is kept.
/// - A window containing nothing but whitespace is not emitted.
///
/// `max_lines` is treated as at least 1 and `overlap` is capped at
/// `max_lines - 1`, so the window always moves forward by at least one line
/// and the loop terminates even for nonsensical options.
pub fn chunk_text(file: &str, text: &str, opts: &ChunkOptions) -> Vec<Chunk> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    let window = opts.max_lines.max(1);
    // Capping the overlap below the window size keeps the stride >= 1.
    let stride = window - opts.overlap.min(window - 1);

    let lines: Vec<&str> = text.lines().collect();
    let total = lines.len();
    let mut chunks = Vec::new();

    // `first` is the 0-based index of the window's first line.
    for first in (0..total).step_by(stride) {
        // Exclusive 0-based end, clamped so the last window stops at EOF.
        let past_last = total.min(first + window);
        let slice = &lines[first..past_last];

        // Emit the window only if at least one line has visible content.
        if slice.iter().any(|line| !line.trim().is_empty()) {
            let body = slice.join("\n");
            let content_hash = hash_text(&body);
            chunks.push(Chunk {
                file: file.to_owned(),
                start_line: first + 1,
                // An exclusive 0-based end equals the inclusive 1-based end.
                end_line: past_last,
                text: body,
                content_hash,
            });
        }

        // Once a window touches the end of the file, the tail is covered.
        if past_last == total {
            break;
        }
    }
    chunks
}

/// Hash `text` with SHA-256 and return the digest as lowercase hex.
pub fn hash_text(text: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let digest = Sha256::digest(text.as_bytes());
    digest
        .iter()
        // Each byte becomes two nibbles, high one first.
        .flat_map(|&byte| [byte >> 4, byte & 0x0f])
        .map(|nibble| char::from(HEX_DIGITS[usize::from(nibble)]))
        .collect()
}

/// Unit tests for the line-window chunker and its content hash.
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty and whitespace-only inputs produce no chunks at all.
    #[test]
    fn empty_input_yields_no_chunks() {
        assert!(chunk_text("a.rs", "", &ChunkOptions::default()).is_empty());
        assert!(chunk_text("a.rs", "   \n\n\t\n", &ChunkOptions::default()).is_empty());
    }

    /// A file shorter than `max_lines` comes back as a single chunk.
    #[test]
    fn small_file_is_one_whole_chunk() {
        let src = "fn main() {\n    println!(\"hi\");\n}\n";
        let chunks = chunk_text("src/main.rs", src, &ChunkOptions::default());
        assert_eq!(chunks.len(), 1);
        let c = &chunks[0];
        assert_eq!(c.file, "src/main.rs");
        assert_eq!(c.start_line, 1);
        assert_eq!(c.end_line, 3); // three lines (trailing \n doesn't add one)
        assert!(c.text.contains("println!"));
        assert_eq!(c.content_hash.len(), 64);
    }

    /// Windows advance by `max_lines - overlap` and the last one is clamped.
    #[test]
    fn large_file_splits_into_overlapping_windows() {
        let src: String = (1..=300).map(|i| format!("line {i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 100,
            overlap: 20,
        };
        let chunks = chunk_text("big.txt", &src, &opts);
        // step = 80 → windows start at lines 1, 81, 161, 241 → 4 chunks.
        assert_eq!(chunks.len(), 4);
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 100);
        assert_eq!(chunks[1].start_line, 81); // 80-line step, 20 overlap
        assert_eq!(chunks[1].end_line, 180);
        assert_eq!(chunks[2].start_line, 161);
        assert_eq!(chunks[2].end_line, 260);
        // Tail window starts on the regular step and is clamped to the last
        // line, never past it.
        let last = chunks.last().unwrap();
        assert_eq!(last.start_line, 241);
        assert_eq!(last.end_line, 300);
    }

    /// The union of all windows leaves no source line uncovered.
    #[test]
    fn windows_cover_every_line() {
        let src: String = (1..=250).map(|i| format!("x{i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 60,
            overlap: 10,
        };
        let chunks = chunk_text("f", &src, &opts);
        // Union of [start,end] must cover 1..=250 with no gap. Index 0 is
        // unused because line numbers are 1-based.
        let mut covered = vec![false; 251];
        for c in &chunks {
            covered[c.start_line..=c.end_line].fill(true);
        }
        assert!(covered[1..].iter().all(|&seen| seen), "every line covered");
    }

    /// The hash depends on the chunk text only, not on the file it came from.
    #[test]
    fn identical_text_has_identical_hash() {
        let a = chunk_text("a.rs", "same body here\nline two\n", &ChunkOptions::default());
        let b = chunk_text("b.rs", "same body here\nline two\n", &ChunkOptions::default());
        assert_eq!(a[0].content_hash, b[0].content_hash, "dedup key is content-only");
        assert_ne!(a[0].file, b[0].file);
        // The stored hash is exactly the hash of the stored text.
        assert_eq!(a[0].content_hash, hash_text(&a[0].text));
    }

    /// An overlap at or above `max_lines` is clamped instead of looping forever.
    #[test]
    fn bad_overlap_does_not_hang() {
        // overlap >= max_lines would make step 0; we clamp it.
        let src: String = (1..=50).map(|i| format!("l{i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 10,
            overlap: 999,
        };
        let chunks = chunk_text("f", &src, &opts);
        // Clamped overlap is 9 → step 1 → windows start at lines 1..=41.
        assert_eq!(chunks.len(), 41);
        assert_eq!(chunks[1].start_line, 2);
        assert_eq!(chunks.last().unwrap().end_line, 50);
    }

    /// Windows made up solely of blank lines are dropped; line numbers of the
    /// surviving chunks still refer to the original file.
    #[test]
    fn whitespace_only_windows_are_skipped() {
        let mut src = "a\n".repeat(10);
        src.push_str(&"\n".repeat(30));
        src.push_str("b\n");
        let opts = ChunkOptions {
            max_lines: 10,
            overlap: 0,
        };
        let chunks = chunk_text("f", &src, &opts);
        // Windows 11-20, 21-30 and 31-40 are blank and must not appear.
        assert_eq!(chunks.len(), 2);
        assert_eq!((chunks[0].start_line, chunks[0].end_line), (1, 10));
        assert_eq!((chunks[1].start_line, chunks[1].end_line), (41, 41));
        assert_eq!(chunks[1].text, "b");
    }

    /// The digest is rendered as lowercase hex, high nibble first.
    #[test]
    fn hash_text_matches_known_sha256_vectors() {
        assert_eq!(
            hash_text(""),
            "e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855"
        );
        assert_eq!(
            hash_text("abc"),
            "ba7816bf8f01cfea414140de5dae2223b00361a396177a9cb410ff61f20015ad"
        );
    }
}