nornir 0.4.18

Companion to cargo: dependency tracking, release gating, deploy, benchmarks, and documentation assembly. Project-agnostic.
Documentation
//! Chunking — split source files (code or docs) into embeddable units.
//!
//! The semantic-search pipeline is: **chunk** → embed → store → search.
//! This module owns the first step and is deliberately:
//!
//! - **Item-aware for Rust, line-window elsewhere.** `.rs` files are split on
//!   top-level `syn` item spans (one chunk per fn/impl/struct/…, with gaps and
//!   any oversize item line-windowed); every other file — and any `.rs` that
//!   fails to parse — falls back to a language-agnostic line-window splitter.
//!   Both produce the same [`Chunk`] shape. Item-aligned chunks embed fewer,
//!   more coherent units (sharper retrieval, less GPU work) than blind windows.
//!   Use [`chunk_file`]; [`chunk_text`] is the line-window splitter directly.
//! - **Context-aware.** Windows stay well under the embedder's 8192-token
//!   context. Overlap preserves cross-boundary meaning.
//! - **Dedup-ready.** Each chunk carries a stable `content_hash` (SHA-256 of
//!   its text). Storing chunk vectors keyed by `content_hash` means an
//!   unchanged chunk is embedded + stored **once**, even across many git
//!   snapshots — the time-travel size win that compression can't give for raw
//!   `f32` embeddings.
//!
//! A [`Chunk`] is also exactly the `id → {file, span, excerpt}` map the
//! [`super::VectorIndex`] needs to turn a search hit back into a source
//! location.

use sha2::{Digest, Sha256};

/// A contiguous run of lines cut from one source file, ready to be embedded.
///
/// Line numbers are 1-based and inclusive on both ends, so a one-line chunk has
/// `start_line == end_line`.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Chunk {
    /// Path of the originating file, relative to the repository root.
    pub file: String,
    /// 1-based number of the first line in this chunk (inclusive).
    pub start_line: usize,
    /// 1-based number of the last line in this chunk (inclusive).
    pub end_line: usize,
    /// Chunk contents: lines `start_line..=end_line` joined with `\n`.
    pub text: String,
    /// Hex-encoded SHA-256 of [`Self::text`]; depends on the text only, so it
    /// serves as a stable deduplication key.
    pub content_hash: String,
}

/// How to window a file into chunks.
///
/// The chunkers sanitize both fields before use rather than rejecting bad
/// values: a `max_lines` of `0` is treated as `1`, and `overlap` is clamped to
/// `max_lines - 1` so that consecutive windows always advance.
///
/// Derives `PartialEq`/`Eq` so configurations can be compared directly (e.g.
/// in tests or to detect a settings change).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ChunkOptions {
    /// Maximum source lines per chunk.
    pub max_lines: usize,
    /// Lines of overlap between consecutive windows (preserves meaning that
    /// straddles a boundary). Should be `< max_lines`; larger values are
    /// clamped by the chunkers.
    pub overlap: usize,
}

impl Default for ChunkOptions {
    /// 120-line windows with 16 lines of overlap.
    fn default() -> Self {
        // ~120 lines is comfortably inside an 8192-token context for typical
        // code/prose, with a small overlap to keep straddling context.
        Self {
            max_lines: 120,
            overlap: 16,
        }
    }
}

/// Language-agnostic line-window splitter: cut `text` (the full contents of
/// `file`) into [`Chunk`]s of at most `opts.max_lines` lines each.
///
/// Behaviour:
///
/// - Input that is empty or whitespace-only produces an empty vector.
/// - Input with no more than `max_lines` lines produces one whole-file chunk.
/// - Longer input produces overlapping windows. The last window is cut short
///   at the end of the file, so trailing lines are always included.
/// - A window whose text is whitespace-only is dropped.
///
/// Option values are sanitized instead of trusted: `max_lines` is raised to at
/// least 1 and `overlap` is capped at `max_lines - 1`, which guarantees that
/// each window starts at least one line after the previous one.
pub fn chunk_text(file: &str, text: &str, opts: &ChunkOptions) -> Vec<Chunk> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    let window = opts.max_lines.max(1);
    // Distance between consecutive window starts; never zero because the
    // overlap is capped one below the window size.
    let stride = window - opts.overlap.min(window - 1);

    let all_lines: Vec<&str> = text.lines().collect();
    let total = all_lines.len();
    let mut chunks = Vec::new();

    // `first` is the 0-based index of the window's first line.
    for first in (0..total).step_by(stride) {
        // 0-based exclusive end, which equals the 1-based inclusive end line.
        let past_last = total.min(first + window);
        let joined = all_lines[first..past_last].join("\n");
        if !joined.trim().is_empty() {
            let content_hash = hash_text(&joined);
            chunks.push(Chunk {
                file: file.to_owned(),
                start_line: first + 1,
                end_line: past_last,
                text: joined,
                content_hash,
            });
        }
        // This window reached the end of the file; any later start would only
        // re-cover lines already emitted.
        if past_last == total {
            break;
        }
    }
    chunks
}

/// Preferred entry point: chunk `text` with a strategy picked from the file
/// name.
///
/// Paths ending in `.rs` are first handed to the item-aware Rust chunker; if
/// it reports that the file does not parse, or for any other path, the
/// line-window splitter [`chunk_text`] is used instead. Empty or
/// whitespace-only input yields no chunks.
pub fn chunk_file(file: &str, text: &str, opts: &ChunkOptions) -> Vec<Chunk> {
    if text.trim().is_empty() {
        return Vec::new();
    }

    // `None` means "no item-aware result": either not a Rust file or the
    // Rust parser rejected it.
    let item_aware = if file.ends_with(".rs") {
        chunk_rust(file, text, opts)
    } else {
        None
    };

    match item_aware {
        Some(chunks) => chunks,
        None => chunk_text(file, text, opts),
    }
}

/// Item-aware chunking for Rust source.
///
/// The file is parsed with `syn` and walked top to bottom as an alternating
/// sequence of line ranges: the span of each top-level item (`use`
/// declarations are items too) and the gaps between items (module-level `//!`
/// docs, plain comments, blank lines). Every range is handed to
/// [`emit_range`], which emits one chunk when the range fits in `max_lines`
/// and overlapping windows otherwise, and drops whitespace-only windows.
///
/// Returns `None` when `text` does not parse as a Rust file, so the caller can
/// fall back to [`chunk_text`].
fn chunk_rust(file: &str, text: &str, opts: &ChunkOptions) -> Option<Vec<Chunk>> {
    use syn::spanned::Spanned;

    let parsed = match syn::parse_file(text) {
        Ok(parsed) => parsed,
        Err(_) => return None,
    };

    let lines: Vec<&str> = text.lines().collect();
    let total = lines.len();
    if total == 0 {
        return Some(Vec::new());
    }

    // Collect each top-level item's (first, last) line, 1-based inclusive.
    // The start is floored at 1 and the end clamped into the file; ranges
    // that come out inverted are discarded.
    let mut item_ranges: Vec<(usize, usize)> = Vec::with_capacity(parsed.items.len());
    for item in &parsed.items {
        let span = item.span();
        let first = span.start().line.max(1);
        let last = span.end().line.clamp(1, total);
        if first <= last {
            item_ranges.push((first, last));
        }
    }
    item_ranges.sort_unstable();

    let mut chunks = Vec::new();
    // 1-based number of the first line not yet assigned to any range.
    let mut next_line = 1usize;
    for (first, last) in item_ranges {
        // Lines between the previous range and this item form a gap.
        if next_line < first {
            emit_range(file, &lines, next_line, first - 1, opts, &mut chunks);
        }
        // If this item overlaps lines already emitted, start after them.
        let from = first.max(next_line);
        if from <= last {
            emit_range(file, &lines, from, last, opts, &mut chunks);
            next_line = last + 1;
        }
    }
    // Whatever follows the last item is a trailing gap.
    if next_line <= total {
        emit_range(file, &lines, next_line, total, opts, &mut chunks);
    }
    Some(chunks)
}

/// Append chunks covering the 1-based inclusive line range `[start1, end1]` of
/// `lines` to `out`.
///
/// A range of at most `max_lines` lines becomes a single chunk; a longer range
/// is cut into overlapping windows, the last of which is clamped to `end1`.
/// Whitespace-only windows are skipped. Called by [`chunk_rust`] for both item
/// spans and the gaps between them.
///
/// An invalid range (`start1 == 0`, `end1` past the end of `lines`, or
/// `start1 > end1`) emits nothing. As in [`chunk_text`], `max_lines` is raised
/// to at least 1 and `overlap` is clamped to `max_lines - 1` so the window
/// always advances.
fn emit_range(
    file: &str,
    lines: &[&str],
    start1: usize,
    end1: usize,
    opts: &ChunkOptions,
    out: &mut Vec<Chunk>,
) {
    if start1 < 1 || end1 > lines.len() || start1 > end1 {
        return;
    }
    let max_lines = opts.max_lines.max(1);
    let overlap = opts.overlap.min(max_lines - 1);
    let step = max_lines - overlap; // >= 1
    let mut s = start1;
    loop {
        // 1-based inclusive window end. Saturating add: `s >= 1`, so a plain
        // `s + max_lines - 1` overflows (and panics in debug builds) when
        // `max_lines` is `usize::MAX`, e.g. used to mean "never window".
        let e = s.saturating_add(max_lines - 1).min(end1);
        let body = lines[(s - 1)..e].join("\n");
        if !body.trim().is_empty() {
            out.push(Chunk {
                file: file.to_string(),
                start_line: s,
                end_line: e,
                content_hash: hash_text(&body),
                text: body,
            });
        }
        if e >= end1 {
            break;
        }
        // Cannot overflow: reaching here means `s + max_lines - 1 < end1`,
        // and `step <= max_lines`.
        s += step;
    }
}

/// Hash `text` with SHA-256 and return the digest as lowercase hexadecimal
/// (two hex characters per digest byte).
pub fn hash_text(text: &str) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let digest = Sha256::digest(text.as_bytes());
    let mut hex = String::with_capacity(2 * digest.len());
    for &byte in digest.iter() {
        // High nibble first, then low nibble.
        let hi = usize::from(byte >> 4);
        let lo = usize::from(byte & 0x0f);
        hex.extend([char::from(HEX_DIGITS[hi]), char::from(HEX_DIGITS[lo])]);
    }
    hex
}

// Unit tests for the chunkers: the line-window splitter (`chunk_text`) and the
// strategy-selecting entry point (`chunk_file`).
#[cfg(test)]
mod tests {
    use super::*;

    /// Empty and whitespace-only input must produce no chunks at all.
    #[test]
    fn empty_input_yields_no_chunks() {
        assert!(chunk_text("a.rs", "", &ChunkOptions::default()).is_empty());
        assert!(chunk_text("a.rs", "   \n\n\t\n", &ChunkOptions::default()).is_empty());
    }

    /// A file shorter than `max_lines` becomes exactly one chunk spanning the
    /// whole file, with a 64-character (SHA-256 hex) content hash.
    #[test]
    fn small_file_is_one_whole_chunk() {
        let src = "fn main() {\n    println!(\"hi\");\n}\n";
        let chunks = chunk_text("src/main.rs", src, &ChunkOptions::default());
        assert_eq!(chunks.len(), 1);
        let c = &chunks[0];
        assert_eq!(c.file, "src/main.rs");
        assert_eq!(c.start_line, 1);
        assert_eq!(c.end_line, 3); // three lines (trailing \n doesn't add one)
        assert!(c.text.contains("println!"));
        assert_eq!(c.content_hash.len(), 64);
    }

    /// A 300-line file with 100-line windows and 20 lines of overlap is split
    /// into four windows whose starts are 80 lines apart.
    #[test]
    fn large_file_splits_into_overlapping_windows() {
        let src: String = (1..=300).map(|i| format!("line {i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 100,
            overlap: 20,
        };
        let chunks = chunk_text("big.txt", &src, &opts);
        // step = 80 → windows start at lines 1, 81, 161, 241 → 4 chunks.
        assert_eq!(chunks.len(), 4);
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 100);
        assert_eq!(chunks[1].start_line, 81); // 80-line step, 20 overlap
        assert_eq!(chunks[1].end_line, 180);
        // Tail window is clamped to the last line, never past it.
        let last = chunks.last().unwrap();
        assert_eq!(last.end_line, 300);
        assert!(last.start_line <= 300);
    }

    /// The union of all window ranges must include every source line.
    #[test]
    fn windows_cover_every_line() {
        let src: String = (1..=250).map(|i| format!("x{i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 60,
            overlap: 10,
        };
        let chunks = chunk_text("f", &src, &opts);
        // Union of [start,end] must cover 1..=250 with no gap.
        // Index 0 is unused so that indices match 1-based line numbers.
        let mut covered = vec![false; 251];
        for c in &chunks {
            for l in c.start_line..=c.end_line {
                covered[l] = true;
            }
        }
        assert!((1..=250).all(|l| covered[l]), "every line covered");
    }

    /// The content hash depends on the chunk text only, not on the file path.
    #[test]
    fn identical_text_has_identical_hash() {
        let a = chunk_text("a.rs", "same body here\nline two\n", &ChunkOptions::default());
        let b = chunk_text("b.rs", "same body here\nline two\n", &ChunkOptions::default());
        assert_eq!(a[0].content_hash, b[0].content_hash, "dedup key is content-only");
        assert_ne!(a[0].file, b[0].file);
    }

    /// For a parseable `.rs` file, `chunk_file` emits one chunk per top-level
    /// item instead of a single line window.
    #[test]
    fn rust_file_chunks_on_item_boundaries() {
        let src = "use std::fmt;\n\nfn a() {\n    1;\n}\n\nfn b() {\n    2;\n}\n";
        let chunks = chunk_file("src/lib.rs", src, &ChunkOptions::default());
        // Three top-level items (use, fn a, fn b); blank gaps are skipped.
        assert_eq!(chunks.len(), 3, "one chunk per item, not a single window");
        assert!(chunks[0].text.contains("use std::fmt"));
        assert!(chunks[1].text.contains("fn a") && !chunks[1].text.contains("fn b"));
        assert_eq!(chunks[1].start_line, 3);
        assert!(chunks[2].text.contains("fn b"));
    }

    /// A `.rs` file that fails to parse is chunked by line window.
    #[test]
    fn unparseable_rust_falls_back_to_line_window() {
        // Garbage that syn can't parse must not panic — falls back to windows.
        let src = "fn (((\nnot rust at all ;;;\n";
        let chunks = chunk_file("broken.rs", src, &ChunkOptions::default());
        assert_eq!(chunks.len(), 1); // small → one line-window chunk
        assert!(chunks[0].text.contains("not rust"));
    }

    /// For non-`.rs` paths, `chunk_file` and `chunk_text` return equal output.
    #[test]
    fn non_rust_uses_line_window_unchanged() {
        let src: String = (1..=300).map(|i| format!("line {i}\n")).collect();
        let opts = ChunkOptions { max_lines: 100, overlap: 20 };
        assert_eq!(
            chunk_file("notes.md", &src, &opts),
            chunk_text("notes.md", &src, &opts),
            "non-.rs files chunk identically to the line-window splitter"
        );
    }

    /// A single Rust item longer than `max_lines` yields more than one chunk.
    #[test]
    fn oversize_rust_item_is_windowed() {
        // One function longer than max_lines must be split into windows.
        let body: String = (0..50).map(|i| format!("    let x{i} = {i};\n")).collect();
        let src = format!("fn big() {{\n{body}}}\n");
        let opts = ChunkOptions { max_lines: 20, overlap: 4 };
        let chunks = chunk_file("big.rs", &src, &opts);
        assert!(chunks.len() > 1, "an item over max_lines is line-windowed");
    }

    /// An `overlap` at or above `max_lines` must still terminate and reach the
    /// last line of the file.
    #[test]
    fn bad_overlap_does_not_hang() {
        // overlap >= max_lines would make step 0; we clamp it.
        let src: String = (1..=50).map(|i| format!("l{i}\n")).collect();
        let opts = ChunkOptions {
            max_lines: 10,
            overlap: 999,
        };
        let chunks = chunk_text("f", &src, &opts);
        assert!(!chunks.is_empty());
        assert_eq!(chunks.last().unwrap().end_line, 50);
    }
}