ripvec-core 1.0.3

Semantic code + document search engine. Cacheless static-embedding + cross-encoder rerank by default; optional ModernBERT/BGE transformer engines with GPU backends. Tree-sitter chunking, hybrid BM25 + PageRank, composable ranking layers.
Documentation
//! Tree-sitter AST-merge chunker (semble flavor).
//!
//! Port of `~/src/semble/src/semble/chunking/{core,chunking}.py`.
//! Greedily walks AST children, packing siblings up to a 1500-char
//! target before recursing into oversized nodes. Falls back to
//! line-merge chunking for unsupported languages.
//!
//! The algorithm differs from ripvec's query-based chunker in
//! [`crate::chunk`]: it does not extract specific definitions, it merges
//! AST subtrees until they fill the size budget. This produces fewer,
//! larger, more contextually-coherent chunks for semantic embedding.
//!
//! ## Boundary shape
//!
//! [`ChunkBoundary`] reports byte offsets (matching tree-sitter's
//! native units) and 1-based line numbers (matching CodeChunk's
//! convention). Callers usually pass [`ChunkBoundary::content`] +
//! line numbers into [`CodeChunk`] construction.

use tree_sitter::{Language, Node, Parser, TreeCursor};

/// Default desired chunk length used by [`chunk_source`] when callers
/// don't specify one (mirrors `_DESIRED_CHUNK_LENGTH_CHARS` from
/// `chunking.py:10`).
pub const DEFAULT_DESIRED_CHUNK_CHARS: usize = 1500;

/// Inclusive byte span and (1-based) line range for one chunk.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkBoundary {
    /// First byte of the chunk (UTF-8 byte offset).
    pub start_byte: usize,
    /// One-past-last byte of the chunk.
    pub end_byte: usize,
    /// 1-based start line.
    pub start_line: usize,
    /// 1-based end line.
    pub end_line: usize,
}

impl ChunkBoundary {
    /// Slice the chunk's content out of the source string.
    ///
    /// Caller guarantees `start_byte..end_byte` lies on UTF-8 boundaries
    /// (tree-sitter node positions always do).
    #[must_use]
    pub fn content<'a>(&self, source: &'a str) -> &'a str {
        // Clamp end_byte to source length defensively (in case a node's
        // end position lies one past the source end, e.g., for trailing
        // EOF tokens). Also walk back to the previous UTF-8 char
        // boundary at each end so the slice never panics on
        // multi-byte glyphs (tree-sitter node positions are byte-
        // accurate but our adjacent-chunk merges occasionally widen the
        // span; defence in depth).
        let end = clamp_to_char_boundary(source, self.end_byte.min(source.len()), true);
        let start = clamp_to_char_boundary(source, self.start_byte.min(end), false);
        &source[start..end]
    }
}

/// Walk `byte` to the nearest UTF-8 character boundary. `forward = true`
/// rounds up (to the byte right after a multi-byte glyph); `false`
/// rounds down (to the start of one). `&str::is_char_boundary` is the
/// gate; the loop is bounded by a multi-byte glyph max width (4).
fn clamp_to_char_boundary(source: &str, mut byte: usize, forward: bool) -> usize {
    byte = byte.min(source.len());
    for _ in 0..4 {
        if source.is_char_boundary(byte) {
            return byte;
        }
        if forward {
            byte = (byte + 1).min(source.len());
        } else {
            byte = byte.saturating_sub(1);
        }
    }
    byte
}

// ---------------------------------------------------------------------------
// Public API.
// ---------------------------------------------------------------------------

/// Chunk source text via tree-sitter AST-merge or line-merge fallback.
///
/// `language = Some(lang)` parses with tree-sitter and runs the AST-merge
/// algorithm. `language = None` runs the line-merge fallback.
///
/// Empty / whitespace-only input returns an empty `Vec`.
#[must_use]
pub fn chunk_source(
    source: &str,
    language: Option<&Language>,
    desired_length: usize,
) -> Vec<ChunkBoundary> {
    if source.trim().is_empty() {
        return Vec::new();
    }
    if let Some(lang) = language
        && let Some(tree) = parse_source(source, lang)
    {
        return chunk_tree(source, tree.root_node(), desired_length);
    }
    chunk_lines(source, desired_length)
}

/// Chunk source by lines, merging adjacent lines up to `desired_length`.
///
/// Mirrors `chunk_lines` from `core.py:112`. The fallback chunker for
/// files without a tree-sitter grammar (or whose parse fails).
#[must_use]
pub fn chunk_lines(source: &str, desired_length: usize) -> Vec<ChunkBoundary> {
    if source.trim().is_empty() {
        return Vec::new();
    }
    // Build per-line boundaries (start..end_inclusive of '\n').
    let mut line_boundaries: Vec<ChunkBoundary> = Vec::new();
    let mut start = 0;
    let mut line_no = 1;
    for line in source.split_inclusive('\n') {
        let end = start + line.len();
        // Lines from split_inclusive may end in '\n'; tree-sitter convention is
        // 1-based start; the line itself spans [start, end) bytes.
        let line_count = line.bytes().filter(|&b| b == b'\n').count().max(1);
        line_boundaries.push(ChunkBoundary {
            start_byte: start,
            end_byte: end,
            start_line: line_no,
            end_line: line_no + line_count - 1,
        });
        start = end;
        line_no += line_count;
    }
    merge_adjacent_chunks(line_boundaries, desired_length, source)
}

// ---------------------------------------------------------------------------
// Tree-sitter path.
// ---------------------------------------------------------------------------

fn parse_source(source: &str, language: &Language) -> Option<tree_sitter::Tree> {
    let mut parser = Parser::new();
    parser.set_language(language).ok()?;
    parser.parse(source, None)
}

fn chunk_tree(source: &str, root: Node<'_>, desired_length: usize) -> Vec<ChunkBoundary> {
    let raw = merge_node_inner(root, desired_length);
    let with_lines = raw
        .into_iter()
        .map(|(start, end)| ChunkBoundary {
            start_byte: start,
            end_byte: end,
            start_line: line_at_byte(source, start),
            end_line: line_at_byte(source, end.saturating_sub(1).max(start)),
        })
        .collect();
    merge_adjacent_chunks(with_lines, desired_length, source)
}

fn line_at_byte(source: &str, byte: usize) -> usize {
    // 1-based line number: count newlines in the source up to `byte`.
    // Slice on `as_bytes()` rather than the &str so we can land on any
    // byte position — `&source[..n]` panics when `n` is mid-character
    // (e.g., for a 3-byte glyph like '─' (U+2500) at bytes [k, k+1, k+2],
    // `source[..k+2]` is a char-boundary violation). The callers compute
    // `end.saturating_sub(1)` for end-line which often lands inside a
    // multi-byte glyph; byte-level slicing is the right primitive.
    let clamped = byte.min(source.len());
    1 + bytecount::count(&source.as_bytes()[..clamped], b'\n')
}

/// Walk a node's children depth-first, packing siblings into spans up
/// to `desired_length` bytes. Mirrors `_merge_node_inner` from
/// `core.py:65`. Returns byte spans `(start, end)`.
fn merge_node_inner(node: Node<'_>, desired_length: usize) -> Vec<(usize, usize)> {
    let mut cursor = node.walk();
    if !cursor.goto_first_child() {
        // Leaf node — single span.
        return vec![(node.start_byte(), node.end_byte())];
    }

    let mut groups: Vec<(usize, usize)> = Vec::new();
    loop {
        let child = cursor.node();
        let start = child.start_byte();
        let mut end = child.end_byte();
        let mut length = end - start;

        if length > desired_length {
            // Recurse into oversized children.
            groups.extend(merge_node_inner(child, desired_length));
            if !cursor.goto_next_sibling() {
                break;
            }
            continue;
        }

        // Extend the current group with subsequent siblings that fit.
        while let Some(next) = peek_next_sibling(&cursor) {
            let next_length = next.end_byte() - next.start_byte();
            if length + next_length > desired_length {
                break;
            }
            cursor.goto_next_sibling();
            end = cursor.node().end_byte();
            length += next_length;
        }

        groups.push((start, end));
        if !cursor.goto_next_sibling() {
            break;
        }
    }
    groups
}

/// Peek at the next sibling without advancing the cursor.
///
/// `TreeCursor` has no native peek; clone the cursor, advance, and read
/// the node. Tree-sitter cursors are cheap to clone.
fn peek_next_sibling<'a>(cursor: &TreeCursor<'a>) -> Option<Node<'a>> {
    let mut probe = cursor.clone();
    if probe.goto_next_sibling() {
        Some(probe.node())
    } else {
        None
    }
}

/// Merge adjacent chunks up to `desired_length`. Mirrors
/// `_merge_adjacent_chunks` from `core.py:35`. Operates on byte spans.
fn merge_adjacent_chunks(
    chunks: Vec<ChunkBoundary>,
    desired_length: usize,
    source: &str,
) -> Vec<ChunkBoundary> {
    if chunks.is_empty() {
        return chunks;
    }
    let mut merged: Vec<ChunkBoundary> = Vec::new();
    let mut current = chunks[0].clone();
    let mut current_length = current.end_byte - current.start_byte;

    for next in chunks.into_iter().skip(1) {
        let nlen = next.end_byte - next.start_byte;
        if current_length + nlen > desired_length {
            merged.push(current);
            current = next;
            current_length = nlen;
        } else {
            current.end_byte = next.end_byte;
            current.end_line = next.end_line;
            current_length += nlen;
        }
    }
    merged.push(current);
    // Recompute end_line at boundaries to be safe (the byte-span recipe
    // can leave end_line stale when merging across multi-line blocks).
    for b in &mut merged {
        b.end_line = line_at_byte(source, b.end_byte.saturating_sub(1).max(b.start_byte));
    }
    merged
}

#[cfg(test)]
mod tests {
    use super::*;

    /// `test:chunker-line-fallback` — line-merge fallback packs lines up
    /// to the budget; lines longer than the budget get their own chunk.
    #[test]
    fn chunker_line_fallback() {
        let source = "aaa\nbbbbb\nccc\nddd\n";
        // budget=8 should merge "aaa\n" (4) + "ccc\n" (4)? No — adjacency.
        // Actually: line1=4, line2=6 (bbbbb\n), line3=4, line4=4.
        // current=line1 (len 4). next line2 (6). 4+6=10 > 8, flush. current=line2.
        // next line3 (4). 6+4=10 > 8, flush. current=line3.
        // next line4 (4). 4+4=8 <= 8, merge. current=line3+line4 (8).
        let chunks = chunk_lines(source, 8);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].content(source), "aaa\n");
        assert_eq!(chunks[1].content(source), "bbbbb\n");
        assert_eq!(chunks[2].content(source), "ccc\nddd\n");
    }

    /// Empty input returns empty.
    #[test]
    fn empty_input_returns_empty() {
        assert_eq!(chunk_lines("", 100), Vec::<ChunkBoundary>::new());
        assert_eq!(chunk_lines("   \n  ", 100), Vec::<ChunkBoundary>::new());
        assert_eq!(chunk_source("", None, 100), Vec::<ChunkBoundary>::new());
    }

    /// `test:chunker-merge-adjacent` — adjacent small AST nodes pack
    /// together up to the byte budget. Uses a Python source that has
    /// short adjacent function definitions.
    #[test]
    fn chunker_merge_adjacent() {
        let lang: Language = tree_sitter_python::LANGUAGE.into();
        // Three small functions; combined ~80 chars. With budget 200 they should
        // merge into one chunk. With budget 50, each stands alone.
        let source = "def a():\n    return 1\n\ndef b():\n    return 2\n\ndef c():\n    return 3\n";
        let combined = chunk_source(source, Some(&lang), 200);
        let split = chunk_source(source, Some(&lang), 50);
        assert!(
            combined.len() <= split.len(),
            "larger budget should produce fewer-or-equal chunks; got combined={} split={}",
            combined.len(),
            split.len()
        );
        // The content cover the whole source modulo tree-sitter trailing-whitespace
        // semantics.
        let combined_chars: usize = combined.iter().map(|c| c.end_byte - c.start_byte).sum();
        assert!(
            combined_chars >= source.len() - 4,
            "chunks should cover ~all of source ({combined_chars} of {})",
            source.len()
        );
    }

    /// `test:chunker-recurses-oversized-nodes` — a single oversized
    /// node splits into multiple smaller chunks.
    #[test]
    fn chunker_recurses_oversized_nodes() {
        let lang: Language = tree_sitter_python::LANGUAGE.into();
        // Build a function with many statements so its body exceeds the budget;
        // tree-sitter should let merge_node_inner recurse into the function body
        // and split it.
        let mut body = String::new();
        for i in 0..40 {
            use std::fmt::Write;
            let _ = writeln!(&mut body, "    x{i} = {i}");
        }
        let source = format!("def big_function():\n{body}");
        let chunks = chunk_source(&source, Some(&lang), 100);
        assert!(
            chunks.len() > 1,
            "oversized function should split into multiple chunks; got {}",
            chunks.len()
        );
    }

    /// `property:chunker-parity-python` — chunks cover the full source
    /// without overlap and without leaving gaps (modulo whitespace).
    #[test]
    fn property_chunker_parity_python() {
        let lang: Language = tree_sitter_python::LANGUAGE.into();
        let source = "def a():\n    pass\n\ndef b():\n    pass\n\nclass C:\n    def m(self):\n        pass\n";
        let chunks = chunk_source(source, Some(&lang), 1500);
        // Coverage: chunks should be non-overlapping when sorted by start_byte.
        let mut prev_end = 0;
        for c in &chunks {
            assert!(
                c.start_byte >= prev_end,
                "chunk overlap: prev_end={prev_end} c.start_byte={}",
                c.start_byte
            );
            prev_end = c.end_byte;
        }
        assert!(prev_end <= source.len(), "chunks extend past source");
    }
}