// code_chunker/lib.rs
1#![warn(missing_docs)]
2//! # code-chunker
3//!
4//! AST-aware code chunking and late chunking for RAG pipelines.
5//!
6//! ## Two primitives
7//!
8//! ### `CodeChunker` — split source code at AST boundaries
9//!
10//! Tree-sitter walks the parse tree and produces chunks aligned to
11//! function, class, impl, and module boundaries. When a node fits the
12//! configured size budget it is kept intact; oversize nodes are split
13//! recursively at structural separators. Supports Rust, Python,
14//! TypeScript/JavaScript, and Go (behind the `code` feature).
15//!
16//! ### `LateChunkingPooler` — pool token embeddings into chunk embeddings
17//!
18//! Late chunking (Günther et al. 2024, arXiv:2409.04701) embeds the full
19//! document first so every token attends to the rest of the document,
20//! then mean-pools token embeddings inside each chunk's byte span. The
21//! result is a per-chunk embedding that carries document-wide context —
22//! pronouns, anaphora, and acronym definitions are no longer lost at
23//! chunk boundaries.
24//!
25//! `LateChunkingPooler` is span-only: bring your own boundaries from any
26//! source — `CodeChunker`, `text-splitter`, regex, or hand-built `Slab`s.
27//!
28//! ## What this crate does not do
29//!
30//! - **General-purpose text chunking.** Use [`text-splitter`](https://crates.io/crates/text-splitter)
31//! for fixed/sentence/recursive prose splitting; it's the de-facto Rust
32//! standard with broader Unicode and tokenizer support.
33//! - **Format conversion (PDF, HTML, DOCX).** Input is `&str`. Use
34//! [`deformat`](https://crates.io/crates/deformat) or
35//! [`pdf-extract`](https://crates.io/crates/pdf-extract) upstream.
36//! - **Embedding generation.** `LateChunkingPooler` consumes
37//! pre-computed token embeddings; bring your own long-context model
38//! (Jina v2/v3, nomic-embed-text, candle, ort).
39//! - **Vector store integration.** [`Slab`] is the boundary; enable the
40//! `serde` feature and wire to qdrant-client, lancedb, sqlx, etc. yourself.
41//!
42//! ## Quick start (code chunking)
43//!
44//! ```ignore
45//! use code_chunker::{Chunker, CodeChunker, CodeLanguage};
46//!
47//! let chunker = CodeChunker::new(CodeLanguage::Rust, 1500, 0);
48//! let slabs = chunker.chunk(source_code);
49//! ```
50//!
51//! ## Quick start (late chunking)
52//!
53//! ```ignore
54//! use code_chunker::{LateChunkingPooler, Slab};
55//!
56//! // Bring your own chunk boundaries (text-splitter, CodeChunker, ...).
57//! let chunks: Vec<Slab> = my_chunker(&document);
58//!
59//! // Embed the full document with a long-context model.
60//! let token_embeddings: Vec<Vec<f32>> = my_model.embed_tokens(&document);
61//!
62//! // Pool token embeddings into per-chunk embeddings.
63//! let pooler = LateChunkingPooler::new(384);
64//! let chunk_embeddings = pooler.pool(&token_embeddings, &chunks, document.len());
65//! ```
66
67mod error;
68mod late;
69mod sizer;
70mod slab;
71
72#[cfg(feature = "code")]
73mod code;
74#[cfg(feature = "code")]
75mod recursive;
76
77pub use error::{Error, Result};
78pub use late::LateChunkingPooler;
79pub use sizer::{ByteSizer, ChunkSizer};
80pub use slab::{compute_char_offsets, Slab};
81
82#[cfg(feature = "code")]
83pub use code::{CodeChunker, CodeLanguage};
84
85/// A chunking strategy: text in, [`Slab`]s out.
86///
87/// Implementors override [`chunk_bytes`](Chunker::chunk_bytes); the default
88/// [`chunk`](Chunker::chunk) method adds Unicode character offsets.
89///
90/// This crate only ships one public chunker — [`CodeChunker`] — but the
91/// trait is public so users can wrap external chunkers (text-splitter,
92/// regex, custom logic) and feed the output into [`LateChunkingPooler`].
93pub trait Chunker: Send + Sync {
94 /// Core chunking implementation returning [`Slab`]s with byte offsets only.
95 ///
96 /// Implementors override this method. Users should call [`chunk`](Chunker::chunk)
97 /// instead, which adds character offsets automatically.
98 fn chunk_bytes(&self, text: &str) -> Vec<Slab>;
99
100 /// Split text into chunks with both byte and character offsets.
101 ///
102 /// This calls [`chunk_bytes`](Chunker::chunk_bytes) and then computes
103 /// Unicode character offsets on every slab. Users get correct `char_start`
104 /// and `char_end` without manual work.
105 fn chunk(&self, text: &str) -> Vec<Slab> {
106 let mut slabs = self.chunk_bytes(text);
107 compute_char_offsets(text, &mut slabs);
108 slabs
109 }
110
111 /// Estimate the number of chunks for a given text length.
112 ///
113 /// Useful for pre-allocation. May be approximate.
114 fn estimate_chunks(&self, text_len: usize) -> usize {
115 // Conservative default
116 (text_len / 500).max(1)
117 }
118}