julienne 0.1.0

Range-preserving Rust text chunkers for retrieval and embedding pipelines
Documentation
//! Julienne is a Rust library for cutting text into range-preserving chunks.
//!
//! It provides simple separator splitters, recursive and sentence-aware
//! splitters, semantic chunking, token-window chunking, and structure-aware
//! chunkers for Markdown, HTML/XML, and optional tree-sitter-backed code input.
//!
//! Structured chunk APIs return [`TextChunk`] values whose `text` field is a
//! zero-copy slice of the original input. The offset invariant for every
//! structured chunk is:
//!
//! ```text
//! &input[chunk.start_byte..chunk.end_byte] == chunk.text
//! ```
//!
//! Iterator APIs named `chunks` stream structured chunks where the algorithm can
//! operate incrementally. `split_chunks` collects those chunks, and `split_text`
//! projects them into owned strings for convenience.
//!
//! # Quick start
//!
//! ```
//! use julienne::SemchunkSplitter;
//!
//! let splitter = SemchunkSplitter::new(200, 40);
//! let chunks = splitter.split_text("Julienne keeps chunking small, explicit, and provenance-safe.");
//! assert!(!chunks.is_empty());
//! ```

pub mod chunk;
pub mod error;
pub mod merge;
pub mod split;

pub mod character;
pub mod recursive;
pub mod semantic;
pub mod semchunk;
pub mod sentence;
pub mod sizing;
pub mod structure;
pub mod token;

pub use character::CharacterTextSplitter;
pub use chunk::{ChunkMetadata, TextChunk, TextChunkIter};
pub use error::ChunkError;
pub use recursive::RecursiveCharacterTextSplitter;
pub use semantic::SemanticChunker;
pub use semchunk::SemchunkSplitter;
pub use sentence::SentenceChunker;
pub use sizing::{ByteSizer, CharSizer, ChunkConfig, ChunkSizer, FunctionSizer, WordSizer};
pub use split::KeepSeparator;
pub use structure::{HtmlChunker, MarkdownChunker, XmlChunker};
pub use token::{TokenBoundaryProvider, TokenChunker, TokenSpan};

#[cfg(feature = "code")]
pub use structure::{CodeChunker, CodeLanguage};

#[cfg(feature = "unicode-segmentation")]
pub use sizing::{GraphemeSizer, UnicodeWordSizer};

/// A custom length function for text splitting (e.g. token counting).
///
/// The callable takes a string slice and returns its length as a `usize`. It is
/// held in an [`Arc`](std::sync::Arc) and bounded by `Send + Sync`, so one
/// handle can be shared between owners and across threads.
pub type LengthFn = std::sync::Arc<dyn Fn(&str) -> usize + Send + Sync>;
/// A shared callable that maps a single string slice to a `Vec<f32>`.
///
/// Like [`LengthFn`], it is held in an [`Arc`](std::sync::Arc) and bounded by
/// `Send + Sync`.
///
/// NOTE(review): presumably the returned vector is the embedding of the input
/// text; the expected dimensionality is not visible here — confirm at call sites.
pub type EmbeddingFn = std::sync::Arc<dyn Fn(&str) -> Vec<f32> + Send + Sync>;
/// A shared, dynamically dispatched handle to any [`Embedder`] implementation.
pub type EmbedderHandle = std::sync::Arc<dyn Embedder>;

/// A synchronous, batch-oriented source of `f32` vectors for text inputs.
///
/// Implementors must be `Send + Sync`. Any `Send + Sync` closure or function
/// with the signature `Fn(&[&str]) -> Result<Vec<Vec<f32>>, ChunkError>`
/// implements this trait through the blanket impl in this module.
pub trait Embedder: Send + Sync {
    /// Processes a whole batch of string slices in a single call.
    ///
    /// # Errors
    ///
    /// Returns a [`ChunkError`] when the implementation fails to produce a
    /// result for the batch.
    ///
    /// NOTE(review): presumably yields one vector per input, in input order —
    /// this is not enforced by the signature; confirm against callers.
    fn embed_batch(&self, inputs: &[&str]) -> Result<Vec<Vec<f32>>, ChunkError>;
}

/// Blanket implementation that lets any thread-safe callable with the
/// `embed_batch` signature be used wherever an [`Embedder`] is expected.
impl<F: Fn(&[&str]) -> Result<Vec<Vec<f32>>, ChunkError> + Send + Sync> Embedder for F {
    /// Forwards `inputs` to the wrapped callable and returns its result as-is.
    fn embed_batch(&self, inputs: &[&str]) -> Result<Vec<Vec<f32>>, ChunkError> {
        // No batching, caching, or error mapping happens here: the callable's
        // `Ok`/`Err` value is handed straight back to the caller.
        (self)(inputs)
    }
}

/// Asynchronous counterpart of [`Embedder`].
///
/// Only compiled when the `async` cargo feature is enabled. Implementors must
/// be `Send + Sync`.
#[cfg(feature = "async")]
pub trait AsyncEmbedder: Send + Sync {
    /// Processes a whole batch of string slices, returning a future that
    /// resolves to the batch result.
    ///
    /// The returned future is required to be `Send`, which is why the method is
    /// declared with an explicit `impl Future<...> + Send` return type.
    ///
    /// # Errors
    ///
    /// The future resolves to a [`ChunkError`] when the implementation fails to
    /// produce a result for the batch.
    ///
    /// NOTE(review): presumably yields one vector per input, in input order —
    /// this is not enforced by the signature; confirm against callers.
    fn embed_batch(
        &self,
        inputs: &[&str],
    ) -> impl std::future::Future<Output = Result<Vec<Vec<f32>>, ChunkError>> + Send;
}

/// Default length function: the number of Unicode scalar values (`char`s) in `s`.
///
/// This is neither the byte length nor a grapheme-cluster count: a multi-byte
/// character counts once, while a base letter followed by a combining mark
/// counts as two. An empty string yields `0`.
pub fn char_len(s: &str) -> usize {
    // Tally one per decoded `char`; equivalent to counting the iterator.
    s.chars().fold(0, |total, _| total + 1)
}