pub mod chunk;
pub mod error;
pub mod merge;
pub mod split;
pub mod character;
pub mod recursive;
pub mod semantic;
pub mod semchunk;
pub mod sentence;
pub mod sizing;
pub mod structure;
pub mod token;
pub use character::CharacterTextSplitter;
pub use chunk::{ChunkMetadata, TextChunk, TextChunkIter};
pub use error::ChunkError;
pub use recursive::RecursiveCharacterTextSplitter;
pub use semantic::SemanticChunker;
pub use semchunk::SemchunkSplitter;
pub use sentence::SentenceChunker;
pub use sizing::{ByteSizer, CharSizer, ChunkConfig, ChunkSizer, FunctionSizer, WordSizer};
pub use split::KeepSeparator;
pub use structure::{HtmlChunker, MarkdownChunker, XmlChunker};
pub use token::{TokenBoundaryProvider, TokenChunker, TokenSpan};
#[cfg(feature = "code")]
pub use structure::{CodeChunker, CodeLanguage};
#[cfg(feature = "unicode-segmentation")]
pub use sizing::{GraphemeSizer, UnicodeWordSizer};
/// Shared, thread-safe callable that maps a string slice to a `usize`.
///
/// The `Arc` makes the handle cheap to clone, and the `Send + Sync` bounds
/// allow it to be used across threads.
// NOTE(review): presumably this measures text length for chunk sizing — confirm against callers.
pub type LengthFn = std::sync::Arc<dyn Fn(&str) -> usize + Send + Sync>;
/// Shared, thread-safe callable that maps a single string slice to a `Vec<f32>`.
///
/// Unlike [`Embedder`], this callable takes one input at a time and has no
/// error channel in its signature.
pub type EmbeddingFn = std::sync::Arc<dyn Fn(&str) -> Vec<f32> + Send + Sync>;
/// Reference-counted handle to a type-erased [`Embedder`] implementation.
pub type EmbedderHandle = std::sync::Arc<dyn Embedder>;
/// Synchronous batch embedding interface.
///
/// Implementors must be `Send + Sync`, so they can be shared across threads
/// (for example behind an [`EmbedderHandle`]).
pub trait Embedder: Send + Sync {
/// Embeds a batch of string slices, returning a list of `f32` vectors.
///
/// # Errors
///
/// Returns a [`ChunkError`] when the implementation fails to produce
/// embeddings.
// NOTE(review): presumably the result holds one vector per input, in input
// order — the signature does not enforce this; confirm with implementors.
fn embed_batch(&self, inputs: &[&str]) -> Result<Vec<Vec<f32>>, ChunkError>;
}
/// Blanket implementation so that any thread-safe callable with the signature
/// `Fn(&[&str]) -> Result<Vec<Vec<f32>>, ChunkError>` can be used as an
/// [`Embedder`] directly (plain functions and closures alike).
impl<F: Fn(&[&str]) -> Result<Vec<Vec<f32>>, ChunkError> + Send + Sync> Embedder for F {
    /// Forwards the batch to the wrapped callable and returns its result unchanged.
    fn embed_batch(&self, inputs: &[&str]) -> Result<Vec<Vec<f32>>, ChunkError> {
        let embed_fn: &F = self;
        embed_fn(inputs)
    }
}
/// Asynchronous counterpart of the batch embedding interface.
///
/// Only compiled when the `async` cargo feature is enabled. Implementors must
/// be `Send + Sync`.
#[cfg(feature = "async")]
pub trait AsyncEmbedder: Send + Sync {
/// Embeds a batch of string slices, returning a future that resolves to a
/// list of `f32` vectors or a [`ChunkError`].
///
/// The returned future is required to be `Send`.
// NOTE(review): presumably the resolved value holds one vector per input, in
// input order — the signature does not enforce this; confirm with implementors.
fn embed_batch(
&self,
inputs: &[&str],
) -> impl std::future::Future<Output = Result<Vec<Vec<f32>>, ChunkError>> + Send;
}
/// Returns the number of `char`s (Unicode scalar values) in `s`.
///
/// This counts scalar values, not bytes and not grapheme clusters, so a
/// multi-byte character counts as one and a base letter followed by a
/// combining mark counts as two.
pub fn char_len(s: &str) -> usize {
    let scalar_values = s.chars();
    scalar_values.count()
}