// cognis_rag/splitters/mod.rs
1//! Text splitters — chunk a [`Document`] into smaller `Document`s suitable for
2//! embedding.
3
4pub mod character;
5pub mod code;
6pub mod html;
7pub mod json_splitter;
8pub mod markdown;
9pub mod recursive;
10pub mod sentence;
11pub mod token_aware;
12
13pub use character::{CharacterSplitter, LengthFn};
14pub use code::{CodeLanguage, CodeSplitter};
15pub use html::HtmlSplitter;
16pub use json_splitter::JsonSplitter;
17pub use markdown::MarkdownSplitter;
18pub use recursive::RecursiveCharSplitter;
19pub use sentence::SentenceSplitter;
20pub use token_aware::{CharTokenizer, FnTokenizer, TokenAwareSplitter, Tokenizer};
21
22use crate::document::Document;
23
24/// A text splitter takes a document and emits chunk-sized documents.
25///
26/// Implementations preserve the source's metadata on each chunk and add a
27/// `chunk_index` field so callers can re-order if needed.
28pub trait TextSplitter: Send + Sync {
29    /// Split `doc` into chunks.
30    fn split(&self, doc: &Document) -> Vec<Document>;
31
32    /// Convenience: split many documents and concatenate the chunks.
33    fn split_all(&self, docs: &[Document]) -> Vec<Document> {
34        docs.iter().flat_map(|d| self.split(d)).collect()
35    }
36}
37
38/// Helper used by every splitter to copy a parent document's metadata onto a
39/// chunk and tag it with `chunk_index`.
40pub(crate) fn child_doc(parent: &Document, content: String, chunk_index: usize) -> Document {
41    let mut metadata = parent.metadata.clone();
42    metadata.insert(
43        "chunk_index".into(),
44        serde_json::Value::Number(chunk_index.into()),
45    );
46    Document {
47        id: None,
48        content,
49        metadata,
50    }
51}