cognis_rag/splitters/
mod.rs1pub mod character;
5pub mod code;
6pub mod html;
7pub mod json_splitter;
8pub mod markdown;
9pub mod recursive;
10pub mod sentence;
11pub mod token_aware;
12
13pub use character::{CharacterSplitter, LengthFn};
14pub use code::{CodeLanguage, CodeSplitter};
15pub use html::HtmlSplitter;
16pub use json_splitter::JsonSplitter;
17pub use markdown::MarkdownSplitter;
18pub use recursive::RecursiveCharSplitter;
19pub use sentence::SentenceSplitter;
20pub use token_aware::{CharTokenizer, FnTokenizer, TokenAwareSplitter, Tokenizer};
21
22use crate::document::Document;
23
24pub trait TextSplitter: Send + Sync {
29 fn split(&self, doc: &Document) -> Vec<Document>;
31
32 fn split_all(&self, docs: &[Document]) -> Vec<Document> {
34 docs.iter().flat_map(|d| self.split(d)).collect()
35 }
36}
37
38pub(crate) fn child_doc(parent: &Document, content: String, chunk_index: usize) -> Document {
41 let mut metadata = parent.metadata.clone();
42 metadata.insert(
43 "chunk_index".into(),
44 serde_json::Value::Number(chunk_index.into()),
45 );
46 Document {
47 id: None,
48 content,
49 metadata,
50 }
51}