synaptic_splitters/
lib.rs1mod character;
2mod html_header;
3pub mod language;
4mod markdown;
5mod recursive;
6mod token;
7
8pub use character::CharacterTextSplitter;
9pub use html_header::HtmlHeaderTextSplitter;
10pub use language::Language;
11pub use markdown::{HeaderType, MarkdownHeaderTextSplitter};
12pub use recursive::RecursiveCharacterTextSplitter;
13pub use token::TokenTextSplitter;
14
15use synaptic_retrieval::Document;
16
17pub trait TextSplitter: Send + Sync {
19 fn split_text(&self, text: &str) -> Vec<String>;
21
22 fn split_documents(&self, docs: Vec<Document>) -> Vec<Document> {
25 let mut result = Vec::new();
26 for doc in docs {
27 let chunks = self.split_text(&doc.content);
28 for (i, chunk) in chunks.into_iter().enumerate() {
29 let mut metadata = doc.metadata.clone();
30 metadata.insert(
31 "chunk_index".to_string(),
32 serde_json::Value::Number(i.into()),
33 );
34 result.push(Document::with_metadata(
35 format!("{}-chunk-{i}", doc.id),
36 chunk,
37 metadata,
38 ));
39 }
40 }
41 result
42 }
43}