synaptic_splitters/
lib.rs1mod character;
2mod html_header;
3pub mod language;
4mod markdown;
5mod recursive;
6mod token;
7
8pub use character::CharacterTextSplitter;
9pub use html_header::HtmlHeaderTextSplitter;
10pub use language::Language;
11pub use markdown::{HeaderType, MarkdownHeaderTextSplitter};
12pub use recursive::RecursiveCharacterTextSplitter;
13pub use token::TokenTextSplitter;
14
15pub use synaptic_core::Document;
17
18pub trait TextSplitter: Send + Sync {
20 fn split_text(&self, text: &str) -> Vec<String>;
22
23 fn split_documents(&self, docs: Vec<Document>) -> Vec<Document> {
26 let mut result = Vec::new();
27 for doc in docs {
28 let chunks = self.split_text(&doc.content);
29 for (i, chunk) in chunks.into_iter().enumerate() {
30 let mut metadata = doc.metadata.clone();
31 metadata.insert(
32 "chunk_index".to_string(),
33 serde_json::Value::Number(i.into()),
34 );
35 result.push(Document::with_metadata(
36 format!("{}-chunk-{i}", doc.id),
37 chunk,
38 metadata,
39 ));
40 }
41 }
42 result
43 }
44}