Skip to main content

synaptic_splitters/
lib.rs

1mod character;
2mod html_header;
3pub mod language;
4mod markdown;
5mod recursive;
6mod token;
7
8pub use character::CharacterTextSplitter;
9pub use html_header::HtmlHeaderTextSplitter;
10pub use language::Language;
11pub use markdown::{HeaderType, MarkdownHeaderTextSplitter};
12pub use recursive::RecursiveCharacterTextSplitter;
13pub use token::TokenTextSplitter;
14
15use synaptic_retrieval::Document;
16
17/// Trait for splitting text into chunks.
18pub trait TextSplitter: Send + Sync {
19    /// Split a string into chunks.
20    fn split_text(&self, text: &str) -> Vec<String>;
21
22    /// Split documents by splitting each document's content and producing
23    /// new documents for each chunk. Metadata is preserved on each chunk.
24    fn split_documents(&self, docs: Vec<Document>) -> Vec<Document> {
25        let mut result = Vec::new();
26        for doc in docs {
27            let chunks = self.split_text(&doc.content);
28            for (i, chunk) in chunks.into_iter().enumerate() {
29                let mut metadata = doc.metadata.clone();
30                metadata.insert(
31                    "chunk_index".to_string(),
32                    serde_json::Value::Number(i.into()),
33                );
34                result.push(Document::with_metadata(
35                    format!("{}-chunk-{i}", doc.id),
36                    chunk,
37                    metadata,
38                ));
39            }
40        }
41        result
42    }
43}