Skip to main content

synaptic_splitters/
lib.rs

1mod character;
2mod html_header;
3pub mod language;
4mod markdown;
5mod recursive;
6mod token;
7
8pub use character::CharacterTextSplitter;
9pub use html_header::HtmlHeaderTextSplitter;
10pub use language::Language;
11pub use markdown::{HeaderType, MarkdownHeaderTextSplitter};
12pub use recursive::RecursiveCharacterTextSplitter;
13pub use token::TokenTextSplitter;
14
15// Re-export Document from core for backward compatibility
16pub use synaptic_core::Document;
17
18/// Trait for splitting text into chunks.
19pub trait TextSplitter: Send + Sync {
20    /// Split a string into chunks.
21    fn split_text(&self, text: &str) -> Vec<String>;
22
23    /// Split documents by splitting each document's content and producing
24    /// new documents for each chunk. Metadata is preserved on each chunk.
25    fn split_documents(&self, docs: Vec<Document>) -> Vec<Document> {
26        let mut result = Vec::new();
27        for doc in docs {
28            let chunks = self.split_text(&doc.content);
29            for (i, chunk) in chunks.into_iter().enumerate() {
30                let mut metadata = doc.metadata.clone();
31                metadata.insert(
32                    "chunk_index".to_string(),
33                    serde_json::Value::Number(i.into()),
34                );
35                result.push(Document::with_metadata(
36                    format!("{}-chunk-{i}", doc.id),
37                    chunk,
38                    metadata,
39                ));
40            }
41        }
42        result
43    }
44}