pub mod character;
pub mod code;
pub mod html;
pub mod json_splitter;
pub mod markdown;
pub mod recursive;
pub mod sentence;
pub mod token_aware;
pub use character::{CharacterSplitter, LengthFn};
pub use code::{CodeLanguage, CodeSplitter};
pub use html::HtmlSplitter;
pub use json_splitter::JsonSplitter;
pub use markdown::MarkdownSplitter;
pub use recursive::RecursiveCharSplitter;
pub use sentence::SentenceSplitter;
pub use token_aware::{CharTokenizer, FnTokenizer, TokenAwareSplitter, Tokenizer};
use crate::document::Document;
/// Common interface for types that break a [`Document`] into smaller documents.
///
/// Implementors must be `Send + Sync`, as required by the trait's supertrait bounds.
pub trait TextSplitter: Send + Sync {
    /// Splits a single document and returns the resulting documents.
    ///
    /// How the split is performed is defined entirely by the implementor.
    fn split(&self, doc: &Document) -> Vec<Document>;

    /// Splits every document in `docs` and returns all results in one flat vector.
    ///
    /// Documents are processed in slice order, and the output of each
    /// [`split`](TextSplitter::split) call is appended in the order it was returned.
    /// An empty slice yields an empty vector.
    fn split_all(&self, docs: &[Document]) -> Vec<Document> {
        let mut pieces = Vec::new();
        for source in docs {
            pieces.extend(self.split(source));
        }
        pieces
    }
}
/// Builds a chunk document derived from `parent`.
///
/// The returned document has:
/// - `id` set to `None`,
/// - `content` set to the supplied `content`,
/// - a clone of the parent's metadata with `chunk_index` stored as a JSON number
///   under the `"chunk_index"` key.
///
/// `parent` itself is not modified; only the cloned metadata receives the new entry.
pub(crate) fn child_doc(parent: &Document, content: String, chunk_index: usize) -> Document {
    // Build the JSON value up front so the map insertion below reads as a single step.
    let index_value = serde_json::Value::Number(serde_json::Number::from(chunk_index));

    let mut metadata = parent.metadata.clone();
    metadata.insert("chunk_index".into(), index_value);

    Document {
        id: None,
        content,
        metadata,
    }
}