kalosm_language/search/preprocessing/
mod.rs1use kalosm_language_model::Embedder;
16
17use crate::context::Document;
18
19use super::Chunk;
20
21mod chunking;
22pub use chunking::*;
23mod hypothetical;
24pub use hypothetical::*;
25mod summary;
26pub use summary::*;
27mod sentence;
28pub use sentence::*;
29mod semantic;
30pub use semantic::*;
31mod html;
32pub use html::*;
33
34pub trait Chunker {
36 type Error<E: Send + Sync + 'static>;
38
39 fn chunk<E: Embedder + Send>(
41 &self,
42 document: &Document,
43 embedder: &E,
44 ) -> impl std::future::Future<Output = Result<Vec<Chunk>, Self::Error<E::Error>>> + Send;
45
46 fn chunk_batch<'a, I, E: Embedder + Send>(
48 &self,
49 documents: I,
50 embedder: &E,
51 ) -> impl std::future::Future<Output = Result<Vec<Vec<Chunk>>, Self::Error<E::Error>>> + Send
52 where
53 I: IntoIterator<Item = &'a Document> + Send,
54 I::IntoIter: Send,
55 Self: Sync,
56 {
57 async {
58 let mut chunks = Vec::new();
59 for document in documents {
60 chunks.push(self.chunk(document, embedder).await?);
61 }
62 Ok(chunks)
63 }
64 }
65}