kalosm_language/search/preprocessing/
mod.rs

// Retrieval Strategies:
// 1. Simple keyword search
// 2. Vector search
//  1. Search by sentence, return window around match
//  2. Search by summary, return document
//  3. Search through document tree
//  4. Search by questions that may be answered by the document
//  5. Classify documents, search by class
//
// Context extraction strategies:
// 1. Dump all sentences
// 2. Dump all sentences that mention an entity
// 3. Extract relevant sentences with an LLM
14
15use kalosm_language_model::Embedder;
16
17use crate::context::Document;
18
19use super::Chunk;
20
21mod chunking;
22pub use chunking::*;
23mod hypothetical;
24pub use hypothetical::*;
25mod summary;
26pub use summary::*;
27mod sentence;
28pub use sentence::*;
29mod semantic;
30pub use semantic::*;
31mod html;
32pub use html::*;
33
34/// A strategy for chunking a document into smaller pieces.
35pub trait Chunker {
36    /// The error type that can occur when chunking a document.
37    type Error<E: Send + Sync + 'static>;
38
39    /// Chunk a document into embedded snippets.
40    fn chunk<E: Embedder + Send>(
41        &self,
42        document: &Document,
43        embedder: &E,
44    ) -> impl std::future::Future<Output = Result<Vec<Chunk>, Self::Error<E::Error>>> + Send;
45
46    /// Chunk a batch of documents into embedded snippets.
47    fn chunk_batch<'a, I, E: Embedder + Send>(
48        &self,
49        documents: I,
50        embedder: &E,
51    ) -> impl std::future::Future<Output = Result<Vec<Vec<Chunk>>, Self::Error<E::Error>>> + Send
52    where
53        I: IntoIterator<Item = &'a Document> + Send,
54        I::IntoIter: Send,
55        Self: Sync,
56    {
57        async {
58            let mut chunks = Vec::new();
59            for document in documents {
60                chunks.push(self.chunk(document, embedder).await?);
61            }
62            Ok(chunks)
63        }
64    }
65}