synaptic_retrieval/
lib.rs1mod bm25;
2mod compression;
3mod ensemble;
4mod multi_query;
5mod parent_document;
6mod self_query;
7
8pub use bm25::BM25Retriever;
9pub use compression::{ContextualCompressionRetriever, DocumentCompressor, EmbeddingsFilter};
10pub use ensemble::EnsembleRetriever;
11pub use multi_query::MultiQueryRetriever;
12pub use parent_document::ParentDocumentRetriever;
13pub use self_query::{MetadataFieldInfo, SelfQueryRetriever};
14
15use std::collections::HashSet;
16
17use async_trait::async_trait;
18use synaptic_core::SynapticError;
19
20pub use synaptic_core::{Document, Retriever};
22
23#[derive(Debug, Clone)]
25pub struct InMemoryRetriever {
26 documents: Vec<Document>,
27}
28
29impl InMemoryRetriever {
30 pub fn new(documents: Vec<Document>) -> Self {
31 Self { documents }
32 }
33}
34
35#[async_trait]
36impl Retriever for InMemoryRetriever {
37 async fn retrieve(&self, query: &str, top_k: usize) -> Result<Vec<Document>, SynapticError> {
38 let query_terms = tokenize(query);
39 let mut scored: Vec<(usize, &Document)> = self
40 .documents
41 .iter()
42 .map(|doc| {
43 let terms = tokenize(&doc.content);
44 let score = query_terms.intersection(&terms).count();
45 (score, doc)
46 })
47 .collect();
48
49 scored.sort_by(|a, b| b.0.cmp(&a.0));
50 Ok(scored
51 .into_iter()
52 .filter(|(score, _)| *score > 0)
53 .take(top_k)
54 .map(|(_, doc)| doc.clone())
55 .collect())
56 }
57}
58
/// Splits `input` on whitespace and returns the set of distinct terms, each
/// converted to ASCII lowercase.
///
/// Only ASCII letters are case-folded. Non-ASCII characters and punctuation
/// are left untouched, so `"rust,"` and `"rust"` are different terms.
/// An empty or whitespace-only input yields an empty set.
pub(crate) fn tokenize(input: &str) -> HashSet<String> {
    input
        .split_whitespace()
        .map(str::to_ascii_lowercase)
        .collect()
}
65
/// Splits `input` on whitespace and returns the terms in order of appearance,
/// each converted to ASCII lowercase.
///
/// Repeated terms are kept, so the result has one entry per
/// whitespace-separated token. Only ASCII letters are case-folded;
/// punctuation and non-ASCII characters are left untouched.
pub(crate) fn tokenize_to_vec(input: &str) -> Vec<String> {
    input
        .split_whitespace()
        .map(str::to_ascii_lowercase)
        .collect()
}
73}