Skip to main content

synaptic_retrieval/
lib.rs

1mod bm25;
2mod compression;
3mod ensemble;
4mod multi_query;
5mod parent_document;
6mod self_query;
7
8pub use bm25::BM25Retriever;
9pub use compression::{ContextualCompressionRetriever, DocumentCompressor, EmbeddingsFilter};
10pub use ensemble::EnsembleRetriever;
11pub use multi_query::MultiQueryRetriever;
12pub use parent_document::ParentDocumentRetriever;
13pub use self_query::{MetadataFieldInfo, SelfQueryRetriever};
14
15use std::collections::HashSet;
16
17use async_trait::async_trait;
18use synaptic_core::SynapticError;
19
20// Re-export Document and Retriever from core for backward compatibility
21pub use synaptic_core::{Document, Retriever};
22
23/// A simple retriever that stores documents in memory and returns all of them for any query.
24#[derive(Debug, Clone)]
25pub struct InMemoryRetriever {
26    documents: Vec<Document>,
27}
28
29impl InMemoryRetriever {
30    pub fn new(documents: Vec<Document>) -> Self {
31        Self { documents }
32    }
33}
34
35#[async_trait]
36impl Retriever for InMemoryRetriever {
37    async fn retrieve(&self, query: &str, top_k: usize) -> Result<Vec<Document>, SynapticError> {
38        let query_terms = tokenize(query);
39        let mut scored: Vec<(usize, &Document)> = self
40            .documents
41            .iter()
42            .map(|doc| {
43                let terms = tokenize(&doc.content);
44                let score = query_terms.intersection(&terms).count();
45                (score, doc)
46            })
47            .collect();
48
49        scored.sort_by(|a, b| b.0.cmp(&a.0));
50        Ok(scored
51            .into_iter()
52            .filter(|(score, _)| *score > 0)
53            .take(top_k)
54            .map(|(_, doc)| doc.clone())
55            .collect())
56    }
57}
58
59pub(crate) fn tokenize(input: &str) -> HashSet<String> {
60    input
61        .split_whitespace()
62        .map(|term| term.to_ascii_lowercase())
63        .collect()
64}
65
/// Splits `input` on whitespace into ASCII-lowercased tokens, in input order.
///
/// Unlike the set-based tokenizer, repeated tokens are kept; BM25 relies on this
/// to count term frequencies.
pub(crate) fn tokenize_to_vec(input: &str) -> Vec<String> {
    let mut tokens = Vec::new();
    for word in input.split_whitespace() {
        tokens.push(word.to_ascii_lowercase());
    }
    tokens
}