1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130
use derive_builder::Builder;
/// Default `top_k` (maximum number of documents to return), used by the
/// `Default` impls of `SimilaritySingleEmbedding` and `HybridSearch`.
const DEFAULT_TOP_K: u64 = 10;
/// Default `top_n` (maximum number of documents to return per query), used by
/// the `Default` impl of `HybridSearch`.
const DEFAULT_TOP_N: u64 = 10;
/// Search strategies provide a generic way for Retrievers to implement their
/// search in various ways.
///
/// The strategy is also yielded to the Retriever and can contain additional configuration.
use crate::{indexing::EmbeddedField, querying};
/// A very simple search strategy: it takes the embedding of the current query
/// and returns at most `top_k` documents.
///
/// The `Default` implementation sets `top_k` to `DEFAULT_TOP_K`; use
/// `with_top_k` to change it and `top_k` to read it back.
///
/// The type only holds a single `u64`, so it derives `Copy`.
#[derive(Debug, Clone, Copy)]
pub struct SimilaritySingleEmbedding {
/// Maximum number of documents to return
top_k: u64,
}
/// A hybrid search strategy that combines a similarity search with a
/// keyword search / sparse search.
///
/// Defaults to a maximum of 10 documents and `EmbeddedField::Combined` for the field(s).
///
/// The derived `HybridSearchBuilder` uses the same defaults as
/// `HybridSearch::default()` for every field that is not set explicitly.
#[derive(Debug, Clone, Builder)]
#[builder(setter(into))]
pub struct HybridSearch {
    /// Maximum number of documents to return
    // An explicit default expression is required here: a bare
    // `#[builder(default)]` on a field falls back to `u64::default()`, i.e. 0.
    #[builder(default = "DEFAULT_TOP_K")]
    top_k: u64,
    /// Maximum number of documents to return per query
    #[builder(default = "DEFAULT_TOP_N")]
    top_n: u64,
    /// The field to use for the dense vector
    #[builder(default = "EmbeddedField::Combined")]
    dense_vector_field: EmbeddedField,
    /// The field to use for the sparse vector
    /// TODO: I.e. lancedb does not use sparse embeddings for hybrid search
    #[builder(default = "EmbeddedField::Combined")]
    sparse_vector_field: EmbeddedField,
}
impl Default for HybridSearch {
fn default() -> Self {
Self {
top_k: DEFAULT_TOP_K,
top_n: DEFAULT_TOP_N,
dense_vector_field: EmbeddedField::Combined,
sparse_vector_field: EmbeddedField::Combined,
}
}
}
impl HybridSearch {
    /// Sets the maximum amount of total documents retrieved.
    ///
    /// Returns `&mut Self` so calls can be chained.
    pub fn with_top_k(&mut self, top_k: u64) -> &mut Self {
        self.top_k = top_k;
        self
    }

    /// Returns the maximum amount of total documents to be retrieved.
    #[must_use]
    pub fn top_k(&self) -> u64 {
        self.top_k
    }

    /// Sets the maximum amount of documents to be retrieved
    /// per individual query.
    ///
    /// Returns `&mut Self` so calls can be chained.
    pub fn with_top_n(&mut self, top_n: u64) -> &mut Self {
        self.top_n = top_n;
        self
    }

    /// Returns the maximum amount of documents per query.
    #[must_use]
    pub fn top_n(&self) -> u64 {
        self.top_n
    }

    /// Sets the vector field for the dense vector.
    ///
    /// Accepts anything convertible into an `EmbeddedField`.
    /// Defaults to `EmbeddedField::Combined`.
    pub fn with_dense_vector_field(
        &mut self,
        dense_vector_field: impl Into<EmbeddedField>,
    ) -> &mut Self {
        self.dense_vector_field = dense_vector_field.into();
        self
    }

    /// Returns the field for the dense vector.
    #[must_use]
    pub fn dense_vector_field(&self) -> &EmbeddedField {
        &self.dense_vector_field
    }

    /// Sets the vector field for the sparse vector (if applicable).
    ///
    /// Accepts anything convertible into an `EmbeddedField`.
    /// Defaults to `EmbeddedField::Combined`.
    pub fn with_sparse_vector_field(
        &mut self,
        sparse_vector_field: impl Into<EmbeddedField>,
    ) -> &mut Self {
        self.sparse_vector_field = sparse_vector_field.into();
        self
    }

    /// Returns the field for the sparse vector.
    #[must_use]
    pub fn sparse_vector_field(&self) -> &EmbeddedField {
        &self.sparse_vector_field
    }
}
impl Default for SimilaritySingleEmbedding {
fn default() -> Self {
Self {
top_k: DEFAULT_TOP_K,
}
}
}
impl SimilaritySingleEmbedding {
    /// Overrides the maximum number of documents the search returns.
    ///
    /// Hands back `&mut Self` so further calls can be chained.
    pub fn with_top_k(&mut self, top_k: u64) -> &mut Self {
        let limit = &mut self.top_k;
        *limit = top_k;
        self
    }

    /// Reads the maximum number of documents the search returns.
    pub fn top_k(&self) -> u64 {
        let Self { top_k } = *self;
        top_k
    }
}
// Mark both strategy types as `querying::SearchStrategy` implementors.
// The impl bodies are empty: no trait items are defined or overridden here.
impl querying::SearchStrategy for SimilaritySingleEmbedding {}
impl querying::SearchStrategy for HybridSearch {}