Skip to main content

DedupVectorStore

Struct DedupVectorStore 

Source
pub struct DedupVectorStore<S, F = fn(&str) -> String>
where S: VectorStore, F: Fn(&str) -> String + Send + Sync,
{ /* private fields */ }
Expand description

A VectorStore decorator that silently skips documents whose normalized content fingerprint is already in the seen-set.

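§Type parameters

S: the inner VectorStore being wrapped; documents that are not duplicates are delegated to it.

F: the fingerprint function, Fn(&str) -> String + Send + Sync. It defaults to fn(&str) -> String; DedupVectorStore::new uses normalized_fingerprint.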

§Persistence across restarts

The seen-set lives in memory and is cleared on process restart. To survive restarts, query your storage for existing content fingerprints on startup and pass them to DedupVectorStore::with_seen. The normalized_fingerprint function is public so you can pre-compute hashes from stored records.

let hashes = db.query_all("SELECT content_hash FROM facts").await?;
let store = DedupVectorStore::with_seen(inner, hashes);

Implementations§

Source§

impl<S: VectorStore> DedupVectorStore<S, fn(&str) -> String>

Source

pub fn new(inner: S) -> Self

Wrap inner with an empty seen-set and the default normalized_fingerprint function.

Source

pub fn with_seen(inner: S, seen: impl IntoIterator<Item = String>) -> Self

Wrap inner and pre-populate the seen-set from seen fingerprints.

Use this when restarting a process to restore the dedup state from previously persisted fingerprints.

Source§

impl<S, F> DedupVectorStore<S, F>
where S: VectorStore, F: Fn(&str) -> String + Send + Sync,

Source

pub fn with_fingerprint(inner: S, f: F) -> Self

Wrap inner with a custom fingerprint function and an empty seen-set.

The function receives the raw document text and returns a string key. Documents whose key is already in the seen-set are skipped.

Source

pub fn with_fingerprint_and_seen( inner: S, f: F, seen: impl IntoIterator<Item = String>, ) -> Self

Wrap inner with a custom fingerprint function and a pre-populated seen-set.

Source

pub fn contains(&self, text: &str) -> bool

Whether text is already recorded in the seen-set (using the configured fingerprint function).

Source

pub fn inner(&self) -> &S

Read-only access to the inner store.

Source

pub fn inner_mut(&mut self) -> &mut S

Mutable access to the inner store — e.g. to call store-specific methods not on the VectorStore trait.

Source

pub fn seen_fingerprints(&self) -> impl Iterator<Item = &str>

Iterate over all fingerprints currently held in the seen-set. Useful when persisting state to storage before shutdown.

Source

pub fn seen_count(&self) -> usize

Number of unique fingerprints recorded in the seen-set. This can be larger than the number of documents in the inner store, for example when the seen-set was pre-populated via with_seen.

Trait Implementations§

Source§

impl<S, F> VectorStore for DedupVectorStore<S, F>
where S: VectorStore + Send + Sync, F: Fn(&str) -> String + Send + Sync,

Source§

fn add_texts<'life0, 'async_trait>( &'life0 mut self, texts: Vec<String>, metadata: Option<Vec<HashMap<String, Value>>>, ) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Filter out texts whose fingerprint is already seen, then delegate the remainder to the inner store.

Skipped documents are represented as "dedup:skipped:{fingerprint}" in the returned ID list, so that ids.len() == texts.len() still holds for callers that rely on it.

Source§

fn add_vectors<'life0, 'async_trait>( &'life0 mut self, vectors: Vec<Vec<f32>>, texts: Vec<String>, metadata: Option<Vec<HashMap<String, Value>>>, ) -> Pin<Box<dyn Future<Output = Result<Vec<String>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Filter out pre-embedded vectors whose text fingerprint is already seen, then delegate the remainder to the inner store.

Similarity search: embed the query, return top-k matches.
Source§

fn similarity_search_by_vector<'life0, 'async_trait>( &'life0 self, query_vector: Vec<f32>, k: usize, ) -> Pin<Box<dyn Future<Output = Result<Vec<SearchResult>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Similarity search by pre-computed query vector.
Source§

fn similarity_search_with_filter<'life0, 'life1, 'life2, 'async_trait>( &'life0 self, query: &'life1 str, k: usize, filter: &'life2 Filter, ) -> Pin<Box<dyn Future<Output = Result<Vec<SearchResult>>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait, 'life1: 'async_trait, 'life2: 'async_trait,

Similarity search with a metadata filter. Read more
Source§

fn delete<'life0, 'async_trait>( &'life0 mut self, ids: Vec<String>, ) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'async_trait>>
where Self: 'async_trait, 'life0: 'async_trait,

Delete documents by ID. IDs not found are silently ignored.
Source§

fn len(&self) -> usize

Number of documents currently stored.
Source§

fn is_empty(&self) -> bool

True if no documents are stored.

Auto Trait Implementations§

§

impl<S, F> Freeze for DedupVectorStore<S, F>
where S: Freeze, F: Freeze,

§

impl<S, F> RefUnwindSafe for DedupVectorStore<S, F>

§

impl<S, F> Send for DedupVectorStore<S, F>

§

impl<S, F> Sync for DedupVectorStore<S, F>

§

impl<S, F> Unpin for DedupVectorStore<S, F>
where S: Unpin, F: Unpin,

§

impl<S, F> UnsafeUnpin for DedupVectorStore<S, F>
where S: UnsafeUnpin, F: UnsafeUnpin,

§

impl<S, F> UnwindSafe for DedupVectorStore<S, F>
where S: UnwindSafe, F: UnwindSafe,

Blanket Implementations§

Source§

impl<T> Any for T
where T: 'static + ?Sized,

Source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
Source§

impl<T> Borrow<T> for T
where T: ?Sized,

Source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
Source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

Source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
Source§

impl<T> From<T> for T

Source§

fn from(t: T) -> T

Returns the argument unchanged.

Source§

impl<T> Instrument for T

Source§

fn instrument(self, span: Span) -> Instrumented<Self>

Instruments this type with the provided Span, returning an Instrumented wrapper. Read more
Source§

fn in_current_span(self) -> Instrumented<Self>

Instruments this type with the current Span, returning an Instrumented wrapper. Read more
Source§

impl<T, U> Into<U> for T
where U: From<T>,

Source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

Source§

impl<T> PolicyExt for T
where T: ?Sized,

Source§

fn and<P, B, E>(self, other: P) -> And<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow only if self and other return Action::Follow. Read more
Source§

fn or<P, B, E>(self, other: P) -> Or<T, P>
where T: Policy<B, E>, P: Policy<B, E>,

Create a new Policy that returns Action::Follow if either self or other returns Action::Follow. Read more
Source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

Source§

type Error = Infallible

The type returned in the event of a conversion error.
Source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
Source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

Source§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
Source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
Source§

impl<T> WithSubscriber for T

Source§

fn with_subscriber<S>(self, subscriber: S) -> WithDispatch<Self>
where S: Into<Dispatch>,

Attaches the provided Subscriber to this type, returning a WithDispatch wrapper. Read more
Source§

fn with_current_subscriber(self) -> WithDispatch<Self>

Attaches the current default Subscriber to this type, returning a WithDispatch wrapper. Read more