text_retrieval/
lib.rs

1#![doc = include_str!("../README.md")]
2
3pub mod surface;
4use std::collections::{BTreeMap, BTreeSet};
5
6mod storage;
7
8use serde::{Deserialize, Serialize};
9use text_core::{
10    split_paragraphs, split_sentence_spans, tokenize, TextAnnotationSpan, TextDocument,
11    TextDocumentContract, TextProcessingOptions, TextProvenance, TextSegmentContract,
12    TextSourceRef, TextSpan, TokenKind,
13};
14use text_embeddings::{EmbeddingModelInfo, TextEmbedderBackend};
15use text_lexical::{Bm25Corpus, Bm25Options, CorpusOptions, TextCorpus, TextCorpusDocument};
16use text_model_runtime::{TextReranker, TextRuntimeBackend};
17use vector_analysis_index::{
18    SerializableVectorRecord, VectorRecord, VectorRecordMetadata, VectorSearchFilter,
19    VectorSearchIndex,
20};
21use video_analysis_core::{DetectError, Result as CoreResult};
22
23pub use storage::*;
24
25const TITLE_METADATA_KEY: &str = "__title";
26
27#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
28/// Data type for search document.
29pub struct SearchDocument {
30    /// Identifier for this value.
31    pub id: String,
32    /// The title value.
33    #[serde(default)]
34    pub title: Option<String>,
35    /// The body value.
36    pub body: String,
37    /// Metadata associated with this value.
38    #[serde(default)]
39    pub metadata: BTreeMap<String, String>,
40    /// Optional rich source reference.
41    #[serde(default)]
42    pub source: Option<TextSourceRef>,
43    /// Provenance records for this document.
44    #[serde(default)]
45    pub provenance: Vec<TextProvenance>,
46    /// Annotation spans associated with this document.
47    #[serde(default)]
48    pub annotations: Vec<TextAnnotationSpan>,
49}
50
51impl SearchDocument {
52    /// Creates a new value.
53    pub fn new(id: impl Into<String>, body: impl Into<String>) -> Self {
54        Self {
55            id: id.into(),
56            title: None,
57            body: body.into(),
58            metadata: BTreeMap::new(),
59            source: None,
60            provenance: Vec::new(),
61            annotations: Vec::new(),
62        }
63    }
64
65    /// Builds this value from text document.
66    pub fn from_text_document(document: &TextDocument<'_>) -> Self {
67        let mut metadata = BTreeMap::new();
68        if let Some(language) = document.language {
69            metadata.insert("language".to_string(), language.to_string());
70        }
71        if let Some(timestamp) = document.timestamp {
72            metadata.insert(
73                "timestamp_seconds".to_string(),
74                timestamp.seconds().to_string(),
75            );
76        }
77        Self {
78            id: document.id.to_string(),
79            title: None,
80            body: document.text.to_string(),
81            metadata,
82            source: document.timestamp.map(|timestamp| TextSourceRef {
83                source_id: None,
84                source_kind: Some("text_document".to_string()),
85                uri: None,
86                media_timestamp: Some(timestamp.into()),
87                duration_seconds: None,
88            }),
89            provenance: Vec::new(),
90            annotations: Vec::new(),
91        }
92    }
93
94    /// Builds this value from a portable text document contract.
95    pub fn from_text_document_contract(document: &TextDocumentContract) -> Self {
96        let mut metadata = document.attributes.clone();
97        if let Some(language) = &document.language {
98            metadata.insert("language".to_string(), language.clone());
99        }
100        if let Some(timestamp) = document.timestamp {
101            metadata.insert(
102                "timestamp_seconds".to_string(),
103                timestamp.seconds().to_string(),
104            );
105        }
106        Self {
107            id: document.id.clone(),
108            title: None,
109            body: document.text.clone(),
110            metadata,
111            source: document.source.clone(),
112            provenance: document.provenance.clone(),
113            annotations: document.annotations.clone(),
114        }
115    }
116
117    /// Builds this value from a portable text segment contract.
118    pub fn from_text_segment_contract(segment: &TextSegmentContract) -> Self {
119        let mut metadata = segment.attributes.clone();
120        if let Some(language) = &segment.language {
121            metadata.insert("language".to_string(), language.clone());
122        }
123        if let Some(timestamp) = segment.timestamp {
124            metadata.insert(
125                "timestamp_seconds".to_string(),
126                timestamp.seconds().to_string(),
127            );
128        }
129        if let Some(duration_seconds) = segment.duration_seconds {
130            metadata.insert("duration_seconds".to_string(), duration_seconds.to_string());
131        }
132        Self {
133            id: segment
134                .document_id()
135                .unwrap_or_else(|| segment.segment_index.to_string()),
136            title: None,
137            body: segment.text.clone(),
138            metadata,
139            source: segment.source.clone().or_else(|| {
140                (segment.timestamp.is_some() || segment.duration_seconds.is_some()).then(|| {
141                    TextSourceRef {
142                        source_id: segment.stream_id.clone(),
143                        source_kind: Some("text_segment".to_string()),
144                        uri: None,
145                        media_timestamp: segment.timestamp,
146                        duration_seconds: segment.duration_seconds,
147                    }
148                })
149            }),
150            provenance: segment.provenance.clone(),
151            annotations: segment.annotations.clone(),
152        }
153    }
154
155    /// Builds this value from a lexical corpus document without losing rich metadata.
156    pub fn from_text_corpus_document(document: &TextCorpusDocument) -> Self {
157        let mut metadata = document.metadata.clone();
158        if let Some(language) = &document.language {
159            metadata.insert("language".to_string(), language.clone());
160        }
161        if let Some(timestamp) = document.timestamp {
162            metadata.insert(
163                "timestamp_seconds".to_string(),
164                timestamp.seconds().to_string(),
165            );
166        }
167        if let Some(duration_seconds) = document
168            .source
169            .as_ref()
170            .and_then(|source| source.duration_seconds)
171        {
172            metadata.insert("duration_seconds".to_string(), duration_seconds.to_string());
173        }
174        Self {
175            id: document.id.clone(),
176            title: None,
177            body: document.text.clone(),
178            metadata,
179            source: document.source.clone(),
180            provenance: document.provenance.clone(),
181            annotations: document.annotations.clone(),
182        }
183    }
184
185    /// Builds retrieval documents from a lexical corpus.
186    pub fn from_text_corpus(corpus: &TextCorpus) -> Vec<Self> {
187        corpus
188            .documents
189            .iter()
190            .map(Self::from_text_corpus_document)
191            .collect()
192    }
193}
194
195pub trait IntoSearchDocument {
196    fn into_search_document(self) -> SearchDocument;
197}
198
199impl IntoSearchDocument for TextDocument<'_> {
200    fn into_search_document(self) -> SearchDocument {
201        SearchDocument::from_text_document(&self)
202    }
203}
204
205impl IntoSearchDocument for &TextDocument<'_> {
206    fn into_search_document(self) -> SearchDocument {
207        SearchDocument::from_text_document(self)
208    }
209}
210
211impl IntoSearchDocument for TextDocumentContract {
212    fn into_search_document(self) -> SearchDocument {
213        SearchDocument::from_text_document_contract(&self)
214    }
215}
216
217impl IntoSearchDocument for &TextDocumentContract {
218    fn into_search_document(self) -> SearchDocument {
219        SearchDocument::from_text_document_contract(self)
220    }
221}
222
223impl IntoSearchDocument for TextSegmentContract {
224    fn into_search_document(self) -> SearchDocument {
225        SearchDocument::from_text_segment_contract(&self)
226    }
227}
228
229impl IntoSearchDocument for &TextSegmentContract {
230    fn into_search_document(self) -> SearchDocument {
231        SearchDocument::from_text_segment_contract(self)
232    }
233}
234
235#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
236/// Data type for document chunk.
237pub struct DocumentChunk {
238    /// The chunk identifier value.
239    pub chunk_id: String,
240    /// The document identifier value.
241    pub document_id: String,
242    /// Text content for this value.
243    pub text: String,
244    /// The ordinal value.
245    pub ordinal: usize,
246    /// Metadata associated with this value.
247    pub metadata: BTreeMap<String, String>,
248    /// Optional rich source reference.
249    #[serde(default)]
250    pub source: Option<TextSourceRef>,
251    /// Provenance records for this chunk.
252    #[serde(default)]
253    pub provenance: Vec<TextProvenance>,
254    /// Annotation spans associated with this chunk.
255    #[serde(default)]
256    pub annotations: Vec<TextAnnotationSpan>,
257}
258
259impl DocumentChunk {
260    /// Converts this retrieval chunk into a portable document contract.
261    pub fn to_text_document_contract(&self) -> TextDocumentContract {
262        TextDocumentContract {
263            id: self.chunk_id.clone(),
264            text: self.text.clone(),
265            language: self.metadata.get("language").cloned(),
266            timestamp: self
267                .source
268                .as_ref()
269                .and_then(|source| source.media_timestamp),
270            attributes: self.metadata.clone(),
271            source: self.source.clone(),
272            provenance: self.provenance.clone(),
273            annotations: self.annotations.clone(),
274        }
275    }
276
277    /// Converts this retrieval chunk into a portable segment contract.
278    pub fn to_text_segment_contract(&self) -> TextSegmentContract {
279        TextSegmentContract {
280            stream_id: Some(self.document_id.clone()),
281            segment_index: self.ordinal as u64,
282            text: self.text.clone(),
283            language: self.metadata.get("language").cloned(),
284            timestamp: self
285                .source
286                .as_ref()
287                .and_then(|source| source.media_timestamp),
288            duration_seconds: self
289                .source
290                .as_ref()
291                .and_then(|source| source.duration_seconds),
292            is_final: true,
293            attributes: self.metadata.clone(),
294            source: self.source.clone(),
295            provenance: self.provenance.clone(),
296            annotations: self.annotations.clone(),
297        }
298    }
299}
300
301#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
302/// Data type for ingestion options.
303pub struct IngestionOptions {
304    /// The chunk tokens value.
305    pub chunk_tokens: usize,
306    /// The chunk overlap tokens value.
307    pub chunk_overlap_tokens: usize,
308    /// The store raw text value.
309    pub store_raw_text: bool,
310}
311
312#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
313/// Strategy for constructing retrieval chunks.
314pub enum ChunkingStrategy {
315    /// Fixed token windows with overlap.
316    TokenWindow,
317    /// One sentence per chunk.
318    Sentence,
319    /// One paragraph per chunk.
320    Paragraph,
321}
322
323#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
324/// Options for explicit chunk construction.
325pub struct ChunkingOptions {
326    /// Chunking strategy.
327    pub strategy: ChunkingStrategy,
328    /// Token-window options used when `strategy` is `TokenWindow`.
329    pub ingestion: IngestionOptions,
330}
331
332impl Default for ChunkingOptions {
333    fn default() -> Self {
334        Self {
335            strategy: ChunkingStrategy::TokenWindow,
336            ingestion: IngestionOptions::default(),
337        }
338    }
339}
340
341impl Default for IngestionOptions {
342    fn default() -> Self {
343        Self {
344            chunk_tokens: 256,
345            chunk_overlap_tokens: 32,
346            store_raw_text: true,
347        }
348    }
349}
350
351/// Chunks one search document with explicit strategy options.
352pub fn chunk_search_document(
353    document: &SearchDocument,
354    options: &ChunkingOptions,
355    processing: &TextProcessingOptions,
356) -> CoreResult<Vec<DocumentChunk>> {
357    validate_document(document)?;
358    match options.strategy {
359        ChunkingStrategy::TokenWindow => chunk_document(document, &options.ingestion, processing),
360        ChunkingStrategy::Sentence => chunk_spans(
361            document,
362            split_sentence_spans(&document.body, processing)
363                .into_iter()
364                .map(|sentence| sentence.span),
365        ),
366        ChunkingStrategy::Paragraph => chunk_spans(
367            document,
368            split_paragraphs(&document.body)
369                .into_iter()
370                .map(|paragraph| paragraph.span),
371        ),
372    }
373}
374
375#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
376/// Data type for search filter.
377pub struct SearchFilter {
378    /// The metadata equals value.
379    pub metadata_equals: BTreeMap<String, String>,
380    /// The metadata contains value.
381    pub metadata_contains: BTreeMap<String, String>,
382    /// The required tags value.
383    pub required_tags: Vec<String>,
384    /// The document identifiers value.
385    pub document_ids: BTreeSet<String>,
386}
387
388#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
389/// Variants describing retrieval mode.
390pub enum RetrievalMode {
391    /// The full text variant.
392    FullText,
393    /// The semantic variant.
394    Semantic,
395    /// The hybrid variant.
396    Hybrid,
397}
398
399#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
400/// Data type for hybrid config.
401pub struct HybridConfig {
402    /// The semantic weight value.
403    pub semantic_weight: f32,
404    /// The lexical weight value.
405    pub lexical_weight: f32,
406    /// The rerank window value.
407    pub rerank_window: usize,
408    /// Whether search results should be reranked after initial retrieval.
409    pub rerank: bool,
410}
411
412impl Default for HybridConfig {
413    fn default() -> Self {
414        Self {
415            semantic_weight: 0.8,
416            lexical_weight: 0.2,
417            rerank_window: 64,
418            rerank: false,
419        }
420    }
421}
422
423#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
424/// Data type for search query.
425pub struct SearchQuery {
426    /// Text content for this value.
427    pub text: String,
428    /// The top k value.
429    pub top_k: usize,
430    /// The filter value.
431    pub filter: Option<SearchFilter>,
432    /// The hybrid value.
433    pub hybrid: HybridConfig,
434}
435
436impl SearchQuery {
437    /// Creates a new value.
438    pub fn new(text: impl Into<String>, top_k: usize) -> Self {
439        Self {
440            text: text.into(),
441            top_k,
442            filter: None,
443            hybrid: HybridConfig::default(),
444        }
445    }
446
447    /// Returns full text.
448    pub fn full_text(text: impl Into<String>, top_k: usize) -> Self {
449        Self::new(text, top_k).mode(RetrievalMode::FullText)
450    }
451
452    /// Returns semantic.
453    pub fn semantic(text: impl Into<String>, top_k: usize) -> Self {
454        Self::new(text, top_k).mode(RetrievalMode::Semantic)
455    }
456
457    /// Returns hybrid.
458    pub fn hybrid(text: impl Into<String>, top_k: usize, config: HybridConfig) -> Self {
459        Self {
460            text: text.into(),
461            top_k,
462            filter: None,
463            hybrid: config,
464        }
465    }
466
467    /// Returns filter.
468    pub fn filter(mut self, filter: SearchFilter) -> Self {
469        self.filter = Some(filter);
470        self
471    }
472
473    /// Returns mode.
474    pub fn mode(mut self, mode: RetrievalMode) -> Self {
475        match mode {
476            RetrievalMode::FullText => {
477                self.hybrid.semantic_weight = 0.0;
478                self.hybrid.lexical_weight = 1.0;
479            }
480            RetrievalMode::Semantic => {
481                self.hybrid.semantic_weight = 1.0;
482                self.hybrid.lexical_weight = 0.0;
483            }
484            RetrievalMode::Hybrid => {
485                if self.hybrid.semantic_weight <= f32::EPSILON {
486                    self.hybrid.semantic_weight = HybridConfig::default().semantic_weight;
487                }
488                if self.hybrid.lexical_weight <= f32::EPSILON {
489                    self.hybrid.lexical_weight = HybridConfig::default().lexical_weight;
490                }
491            }
492        }
493        self
494    }
495
496    /// Returns retrieval mode.
497    pub fn retrieval_mode(&self) -> RetrievalMode {
498        let semantic = self.hybrid.semantic_weight > f32::EPSILON;
499        let lexical = self.hybrid.lexical_weight > f32::EPSILON;
500        match (semantic, lexical) {
501            (false, true) => RetrievalMode::FullText,
502            (true, false) => RetrievalMode::Semantic,
503            _ => RetrievalMode::Hybrid,
504        }
505    }
506}
507
508/// Request for query/document reranking.
509#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
510#[serde(rename_all = "camelCase")]
511pub struct RerankRequest {
512    /// Query text.
513    pub query: String,
514    /// Documents to rerank.
515    pub documents: Vec<String>,
516    /// Maximum output count.
517    #[serde(default = "default_rerank_top_k")]
518    pub top_k: usize,
519    /// Optional caller-supplied document scores.
520    #[serde(default)]
521    pub imported_scores: Vec<f32>,
522}
523
524/// One reranked document.
525#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
526#[serde(rename_all = "camelCase")]
527pub struct RerankResult {
528    /// Original document index.
529    pub index: usize,
530    /// Document text.
531    pub document: String,
532    /// Relevance score.
533    pub score: f32,
534    /// Optional runtime used to produce this score.
535    #[serde(default)]
536    pub runtime: Option<String>,
537    /// Optional model identifier used to produce this score.
538    #[serde(default)]
539    pub model_id: Option<String>,
540}
541
542/// Response for query/document reranking.
543#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
544#[serde(rename_all = "camelCase")]
545pub struct RerankResponse {
546    /// Accepted flag for generated package surfaces.
547    pub accepted: bool,
548    /// Operation name.
549    pub operation: String,
550    /// Query text.
551    pub query: String,
552    /// Ranked documents.
553    pub results: Vec<RerankResult>,
554    /// Optional runtime used by a rerank backend.
555    #[serde(default)]
556    pub runtime: Option<String>,
557    /// Optional model identifier used by a rerank backend.
558    #[serde(default)]
559    pub model_id: Option<String>,
560}
561
562/// Caller-supplied runtime context for reranking.
563#[derive(Default)]
564pub struct RerankExecutionContext<'a> {
565    /// Optional reranker backend.
566    pub reranker: Option<&'a mut dyn TextReranker>,
567    /// Optional model identifier for response metadata.
568    pub model_id: Option<String>,
569}
570
571/// Reranks documents from imported scores or deterministic lexical overlap.
572pub fn rerank_documents(request: RerankRequest) -> CoreResult<RerankResponse> {
573    rerank_documents_with_context(request, &mut RerankExecutionContext::default())
574}
575
576/// Reranks documents using a runtime backend when supplied, otherwise falls back.
577pub fn rerank_documents_with_context(
578    request: RerankRequest,
579    context: &mut RerankExecutionContext<'_>,
580) -> CoreResult<RerankResponse> {
581    if request.query.trim().is_empty() {
582        return Err(DetectError::InvalidArgument(
583            "rerank request must include a non-empty query".to_string(),
584        ));
585    }
586    if request.documents.is_empty() {
587        return Err(DetectError::InvalidArgument(
588            "rerank request must include at least one document".to_string(),
589        ));
590    }
591    let backend_scores = if let Some(reranker) = context.reranker.as_deref_mut() {
592        let runtime = rerank_runtime_name(reranker.runtime_backend());
593        Some((
594            runtime,
595            reranker.rerank(&request.query, &request.documents)?,
596        ))
597    } else {
598        None
599    };
600    let mut results = request
601        .documents
602        .iter()
603        .enumerate()
604        .map(|(index, document)| {
605            let (score, runtime) = if let Some((runtime, scores)) = &backend_scores {
606                (
607                    scores.get(index).copied().unwrap_or(0.0),
608                    Some(runtime.clone()),
609                )
610            } else {
611                (
612                    request
613                        .imported_scores
614                        .get(index)
615                        .copied()
616                        .unwrap_or_else(|| lexical_overlap_score(&request.query, document)),
617                    None,
618                )
619            };
620            RerankResult {
621                index,
622                document: document.clone(),
623                score,
624                runtime,
625                model_id: context.model_id.clone(),
626            }
627        })
628        .collect::<Vec<_>>();
629    results.sort_by(|left, right| {
630        right
631            .score
632            .total_cmp(&left.score)
633            .then_with(|| left.index.cmp(&right.index))
634    });
635    results.truncate(request.top_k.max(1));
636    Ok(RerankResponse {
637        accepted: true,
638        operation: "rerank".to_string(),
639        query: request.query,
640        results,
641        runtime: backend_scores.map(|(runtime, _)| runtime),
642        model_id: context.model_id.clone(),
643    })
644}
645
646fn rerank_runtime_name(runtime: TextRuntimeBackend) -> String {
647    match runtime {
648        TextRuntimeBackend::Candle => "candle",
649        TextRuntimeBackend::Onnx => "onnx",
650        TextRuntimeBackend::Tokenizers => "tokenizers",
651        TextRuntimeBackend::CudaOxide => "cuda_oxide",
652        TextRuntimeBackend::External => "external",
653        TextRuntimeBackend::Heuristic => "heuristic",
654    }
655    .to_string()
656}
657
658fn lexical_overlap_score(query: &str, document: &str) -> f32 {
659    let processing = TextProcessingOptions::default();
660    let query_terms = tokenize(query, &processing)
661        .into_iter()
662        .filter_map(|token| match token.kind {
663            TokenKind::Word => Some(token.normalized),
664            _ => None,
665        })
666        .collect::<BTreeSet<_>>();
667    if query_terms.is_empty() {
668        return 0.0;
669    }
670    let document_terms = tokenize(document, &processing)
671        .into_iter()
672        .filter_map(|token| match token.kind {
673            TokenKind::Word => Some(token.normalized),
674            _ => None,
675        })
676        .collect::<BTreeSet<_>>();
677    let overlap = query_terms
678        .iter()
679        .filter(|term| document_terms.contains(term.as_str()))
680        .count();
681    overlap as f32 / query_terms.len() as f32
682}
683
684fn rerank_search_results(
685    query: &str,
686    results: &mut Vec<SearchResult>,
687    top_k: usize,
688    context: &mut RerankExecutionContext<'_>,
689) -> CoreResult<()> {
690    let candidate_count = results.len();
691    if candidate_count == 0 {
692        return Ok(());
693    }
694    let request = RerankRequest {
695        query: query.to_string(),
696        documents: results
697            .iter()
698            .map(|result| result.snippet.clone())
699            .collect(),
700        top_k: top_k.max(1),
701        imported_scores: results.iter().map(|result| result.score).collect(),
702    };
703    let response = rerank_documents_with_context(request, context)?;
704    let mut ranked = Vec::new();
705    for reranked in response.results {
706        if let Some(mut result) = results.get(reranked.index).cloned() {
707            result.score = reranked.score;
708            if let Some(runtime) = reranked.runtime {
709                result
710                    .metadata
711                    .insert("rerank_runtime".to_string(), runtime);
712            }
713            if let Some(model_id) = reranked.model_id {
714                result
715                    .metadata
716                    .insert("rerank_model_id".to_string(), model_id);
717            }
718            ranked.push(result);
719        }
720    }
721    *results = ranked;
722    if results.len() > candidate_count {
723        results.truncate(candidate_count);
724    }
725    Ok(())
726}
727
/// Serde default for `RerankRequest::top_k`.
fn default_rerank_top_k() -> usize {
    const DEFAULT_TOP_K: usize = 3;
    DEFAULT_TOP_K
}
731
732#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
733/// Data type for search result.
734pub struct SearchResult {
735    /// The chunk identifier value.
736    pub chunk_id: String,
737    /// The document identifier value.
738    pub document_id: String,
739    /// Score assigned to this value.
740    pub score: f32,
741    /// The semantic score value.
742    pub semantic_score: f32,
743    /// The lexical score value.
744    pub lexical_score: f32,
745    /// The snippet value.
746    pub snippet: String,
747    /// Metadata associated with this value.
748    pub metadata: BTreeMap<String, String>,
749}
750
751#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
752/// Data type for ingest report.
753pub struct IngestReport {
754    /// The documents received value.
755    pub documents_received: usize,
756    /// The documents replaced value.
757    pub documents_replaced: usize,
758    /// The documents skipped value.
759    pub documents_skipped: usize,
760    /// The chunks indexed value.
761    pub chunks_indexed: usize,
762}
763
764#[derive(Debug, Clone, PartialEq)]
765/// Data type for retrieval index.
766pub struct RetrievalIndex<B> {
767    embedder: B,
768    corpus_options: CorpusOptions,
769    bm25_options: Bm25Options,
770    chunks: BTreeMap<String, DocumentChunk>,
771    raw_text_by_chunk_id: BTreeMap<String, String>,
772    document_chunks: BTreeMap<String, Vec<String>>,
773    vectors: VectorSearchIndex,
774    bm25: Bm25Corpus,
775}
776
777impl<B: TextEmbedderBackend> RetrievalIndex<B> {
778    /// Creates a new value.
779    pub fn new(embedder: B) -> Self {
780        Self::with_options(embedder, CorpusOptions::default(), Bm25Options::default())
781    }
782
783    /// Returns this value with options.
784    pub fn with_options(
785        embedder: B,
786        corpus_options: CorpusOptions,
787        bm25_options: Bm25Options,
788    ) -> Self {
789        Self {
790            embedder,
791            bm25: Bm25Corpus::new(bm25_options.clone()),
792            corpus_options,
793            bm25_options,
794            chunks: BTreeMap::new(),
795            raw_text_by_chunk_id: BTreeMap::new(),
796            document_chunks: BTreeMap::new(),
797            vectors: VectorSearchIndex::new(),
798        }
799    }
800
801    /// Builds this value from parts.
802    pub fn from_parts(
803        embedder: B,
804        corpus_options: CorpusOptions,
805        bm25_options: Bm25Options,
806        chunks: Vec<DocumentChunk>,
807        raw_text_by_chunk_id: BTreeMap<String, String>,
808        vector_records: Vec<SerializableVectorRecord>,
809    ) -> CoreResult<Self> {
810        let mut index = Self::with_options(embedder, corpus_options, bm25_options);
811        for chunk in chunks {
812            index.insert_chunk_state(chunk);
813        }
814        index.raw_text_by_chunk_id = raw_text_by_chunk_id;
815        index.vectors = VectorSearchIndex::import_records(vector_records)?;
816        index.rebuild_bm25()?;
817        index.validate_vector_state()?;
818        Ok(index)
819    }
820
821    /// Returns embedder.
822    pub fn embedder(&self) -> &B {
823        &self.embedder
824    }
825
826    /// Returns embedder info.
827    pub fn embedder_info(&self) -> EmbeddingModelInfo {
828        self.embedder.model_info()
829    }
830
831    /// Returns corpus options.
832    pub fn corpus_options(&self) -> &CorpusOptions {
833        &self.corpus_options
834    }
835
836    /// Returns bm25 options.
837    pub fn bm25_options(&self) -> &Bm25Options {
838        &self.bm25_options
839    }
840
841    /// Returns chunks.
842    pub fn chunks(&self) -> Vec<&DocumentChunk> {
843        self.chunks.values().collect()
844    }
845
846    /// Returns chunks iter.
847    pub fn chunks_iter(&self) -> impl Iterator<Item = &DocumentChunk> {
848        self.chunks.values()
849    }
850
851    /// Returns chunk.
852    pub fn chunk(&self, chunk_id: &str) -> Option<&DocumentChunk> {
853        self.chunks.get(chunk_id)
854    }
855
856    /// Returns raw text.
857    pub fn raw_text(&self, chunk_id: &str) -> Option<&str> {
858        self.raw_text_by_chunk_id.get(chunk_id).map(String::as_str)
859    }
860
861    /// Returns vector records.
862    pub fn vector_records(&self) -> Vec<SerializableVectorRecord> {
863        self.vectors.export_records()
864    }
865
866    /// Returns ingest documents.
867    pub fn ingest_documents(
868        &mut self,
869        docs: &[SearchDocument],
870        options: &IngestionOptions,
871    ) -> CoreResult<IngestReport> {
872        validate_ingestion_options(options)?;
873
874        let mut replaced = 0;
875        let mut skipped = 0;
876        for document in docs {
877            validate_document(document)?;
878            if self.document_chunks.contains_key(&document.id) {
879                self.remove_document(&document.id);
880                replaced += 1;
881            }
882
883            let chunks = chunk_document(document, options, &self.corpus_options.processing)?;
884            if chunks.is_empty() {
885                skipped += 1;
886                continue;
887            }
888
889            for chunk in chunks {
890                if options.store_raw_text {
891                    self.raw_text_by_chunk_id
892                        .insert(chunk.chunk_id.clone(), chunk.text.clone());
893                }
894                self.insert_chunk_state(chunk);
895            }
896        }
897
898        self.rebuild_indices()?;
899        Ok(IngestReport {
900            documents_received: docs.len(),
901            documents_replaced: replaced,
902            documents_skipped: skipped,
903            chunks_indexed: self.chunks.len(),
904        })
905    }
906
907    /// Returns search.
908    pub fn search(&self, query: &SearchQuery) -> CoreResult<Vec<SearchResult>> {
909        self.search_with_context(query, &mut RerankExecutionContext::default())
910    }
911
912    /// Returns search, optionally reranking the initial candidates with context.
913    pub fn search_with_context(
914        &self,
915        query: &SearchQuery,
916        context: &mut RerankExecutionContext<'_>,
917    ) -> CoreResult<Vec<SearchResult>> {
918        validate_query(query)?;
919        self.ensure_non_empty()?;
920
921        let normalized_query = normalize_query(&query.text, &self.corpus_options.processing)?;
922        let filter = query.filter.as_ref();
923        let overfetch = query
924            .hybrid
925            .rerank_window
926            .max(query.top_k)
927            .max(query.top_k * 4);
928        let post_filter_limit = if filter.is_some_and(requires_post_filter) {
929            self.chunks.len()
930        } else {
931            overfetch
932        };
933
934        let semantic_hits = if query.hybrid.semantic_weight > f32::EPSILON {
935            let query_vector = self.embedder.embed_text(&normalized_query)?;
936            self.validate_query_dimensions(query_vector.dimensions())?;
937            self.vectors.search_filtered(
938                query_vector.as_slice(),
939                post_filter_limit,
940                filter.map(search_filter_to_vector_filter).as_ref(),
941            )?
942        } else {
943            Vec::new()
944        };
945        let lexical_hits = if query.hybrid.lexical_weight > f32::EPSILON {
946            self.bm25.search(&normalized_query, post_filter_limit)?
947        } else {
948            Vec::new()
949        };
950
951        let semantic_scores = semantic_hits
952            .iter()
953            .filter(|hit| self.matches_filter(hit.id.as_str(), filter))
954            .map(|hit| (hit.id.as_str().to_string(), hit.score))
955            .collect::<BTreeMap<_, _>>();
956        let lexical_scores = lexical_hits
957            .into_iter()
958            .filter(|hit| self.matches_filter(&hit.id, filter))
959            .map(|hit| (hit.id, hit.score))
960            .collect::<BTreeMap<_, _>>();
961
962        let normalized_semantic = normalize_scores(&semantic_scores);
963        let normalized_lexical = normalize_scores(&lexical_scores);
964
965        let mut merged_ids = normalized_semantic.keys().cloned().collect::<BTreeSet<_>>();
966        merged_ids.extend(normalized_lexical.keys().cloned());
967
968        let mut results = merged_ids
969            .into_iter()
970            .filter_map(|chunk_id| {
971                let chunk = self.chunks.get(&chunk_id)?;
972                let semantic_score = normalized_semantic.get(&chunk_id).copied().unwrap_or(0.0);
973                let lexical_score = normalized_lexical.get(&chunk_id).copied().unwrap_or(0.0);
974                let score = semantic_score * query.hybrid.semantic_weight
975                    + lexical_score * query.hybrid.lexical_weight;
976                Some(SearchResult {
977                    chunk_id: chunk.chunk_id.clone(),
978                    document_id: chunk.document_id.clone(),
979                    score,
980                    semantic_score,
981                    lexical_score,
982                    snippet: self.build_snippet(chunk, &normalized_query),
983                    metadata: chunk.metadata.clone(),
984                })
985            })
986            .collect::<Vec<_>>();
987
988        results.sort_by(|left, right| {
989            right
990                .score
991                .total_cmp(&left.score)
992                .then_with(|| right.semantic_score.total_cmp(&left.semantic_score))
993                .then_with(|| right.lexical_score.total_cmp(&left.lexical_score))
994                .then_with(|| left.chunk_id.cmp(&right.chunk_id))
995        });
996        if query.hybrid.rerank {
997            rerank_search_results(&query.text, &mut results, query.top_k, context)?;
998        } else {
999            results.truncate(query.top_k);
1000        }
1001        Ok(results)
1002    }
1003
1004    /// Returns related chunks.
1005    pub fn related_chunks(&self, chunk_id: &str, top_k: usize) -> CoreResult<Vec<SearchResult>> {
1006        if top_k == 0 {
1007            return Err(invalid_argument("search limit must be greater than zero"));
1008        }
1009        self.ensure_non_empty()?;
1010
1011        let query_record = self
1012            .vectors
1013            .records()
1014            .iter()
1015            .find(|record| record.id == chunk_id)
1016            .ok_or_else(|| invalid_argument(format!("chunk `{chunk_id}` was not indexed")))?;
1017        let hits = self
1018            .vectors
1019            .search_filtered(query_record.vector.as_slice(), top_k + 1, None)?;
1020
1021        let mut results = hits
1022            .into_iter()
1023            .filter(|hit| hit.id.as_str() != chunk_id)
1024            .filter_map(|hit| {
1025                let chunk = self.chunks.get(hit.id.as_str())?;
1026                Some(SearchResult {
1027                    chunk_id: chunk.chunk_id.clone(),
1028                    document_id: chunk.document_id.clone(),
1029                    score: hit.score,
1030                    semantic_score: hit.score,
1031                    lexical_score: 0.0,
1032                    snippet: self.build_snippet(chunk, &chunk.text),
1033                    metadata: chunk.metadata.clone(),
1034                })
1035            })
1036            .collect::<Vec<_>>();
1037        results.truncate(top_k);
1038        Ok(results)
1039    }
1040
1041    fn insert_chunk_state(&mut self, chunk: DocumentChunk) {
1042        self.document_chunks
1043            .entry(chunk.document_id.clone())
1044            .or_default()
1045            .push(chunk.chunk_id.clone());
1046        self.chunks.insert(chunk.chunk_id.clone(), chunk);
1047    }
1048
1049    fn remove_document(&mut self, document_id: &str) {
1050        if let Some(chunk_ids) = self.document_chunks.remove(document_id) {
1051            for chunk_id in chunk_ids {
1052                self.chunks.remove(&chunk_id);
1053                self.raw_text_by_chunk_id.remove(&chunk_id);
1054            }
1055        }
1056    }
1057
1058    fn rebuild_indices(&mut self) -> CoreResult<()> {
1059        self.rebuild_bm25()?;
1060        if self.chunks.is_empty() {
1061            self.vectors = VectorSearchIndex::new();
1062            return Ok(());
1063        }
1064
1065        let texts = self
1066            .chunks
1067            .values()
1068            .map(|chunk| {
1069                self.raw_text_by_chunk_id
1070                    .get(&chunk.chunk_id)
1071                    .map(String::as_str)
1072                    .unwrap_or(chunk.text.as_str())
1073            })
1074            .collect::<Vec<_>>();
1075        let vectors = self.embedder.embed_batch(&texts)?;
1076        if vectors.len() != self.chunks.len() {
1077            return Err(invalid_argument(
1078                "embedder returned a different number of vectors than input texts",
1079            ));
1080        }
1081        self.validate_embedded_dimensions(vectors.iter().map(|vector| vector.dimensions()))?;
1082
1083        let mut index = VectorSearchIndex::new();
1084        for (chunk, vector) in self.chunks.values().zip(vectors) {
1085            index.add(VectorRecord::with_payload(
1086                chunk.chunk_id.clone(),
1087                vector,
1088                VectorRecordMetadata {
1089                    tags: metadata_tags(&chunk.metadata),
1090                    metadata: chunk.metadata.clone(),
1091                },
1092            ))?;
1093        }
1094        self.vectors = index;
1095        Ok(())
1096    }
1097
1098    fn rebuild_bm25(&mut self) -> CoreResult<()> {
1099        let mut bm25 = Bm25Corpus::new(self.bm25_options.clone());
1100        for chunk in self.chunks.values() {
1101            bm25.add_document(chunk.chunk_id.clone(), &chunk.text)?;
1102        }
1103        self.bm25 = bm25;
1104        Ok(())
1105    }
1106
1107    fn validate_embedded_dimensions(
1108        &self,
1109        vector_dimensions: impl IntoIterator<Item = usize>,
1110    ) -> CoreResult<()> {
1111        let expected = self.embedder.model_info().dimensions;
1112        if expected == 0 {
1113            return Ok(());
1114        }
1115        if vector_dimensions
1116            .into_iter()
1117            .any(|dimensions| dimensions != expected)
1118        {
1119            return Err(invalid_argument(format!(
1120                "embedder produced vectors that did not match advertised dimensions {expected}"
1121            )));
1122        }
1123        Ok(())
1124    }
1125
1126    fn validate_vector_state(&self) -> CoreResult<()> {
1127        let chunk_ids = self.chunks.keys().cloned().collect::<BTreeSet<_>>();
1128        let vector_ids = self
1129            .vectors
1130            .records()
1131            .iter()
1132            .map(|record| record.id.clone())
1133            .collect::<BTreeSet<_>>();
1134        if chunk_ids != vector_ids {
1135            return Err(invalid_argument(
1136                "persisted vector records and chunk catalog do not describe the same ids",
1137            ));
1138        }
1139        if let Some(dimensions) = self.vectors.dimensions() {
1140            let expected = self.embedder.model_info().dimensions;
1141            if expected > 0 && expected != dimensions {
1142                return Err(invalid_argument(format!(
1143                    "embedder dimensions {expected} did not match persisted index dimensions {dimensions}"
1144                )));
1145            }
1146        }
1147        Ok(())
1148    }
1149
1150    fn validate_query_dimensions(&self, dimensions: usize) -> CoreResult<()> {
1151        if let Some(index_dimensions) = self.vectors.dimensions() {
1152            if dimensions != index_dimensions {
1153                return Err(invalid_argument(format!(
1154                    "query vector dimensions {dimensions} did not match index dimensions {index_dimensions}"
1155                )));
1156            }
1157        }
1158        Ok(())
1159    }
1160
1161    fn matches_filter(&self, chunk_id: &str, filter: Option<&SearchFilter>) -> bool {
1162        let Some(filter) = filter else {
1163            return true;
1164        };
1165        let Some(chunk) = self.chunks.get(chunk_id) else {
1166            return false;
1167        };
1168        if !filter.document_ids.is_empty() && !filter.document_ids.contains(&chunk.document_id) {
1169            return false;
1170        }
1171        if !filter.metadata_contains.iter().all(|(key, needle)| {
1172            chunk
1173                .metadata
1174                .get(key)
1175                .is_some_and(|value| value.contains(needle))
1176        }) {
1177            return false;
1178        }
1179        let tags = metadata_tags(&chunk.metadata);
1180        filter
1181            .required_tags
1182            .iter()
1183            .all(|tag| tags.iter().any(|candidate| candidate == tag))
1184            && filter
1185                .metadata_equals
1186                .iter()
1187                .all(|(key, value)| chunk.metadata.get(key) == Some(value))
1188    }
1189
1190    fn build_snippet(&self, chunk: &DocumentChunk, normalized_query: &str) -> String {
1191        let text = self
1192            .raw_text_by_chunk_id
1193            .get(&chunk.chunk_id)
1194            .map(String::as_str)
1195            .unwrap_or(chunk.text.as_str());
1196        build_snippet(text, normalized_query)
1197    }
1198
1199    fn ensure_non_empty(&self) -> CoreResult<()> {
1200        if self.chunks.is_empty() {
1201            return Err(invalid_argument("search index is empty"));
1202        }
1203        Ok(())
1204    }
1205}
1206
1207fn validate_document(document: &SearchDocument) -> CoreResult<()> {
1208    if document.id.trim().is_empty() {
1209        return Err(invalid_argument("document id must not be empty"));
1210    }
1211    if document.body.trim().is_empty() {
1212        return Err(invalid_argument("document body must not be empty"));
1213    }
1214    Ok(())
1215}
1216
1217fn validate_ingestion_options(options: &IngestionOptions) -> CoreResult<()> {
1218    if options.chunk_tokens == 0 {
1219        return Err(invalid_argument(
1220            "chunk token size must be greater than zero",
1221        ));
1222    }
1223    if options.chunk_overlap_tokens >= options.chunk_tokens {
1224        return Err(invalid_argument(
1225            "chunk overlap must be smaller than the chunk token size",
1226        ));
1227    }
1228    Ok(())
1229}
1230
1231fn validate_query(query: &SearchQuery) -> CoreResult<()> {
1232    if query.top_k == 0 {
1233        return Err(invalid_argument("search limit must be greater than zero"));
1234    }
1235    if !query.hybrid.semantic_weight.is_finite()
1236        || !query.hybrid.lexical_weight.is_finite()
1237        || query.hybrid.semantic_weight < 0.0
1238        || query.hybrid.lexical_weight < 0.0
1239    {
1240        return Err(invalid_argument(
1241            "hybrid weights must be finite and non-negative",
1242        ));
1243    }
1244    if query.hybrid.semantic_weight + query.hybrid.lexical_weight <= f32::EPSILON {
1245        return Err(invalid_argument("hybrid weights must not both be zero"));
1246    }
1247    if query.hybrid.rerank_window == 0 {
1248        return Err(invalid_argument("rerank window must be greater than zero"));
1249    }
1250    let _ = normalize_query(&query.text, &TextProcessingOptions::default())?;
1251    Ok(())
1252}
1253
1254fn chunk_document(
1255    document: &SearchDocument,
1256    options: &IngestionOptions,
1257    processing: &TextProcessingOptions,
1258) -> CoreResult<Vec<DocumentChunk>> {
1259    let tokens = tokenize(document.body.as_str(), processing)
1260        .into_iter()
1261        .filter(|token| {
1262            matches!(
1263                token.kind,
1264                TokenKind::Word
1265                    | TokenKind::Number
1266                    | TokenKind::Url
1267                    | TokenKind::Email
1268                    | TokenKind::Mention
1269                    | TokenKind::Hashtag
1270            )
1271        })
1272        .collect::<Vec<_>>();
1273    if tokens.is_empty() {
1274        return Ok(Vec::new());
1275    }
1276
1277    let mut chunks = Vec::new();
1278    let step = options.chunk_tokens - options.chunk_overlap_tokens;
1279    let metadata = document_metadata(document);
1280    let mut start = 0;
1281    let mut ordinal = 0;
1282    while start < tokens.len() {
1283        let end = (start + options.chunk_tokens).min(tokens.len());
1284        let byte_start = tokens[start].span.byte_start;
1285        let byte_end = tokens[end - 1].span.byte_end;
1286        let text = document
1287            .body
1288            .get(byte_start..byte_end)
1289            .ok_or_else(|| invalid_argument("token spans did not align to valid UTF-8 boundaries"))?
1290            .trim()
1291            .to_string();
1292        if !text.is_empty() {
1293            chunks.push(DocumentChunk {
1294                chunk_id: format!("{}:{ordinal}", document.id),
1295                document_id: document.id.clone(),
1296                text,
1297                ordinal,
1298                metadata: metadata.clone(),
1299                source: document.source.clone(),
1300                provenance: document.provenance.clone(),
1301                annotations: document.annotations.clone(),
1302            });
1303            ordinal += 1;
1304        }
1305        if end == tokens.len() {
1306            break;
1307        }
1308        start += step;
1309    }
1310    Ok(chunks)
1311}
1312
1313fn chunk_spans(
1314    document: &SearchDocument,
1315    spans: impl IntoIterator<Item = TextSpan>,
1316) -> CoreResult<Vec<DocumentChunk>> {
1317    let metadata = document_metadata(document);
1318    let mut chunks = Vec::new();
1319    for (ordinal, span) in spans.into_iter().enumerate() {
1320        let text = document
1321            .body
1322            .get(span.byte_start..span.byte_end)
1323            .ok_or_else(|| invalid_argument("chunk span did not align to valid UTF-8 boundaries"))?
1324            .trim()
1325            .to_string();
1326        if text.is_empty() {
1327            continue;
1328        }
1329        chunks.push(DocumentChunk {
1330            chunk_id: format!("{}:{ordinal}", document.id),
1331            document_id: document.id.clone(),
1332            text,
1333            ordinal,
1334            metadata: metadata.clone(),
1335            source: document.source.clone(),
1336            provenance: document.provenance.clone(),
1337            annotations: document.annotations.clone(),
1338        });
1339    }
1340    Ok(chunks)
1341}
1342
1343fn document_metadata(document: &SearchDocument) -> BTreeMap<String, String> {
1344    let mut metadata = document.metadata.clone();
1345    if let Some(title) = &document.title {
1346        metadata
1347            .entry(TITLE_METADATA_KEY.to_string())
1348            .or_insert_with(|| title.clone());
1349    }
1350    metadata
1351}
1352
1353fn normalize_query(query: &str, processing: &TextProcessingOptions) -> CoreResult<String> {
1354    let tokens = tokenize(query, processing)
1355        .into_iter()
1356        .filter(|token| {
1357            matches!(
1358                token.kind,
1359                TokenKind::Word
1360                    | TokenKind::Number
1361                    | TokenKind::Url
1362                    | TokenKind::Email
1363                    | TokenKind::Mention
1364                    | TokenKind::Hashtag
1365            )
1366        })
1367        .map(|token| token.normalized)
1368        .collect::<Vec<_>>();
1369    if tokens.is_empty() {
1370        return Err(invalid_argument(
1371            "query must contain at least one searchable term",
1372        ));
1373    }
1374    Ok(tokens.join(" "))
1375}
1376
/// Parses the `"tags"` metadata entry into a sorted list of distinct tags.
///
/// Tags may be separated by commas, semicolons or any whitespace; empty pieces are
/// ignored. Returns an empty list when there is no `"tags"` entry.
fn metadata_tags(metadata: &BTreeMap<String, String>) -> Vec<String> {
    let Some(raw) = metadata.get("tags") else {
        return Vec::new();
    };
    // The set both de-duplicates and orders the tags.
    let distinct: BTreeSet<&str> = raw
        .split(|separator: char| {
            separator == ',' || separator == ';' || separator.is_whitespace()
        })
        .filter(|piece| !piece.is_empty())
        .collect();
    distinct.into_iter().map(str::to_owned).collect()
}
1392
1393fn search_filter_to_vector_filter(filter: &SearchFilter) -> VectorSearchFilter {
1394    VectorSearchFilter {
1395        required_tags: filter.required_tags.clone(),
1396        metadata_equals: filter.metadata_equals.clone(),
1397    }
1398}
1399
1400fn requires_post_filter(filter: &SearchFilter) -> bool {
1401    !filter.document_ids.is_empty() || !filter.metadata_contains.is_empty()
1402}
1403
/// Min-max normalizes `scores` into the `[0, 1]` range.
///
/// When all scores are equal to within `f32::EPSILON` (including the single-score case)
/// every id is mapped to `1.0`. An empty input yields an empty map.
fn normalize_scores(scores: &BTreeMap<String, f32>) -> BTreeMap<String, f32> {
    if scores.is_empty() {
        return BTreeMap::new();
    }

    let mut lowest = f32::INFINITY;
    let mut highest = f32::NEG_INFINITY;
    for &score in scores.values() {
        lowest = lowest.min(score);
        highest = highest.max(score);
    }

    let spread = highest - lowest;
    let degenerate = spread.abs() <= f32::EPSILON;

    let mut normalized = BTreeMap::new();
    for (id, &score) in scores {
        let value = if degenerate {
            1.0
        } else {
            (score - lowest) / spread
        };
        normalized.insert(id.clone(), value);
    }
    normalized
}
1422
1423fn build_snippet(text: &str, normalized_query: &str) -> String {
1424    let terms = tokenize(normalized_query, &TextProcessingOptions::default())
1425        .into_iter()
1426        .map(|token| token.normalized)
1427        .collect::<Vec<_>>();
1428    let text_lower = text.to_lowercase();
1429    let match_start = terms.iter().filter_map(|term| text_lower.find(term)).min();
1430
1431    let snippet = match match_start {
1432        Some(byte_index) => {
1433            let start = text[..byte_index]
1434                .char_indices()
1435                .rev()
1436                .nth(60)
1437                .map(|(index, _)| index)
1438                .unwrap_or(0);
1439            let end = text[byte_index..]
1440                .char_indices()
1441                .nth(160)
1442                .map(|(index, _)| byte_index + index)
1443                .unwrap_or(text.len());
1444            text[start..end].trim()
1445        }
1446        None => text
1447            .char_indices()
1448            .nth(160)
1449            .map(|(index, _)| &text[..index])
1450            .unwrap_or(text)
1451            .trim(),
1452    };
1453    snippet.to_string()
1454}
1455
1456fn invalid_argument(message: impl Into<String>) -> DetectError {
1457    DetectError::InvalidArgument(message.into())
1458}
1459
1460#[cfg(test)]
1461mod tests {
1462    use super::*;
1463    use std::cell::Cell;
1464    use text_embeddings::{
1465        DenseVector, HashedTextEmbedder, TextEmbeddingBackend, TextEmbeddingBackendKind,
1466        TextEmbeddingConfig, TextEmbeddingMetadata,
1467    };
1468
1469    fn embedder() -> HashedTextEmbedder {
1470        HashedTextEmbedder::new(
1471            TextEmbeddingConfig {
1472                dimensions: 32,
1473                use_idf: true,
1474            },
1475            CorpusOptions::default(),
1476        )
1477        .unwrap()
1478    }
1479
    /// Test embedder that can be switched to panic on `embed_text`, used to prove that a
    /// code path does not embed the query.
    #[derive(Debug)]
    struct FlaggedEmbedder {
        /// When `true`, `embed_text` panics instead of producing a vector.
        panic_on_embed: Cell<bool>,
    }
1484
1485    impl FlaggedEmbedder {
1486        fn new() -> Self {
1487            Self {
1488                panic_on_embed: Cell::new(false),
1489            }
1490        }
1491    }
1492
1493    impl TextEmbeddingBackend for FlaggedEmbedder {
1494        fn embed_text(&self, text: &str) -> video_analysis_core::Result<DenseVector> {
1495            if self.panic_on_embed.get() {
1496                panic!("query embedding should have been skipped");
1497            }
1498            DenseVector::new([
1499                text.bytes().filter(|byte| byte % 2 == 0).count() as f32 + 1.0,
1500                text.bytes().filter(|byte| byte % 2 == 1).count() as f32 + 1.0,
1501            ])
1502        }
1503
1504        fn metadata(&self) -> TextEmbeddingMetadata {
1505            TextEmbeddingMetadata {
1506                backend: TextEmbeddingBackendKind::Custom,
1507                model_name: Some("flagged".to_string()),
1508                dimensions: Some(2),
1509                ..TextEmbeddingMetadata::default()
1510            }
1511        }
1512    }
1513
    /// Stateless test reranker; its `TextReranker` impl favours documents containing "second".
    struct FakeReranker;
1515
1516    impl TextReranker for FakeReranker {
1517        fn rerank(
1518            &mut self,
1519            _query: &str,
1520            documents: &[String],
1521        ) -> video_analysis_core::Result<Vec<f32>> {
1522            Ok(documents
1523                .iter()
1524                .map(|document| {
1525                    if document.contains("second") {
1526                        0.95
1527                    } else {
1528                        0.1
1529                    }
1530                })
1531                .collect())
1532        }
1533
1534        fn runtime_backend(&self) -> TextRuntimeBackend {
1535            TextRuntimeBackend::External
1536        }
1537    }
1538
1539    fn query(text: &str) -> SearchQuery {
1540        SearchQuery {
1541            text: text.to_string(),
1542            top_k: 3,
1543            filter: None,
1544            hybrid: HybridConfig::default(),
1545        }
1546    }
1547
1548    #[test]
1549    fn chunking_preserves_order_and_ids() {
1550        let document = SearchDocument {
1551            id: "doc-1".to_string(),
1552            title: None,
1553            body: (0..10)
1554                .map(|index| format!("token{index}"))
1555                .collect::<Vec<_>>()
1556                .join(" "),
1557            metadata: BTreeMap::new(),
1558            source: None,
1559            provenance: Vec::new(),
1560            annotations: Vec::new(),
1561        };
1562
1563        let chunks = chunk_document(
1564            &document,
1565            &IngestionOptions {
1566                chunk_tokens: 4,
1567                chunk_overlap_tokens: 1,
1568                store_raw_text: true,
1569            },
1570            &TextProcessingOptions::default(),
1571        )
1572        .unwrap();
1573
1574        assert_eq!(chunks[0].chunk_id, "doc-1:0");
1575        assert_eq!(chunks[1].chunk_id, "doc-1:1");
1576        assert_eq!(chunks[0].ordinal, 0);
1577        assert_eq!(chunks[1].ordinal, 1);
1578    }
1579
1580    #[test]
1581    fn explicit_chunking_can_use_sentences() {
1582        let document = SearchDocument::new("doc-1", "First sentence. Second sentence.");
1583        let chunks = chunk_search_document(
1584            &document,
1585            &ChunkingOptions {
1586                strategy: ChunkingStrategy::Sentence,
1587                ..ChunkingOptions::default()
1588            },
1589            &TextProcessingOptions::default(),
1590        )
1591        .unwrap();
1592
1593        assert_eq!(chunks.len(), 2);
1594        assert_eq!(chunks[0].text, "First sentence.");
1595        assert_eq!(chunks[1].text, "Second sentence.");
1596    }
1597
    /// Reranking through a context that carries a reranker uses that reranker's scores and
    /// reports its runtime and the context's model id on the results.
    #[test]
    fn rerank_context_uses_backend_scores() {
        let mut reranker = FakeReranker;
        let mut context = RerankExecutionContext {
            reranker: Some(&mut reranker),
            model_id: Some("fake-reranker".to_string()),
        };

        let response = rerank_documents_with_context(
            RerankRequest {
                query: "pick second".to_string(),
                documents: vec!["first document".to_string(), "second document".to_string()],
                top_k: 2,
                imported_scores: Vec::new(),
            },
            &mut context,
        )
        .unwrap();

        // `FakeReranker` scores the document containing "second" highest, so input index 1
        // must come first.
        assert_eq!(response.results[0].index, 1);
        assert_eq!(response.results[0].runtime.as_deref(), Some("external"));
        assert_eq!(
            response.results[0].model_id.as_deref(),
            Some("fake-reranker")
        );
    }
1624
1625    #[test]
1626    fn empty_query_returns_error() {
1627        let index = RetrievalIndex::new(embedder());
1628        let err = index.search(&query("   ")).unwrap_err();
1629        assert!(matches!(err, DetectError::InvalidArgument(message) if message.contains("query")));
1630    }
1631
    /// Converting a text segment contract into a search document keeps its stream, language,
    /// timing and attribute data, and that data survives chunking back into contracts.
    #[test]
    fn search_document_from_text_segment_contract_preserves_metadata() {
        let mut segment = TextSegmentContract::new(7, "hello retrieval");
        segment.stream_id = Some("subs".to_string());
        segment.language = Some("en".to_string());
        segment.timestamp = Some(
            video_analysis_core::Timestamp::new(1250, video_analysis_core::Timebase::new(1, 1000))
                .into(),
        );
        segment.duration_seconds = Some(1.25);
        segment
            .attributes
            .insert("speaker".to_string(), "narrator".to_string());
        segment
            .attributes
            .insert("confidence".to_string(), "0.8".to_string());
        segment
            .attributes
            .insert("source".to_string(), "fixture.srt".to_string());

        let document = SearchDocument::from_text_segment_contract(&segment);

        // The document id combines the stream id and the segment index.
        assert_eq!(document.id, "subs:7");
        assert_eq!(document.body, "hello retrieval");
        assert_eq!(document.metadata["language"], "en");
        assert_eq!(document.metadata["speaker"], "narrator");
        assert_eq!(document.metadata["confidence"], "0.8");
        assert_eq!(document.metadata["timestamp_seconds"], "1.25");
        assert_eq!(document.metadata["duration_seconds"], "1.25");
        assert_eq!(document.metadata["source"], "fixture.srt");

        // Round-trip: the first chunk must expose the same timing and attributes.
        let chunk = chunk_search_document(
            &document,
            &ChunkingOptions::default(),
            &TextProcessingOptions::default(),
        )
        .unwrap()
        .remove(0);
        let chunk_document = chunk.to_text_document_contract();
        let chunk_segment = chunk.to_text_segment_contract();
        assert_eq!(chunk_document.source.unwrap().duration_seconds, Some(1.25));
        assert_eq!(chunk_segment.duration_seconds, Some(1.25));
        assert_eq!(chunk_segment.attributes["speaker"], "narrator");
    }
1676
    /// Re-ingesting a document under an existing id replaces all of its chunks: the report
    /// counts one replacement and none of the original chunk texts remain.
    #[test]
    fn replacing_document_rebuilds_chunk_set() {
        let mut index = RetrievalIndex::new(embedder());
        let options = IngestionOptions {
            chunk_tokens: 3,
            chunk_overlap_tokens: 1,
            store_raw_text: true,
        };

        index
            .ingest_documents(
                &[SearchDocument {
                    id: "doc".to_string(),
                    title: None,
                    body: "rust cargo crates docs search".to_string(),
                    metadata: BTreeMap::new(),
                    source: None,
                    provenance: Vec::new(),
                    annotations: Vec::new(),
                }],
                &options,
            )
            .unwrap();
        // Remember the first version's chunk texts to prove they are gone afterwards.
        let original_texts = index
            .chunks
            .values()
            .map(|chunk| chunk.text.clone())
            .collect::<Vec<_>>();

        let report = index
            .ingest_documents(
                &[SearchDocument {
                    id: "doc".to_string(),
                    title: None,
                    body: "music playlists and recommendations".to_string(),
                    metadata: BTreeMap::new(),
                    source: None,
                    provenance: Vec::new(),
                    annotations: Vec::new(),
                }],
                &options,
            )
            .unwrap();

        assert_eq!(report.documents_replaced, 1);
        assert!(index
            .chunks
            .keys()
            .all(|chunk_id| chunk_id.starts_with("doc:")));
        assert!(index.chunks.values().all(|chunk| !original_texts
            .iter()
            .any(|original| original == &chunk.text)));
    }
1730
    /// With the default hybrid configuration, the document that shares terms with the query
    /// ranks first and carries a positive semantic score.
    #[test]
    fn hybrid_search_combines_bm25_and_vectors() {
        let mut index = RetrievalIndex::new(embedder());
        index
            .ingest_documents(
                &[
                    SearchDocument {
                        id: "doc-1".to_string(),
                        title: Some("Rust Search".to_string()),
                        body: "Rust cargo crates enable semantic search over documentation."
                            .to_string(),
                        metadata: BTreeMap::from([
                            ("lang".to_string(), "en".to_string()),
                            ("tags".to_string(), "docs rust".to_string()),
                        ]),
                        source: None,
                        provenance: Vec::new(),
                        annotations: Vec::new(),
                    },
                    SearchDocument {
                        id: "doc-2".to_string(),
                        title: None,
                        body: "Berlin travel plans and restaurant notes.".to_string(),
                        metadata: BTreeMap::from([("lang".to_string(), "en".to_string())]),
                        source: None,
                        provenance: Vec::new(),
                        annotations: Vec::new(),
                    },
                ],
                &IngestionOptions::default(),
            )
            .unwrap();

        let results = index.search(&query("rust documentation search")).unwrap();

        assert_eq!(results[0].document_id, "doc-1");
        assert!(results[0].semantic_score > 0.0);
    }
1769
1770    #[test]
1771    fn full_text_retrieval_skips_query_embedding() {
1772        let mut index = RetrievalIndex::new(FlaggedEmbedder::new());
1773        index
1774            .ingest_documents(
1775                &[SearchDocument {
1776                    id: "doc-1".to_string(),
1777                    title: None,
1778                    body: "rust cargo full text search".to_string(),
1779                    metadata: BTreeMap::new(),
1780                    source: None,
1781                    provenance: Vec::new(),
1782                    annotations: Vec::new(),
1783                }],
1784                &IngestionOptions::default(),
1785            )
1786            .unwrap();
1787        index.embedder().panic_on_embed.set(true);
1788
1789        let results = index
1790            .search(&SearchQuery::full_text("cargo search", 1))
1791            .unwrap();
1792
1793        assert_eq!(results[0].document_id, "doc-1");
1794        assert_eq!(results[0].semantic_score, 0.0);
1795        assert!(results[0].lexical_score > 0.0);
1796    }
1797
1798    #[test]
1799    fn semantic_search_does_not_require_lexical_matches() {
1800        let mut index = RetrievalIndex::new(embedder());
1801        index
1802            .ingest_documents(
1803                &[SearchDocument {
1804                    id: "doc-1".to_string(),
1805                    title: None,
1806                    body: "alpha beta gamma".to_string(),
1807                    metadata: BTreeMap::new(),
1808                    source: None,
1809                    provenance: Vec::new(),
1810                    annotations: Vec::new(),
1811                }],
1812                &IngestionOptions::default(),
1813            )
1814            .unwrap();
1815
1816        let results = index
1817            .search(&SearchQuery::semantic("unshared tokens", 1))
1818            .unwrap();
1819
1820        assert_eq!(results[0].document_id, "doc-1");
1821        assert_eq!(results[0].lexical_score, 0.0);
1822        assert!(results[0].semantic_score > 0.0);
1823    }
1824
1825    #[test]
1826    fn post_filter_search_expands_candidates_before_filtering() {
1827        let mut index = RetrievalIndex::new(embedder());
1828        let mut docs = Vec::new();
1829        for number in 0..40 {
1830            docs.push(SearchDocument {
1831                id: format!("doc-{number:02}"),
1832                title: None,
1833                body: "common search text".to_string(),
1834                metadata: BTreeMap::from([(
1835                    "note".to_string(),
1836                    if number == 39 {
1837                        "contains-special-target".to_string()
1838                    } else {
1839                        "ordinary".to_string()
1840                    },
1841                )]),
1842                source: None,
1843                provenance: Vec::new(),
1844                annotations: Vec::new(),
1845            });
1846        }
1847        index
1848            .ingest_documents(&docs, &IngestionOptions::default())
1849            .unwrap();
1850
1851        let filter = SearchFilter {
1852            metadata_contains: BTreeMap::from([("note".to_string(), "special".to_string())]),
1853            document_ids: BTreeSet::from(["doc-39".to_string()]),
1854            ..SearchFilter::default()
1855        };
1856        let results = index
1857            .search(&SearchQuery::full_text("common", 1).filter(filter))
1858            .unwrap();
1859
1860        assert_eq!(results.len(), 1);
1861        assert_eq!(results[0].document_id, "doc-39");
1862    }
1863
1864    #[test]
1865    fn filters_exclude_non_matching_chunks() {
1866        let mut index = RetrievalIndex::new(embedder());
1867        index
1868            .ingest_documents(
1869                &[
1870                    SearchDocument {
1871                        id: "doc-1".to_string(),
1872                        title: None,
1873                        body: "English cargo docs".to_string(),
1874                        metadata: BTreeMap::from([
1875                            ("lang".to_string(), "en".to_string()),
1876                            ("tags".to_string(), "docs".to_string()),
1877                        ]),
1878                        source: None,
1879                        provenance: Vec::new(),
1880                        annotations: Vec::new(),
1881                    },
1882                    SearchDocument {
1883                        id: "doc-2".to_string(),
1884                        title: None,
1885                        body: "German cargo docs".to_string(),
1886                        metadata: BTreeMap::from([
1887                            ("lang".to_string(), "de".to_string()),
1888                            ("tags".to_string(), "docs".to_string()),
1889                        ]),
1890                        source: None,
1891                        provenance: Vec::new(),
1892                        annotations: Vec::new(),
1893                    },
1894                ],
1895                &IngestionOptions::default(),
1896            )
1897            .unwrap();
1898
1899        let mut query = query("cargo docs");
1900        query.filter = Some(SearchFilter {
1901            metadata_equals: BTreeMap::from([("lang".to_string(), "en".to_string())]),
1902            metadata_contains: BTreeMap::new(),
1903            required_tags: vec!["docs".to_string()],
1904            document_ids: BTreeSet::new(),
1905        });
1906
1907        let results = index.search(&query).unwrap();
1908        assert_eq!(results.len(), 1);
1909        assert_eq!(results[0].document_id, "doc-1");
1910    }
1911
1912    #[test]
1913    fn related_chunks_excludes_self() {
1914        let mut index = RetrievalIndex::new(embedder());
1915        index
1916            .ingest_documents(
1917                &[SearchDocument {
1918                    id: "doc-1".to_string(),
1919                    title: None,
1920                    body: "rust cargo crates semantic search documentation indexing retrieval"
1921                        .to_string(),
1922                    metadata: BTreeMap::new(),
1923                    source: None,
1924                    provenance: Vec::new(),
1925                    annotations: Vec::new(),
1926                }],
1927                &IngestionOptions {
1928                    chunk_tokens: 4,
1929                    chunk_overlap_tokens: 1,
1930                    store_raw_text: true,
1931                },
1932            )
1933            .unwrap();
1934
1935        let related = index.related_chunks("doc-1:0", 2).unwrap();
1936        assert!(related.iter().all(|result| result.chunk_id != "doc-1:0"));
1937    }
1938
1939    #[test]
1940    fn from_parts_validates_dimension_mismatch() {
1941        let chunks = vec![DocumentChunk {
1942            chunk_id: "doc:0".to_string(),
1943            document_id: "doc".to_string(),
1944            text: "rust cargo docs".to_string(),
1945            ordinal: 0,
1946            metadata: BTreeMap::new(),
1947            source: None,
1948            provenance: Vec::new(),
1949            annotations: Vec::new(),
1950        }];
1951        let records = vec![SerializableVectorRecord {
1952            id: "doc:0".into(),
1953            vector: vec![1.0, 0.0],
1954            payload: VectorRecordMetadata::default(),
1955        }];
1956
1957        let err = RetrievalIndex::from_parts(
1958            embedder(),
1959            CorpusOptions::default(),
1960            Bm25Options::default(),
1961            chunks,
1962            BTreeMap::new(),
1963            records,
1964        )
1965        .unwrap_err();
1966
1967        assert!(
1968            matches!(err, DetectError::InvalidArgument(message) if message.contains("dimensions"))
1969        );
1970    }
1971}
text_retrieval/lib.rs

text_retrieval/
lib.rs