1#![doc = include_str!("../README.md")]
2
3pub mod surface;
4use std::collections::{BTreeMap, BTreeSet};
5
6mod storage;
7
8use serde::{Deserialize, Serialize};
9use text_core::{
10 split_paragraphs, split_sentence_spans, tokenize, TextAnnotationSpan, TextDocument,
11 TextDocumentContract, TextProcessingOptions, TextProvenance, TextSegmentContract,
12 TextSourceRef, TextSpan, TokenKind,
13};
14use text_embeddings::{EmbeddingModelInfo, TextEmbedderBackend};
15use text_lexical::{Bm25Corpus, Bm25Options, CorpusOptions, TextCorpus, TextCorpusDocument};
16use text_model_runtime::{TextReranker, TextRuntimeBackend};
17use vector_analysis_index::{
18 SerializableVectorRecord, VectorRecord, VectorRecordMetadata, VectorSearchFilter,
19 VectorSearchIndex,
20};
21use video_analysis_core::{DetectError, Result as CoreResult};
22
23pub use storage::*;
24
25const TITLE_METADATA_KEY: &str = "__title";
26
27#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
28pub struct SearchDocument {
30 pub id: String,
32 #[serde(default)]
34 pub title: Option<String>,
35 pub body: String,
37 #[serde(default)]
39 pub metadata: BTreeMap<String, String>,
40 #[serde(default)]
42 pub source: Option<TextSourceRef>,
43 #[serde(default)]
45 pub provenance: Vec<TextProvenance>,
46 #[serde(default)]
48 pub annotations: Vec<TextAnnotationSpan>,
49}
50
51impl SearchDocument {
52 pub fn new(id: impl Into<String>, body: impl Into<String>) -> Self {
54 Self {
55 id: id.into(),
56 title: None,
57 body: body.into(),
58 metadata: BTreeMap::new(),
59 source: None,
60 provenance: Vec::new(),
61 annotations: Vec::new(),
62 }
63 }
64
65 pub fn from_text_document(document: &TextDocument<'_>) -> Self {
67 let mut metadata = BTreeMap::new();
68 if let Some(language) = document.language {
69 metadata.insert("language".to_string(), language.to_string());
70 }
71 if let Some(timestamp) = document.timestamp {
72 metadata.insert(
73 "timestamp_seconds".to_string(),
74 timestamp.seconds().to_string(),
75 );
76 }
77 Self {
78 id: document.id.to_string(),
79 title: None,
80 body: document.text.to_string(),
81 metadata,
82 source: document.timestamp.map(|timestamp| TextSourceRef {
83 source_id: None,
84 source_kind: Some("text_document".to_string()),
85 uri: None,
86 media_timestamp: Some(timestamp.into()),
87 duration_seconds: None,
88 }),
89 provenance: Vec::new(),
90 annotations: Vec::new(),
91 }
92 }
93
94 pub fn from_text_document_contract(document: &TextDocumentContract) -> Self {
96 let mut metadata = document.attributes.clone();
97 if let Some(language) = &document.language {
98 metadata.insert("language".to_string(), language.clone());
99 }
100 if let Some(timestamp) = document.timestamp {
101 metadata.insert(
102 "timestamp_seconds".to_string(),
103 timestamp.seconds().to_string(),
104 );
105 }
106 Self {
107 id: document.id.clone(),
108 title: None,
109 body: document.text.clone(),
110 metadata,
111 source: document.source.clone(),
112 provenance: document.provenance.clone(),
113 annotations: document.annotations.clone(),
114 }
115 }
116
117 pub fn from_text_segment_contract(segment: &TextSegmentContract) -> Self {
119 let mut metadata = segment.attributes.clone();
120 if let Some(language) = &segment.language {
121 metadata.insert("language".to_string(), language.clone());
122 }
123 if let Some(timestamp) = segment.timestamp {
124 metadata.insert(
125 "timestamp_seconds".to_string(),
126 timestamp.seconds().to_string(),
127 );
128 }
129 if let Some(duration_seconds) = segment.duration_seconds {
130 metadata.insert("duration_seconds".to_string(), duration_seconds.to_string());
131 }
132 Self {
133 id: segment
134 .document_id()
135 .unwrap_or_else(|| segment.segment_index.to_string()),
136 title: None,
137 body: segment.text.clone(),
138 metadata,
139 source: segment.source.clone().or_else(|| {
140 (segment.timestamp.is_some() || segment.duration_seconds.is_some()).then(|| {
141 TextSourceRef {
142 source_id: segment.stream_id.clone(),
143 source_kind: Some("text_segment".to_string()),
144 uri: None,
145 media_timestamp: segment.timestamp,
146 duration_seconds: segment.duration_seconds,
147 }
148 })
149 }),
150 provenance: segment.provenance.clone(),
151 annotations: segment.annotations.clone(),
152 }
153 }
154
155 pub fn from_text_corpus_document(document: &TextCorpusDocument) -> Self {
157 let mut metadata = document.metadata.clone();
158 if let Some(language) = &document.language {
159 metadata.insert("language".to_string(), language.clone());
160 }
161 if let Some(timestamp) = document.timestamp {
162 metadata.insert(
163 "timestamp_seconds".to_string(),
164 timestamp.seconds().to_string(),
165 );
166 }
167 if let Some(duration_seconds) = document
168 .source
169 .as_ref()
170 .and_then(|source| source.duration_seconds)
171 {
172 metadata.insert("duration_seconds".to_string(), duration_seconds.to_string());
173 }
174 Self {
175 id: document.id.clone(),
176 title: None,
177 body: document.text.clone(),
178 metadata,
179 source: document.source.clone(),
180 provenance: document.provenance.clone(),
181 annotations: document.annotations.clone(),
182 }
183 }
184
185 pub fn from_text_corpus(corpus: &TextCorpus) -> Vec<Self> {
187 corpus
188 .documents
189 .iter()
190 .map(Self::from_text_corpus_document)
191 .collect()
192 }
193}
194
195pub trait IntoSearchDocument {
196 fn into_search_document(self) -> SearchDocument;
197}
198
199impl IntoSearchDocument for TextDocument<'_> {
200 fn into_search_document(self) -> SearchDocument {
201 SearchDocument::from_text_document(&self)
202 }
203}
204
205impl IntoSearchDocument for &TextDocument<'_> {
206 fn into_search_document(self) -> SearchDocument {
207 SearchDocument::from_text_document(self)
208 }
209}
210
211impl IntoSearchDocument for TextDocumentContract {
212 fn into_search_document(self) -> SearchDocument {
213 SearchDocument::from_text_document_contract(&self)
214 }
215}
216
217impl IntoSearchDocument for &TextDocumentContract {
218 fn into_search_document(self) -> SearchDocument {
219 SearchDocument::from_text_document_contract(self)
220 }
221}
222
223impl IntoSearchDocument for TextSegmentContract {
224 fn into_search_document(self) -> SearchDocument {
225 SearchDocument::from_text_segment_contract(&self)
226 }
227}
228
229impl IntoSearchDocument for &TextSegmentContract {
230 fn into_search_document(self) -> SearchDocument {
231 SearchDocument::from_text_segment_contract(self)
232 }
233}
234
235#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
236pub struct DocumentChunk {
238 pub chunk_id: String,
240 pub document_id: String,
242 pub text: String,
244 pub ordinal: usize,
246 pub metadata: BTreeMap<String, String>,
248 #[serde(default)]
250 pub source: Option<TextSourceRef>,
251 #[serde(default)]
253 pub provenance: Vec<TextProvenance>,
254 #[serde(default)]
256 pub annotations: Vec<TextAnnotationSpan>,
257}
258
259impl DocumentChunk {
260 pub fn to_text_document_contract(&self) -> TextDocumentContract {
262 TextDocumentContract {
263 id: self.chunk_id.clone(),
264 text: self.text.clone(),
265 language: self.metadata.get("language").cloned(),
266 timestamp: self
267 .source
268 .as_ref()
269 .and_then(|source| source.media_timestamp),
270 attributes: self.metadata.clone(),
271 source: self.source.clone(),
272 provenance: self.provenance.clone(),
273 annotations: self.annotations.clone(),
274 }
275 }
276
277 pub fn to_text_segment_contract(&self) -> TextSegmentContract {
279 TextSegmentContract {
280 stream_id: Some(self.document_id.clone()),
281 segment_index: self.ordinal as u64,
282 text: self.text.clone(),
283 language: self.metadata.get("language").cloned(),
284 timestamp: self
285 .source
286 .as_ref()
287 .and_then(|source| source.media_timestamp),
288 duration_seconds: self
289 .source
290 .as_ref()
291 .and_then(|source| source.duration_seconds),
292 is_final: true,
293 attributes: self.metadata.clone(),
294 source: self.source.clone(),
295 provenance: self.provenance.clone(),
296 annotations: self.annotations.clone(),
297 }
298 }
299}
300
301#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
302pub struct IngestionOptions {
304 pub chunk_tokens: usize,
306 pub chunk_overlap_tokens: usize,
308 pub store_raw_text: bool,
310}
311
312#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
313pub enum ChunkingStrategy {
315 TokenWindow,
317 Sentence,
319 Paragraph,
321}
322
323#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
324pub struct ChunkingOptions {
326 pub strategy: ChunkingStrategy,
328 pub ingestion: IngestionOptions,
330}
331
332impl Default for ChunkingOptions {
333 fn default() -> Self {
334 Self {
335 strategy: ChunkingStrategy::TokenWindow,
336 ingestion: IngestionOptions::default(),
337 }
338 }
339}
340
341impl Default for IngestionOptions {
342 fn default() -> Self {
343 Self {
344 chunk_tokens: 256,
345 chunk_overlap_tokens: 32,
346 store_raw_text: true,
347 }
348 }
349}
350
351pub fn chunk_search_document(
353 document: &SearchDocument,
354 options: &ChunkingOptions,
355 processing: &TextProcessingOptions,
356) -> CoreResult<Vec<DocumentChunk>> {
357 validate_document(document)?;
358 match options.strategy {
359 ChunkingStrategy::TokenWindow => chunk_document(document, &options.ingestion, processing),
360 ChunkingStrategy::Sentence => chunk_spans(
361 document,
362 split_sentence_spans(&document.body, processing)
363 .into_iter()
364 .map(|sentence| sentence.span),
365 ),
366 ChunkingStrategy::Paragraph => chunk_spans(
367 document,
368 split_paragraphs(&document.body)
369 .into_iter()
370 .map(|paragraph| paragraph.span),
371 ),
372 }
373}
374
375#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
376pub struct SearchFilter {
378 pub metadata_equals: BTreeMap<String, String>,
380 pub metadata_contains: BTreeMap<String, String>,
382 pub required_tags: Vec<String>,
384 pub document_ids: BTreeSet<String>,
386}
387
388#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
389pub enum RetrievalMode {
391 FullText,
393 Semantic,
395 Hybrid,
397}
398
399#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
400pub struct HybridConfig {
402 pub semantic_weight: f32,
404 pub lexical_weight: f32,
406 pub rerank_window: usize,
408 pub rerank: bool,
410}
411
412impl Default for HybridConfig {
413 fn default() -> Self {
414 Self {
415 semantic_weight: 0.8,
416 lexical_weight: 0.2,
417 rerank_window: 64,
418 rerank: false,
419 }
420 }
421}
422
423#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
424pub struct SearchQuery {
426 pub text: String,
428 pub top_k: usize,
430 pub filter: Option<SearchFilter>,
432 pub hybrid: HybridConfig,
434}
435
436impl SearchQuery {
437 pub fn new(text: impl Into<String>, top_k: usize) -> Self {
439 Self {
440 text: text.into(),
441 top_k,
442 filter: None,
443 hybrid: HybridConfig::default(),
444 }
445 }
446
447 pub fn full_text(text: impl Into<String>, top_k: usize) -> Self {
449 Self::new(text, top_k).mode(RetrievalMode::FullText)
450 }
451
452 pub fn semantic(text: impl Into<String>, top_k: usize) -> Self {
454 Self::new(text, top_k).mode(RetrievalMode::Semantic)
455 }
456
457 pub fn hybrid(text: impl Into<String>, top_k: usize, config: HybridConfig) -> Self {
459 Self {
460 text: text.into(),
461 top_k,
462 filter: None,
463 hybrid: config,
464 }
465 }
466
467 pub fn filter(mut self, filter: SearchFilter) -> Self {
469 self.filter = Some(filter);
470 self
471 }
472
473 pub fn mode(mut self, mode: RetrievalMode) -> Self {
475 match mode {
476 RetrievalMode::FullText => {
477 self.hybrid.semantic_weight = 0.0;
478 self.hybrid.lexical_weight = 1.0;
479 }
480 RetrievalMode::Semantic => {
481 self.hybrid.semantic_weight = 1.0;
482 self.hybrid.lexical_weight = 0.0;
483 }
484 RetrievalMode::Hybrid => {
485 if self.hybrid.semantic_weight <= f32::EPSILON {
486 self.hybrid.semantic_weight = HybridConfig::default().semantic_weight;
487 }
488 if self.hybrid.lexical_weight <= f32::EPSILON {
489 self.hybrid.lexical_weight = HybridConfig::default().lexical_weight;
490 }
491 }
492 }
493 self
494 }
495
496 pub fn retrieval_mode(&self) -> RetrievalMode {
498 let semantic = self.hybrid.semantic_weight > f32::EPSILON;
499 let lexical = self.hybrid.lexical_weight > f32::EPSILON;
500 match (semantic, lexical) {
501 (false, true) => RetrievalMode::FullText,
502 (true, false) => RetrievalMode::Semantic,
503 _ => RetrievalMode::Hybrid,
504 }
505 }
506}
507
508#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
510#[serde(rename_all = "camelCase")]
511pub struct RerankRequest {
512 pub query: String,
514 pub documents: Vec<String>,
516 #[serde(default = "default_rerank_top_k")]
518 pub top_k: usize,
519 #[serde(default)]
521 pub imported_scores: Vec<f32>,
522}
523
524#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
526#[serde(rename_all = "camelCase")]
527pub struct RerankResult {
528 pub index: usize,
530 pub document: String,
532 pub score: f32,
534 #[serde(default)]
536 pub runtime: Option<String>,
537 #[serde(default)]
539 pub model_id: Option<String>,
540}
541
542#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
544#[serde(rename_all = "camelCase")]
545pub struct RerankResponse {
546 pub accepted: bool,
548 pub operation: String,
550 pub query: String,
552 pub results: Vec<RerankResult>,
554 #[serde(default)]
556 pub runtime: Option<String>,
557 #[serde(default)]
559 pub model_id: Option<String>,
560}
561
562#[derive(Default)]
564pub struct RerankExecutionContext<'a> {
565 pub reranker: Option<&'a mut dyn TextReranker>,
567 pub model_id: Option<String>,
569}
570
571pub fn rerank_documents(request: RerankRequest) -> CoreResult<RerankResponse> {
573 rerank_documents_with_context(request, &mut RerankExecutionContext::default())
574}
575
576pub fn rerank_documents_with_context(
578 request: RerankRequest,
579 context: &mut RerankExecutionContext<'_>,
580) -> CoreResult<RerankResponse> {
581 if request.query.trim().is_empty() {
582 return Err(DetectError::InvalidArgument(
583 "rerank request must include a non-empty query".to_string(),
584 ));
585 }
586 if request.documents.is_empty() {
587 return Err(DetectError::InvalidArgument(
588 "rerank request must include at least one document".to_string(),
589 ));
590 }
591 let backend_scores = if let Some(reranker) = context.reranker.as_deref_mut() {
592 let runtime = rerank_runtime_name(reranker.runtime_backend());
593 Some((
594 runtime,
595 reranker.rerank(&request.query, &request.documents)?,
596 ))
597 } else {
598 None
599 };
600 let mut results = request
601 .documents
602 .iter()
603 .enumerate()
604 .map(|(index, document)| {
605 let (score, runtime) = if let Some((runtime, scores)) = &backend_scores {
606 (
607 scores.get(index).copied().unwrap_or(0.0),
608 Some(runtime.clone()),
609 )
610 } else {
611 (
612 request
613 .imported_scores
614 .get(index)
615 .copied()
616 .unwrap_or_else(|| lexical_overlap_score(&request.query, document)),
617 None,
618 )
619 };
620 RerankResult {
621 index,
622 document: document.clone(),
623 score,
624 runtime,
625 model_id: context.model_id.clone(),
626 }
627 })
628 .collect::<Vec<_>>();
629 results.sort_by(|left, right| {
630 right
631 .score
632 .total_cmp(&left.score)
633 .then_with(|| left.index.cmp(&right.index))
634 });
635 results.truncate(request.top_k.max(1));
636 Ok(RerankResponse {
637 accepted: true,
638 operation: "rerank".to_string(),
639 query: request.query,
640 results,
641 runtime: backend_scores.map(|(runtime, _)| runtime),
642 model_id: context.model_id.clone(),
643 })
644}
645
646fn rerank_runtime_name(runtime: TextRuntimeBackend) -> String {
647 match runtime {
648 TextRuntimeBackend::Candle => "candle",
649 TextRuntimeBackend::Onnx => "onnx",
650 TextRuntimeBackend::Tokenizers => "tokenizers",
651 TextRuntimeBackend::CudaOxide => "cuda_oxide",
652 TextRuntimeBackend::External => "external",
653 TextRuntimeBackend::Heuristic => "heuristic",
654 }
655 .to_string()
656}
657
658fn lexical_overlap_score(query: &str, document: &str) -> f32 {
659 let processing = TextProcessingOptions::default();
660 let query_terms = tokenize(query, &processing)
661 .into_iter()
662 .filter_map(|token| match token.kind {
663 TokenKind::Word => Some(token.normalized),
664 _ => None,
665 })
666 .collect::<BTreeSet<_>>();
667 if query_terms.is_empty() {
668 return 0.0;
669 }
670 let document_terms = tokenize(document, &processing)
671 .into_iter()
672 .filter_map(|token| match token.kind {
673 TokenKind::Word => Some(token.normalized),
674 _ => None,
675 })
676 .collect::<BTreeSet<_>>();
677 let overlap = query_terms
678 .iter()
679 .filter(|term| document_terms.contains(term.as_str()))
680 .count();
681 overlap as f32 / query_terms.len() as f32
682}
683
684fn rerank_search_results(
685 query: &str,
686 results: &mut Vec<SearchResult>,
687 top_k: usize,
688 context: &mut RerankExecutionContext<'_>,
689) -> CoreResult<()> {
690 let candidate_count = results.len();
691 if candidate_count == 0 {
692 return Ok(());
693 }
694 let request = RerankRequest {
695 query: query.to_string(),
696 documents: results
697 .iter()
698 .map(|result| result.snippet.clone())
699 .collect(),
700 top_k: top_k.max(1),
701 imported_scores: results.iter().map(|result| result.score).collect(),
702 };
703 let response = rerank_documents_with_context(request, context)?;
704 let mut ranked = Vec::new();
705 for reranked in response.results {
706 if let Some(mut result) = results.get(reranked.index).cloned() {
707 result.score = reranked.score;
708 if let Some(runtime) = reranked.runtime {
709 result
710 .metadata
711 .insert("rerank_runtime".to_string(), runtime);
712 }
713 if let Some(model_id) = reranked.model_id {
714 result
715 .metadata
716 .insert("rerank_model_id".to_string(), model_id);
717 }
718 ranked.push(result);
719 }
720 }
721 *results = ranked;
722 if results.len() > candidate_count {
723 results.truncate(candidate_count);
724 }
725 Ok(())
726}
727
/// Result limit used when a serialized rerank request omits `topK`.
fn default_rerank_top_k() -> usize {
    const DEFAULT_RERANK_TOP_K: usize = 3;
    DEFAULT_RERANK_TOP_K
}
731
732#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
733pub struct SearchResult {
735 pub chunk_id: String,
737 pub document_id: String,
739 pub score: f32,
741 pub semantic_score: f32,
743 pub lexical_score: f32,
745 pub snippet: String,
747 pub metadata: BTreeMap<String, String>,
749}
750
751#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
752pub struct IngestReport {
754 pub documents_received: usize,
756 pub documents_replaced: usize,
758 pub documents_skipped: usize,
760 pub chunks_indexed: usize,
762}
763
764#[derive(Debug, Clone, PartialEq)]
765pub struct RetrievalIndex<B> {
767 embedder: B,
768 corpus_options: CorpusOptions,
769 bm25_options: Bm25Options,
770 chunks: BTreeMap<String, DocumentChunk>,
771 raw_text_by_chunk_id: BTreeMap<String, String>,
772 document_chunks: BTreeMap<String, Vec<String>>,
773 vectors: VectorSearchIndex,
774 bm25: Bm25Corpus,
775}
776
777impl<B: TextEmbedderBackend> RetrievalIndex<B> {
778 pub fn new(embedder: B) -> Self {
780 Self::with_options(embedder, CorpusOptions::default(), Bm25Options::default())
781 }
782
783 pub fn with_options(
785 embedder: B,
786 corpus_options: CorpusOptions,
787 bm25_options: Bm25Options,
788 ) -> Self {
789 Self {
790 embedder,
791 bm25: Bm25Corpus::new(bm25_options.clone()),
792 corpus_options,
793 bm25_options,
794 chunks: BTreeMap::new(),
795 raw_text_by_chunk_id: BTreeMap::new(),
796 document_chunks: BTreeMap::new(),
797 vectors: VectorSearchIndex::new(),
798 }
799 }
800
801 pub fn from_parts(
803 embedder: B,
804 corpus_options: CorpusOptions,
805 bm25_options: Bm25Options,
806 chunks: Vec<DocumentChunk>,
807 raw_text_by_chunk_id: BTreeMap<String, String>,
808 vector_records: Vec<SerializableVectorRecord>,
809 ) -> CoreResult<Self> {
810 let mut index = Self::with_options(embedder, corpus_options, bm25_options);
811 for chunk in chunks {
812 index.insert_chunk_state(chunk);
813 }
814 index.raw_text_by_chunk_id = raw_text_by_chunk_id;
815 index.vectors = VectorSearchIndex::import_records(vector_records)?;
816 index.rebuild_bm25()?;
817 index.validate_vector_state()?;
818 Ok(index)
819 }
820
821 pub fn embedder(&self) -> &B {
823 &self.embedder
824 }
825
826 pub fn embedder_info(&self) -> EmbeddingModelInfo {
828 self.embedder.model_info()
829 }
830
831 pub fn corpus_options(&self) -> &CorpusOptions {
833 &self.corpus_options
834 }
835
836 pub fn bm25_options(&self) -> &Bm25Options {
838 &self.bm25_options
839 }
840
841 pub fn chunks(&self) -> Vec<&DocumentChunk> {
843 self.chunks.values().collect()
844 }
845
846 pub fn chunks_iter(&self) -> impl Iterator<Item = &DocumentChunk> {
848 self.chunks.values()
849 }
850
851 pub fn chunk(&self, chunk_id: &str) -> Option<&DocumentChunk> {
853 self.chunks.get(chunk_id)
854 }
855
856 pub fn raw_text(&self, chunk_id: &str) -> Option<&str> {
858 self.raw_text_by_chunk_id.get(chunk_id).map(String::as_str)
859 }
860
861 pub fn vector_records(&self) -> Vec<SerializableVectorRecord> {
863 self.vectors.export_records()
864 }
865
866 pub fn ingest_documents(
868 &mut self,
869 docs: &[SearchDocument],
870 options: &IngestionOptions,
871 ) -> CoreResult<IngestReport> {
872 validate_ingestion_options(options)?;
873
874 let mut replaced = 0;
875 let mut skipped = 0;
876 for document in docs {
877 validate_document(document)?;
878 if self.document_chunks.contains_key(&document.id) {
879 self.remove_document(&document.id);
880 replaced += 1;
881 }
882
883 let chunks = chunk_document(document, options, &self.corpus_options.processing)?;
884 if chunks.is_empty() {
885 skipped += 1;
886 continue;
887 }
888
889 for chunk in chunks {
890 if options.store_raw_text {
891 self.raw_text_by_chunk_id
892 .insert(chunk.chunk_id.clone(), chunk.text.clone());
893 }
894 self.insert_chunk_state(chunk);
895 }
896 }
897
898 self.rebuild_indices()?;
899 Ok(IngestReport {
900 documents_received: docs.len(),
901 documents_replaced: replaced,
902 documents_skipped: skipped,
903 chunks_indexed: self.chunks.len(),
904 })
905 }
906
907 pub fn search(&self, query: &SearchQuery) -> CoreResult<Vec<SearchResult>> {
909 self.search_with_context(query, &mut RerankExecutionContext::default())
910 }
911
912 pub fn search_with_context(
914 &self,
915 query: &SearchQuery,
916 context: &mut RerankExecutionContext<'_>,
917 ) -> CoreResult<Vec<SearchResult>> {
918 validate_query(query)?;
919 self.ensure_non_empty()?;
920
921 let normalized_query = normalize_query(&query.text, &self.corpus_options.processing)?;
922 let filter = query.filter.as_ref();
923 let overfetch = query
924 .hybrid
925 .rerank_window
926 .max(query.top_k)
927 .max(query.top_k * 4);
928 let post_filter_limit = if filter.is_some_and(requires_post_filter) {
929 self.chunks.len()
930 } else {
931 overfetch
932 };
933
934 let semantic_hits = if query.hybrid.semantic_weight > f32::EPSILON {
935 let query_vector = self.embedder.embed_text(&normalized_query)?;
936 self.validate_query_dimensions(query_vector.dimensions())?;
937 self.vectors.search_filtered(
938 query_vector.as_slice(),
939 post_filter_limit,
940 filter.map(search_filter_to_vector_filter).as_ref(),
941 )?
942 } else {
943 Vec::new()
944 };
945 let lexical_hits = if query.hybrid.lexical_weight > f32::EPSILON {
946 self.bm25.search(&normalized_query, post_filter_limit)?
947 } else {
948 Vec::new()
949 };
950
951 let semantic_scores = semantic_hits
952 .iter()
953 .filter(|hit| self.matches_filter(hit.id.as_str(), filter))
954 .map(|hit| (hit.id.as_str().to_string(), hit.score))
955 .collect::<BTreeMap<_, _>>();
956 let lexical_scores = lexical_hits
957 .into_iter()
958 .filter(|hit| self.matches_filter(&hit.id, filter))
959 .map(|hit| (hit.id, hit.score))
960 .collect::<BTreeMap<_, _>>();
961
962 let normalized_semantic = normalize_scores(&semantic_scores);
963 let normalized_lexical = normalize_scores(&lexical_scores);
964
965 let mut merged_ids = normalized_semantic.keys().cloned().collect::<BTreeSet<_>>();
966 merged_ids.extend(normalized_lexical.keys().cloned());
967
968 let mut results = merged_ids
969 .into_iter()
970 .filter_map(|chunk_id| {
971 let chunk = self.chunks.get(&chunk_id)?;
972 let semantic_score = normalized_semantic.get(&chunk_id).copied().unwrap_or(0.0);
973 let lexical_score = normalized_lexical.get(&chunk_id).copied().unwrap_or(0.0);
974 let score = semantic_score * query.hybrid.semantic_weight
975 + lexical_score * query.hybrid.lexical_weight;
976 Some(SearchResult {
977 chunk_id: chunk.chunk_id.clone(),
978 document_id: chunk.document_id.clone(),
979 score,
980 semantic_score,
981 lexical_score,
982 snippet: self.build_snippet(chunk, &normalized_query),
983 metadata: chunk.metadata.clone(),
984 })
985 })
986 .collect::<Vec<_>>();
987
988 results.sort_by(|left, right| {
989 right
990 .score
991 .total_cmp(&left.score)
992 .then_with(|| right.semantic_score.total_cmp(&left.semantic_score))
993 .then_with(|| right.lexical_score.total_cmp(&left.lexical_score))
994 .then_with(|| left.chunk_id.cmp(&right.chunk_id))
995 });
996 if query.hybrid.rerank {
997 rerank_search_results(&query.text, &mut results, query.top_k, context)?;
998 } else {
999 results.truncate(query.top_k);
1000 }
1001 Ok(results)
1002 }
1003
1004 pub fn related_chunks(&self, chunk_id: &str, top_k: usize) -> CoreResult<Vec<SearchResult>> {
1006 if top_k == 0 {
1007 return Err(invalid_argument("search limit must be greater than zero"));
1008 }
1009 self.ensure_non_empty()?;
1010
1011 let query_record = self
1012 .vectors
1013 .records()
1014 .iter()
1015 .find(|record| record.id == chunk_id)
1016 .ok_or_else(|| invalid_argument(format!("chunk `{chunk_id}` was not indexed")))?;
1017 let hits = self
1018 .vectors
1019 .search_filtered(query_record.vector.as_slice(), top_k + 1, None)?;
1020
1021 let mut results = hits
1022 .into_iter()
1023 .filter(|hit| hit.id.as_str() != chunk_id)
1024 .filter_map(|hit| {
1025 let chunk = self.chunks.get(hit.id.as_str())?;
1026 Some(SearchResult {
1027 chunk_id: chunk.chunk_id.clone(),
1028 document_id: chunk.document_id.clone(),
1029 score: hit.score,
1030 semantic_score: hit.score,
1031 lexical_score: 0.0,
1032 snippet: self.build_snippet(chunk, &chunk.text),
1033 metadata: chunk.metadata.clone(),
1034 })
1035 })
1036 .collect::<Vec<_>>();
1037 results.truncate(top_k);
1038 Ok(results)
1039 }
1040
1041 fn insert_chunk_state(&mut self, chunk: DocumentChunk) {
1042 self.document_chunks
1043 .entry(chunk.document_id.clone())
1044 .or_default()
1045 .push(chunk.chunk_id.clone());
1046 self.chunks.insert(chunk.chunk_id.clone(), chunk);
1047 }
1048
1049 fn remove_document(&mut self, document_id: &str) {
1050 if let Some(chunk_ids) = self.document_chunks.remove(document_id) {
1051 for chunk_id in chunk_ids {
1052 self.chunks.remove(&chunk_id);
1053 self.raw_text_by_chunk_id.remove(&chunk_id);
1054 }
1055 }
1056 }
1057
1058 fn rebuild_indices(&mut self) -> CoreResult<()> {
1059 self.rebuild_bm25()?;
1060 if self.chunks.is_empty() {
1061 self.vectors = VectorSearchIndex::new();
1062 return Ok(());
1063 }
1064
1065 let texts = self
1066 .chunks
1067 .values()
1068 .map(|chunk| {
1069 self.raw_text_by_chunk_id
1070 .get(&chunk.chunk_id)
1071 .map(String::as_str)
1072 .unwrap_or(chunk.text.as_str())
1073 })
1074 .collect::<Vec<_>>();
1075 let vectors = self.embedder.embed_batch(&texts)?;
1076 if vectors.len() != self.chunks.len() {
1077 return Err(invalid_argument(
1078 "embedder returned a different number of vectors than input texts",
1079 ));
1080 }
1081 self.validate_embedded_dimensions(vectors.iter().map(|vector| vector.dimensions()))?;
1082
1083 let mut index = VectorSearchIndex::new();
1084 for (chunk, vector) in self.chunks.values().zip(vectors) {
1085 index.add(VectorRecord::with_payload(
1086 chunk.chunk_id.clone(),
1087 vector,
1088 VectorRecordMetadata {
1089 tags: metadata_tags(&chunk.metadata),
1090 metadata: chunk.metadata.clone(),
1091 },
1092 ))?;
1093 }
1094 self.vectors = index;
1095 Ok(())
1096 }
1097
1098 fn rebuild_bm25(&mut self) -> CoreResult<()> {
1099 let mut bm25 = Bm25Corpus::new(self.bm25_options.clone());
1100 for chunk in self.chunks.values() {
1101 bm25.add_document(chunk.chunk_id.clone(), &chunk.text)?;
1102 }
1103 self.bm25 = bm25;
1104 Ok(())
1105 }
1106
1107 fn validate_embedded_dimensions(
1108 &self,
1109 vector_dimensions: impl IntoIterator<Item = usize>,
1110 ) -> CoreResult<()> {
1111 let expected = self.embedder.model_info().dimensions;
1112 if expected == 0 {
1113 return Ok(());
1114 }
1115 if vector_dimensions
1116 .into_iter()
1117 .any(|dimensions| dimensions != expected)
1118 {
1119 return Err(invalid_argument(format!(
1120 "embedder produced vectors that did not match advertised dimensions {expected}"
1121 )));
1122 }
1123 Ok(())
1124 }
1125
1126 fn validate_vector_state(&self) -> CoreResult<()> {
1127 let chunk_ids = self.chunks.keys().cloned().collect::<BTreeSet<_>>();
1128 let vector_ids = self
1129 .vectors
1130 .records()
1131 .iter()
1132 .map(|record| record.id.clone())
1133 .collect::<BTreeSet<_>>();
1134 if chunk_ids != vector_ids {
1135 return Err(invalid_argument(
1136 "persisted vector records and chunk catalog do not describe the same ids",
1137 ));
1138 }
1139 if let Some(dimensions) = self.vectors.dimensions() {
1140 let expected = self.embedder.model_info().dimensions;
1141 if expected > 0 && expected != dimensions {
1142 return Err(invalid_argument(format!(
1143 "embedder dimensions {expected} did not match persisted index dimensions {dimensions}"
1144 )));
1145 }
1146 }
1147 Ok(())
1148 }
1149
1150 fn validate_query_dimensions(&self, dimensions: usize) -> CoreResult<()> {
1151 if let Some(index_dimensions) = self.vectors.dimensions() {
1152 if dimensions != index_dimensions {
1153 return Err(invalid_argument(format!(
1154 "query vector dimensions {dimensions} did not match index dimensions {index_dimensions}"
1155 )));
1156 }
1157 }
1158 Ok(())
1159 }
1160
1161 fn matches_filter(&self, chunk_id: &str, filter: Option<&SearchFilter>) -> bool {
1162 let Some(filter) = filter else {
1163 return true;
1164 };
1165 let Some(chunk) = self.chunks.get(chunk_id) else {
1166 return false;
1167 };
1168 if !filter.document_ids.is_empty() && !filter.document_ids.contains(&chunk.document_id) {
1169 return false;
1170 }
1171 if !filter.metadata_contains.iter().all(|(key, needle)| {
1172 chunk
1173 .metadata
1174 .get(key)
1175 .is_some_and(|value| value.contains(needle))
1176 }) {
1177 return false;
1178 }
1179 let tags = metadata_tags(&chunk.metadata);
1180 filter
1181 .required_tags
1182 .iter()
1183 .all(|tag| tags.iter().any(|candidate| candidate == tag))
1184 && filter
1185 .metadata_equals
1186 .iter()
1187 .all(|(key, value)| chunk.metadata.get(key) == Some(value))
1188 }
1189
1190 fn build_snippet(&self, chunk: &DocumentChunk, normalized_query: &str) -> String {
1191 let text = self
1192 .raw_text_by_chunk_id
1193 .get(&chunk.chunk_id)
1194 .map(String::as_str)
1195 .unwrap_or(chunk.text.as_str());
1196 build_snippet(text, normalized_query)
1197 }
1198
1199 fn ensure_non_empty(&self) -> CoreResult<()> {
1200 if self.chunks.is_empty() {
1201 return Err(invalid_argument("search index is empty"));
1202 }
1203 Ok(())
1204 }
1205}
1206
1207fn validate_document(document: &SearchDocument) -> CoreResult<()> {
1208 if document.id.trim().is_empty() {
1209 return Err(invalid_argument("document id must not be empty"));
1210 }
1211 if document.body.trim().is_empty() {
1212 return Err(invalid_argument("document body must not be empty"));
1213 }
1214 Ok(())
1215}
1216
1217fn validate_ingestion_options(options: &IngestionOptions) -> CoreResult<()> {
1218 if options.chunk_tokens == 0 {
1219 return Err(invalid_argument(
1220 "chunk token size must be greater than zero",
1221 ));
1222 }
1223 if options.chunk_overlap_tokens >= options.chunk_tokens {
1224 return Err(invalid_argument(
1225 "chunk overlap must be smaller than the chunk token size",
1226 ));
1227 }
1228 Ok(())
1229}
1230
1231fn validate_query(query: &SearchQuery) -> CoreResult<()> {
1232 if query.top_k == 0 {
1233 return Err(invalid_argument("search limit must be greater than zero"));
1234 }
1235 if !query.hybrid.semantic_weight.is_finite()
1236 || !query.hybrid.lexical_weight.is_finite()
1237 || query.hybrid.semantic_weight < 0.0
1238 || query.hybrid.lexical_weight < 0.0
1239 {
1240 return Err(invalid_argument(
1241 "hybrid weights must be finite and non-negative",
1242 ));
1243 }
1244 if query.hybrid.semantic_weight + query.hybrid.lexical_weight <= f32::EPSILON {
1245 return Err(invalid_argument("hybrid weights must not both be zero"));
1246 }
1247 if query.hybrid.rerank_window == 0 {
1248 return Err(invalid_argument("rerank window must be greater than zero"));
1249 }
1250 let _ = normalize_query(&query.text, &TextProcessingOptions::default())?;
1251 Ok(())
1252}
1253
1254fn chunk_document(
1255 document: &SearchDocument,
1256 options: &IngestionOptions,
1257 processing: &TextProcessingOptions,
1258) -> CoreResult<Vec<DocumentChunk>> {
1259 let tokens = tokenize(document.body.as_str(), processing)
1260 .into_iter()
1261 .filter(|token| {
1262 matches!(
1263 token.kind,
1264 TokenKind::Word
1265 | TokenKind::Number
1266 | TokenKind::Url
1267 | TokenKind::Email
1268 | TokenKind::Mention
1269 | TokenKind::Hashtag
1270 )
1271 })
1272 .collect::<Vec<_>>();
1273 if tokens.is_empty() {
1274 return Ok(Vec::new());
1275 }
1276
1277 let mut chunks = Vec::new();
1278 let step = options.chunk_tokens - options.chunk_overlap_tokens;
1279 let metadata = document_metadata(document);
1280 let mut start = 0;
1281 let mut ordinal = 0;
1282 while start < tokens.len() {
1283 let end = (start + options.chunk_tokens).min(tokens.len());
1284 let byte_start = tokens[start].span.byte_start;
1285 let byte_end = tokens[end - 1].span.byte_end;
1286 let text = document
1287 .body
1288 .get(byte_start..byte_end)
1289 .ok_or_else(|| invalid_argument("token spans did not align to valid UTF-8 boundaries"))?
1290 .trim()
1291 .to_string();
1292 if !text.is_empty() {
1293 chunks.push(DocumentChunk {
1294 chunk_id: format!("{}:{ordinal}", document.id),
1295 document_id: document.id.clone(),
1296 text,
1297 ordinal,
1298 metadata: metadata.clone(),
1299 source: document.source.clone(),
1300 provenance: document.provenance.clone(),
1301 annotations: document.annotations.clone(),
1302 });
1303 ordinal += 1;
1304 }
1305 if end == tokens.len() {
1306 break;
1307 }
1308 start += step;
1309 }
1310 Ok(chunks)
1311}
1312
1313fn chunk_spans(
1314 document: &SearchDocument,
1315 spans: impl IntoIterator<Item = TextSpan>,
1316) -> CoreResult<Vec<DocumentChunk>> {
1317 let metadata = document_metadata(document);
1318 let mut chunks = Vec::new();
1319 for (ordinal, span) in spans.into_iter().enumerate() {
1320 let text = document
1321 .body
1322 .get(span.byte_start..span.byte_end)
1323 .ok_or_else(|| invalid_argument("chunk span did not align to valid UTF-8 boundaries"))?
1324 .trim()
1325 .to_string();
1326 if text.is_empty() {
1327 continue;
1328 }
1329 chunks.push(DocumentChunk {
1330 chunk_id: format!("{}:{ordinal}", document.id),
1331 document_id: document.id.clone(),
1332 text,
1333 ordinal,
1334 metadata: metadata.clone(),
1335 source: document.source.clone(),
1336 provenance: document.provenance.clone(),
1337 annotations: document.annotations.clone(),
1338 });
1339 }
1340 Ok(chunks)
1341}
1342
1343fn document_metadata(document: &SearchDocument) -> BTreeMap<String, String> {
1344 let mut metadata = document.metadata.clone();
1345 if let Some(title) = &document.title {
1346 metadata
1347 .entry(TITLE_METADATA_KEY.to_string())
1348 .or_insert_with(|| title.clone());
1349 }
1350 metadata
1351}
1352
1353fn normalize_query(query: &str, processing: &TextProcessingOptions) -> CoreResult<String> {
1354 let tokens = tokenize(query, processing)
1355 .into_iter()
1356 .filter(|token| {
1357 matches!(
1358 token.kind,
1359 TokenKind::Word
1360 | TokenKind::Number
1361 | TokenKind::Url
1362 | TokenKind::Email
1363 | TokenKind::Mention
1364 | TokenKind::Hashtag
1365 )
1366 })
1367 .map(|token| token.normalized)
1368 .collect::<Vec<_>>();
1369 if tokens.is_empty() {
1370 return Err(invalid_argument(
1371 "query must contain at least one searchable term",
1372 ));
1373 }
1374 Ok(tokens.join(" "))
1375}
1376
/// Parses the `"tags"` metadata entry into a sorted, de-duplicated tag list.
///
/// Tags may be separated by commas, semicolons or any whitespace. A missing
/// entry yields an empty list.
fn metadata_tags(metadata: &BTreeMap<String, String>) -> Vec<String> {
    let raw = match metadata.get("tags") {
        Some(raw) => raw,
        None => return Vec::new(),
    };
    // The set both removes duplicates and fixes the output order.
    let unique: BTreeSet<&str> = raw
        .split(|ch: char| ch == ',' || ch == ';' || ch.is_whitespace())
        .filter(|tag| !tag.is_empty())
        .collect();
    unique.into_iter().map(str::to_owned).collect()
}
1392
1393fn search_filter_to_vector_filter(filter: &SearchFilter) -> VectorSearchFilter {
1394 VectorSearchFilter {
1395 required_tags: filter.required_tags.clone(),
1396 metadata_equals: filter.metadata_equals.clone(),
1397 }
1398}
1399
1400fn requires_post_filter(filter: &SearchFilter) -> bool {
1401 !filter.document_ids.is_empty() || !filter.metadata_contains.is_empty()
1402}
1403
/// Min-max normalizes scores into the `0.0..=1.0` range.
///
/// When every score is (nearly) identical the spread is treated as zero and
/// each id maps to `1.0`. An empty input yields an empty map.
fn normalize_scores(scores: &BTreeMap<String, f32>) -> BTreeMap<String, f32> {
    if scores.is_empty() {
        return BTreeMap::new();
    }

    let (lowest, highest) = scores.values().fold(
        (f32::INFINITY, f32::NEG_INFINITY),
        |(lowest, highest), &score| (lowest.min(score), highest.max(score)),
    );
    let spread = highest - lowest;
    let is_flat = spread.abs() <= f32::EPSILON;

    let mut normalized = BTreeMap::new();
    for (id, &score) in scores {
        let value = if is_flat {
            1.0
        } else {
            (score - lowest) / spread
        };
        normalized.insert(id.clone(), value);
    }
    normalized
}
1422
1423fn build_snippet(text: &str, normalized_query: &str) -> String {
1424 let terms = tokenize(normalized_query, &TextProcessingOptions::default())
1425 .into_iter()
1426 .map(|token| token.normalized)
1427 .collect::<Vec<_>>();
1428 let text_lower = text.to_lowercase();
1429 let match_start = terms.iter().filter_map(|term| text_lower.find(term)).min();
1430
1431 let snippet = match match_start {
1432 Some(byte_index) => {
1433 let start = text[..byte_index]
1434 .char_indices()
1435 .rev()
1436 .nth(60)
1437 .map(|(index, _)| index)
1438 .unwrap_or(0);
1439 let end = text[byte_index..]
1440 .char_indices()
1441 .nth(160)
1442 .map(|(index, _)| byte_index + index)
1443 .unwrap_or(text.len());
1444 text[start..end].trim()
1445 }
1446 None => text
1447 .char_indices()
1448 .nth(160)
1449 .map(|(index, _)| &text[..index])
1450 .unwrap_or(text)
1451 .trim(),
1452 };
1453 snippet.to_string()
1454}
1455
1456fn invalid_argument(message: impl Into<String>) -> DetectError {
1457 DetectError::InvalidArgument(message.into())
1458}
1459
#[cfg(test)]
mod tests {
    use super::*;
    use std::cell::Cell;
    use text_embeddings::{
        DenseVector, HashedTextEmbedder, TextEmbeddingBackend, TextEmbeddingBackendKind,
        TextEmbeddingConfig, TextEmbeddingMetadata,
    };

    /// Hashed embedder with a small fixed dimensionality, shared by most tests.
    fn embedder() -> HashedTextEmbedder {
        let config = TextEmbeddingConfig {
            dimensions: 32,
            use_idf: true,
        };
        HashedTextEmbedder::new(config, CorpusOptions::default()).unwrap()
    }

    /// Builds an untitled document carrying the given metadata pairs.
    fn document_with_metadata(id: &str, body: &str, pairs: &[(&str, &str)]) -> SearchDocument {
        let mut document = SearchDocument::new(id, body);
        document.metadata = pairs
            .iter()
            .map(|&(key, value)| (key.to_string(), value.to_string()))
            .collect();
        document
    }

    /// Embedder that panics on use once its flag is set, to prove a code path
    /// never asks for an embedding.
    #[derive(Debug)]
    struct FlaggedEmbedder {
        panic_on_embed: Cell<bool>,
    }

    impl FlaggedEmbedder {
        fn new() -> Self {
            let panic_on_embed = Cell::new(false);
            Self { panic_on_embed }
        }
    }

    impl TextEmbeddingBackend for FlaggedEmbedder {
        fn embed_text(&self, text: &str) -> video_analysis_core::Result<DenseVector> {
            assert!(
                !self.panic_on_embed.get(),
                "query embedding should have been skipped"
            );
            // Two components: counts of even and odd bytes, offset by one.
            let count_with_parity = |parity: u8| {
                text.bytes().filter(|byte| byte % 2 == parity).count() as f32 + 1.0
            };
            DenseVector::new([count_with_parity(0), count_with_parity(1)])
        }

        fn metadata(&self) -> TextEmbeddingMetadata {
            TextEmbeddingMetadata {
                backend: TextEmbeddingBackendKind::Custom,
                model_name: Some(String::from("flagged")),
                dimensions: Some(2),
                ..TextEmbeddingMetadata::default()
            }
        }
    }

    /// Reranker that scores any document mentioning "second" highest.
    struct FakeReranker;

    impl TextReranker for FakeReranker {
        fn rerank(
            &mut self,
            _query: &str,
            documents: &[String],
        ) -> video_analysis_core::Result<Vec<f32>> {
            let mut scores = Vec::with_capacity(documents.len());
            for document in documents {
                let score = if document.contains("second") {
                    0.95
                } else {
                    0.1
                };
                scores.push(score);
            }
            Ok(scores)
        }

        fn runtime_backend(&self) -> TextRuntimeBackend {
            TextRuntimeBackend::External
        }
    }

    /// Default hybrid query asking for the top three results.
    fn query(text: &str) -> SearchQuery {
        SearchQuery {
            text: text.to_owned(),
            top_k: 3,
            filter: None,
            hybrid: HybridConfig::default(),
        }
    }

    #[test]
    fn chunking_preserves_order_and_ids() {
        let body = (0..10)
            .map(|index| format!("token{index}"))
            .collect::<Vec<_>>()
            .join(" ");
        let document = SearchDocument::new("doc-1", body);
        let options = IngestionOptions {
            chunk_tokens: 4,
            chunk_overlap_tokens: 1,
            store_raw_text: true,
        };

        let chunks =
            chunk_document(&document, &options, &TextProcessingOptions::default()).unwrap();

        for (position, expected_id) in ["doc-1:0", "doc-1:1"].iter().enumerate() {
            assert_eq!(chunks[position].chunk_id, *expected_id);
            assert_eq!(chunks[position].ordinal, position);
        }
    }

    #[test]
    fn explicit_chunking_can_use_sentences() {
        let document = SearchDocument::new("doc-1", "First sentence. Second sentence.");
        let options = ChunkingOptions {
            strategy: ChunkingStrategy::Sentence,
            ..ChunkingOptions::default()
        };

        let chunks =
            chunk_search_document(&document, &options, &TextProcessingOptions::default())
                .unwrap();

        let texts: Vec<&str> = chunks.iter().map(|chunk| chunk.text.as_str()).collect();
        assert_eq!(texts, ["First sentence.", "Second sentence."]);
    }

    #[test]
    fn rerank_context_uses_backend_scores() {
        let mut reranker = FakeReranker;
        let mut context = RerankExecutionContext {
            reranker: Some(&mut reranker),
            model_id: Some("fake-reranker".to_string()),
        };
        let request = RerankRequest {
            query: "pick second".to_string(),
            documents: vec!["first document".to_string(), "second document".to_string()],
            top_k: 2,
            imported_scores: Vec::new(),
        };

        let response = rerank_documents_with_context(request, &mut context).unwrap();

        let best = &response.results[0];
        assert_eq!(best.index, 1);
        assert_eq!(best.runtime.as_deref(), Some("external"));
        assert_eq!(best.model_id.as_deref(), Some("fake-reranker"));
    }

    #[test]
    fn empty_query_returns_error() {
        let index = RetrievalIndex::new(embedder());
        let err = index.search(&query(" ")).unwrap_err();
        let mentions_query =
            matches!(&err, DetectError::InvalidArgument(message) if message.contains("query"));
        assert!(mentions_query);
    }

    #[test]
    fn search_document_from_text_segment_contract_preserves_metadata() {
        let timebase = video_analysis_core::Timebase::new(1, 1000);
        let mut segment = TextSegmentContract::new(7, "hello retrieval");
        segment.stream_id = Some("subs".to_string());
        segment.language = Some("en".to_string());
        segment.timestamp = Some(video_analysis_core::Timestamp::new(1250, timebase).into());
        segment.duration_seconds = Some(1.25);
        let attributes = [
            ("speaker", "narrator"),
            ("confidence", "0.8"),
            ("source", "fixture.srt"),
        ];
        for &(key, value) in &attributes {
            segment
                .attributes
                .insert(key.to_string(), value.to_string());
        }

        let document = SearchDocument::from_text_segment_contract(&segment);

        assert_eq!(document.id, "subs:7");
        assert_eq!(document.body, "hello retrieval");
        let expected_metadata = [
            ("language", "en"),
            ("speaker", "narrator"),
            ("confidence", "0.8"),
            ("timestamp_seconds", "1.25"),
            ("duration_seconds", "1.25"),
            ("source", "fixture.srt"),
        ];
        for &(key, expected) in &expected_metadata {
            assert_eq!(document.metadata[key], expected);
        }

        let mut chunks = chunk_search_document(
            &document,
            &ChunkingOptions::default(),
            &TextProcessingOptions::default(),
        )
        .unwrap();
        let chunk = chunks.remove(0);
        let as_document = chunk.to_text_document_contract();
        let as_segment = chunk.to_text_segment_contract();
        assert_eq!(as_document.source.unwrap().duration_seconds, Some(1.25));
        assert_eq!(as_segment.duration_seconds, Some(1.25));
        assert_eq!(as_segment.attributes["speaker"], "narrator");
    }

    #[test]
    fn replacing_document_rebuilds_chunk_set() {
        let mut index = RetrievalIndex::new(embedder());
        let options = IngestionOptions {
            chunk_tokens: 3,
            chunk_overlap_tokens: 1,
            store_raw_text: true,
        };

        let first_version = SearchDocument::new("doc", "rust cargo crates docs search");
        index.ingest_documents(&[first_version], &options).unwrap();
        let original_texts: Vec<String> = index
            .chunks
            .values()
            .map(|chunk| chunk.text.clone())
            .collect();

        let second_version = SearchDocument::new("doc", "music playlists and recommendations");
        let report = index.ingest_documents(&[second_version], &options).unwrap();

        assert_eq!(report.documents_replaced, 1);
        assert!(index
            .chunks
            .keys()
            .all(|chunk_id| chunk_id.starts_with("doc:")));
        assert!(index
            .chunks
            .values()
            .all(|chunk| !original_texts.contains(&chunk.text)));
    }

    #[test]
    fn hybrid_search_combines_bm25_and_vectors() {
        let mut rust_doc = document_with_metadata(
            "doc-1",
            "Rust cargo crates enable semantic search over documentation.",
            &[("lang", "en"), ("tags", "docs rust")],
        );
        rust_doc.title = Some("Rust Search".to_string());
        let travel_doc = document_with_metadata(
            "doc-2",
            "Berlin travel plans and restaurant notes.",
            &[("lang", "en")],
        );

        let mut index = RetrievalIndex::new(embedder());
        index
            .ingest_documents(&[rust_doc, travel_doc], &IngestionOptions::default())
            .unwrap();

        let results = index.search(&query("rust documentation search")).unwrap();

        let top = &results[0];
        assert_eq!(top.document_id, "doc-1");
        assert!(top.semantic_score > 0.0);
    }

    #[test]
    fn full_text_retrieval_skips_query_embedding() {
        let mut index = RetrievalIndex::new(FlaggedEmbedder::new());
        let document = SearchDocument::new("doc-1", "rust cargo full text search");
        index
            .ingest_documents(&[document], &IngestionOptions::default())
            .unwrap();
        // Any embedding request from here on aborts the test.
        index.embedder().panic_on_embed.set(true);

        let results = index
            .search(&SearchQuery::full_text("cargo search", 1))
            .unwrap();

        let top = &results[0];
        assert_eq!(top.document_id, "doc-1");
        assert_eq!(top.semantic_score, 0.0);
        assert!(top.lexical_score > 0.0);
    }

    #[test]
    fn semantic_search_does_not_require_lexical_matches() {
        let mut index = RetrievalIndex::new(embedder());
        let document = SearchDocument::new("doc-1", "alpha beta gamma");
        index
            .ingest_documents(&[document], &IngestionOptions::default())
            .unwrap();

        let results = index
            .search(&SearchQuery::semantic("unshared tokens", 1))
            .unwrap();

        let top = &results[0];
        assert_eq!(top.document_id, "doc-1");
        assert_eq!(top.lexical_score, 0.0);
        assert!(top.semantic_score > 0.0);
    }

    #[test]
    fn post_filter_search_expands_candidates_before_filtering() {
        // Forty identical bodies; only the last document carries the target note.
        let docs: Vec<SearchDocument> = (0..40)
            .map(|number| {
                let note = if number == 39 {
                    "contains-special-target"
                } else {
                    "ordinary"
                };
                document_with_metadata(
                    &format!("doc-{number:02}"),
                    "common search text",
                    &[("note", note)],
                )
            })
            .collect();
        let mut index = RetrievalIndex::new(embedder());
        index
            .ingest_documents(&docs, &IngestionOptions::default())
            .unwrap();

        let mut filter = SearchFilter::default();
        filter
            .metadata_contains
            .insert("note".to_string(), "special".to_string());
        filter.document_ids.insert("doc-39".to_string());
        let results = index
            .search(&SearchQuery::full_text("common", 1).filter(filter))
            .unwrap();

        assert_eq!(results.len(), 1);
        assert_eq!(results[0].document_id, "doc-39");
    }

    #[test]
    fn filters_exclude_non_matching_chunks() {
        let english = document_with_metadata(
            "doc-1",
            "English cargo docs",
            &[("lang", "en"), ("tags", "docs")],
        );
        let german = document_with_metadata(
            "doc-2",
            "German cargo docs",
            &[("lang", "de"), ("tags", "docs")],
        );
        let mut index = RetrievalIndex::new(embedder());
        index
            .ingest_documents(&[english, german], &IngestionOptions::default())
            .unwrap();

        let english_docs_only = SearchFilter {
            metadata_equals: BTreeMap::from([("lang".to_string(), "en".to_string())]),
            metadata_contains: BTreeMap::new(),
            required_tags: vec!["docs".to_string()],
            document_ids: BTreeSet::new(),
        };
        let mut request = query("cargo docs");
        request.filter = Some(english_docs_only);

        let results = index.search(&request).unwrap();
        assert_eq!(results.len(), 1);
        assert_eq!(results[0].document_id, "doc-1");
    }

    #[test]
    fn related_chunks_excludes_self() {
        let document = SearchDocument::new(
            "doc-1",
            "rust cargo crates semantic search documentation indexing retrieval",
        );
        let options = IngestionOptions {
            chunk_tokens: 4,
            chunk_overlap_tokens: 1,
            store_raw_text: true,
        };
        let mut index = RetrievalIndex::new(embedder());
        index.ingest_documents(&[document], &options).unwrap();

        let related = index.related_chunks("doc-1:0", 2).unwrap();
        assert!(related.iter().all(|result| result.chunk_id != "doc-1:0"));
    }

    #[test]
    fn from_parts_validates_dimension_mismatch() {
        let chunk = DocumentChunk {
            chunk_id: "doc:0".to_string(),
            document_id: "doc".to_string(),
            text: "rust cargo docs".to_string(),
            ordinal: 0,
            metadata: BTreeMap::new(),
            source: None,
            provenance: Vec::new(),
            annotations: Vec::new(),
        };
        // Two components, while `embedder()` is configured for 32 dimensions.
        let record = SerializableVectorRecord {
            id: "doc:0".into(),
            vector: vec![1.0, 0.0],
            payload: VectorRecordMetadata::default(),
        };

        let err = RetrievalIndex::from_parts(
            embedder(),
            CorpusOptions::default(),
            Bm25Options::default(),
            vec![chunk],
            BTreeMap::new(),
            vec![record],
        )
        .unwrap_err();

        let mentions_dimensions = matches!(
            &err,
            DetectError::InvalidArgument(message) if message.contains("dimensions")
        );
        assert!(mentions_dimensions);
    }
}