#![doc = include_str!("../README.md")]
pub mod surface;
use std::collections::{BTreeMap, BTreeSet};
mod storage;
use serde::{Deserialize, Serialize};
use text_core::{
split_paragraphs, split_sentence_spans, tokenize, TextAnnotationSpan, TextDocument,
TextDocumentContract, TextProcessingOptions, TextProvenance, TextSegmentContract,
TextSourceRef, TextSpan, TokenKind,
};
use text_embeddings::{EmbeddingModelInfo, TextEmbedderBackend};
use text_lexical::{Bm25Corpus, Bm25Options, CorpusOptions, TextCorpus, TextCorpusDocument};
use text_model_runtime::{TextReranker, TextRuntimeBackend};
use vector_analysis_index::{
SerializableVectorRecord, VectorRecord, VectorRecordMetadata, VectorSearchFilter,
VectorSearchIndex,
};
use video_analysis_core::{DetectError, Result as CoreResult};
pub use storage::*;
/// Metadata key under which a document's title is copied into chunk metadata.
const TITLE_METADATA_KEY: &str = "__title";
/// A document submitted for chunking and indexing.
///
/// When deserializing, every field except `id` and `body` falls back to its
/// default value if it is missing from the input.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchDocument {
/// Document identifier; chunk ids are derived from it as `{id}:{ordinal}`.
pub id: String,
/// Optional title; copied into chunk metadata under `__title` when that key is free.
#[serde(default)]
pub title: Option<String>,
/// Text that gets chunked and indexed.
pub body: String,
/// Free-form key/value metadata copied onto every chunk of the document.
#[serde(default)]
pub metadata: BTreeMap<String, String>,
/// Optional reference to where the text came from.
#[serde(default)]
pub source: Option<TextSourceRef>,
/// Provenance records copied onto every chunk.
#[serde(default)]
pub provenance: Vec<TextProvenance>,
/// Annotation spans copied onto every chunk.
#[serde(default)]
pub annotations: Vec<TextAnnotationSpan>,
}
impl SearchDocument {
    /// Builds a bare document from an id and a body.
    ///
    /// The title and source are unset; metadata, provenance and annotations
    /// start out empty.
    pub fn new(id: impl Into<String>, body: impl Into<String>) -> Self {
        let id = id.into();
        let body = body.into();
        Self {
            id,
            title: None,
            body,
            metadata: BTreeMap::new(),
            source: None,
            provenance: Vec::new(),
            annotations: Vec::new(),
        }
    }

    /// Converts a borrowed [`TextDocument`] into an owned search document.
    ///
    /// The language and timestamp, when present, are recorded in metadata
    /// under `language` and `timestamp_seconds`. A timestamp additionally
    /// produces a `text_document` source reference.
    pub fn from_text_document(document: &TextDocument<'_>) -> Self {
        let timestamp = document.timestamp;
        let mut metadata: BTreeMap<String, String> = BTreeMap::new();
        metadata.extend(
            document
                .language
                .map(|language| ("language".to_string(), language.to_string())),
        );
        metadata.extend(timestamp.map(|timestamp| {
            (
                "timestamp_seconds".to_string(),
                timestamp.seconds().to_string(),
            )
        }));
        let source = timestamp.map(|timestamp| TextSourceRef {
            source_id: None,
            source_kind: Some("text_document".to_string()),
            uri: None,
            media_timestamp: Some(timestamp.into()),
            duration_seconds: None,
        });
        Self {
            id: document.id.to_string(),
            title: None,
            body: document.text.to_string(),
            metadata,
            source,
            provenance: Vec::new(),
            annotations: Vec::new(),
        }
    }

    /// Converts a [`TextDocumentContract`] into a search document.
    ///
    /// Metadata starts from the contract's attributes; `language` and
    /// `timestamp_seconds` overwrite same-named attributes when the contract
    /// carries those values.
    pub fn from_text_document_contract(document: &TextDocumentContract) -> Self {
        let mut metadata = document.attributes.clone();
        if let Some(language) = document.language.as_ref() {
            metadata.insert("language".to_string(), language.clone());
        }
        if let Some(seconds) = document.timestamp.map(|timestamp| timestamp.seconds()) {
            metadata.insert("timestamp_seconds".to_string(), seconds.to_string());
        }
        Self {
            id: document.id.clone(),
            title: None,
            body: document.text.clone(),
            metadata,
            source: document.source.clone(),
            provenance: document.provenance.clone(),
            annotations: document.annotations.clone(),
        }
    }

    /// Converts a [`TextSegmentContract`] into a search document.
    ///
    /// The id is the segment's `document_id()` when available, otherwise its
    /// segment index. If the segment has no explicit source but carries a
    /// timestamp or a duration, a `text_segment` source reference is
    /// synthesized from the stream id, timestamp and duration.
    pub fn from_text_segment_contract(segment: &TextSegmentContract) -> Self {
        let mut metadata = segment.attributes.clone();
        if let Some(language) = segment.language.as_ref() {
            metadata.insert("language".to_string(), language.clone());
        }
        if let Some(seconds) = segment.timestamp.map(|timestamp| timestamp.seconds()) {
            metadata.insert("timestamp_seconds".to_string(), seconds.to_string());
        }
        if let Some(duration) = segment.duration_seconds {
            metadata.insert("duration_seconds".to_string(), duration.to_string());
        }

        let id = match segment.document_id() {
            Some(id) => id,
            None => segment.segment_index.to_string(),
        };
        let has_timing = segment.timestamp.is_some() || segment.duration_seconds.is_some();
        let source = match &segment.source {
            Some(existing) => Some(existing.clone()),
            None if has_timing => Some(TextSourceRef {
                source_id: segment.stream_id.clone(),
                source_kind: Some("text_segment".to_string()),
                uri: None,
                media_timestamp: segment.timestamp,
                duration_seconds: segment.duration_seconds,
            }),
            None => None,
        };

        Self {
            id,
            title: None,
            body: segment.text.clone(),
            metadata,
            source,
            provenance: segment.provenance.clone(),
            annotations: segment.annotations.clone(),
        }
    }

    /// Converts a [`TextCorpusDocument`] into a search document.
    ///
    /// Metadata starts from the corpus document's metadata and gains
    /// `language`, `timestamp_seconds` and (from the source reference)
    /// `duration_seconds` whenever those values are present.
    pub fn from_text_corpus_document(document: &TextCorpusDocument) -> Self {
        let mut metadata = document.metadata.clone();
        if let Some(language) = document.language.as_ref() {
            metadata.insert("language".to_string(), language.clone());
        }
        if let Some(seconds) = document.timestamp.map(|timestamp| timestamp.seconds()) {
            metadata.insert("timestamp_seconds".to_string(), seconds.to_string());
        }
        let source_duration = document
            .source
            .as_ref()
            .and_then(|source| source.duration_seconds);
        if let Some(duration) = source_duration {
            metadata.insert("duration_seconds".to_string(), duration.to_string());
        }
        Self {
            id: document.id.clone(),
            title: None,
            body: document.text.clone(),
            metadata,
            source: document.source.clone(),
            provenance: document.provenance.clone(),
            annotations: document.annotations.clone(),
        }
    }

    /// Converts every document of a [`TextCorpus`], preserving corpus order.
    pub fn from_text_corpus(corpus: &TextCorpus) -> Vec<Self> {
        let mut converted = Vec::new();
        for document in corpus.documents.iter() {
            converted.push(Self::from_text_corpus_document(document));
        }
        converted
    }
}
/// Conversion of text-layer values into a [`SearchDocument`].
///
/// Implemented below for owned and borrowed `TextDocument`,
/// `TextDocumentContract` and `TextSegmentContract` values; every impl
/// delegates to the matching `SearchDocument::from_*` constructor.
pub trait IntoSearchDocument {
/// Consumes `self` and returns the equivalent search document.
fn into_search_document(self) -> SearchDocument;
}
// Owned `TextDocument`: delegates to `SearchDocument::from_text_document`.
impl IntoSearchDocument for TextDocument<'_> {
fn into_search_document(self) -> SearchDocument {
SearchDocument::from_text_document(&self)
}
}
// Borrowed `TextDocument`.
impl IntoSearchDocument for &TextDocument<'_> {
fn into_search_document(self) -> SearchDocument {
SearchDocument::from_text_document(self)
}
}
// Owned `TextDocumentContract`: delegates to `SearchDocument::from_text_document_contract`.
impl IntoSearchDocument for TextDocumentContract {
fn into_search_document(self) -> SearchDocument {
SearchDocument::from_text_document_contract(&self)
}
}
// Borrowed `TextDocumentContract`.
impl IntoSearchDocument for &TextDocumentContract {
fn into_search_document(self) -> SearchDocument {
SearchDocument::from_text_document_contract(self)
}
}
// Owned `TextSegmentContract`: delegates to `SearchDocument::from_text_segment_contract`.
impl IntoSearchDocument for TextSegmentContract {
fn into_search_document(self) -> SearchDocument {
SearchDocument::from_text_segment_contract(&self)
}
}
// Borrowed `TextSegmentContract`.
impl IntoSearchDocument for &TextSegmentContract {
fn into_search_document(self) -> SearchDocument {
SearchDocument::from_text_segment_contract(self)
}
}
/// One indexed slice of a [`SearchDocument`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct DocumentChunk {
/// Chunk identifier, built by the chunkers as `{document_id}:{ordinal}`.
pub chunk_id: String,
/// Id of the document this chunk was cut from.
pub document_id: String,
/// Trimmed chunk text.
pub text: String,
/// Ordinal assigned by the chunker that produced this chunk.
pub ordinal: usize,
/// Document metadata, plus the title under `__title` when the document has one.
pub metadata: BTreeMap<String, String>,
/// Source reference copied from the document.
#[serde(default)]
pub source: Option<TextSourceRef>,
/// Provenance records copied from the document.
#[serde(default)]
pub provenance: Vec<TextProvenance>,
/// Annotation spans copied from the document.
#[serde(default)]
pub annotations: Vec<TextAnnotationSpan>,
}
impl DocumentChunk {
    /// Exports the chunk as a [`TextDocumentContract`] whose id is the chunk id.
    ///
    /// The language is read from the `language` metadata entry and the
    /// timestamp from the source reference, when available.
    pub fn to_text_document_contract(&self) -> TextDocumentContract {
        let source = self.source.as_ref();
        let language = self.metadata.get("language").cloned();
        let timestamp = source.and_then(|source| source.media_timestamp);
        TextDocumentContract {
            id: self.chunk_id.clone(),
            text: self.text.clone(),
            language,
            timestamp,
            attributes: self.metadata.clone(),
            source: source.cloned(),
            provenance: self.provenance.clone(),
            annotations: self.annotations.clone(),
        }
    }

    /// Exports the chunk as a finalized [`TextSegmentContract`].
    ///
    /// The document id becomes the stream id and the ordinal becomes the
    /// segment index; timestamp and duration come from the source reference.
    pub fn to_text_segment_contract(&self) -> TextSegmentContract {
        let source = self.source.as_ref();
        let language = self.metadata.get("language").cloned();
        let timestamp = source.and_then(|source| source.media_timestamp);
        let duration_seconds = source.and_then(|source| source.duration_seconds);
        TextSegmentContract {
            stream_id: Some(self.document_id.clone()),
            segment_index: self.ordinal as u64,
            text: self.text.clone(),
            language,
            timestamp,
            duration_seconds,
            is_final: true,
            attributes: self.metadata.clone(),
            source: source.cloned(),
            provenance: self.provenance.clone(),
            annotations: self.annotations.clone(),
        }
    }
}
/// Token-window chunking parameters used during ingestion.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct IngestionOptions {
/// Maximum number of tokens per chunk; must be greater than zero.
pub chunk_tokens: usize,
/// Number of tokens shared by consecutive chunks; must be smaller than `chunk_tokens`.
pub chunk_overlap_tokens: usize,
/// Whether ingestion also keeps each chunk's text in the raw-text map.
pub store_raw_text: bool,
}
/// How a document body is split into chunks.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum ChunkingStrategy {
/// Sliding window over searchable tokens, sized by [`IngestionOptions`].
TokenWindow,
/// One chunk per sentence span.
Sentence,
/// One chunk per paragraph span.
Paragraph,
}
/// Strategy plus window parameters for [`chunk_search_document`].
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct ChunkingOptions {
/// Which splitter to use.
pub strategy: ChunkingStrategy,
/// Window parameters; only read by the `TokenWindow` strategy.
pub ingestion: IngestionOptions,
}
/// Defaults to token-window chunking with the default ingestion options.
impl Default for ChunkingOptions {
fn default() -> Self {
Self {
strategy: ChunkingStrategy::TokenWindow,
ingestion: IngestionOptions::default(),
}
}
}
/// Defaults to 256-token chunks with a 32-token overlap and raw text stored.
impl Default for IngestionOptions {
fn default() -> Self {
Self {
chunk_tokens: 256,
chunk_overlap_tokens: 32,
store_raw_text: true,
}
}
}
pub fn chunk_search_document(
document: &SearchDocument,
options: &ChunkingOptions,
processing: &TextProcessingOptions,
) -> CoreResult<Vec<DocumentChunk>> {
validate_document(document)?;
match options.strategy {
ChunkingStrategy::TokenWindow => chunk_document(document, &options.ingestion, processing),
ChunkingStrategy::Sentence => chunk_spans(
document,
split_sentence_spans(&document.body, processing)
.into_iter()
.map(|sentence| sentence.span),
),
ChunkingStrategy::Paragraph => chunk_spans(
document,
split_paragraphs(&document.body)
.into_iter()
.map(|paragraph| paragraph.span),
),
}
}
/// Restricts which chunks a search may return; all populated constraints must hold.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
pub struct SearchFilter {
/// Each key must be present in the chunk metadata with exactly this value.
pub metadata_equals: BTreeMap<String, String>,
/// Each key must be present in the chunk metadata with a value containing this substring.
pub metadata_contains: BTreeMap<String, String>,
/// Each tag must appear in the chunk's `tags` metadata entry.
pub required_tags: Vec<String>,
/// When non-empty, the chunk's document id must be in this set.
pub document_ids: BTreeSet<String>,
}
/// Which retrievers contribute to a query's score.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum RetrievalMode {
/// Lexical (BM25) scoring only.
FullText,
/// Embedding-based scoring only.
Semantic,
/// Weighted blend of both.
Hybrid,
}
/// Score-blending and reranking parameters of a [`SearchQuery`].
#[derive(Debug, Clone, Copy, PartialEq, Serialize, Deserialize)]
pub struct HybridConfig {
/// Multiplier for the normalized semantic score; values at or below `f32::EPSILON` disable semantic retrieval.
pub semantic_weight: f32,
/// Multiplier for the normalized lexical score; values at or below `f32::EPSILON` disable lexical retrieval.
pub lexical_weight: f32,
/// Lower bound on how many candidates each retriever is asked for; must be greater than zero.
pub rerank_window: usize,
/// Whether the merged results go through the rerank pass.
pub rerank: bool,
}
/// Defaults to 0.8 semantic / 0.2 lexical weighting, a window of 64, and no reranking.
impl Default for HybridConfig {
fn default() -> Self {
Self {
semantic_weight: 0.8,
lexical_weight: 0.2,
rerank_window: 64,
rerank: false,
}
}
}
/// A search request against a `RetrievalIndex`.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchQuery {
/// Raw query text; it is normalized before retrieval.
pub text: String,
/// Maximum number of results to return; must be greater than zero.
pub top_k: usize,
/// Optional restriction on which chunks may match.
pub filter: Option<SearchFilter>,
/// Score weights and rerank settings.
pub hybrid: HybridConfig,
}
impl SearchQuery {
    /// Creates an unfiltered query with the default hybrid configuration.
    pub fn new(text: impl Into<String>, top_k: usize) -> Self {
        Self::hybrid(text, top_k, HybridConfig::default())
    }

    /// Creates a query that scores with the lexical retriever only.
    pub fn full_text(text: impl Into<String>, top_k: usize) -> Self {
        let query = Self::new(text, top_k);
        query.mode(RetrievalMode::FullText)
    }

    /// Creates a query that scores with the semantic retriever only.
    pub fn semantic(text: impl Into<String>, top_k: usize) -> Self {
        let query = Self::new(text, top_k);
        query.mode(RetrievalMode::Semantic)
    }

    /// Creates an unfiltered query with an explicit hybrid configuration.
    pub fn hybrid(text: impl Into<String>, top_k: usize, config: HybridConfig) -> Self {
        let text = text.into();
        Self {
            text,
            top_k,
            filter: None,
            hybrid: config,
        }
    }

    /// Attaches `filter`, replacing any filter set before.
    pub fn filter(mut self, filter: SearchFilter) -> Self {
        self.filter.replace(filter);
        self
    }

    /// Adjusts the weights so the query runs in `mode`.
    ///
    /// `FullText` and `Semantic` force the weights to `0/1` and `1/0`.
    /// `Hybrid` keeps weights that are already active and restores the
    /// default for any weight that is currently at or below `f32::EPSILON`.
    pub fn mode(mut self, mode: RetrievalMode) -> Self {
        let weights = &mut self.hybrid;
        match mode {
            RetrievalMode::FullText => {
                (weights.semantic_weight, weights.lexical_weight) = (0.0, 1.0);
            }
            RetrievalMode::Semantic => {
                (weights.semantic_weight, weights.lexical_weight) = (1.0, 0.0);
            }
            RetrievalMode::Hybrid => {
                let defaults = HybridConfig::default();
                if weights.semantic_weight <= f32::EPSILON {
                    weights.semantic_weight = defaults.semantic_weight;
                }
                if weights.lexical_weight <= f32::EPSILON {
                    weights.lexical_weight = defaults.lexical_weight;
                }
            }
        }
        self
    }

    /// Reports the mode implied by the current weights.
    ///
    /// A weight counts as active when it exceeds `f32::EPSILON`; any
    /// combination other than "exactly one active" is reported as `Hybrid`.
    pub fn retrieval_mode(&self) -> RetrievalMode {
        let uses_semantic = self.hybrid.semantic_weight > f32::EPSILON;
        let uses_lexical = self.hybrid.lexical_weight > f32::EPSILON;
        if uses_lexical && !uses_semantic {
            RetrievalMode::FullText
        } else if uses_semantic && !uses_lexical {
            RetrievalMode::Semantic
        } else {
            RetrievalMode::Hybrid
        }
    }
}
/// Input of a standalone rerank call; field names are camelCase when serialized.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RerankRequest {
/// Query the documents are ranked against; must not be blank.
pub query: String,
/// Candidate documents; must not be empty.
pub documents: Vec<String>,
/// Number of results to keep (at least one is always kept); defaults to 3 when absent from serialized input.
#[serde(default = "default_rerank_top_k")]
pub top_k: usize,
/// Optional per-document scores, matched by position, used when no reranker backend is supplied.
#[serde(default)]
pub imported_scores: Vec<f32>,
}
/// One ranked document of a [`RerankResponse`].
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RerankResult {
/// Position of the document in the request's `documents` list.
pub index: usize,
/// The document text.
pub document: String,
/// Score the ranking was sorted by (higher ranks first).
pub score: f32,
/// Name of the reranker runtime that produced the score, if a backend was used.
#[serde(default)]
pub runtime: Option<String>,
/// Model id copied from the execution context, if any.
#[serde(default)]
pub model_id: Option<String>,
}
/// Output of a rerank call; field names are camelCase when serialized.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
#[serde(rename_all = "camelCase")]
pub struct RerankResponse {
/// Set to `true` by `rerank_documents_with_context` on success.
pub accepted: bool,
/// Operation label; `rerank_documents_with_context` sets it to `"rerank"`.
pub operation: String,
/// The query that was ranked against.
pub query: String,
/// Results sorted by descending score, ties broken by original index.
pub results: Vec<RerankResult>,
/// Name of the reranker runtime, if a backend was used.
#[serde(default)]
pub runtime: Option<String>,
/// Model id copied from the execution context, if any.
#[serde(default)]
pub model_id: Option<String>,
}
/// Optional reranker backend and model label passed to rerank calls.
///
/// The default context has neither, in which case scores come from the
/// request's imported scores or from lexical overlap.
#[derive(Default)]
pub struct RerankExecutionContext<'a> {
/// Backend used to score documents when present.
pub reranker: Option<&'a mut dyn TextReranker>,
/// Model id copied onto the response and onto every result.
pub model_id: Option<String>,
}
/// Reranks `request.documents` without a reranker backend.
///
/// Equivalent to [`rerank_documents_with_context`] with a default context.
pub fn rerank_documents(request: RerankRequest) -> CoreResult<RerankResponse> {
rerank_documents_with_context(request, &mut RerankExecutionContext::default())
}
/// Scores and orders `request.documents` against `request.query`.
///
/// When the context carries a reranker, its scores are used; a document for
/// which the backend returned no score gets `0.0`. Without a reranker each
/// document uses its imported score at the same position, or the lexical
/// overlap with the query when no imported score exists.
///
/// Results are sorted by descending score (ties keep the original order) and
/// truncated to `max(top_k, 1)` entries.
///
/// # Errors
///
/// Returns an invalid-argument error for a blank query or an empty document
/// list, and propagates any error reported by the reranker.
pub fn rerank_documents_with_context(
    request: RerankRequest,
    context: &mut RerankExecutionContext<'_>,
) -> CoreResult<RerankResponse> {
    if request.query.trim().is_empty() {
        return Err(DetectError::InvalidArgument(
            "rerank request must include a non-empty query".to_string(),
        ));
    }
    if request.documents.is_empty() {
        return Err(DetectError::InvalidArgument(
            "rerank request must include at least one document".to_string(),
        ));
    }

    let backend_scores = match context.reranker.as_deref_mut() {
        Some(reranker) => {
            let runtime = rerank_runtime_name(reranker.runtime_backend());
            let scores = reranker.rerank(&request.query, &request.documents)?;
            Some((runtime, scores))
        }
        None => None,
    };

    let mut results = Vec::with_capacity(request.documents.len());
    for (index, document) in request.documents.iter().enumerate() {
        let (score, runtime) = match &backend_scores {
            Some((runtime, scores)) => (
                scores.get(index).copied().unwrap_or(0.0),
                Some(runtime.clone()),
            ),
            None => {
                let score = match request.imported_scores.get(index) {
                    Some(imported) => *imported,
                    None => lexical_overlap_score(&request.query, document),
                };
                (score, None)
            }
        };
        results.push(RerankResult {
            index,
            document: document.clone(),
            score,
            runtime,
            model_id: context.model_id.clone(),
        });
    }

    results.sort_by(|left, right| {
        right
            .score
            .total_cmp(&left.score)
            .then(left.index.cmp(&right.index))
    });
    results.truncate(request.top_k.max(1));

    let runtime = backend_scores.map(|(name, _)| name);
    Ok(RerankResponse {
        accepted: true,
        operation: "rerank".to_string(),
        query: request.query,
        results,
        runtime,
        model_id: context.model_id.clone(),
    })
}
/// Maps a runtime backend to the lowercase label reported in rerank output.
fn rerank_runtime_name(runtime: TextRuntimeBackend) -> String {
    let label: &'static str = match runtime {
        TextRuntimeBackend::Candle => "candle",
        TextRuntimeBackend::Onnx => "onnx",
        TextRuntimeBackend::Tokenizers => "tokenizers",
        TextRuntimeBackend::CudaOxide => "cuda_oxide",
        TextRuntimeBackend::External => "external",
        TextRuntimeBackend::Heuristic => "heuristic",
    };
    String::from(label)
}
/// Returns the fraction of distinct query words that also occur in `document`.
///
/// Both texts are tokenized with default processing options and only word
/// tokens are compared, by normalized form. A query without word tokens
/// scores `0.0`.
fn lexical_overlap_score(query: &str, document: &str) -> f32 {
    let processing = TextProcessingOptions::default();
    // Distinct normalized word tokens of a text.
    let word_terms = |input: &str| {
        let mut words = BTreeSet::new();
        for token in tokenize(input, &processing) {
            if matches!(token.kind, TokenKind::Word) {
                words.insert(token.normalized);
            }
        }
        words
    };

    let wanted = word_terms(query);
    if wanted.is_empty() {
        return 0.0;
    }
    let present = word_terms(document);
    let shared = wanted.intersection(&present).count();
    shared as f32 / wanted.len() as f32
}
/// Reorders `results` in place through the rerank pipeline.
///
/// Each result's snippet is submitted as the document text and its current
/// score as the imported score, so without a reranker backend the existing
/// scores decide the order. Afterwards `results` holds at most
/// `max(top_k, 1)` entries carrying the reranked score, plus
/// `rerank_runtime` / `rerank_model_id` metadata when the rerank step
/// reported them.
///
/// # Errors
///
/// Propagates errors from the rerank step; `results` is left untouched in
/// that case.
fn rerank_search_results(
    query: &str,
    results: &mut Vec<SearchResult>,
    top_k: usize,
    context: &mut RerankExecutionContext<'_>,
) -> CoreResult<()> {
    if results.is_empty() {
        return Ok(());
    }
    let request = RerankRequest {
        query: query.to_string(),
        documents: results
            .iter()
            .map(|result| result.snippet.clone())
            .collect(),
        top_k: top_k.max(1),
        imported_scores: results.iter().map(|result| result.score).collect(),
    };
    let response = rerank_documents_with_context(request, context)?;

    // Move candidates out by index instead of cloning them: every index is
    // reported at most once, and whatever is not selected is dropped.
    let mut candidates = std::mem::take(results)
        .into_iter()
        .map(Some)
        .collect::<Vec<_>>();
    let mut ranked = Vec::with_capacity(response.results.len());
    for reranked in response.results {
        let Some(mut result) = candidates
            .get_mut(reranked.index)
            .and_then(Option::take)
        else {
            continue;
        };
        result.score = reranked.score;
        if let Some(runtime) = reranked.runtime {
            result
                .metadata
                .insert("rerank_runtime".to_string(), runtime);
        }
        if let Some(model_id) = reranked.model_id {
            result
                .metadata
                .insert("rerank_model_id".to_string(), model_id);
        }
        ranked.push(result);
    }
    *results = ranked;
    Ok(())
}
/// Serde default for `RerankRequest::top_k`.
fn default_rerank_top_k() -> usize {
3
}
/// One hit returned by a search.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct SearchResult {
/// Id of the matching chunk.
pub chunk_id: String,
/// Id of the document the chunk belongs to.
pub document_id: String,
/// Final ranking score (weighted blend, or the reranked score).
pub score: f32,
/// Semantic score component; `0.0` when the chunk had no semantic hit.
pub semantic_score: f32,
/// Lexical score component; `0.0` when the chunk had no lexical hit.
pub lexical_score: f32,
/// Excerpt of the chunk text.
pub snippet: String,
/// Chunk metadata; the rerank pass may add `rerank_runtime` / `rerank_model_id`.
pub metadata: BTreeMap<String, String>,
}
/// Summary returned by `RetrievalIndex::ingest_documents`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct IngestReport {
/// Number of documents passed to the call.
pub documents_received: usize,
/// Documents whose id was already indexed and whose old chunks were removed first.
pub documents_replaced: usize,
/// Documents that produced no chunks.
pub documents_skipped: usize,
/// Total number of chunks held by the index after the call.
pub chunks_indexed: usize,
}
/// In-memory index combining a vector index and a BM25 corpus over document chunks.
#[derive(Debug, Clone, PartialEq)]
pub struct RetrievalIndex<B> {
/// Backend used to embed chunk texts and queries.
embedder: B,
/// Corpus options; their `processing` settings drive tokenization of documents and queries.
corpus_options: CorpusOptions,
/// Options used whenever the BM25 corpus is rebuilt.
bm25_options: Bm25Options,
/// Chunk catalog keyed by chunk id.
chunks: BTreeMap<String, DocumentChunk>,
/// Chunk text kept per chunk id when raw-text storage is enabled.
raw_text_by_chunk_id: BTreeMap<String, String>,
/// Chunk ids grouped by document id.
document_chunks: BTreeMap<String, Vec<String>>,
/// Vector index holding one record per chunk.
vectors: VectorSearchIndex,
/// Lexical index over chunk texts.
bm25: Bm25Corpus,
}
impl<B: TextEmbedderBackend> RetrievalIndex<B> {
/// Creates an empty index with default corpus and BM25 options.
pub fn new(embedder: B) -> Self {
Self::with_options(embedder, CorpusOptions::default(), Bm25Options::default())
}
/// Creates an empty index with explicit corpus and BM25 options.
pub fn with_options(
    embedder: B,
    corpus_options: CorpusOptions,
    bm25_options: Bm25Options,
) -> Self {
    // Build the BM25 corpus from a copy so the options can also be stored
    // for later rebuilds.
    let bm25 = Bm25Corpus::new(bm25_options.clone());
    let vectors = VectorSearchIndex::new();
    Self {
        embedder,
        corpus_options,
        bm25_options,
        chunks: BTreeMap::new(),
        raw_text_by_chunk_id: BTreeMap::new(),
        document_chunks: BTreeMap::new(),
        vectors,
        bm25,
    }
}
/// Reassembles an index from previously exported state.
///
/// The chunk catalog and raw texts are restored as given, the vector index
/// is imported from `vector_records`, and the BM25 corpus is rebuilt from
/// the chunk texts.
///
/// # Errors
///
/// Fails when the vector records cannot be imported, when the BM25 rebuild
/// fails, or when the vector records and the chunk catalog disagree on ids
/// or on the embedder's dimensions.
pub fn from_parts(
    embedder: B,
    corpus_options: CorpusOptions,
    bm25_options: Bm25Options,
    chunks: Vec<DocumentChunk>,
    raw_text_by_chunk_id: BTreeMap<String, String>,
    vector_records: Vec<SerializableVectorRecord>,
) -> CoreResult<Self> {
    let mut restored = Self::with_options(embedder, corpus_options, bm25_options);
    chunks
        .into_iter()
        .for_each(|chunk| restored.insert_chunk_state(chunk));
    restored.raw_text_by_chunk_id = raw_text_by_chunk_id;
    restored.vectors = VectorSearchIndex::import_records(vector_records)?;
    restored.rebuild_bm25()?;
    restored.validate_vector_state()?;
    Ok(restored)
}
/// Returns the embedding backend.
pub fn embedder(&self) -> &B {
&self.embedder
}
/// Returns the model information reported by the embedding backend.
pub fn embedder_info(&self) -> EmbeddingModelInfo {
self.embedder.model_info()
}
/// Returns the corpus options the index was created with.
pub fn corpus_options(&self) -> &CorpusOptions {
&self.corpus_options
}
/// Returns the BM25 options the index was created with.
pub fn bm25_options(&self) -> &Bm25Options {
&self.bm25_options
}
/// Returns all chunks, ordered by chunk id.
pub fn chunks(&self) -> Vec<&DocumentChunk> {
self.chunks.values().collect()
}
/// Iterates over all chunks, ordered by chunk id, without allocating.
pub fn chunks_iter(&self) -> impl Iterator<Item = &DocumentChunk> {
self.chunks.values()
}
/// Looks up a chunk by id.
pub fn chunk(&self, chunk_id: &str) -> Option<&DocumentChunk> {
self.chunks.get(chunk_id)
}
/// Returns the stored raw text of a chunk, if raw text was kept for it.
pub fn raw_text(&self, chunk_id: &str) -> Option<&str> {
self.raw_text_by_chunk_id.get(chunk_id).map(String::as_str)
}
/// Exports the vector index records in serializable form.
pub fn vector_records(&self) -> Vec<SerializableVectorRecord> {
self.vectors.export_records()
}
/// Chunks `docs`, stores the chunks, and rebuilds the lexical and vector indices.
///
/// A document whose id is already indexed replaces its previous chunks. A
/// document that yields no chunks is counted as skipped. When
/// `options.store_raw_text` is set, each chunk's text is also kept in the
/// raw-text map.
///
/// All documents are validated and chunked before any index state is
/// touched, so an invalid document rejects the whole batch without leaving
/// earlier documents half-applied.
///
/// # Errors
///
/// Returns an invalid-argument error for invalid options or documents, and
/// propagates chunking, embedding and index errors. A failure while
/// rebuilding the indices happens after the chunk catalog was updated.
pub fn ingest_documents(
    &mut self,
    docs: &[SearchDocument],
    options: &IngestionOptions,
) -> CoreResult<IngestReport> {
    validate_ingestion_options(options)?;

    // Phase 1: fallible preparation only, no mutation.
    let mut prepared = Vec::with_capacity(docs.len());
    for document in docs {
        validate_document(document)?;
        let chunks = chunk_document(document, options, &self.corpus_options.processing)?;
        prepared.push((document, chunks));
    }

    // Phase 2: apply the batch to the chunk catalog.
    let mut replaced = 0;
    let mut skipped = 0;
    for (document, chunks) in prepared {
        if self.document_chunks.contains_key(&document.id) {
            self.remove_document(&document.id);
            replaced += 1;
        }
        if chunks.is_empty() {
            skipped += 1;
            continue;
        }
        for chunk in chunks {
            if options.store_raw_text {
                self.raw_text_by_chunk_id
                    .insert(chunk.chunk_id.clone(), chunk.text.clone());
            }
            self.insert_chunk_state(chunk);
        }
    }

    self.rebuild_indices()?;
    Ok(IngestReport {
        documents_received: docs.len(),
        documents_replaced: replaced,
        documents_skipped: skipped,
        chunks_indexed: self.chunks.len(),
    })
}
/// Runs `query` without a reranker backend.
///
/// Equivalent to `search_with_context` with a default rerank context.
pub fn search(&self, query: &SearchQuery) -> CoreResult<Vec<SearchResult>> {
self.search_with_context(query, &mut RerankExecutionContext::default())
}
pub fn search_with_context(
&self,
query: &SearchQuery,
context: &mut RerankExecutionContext<'_>,
) -> CoreResult<Vec<SearchResult>> {
validate_query(query)?;
self.ensure_non_empty()?;
let normalized_query = normalize_query(&query.text, &self.corpus_options.processing)?;
let filter = query.filter.as_ref();
let overfetch = query
.hybrid
.rerank_window
.max(query.top_k)
.max(query.top_k * 4);
let post_filter_limit = if filter.is_some_and(requires_post_filter) {
self.chunks.len()
} else {
overfetch
};
let semantic_hits = if query.hybrid.semantic_weight > f32::EPSILON {
let query_vector = self.embedder.embed_text(&normalized_query)?;
self.validate_query_dimensions(query_vector.dimensions())?;
self.vectors.search_filtered(
query_vector.as_slice(),
post_filter_limit,
filter.map(search_filter_to_vector_filter).as_ref(),
)?
} else {
Vec::new()
};
let lexical_hits = if query.hybrid.lexical_weight > f32::EPSILON {
self.bm25.search(&normalized_query, post_filter_limit)?
} else {
Vec::new()
};
let semantic_scores = semantic_hits
.iter()
.filter(|hit| self.matches_filter(hit.id.as_str(), filter))
.map(|hit| (hit.id.as_str().to_string(), hit.score))
.collect::<BTreeMap<_, _>>();
let lexical_scores = lexical_hits
.into_iter()
.filter(|hit| self.matches_filter(&hit.id, filter))
.map(|hit| (hit.id, hit.score))
.collect::<BTreeMap<_, _>>();
let normalized_semantic = normalize_scores(&semantic_scores);
let normalized_lexical = normalize_scores(&lexical_scores);
let mut merged_ids = normalized_semantic.keys().cloned().collect::<BTreeSet<_>>();
merged_ids.extend(normalized_lexical.keys().cloned());
let mut results = merged_ids
.into_iter()
.filter_map(|chunk_id| {
let chunk = self.chunks.get(&chunk_id)?;
let semantic_score = normalized_semantic.get(&chunk_id).copied().unwrap_or(0.0);
let lexical_score = normalized_lexical.get(&chunk_id).copied().unwrap_or(0.0);
let score = semantic_score * query.hybrid.semantic_weight
+ lexical_score * query.hybrid.lexical_weight;
Some(SearchResult {
chunk_id: chunk.chunk_id.clone(),
document_id: chunk.document_id.clone(),
score,
semantic_score,
lexical_score,
snippet: self.build_snippet(chunk, &normalized_query),
metadata: chunk.metadata.clone(),
})
})
.collect::<Vec<_>>();
results.sort_by(|left, right| {
right
.score
.total_cmp(&left.score)
.then_with(|| right.semantic_score.total_cmp(&left.semantic_score))
.then_with(|| right.lexical_score.total_cmp(&left.lexical_score))
.then_with(|| left.chunk_id.cmp(&right.chunk_id))
});
if query.hybrid.rerank {
rerank_search_results(&query.text, &mut results, query.top_k, context)?;
} else {
results.truncate(query.top_k);
}
Ok(results)
}
/// Returns up to `top_k` chunks whose vectors are closest to the given chunk.
///
/// The chunk's own stored vector is used as the query and the chunk itself
/// is excluded from the results. Scores are the raw vector-search scores;
/// the lexical score is always `0.0`.
///
/// # Errors
///
/// Returns an invalid-argument error when `top_k` is zero, the index is
/// empty, or `chunk_id` has no vector record; propagates vector-search
/// errors.
pub fn related_chunks(&self, chunk_id: &str, top_k: usize) -> CoreResult<Vec<SearchResult>> {
    if top_k == 0 {
        return Err(invalid_argument("search limit must be greater than zero"));
    }
    self.ensure_non_empty()?;
    let query_record = self
        .vectors
        .records()
        .iter()
        .find(|record| record.id == chunk_id)
        .ok_or_else(|| invalid_argument(format!("chunk `{chunk_id}` was not indexed")))?;
    // Ask for one extra hit because the query chunk matches itself;
    // saturate so `usize::MAX` cannot overflow.
    let hits = self.vectors.search_filtered(
        query_record.vector.as_slice(),
        top_k.saturating_add(1),
        None,
    )?;
    let mut results = hits
        .into_iter()
        .filter(|hit| hit.id.as_str() != chunk_id)
        .filter_map(|hit| {
            let chunk = self.chunks.get(hit.id.as_str())?;
            Some(SearchResult {
                chunk_id: chunk.chunk_id.clone(),
                document_id: chunk.document_id.clone(),
                score: hit.score,
                semantic_score: hit.score,
                lexical_score: 0.0,
                snippet: self.build_snippet(chunk, &chunk.text),
                metadata: chunk.metadata.clone(),
            })
        })
        .collect::<Vec<_>>();
    results.truncate(top_k);
    Ok(results)
}
/// Registers `chunk` in the chunk catalog and in its document's chunk list.
///
/// Indices are not touched; callers rebuild them afterwards.
fn insert_chunk_state(&mut self, chunk: DocumentChunk) {
    let chunk_id = chunk.chunk_id.clone();
    let siblings = self
        .document_chunks
        .entry(chunk.document_id.clone())
        .or_default();
    siblings.push(chunk_id.clone());
    self.chunks.insert(chunk_id, chunk);
}
/// Drops every chunk (and stored raw text) that belongs to `document_id`.
///
/// Does nothing when the document is unknown. Indices are not touched.
fn remove_document(&mut self, document_id: &str) {
    let Some(chunk_ids) = self.document_chunks.remove(document_id) else {
        return;
    };
    for chunk_id in &chunk_ids {
        self.chunks.remove(chunk_id);
        self.raw_text_by_chunk_id.remove(chunk_id);
    }
}
/// Rebuilds the BM25 corpus and the vector index from the chunk catalog.
///
/// Every chunk is re-embedded in one batch, using its stored raw text when
/// available and the chunk text otherwise. The vector index is replaced only
/// after all records were added successfully.
///
/// # Errors
///
/// Propagates BM25, embedding and vector-index errors, and returns an
/// invalid-argument error when the embedder returns the wrong number of
/// vectors or vectors that contradict its advertised dimensions.
fn rebuild_indices(&mut self) -> CoreResult<()> {
    self.rebuild_bm25()?;
    if self.chunks.is_empty() {
        self.vectors = VectorSearchIndex::new();
        return Ok(());
    }

    let mut texts = Vec::with_capacity(self.chunks.len());
    for chunk in self.chunks.values() {
        let text = match self.raw_text_by_chunk_id.get(&chunk.chunk_id) {
            Some(raw) => raw.as_str(),
            None => chunk.text.as_str(),
        };
        texts.push(text);
    }

    let vectors = self.embedder.embed_batch(&texts)?;
    if vectors.len() != self.chunks.len() {
        return Err(invalid_argument(
            "embedder returned a different number of vectors than input texts",
        ));
    }
    self.validate_embedded_dimensions(vectors.iter().map(|vector| vector.dimensions()))?;

    // Texts were collected in catalog order, so zipping pairs each chunk
    // with its own vector.
    let mut rebuilt = VectorSearchIndex::new();
    for (chunk, vector) in self.chunks.values().zip(vectors) {
        let payload = VectorRecordMetadata {
            tags: metadata_tags(&chunk.metadata),
            metadata: chunk.metadata.clone(),
        };
        rebuilt.add(VectorRecord::with_payload(
            chunk.chunk_id.clone(),
            vector,
            payload,
        ))?;
    }
    self.vectors = rebuilt;
    Ok(())
}
/// Rebuilds the BM25 corpus from the chunk texts, keyed by chunk id.
///
/// The existing corpus is replaced only after every chunk was added
/// successfully.
fn rebuild_bm25(&mut self) -> CoreResult<()> {
let mut bm25 = Bm25Corpus::new(self.bm25_options.clone());
for chunk in self.chunks.values() {
bm25.add_document(chunk.chunk_id.clone(), &chunk.text)?;
}
self.bm25 = bm25;
Ok(())
}
/// Checks freshly embedded vectors against the embedder's advertised dimensions.
///
/// An advertised dimension of zero disables the check.
fn validate_embedded_dimensions(
    &self,
    vector_dimensions: impl IntoIterator<Item = usize>,
) -> CoreResult<()> {
    let expected = self.embedder.model_info().dimensions;
    if expected == 0 {
        return Ok(());
    }
    let all_match = vector_dimensions
        .into_iter()
        .all(|dimensions| dimensions == expected);
    if all_match {
        Ok(())
    } else {
        Err(invalid_argument(format!(
            "embedder produced vectors that did not match advertised dimensions {expected}"
        )))
    }
}
/// Verifies that restored vector records line up with the chunk catalog.
///
/// The two must describe exactly the same ids, and the index dimensions
/// must equal the embedder's advertised dimensions unless the embedder
/// advertises zero.
fn validate_vector_state(&self) -> CoreResult<()> {
    let catalog_ids = self
        .chunks
        .keys()
        .map(String::as_str)
        .collect::<BTreeSet<_>>();
    let record_ids = self
        .vectors
        .records()
        .iter()
        .map(|record| record.id.as_str())
        .collect::<BTreeSet<_>>();
    if catalog_ids != record_ids {
        return Err(invalid_argument(
            "persisted vector records and chunk catalog do not describe the same ids",
        ));
    }

    let expected = self.embedder.model_info().dimensions;
    match self.vectors.dimensions() {
        Some(dimensions) if expected > 0 && expected != dimensions => {
            Err(invalid_argument(format!(
                "embedder dimensions {expected} did not match persisted index dimensions {dimensions}"
            )))
        }
        _ => Ok(()),
    }
}
/// Rejects a query vector whose dimensions differ from the index dimensions.
///
/// Passes when the vector index reports no dimensions.
fn validate_query_dimensions(&self, dimensions: usize) -> CoreResult<()> {
    match self.vectors.dimensions() {
        Some(index_dimensions) if dimensions != index_dimensions => {
            Err(invalid_argument(format!(
                "query vector dimensions {dimensions} did not match index dimensions {index_dimensions}"
            )))
        }
        _ => Ok(()),
    }
}
/// Tells whether the chunk with `chunk_id` satisfies `filter`.
///
/// No filter accepts everything; an unknown chunk id never matches. All
/// populated constraints (document ids, substring metadata, tags, exact
/// metadata) must hold.
fn matches_filter(&self, chunk_id: &str, filter: Option<&SearchFilter>) -> bool {
    let Some(filter) = filter else {
        return true;
    };
    let Some(chunk) = self.chunks.get(chunk_id) else {
        return false;
    };

    let document_allowed =
        filter.document_ids.is_empty() || filter.document_ids.contains(&chunk.document_id);
    if !document_allowed {
        return false;
    }

    for (key, needle) in &filter.metadata_contains {
        match chunk.metadata.get(key) {
            Some(value) if value.contains(needle.as_str()) => {}
            _ => return false,
        }
    }

    let tags = metadata_tags(&chunk.metadata);
    if !filter.required_tags.iter().all(|tag| tags.contains(tag)) {
        return false;
    }

    filter
        .metadata_equals
        .iter()
        .all(|(key, value)| chunk.metadata.get(key) == Some(value))
}
/// Builds a snippet for `chunk`, preferring its stored raw text over the chunk text.
fn build_snippet(&self, chunk: &DocumentChunk, normalized_query: &str) -> String {
let text = self
.raw_text_by_chunk_id
.get(&chunk.chunk_id)
.map(String::as_str)
.unwrap_or(chunk.text.as_str());
build_snippet(text, normalized_query)
}
/// Fails with an invalid-argument error when the index holds no chunks.
fn ensure_non_empty(&self) -> CoreResult<()> {
    if self.chunks.is_empty() {
        Err(invalid_argument("search index is empty"))
    } else {
        Ok(())
    }
}
}
/// Rejects documents whose id or body is empty or whitespace-only.
fn validate_document(document: &SearchDocument) -> CoreResult<()> {
    let is_blank = |value: &str| value.trim().is_empty();
    if is_blank(&document.id) {
        Err(invalid_argument("document id must not be empty"))
    } else if is_blank(&document.body) {
        Err(invalid_argument("document body must not be empty"))
    } else {
        Ok(())
    }
}
fn validate_ingestion_options(options: &IngestionOptions) -> CoreResult<()> {
if options.chunk_tokens == 0 {
return Err(invalid_argument(
"chunk token size must be greater than zero",
));
}
if options.chunk_overlap_tokens >= options.chunk_tokens {
return Err(invalid_argument(
"chunk overlap must be smaller than the chunk token size",
));
}
Ok(())
}
fn validate_query(query: &SearchQuery) -> CoreResult<()> {
if query.top_k == 0 {
return Err(invalid_argument("search limit must be greater than zero"));
}
if !query.hybrid.semantic_weight.is_finite()
|| !query.hybrid.lexical_weight.is_finite()
|| query.hybrid.semantic_weight < 0.0
|| query.hybrid.lexical_weight < 0.0
{
return Err(invalid_argument(
"hybrid weights must be finite and non-negative",
));
}
if query.hybrid.semantic_weight + query.hybrid.lexical_weight <= f32::EPSILON {
return Err(invalid_argument("hybrid weights must not both be zero"));
}
if query.hybrid.rerank_window == 0 {
return Err(invalid_argument("rerank window must be greater than zero"));
}
let _ = normalize_query(&query.text, &TextProcessingOptions::default())?;
Ok(())
}
/// Cuts the document body into overlapping windows of searchable tokens.
///
/// Only word, number, URL, email, mention and hashtag tokens are counted.
/// Each window covers up to `options.chunk_tokens` tokens and starts
/// `chunk_tokens - chunk_overlap_tokens` tokens after the previous one. The
/// chunk text is the trimmed slice of the body from the first token's start
/// to the last token's end. A body without searchable tokens yields no
/// chunks.
///
/// Callers are expected to pass validated options.
///
/// # Errors
///
/// Returns an invalid-argument error when token spans do not fall on UTF-8
/// boundaries of the body.
fn chunk_document(
    document: &SearchDocument,
    options: &IngestionOptions,
    processing: &TextProcessingOptions,
) -> CoreResult<Vec<DocumentChunk>> {
    let tokens = tokenize(document.body.as_str(), processing)
        .into_iter()
        .filter(|token| {
            matches!(
                token.kind,
                TokenKind::Word
                    | TokenKind::Number
                    | TokenKind::Url
                    | TokenKind::Email
                    | TokenKind::Mention
                    | TokenKind::Hashtag
            )
        })
        .collect::<Vec<_>>();
    let total = tokens.len();
    if total == 0 {
        return Ok(Vec::new());
    }

    let stride = options.chunk_tokens - options.chunk_overlap_tokens;
    let metadata = document_metadata(document);
    let mut chunks = Vec::new();
    let mut window_start = 0;
    let mut next_ordinal = 0;
    while window_start < total {
        let window_end = total.min(window_start + options.chunk_tokens);
        let first = &tokens[window_start];
        let last = &tokens[window_end - 1];
        let Some(slice) = document
            .body
            .get(first.span.byte_start..last.span.byte_end)
        else {
            return Err(invalid_argument(
                "token spans did not align to valid UTF-8 boundaries",
            ));
        };
        let text = slice.trim();
        if !text.is_empty() {
            chunks.push(DocumentChunk {
                chunk_id: format!("{}:{}", document.id, next_ordinal),
                document_id: document.id.clone(),
                text: text.to_string(),
                ordinal: next_ordinal,
                metadata: metadata.clone(),
                source: document.source.clone(),
                provenance: document.provenance.clone(),
                annotations: document.annotations.clone(),
            });
            next_ordinal += 1;
        }
        // The last window already reached the final token.
        if window_end == total {
            break;
        }
        window_start += stride;
    }
    Ok(chunks)
}
/// Turns each span of the document body into one chunk.
///
/// The chunk ordinal is the span's position in `spans`; spans whose trimmed
/// text is empty are dropped, so ordinals may have gaps.
///
/// # Errors
///
/// Returns an invalid-argument error when a span does not fall on UTF-8
/// boundaries of the body.
fn chunk_spans(
    document: &SearchDocument,
    spans: impl IntoIterator<Item = TextSpan>,
) -> CoreResult<Vec<DocumentChunk>> {
    let metadata = document_metadata(document);
    let mut chunks = Vec::new();
    for (ordinal, span) in spans.into_iter().enumerate() {
        let Some(slice) = document.body.get(span.byte_start..span.byte_end) else {
            return Err(invalid_argument(
                "chunk span did not align to valid UTF-8 boundaries",
            ));
        };
        let text = slice.trim();
        if !text.is_empty() {
            chunks.push(DocumentChunk {
                chunk_id: format!("{}:{}", document.id, ordinal),
                document_id: document.id.clone(),
                text: text.to_string(),
                ordinal,
                metadata: metadata.clone(),
                source: document.source.clone(),
                provenance: document.provenance.clone(),
                annotations: document.annotations.clone(),
            });
        }
    }
    Ok(chunks)
}
/// Returns the document metadata, adding the title under `__title`.
///
/// An existing `__title` entry is kept as is.
fn document_metadata(document: &SearchDocument) -> BTreeMap<String, String> {
    let mut merged = document.metadata.clone();
    match document.title.as_ref() {
        Some(title) if !merged.contains_key(TITLE_METADATA_KEY) => {
            merged.insert(TITLE_METADATA_KEY.to_string(), title.clone());
        }
        _ => {}
    }
    merged
}
/// Reduces a query to its normalized searchable terms joined by single spaces.
///
/// Only word, number, URL, email, mention and hashtag tokens are kept.
///
/// # Errors
///
/// Returns an invalid-argument error when no searchable term remains.
fn normalize_query(query: &str, processing: &TextProcessingOptions) -> CoreResult<String> {
    let mut terms = Vec::new();
    for token in tokenize(query, processing) {
        let searchable = matches!(
            token.kind,
            TokenKind::Word
                | TokenKind::Number
                | TokenKind::Url
                | TokenKind::Email
                | TokenKind::Mention
                | TokenKind::Hashtag
        );
        if searchable {
            terms.push(token.normalized);
        }
    }
    if terms.is_empty() {
        Err(invalid_argument(
            "query must contain at least one searchable term",
        ))
    } else {
        Ok(terms.join(" "))
    }
}
/// Extracts the distinct tags stored under the `tags` metadata key.
///
/// Tags are separated by commas, semicolons or whitespace. The result is
/// sorted and free of duplicates; it is empty when the key is missing.
fn metadata_tags(metadata: &BTreeMap<String, String>) -> Vec<String> {
    let Some(raw) = metadata.get("tags") else {
        return Vec::new();
    };
    let mut unique = BTreeSet::new();
    let is_separator = |c: char| c == ',' || c == ';' || c.is_whitespace();
    for tag in raw.split(is_separator) {
        if !tag.is_empty() {
            unique.insert(tag.to_string());
        }
    }
    unique.into_iter().collect()
}
/// Builds the vector-index filter from the tag and exact-match metadata constraints.
///
/// `document_ids` and `metadata_contains` are not carried over; they are
/// checked by `matches_filter` after retrieval.
fn search_filter_to_vector_filter(filter: &SearchFilter) -> VectorSearchFilter {
VectorSearchFilter {
required_tags: filter.required_tags.clone(),
metadata_equals: filter.metadata_equals.clone(),
}
}
/// Tells whether `filter` has constraints that the vector filter does not carry.
///
/// That is the case when document ids or substring metadata constraints are set.
fn requires_post_filter(filter: &SearchFilter) -> bool {
    let nothing_to_post_filter =
        filter.document_ids.is_empty() && filter.metadata_contains.is_empty();
    !nothing_to_post_filter
}
/// Min-max normalizes scores into the `0.0..=1.0` range.
///
/// When all scores are (effectively) equal, every id receives `1.0`. An
/// empty input yields an empty map.
fn normalize_scores(scores: &BTreeMap<String, f32>) -> BTreeMap<String, f32> {
    let mut normalized = BTreeMap::new();
    if scores.is_empty() {
        return normalized;
    }
    let (min, max) = scores.values().copied().fold(
        (f32::INFINITY, f32::NEG_INFINITY),
        |(lowest, highest), score| (lowest.min(score), highest.max(score)),
    );
    let range = max - min;
    let is_flat = range.abs() <= f32::EPSILON;
    for (id, score) in scores {
        let value = if is_flat { 1.0 } else { (score - min) / range };
        normalized.insert(id.clone(), value);
    }
    normalized
}
/// Extracts a short excerpt of `text` around the earliest occurrence of a query term.
///
/// `normalized_query` is tokenized with default processing options and every normalized term
/// is looked up in a lowercased copy of `text`. When a term is found, the excerpt runs from up
/// to 61 characters before the earliest match to 160 characters past the start of the match.
/// Without a match, the excerpt is the first 160 characters of `text`. Surrounding whitespace
/// is trimmed in both cases.
fn build_snippet(text: &str, normalized_query: &str) -> String {
    // Translates a byte offset into `text.to_lowercase()` back into the byte offset of the
    // originating character in `text`. Lowercasing can change a character's UTF-8 width
    // (e.g. U+212A KELVIN SIGN shrinks from 3 bytes to 1, U+0130 grows from 2 bytes to 3), so
    // an offset found in the lowercased copy is not a valid offset into the original and may
    // not even fall on a character boundary. The per-character widths used here match those
    // of `str::to_lowercase`, whose only context-sensitive case (final sigma) keeps the same
    // encoded length.
    fn source_offset(text: &str, lowered_index: usize) -> usize {
        let mut lowered_end = 0;
        for (offset, ch) in text.char_indices() {
            lowered_end += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
            if lowered_index < lowered_end {
                return offset;
            }
        }
        text.len()
    }

    let terms = tokenize(normalized_query, &TextProcessingOptions::default())
        .into_iter()
        .map(|token| token.normalized)
        .collect::<Vec<_>>();
    let text_lower = text.to_lowercase();
    // The mapping is monotonic, so converting the earliest lowercase offset gives the
    // earliest offset in the original text as well.
    let match_start = terms
        .iter()
        .filter_map(|term| text_lower.find(term))
        .min()
        .map(|lowered_index| source_offset(text, lowered_index));
    let snippet = match match_start {
        Some(byte_index) => {
            // Walk backwards over the prefix to find where the leading context begins.
            let start = text[..byte_index]
                .char_indices()
                .rev()
                .nth(60)
                .map(|(index, _)| index)
                .unwrap_or(0);
            // Walk forwards from the match to find where the excerpt ends.
            let end = text[byte_index..]
                .char_indices()
                .nth(160)
                .map(|(index, _)| byte_index + index)
                .unwrap_or(text.len());
            text[start..end].trim()
        }
        None => text
            .char_indices()
            .nth(160)
            .map(|(index, _)| &text[..index])
            .unwrap_or(text)
            .trim(),
    };
    snippet.to_string()
}
/// Wraps `message` in a `DetectError::InvalidArgument`.
fn invalid_argument(message: impl Into<String>) -> DetectError {
    let message: String = message.into();
    DetectError::InvalidArgument(message)
}
#[cfg(test)]
mod tests {
use super::*;
use std::cell::Cell;
use text_embeddings::{
DenseVector, HashedTextEmbedder, TextEmbeddingBackend, TextEmbeddingBackendKind,
TextEmbeddingConfig, TextEmbeddingMetadata,
};
/// Builds the hashed embedder shared by the tests: 32 dimensions, IDF weighting enabled,
/// default corpus options.
fn embedder() -> HashedTextEmbedder {
    let config = TextEmbeddingConfig {
        dimensions: 32,
        use_idf: true,
    };
    let corpus_options = CorpusOptions::default();
    HashedTextEmbedder::new(config, corpus_options).unwrap()
}
/// Test embedder carrying a switch that, once set, makes any embedding attempt panic.
#[derive(Debug)]
struct FlaggedEmbedder {
    /// Interior-mutable so a test can arm it through a shared reference.
    panic_on_embed: Cell<bool>,
}

impl FlaggedEmbedder {
    /// Creates an embedder with the panic switch off.
    fn new() -> Self {
        let panic_on_embed = Cell::new(false);
        Self { panic_on_embed }
    }
}
// Embeds text as a two-component vector (even-byte count + 1, odd-byte count + 1), or panics
// when the flag is set so a test can prove that no embedding was requested.
impl TextEmbeddingBackend for FlaggedEmbedder {
    fn embed_text(&self, text: &str) -> video_analysis_core::Result<DenseVector> {
        if self.panic_on_embed.get() {
            panic!("query embedding should have been skipped");
        }
        let even_bytes = text.bytes().filter(|byte| byte % 2 == 0).count();
        // Every byte is either even or odd, so the odd count is the remainder.
        let odd_bytes = text.len() - even_bytes;
        DenseVector::new([even_bytes as f32 + 1.0, odd_bytes as f32 + 1.0])
    }

    fn metadata(&self) -> TextEmbeddingMetadata {
        let defaults = TextEmbeddingMetadata::default();
        TextEmbeddingMetadata {
            backend: TextEmbeddingBackendKind::Custom,
            model_name: Some(String::from("flagged")),
            dimensions: Some(2),
            ..defaults
        }
    }
}
/// Reranker stub: documents containing "second" score 0.95, all others 0.1.
struct FakeReranker;

impl TextReranker for FakeReranker {
    fn rerank(
        &mut self,
        _query: &str,
        documents: &[String],
    ) -> video_analysis_core::Result<Vec<f32>> {
        let mut scores = Vec::with_capacity(documents.len());
        for document in documents {
            let score = if document.contains("second") {
                0.95
            } else {
                0.1
            };
            scores.push(score);
        }
        Ok(scores)
    }

    fn runtime_backend(&self) -> TextRuntimeBackend {
        TextRuntimeBackend::External
    }
}
/// Builds an unfiltered query for `text` with `top_k` of 3 and the default hybrid config.
fn query(text: &str) -> SearchQuery {
    let text = text.to_string();
    SearchQuery {
        text,
        top_k: 3,
        filter: None,
        hybrid: HybridConfig::default(),
    }
}
// Chunk ids are `<document id>:<ordinal>` and ordinals count up from zero.
#[test]
fn chunking_preserves_order_and_ids() {
    let body = (0..10)
        .map(|index| format!("token{index}"))
        .collect::<Vec<_>>()
        .join(" ");
    let document = SearchDocument::new("doc-1", body);
    let options = IngestionOptions {
        chunk_tokens: 4,
        chunk_overlap_tokens: 1,
        store_raw_text: true,
    };
    let processing = TextProcessingOptions::default();
    let chunks = chunk_document(&document, &options, &processing).unwrap();
    let (first, second) = (&chunks[0], &chunks[1]);
    assert_eq!(first.chunk_id, "doc-1:0");
    assert_eq!(second.chunk_id, "doc-1:1");
    assert_eq!(first.ordinal, 0);
    assert_eq!(second.ordinal, 1);
}
// The sentence strategy yields exactly one chunk per sentence.
#[test]
fn explicit_chunking_can_use_sentences() {
    let document = SearchDocument::new("doc-1", "First sentence. Second sentence.");
    let chunking = ChunkingOptions {
        strategy: ChunkingStrategy::Sentence,
        ..ChunkingOptions::default()
    };
    let processing = TextProcessingOptions::default();
    let chunks = chunk_search_document(&document, &chunking, &processing).unwrap();
    let texts = chunks
        .iter()
        .map(|chunk| chunk.text.as_str())
        .collect::<Vec<_>>();
    assert_eq!(texts, ["First sentence.", "Second sentence."]);
}
// With a reranker in the context, the top result follows the backend's scores and reports
// the backend's runtime and the configured model id.
#[test]
fn rerank_context_uses_backend_scores() {
    let request = RerankRequest {
        query: "pick second".to_string(),
        documents: vec!["first document".to_string(), "second document".to_string()],
        top_k: 2,
        imported_scores: Vec::new(),
    };
    let mut reranker = FakeReranker;
    let mut context = RerankExecutionContext {
        reranker: Some(&mut reranker),
        model_id: Some("fake-reranker".to_string()),
    };
    let response = rerank_documents_with_context(request, &mut context).unwrap();
    let best = &response.results[0];
    assert_eq!(best.index, 1);
    assert_eq!(best.runtime.as_deref(), Some("external"));
    assert_eq!(best.model_id.as_deref(), Some("fake-reranker"));
}
// A whitespace-only query is rejected with an invalid-argument error mentioning "query".
#[test]
fn empty_query_returns_error() {
    let index = RetrievalIndex::new(embedder());
    let outcome = index.search(&query(" "));
    let mentions_query = match outcome.unwrap_err() {
        DetectError::InvalidArgument(message) => message.contains("query"),
        _ => false,
    };
    assert!(mentions_query);
}
// Segment fields and attributes land in the document metadata and survive the trip through
// chunking back into the document and segment contracts.
#[test]
fn search_document_from_text_segment_contract_preserves_metadata() {
    let mut segment = TextSegmentContract::new(7, "hello retrieval");
    segment.stream_id = Some("subs".to_string());
    segment.language = Some("en".to_string());
    let timestamp =
        video_analysis_core::Timestamp::new(1250, video_analysis_core::Timebase::new(1, 1000));
    segment.timestamp = Some(timestamp.into());
    segment.duration_seconds = Some(1.25);
    for (key, value) in [
        ("speaker", "narrator"),
        ("confidence", "0.8"),
        ("source", "fixture.srt"),
    ] {
        segment
            .attributes
            .insert(key.to_string(), value.to_string());
    }

    let document = SearchDocument::from_text_segment_contract(&segment);
    assert_eq!(document.id, "subs:7");
    assert_eq!(document.body, "hello retrieval");
    for (key, expected) in [
        ("language", "en"),
        ("speaker", "narrator"),
        ("confidence", "0.8"),
        ("timestamp_seconds", "1.25"),
        ("duration_seconds", "1.25"),
        ("source", "fixture.srt"),
    ] {
        assert_eq!(document.metadata[key], expected);
    }

    let mut chunks = chunk_search_document(
        &document,
        &ChunkingOptions::default(),
        &TextProcessingOptions::default(),
    )
    .unwrap();
    let chunk = chunks.remove(0);
    let document_contract = chunk.to_text_document_contract();
    let segment_contract = chunk.to_text_segment_contract();
    assert_eq!(
        document_contract.source.unwrap().duration_seconds,
        Some(1.25)
    );
    assert_eq!(segment_contract.duration_seconds, Some(1.25));
    assert_eq!(segment_contract.attributes["speaker"], "narrator");
}
// Re-ingesting a document under the same id counts as a replacement and leaves none of the
// previous chunk texts behind.
#[test]
fn replacing_document_rebuilds_chunk_set() {
    let options = IngestionOptions {
        chunk_tokens: 3,
        chunk_overlap_tokens: 1,
        store_raw_text: true,
    };
    let mut index = RetrievalIndex::new(embedder());

    let first_version = SearchDocument::new("doc", "rust cargo crates docs search");
    index.ingest_documents(&[first_version], &options).unwrap();
    let original_texts = index
        .chunks
        .values()
        .map(|chunk| chunk.text.clone())
        .collect::<Vec<_>>();

    let second_version = SearchDocument::new("doc", "music playlists and recommendations");
    let report = index.ingest_documents(&[second_version], &options).unwrap();

    assert_eq!(report.documents_replaced, 1);
    assert!(index
        .chunks
        .keys()
        .all(|chunk_id| chunk_id.starts_with("doc:")));
    assert!(index
        .chunks
        .values()
        .all(|chunk| !original_texts.contains(&chunk.text)));
}
// A default (hybrid) query ranks the matching document first with a positive semantic score.
#[test]
fn hybrid_search_combines_bm25_and_vectors() {
    let rust_doc = SearchDocument {
        title: Some("Rust Search".to_string()),
        metadata: BTreeMap::from([
            ("lang".to_string(), "en".to_string()),
            ("tags".to_string(), "docs rust".to_string()),
        ]),
        ..SearchDocument::new(
            "doc-1",
            "Rust cargo crates enable semantic search over documentation.",
        )
    };
    let travel_doc = SearchDocument {
        metadata: BTreeMap::from([("lang".to_string(), "en".to_string())]),
        ..SearchDocument::new("doc-2", "Berlin travel plans and restaurant notes.")
    };
    let mut index = RetrievalIndex::new(embedder());
    index
        .ingest_documents(&[rust_doc, travel_doc], &IngestionOptions::default())
        .unwrap();

    let results = index.search(&query("rust documentation search")).unwrap();
    let best = &results[0];
    assert_eq!(best.document_id, "doc-1");
    assert!(best.semantic_score > 0.0);
}
// After ingestion the embedder is armed to panic, so a full-text search only succeeds if it
// never embeds the query.
#[test]
fn full_text_retrieval_skips_query_embedding() {
    let mut index = RetrievalIndex::new(FlaggedEmbedder::new());
    let document = SearchDocument::new("doc-1", "rust cargo full text search");
    index
        .ingest_documents(&[document], &IngestionOptions::default())
        .unwrap();
    index.embedder().panic_on_embed.set(true);

    let request = SearchQuery::full_text("cargo search", 1);
    let results = index.search(&request).unwrap();
    let best = &results[0];
    assert_eq!(best.document_id, "doc-1");
    assert_eq!(best.semantic_score, 0.0);
    assert!(best.lexical_score > 0.0);
}
// A semantic query sharing no tokens with the document still returns it, with a zero lexical
// score and a positive semantic score.
#[test]
fn semantic_search_does_not_require_lexical_matches() {
    let mut index = RetrievalIndex::new(embedder());
    let document = SearchDocument::new("doc-1", "alpha beta gamma");
    index
        .ingest_documents(&[document], &IngestionOptions::default())
        .unwrap();

    let request = SearchQuery::semantic("unshared tokens", 1);
    let results = index.search(&request).unwrap();
    let best = &results[0];
    assert_eq!(best.document_id, "doc-1");
    assert_eq!(best.lexical_score, 0.0);
    assert!(best.semantic_score > 0.0);
}
// Forty documents share the same body; only the last one satisfies the filter. A top-1 search
// must still surface it.
#[test]
fn post_filter_search_expands_candidates_before_filtering() {
    let docs = (0..40)
        .map(|number| {
            let note = if number == 39 {
                "contains-special-target"
            } else {
                "ordinary"
            };
            SearchDocument {
                metadata: BTreeMap::from([("note".to_string(), note.to_string())]),
                ..SearchDocument::new(format!("doc-{number:02}"), "common search text")
            }
        })
        .collect::<Vec<_>>();
    let mut index = RetrievalIndex::new(embedder());
    index
        .ingest_documents(&docs, &IngestionOptions::default())
        .unwrap();

    let filter = SearchFilter {
        metadata_contains: BTreeMap::from([("note".to_string(), "special".to_string())]),
        document_ids: BTreeSet::from(["doc-39".to_string()]),
        ..SearchFilter::default()
    };
    let request = SearchQuery::full_text("common", 1).filter(filter);
    let results = index.search(&request).unwrap();
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].document_id, "doc-39");
}
// Two documents differ only in their `lang` metadata; filtering on `lang = en` plus the
// shared `docs` tag must return the English one alone.
#[test]
fn filters_exclude_non_matching_chunks() {
    let tagged_doc = |id: &str, body: &str, lang: &str| SearchDocument {
        metadata: BTreeMap::from([
            ("lang".to_string(), lang.to_string()),
            ("tags".to_string(), "docs".to_string()),
        ]),
        ..SearchDocument::new(id, body)
    };
    let documents = [
        tagged_doc("doc-1", "English cargo docs", "en"),
        tagged_doc("doc-2", "German cargo docs", "de"),
    ];
    let mut index = RetrievalIndex::new(embedder());
    index
        .ingest_documents(&documents, &IngestionOptions::default())
        .unwrap();

    let mut request = query("cargo docs");
    request.filter = Some(SearchFilter {
        metadata_equals: BTreeMap::from([("lang".to_string(), "en".to_string())]),
        metadata_contains: BTreeMap::new(),
        required_tags: vec!["docs".to_string()],
        document_ids: BTreeSet::new(),
    });
    let results = index.search(&request).unwrap();
    assert_eq!(results.len(), 1);
    assert_eq!(results[0].document_id, "doc-1");
}
// The chunk used as the lookup key never appears among its own related chunks.
#[test]
fn related_chunks_excludes_self() {
    let options = IngestionOptions {
        chunk_tokens: 4,
        chunk_overlap_tokens: 1,
        store_raw_text: true,
    };
    let document = SearchDocument::new(
        "doc-1",
        "rust cargo crates semantic search documentation indexing retrieval",
    );
    let mut index = RetrievalIndex::new(embedder());
    index.ingest_documents(&[document], &options).unwrap();

    let related = index.related_chunks("doc-1:0", 2).unwrap();
    let contains_self = related.iter().any(|result| result.chunk_id == "doc-1:0");
    assert!(!contains_self);
}
// A stored 2-component vector handed to `from_parts` together with the 32-dimension test
// embedder must be rejected with an invalid-argument error mentioning "dimensions".
#[test]
fn from_parts_validates_dimension_mismatch() {
    let chunk = DocumentChunk {
        chunk_id: "doc:0".to_string(),
        document_id: "doc".to_string(),
        text: "rust cargo docs".to_string(),
        ordinal: 0,
        metadata: BTreeMap::new(),
        source: None,
        provenance: Vec::new(),
        annotations: Vec::new(),
    };
    let record = SerializableVectorRecord {
        id: "doc:0".into(),
        vector: vec![1.0, 0.0],
        payload: VectorRecordMetadata::default(),
    };
    let outcome = RetrievalIndex::from_parts(
        embedder(),
        CorpusOptions::default(),
        Bm25Options::default(),
        vec![chunk],
        BTreeMap::new(),
        vec![record],
    );
    let mentions_dimensions = match outcome.unwrap_err() {
        DetectError::InvalidArgument(message) => message.contains("dimensions"),
        _ => false,
    };
    assert!(mentions_dimensions);
}
}