use anyhow::{Context, Result, anyhow, bail};
use crossbeam_channel as mpsc;
use frankensearch::lexical::{
BooleanQuery, CASS_SCHEMA_HASH as FS_CASS_SCHEMA_HASH, CassFields as FsCassFields,
CassQueryFilters as FsCassQueryFilters, CassQueryToken as FsCassQueryToken,
CassSourceFilter as FsCassSourceFilter, CassWildcardPattern as FsCassWildcardPattern, Count,
IndexReader, IndexRecordOption, LexicalDocHit as FsLexicalDocHit,
LexicalSearchResult as FsLexicalSearchResult, Occur, Query, ReloadPolicy, Searcher,
SnippetConfig as FsSnippetConfig, TantivyDocument, Term, TermQuery, TopDocs, Value,
cass_build_tantivy_query as fs_cass_build_tantivy_query,
cass_has_boolean_operators as fs_cass_has_boolean_operators,
cass_open_search_reader as fs_cass_open_search_reader,
cass_parse_boolean_query as fs_cass_parse_boolean_query,
cass_sanitize_query as fs_cass_sanitize_query, load_doc as fs_load_doc,
render_snippet_html as fs_render_snippet_html,
try_build_snippet_generator as fs_try_build_snippet_generator,
};
use frankensearch::{
Cx as FsCx, InMemoryTwoTierIndex as FsInMemoryTwoTierIndex,
InMemoryVectorIndex as FsInMemoryVectorIndex, LexicalSearch as FsLexicalSearch,
QueryClass as FsQueryClass, RrfConfig as FsRrfConfig, ScoreSource as FsScoreSource,
ScoredResult as FsScoredResult, SearchError as FsSearchError, SearchFuture as FsSearchFuture,
SearchPhase as FsSearchPhase, SyncEmbedderAdapter as FsSyncEmbedderAdapter,
SyncTwoTierSearcher as FsSyncTwoTierSearcher, TwoTierConfig as FsTwoTierConfig,
TwoTierIndex as FsTwoTierIndex, TwoTierSearcher as FsTwoTierSearcher, VectorHit as FsVectorHit,
candidate_count as fs_candidate_count,
core::filter::SearchFilter as FsSearchFilter,
index::{
HNSW_DEFAULT_EF_SEARCH as FS_HNSW_DEFAULT_EF_SEARCH, HnswIndex as FsHnswIndex,
VectorIndex as FsVectorIndex,
},
rrf_fuse as fs_rrf_fuse,
};
use lru::LruCache;
use once_cell::sync::Lazy;
use parking_lot::RwLock;
use std::cell::RefCell;
use std::cmp::Ordering as CmpOrdering;
use std::collections::{HashMap, HashSet, VecDeque};
use std::hash::{Hash, Hasher};
use std::num::NonZeroUsize;
use std::path::{Path, PathBuf};
use std::sync::atomic::{AtomicU64, Ordering};
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use frankensqlite::Connection;
#[cfg(test)]
use frankensqlite::compat::OptionalExtension;
use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
#[cfg(test)]
use frankensqlite::params;
struct SendConnection(Connection);
type TantivyContentExactKey = (i64, i64);
type TantivyContentFallbackKey = (String, String, i64);
type TantivyHydratedContentMaps = (
HashMap<TantivyContentExactKey, String>,
HashMap<TantivyContentFallbackKey, String>,
);
type SqliteFtsHydratedRow = (
i64,
String,
String,
String,
String,
String,
Option<i64>,
Option<i64>,
Option<i64>,
Option<String>,
Option<String>,
Option<String>,
);
const SQLITE_FTS5_HYDRATE_PARAM_CHUNK: usize = 30_000;
const SQLITE_MAX_VARIABLE_NUMBER: usize = 32_766;
const SEARCH_SQLITE_HYDRATION_CACHE_KIB: i64 = 4_096;
const SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER: usize = 4;
unsafe impl Send for SendConnection {}
impl std::ops::Deref for SendConnection {
type Target = Connection;
fn deref(&self) -> &Connection {
&self.0
}
}
fn open_search_hydration_sqlite(path: &Path, timeout: Duration) -> Result<Connection> {
let conn =
crate::storage::sqlite::open_franken_raw_readonly_connection_with_timeout(path, timeout)?;
conn.execute("PRAGMA query_only = 1;")
.with_context(|| "setting search hydration query_only")?;
conn.execute("PRAGMA busy_timeout = 5000;")
.with_context(|| "setting search hydration busy_timeout")?;
conn.execute(&format!(
"PRAGMA cache_size = -{SEARCH_SQLITE_HYDRATION_CACHE_KIB};"
))
.with_context(|| "setting search hydration cache_size")?;
Ok(conn)
}
fn nfc_sanitize_query(raw: &str) -> String {
use unicode_normalization::UnicodeNormalization;
let nfc: String = raw.nfc().collect();
fs_cass_sanitize_query(&nfc)
}
fn franken_query_map_collect_retry<T, F>(
conn: &Connection,
sql: &str,
params: &[ParamValue],
map: F,
) -> Result<Vec<T>, frankensqlite::FrankenError>
where
F: Copy + Fn(&frankensqlite::Row) -> Result<T, frankensqlite::FrankenError>,
{
let deadline = Instant::now() + Duration::from_secs(2);
let mut backoff = Duration::from_millis(4);
loop {
match conn.query_map_collect(sql, params, |row| map(row)) {
Ok(values) => return Ok(values),
Err(err) if crate::storage::sqlite::retryable_franken_error(&err) => {
let now = Instant::now();
if now >= deadline {
return Err(err);
}
let remaining = deadline.saturating_duration_since(now);
crate::storage::sqlite::sleep_with_franken_retry_backoff(
&mut backoff,
remaining,
Duration::from_millis(64),
);
}
Err(err) => return Err(err),
}
}
}
fn hydrate_message_content_by_conversation(
conn: &Connection,
requests: &[TantivyContentExactKey],
) -> Result<HashMap<TantivyContentExactKey, String>> {
if requests.is_empty() {
return Ok(HashMap::new());
}
let mut wanted_by_conversation: HashMap<i64, HashSet<i64>> = HashMap::new();
for &(conversation_id, line_idx) in requests {
wanted_by_conversation
.entry(conversation_id)
.or_default()
.insert(line_idx);
}
let mut conversation_ids = wanted_by_conversation.keys().copied().collect::<Vec<_>>();
conversation_ids.sort_unstable();
let mut hydrated = HashMap::with_capacity(requests.len());
for conversation_id in conversation_ids {
let Some(wanted_indices) = wanted_by_conversation.get(&conversation_id) else {
continue;
};
let mut wanted_indices = wanted_indices.iter().copied().collect::<Vec<_>>();
wanted_indices.sort_unstable();
let placeholders = sql_placeholders(wanted_indices.len());
let sql = format!(
"SELECT m.conversation_id, m.idx, m.content
FROM messages m INDEXED BY sqlite_autoindex_messages_1
WHERE m.conversation_id = ? AND m.idx IN ({placeholders})
ORDER BY m.idx"
);
let mut params = Vec::with_capacity(wanted_indices.len() + 1);
params.push(ParamValue::from(conversation_id));
params.extend(wanted_indices.iter().copied().map(ParamValue::from));
let rows: Vec<(i64, i64, String)> =
franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?))
})?;
for (conversation_id, line_idx, content) in rows {
hydrated.insert((conversation_id, line_idx), content);
}
}
Ok(hydrated)
}
fn semantic_message_id_from_db(message_id: i64) -> std::io::Result<u64> {
u64::try_from(message_id).map_err(|_| std::io::Error::other("negative message_id"))
}
fn semantic_doc_component_id_from_db(raw: Option<i64>) -> u32 {
raw.map(|value| u32::try_from(value.max(0)).unwrap_or(u32::MAX))
.unwrap_or(0)
}
use crate::search::canonicalize::{canonicalize_for_embedding, content_hash, is_search_noise_text};
use crate::search::embedder::Embedder;
use crate::search::vector_index::{
ROLE_USER, SemanticDocId, SemanticFilter, SemanticFilterMaps, VectorIndex, VectorSearchResult,
parse_semantic_doc_id, role_code_from_str,
};
use crate::sources::provenance::SourceFilter;
pub struct StringInterner {
cache: RwLock<LruCache<Arc<str>, Arc<str>>>,
}
impl StringInterner {
pub fn new(capacity: usize) -> Self {
Self {
cache: RwLock::new(LruCache::new(
NonZeroUsize::new(capacity).expect("capacity must be > 0"),
)),
}
}
pub fn intern(&self, s: &str) -> Arc<str> {
{
let cache = self.cache.read();
if let Some(arc) = cache.peek(s) {
return Arc::clone(arc);
}
}
let mut cache = self.cache.write();
if let Some(arc) = cache.get(s) {
return Arc::clone(arc);
}
let arc: Arc<str> = Arc::from(s);
cache.put(Arc::clone(&arc), Arc::clone(&arc));
arc
}
#[allow(dead_code)]
pub fn len(&self) -> usize {
self.cache.read().len()
}
#[allow(dead_code)]
pub fn is_empty(&self) -> bool {
self.cache.read().is_empty()
}
}
static CACHE_KEY_INTERNER: Lazy<StringInterner> = Lazy::new(|| StringInterner::new(10_000));
#[inline]
fn intern_cache_key(s: &str) -> Arc<str> {
CACHE_KEY_INTERNER.intern(s)
}
#[inline]
pub fn sql_placeholders(count: usize) -> String {
if count == 0 {
return String::new();
}
let capacity = count.saturating_mul(2).saturating_sub(1);
let mut result = String::with_capacity(capacity);
for i in 0..count {
if i > 0 {
result.push(',');
}
result.push('?');
}
result
}
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct SearchFilters {
pub agents: HashSet<String>,
pub workspaces: HashSet<String>,
pub created_from: Option<i64>,
pub created_to: Option<i64>,
#[serde(skip_serializing_if = "SourceFilter::is_all")]
pub source_filter: SourceFilter,
#[serde(skip_serializing_if = "HashSet::is_empty")]
pub session_paths: HashSet<String>,
}
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, clap::ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum SearchMode {
Lexical,
Semantic,
#[default]
Hybrid,
}
impl SearchMode {
pub fn next(self) -> Self {
match self {
SearchMode::Lexical => SearchMode::Semantic,
SearchMode::Semantic => SearchMode::Hybrid,
SearchMode::Hybrid => SearchMode::Lexical,
}
}
}
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SemanticTierMode {
#[default]
Single,
Progressive,
FastOnly,
QualityOnly,
}
impl SemanticTierMode {
const fn wants_two_tier(self) -> bool {
!matches!(self, Self::Single)
}
fn to_frankensearch_config(self) -> FsTwoTierConfig {
let mut config = frankensearch_two_tier_config();
match self {
Self::Single | Self::Progressive => {}
Self::FastOnly => {
config.fast_only = true;
}
Self::QualityOnly => {
config.fast_only = false;
config.quality_weight = 1.0;
}
}
config
}
}
const PROGRESSIVE_EMBEDDING_CACHE_CAPACITY: usize = 64;
const ANN_CANDIDATE_MULTIPLIER: usize = 4;
const HYBRID_NO_LIMIT_PLANNING_WINDOW: usize = 64;
const HYBRID_NO_LIMIT_SEMANTIC_CAP: usize = 2048;
const AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS: usize = 16;
pub const NO_LIMIT_RESULT_MIN: usize = 1_000;
pub const NO_LIMIT_RESULT_MAX: usize = 1_000_000;
const AVG_HIT_BYTES: u64 = 80 * 1024;
const NO_LIMIT_BYTES_CEILING: u64 = 16 * 1024 * 1024 * 1024;
const NO_LIMIT_BYTES_FLOOR: u64 = 256 * 1024 * 1024;
const NO_LIMIT_RAM_DIVISOR: u64 = 16;
fn available_memory_bytes() -> Option<u64> {
let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
for line in meminfo.lines() {
if let Some(rest) = line.strip_prefix("MemAvailable:") {
let kb: u64 = rest.split_whitespace().next()?.parse().ok()?;
return Some(kb.saturating_mul(1024));
}
}
None
}
fn no_limit_result_cap() -> usize {
static CAP: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
*CAP.get_or_init(|| {
compute_no_limit_result_cap_from(
std::env::var("CASS_SEARCH_NO_LIMIT_CAP").ok(),
std::env::var("CASS_SEARCH_NO_LIMIT_BYTES").ok(),
available_memory_bytes(),
)
})
}
fn compute_no_limit_result_cap_from(
cap_env: Option<String>,
bytes_env: Option<String>,
available_bytes: Option<u64>,
) -> usize {
if let Some(hits) = cap_env
.and_then(|v| v.parse::<usize>().ok())
.filter(|v| *v > 0)
{
return hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
}
let budget_bytes = no_limit_budget_bytes(bytes_env, available_bytes);
let hits = (budget_bytes / AVG_HIT_BYTES) as usize;
hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX)
}
fn no_limit_budget_bytes(bytes_env: Option<String>, available_bytes: Option<u64>) -> u64 {
bytes_env
.and_then(|v| v.parse::<u64>().ok())
.filter(|v| *v > 0)
.or_else(|| no_limit_available_memory_budget(available_bytes))
.unwrap_or(NO_LIMIT_BYTES_FLOOR)
}
fn no_limit_available_memory_budget(available_bytes: Option<u64>) -> Option<u64> {
available_bytes.map(|avail| {
(avail / NO_LIMIT_RAM_DIVISOR).clamp(NO_LIMIT_BYTES_FLOOR, NO_LIMIT_BYTES_CEILING)
})
}
static FRANKENSEARCH_TWO_TIER_CONFIG: Lazy<FsTwoTierConfig> =
Lazy::new(|| FsTwoTierConfig::optimized().with_env_overrides());
fn frankensearch_two_tier_config() -> FsTwoTierConfig {
FRANKENSEARCH_TWO_TIER_CONFIG.clone()
}
#[inline]
const fn progressive_phase_fetch_limit(limit: usize) -> usize {
let limit = if limit == 0 { 1 } else { limit };
limit.saturating_mul(3)
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct HybridCandidateBudget {
lexical_candidates: usize,
semantic_candidates: usize,
}
#[inline]
const fn hybrid_stage_multipliers(query_class: FsQueryClass) -> (usize, usize) {
match query_class {
FsQueryClass::Identifier => (6, 2),
FsQueryClass::ShortKeyword => (4, 4),
FsQueryClass::NaturalLanguage => (2, 8),
FsQueryClass::Empty => (0, 0),
}
}
#[inline]
fn hybrid_candidate_budget(
query: &str,
requested_limit: usize,
effective_limit: usize,
offset: usize,
total_docs: usize,
) -> HybridCandidateBudget {
let query_class = FsQueryClass::classify(query);
let (lex_mult, sem_mult) = hybrid_stage_multipliers(query_class);
let total_docs = total_docs.max(1);
if requested_limit == 0 {
let planning_window = HYBRID_NO_LIMIT_PLANNING_WINDOW.max(offset.saturating_add(1));
let lexical = effective_limit.min(total_docs).min(no_limit_result_cap());
let semantic = fs_candidate_count(planning_window, 0, sem_mult)
.max(planning_window)
.min(HYBRID_NO_LIMIT_SEMANTIC_CAP.max(offset.saturating_add(planning_window)))
.min(total_docs)
.min(lexical);
return HybridCandidateBudget {
lexical_candidates: lexical,
semantic_candidates: semantic,
};
}
let lexical = fs_candidate_count(requested_limit, offset, lex_mult.max(1))
.max(requested_limit.saturating_add(offset))
.min(total_docs);
let semantic = fs_candidate_count(requested_limit, offset, sem_mult.max(1))
.max(requested_limit.saturating_add(offset))
.min(total_docs);
HybridCandidateBudget {
lexical_candidates: lexical,
semantic_candidates: semantic,
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryType {
Simple,
Phrase,
Boolean,
Wildcard,
Filtered,
Empty,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum IndexStrategy {
EdgeNgram,
RegexScan,
BooleanCombination,
RangeScan,
FullScan,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryCost {
Low,
Medium,
High,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedSubTerm {
pub text: String,
pub pattern: String,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedTerm {
pub text: String,
pub negated: bool,
pub subterms: Vec<ParsedSubTerm>,
}
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct ParsedQuery {
pub terms: Vec<ParsedTerm>,
pub phrases: Vec<String>,
pub operators: Vec<String>,
pub implicit_and: bool,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct QueryExplanation {
pub original_query: String,
pub sanitized_query: String,
pub parsed: ParsedQuery,
pub query_type: QueryType,
pub index_strategy: IndexStrategy,
pub wildcard_applied: bool,
pub estimated_cost: QueryCost,
pub filters_summary: FiltersSummary,
pub warnings: Vec<String>,
}
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct FiltersSummary {
pub agent_count: usize,
pub workspace_count: usize,
pub has_time_filter: bool,
pub description: Option<String>,
}
impl QueryExplanation {
pub fn analyze(query: &str, filters: &SearchFilters) -> Self {
let sanitized = nfc_sanitize_query(query);
let tokens = fs_cass_parse_boolean_query(query);
let mut parsed = ParsedQuery::default();
let mut has_explicit_operator = false;
let mut next_negated = false;
for token in &tokens {
match token {
FsCassQueryToken::Term(t) => {
let parts: Vec<String> = nfc_sanitize_query(t)
.split_whitespace()
.map(|s| s.to_string())
.collect();
if parts.is_empty() {
next_negated = false;
continue;
}
let mut subterms = Vec::new();
for part in parts {
let pattern = FsCassWildcardPattern::parse(&part);
let pattern_str = match &pattern {
FsCassWildcardPattern::Exact(_) => "exact",
FsCassWildcardPattern::Prefix(_) => "prefix (*)",
FsCassWildcardPattern::Suffix(_) => "suffix (*)",
FsCassWildcardPattern::Substring(_) => "substring (*)",
FsCassWildcardPattern::Complex(_) => "complex (*)",
};
subterms.push(ParsedSubTerm {
text: part,
pattern: pattern_str.to_string(),
});
}
parsed.terms.push(ParsedTerm {
text: t.clone(),
negated: next_negated,
subterms,
});
next_negated = false;
}
FsCassQueryToken::Phrase(p) => {
let parts: Vec<String> = nfc_sanitize_query(p)
.split_whitespace()
.map(|s| s.trim_matches('*').to_lowercase())
.filter(|s| !s.is_empty())
.collect();
if !parts.is_empty() {
parsed.phrases.push(parts.join(" "));
}
next_negated = false;
}
FsCassQueryToken::And => {
parsed.operators.push("AND".to_string());
has_explicit_operator = true;
}
FsCassQueryToken::Or => {
parsed.operators.push("OR".to_string());
has_explicit_operator = true;
}
FsCassQueryToken::Not => {
parsed.operators.push("NOT".to_string());
has_explicit_operator = true;
next_negated = true;
}
}
}
parsed.implicit_and = !has_explicit_operator && parsed.terms.len() > 1;
let query_type = Self::classify_query(&parsed, filters, &sanitized);
let index_strategy = Self::determine_strategy(&parsed, &sanitized);
let estimated_cost = Self::estimate_cost(&parsed, &index_strategy, filters);
let filters_summary = Self::summarize_filters(filters);
let warnings = Self::generate_warnings(&parsed, &sanitized, filters);
Self {
original_query: query.to_string(),
sanitized_query: sanitized,
parsed,
query_type,
index_strategy,
wildcard_applied: false, estimated_cost,
filters_summary,
warnings,
}
}
fn classify_query(parsed: &ParsedQuery, filters: &SearchFilters, sanitized: &str) -> QueryType {
if sanitized.trim().is_empty() {
return QueryType::Empty;
}
let has_filters = !filters.agents.is_empty()
|| !filters.workspaces.is_empty()
|| filters.created_from.is_some()
|| filters.created_to.is_some()
|| !filters.source_filter.is_all();
if has_filters {
return QueryType::Filtered;
}
if !parsed.operators.is_empty() {
return QueryType::Boolean;
}
if !parsed.phrases.is_empty() {
return QueryType::Phrase;
}
let has_wildcards = parsed
.terms
.iter()
.flat_map(|t| &t.subterms)
.any(|t| t.pattern != "exact");
if has_wildcards {
return QueryType::Wildcard;
}
QueryType::Simple
}
fn determine_strategy(parsed: &ParsedQuery, sanitized: &str) -> IndexStrategy {
if sanitized.trim().is_empty() {
return IndexStrategy::FullScan;
}
let has_leading_wildcard = parsed
.terms
.iter()
.flat_map(|t| &t.subterms)
.any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
if has_leading_wildcard {
return IndexStrategy::RegexScan;
}
let has_compound_terms = parsed.terms.iter().any(|t| t.subterms.len() > 1);
if !parsed.operators.is_empty()
|| parsed.terms.len() > 1
|| !parsed.phrases.is_empty()
|| has_compound_terms
{
return IndexStrategy::BooleanCombination;
}
IndexStrategy::EdgeNgram
}
fn estimate_cost(
parsed: &ParsedQuery,
strategy: &IndexStrategy,
filters: &SearchFilters,
) -> QueryCost {
if matches!(strategy, IndexStrategy::RegexScan) {
return QueryCost::High;
}
if matches!(strategy, IndexStrategy::FullScan) {
return QueryCost::High;
}
let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
let term_count: usize = parsed.terms.iter().map(|t| t.subterms.len()).sum();
let operator_count = parsed.operators.len();
let phrase_count = parsed.phrases.len();
let complexity = term_count + operator_count * 2 + phrase_count * 2;
if complexity > 6 || has_time_filter {
QueryCost::High
} else if complexity > 2 {
QueryCost::Medium
} else {
QueryCost::Low
}
}
fn summarize_filters(filters: &SearchFilters) -> FiltersSummary {
let agent_count = filters.agents.len();
let workspace_count = filters.workspaces.len();
let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
let mut parts = Vec::new();
if agent_count > 0 {
parts.push(format!(
"{} agent{}",
agent_count,
if agent_count > 1 { "s" } else { "" }
));
}
if workspace_count > 0 {
parts.push(format!(
"{} workspace{}",
workspace_count,
if workspace_count > 1 { "s" } else { "" }
));
}
if has_time_filter {
parts.push("time range".to_string());
}
let description = if parts.is_empty() {
None
} else {
Some(format!("Filtering by: {}", parts.join(", ")))
};
FiltersSummary {
agent_count,
workspace_count,
has_time_filter,
description,
}
}
fn generate_warnings(
parsed: &ParsedQuery,
sanitized: &str,
filters: &SearchFilters,
) -> Vec<String> {
let mut warnings = Vec::new();
let has_leading_wildcard = parsed
.terms
.iter()
.flat_map(|t| &t.subterms)
.any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
if has_leading_wildcard {
warnings.push(
"Leading wildcards (*foo) require regex scan and may be slow on large indexes"
.to_string(),
);
}
for term in &parsed.terms {
for sub in &term.subterms {
if sub.text.trim_matches('*').len() < 2 {
warnings.push(format!(
"Very short term '{}' may match many documents",
sub.text
));
}
}
}
if sanitized.trim().is_empty() {
warnings.push("Empty query will return all documents (expensive)".to_string());
}
if parsed.operators.len() > 3 {
warnings.push("Complex boolean query may have unexpected precedence".to_string());
}
if let Some(agent) = filters.agents.iter().next()
&& filters.agents.len() == 1
&& filters.workspaces.is_empty()
{
warnings.push(format!(
"Searching only in agent '{}' - results from other agents will be excluded",
agent
));
}
warnings
}
pub fn with_wildcard_fallback(mut self, applied: bool) -> Self {
self.wildcard_applied = applied;
if applied
&& !self
.warnings
.iter()
.any(|w| w.contains("wildcard fallback"))
{
self.warnings.push(
"Wildcard fallback was applied automatically due to sparse exact matches"
.to_string(),
);
}
self
}
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum MatchType {
#[default]
Exact,
Prefix,
Suffix,
Substring,
Wildcard,
ImplicitWildcard,
}
impl MatchType {
pub fn quality_factor(self) -> f32 {
match self {
MatchType::Exact => 1.0,
MatchType::Prefix => 0.9,
MatchType::Suffix => 0.8,
MatchType::Substring => 0.7,
MatchType::Wildcard => 0.65,
MatchType::ImplicitWildcard => 0.6,
}
}
}
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SuggestionKind {
SpellingFix,
WildcardQuery,
RemoveFilter,
AlternateAgent,
BroaderDateRange,
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct QuerySuggestion {
pub kind: SuggestionKind,
pub message: String,
pub suggested_query: Option<String>,
pub suggested_filters: Option<SearchFilters>,
pub shortcut: Option<u8>,
}
impl QuerySuggestion {
fn spelling(_query: &str, corrected: &str) -> Self {
Self {
kind: SuggestionKind::SpellingFix,
message: format!("Did you mean: \"{corrected}\"?"),
suggested_query: Some(corrected.to_string()),
suggested_filters: None,
shortcut: None,
}
}
fn wildcard(query: &str) -> Self {
let wildcard_query = format!("*{}*", query.trim_matches('*'));
Self {
kind: SuggestionKind::WildcardQuery,
message: format!("Try broader search: \"{wildcard_query}\""),
suggested_query: Some(wildcard_query),
suggested_filters: None,
shortcut: None,
}
}
fn remove_agent_filter(current_agent: &str, current_filters: &SearchFilters) -> Self {
let mut filters = current_filters.clone();
filters.agents.clear();
Self {
kind: SuggestionKind::RemoveFilter,
message: format!("Remove agent filter (currently: {current_agent})"),
suggested_query: None,
suggested_filters: Some(filters),
shortcut: None,
}
}
fn try_agent(agent_slug: &str) -> Self {
let mut filters = SearchFilters::default();
filters.agents.insert(agent_slug.to_string());
Self {
kind: SuggestionKind::AlternateAgent,
message: format!("Try searching in: {agent_slug}"),
suggested_query: None,
suggested_filters: Some(filters),
shortcut: None,
}
}
fn with_shortcut(mut self, key: u8) -> Self {
self.shortcut = Some(key);
self
}
}
#[derive(Debug, Clone, Copy)]
pub struct FieldMask {
flags: u8,
preview_content_chars: Option<usize>,
}
impl FieldMask {
const CONTENT: u8 = 1 << 0;
const SNIPPET: u8 = 1 << 1;
const TITLE: u8 = 1 << 2;
const CACHE: u8 = 1 << 3;
pub const FULL: Self = Self {
flags: Self::CONTENT | Self::SNIPPET | Self::TITLE | Self::CACHE,
preview_content_chars: None,
};
pub fn new(
wants_content: bool,
wants_snippet: bool,
wants_title: bool,
allows_cache: bool,
) -> Self {
let mut flags = 0;
if wants_content {
flags |= Self::CONTENT;
}
if wants_snippet {
flags |= Self::SNIPPET;
}
if wants_title {
flags |= Self::TITLE;
}
if allows_cache {
flags |= Self::CACHE;
}
Self {
flags,
preview_content_chars: None,
}
}
pub fn with_preview_content_limit(mut self, max_chars: Option<usize>) -> Self {
self.preview_content_chars = max_chars;
if max_chars.is_some() {
self.flags &= !Self::CACHE;
}
self
}
pub fn needs_content(self) -> bool {
self.flags & Self::CONTENT != 0
}
pub fn wants_snippet(self) -> bool {
self.flags & Self::SNIPPET != 0
}
pub fn wants_title(self) -> bool {
self.flags & Self::TITLE != 0
}
pub fn allows_cache(self) -> bool {
self.flags & Self::CACHE != 0
}
pub fn preview_content_limit(self) -> Option<usize> {
self.preview_content_chars
}
}
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchHit {
pub title: String,
pub snippet: String,
pub content: String,
#[serde(skip_serializing)]
pub content_hash: u64,
#[serde(skip_serializing)]
pub conversation_id: Option<i64>,
pub score: f32,
pub source_path: String,
pub agent: String,
pub workspace: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub workspace_original: Option<String>,
pub created_at: Option<i64>,
pub line_number: Option<usize>,
#[serde(default)]
pub match_type: MatchType,
#[serde(default = "default_source_id")]
pub source_id: String,
#[serde(default = "default_source_id")]
pub origin_kind: String,
#[serde(skip_serializing_if = "Option::is_none")]
pub origin_host: Option<String>,
}
static LAZY_FIELDS_ENABLED: Lazy<bool> = Lazy::new(|| {
dotenvy::var("CASS_LAZY_FIELDS")
.ok()
.map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
.unwrap_or(true)
});
fn default_source_id() -> String {
"local".to_string()
}
fn effective_field_mask(field_mask: FieldMask) -> FieldMask {
if *LAZY_FIELDS_ENABLED {
field_mask
} else {
FieldMask::FULL
}
}
fn execute_query_with_lazy_exact_count(
searcher: &Searcher,
query: &dyn Query,
limit: usize,
offset: usize,
) -> Result<FsLexicalSearchResult> {
let top_docs = searcher.search(
query,
&TopDocs::with_limit(limit)
.and_offset(offset)
.order_by_score(),
)?;
let page_saturated = top_docs.len() == limit;
let total_count = if page_saturated {
searcher.search(query, &Count)?
} else {
offset.saturating_add(top_docs.len())
};
let hits = top_docs
.into_iter()
.enumerate()
.map(|(rank, (bm25_score, doc_address))| FsLexicalDocHit {
bm25_score,
rank,
doc_address,
})
.collect();
Ok(FsLexicalSearchResult { hits, total_count })
}
#[derive(Debug, Clone)]
pub struct SearchResult {
pub hits: Vec<SearchHit>,
pub wildcard_fallback: bool,
pub cache_stats: CacheStats,
pub suggestions: Vec<QuerySuggestion>,
pub ann_stats: Option<crate::search::ann_index::AnnSearchStats>,
pub total_count: Option<usize>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProgressivePhaseKind {
Initial,
Refined,
}
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone)]
pub enum ProgressiveSearchEvent {
Phase {
kind: ProgressivePhaseKind,
result: SearchResult,
elapsed_ms: u128,
},
RefinementFailed {
latency_ms: u128,
error: String,
},
}
#[derive(Debug, Clone)]
pub(crate) struct ProgressiveSearchRequest<'a> {
pub(crate) cx: &'a FsCx,
pub(crate) query: &'a str,
pub(crate) filters: SearchFilters,
pub(crate) limit: usize,
pub(crate) sparse_threshold: usize,
pub(crate) field_mask: FieldMask,
pub(crate) mode: SearchMode,
}
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SearchHitKey {
source_id: String,
source_path: String,
conversation_id: Option<i64>,
title: String,
line_number: Option<usize>,
created_at: Option<i64>,
content_hash: u64,
}
fn normalized_search_source_id_sql_expr(
source_id_column: &str,
origin_kind_column: &str,
origin_host_column: &str,
) -> String {
format!(
"CASE \
WHEN TRIM(COALESCE({source_id_column}, '')) != '' THEN \
CASE \
WHEN LOWER(TRIM(COALESCE({source_id_column}, ''))) = '{local}' THEN '{local}' \
ELSE TRIM(COALESCE({source_id_column}, '')) \
END \
WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) IN ('ssh', 'remote') THEN \
CASE \
WHEN TRIM(COALESCE({origin_host_column}, '')) = '' THEN 'remote' \
ELSE TRIM(COALESCE({origin_host_column}, '')) \
END \
WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) = '{local}' THEN '{local}' \
WHEN TRIM(COALESCE({origin_host_column}, '')) != '' THEN TRIM(COALESCE({origin_host_column}, '')) \
ELSE '{local}' \
END",
local = crate::sources::provenance::LOCAL_SOURCE_ID,
)
}
fn normalize_search_source_filter_value(source_id: &str) -> String {
let trimmed = source_id.trim();
if trimmed.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
} else {
trimmed.to_string()
}
}
fn normalized_search_hit_source_id_parts(
source_id: &str,
origin_kind: &str,
origin_host: Option<&str>,
) -> String {
let trimmed_source_id = source_id.trim();
if !trimmed_source_id.is_empty() {
if trimmed_source_id.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
}
return trimmed_source_id.to_string();
}
let trimmed_origin_host = origin_host.map(str::trim).filter(|value| !value.is_empty());
let trimmed_origin_kind = origin_kind.trim();
if trimmed_origin_kind.eq_ignore_ascii_case("ssh")
|| trimmed_origin_kind.eq_ignore_ascii_case("remote")
{
return trimmed_origin_host.unwrap_or("remote").to_string();
}
if let Some(origin_host) = trimmed_origin_host {
return origin_host.to_string();
}
crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
}
fn normalized_search_hit_origin_kind(source_id: &str, origin_kind: Option<&str>) -> String {
if let Some(kind) = origin_kind.map(str::trim).filter(|value| !value.is_empty()) {
if kind.eq_ignore_ascii_case("local") {
return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
}
if kind.eq_ignore_ascii_case("ssh") || kind.eq_ignore_ascii_case("remote") {
return "remote".to_string();
}
return kind.to_ascii_lowercase();
}
if source_id == crate::sources::provenance::LOCAL_SOURCE_ID {
crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
} else {
"remote".to_string()
}
}
fn normalized_search_hit_source_id(hit: &SearchHit) -> String {
normalized_search_hit_source_id_parts(
hit.source_id.as_str(),
hit.origin_kind.as_str(),
hit.origin_host.as_deref(),
)
}
impl SearchHitKey {
fn from_hit(hit: &SearchHit) -> Self {
Self {
source_id: normalized_search_hit_source_id(hit),
source_path: hit.source_path.clone(),
conversation_id: hit.conversation_id,
title: if hit.conversation_id.is_some() {
String::new()
} else {
hit.title.trim().to_string()
},
line_number: hit.line_number,
created_at: hit.created_at,
content_hash: hit.content_hash,
}
}
}
impl Ord for SearchHitKey {
fn cmp(&self, other: &Self) -> CmpOrdering {
self.source_id
.cmp(&other.source_id)
.then_with(|| self.source_path.cmp(&other.source_path))
.then_with(|| self.conversation_id.cmp(&other.conversation_id))
.then_with(|| self.title.cmp(&other.title))
.then_with(|| self.line_number.cmp(&other.line_number))
.then_with(|| self.created_at.cmp(&other.created_at))
.then_with(|| self.content_hash.cmp(&other.content_hash))
}
}
impl PartialOrd for SearchHitKey {
fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
Some(self.cmp(other))
}
}
const FEDERATED_RRF_K: f32 = 60.0;
#[derive(Debug)]
struct FederatedRankedHit {
hit: SearchHit,
shard_index: usize,
shard_rank: usize,
fused_score: f32,
}
fn federated_rrf_score(shard_rank: usize) -> f32 {
1.0 / (FEDERATED_RRF_K + shard_rank as f32 + 1.0)
}
fn merge_federated_ranked_hits(mut ranked_hits: Vec<FederatedRankedHit>) -> Vec<SearchHit> {
ranked_hits.sort_by(|a, b| {
b.fused_score
.total_cmp(&a.fused_score)
.then_with(|| a.shard_rank.cmp(&b.shard_rank))
.then_with(|| SearchHitKey::from_hit(&a.hit).cmp(&SearchHitKey::from_hit(&b.hit)))
.then_with(|| a.shard_index.cmp(&b.shard_index))
});
ranked_hits
.into_iter()
.map(|mut ranked| {
ranked.hit.score = ranked.fused_score;
ranked.hit
})
.collect()
}
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Default, Clone)]
struct HybridScore {
rrf: f32,
lexical_rank: Option<usize>,
semantic_rank: Option<usize>,
lexical_score: Option<f32>,
semantic_score: Option<f32>,
}
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct FusedHit {
key: SearchHitKey,
score: HybridScore,
hit: SearchHit,
}
pub(crate) fn stable_content_hash(content: &str) -> u64 {
use xxhash_rust::xxh3::Xxh3;
let mut hasher = Xxh3::new();
let mut first = true;
for token in content.split_whitespace() {
if !first {
hasher.update(b" ");
}
hasher.update(token.as_bytes());
first = false;
}
hasher.digest()
}
fn stable_hit_hash(
content: &str,
source_path: &str,
line_number: Option<usize>,
created_at: Option<i64>,
) -> u64 {
use xxhash_rust::xxh3::Xxh3;
let mut hasher = Xxh3::new();
if !content.is_empty() {
hasher.update(&stable_content_hash(content).to_le_bytes());
}
hasher.update(b"|");
hasher.update(source_path.as_bytes());
hasher.update(b"|");
if let Some(line) = line_number {
let mut buf = itoa::Buffer::new();
hasher.update(buf.format(line).as_bytes());
}
hasher.update(b"|");
if let Some(ts) = created_at {
let mut buf = itoa::Buffer::new();
hasher.update(buf.format(ts).as_bytes());
}
hasher.digest()
}
fn search_hit_key_doc_id(key: &SearchHitKey) -> String {
use std::fmt::Write as _;
const SEP: char = '\u{1f}';
let capacity = key.source_id.len()
+ key.source_path.len()
+ key.title.len()
+ 6 + 3 * 20 + 20; let mut out = String::with_capacity(capacity);
out.push_str(&key.source_id);
out.push(SEP);
out.push_str(&key.source_path);
out.push(SEP);
if let Some(v) = key.conversation_id {
let _ = write!(out, "{v}");
}
out.push(SEP);
out.push_str(&key.title);
out.push(SEP);
if let Some(v) = key.line_number {
let _ = write!(out, "{v}");
}
out.push(SEP);
if let Some(v) = key.created_at {
let _ = write!(out, "{v}");
}
out.push(SEP);
let _ = write!(out, "{}", key.content_hash);
out
}
fn search_hit_doc_id(hit: &SearchHit) -> String {
search_hit_key_doc_id(&SearchHitKey::from_hit(hit))
}
#[cfg(test)]
fn cmp_fused_hit_desc(a: &FusedHit, b: &FusedHit) -> CmpOrdering {
b.score
.rrf
.total_cmp(&a.score.rrf)
.then_with(|| {
let a_both = a.score.lexical_rank.is_some() && a.score.semantic_rank.is_some();
let b_both = b.score.lexical_rank.is_some() && b.score.semantic_rank.is_some();
match (b_both, a_both) {
(true, false) => CmpOrdering::Greater,
(false, true) => CmpOrdering::Less,
_ => CmpOrdering::Equal,
}
})
.then_with(|| a.key.cmp(&b.key))
}
#[cfg(test)]
#[allow(dead_code)]
const QUICKSELECT_THRESHOLD: usize = 64;
#[cfg(test)]
#[allow(dead_code)]
fn top_k_fused(mut hits: Vec<FusedHit>, k: usize) -> Vec<FusedHit> {
let n = hits.len();
if n == 0 || k == 0 {
return Vec::new();
}
if k >= n {
hits.sort_by(cmp_fused_hit_desc);
return hits;
}
if n < QUICKSELECT_THRESHOLD {
hits.sort_by(cmp_fused_hit_desc);
hits.truncate(k);
return hits;
}
hits.select_nth_unstable_by(k - 1, cmp_fused_hit_desc);
hits.truncate(k);
hits.sort_by(cmp_fused_hit_desc);
hits
}
pub fn rrf_fuse_hits(
lexical: &[SearchHit],
semantic: &[SearchHit],
query: &str,
limit: usize,
offset: usize,
) -> Vec<SearchHit> {
if limit == 0 {
return Vec::new();
}
let total_candidates = lexical.len().saturating_add(semantic.len());
if total_candidates == 0 {
return Vec::new();
}
let mut lexical_scored = Vec::with_capacity(lexical.len());
let mut semantic_scored = Vec::with_capacity(semantic.len());
let mut hit_by_doc_id: HashMap<String, SearchHit> = HashMap::with_capacity(total_candidates);
for hit in lexical {
let doc_id = search_hit_doc_id(hit);
hit_by_doc_id.insert(doc_id.clone(), hit.clone());
lexical_scored.push(FsScoredResult {
doc_id,
score: hit.score,
source: FsScoreSource::Lexical,
index: None,
fast_score: None,
quality_score: None,
lexical_score: Some(hit.score),
rerank_score: None,
explanation: None,
metadata: None,
});
}
for (idx, hit) in semantic.iter().enumerate() {
let doc_id = search_hit_doc_id(hit);
hit_by_doc_id
.entry(doc_id.clone())
.or_insert_with(|| hit.clone());
semantic_scored.push(FsVectorHit {
index: u32::try_from(idx).unwrap_or(u32::MAX),
score: hit.score,
doc_id,
});
}
let fused = fs_rrf_fuse(
&lexical_scored,
&semantic_scored,
total_candidates,
0,
&FsRrfConfig::default(),
);
#[derive(Clone, Copy)]
struct CompatSlot {
index: usize,
conversation_id: Option<i64>,
ambiguous: bool,
}
let mut source_ids: HashMap<String, u32> = HashMap::new();
let mut path_ids: HashMap<String, u32> = HashMap::new();
let mut title_ids: HashMap<String, u32> = HashMap::new();
let mut next_source_id: u32 = 0;
let mut next_path_id: u32 = 0;
let mut next_title_id: u32 = 0;
type CompatExactKey = (
u32,
u32,
Option<i64>,
Option<u32>,
Option<usize>,
Option<i64>,
u64,
);
type CompatFallbackKey = (u32, u32, u32, Option<usize>, Option<i64>, u64);
let mut exact_seen: HashMap<CompatExactKey, usize> = HashMap::with_capacity(fused.len());
let mut fallback_seen: HashMap<CompatFallbackKey, CompatSlot> =
HashMap::with_capacity(fused.len());
let mut unique_hits: Vec<SearchHit> = Vec::with_capacity(fused.len());
let update_slot = |slot: &mut CompatSlot, conversation_id: Option<i64>| {
if slot.ambiguous {
return;
}
match (slot.conversation_id, conversation_id) {
(Some(existing), Some(current)) if existing != current => slot.ambiguous = true,
(None, Some(current)) => slot.conversation_id = Some(current),
_ => {}
}
};
for fused_hit in fused {
let mut hit = match hit_by_doc_id.remove(&fused_hit.doc_id) {
Some(hit) => hit,
None => continue,
};
if hit_is_noise(&hit, query) {
continue;
}
let normalized_source_id = normalized_search_hit_source_id(&hit);
let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
*id
} else {
let id = next_source_id;
next_source_id = next_source_id.saturating_add(1);
source_ids.insert(normalized_source_id, id);
id
};
let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
*id
} else {
let id = next_path_id;
next_path_id = next_path_id.saturating_add(1);
path_ids.insert(hit.source_path.clone(), id);
id
};
let normalized_title = hit.title.trim();
let fallback_title_key = if let Some(id) = title_ids.get(normalized_title) {
*id
} else {
let id = next_title_id;
next_title_id = next_title_id.saturating_add(1);
title_ids.insert(normalized_title.to_string(), id);
id
};
let exact_title_key = if hit.conversation_id.is_some() {
None
} else {
Some(fallback_title_key)
};
let exact_key = (
source_key,
path_key,
hit.conversation_id,
exact_title_key,
hit.line_number,
hit.created_at,
hit.content_hash,
);
let fallback_key = (
source_key,
path_key,
fallback_title_key,
hit.line_number,
hit.created_at,
hit.content_hash,
);
let merged_idx = exact_seen.get(&exact_key).copied().or_else(|| {
fallback_seen.get(&fallback_key).and_then(|slot| {
if slot.ambiguous {
return None;
}
match (slot.conversation_id, hit.conversation_id) {
(Some(existing), Some(current)) if existing != current => None,
_ => Some(slot.index),
}
})
});
if let Some(existing_idx) = merged_idx {
exact_seen.insert(exact_key, existing_idx);
let slot = fallback_seen.entry(fallback_key).or_insert(CompatSlot {
index: existing_idx,
conversation_id: hit.conversation_id,
ambiguous: false,
});
update_slot(slot, hit.conversation_id);
if unique_hits[existing_idx].conversation_id.is_none() && hit.conversation_id.is_some()
{
unique_hits[existing_idx].conversation_id = hit.conversation_id;
}
unique_hits[existing_idx].score += fused_hit.rrf_score as f32;
continue;
}
hit.score = fused_hit.rrf_score as f32;
let index = unique_hits.len();
unique_hits.push(hit);
exact_seen.insert(exact_key, index);
match fallback_seen.get_mut(&fallback_key) {
Some(slot) => update_slot(slot, unique_hits[index].conversation_id),
None => {
fallback_seen.insert(
fallback_key,
CompatSlot {
index,
conversation_id: unique_hits[index].conversation_id,
ambiguous: false,
},
);
}
}
}
unique_hits.sort_by(|a, b| {
b.score
.total_cmp(&a.score)
.then_with(|| SearchHitKey::from_hit(a).cmp(&SearchHitKey::from_hit(b)))
});
let start = offset.min(unique_hits.len());
unique_hits.into_iter().skip(start).take(limit).collect()
}
struct QueryCache {
embedder_id: String,
embeddings: LruCache<String, Vec<f32>>,
}
impl QueryCache {
fn new(embedder_id: &str, capacity: NonZeroUsize) -> Self {
Self {
embedder_id: embedder_id.to_string(),
embeddings: LruCache::new(capacity),
}
}
fn align_embedder(&mut self, embedder: &dyn Embedder) {
if self.embedder_id != embedder.id() {
self.embedder_id = embedder.id().to_string();
self.embeddings.clear();
}
}
fn get_cached(&mut self, embedder: &dyn Embedder, canonical: &str) -> Option<Vec<f32>> {
self.align_embedder(embedder);
self.embeddings.get(canonical).cloned()
}
fn store(&mut self, embedder: &dyn Embedder, canonical: &str, embedding: Vec<f32>) {
self.align_embedder(embedder);
self.embeddings.put(canonical.to_string(), embedding);
}
}
fn semantic_filter_as_search_filter(filter: &SemanticFilter) -> Option<&dyn FsSearchFilter> {
let unrestricted = filter.agents.is_none()
&& filter.workspaces.is_none()
&& filter.sources.is_none()
&& filter.roles.is_none()
&& filter.created_from.is_none()
&& filter.created_to.is_none();
if unrestricted { None } else { Some(filter) }
}
fn open_fs_semantic_ann_index(fs_index: &FsVectorIndex, ann_path: &Path) -> Result<FsHnswIndex> {
if !ann_path.is_file() {
bail!(
"approximate search unavailable: HNSW index not found at {}",
ann_path.display()
);
}
let ann = FsHnswIndex::load(ann_path, fs_index)
.map_err(|err| anyhow!("open HNSW index failed: {err}"))?;
let matches = ann
.matches_vector_index(fs_index)
.map_err(|err| anyhow!("validate HNSW index failed: {err}"))?;
if !matches {
bail!(
"approximate search unavailable: HNSW index at {} is stale for current semantic index (run 'cass index --semantic --build-hnsw')",
ann_path.display()
);
}
Ok(ann)
}
struct SemanticSearchState {
context_token: Arc<()>,
embedder: Arc<dyn Embedder>,
fs_semantic_index: Arc<FsVectorIndex>,
fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
fs_ann_index: Option<Arc<FsHnswIndex>>,
ann_path: Option<PathBuf>,
fs_in_memory_two_tier_index: Option<Arc<FsInMemoryTwoTierIndex>>,
in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable,
progressive_context: Option<Arc<ProgressiveTwoTierContext>>,
progressive_context_unavailable: bool,
filter_maps: SemanticFilterMaps,
roles: Option<HashSet<u8>>,
query_cache: QueryCache,
}
#[derive(Debug, Clone, Copy, Default)]
struct InMemoryTwoTierUnavailable {
fast_only: bool,
quality: bool,
}
impl InMemoryTwoTierUnavailable {
fn is_known_unavailable(self, tier_mode: SemanticTierMode) -> bool {
match tier_mode {
SemanticTierMode::Single => false,
SemanticTierMode::FastOnly => self.fast_only,
SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => self.quality,
}
}
fn mark_unavailable(&mut self, tier_mode: SemanticTierMode) {
match tier_mode {
SemanticTierMode::Single => {}
SemanticTierMode::FastOnly => {
self.fast_only = true;
}
SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => {
self.quality = true;
}
}
}
}
struct ProgressiveTwoTierContext {
context_token: Arc<()>,
index: Arc<FsTwoTierIndex>,
fast_embedder: Arc<dyn frankensearch::Embedder>,
quality_embedder: Option<Arc<dyn frankensearch::Embedder>>,
}
#[derive(Clone)]
struct SemanticCandidateContext {
fs_semantic_index: Arc<FsVectorIndex>,
fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
filter_maps: SemanticFilterMaps,
roles: Option<HashSet<u8>>,
}
struct SemanticCandidateSearchRequest<'a> {
fetch_limit: usize,
approximate: bool,
tier_mode: SemanticTierMode,
in_memory_two_tier_index: Option<&'a Arc<FsInMemoryTwoTierIndex>>,
ann_index: Option<&'a Arc<FsHnswIndex>>,
}
#[derive(Debug, Clone, Copy, Default)]
struct SemanticCandidateRetryState {
has_more_candidates: bool,
exact_window_may_omit_competitor: bool,
}
struct SemanticQueryEmbedding {
context_token: Arc<()>,
vector: Vec<f32>,
}
struct SharedCassSyncEmbedder {
inner: Arc<dyn Embedder>,
cache: Mutex<LruCache<String, Vec<f32>>>,
}
impl SharedCassSyncEmbedder {
fn new(inner: Arc<dyn Embedder>) -> Self {
let cache_capacity =
NonZeroUsize::new(PROGRESSIVE_EMBEDDING_CACHE_CAPACITY).expect("cache capacity > 0");
Self {
inner,
cache: Mutex::new(LruCache::new(cache_capacity)),
}
}
}
impl Embedder for SharedCassSyncEmbedder {
fn embed_sync(&self, text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
if let Ok(mut cache) = self.cache.lock()
&& let Some(embedding) = cache.get(text).cloned()
{
return Ok(embedding);
}
let embedding = self.inner.embed_sync(text)?;
if let Ok(mut cache) = self.cache.lock() {
cache.put(text.to_owned(), embedding.clone());
}
Ok(embedding)
}
fn embed_batch_sync(
&self,
texts: &[&str],
) -> crate::search::embedder::EmbedderResult<Vec<Vec<f32>>> {
self.inner.embed_batch_sync(texts)
}
fn dimension(&self) -> usize {
self.inner.dimension()
}
fn id(&self) -> &str {
self.inner.id()
}
fn model_name(&self) -> &str {
self.inner.model_name()
}
fn is_ready(&self) -> bool {
self.inner.is_ready()
}
fn is_semantic(&self) -> bool {
self.inner.is_semantic()
}
fn category(&self) -> frankensearch::ModelCategory {
self.inner.category()
}
fn tier(&self) -> frankensearch::ModelTier {
self.inner.tier()
}
fn supports_mrl(&self) -> bool {
self.inner.supports_mrl()
}
}
fn build_in_memory_two_tier_index(
ann_path: Option<PathBuf>,
embedder_id: &str,
tier_mode: SemanticTierMode,
) -> Option<Arc<FsInMemoryTwoTierIndex>> {
let index_dir = ann_path
.as_ref()
.and_then(|path| path.parent().map(Path::to_path_buf));
let Some(index_dir) = index_dir else {
tracing::debug!("two-tier semantic unavailable: ann/index directory path missing");
return None;
};
match FsInMemoryTwoTierIndex::from_dir(&index_dir) {
Ok(index) => return Some(Arc::new(index)),
Err(err) => {
tracing::debug!(
dir = %index_dir.display(),
error = %err,
"two-tier semantic index load failed; considering fallback"
);
}
}
if !matches!(tier_mode, SemanticTierMode::FastOnly) {
return None;
}
let fallback_fast = index_dir.join(format!("index-{embedder_id}.fsvi"));
if !fallback_fast.is_file() {
return None;
}
match FsInMemoryVectorIndex::from_fsvi(&fallback_fast) {
Ok(fast) => Some(Arc::new(FsInMemoryTwoTierIndex::new(fast, None))),
Err(err) => {
tracing::debug!(
path = %fallback_fast.display(),
error = %err,
"fast-only semantic fallback index load failed"
);
None
}
}
}
fn two_tier_index_supports_mode(
index: &FsInMemoryTwoTierIndex,
tier_mode: SemanticTierMode,
) -> bool {
!matches!(
tier_mode,
SemanticTierMode::Progressive | SemanticTierMode::QualityOnly
) || index.has_quality_index()
}
#[derive(Debug, Clone)]
struct ResolvedSemanticDocId {
message_id: u64,
doc_id: String,
}
type ProgressiveLookupKey = (String, String, Option<i64>, String, i64, Option<i64>, u64);
type ProgressiveExactQueryKey = (i64, i64);
type ProgressiveFallbackQueryKey = (String, String, i64);
type ResolvedSemanticLookupRow = Option<(ProgressiveLookupKey, ResolvedSemanticDocId)>;
#[derive(Debug, Clone)]
struct ProgressiveLexicalHit {
title: String,
snippet: String,
content: String,
content_hash: u64,
conversation_id: Option<i64>,
source_path: String,
agent: String,
workspace: String,
workspace_original: Option<String>,
created_at: Option<i64>,
match_type: MatchType,
line_number: Option<usize>,
source_id: String,
origin_kind: String,
origin_host: Option<String>,
}
impl ProgressiveLexicalHit {
fn from_search_hit(hit: &SearchHit, field_mask: FieldMask) -> Self {
Self {
title: if field_mask.wants_title() {
hit.title.clone()
} else {
String::new()
},
snippet: if field_mask.wants_snippet() {
hit.snippet.clone()
} else {
String::new()
},
content: if field_mask.needs_content() {
hit.content.clone()
} else {
String::new()
},
content_hash: hit.content_hash,
conversation_id: hit.conversation_id,
source_path: hit.source_path.clone(),
agent: hit.agent.clone(),
workspace: hit.workspace.clone(),
workspace_original: hit.workspace_original.clone(),
created_at: hit.created_at,
match_type: hit.match_type,
line_number: hit.line_number,
source_id: hit.source_id.clone(),
origin_kind: hit.origin_kind.clone(),
origin_host: hit.origin_host.clone(),
}
}
fn to_search_hit(&self, score: f32) -> SearchHit {
SearchHit {
title: self.title.clone(),
snippet: self.snippet.clone(),
content: self.content.clone(),
content_hash: self.content_hash,
conversation_id: self.conversation_id,
score,
source_path: self.source_path.clone(),
agent: self.agent.clone(),
workspace: self.workspace.clone(),
workspace_original: self.workspace_original.clone(),
created_at: self.created_at,
line_number: self.line_number,
match_type: self.match_type,
source_id: self.source_id.clone(),
origin_kind: self.origin_kind.clone(),
origin_host: self.origin_host.clone(),
}
}
}
#[derive(Debug, Default)]
struct ProgressiveLexicalCache {
hits_by_message: HashMap<u64, ProgressiveLexicalHit>,
wildcard_fallback: bool,
suggestions: Vec<QuerySuggestion>,
}
#[derive(Clone, Copy)]
struct ProgressivePhaseContext<'a> {
query: &'a str,
filters: &'a SearchFilters,
field_mask: FieldMask,
lexical_cache: Option<&'a ProgressiveLexicalCache>,
limit: usize,
fetch_limit: usize,
}
type ProgressiveLexicalSnapshot = Arc<ProgressiveLexicalCache>;
struct CassProgressiveLexicalAdapter {
client: Arc<SearchClient>,
filters: SearchFilters,
field_mask: FieldMask,
sparse_threshold: usize,
shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
}
impl CassProgressiveLexicalAdapter {
fn new(
client: Arc<SearchClient>,
filters: SearchFilters,
field_mask: FieldMask,
sparse_threshold: usize,
shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
) -> Self {
Self {
client,
filters,
field_mask,
sparse_threshold,
shared,
}
}
}
impl FsLexicalSearch for CassProgressiveLexicalAdapter {
fn search<'a>(
&'a self,
cx: &'a FsCx,
query: &'a str,
limit: usize,
) -> FsSearchFuture<'a, Vec<FsScoredResult>> {
Box::pin(async move {
if cx.is_cancel_requested() {
return Err(FsSearchError::Cancelled {
phase: "lexical".to_string(),
reason: "cancel requested".to_string(),
});
}
let result = self
.client
.search_with_fallback(
query,
self.filters.clone(),
limit,
0,
self.sparse_threshold,
self.field_mask,
)
.map_err(|err| FsSearchError::SubsystemError {
subsystem: "cass_lexical_adapter",
source: Box::new(std::io::Error::other(err.to_string())),
})?;
let resolved = self
.client
.resolve_semantic_doc_ids_for_hits(&result.hits)
.map_err(|err| FsSearchError::SubsystemError {
subsystem: "cass_lexical_adapter",
source: Box::new(std::io::Error::other(err.to_string())),
})?;
let mut scored = Vec::with_capacity(result.hits.len());
let mut hits_by_message = HashMap::with_capacity(result.hits.len());
for (hit, resolved_doc) in result.hits.iter().zip(resolved) {
let Some(resolved_doc) = resolved_doc else {
continue;
};
hits_by_message
.entry(resolved_doc.message_id)
.or_insert_with(|| {
ProgressiveLexicalHit::from_search_hit(hit, self.field_mask)
});
scored.push(FsScoredResult {
doc_id: resolved_doc.doc_id,
score: hit.score,
source: FsScoreSource::Lexical,
index: None,
fast_score: None,
quality_score: None,
lexical_score: Some(hit.score),
rerank_score: None,
explanation: None,
metadata: None,
});
}
if let Ok(mut guard) = self.shared.lock() {
*guard = Arc::new(ProgressiveLexicalCache {
hits_by_message,
wildcard_fallback: result.wildcard_fallback,
suggestions: result.suggestions,
});
}
Ok(scored)
})
}
fn index_document<'a>(
&'a self,
_cx: &'a FsCx,
_doc: &'a frankensearch::IndexableDocument,
) -> FsSearchFuture<'a, ()> {
Box::pin(async move {
Err(FsSearchError::SubsystemError {
subsystem: "cass_lexical_adapter",
source: Box::new(std::io::Error::other("cass lexical adapter is read-only")),
})
})
}
fn commit<'a>(&'a self, _cx: &'a FsCx) -> FsSearchFuture<'a, ()> {
Box::pin(async move { Ok(()) })
}
fn doc_count(&self) -> usize {
self.client.total_docs()
}
}
pub struct SearchClient {
reader: Option<(IndexReader, FsCassFields)>,
sqlite: Mutex<Option<SendConnection>>,
sqlite_path: Option<PathBuf>,
prefix_cache: Mutex<CacheShards>,
reload_on_search: bool,
last_reload: Mutex<Option<Instant>>,
last_generation: Mutex<Option<u64>>,
reload_epoch: Arc<AtomicU64>,
warm_tx: Option<mpsc::Sender<WarmJob>>,
_warm_handle: Option<std::thread::JoinHandle<()>>,
metrics: Metrics,
cache_namespace: String,
semantic: Mutex<Option<SemanticSearchState>>,
last_tantivy_total_count: Mutex<Option<usize>>,
}
#[derive(Debug, Clone, Copy)]
pub struct SearchClientOptions {
pub enable_reload: bool,
pub enable_warm: bool,
}
impl Default for SearchClientOptions {
fn default() -> Self {
Self {
enable_reload: true,
enable_warm: true,
}
}
}
impl Drop for SearchClient {
fn drop(&mut self) {
FEDERATED_SEARCH_READERS
.write()
.remove(&self.cache_namespace);
}
}
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheStats {
pub cache_hits: u64,
pub cache_miss: u64,
pub cache_shortfall: u64,
pub reloads: u64,
pub reload_ms_total: u128,
pub total_cap: usize,
pub total_cost: usize,
pub eviction_count: u64,
pub approx_bytes: usize,
pub byte_cap: usize,
pub eviction_policy: &'static str,
pub ghost_entries: usize,
pub admission_rejects: u64,
pub prewarm_scheduled: u64,
pub prewarm_skipped_pressure: u64,
pub reader_generation: Option<u64>,
}
impl Default for CacheStats {
fn default() -> Self {
Self {
cache_hits: 0,
cache_miss: 0,
cache_shortfall: 0,
reloads: 0,
reload_ms_total: 0,
total_cap: 0,
total_cost: 0,
eviction_count: 0,
approx_bytes: 0,
byte_cap: 0,
eviction_policy: "unknown",
ghost_entries: 0,
admission_rejects: 0,
prewarm_scheduled: 0,
prewarm_skipped_pressure: 0,
reader_generation: None,
}
}
}
static CACHE_SHARD_CAP: Lazy<usize> = Lazy::new(|| {
dotenvy::var("CASS_CACHE_SHARD_CAP")
.ok()
.and_then(|v| v.parse::<usize>().ok())
.filter(|v| *v > 0)
.unwrap_or(256)
});
static CACHE_TOTAL_CAP: Lazy<usize> = Lazy::new(|| {
dotenvy::var("CASS_CACHE_TOTAL_CAP")
.ok()
.and_then(|v| v.parse::<usize>().ok())
.filter(|v| *v > 0)
.unwrap_or(2048)
});
static CACHE_DEBUG_ENABLED: Lazy<bool> = Lazy::new(|| {
dotenvy::var("CASS_DEBUG_CACHE_METRICS")
.map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
.unwrap_or(false)
});
static CACHE_BYTE_CAP: Lazy<usize> = Lazy::new(|| match dotenvy::var("CASS_CACHE_BYTE_CAP") {
Ok(value) => cache_byte_cap_from_env_value(Some(&value), available_memory_bytes()),
Err(_) => default_cache_byte_cap(),
});
static CACHE_EVICTION_POLICY: Lazy<CacheEvictionPolicy> = Lazy::new(|| {
cache_eviction_policy_from_env_value(dotenvy::var("CASS_CACHE_EVICTION_POLICY").ok().as_deref())
});
const DEFAULT_CACHE_BYTE_CAP_FALLBACK: usize = 64 * 1024 * 1024;
const DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR: u64 = 128;
const DEFAULT_CACHE_BYTE_CAP_CEILING: u64 = 2 * 1024 * 1024 * 1024;
const S3_FIFO_GHOST_CAP_MULTIPLIER: usize = 2;
const S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR: usize = 4;
const PREWARM_ENTRY_PRESSURE_NUMERATOR: usize = 9;
const PREWARM_ENTRY_PRESSURE_DENOMINATOR: usize = 10;
const PREWARM_BYTE_PRESSURE_NUMERATOR: usize = 4;
const PREWARM_BYTE_PRESSURE_DENOMINATOR: usize = 5;
const CACHE_KEY_VERSION: &str = "1";
static WARM_DEBOUNCE_MS: Lazy<u64> = Lazy::new(|| {
dotenvy::var("CASS_WARM_DEBOUNCE_MS")
.ok()
.and_then(|v| v.parse::<u64>().ok())
.filter(|v| *v > 0)
.unwrap_or(120)
});
fn default_cache_byte_cap() -> usize {
default_cache_byte_cap_for_available(available_memory_bytes())
}
fn cache_byte_cap_from_env_value(value: Option<&str>, available_bytes: Option<u64>) -> usize {
let Some(raw) = value else {
return default_cache_byte_cap_for_available(available_bytes);
};
raw.parse::<usize>()
.unwrap_or_else(|_| default_cache_byte_cap_for_available(available_bytes))
}
fn default_cache_byte_cap_for_available(available_bytes: Option<u64>) -> usize {
let Some(available_bytes) = available_bytes else {
return DEFAULT_CACHE_BYTE_CAP_FALLBACK;
};
let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
let budget = available_bytes / DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR;
let budget = budget.min(DEFAULT_CACHE_BYTE_CAP_CEILING);
let budget = usize::try_from(budget).unwrap_or(ceiling);
budget.clamp(DEFAULT_CACHE_BYTE_CAP_FALLBACK, ceiling)
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CacheEvictionPolicy {
Lru,
S3Fifo,
}
impl CacheEvictionPolicy {
fn label(self) -> &'static str {
match self {
CacheEvictionPolicy::Lru => "lru",
CacheEvictionPolicy::S3Fifo => "s3-fifo",
}
}
}
fn cache_eviction_policy_from_env_value(value: Option<&str>) -> CacheEvictionPolicy {
match value.map(str::trim).filter(|value| !value.is_empty()) {
Some(value) if value.eq_ignore_ascii_case("s3-fifo") => CacheEvictionPolicy::S3Fifo,
Some(value) if value.eq_ignore_ascii_case("s3fifo") => CacheEvictionPolicy::S3Fifo,
Some(value) if value.eq_ignore_ascii_case("s3_fifo") => CacheEvictionPolicy::S3Fifo,
_ => CacheEvictionPolicy::Lru,
}
}
#[derive(Clone)]
struct CachedHit {
hit: SearchHit,
lc_content: String,
lc_title: Option<String>,
bloom64: u64,
}
impl CachedHit {
fn approx_bytes(&self) -> usize {
let base = std::mem::size_of::<Self>();
let hit_strings = self.hit.title.len()
+ self.hit.snippet.len()
+ self.hit.content.len()
+ self.hit.source_path.len()
+ self.hit.agent.len()
+ self.hit.workspace.len()
+ self
.hit
.workspace_original
.as_ref()
.map_or(0, std::string::String::len)
+ self.hit.source_id.len()
+ self.hit.origin_kind.len()
+ self
.hit
.origin_host
.as_ref()
.map_or(0, std::string::String::len);
let lc_strings =
self.lc_content.len() + self.lc_title.as_ref().map_or(0, std::string::String::len);
base + hit_strings + lc_strings
}
}
struct CacheShards {
shards: HashMap<Arc<str>, LruCache<Arc<str>, Vec<CachedHit>>>,
total_cap: usize,
total_cost: usize,
eviction_count: u64,
total_bytes: usize,
byte_cap: usize,
policy: CacheEvictionPolicy,
ghost_keys: VecDeque<Arc<str>>,
ghost_set: HashSet<Arc<str>>,
admission_rejects: u64,
}
impl CacheShards {
fn new(total_cap: usize, byte_cap: usize) -> Self {
Self::new_with_policy(total_cap, byte_cap, *CACHE_EVICTION_POLICY)
}
fn new_with_policy(total_cap: usize, byte_cap: usize, policy: CacheEvictionPolicy) -> Self {
Self {
shards: HashMap::new(),
total_cap: total_cap.max(1),
total_cost: 0,
eviction_count: 0,
total_bytes: 0,
byte_cap,
policy,
ghost_keys: VecDeque::new(),
ghost_set: HashSet::new(),
admission_rejects: 0,
}
}
fn shard_mut(&mut self, name: &str) -> &mut LruCache<Arc<str>, Vec<CachedHit>> {
let interned_name = intern_cache_key(name);
self.shards
.entry(interned_name)
.or_insert_with(|| LruCache::new(NonZeroUsize::new(*CACHE_SHARD_CAP).unwrap()))
}
fn shard_opt(&self, name: &str) -> Option<&LruCache<Arc<str>, Vec<CachedHit>>> {
self.shards.get(name)
}
fn put(&mut self, shard_name: &str, key: Arc<str>, value: Vec<CachedHit>) {
let new_cost = value.len();
let new_bytes: usize = value.iter().map(CachedHit::approx_bytes).sum();
let replacing = self
.shard_opt(shard_name)
.is_some_and(|shard| shard.contains(&key));
if !replacing && !self.should_admit(&key, new_cost, new_bytes) {
self.admission_rejects += 1;
self.record_ghost(key);
return;
}
self.remove_ghost(&key);
let shard = self.shard_mut(shard_name);
let old_val = shard.put(key, value);
let (old_cost, old_bytes) = old_val.as_ref().map_or((0, 0), |v| {
(v.len(), v.iter().map(CachedHit::approx_bytes).sum())
});
self.total_cost = self
.total_cost
.saturating_add(new_cost)
.saturating_sub(old_cost);
self.total_bytes = self
.total_bytes
.saturating_add(new_bytes)
.saturating_sub(old_bytes);
self.evict_until_within_cap();
}
fn evict_until_within_cap(&mut self) {
while self.total_cost > self.total_cap
|| (self.byte_cap > 0 && self.total_bytes > self.byte_cap)
{
let byte_pressure = self.byte_cap > 0 && self.total_bytes > self.byte_cap;
let mut largest_shard_key = None;
let mut max_score = 0usize;
for (k, v) in self.shards.iter() {
let score = if byte_pressure {
shard_cached_bytes(v)
} else {
v.len()
};
if score > max_score {
max_score = score;
largest_shard_key = Some(k.clone());
}
}
if let Some(key) = largest_shard_key {
if let Some(shard) = self.shards.get_mut(&key)
&& let Some((evicted_key, v)) = shard.pop_lru()
{
let evicted_bytes: usize = v.iter().map(CachedHit::approx_bytes).sum();
self.total_cost = self.total_cost.saturating_sub(v.len());
self.total_bytes = self.total_bytes.saturating_sub(evicted_bytes);
self.eviction_count += 1;
self.record_ghost(evicted_key);
}
} else {
break; }
}
}
fn should_admit(&self, key: &Arc<str>, cost: usize, bytes: usize) -> bool {
if self.policy == CacheEvictionPolicy::Lru || self.ghost_set.contains(key) {
return true;
}
!self.is_s3_fifo_large_candidate(cost, bytes)
}
fn is_s3_fifo_large_candidate(&self, cost: usize, bytes: usize) -> bool {
let entry_heavy = cost
> self
.total_cap
.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
let byte_heavy = self.byte_cap > 0
&& bytes
> self
.byte_cap
.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
entry_heavy || byte_heavy
}
fn record_ghost(&mut self, key: Arc<str>) {
if self.policy != CacheEvictionPolicy::S3Fifo {
return;
}
if self.ghost_set.insert(key.clone()) {
self.ghost_keys.push_back(key);
}
let cap = self
.total_cap
.saturating_mul(S3_FIFO_GHOST_CAP_MULTIPLIER)
.max(1);
while self.ghost_set.len() > cap {
if let Some(old) = self.ghost_keys.pop_front() {
self.ghost_set.remove(&old);
} else {
break;
}
}
}
fn remove_ghost(&mut self, key: &Arc<str>) {
self.ghost_set.remove(key);
self.ghost_keys.retain(|candidate| candidate != key);
}
fn clear(&mut self) {
self.shards.clear();
self.total_cost = 0;
self.total_bytes = 0;
self.ghost_keys.clear();
self.ghost_set.clear();
}
fn total_cost(&self) -> usize {
self.total_cost
}
fn total_cap(&self) -> usize {
self.total_cap
}
fn eviction_count(&self) -> u64 {
self.eviction_count
}
fn total_bytes(&self) -> usize {
self.total_bytes
}
fn byte_cap(&self) -> usize {
self.byte_cap
}
fn policy_label(&self) -> &'static str {
self.policy.label()
}
fn ghost_entries(&self) -> usize {
self.ghost_set.len()
}
fn admission_rejects(&self) -> u64 {
self.admission_rejects
}
fn prewarm_pressure(&self) -> bool {
let entry_pressure = self
.total_cost
.saturating_mul(PREWARM_ENTRY_PRESSURE_DENOMINATOR)
>= self
.total_cap
.saturating_mul(PREWARM_ENTRY_PRESSURE_NUMERATOR);
let byte_pressure = self.byte_cap > 0
&& self
.total_bytes
.saturating_mul(PREWARM_BYTE_PRESSURE_DENOMINATOR)
>= self
.byte_cap
.saturating_mul(PREWARM_BYTE_PRESSURE_NUMERATOR);
entry_pressure || byte_pressure
}
}
fn shard_cached_bytes(shard: &LruCache<Arc<str>, Vec<CachedHit>>) -> usize {
shard
.iter()
.map(|(_key, hits)| hits.iter().map(CachedHit::approx_bytes).sum::<usize>())
.sum()
}
#[derive(Clone)]
struct WarmJob {
query: String,
filters_fingerprint: String,
shard_name: String,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AdaptivePrewarmDecision {
Schedule,
SkipCold,
SkipPressure,
}
#[derive(Clone)]
struct SearcherCacheEntry {
epoch: u64,
reader_key: usize,
searcher: Searcher,
}
thread_local! {
static THREAD_SEARCHER: RefCell<Option<SearcherCacheEntry>> = const { RefCell::new(None) };
}
#[derive(Clone)]
struct FederatedIndexReader {
reader: IndexReader,
fields: FsCassFields,
}
static FEDERATED_SEARCH_READERS: Lazy<RwLock<HashMap<String, Arc<Vec<FederatedIndexReader>>>>> =
Lazy::new(|| RwLock::new(HashMap::new()));
static SEARCH_CLIENT_INSTANCE_COUNTER: AtomicU64 = AtomicU64::new(1);
fn levenshtein_distance(a: &str, b: &str) -> usize {
let a_chars: Vec<char> = a.chars().collect();
let b_chars: Vec<char> = b.chars().collect();
let a_len = a_chars.len();
let b_len = b_chars.len();
if a_len == 0 {
return b_len;
}
if b_len == 0 {
return a_len;
}
let mut prev_row: Vec<usize> = (0..=b_len).collect();
let mut curr_row: Vec<usize> = vec![0; b_len + 1];
for (i, a_char) in a_chars.iter().enumerate() {
curr_row[0] = i + 1;
for (j, b_char) in b_chars.iter().enumerate() {
let cost = usize::from(a_char != b_char);
curr_row[j + 1] = (prev_row[j + 1] + 1) .min(curr_row[j] + 1) .min(prev_row[j] + cost); }
std::mem::swap(&mut prev_row, &mut curr_row);
}
prev_row[b_len]
}
fn normalize_term_parts(raw: &str) -> Vec<String> {
let mut parts = Vec::new();
for token in nfc_sanitize_query(raw).split_whitespace() {
let mut current = String::new();
let mut chars = token.chars().peekable();
while let Some(ch) = chars.next() {
let trailing_wildcard = ch == '*' && chars.peek().is_none() && !current.is_empty();
if ch.is_alphanumeric() || ch == '_' || trailing_wildcard {
current.push(ch);
continue;
}
if !current.is_empty() {
parts.push(std::mem::take(&mut current));
}
}
if !current.is_empty() {
parts.push(current);
}
}
parts
}
fn normalize_phrase_terms(raw: &str) -> Vec<String> {
normalize_term_parts(raw)
.into_iter()
.map(|s| s.trim_matches('*').to_lowercase())
.filter(|s| !s.is_empty())
.collect()
}
fn render_fts5_term_part(part: &str) -> Option<String> {
let pattern = FsCassWildcardPattern::parse(part);
if matches!(
pattern,
FsCassWildcardPattern::Suffix(_)
| FsCassWildcardPattern::Substring(_)
| FsCassWildcardPattern::Complex(_)
) {
return None;
}
Some(part.to_string())
}
fn dominant_match_type(query: &str) -> MatchType {
let mut worst = MatchType::Exact;
for term in query.split_whitespace() {
let pattern = FsCassWildcardPattern::parse(term);
let mt = match pattern {
FsCassWildcardPattern::Exact(_) => MatchType::Exact,
FsCassWildcardPattern::Prefix(_) => MatchType::Prefix,
FsCassWildcardPattern::Suffix(_) => MatchType::Suffix,
FsCassWildcardPattern::Substring(_) => MatchType::Substring,
FsCassWildcardPattern::Complex(_) => MatchType::Wildcard,
};
if mt.quality_factor() < worst.quality_factor() {
worst = mt;
}
}
worst
}
pub(crate) fn is_tool_invocation_noise(content: &str) -> bool {
let trimmed = content.trim();
if trimmed.starts_with("[Tool:") {
if let Some(close_idx) = trimmed.find(']') {
let after = &trimmed[close_idx + 1..];
if !after.trim().is_empty() {
return false; }
let inner = &trimmed[6..close_idx]; return inner.trim().is_empty();
}
return true;
}
if trimmed.len() < 20 {
let lower = trimmed.to_lowercase();
if lower.starts_with("[tool") || lower.starts_with("tool:") {
return true;
}
}
false
}
fn hit_content_for_noise_check(hit: &SearchHit) -> &str {
if hit.content.is_empty() {
&hit.snippet
} else {
&hit.content
}
}
fn hit_is_noise(hit: &SearchHit, query: &str) -> bool {
let content_to_check = hit_content_for_noise_check(hit);
if content_to_check.is_empty() {
return false;
}
is_search_noise_text(content_to_check, query) || is_tool_invocation_noise(content_to_check)
}
fn snippet_from_content(content: &str) -> String {
let trimmed = content.trim();
let mut chars = trimmed.chars();
let preview: String = chars.by_ref().take(200).collect();
if chars.next().is_some() {
format!("{preview}...")
} else {
preview
}
}
#[cfg(test)]
pub(crate) fn deduplicate_hits(hits: Vec<SearchHit>) -> Vec<SearchHit> {
deduplicate_hits_with_query(hits, "")
}
pub(crate) fn deduplicate_hits_with_query(hits: Vec<SearchHit>, query: &str) -> Vec<SearchHit> {
let mut source_ids: HashMap<String, u32> = HashMap::new();
let mut path_ids: HashMap<String, u32> = HashMap::new();
let mut title_ids: HashMap<String, u32> = HashMap::new();
let mut next_source_id: u32 = 0;
let mut next_path_id: u32 = 0;
let mut next_title_id: u32 = 0;
type DedupKey = (
u32,
u32,
Option<i64>,
Option<u32>,
Option<usize>,
Option<i64>,
u64,
);
let mut seen: HashMap<DedupKey, usize> = HashMap::new();
let mut deduped: Vec<SearchHit> = Vec::new();
for hit in hits {
if hit_is_noise(&hit, query) {
continue;
}
let normalized_source_id = normalized_search_hit_source_id(&hit);
let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
*id
} else {
let id = next_source_id;
next_source_id = next_source_id.saturating_add(1);
source_ids.insert(normalized_source_id, id);
id
};
let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
*id
} else {
let id = next_path_id;
next_path_id = next_path_id.saturating_add(1);
path_ids.insert(hit.source_path.clone(), id);
id
};
let title_key = if hit.conversation_id.is_some() {
None
} else {
let normalized_title = hit.title.trim();
Some(if let Some(id) = title_ids.get(normalized_title) {
*id
} else {
let id = next_title_id;
next_title_id = next_title_id.saturating_add(1);
title_ids.insert(normalized_title.to_string(), id);
id
})
};
let key = (
source_key,
path_key,
hit.conversation_id,
title_key,
hit.line_number,
hit.created_at,
hit.content_hash,
);
if let Some(&existing_idx) = seen.get(&key) {
if deduped[existing_idx].score < hit.score {
deduped[existing_idx] = hit;
}
} else {
seen.insert(key, deduped.len());
deduped.push(hit);
}
}
deduped
}
fn should_try_wildcard_fallback(
returned_hits: usize,
limit: usize,
offset: usize,
sparse_threshold: usize,
) -> bool {
if offset != 0 {
return false;
}
let effective_sparse_threshold = if limit == 0 {
sparse_threshold
} else {
sparse_threshold.min(limit)
};
returned_hits < effective_sparse_threshold
}
fn should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(
query: &str,
returned_hits: usize,
) -> bool {
if returned_hits != 0 {
return false;
}
for token in normalize_phrase_terms(query) {
if token.chars().count() > AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS {
return true;
}
}
false
}
fn snippet_from_preview_without_full_content(
field_mask: FieldMask,
stored_preview: &str,
query: &str,
) -> Option<String> {
if field_mask.needs_content() || !field_mask.wants_snippet() || stored_preview.is_empty() {
return None;
}
cached_prefix_snippet(stored_preview, query, 160)
}
fn stored_preview_is_complete_content(stored_preview: &str) -> bool {
!stored_preview.is_empty() && !stored_preview.ends_with('…')
}
impl SearchClient {
pub fn open(index_path: &Path, db_path: Option<&Path>) -> Result<Option<Self>> {
Self::open_with_options(index_path, db_path, SearchClientOptions::default())
}
pub fn open_with_options(
index_path: &Path,
db_path: Option<&Path>,
options: SearchClientOptions,
) -> Result<Option<Self>> {
let tantivy = fs_cass_open_search_reader(index_path, ReloadPolicy::Manual).ok();
let client_id = SEARCH_CLIENT_INSTANCE_COUNTER.fetch_add(1, Ordering::Relaxed);
let cache_namespace = format!(
"v{}|schema:{}|client:{}|index:{}",
CACHE_KEY_VERSION,
FS_CASS_SCHEMA_HASH,
client_id,
index_path.display()
);
let federated_readers = if tantivy.is_none() {
crate::search::tantivy::open_federated_search_readers(index_path, ReloadPolicy::Manual)
.ok()
.flatten()
.filter(|readers| !readers.is_empty())
.map(|readers| {
Arc::new(
readers
.into_iter()
.map(|(reader, fields)| FederatedIndexReader { reader, fields })
.collect::<Vec<_>>(),
)
})
} else {
None
};
let sqlite_path = db_path.map(Path::to_path_buf).filter(|path| path.exists());
if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_some() {
tracing::warn!(
index_path = %index_path.display(),
"Tantivy search index not found or incompatible. \
Search results will be degraded. \
Run `cass index --full` to rebuild the index."
);
}
if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_none() {
return Ok(None);
}
let reload_epoch = Arc::new(AtomicU64::new(0));
let metrics = Metrics::default();
let warm_pair = if options.enable_warm
&& let Some((reader, fields)) = &tantivy
{
maybe_spawn_warm_worker(
reader.clone(),
*fields,
reload_epoch.clone(),
metrics.clone(),
)
} else {
None
};
if let Some(readers) = &federated_readers {
FEDERATED_SEARCH_READERS
.write()
.insert(cache_namespace.clone(), Arc::clone(readers));
} else {
FEDERATED_SEARCH_READERS.write().remove(&cache_namespace);
}
Ok(Some(Self {
reader: tantivy,
sqlite: Mutex::new(None),
sqlite_path,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: options.enable_reload,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch,
warm_tx: warm_pair.as_ref().map(|(tx, _)| tx.clone()),
_warm_handle: warm_pair.map(|(_, h)| h),
metrics,
cache_namespace,
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
}))
}
fn sqlite_guard(&self) -> Result<std::sync::MutexGuard<'_, Option<SendConnection>>> {
let mut guard = self
.sqlite
.lock()
.map_err(|_| anyhow!("sqlite lock poisoned"))?;
if guard.is_none()
&& let Some(path) = &self.sqlite_path
{
match open_search_hydration_sqlite(path, std::time::Duration::from_secs(1)) {
Ok(conn) => {
*guard = Some(SendConnection(conn));
}
Err(err) => {
tracing::debug!(
error = %err,
path = %path.display(),
"readonly sqlite open failed for search client"
);
}
}
}
Ok(guard)
}
pub fn search(
&self,
query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
field_mask: FieldMask,
) -> Result<Vec<SearchHit>> {
use unicode_normalization::UnicodeNormalization;
let query: String = query.nfc().collect();
let query: &str = &query;
let sanitized = nfc_sanitize_query(query);
let field_mask = effective_field_mask(field_mask);
let limit = if limit == 0 {
self.total_docs().min(no_limit_result_cap()).max(1)
} else {
limit
};
let can_use_cache =
field_mask.allows_cache() && (field_mask.needs_content() || field_mask.wants_snippet());
if let Some((reader, _)) = &self.reader {
self.maybe_reload_reader(reader)?;
let searcher = self.searcher_for_thread(reader);
self.track_generation(searcher.generation().generation_id());
} else if let Some(readers) = self.federated_readers()
&& let Some(signature) = self.maybe_reload_federated_readers(readers.as_ref())?
{
self.track_generation(signature);
}
if can_use_cache
&& offset == 0
&& !query.contains('*')
&& !fs_cass_has_boolean_operators(query)
{
self.maybe_schedule_adaptive_query_prewarm(&sanitized, &filters);
if let Some(cached) = self.cached_prefix_hits(&sanitized, &filters) {
let query_terms = QueryTermsLower::from_query(&sanitized);
let mut filtered: Vec<SearchHit> = cached
.into_iter()
.filter(|h| hit_matches_query_cached_precomputed(h, &query_terms))
.map(|c| c.hit.clone())
.collect();
if filtered.len() >= limit {
filtered.truncate(limit);
self.metrics.inc_cache_hits();
self.maybe_log_cache_metrics("hit");
return Ok(filtered);
}
self.metrics.inc_cache_shortfall();
self.maybe_log_cache_metrics("shortfall");
} else {
self.metrics.inc_cache_miss();
self.maybe_log_cache_metrics("miss");
}
}
let target_hits = offset.saturating_add(limit);
let initial_fetch_limit = if target_hits <= 16 {
target_hits.saturating_mul(2)
} else {
target_hits.saturating_mul(3).div_ceil(2)
};
let session_path_filter_active = !filters.session_paths.is_empty();
let fallback_fetch_limit = if session_path_filter_active {
self.total_docs()
.min(no_limit_result_cap())
.max(target_hits.saturating_mul(3))
.max(1)
} else {
target_hits.saturating_mul(3)
};
if let Some((reader, fields)) = &self.reader {
tracing::info!(
backend = "tantivy",
query = sanitized,
limit = initial_fetch_limit,
offset = 0,
"search_start"
);
let (hits, tantivy_total_count) = self.search_tantivy(
reader,
fields,
query,
&sanitized,
filters.clone(),
initial_fetch_limit,
0, field_mask,
)?;
if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
*tc = Some(tantivy_total_count);
}
if !hits.is_empty() {
let initial_hit_count = hits.len();
let page_hits = |raw_hits: Vec<SearchHit>| {
self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
};
let (mut deduped_len, mut paged_hits) = page_hits(hits);
let needs_retry = deduped_len < target_hits
&& initial_hit_count == initial_fetch_limit
&& initial_fetch_limit < fallback_fetch_limit;
if needs_retry {
tracing::debug!(
query = sanitized,
target_hits,
deduped_len,
initial_fetch_limit,
fallback_fetch_limit,
session_path_filter_active,
"retrying lexical fetch due to dedup or session-path shortfall"
);
let (retry_hits, retry_total_count) = self.search_tantivy(
reader,
fields,
query,
&sanitized,
filters.clone(),
fallback_fetch_limit,
0,
field_mask,
)?;
if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
*tc = Some(retry_total_count);
}
if !retry_hits.is_empty() {
(deduped_len, paged_hits) = page_hits(retry_hits);
}
}
tracing::trace!(
query = sanitized,
target_hits,
deduped_len,
returned = paged_hits.len(),
"lexical fetch complete"
);
if can_use_cache && offset == 0 {
self.put_cache(&sanitized, &filters, &paged_hits);
}
return Ok(paged_hits);
}
tracing::debug!(
query = sanitized,
"tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
);
return Ok(Vec::new());
} else if let Some(readers) = self.federated_readers() {
tracing::info!(
backend = "tantivy-federated",
query = sanitized,
limit = initial_fetch_limit,
offset = 0,
shards = readers.len(),
"search_start"
);
let (hits, tantivy_total_count) = self.search_tantivy_federated(
readers.as_ref(),
query,
&sanitized,
filters.clone(),
initial_fetch_limit,
field_mask,
)?;
if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
*tc = Some(tantivy_total_count);
}
if !hits.is_empty() {
let initial_hit_count = hits.len();
let page_hits = |raw_hits: Vec<SearchHit>| {
self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
};
let (mut deduped_len, mut paged_hits) = page_hits(hits);
let expected_federated_capacity = initial_fetch_limit.saturating_mul(readers.len());
let federated_initial_capacity_reached = if session_path_filter_active {
initial_hit_count >= initial_fetch_limit.min(expected_federated_capacity)
} else {
initial_hit_count == expected_federated_capacity
};
let needs_retry = deduped_len < target_hits
&& federated_initial_capacity_reached
&& initial_fetch_limit < fallback_fetch_limit;
if needs_retry {
tracing::debug!(
query = sanitized,
target_hits,
deduped_len,
initial_fetch_limit,
fallback_fetch_limit,
shards = readers.len(),
session_path_filter_active,
"retrying federated lexical fetch due to dedup or session-path shortfall"
);
let (retry_hits, retry_total_count) = self.search_tantivy_federated(
readers.as_ref(),
query,
&sanitized,
filters.clone(),
fallback_fetch_limit,
field_mask,
)?;
if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
*tc = Some(retry_total_count);
}
if !retry_hits.is_empty() {
(deduped_len, paged_hits) = page_hits(retry_hits);
}
}
tracing::trace!(
query = sanitized,
target_hits,
deduped_len,
returned = paged_hits.len(),
shards = readers.len(),
"federated lexical fetch complete"
);
if can_use_cache && offset == 0 {
self.put_cache(&sanitized, &filters, &paged_hits);
}
return Ok(paged_hits);
}
tracing::debug!(
query = sanitized,
shards = readers.len(),
"federated tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
);
return Ok(Vec::new());
}
let unsupported_wildcards = sanitized.split_whitespace().any(|t| {
let core = t.trim_end_matches('*');
core.contains('*') });
if unsupported_wildcards {
return Ok(Vec::new());
}
let has_sqlite_backend = {
let sqlite_guard = self
.sqlite
.lock()
.map_err(|_| anyhow!("sqlite lock poisoned"))?;
sqlite_guard.is_some() || self.sqlite_path.is_some()
};
if has_sqlite_backend {
tracing::info!(
backend = "sqlite-fts5",
query = sanitized,
limit = fallback_fetch_limit,
offset = 0,
"search_start"
);
let hits = self.search_sqlite_fts5(
self.sqlite_path
.as_deref()
.unwrap_or_else(|| Path::new(":memory:")),
query,
filters.clone(),
fallback_fetch_limit,
0, field_mask,
)?;
let (_, paged_hits) =
self.postprocess_hits_page(hits, &sanitized, &filters, limit, offset);
if can_use_cache && offset == 0 {
self.put_cache(&sanitized, &filters, &paged_hits);
}
return Ok(paged_hits);
}
tracing::info!(backend = "none", query = query, "search_start");
Ok(Vec::new())
}
pub fn set_semantic_context(
&self,
embedder: Arc<dyn Embedder>,
fs_semantic_index: VectorIndex,
filter_maps: SemanticFilterMaps,
roles: Option<HashSet<u8>>,
ann_path: Option<PathBuf>,
) -> Result<()> {
self.set_semantic_indexes_context(
embedder,
vec![fs_semantic_index],
filter_maps,
roles,
ann_path,
)
}
pub fn set_semantic_indexes_context(
&self,
embedder: Arc<dyn Embedder>,
fs_semantic_indexes: Vec<VectorIndex>,
filter_maps: SemanticFilterMaps,
roles: Option<HashSet<u8>>,
ann_path: Option<PathBuf>,
) -> Result<()> {
if fs_semantic_indexes.is_empty() {
bail!("semantic context requires at least one vector index");
}
let fs_semantic_indexes = fs_semantic_indexes
.into_iter()
.map(|index| {
let embedder_id = index.embedder_id().to_string();
let dimension = index.dimension();
if embedder_id != embedder.id() {
bail!(
"embedder mismatch: index uses {}, embedder is {}",
embedder_id,
embedder.id()
);
}
if dimension != embedder.dimension() {
bail!(
"embedder dimension mismatch: index uses {}, embedder is {}",
dimension,
embedder.dimension()
);
}
Ok(Arc::new(index))
})
.collect::<Result<Vec<_>>>()?;
let fs_semantic_index = Arc::clone(&fs_semantic_indexes[0]);
let shard_count = fs_semantic_indexes.len();
let ann_path = if shard_count == 1 { ann_path } else { None };
let embedder_id = fs_semantic_index.embedder_id().to_string();
let dimension = fs_semantic_index.dimension();
let fs_semantic_indexes = Arc::new(fs_semantic_indexes);
let capacity = NonZeroUsize::new(100).ok_or_else(|| anyhow!("invalid cache size"))?;
let context_token = Arc::new(());
let mut state_guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
*state_guard = Some(SemanticSearchState {
context_token,
embedder,
fs_semantic_index,
fs_semantic_indexes,
fs_ann_index: None,
ann_path,
fs_in_memory_two_tier_index: None,
in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable::default(),
progressive_context: None,
progressive_context_unavailable: false,
filter_maps,
roles,
query_cache: QueryCache::new(embedder_id.as_str(), capacity),
});
if shard_count > 1 {
tracing::info!(
shard_count,
dimension,
embedder = embedder_id,
"semantic search context loaded sharded vector generation"
);
}
Ok(())
}
pub fn clear_semantic_context(&self) -> Result<()> {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
*guard = None;
Ok(())
}
fn semantic_context_matches(&self, context_token: &Arc<()>) -> Result<bool> {
let guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
Ok(guard
.as_ref()
.is_some_and(|state| Arc::ptr_eq(&state.context_token, context_token)))
}
fn semantic_query_embedding(&self, canonical: &str) -> Result<SemanticQueryEmbedding> {
loop {
let (embedder, context_token) = {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(hit) = state
.query_cache
.get_cached(state.embedder.as_ref(), canonical)
{
return Ok(SemanticQueryEmbedding {
context_token: Arc::clone(&state.context_token),
vector: hit,
});
}
(
Arc::clone(&state.embedder),
Arc::clone(&state.context_token),
)
};
let embedding = embedder
.embed_sync(canonical)
.map_err(|e| anyhow!("embedding failed: {e}"))?;
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if !Arc::ptr_eq(&state.context_token, &context_token) {
continue;
}
if let Some(hit) = state
.query_cache
.get_cached(state.embedder.as_ref(), canonical)
{
return Ok(SemanticQueryEmbedding {
context_token,
vector: hit,
});
}
state
.query_cache
.store(state.embedder.as_ref(), canonical, embedding.clone());
return Ok(SemanticQueryEmbedding {
context_token,
vector: embedding,
});
}
}
fn in_memory_two_tier_index(
&self,
tier_mode: SemanticTierMode,
) -> Result<Option<Arc<FsInMemoryTwoTierIndex>>> {
loop {
let (ann_path, embedder_id, context_token) = {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(index) = state.fs_in_memory_two_tier_index.as_ref()
&& two_tier_index_supports_mode(index.as_ref(), tier_mode)
{
return Ok(Some(Arc::clone(index)));
}
if state
.in_memory_two_tier_unavailable
.is_known_unavailable(tier_mode)
{
return Ok(None);
}
(
state.ann_path.clone(),
state.embedder.id().to_string(),
Arc::clone(&state.context_token),
)
};
let index = build_in_memory_two_tier_index(ann_path.clone(), &embedder_id, tier_mode);
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(existing) = state.fs_in_memory_two_tier_index.as_ref()
&& two_tier_index_supports_mode(existing.as_ref(), tier_mode)
{
return Ok(Some(Arc::clone(existing)));
}
if !Arc::ptr_eq(&state.context_token, &context_token) {
continue;
}
let Some(index) = index else {
state
.in_memory_two_tier_unavailable
.mark_unavailable(tier_mode);
return Ok(None);
};
if !two_tier_index_supports_mode(index.as_ref(), tier_mode) {
state
.in_memory_two_tier_unavailable
.mark_unavailable(tier_mode);
return Ok(None);
}
state.fs_in_memory_two_tier_index = Some(Arc::clone(&index));
if index.has_quality_index() {
state.in_memory_two_tier_unavailable = InMemoryTwoTierUnavailable::default();
} else {
state.in_memory_two_tier_unavailable.fast_only = false;
}
return Ok(Some(index));
}
}
fn ann_index(&self) -> Result<Arc<FsHnswIndex>> {
loop {
let (ann_path, fs_semantic_index) = {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(index) = state.fs_ann_index.as_ref() {
return Ok(Arc::clone(index));
}
let ann_path = state.ann_path.clone().ok_or_else(|| {
anyhow!(
"approximate search unavailable: HNSW index missing (run 'cass index --semantic --build-hnsw')"
)
})?;
(ann_path, Arc::clone(&state.fs_semantic_index))
};
let ann = Arc::new(open_fs_semantic_ann_index(
fs_semantic_index.as_ref(),
&ann_path,
)?);
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(existing) = state.fs_ann_index.as_ref() {
return Ok(Arc::clone(existing));
}
if state.ann_path.as_ref() != Some(&ann_path)
|| !Arc::ptr_eq(&state.fs_semantic_index, &fs_semantic_index)
{
continue;
}
state.fs_ann_index = Some(Arc::clone(&ann));
return Ok(ann);
}
}
fn collapse_semantic_results(
best_by_message: HashMap<u64, VectorSearchResult>,
fetch_limit: usize,
) -> Vec<VectorSearchResult> {
let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
collapsed.sort_by(|a, b| {
b.score
.total_cmp(&a.score)
.then_with(|| a.message_id.cmp(&b.message_id))
});
if collapsed.len() > fetch_limit {
collapsed.truncate(fetch_limit);
}
collapsed
}
fn semantic_exact_candidate_limit(fetch_limit: usize, record_count: usize) -> usize {
fetch_limit
.saturating_mul(SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER)
.max(fetch_limit)
.min(record_count)
}
fn semantic_window_may_omit_competitor(
collapsed: &[VectorSearchResult],
fetch_limit: usize,
max_omitted_score: Option<f32>,
) -> bool {
if fetch_limit == 0 {
return false;
}
let Some(max_omitted_score) = max_omitted_score else {
return false;
};
if collapsed.len() < fetch_limit {
return true;
}
let Some(last_in_requested_window) = collapsed.get(fetch_limit - 1) else {
return true;
};
!last_in_requested_window
.score
.total_cmp(&max_omitted_score)
.is_gt()
}
fn record_fs_semantic_hit(
best_by_message: &mut HashMap<u64, VectorSearchResult>,
hit: &FsVectorHit,
) {
let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
return;
};
best_by_message
.entry(parsed.message_id)
.and_modify(|entry| {
if hit.score > entry.score {
entry.score = hit.score;
entry.chunk_idx = parsed.chunk_idx;
}
})
.or_insert(VectorSearchResult {
message_id: parsed.message_id,
chunk_idx: parsed.chunk_idx,
score: hit.score,
});
}
fn search_exact_semantic_indexes(
context: &SemanticCandidateContext,
embedding: &[f32],
fetch_limit: usize,
fs_filter: Option<&dyn FsSearchFilter>,
) -> Result<(Vec<VectorSearchResult>, SemanticCandidateRetryState)> {
if context.fs_semantic_indexes.len() == 1 {
let record_count = context.fs_semantic_index.record_count();
let candidate_limit = Self::semantic_exact_candidate_limit(fetch_limit, record_count);
let fs_hits = context
.fs_semantic_index
.search_top_k(embedding, candidate_limit, fs_filter)
.map_err(|err| anyhow!("frankensearch semantic search failed: {err}"))?;
let mut best_by_message = HashMap::with_capacity(fs_hits.len());
for hit in &fs_hits {
Self::record_fs_semantic_hit(&mut best_by_message, hit);
}
let collapsed = Self::collapse_semantic_results(best_by_message, candidate_limit);
let has_more_candidates =
fs_hits.len() >= candidate_limit && candidate_limit < record_count;
let max_omitted_score = if has_more_candidates {
fs_hits.last().map(|hit| hit.score)
} else {
None
};
let exact_window_may_omit_competitor = Self::semantic_window_may_omit_competitor(
&collapsed,
fetch_limit,
max_omitted_score,
);
return Ok((
collapsed,
SemanticCandidateRetryState {
has_more_candidates,
exact_window_may_omit_competitor,
},
));
}
let mut best_by_message = HashMap::new();
let mut raw_hits = 0usize;
let mut max_omitted_score: Option<f32> = None;
let mut has_more_candidates = false;
for index in context.fs_semantic_indexes.iter() {
let shard_record_count = index.record_count();
let shard_limit = Self::semantic_exact_candidate_limit(fetch_limit, shard_record_count);
if shard_limit == 0 {
continue;
}
let fs_hits = index
.search_top_k(embedding, shard_limit, fs_filter)
.map_err(|err| anyhow!("frankensearch sharded semantic search failed: {err}"))?;
if fs_hits.len() >= shard_limit
&& shard_limit < shard_record_count
&& let Some(last_hit) = fs_hits.last()
{
has_more_candidates = true;
max_omitted_score = Some(
max_omitted_score
.map(|current| current.max(last_hit.score))
.unwrap_or(last_hit.score),
);
}
raw_hits = raw_hits.saturating_add(fs_hits.len());
best_by_message.reserve(fs_hits.len());
for hit in &fs_hits {
Self::record_fs_semantic_hit(&mut best_by_message, hit);
}
}
let candidate_return_limit = Self::semantic_exact_candidate_limit(fetch_limit, raw_hits);
let collapsed = Self::collapse_semantic_results(best_by_message, candidate_return_limit);
let exact_window_may_omit_competitor =
Self::semantic_window_may_omit_competitor(&collapsed, fetch_limit, max_omitted_score);
tracing::debug!(
shard_count = context.fs_semantic_indexes.len(),
raw_hits,
returned = collapsed.len(),
"semantic sharded exact merge complete"
);
Ok((
collapsed,
SemanticCandidateRetryState {
has_more_candidates,
exact_window_may_omit_competitor,
},
))
}
fn search_semantic_candidates(
&self,
context: &SemanticCandidateContext,
embedding: &[f32],
filters: &SearchFilters,
request: SemanticCandidateSearchRequest<'_>,
) -> Result<(
Vec<VectorSearchResult>,
SemanticCandidateRetryState,
Option<crate::search::ann_index::AnnSearchStats>,
)> {
let mut semantic_filter =
SemanticFilter::from_search_filters(filters, &context.filter_maps)?;
if let Some(roles) = context.roles.clone() {
semantic_filter = semantic_filter.with_roles(Some(roles));
}
if request.tier_mode.wants_two_tier() && !request.approximate {
let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
if let Some(two_tier_index) = request.in_memory_two_tier_index {
let config = request.tier_mode.to_frankensearch_config();
let searcher = FsSyncTwoTierSearcher::new(Arc::clone(two_tier_index), config);
let (tier_hits, metrics) = searcher
.search_collect_with_filter(embedding, request.fetch_limit, fs_filter)
.map_err(|err| {
anyhow!("frankensearch two-tier semantic search failed: {err}")
})?;
tracing::debug!(
tier_mode = ?request.tier_mode,
phase1_ms = metrics.phase1_total_ms,
phase2_ms = metrics.phase2_total_ms,
skip_reason = ?metrics.skip_reason,
returned = tier_hits.len(),
"semantic two-tier search executed"
);
let mut best_by_message: HashMap<u64, VectorSearchResult> =
HashMap::with_capacity(tier_hits.len());
for hit in tier_hits.iter() {
let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
continue;
};
best_by_message
.entry(parsed.message_id)
.and_modify(|entry| {
if hit.score > entry.score {
entry.score = hit.score;
entry.chunk_idx = parsed.chunk_idx;
}
})
.or_insert(VectorSearchResult {
message_id: parsed.message_id,
chunk_idx: parsed.chunk_idx,
score: hit.score,
});
}
return Ok((
Self::collapse_semantic_results(best_by_message, request.fetch_limit),
SemanticCandidateRetryState {
has_more_candidates: tier_hits.len() >= request.fetch_limit,
exact_window_may_omit_competitor: false,
},
None,
));
}
tracing::debug!(
tier_mode = ?request.tier_mode,
"two-tier semantic unavailable; falling back to exact single-tier search"
);
let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
let (results, truncated) = Self::search_exact_semantic_indexes(
context,
embedding,
request.fetch_limit,
fs_filter,
)?;
return Ok((results, truncated, None));
}
if request.approximate {
if request.tier_mode.wants_two_tier() {
tracing::debug!(
tier_mode = ?request.tier_mode,
"approximate search requested; bypassing two-tier mode"
);
}
let ann = request
.ann_index
.ok_or_else(|| anyhow!("HNSW index failed to initialize"))?;
let candidate = request
.fetch_limit
.saturating_mul(ANN_CANDIDATE_MULTIPLIER)
.max(request.fetch_limit);
let ef = FS_HNSW_DEFAULT_EF_SEARCH.max(candidate);
let (ann_results, search_stats) =
ann.knn_search_with_stats(embedding, candidate, ef)
.map_err(|err| anyhow!("frankensearch approximate search failed: {err}"))?;
let ann_stats = Some(crate::search::ann_index::AnnSearchStats {
index_size: search_stats.index_size,
dimension: search_stats.dimension,
ef_search: search_stats.ef_search,
k_requested: search_stats.k_requested,
k_returned: search_stats.k_returned,
search_time_us: search_stats.search_time_us,
estimated_recall: search_stats.estimated_recall as f32,
is_approximate: search_stats.is_approximate,
});
let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
let mut best_by_message: HashMap<u64, VectorSearchResult> =
HashMap::with_capacity(ann_results.len());
for hit in ann_results.iter() {
if let Some(filter) = fs_filter
&& !filter.matches(&hit.doc_id, None)
{
continue;
}
let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
continue;
};
best_by_message
.entry(parsed.message_id)
.and_modify(|entry| {
if hit.score > entry.score {
entry.score = hit.score;
entry.chunk_idx = parsed.chunk_idx;
}
})
.or_insert(VectorSearchResult {
message_id: parsed.message_id,
chunk_idx: parsed.chunk_idx,
score: hit.score,
});
}
return Ok((
Self::collapse_semantic_results(best_by_message, request.fetch_limit),
SemanticCandidateRetryState {
has_more_candidates: ann_results.len() >= candidate,
exact_window_may_omit_competitor: false,
},
ann_stats,
));
}
let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
let (results, truncated) = Self::search_exact_semantic_indexes(
context,
embedding,
request.fetch_limit,
fs_filter,
)?;
Ok((results, truncated, None))
}
pub fn can_progressively_refine(&self) -> bool {
self.progressive_context()
.map(|context| {
context.as_ref().is_some_and(|ctx| {
ctx.quality_embedder.is_some() && ctx.index.has_quality_index()
})
})
.unwrap_or(false)
}
fn progressive_context(&self) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
loop {
let (ann_path, embedder, context_token) = {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(context) = state.progressive_context.as_ref() {
return Ok(Some(Arc::clone(context)));
}
if state.progressive_context_unavailable {
return Ok(None);
}
(
state.ann_path.clone(),
Arc::clone(&state.embedder),
Arc::clone(&state.context_token),
)
};
let context = match self.build_progressive_context(
ann_path.clone(),
embedder,
Arc::clone(&context_token),
) {
Ok(context) => context,
Err(err) => {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(existing) = state.progressive_context.as_ref() {
return Ok(Some(Arc::clone(existing)));
}
if !Arc::ptr_eq(&state.context_token, &context_token) {
continue;
}
return Err(err);
}
};
let Some(context) = context else {
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(existing) = state.progressive_context.as_ref() {
return Ok(Some(Arc::clone(existing)));
}
if !Arc::ptr_eq(&state.context_token, &context_token) {
continue;
}
state.progressive_context_unavailable = true;
return Ok(None);
};
let mut guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_mut().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if let Some(existing) = state.progressive_context.as_ref() {
return Ok(Some(Arc::clone(existing)));
}
if !Arc::ptr_eq(&state.context_token, &context_token) {
continue;
}
state.progressive_context_unavailable = false;
state.progressive_context = Some(Arc::clone(&context));
return Ok(Some(context));
}
}
fn build_progressive_context(
&self,
ann_path: Option<PathBuf>,
embedder: Arc<dyn Embedder>,
context_token: Arc<()>,
) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
let Some(index_dir) = ann_path
.as_ref()
.and_then(|path| path.parent().map(Path::to_path_buf))
else {
return Ok(None);
};
let fast_path = {
let explicit = index_dir.join("vector.fast.idx");
if explicit.is_file() {
explicit
} else {
let fallback = index_dir.join("vector.idx");
if fallback.is_file() {
fallback
} else {
return Ok(None);
}
}
};
let quality_path = index_dir.join("vector.quality.idx");
if !quality_path.is_file() {
return Ok(None);
}
let fast_index = FsVectorIndex::open(&fast_path)
.map_err(|err| anyhow!("open fast-tier index failed: {err}"))?;
let quality_index = FsVectorIndex::open(&quality_path)
.map_err(|err| anyhow!("open quality-tier index failed: {err}"))?;
let index = Arc::new(
FsTwoTierIndex::open(&index_dir, frankensearch_two_tier_config())
.map_err(|err| anyhow!("open progressive two-tier index failed: {err}"))?,
);
let fast_embedder = self.load_embedder_for_progressive_id(
&embedder,
fast_index.embedder_id(),
fast_index.dimension(),
)?;
let fast_embedder: Arc<dyn frankensearch::Embedder> = Arc::new(FsSyncEmbedderAdapter(
SharedCassSyncEmbedder::new(fast_embedder),
));
let quality_embedder = Some(self.load_embedder_for_progressive_id(
&embedder,
quality_index.embedder_id(),
quality_index.dimension(),
)?);
let quality_embedder = quality_embedder.map(|embedder| {
Arc::new(FsSyncEmbedderAdapter(SharedCassSyncEmbedder::new(embedder)))
as Arc<dyn frankensearch::Embedder>
});
Ok(Some(Arc::new(ProgressiveTwoTierContext {
context_token,
index,
fast_embedder,
quality_embedder,
})))
}
fn load_embedder_for_progressive_id(
&self,
current_embedder: &Arc<dyn Embedder>,
embedder_id: &str,
dimension: usize,
) -> Result<Arc<dyn Embedder>> {
if current_embedder.id() == embedder_id {
return Ok(Arc::clone(current_embedder));
}
if let Some(dim) = embedder_id.strip_prefix("fnv1a-")
&& let Ok(parsed) = dim.parse::<usize>()
{
return Ok(Arc::new(crate::search::hash_embedder::HashEmbedder::new(
parsed.max(dimension),
)));
}
if let Some(embedder_name) =
crate::search::fastembed_embedder::FastEmbedder::canonical_name(embedder_id)
{
let data_dir = self
.sqlite_path
.as_ref()
.and_then(|path| path.parent())
.ok_or_else(|| anyhow!("cannot resolve data dir for progressive embedder load"))?;
let embedder = crate::search::fastembed_embedder::FastEmbedder::load_by_name(
data_dir,
embedder_name,
)
.with_context(|| format!("loading FastEmbed model for {embedder_name}"))?;
if embedder.dimension() != dimension {
bail!(
"progressive embedder dimension mismatch: {} index expects {}, model has {}",
embedder_id,
dimension,
embedder.dimension()
);
}
return Ok(Arc::new(embedder));
}
bail!("unsupported progressive embedder id: {embedder_id}");
}
fn resolve_semantic_doc_ids_for_hits(
&self,
hits: &[SearchHit],
) -> Result<Vec<Option<ResolvedSemanticDocId>>> {
if hits.is_empty() {
return Ok(Vec::new());
}
let lookup_keys: Vec<Option<ProgressiveLookupKey>> = hits
.iter()
.map(|hit| {
let idx = hit
.line_number
.and_then(|line| line.checked_sub(1))
.map(i64::try_from)
.transpose()
.ok()
.flatten()?;
Some((
normalized_search_hit_source_id(hit),
hit.source_path.clone(),
hit.conversation_id,
hit.title.trim().to_string(),
idx,
hit.created_at,
hit.content_hash,
))
})
.collect();
let mut seen_exact = HashSet::new();
let mut exact_query_keys = Vec::new();
let mut seen_fallback = HashSet::new();
let mut fallback_query_keys = Vec::new();
for (source_id, source_path, conversation_id, _title, idx, _created_at, _content_hash) in
lookup_keys.iter().flatten()
{
if let Some(conversation_id) = conversation_id {
let query_key: ProgressiveExactQueryKey = (*conversation_id, *idx);
if seen_exact.insert(query_key) {
exact_query_keys.push(query_key);
}
} else {
let query_key: ProgressiveFallbackQueryKey =
(source_id.clone(), source_path.clone(), *idx);
if seen_fallback.insert(query_key.clone()) {
fallback_query_keys.push(query_key);
}
}
}
if exact_query_keys.is_empty() && fallback_query_keys.is_empty() {
return Ok(vec![None; hits.len()]);
}
let sqlite_guard = self.sqlite_guard()?;
let conn = sqlite_guard
.as_ref()
.ok_or_else(|| anyhow!("progressive search requires database connection"))?;
let mut resolved_by_key = HashMap::new();
let normalized_source_sql =
normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
const CHUNK_SIZE: usize = 300;
for chunk in exact_query_keys.chunks(CHUNK_SIZE) {
let mut sql = String::from("SELECT c.id, ");
sql.push_str(&normalized_source_sql);
sql.push_str(
", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
LEFT JOIN sources s ON c.source_id = s.id
WHERE ",
);
let mut params = Vec::with_capacity(chunk.len().saturating_mul(2));
for (idx, (conversation_id, line_idx)) in chunk.iter().enumerate() {
if idx > 0 {
sql.push_str(" OR ");
}
sql.push_str("(c.id = ? AND m.idx = ?)");
params.push(ParamValue::from(*conversation_id));
params.push(ParamValue::from(*line_idx));
}
let chunk_rows: Vec<ResolvedSemanticLookupRow> =
conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
let conversation_id: i64 = row.get_typed(0)?;
let source_id: String = row.get_typed(1)?;
let source_path: String = row.get_typed(2)?;
let idx: i64 = row.get_typed(3)?;
let message_id_raw: i64 = row.get_typed(4)?;
let agent_id_raw: Option<i64> = row.get_typed(5)?;
let workspace_id_raw: Option<i64> = row.get_typed(6)?;
let role_raw: String = row.get_typed(7)?;
let created_at_ms: Option<i64> = row.get_typed(8)?;
let content: String = row.get_typed(9)?;
let title: Option<String> = row.get_typed(10)?;
let canonical = canonicalize_for_embedding(&content);
if canonical.is_empty() {
return Ok(None);
}
let message_id = u64::try_from(message_id_raw).map_err(|_| {
std::io::Error::other("message id out of range for progressive doc_id")
})?;
let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
let doc_id = SemanticDocId {
message_id,
chunk_idx: 0,
agent_id,
workspace_id,
source_id: crc32fast::hash(source_id.as_bytes()),
role,
created_at_ms: created_at_ms.unwrap_or(0),
content_hash: Some(content_hash(&canonical)),
}
.to_doc_id_string();
let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
let lookup_key = (
source_id,
source_path.clone(),
Some(conversation_id),
title.unwrap_or_default().trim().to_string(),
idx,
created_at_ms,
stable_hit_hash(&content, &source_path, line_number, created_at_ms),
);
Ok(Some((
lookup_key,
ResolvedSemanticDocId { message_id, doc_id },
)))
})?;
for row in chunk_rows.into_iter().flatten() {
resolved_by_key.insert(row.0, row.1);
}
}
for chunk in fallback_query_keys.chunks(CHUNK_SIZE) {
let mut sql = String::from("SELECT ");
sql.push_str(&normalized_source_sql);
sql.push_str(
", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
LEFT JOIN sources s ON c.source_id = s.id
WHERE ",
);
let mut params = Vec::with_capacity(chunk.len().saturating_mul(3));
for (idx, (source_id, source_path, line_idx)) in chunk.iter().enumerate() {
if idx > 0 {
sql.push_str(" OR ");
}
sql.push_str(&format!(
"({normalized_source_sql} = ? AND c.source_path = ? AND m.idx = ?)"
));
params.push(ParamValue::from(normalize_search_source_filter_value(
source_id,
)));
params.push(ParamValue::from(source_path.clone()));
params.push(ParamValue::from(*line_idx));
}
let chunk_rows: Vec<ResolvedSemanticLookupRow> =
conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
let source_id: String = row.get_typed(0)?;
let source_path: String = row.get_typed(1)?;
let idx: i64 = row.get_typed(2)?;
let message_id_raw: i64 = row.get_typed(3)?;
let agent_id_raw: Option<i64> = row.get_typed(4)?;
let workspace_id_raw: Option<i64> = row.get_typed(5)?;
let role_raw: String = row.get_typed(6)?;
let created_at_ms: Option<i64> = row.get_typed(7)?;
let content: String = row.get_typed(8)?;
let title: Option<String> = row.get_typed(9)?;
let canonical = canonicalize_for_embedding(&content);
if canonical.is_empty() {
return Ok(None);
}
let message_id = u64::try_from(message_id_raw).map_err(|_| {
std::io::Error::other("message id out of range for progressive doc_id")
})?;
let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
let doc_id = SemanticDocId {
message_id,
chunk_idx: 0,
agent_id,
workspace_id,
source_id: crc32fast::hash(source_id.as_bytes()),
role,
created_at_ms: created_at_ms.unwrap_or(0),
content_hash: Some(content_hash(&canonical)),
}
.to_doc_id_string();
let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
let lookup_key = (
source_id,
source_path.clone(),
None,
title.unwrap_or_default().trim().to_string(),
idx,
created_at_ms,
stable_hit_hash(&content, &source_path, line_number, created_at_ms),
);
Ok(Some((
lookup_key,
ResolvedSemanticDocId { message_id, doc_id },
)))
})?;
for row in chunk_rows.into_iter().flatten() {
resolved_by_key.insert(row.0, row.1);
}
}
Ok(lookup_keys
.into_iter()
.map(|key| key.and_then(|lookup| resolved_by_key.get(&lookup).cloned()))
.collect())
}
fn load_message_text_by_id(&self, message_id: u64) -> Result<Option<String>> {
let sqlite_guard = self.sqlite_guard()?;
let conn = sqlite_guard
.as_ref()
.ok_or_else(|| anyhow!("progressive search requires database connection"))?;
let rows: Vec<String> = conn.query_map_collect(
"SELECT content FROM messages WHERE id = ?",
&[ParamValue::from(i64::try_from(message_id)?)],
|row: &frankensqlite::Row| row.get_typed(0),
)?;
Ok(rows.into_iter().next())
}
fn collapse_progressive_scored_results(
&self,
results: &[FsScoredResult],
fetch_limit: usize,
) -> Vec<VectorSearchResult> {
let fetch = fetch_limit.max(1);
let mut best_by_message: HashMap<u64, VectorSearchResult> =
HashMap::with_capacity(results.len());
for hit in results {
let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
continue;
};
best_by_message
.entry(parsed.message_id)
.and_modify(|entry| {
if hit.score > entry.score {
entry.score = hit.score;
entry.chunk_idx = parsed.chunk_idx;
}
})
.or_insert(VectorSearchResult {
message_id: parsed.message_id,
chunk_idx: parsed.chunk_idx,
score: hit.score,
});
}
let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
collapsed.sort_by(|a, b| {
b.score
.total_cmp(&a.score)
.then_with(|| a.message_id.cmp(&b.message_id))
});
if collapsed.len() > fetch {
collapsed.truncate(fetch);
}
collapsed
}
fn hydrate_semantic_hits_with_ids(
&self,
results: &[VectorSearchResult],
field_mask: FieldMask,
) -> Result<Vec<(u64, SearchHit)>> {
if results.is_empty() {
return Ok(Vec::new());
}
let sqlite_guard = self.sqlite_guard()?;
let conn = sqlite_guard
.as_ref()
.ok_or_else(|| anyhow!("semantic search requires database connection"))?;
let placeholder_capacity = results.len().saturating_mul(2).saturating_sub(1);
let mut placeholders = String::with_capacity(placeholder_capacity);
let mut params: Vec<ParamValue> = Vec::with_capacity(results.len());
for (idx, result) in results.iter().enumerate() {
if idx > 0 {
placeholders.push(',');
}
placeholders.push('?');
params.push(ParamValue::from(i64::try_from(result.message_id)?));
}
let title_expr = if field_mask.wants_title() {
"c.title"
} else {
"''"
};
let normalized_source_sql =
normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
let sql = format!(
"SELECT m.id, c.id, m.content, m.created_at, m.idx, m.role, {title_expr}, c.source_path, {normalized_source_sql}, c.origin_host, COALESCE(a.slug, 'unknown'), w.path, s.kind, c.started_at
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
LEFT JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id
LEFT JOIN sources s ON c.source_id = s.id
WHERE m.id IN ({placeholders})"
);
let rows: Vec<(u64, SearchHit)> =
conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
let message_id: i64 = row.get_typed(0)?;
let conversation_id: i64 = row.get_typed(1)?;
let full_content: String = row.get_typed(2)?;
let msg_created_at: Option<i64> = row.get_typed(3)?;
let idx: Option<i64> = row.get_typed(4)?;
let title: Option<String> = if field_mask.wants_title() {
row.get_typed(6)?
} else {
None
};
let source_path: String = row.get_typed(7)?;
let raw_source_id: String = row.get_typed(8)?;
let origin_host: Option<String> = row.get_typed(9)?;
let agent: String = row.get_typed(10)?;
let workspace: Option<String> = row.get_typed(11)?;
let raw_origin_kind: Option<String> = row.get_typed(12)?;
let started_at: Option<i64> = row.get_typed(13)?;
let created_at = msg_created_at.or(started_at);
let line_number = idx
.and_then(|i| usize::try_from(i).ok())
.map(|i| i.saturating_add(1));
let snippet = if field_mask.wants_snippet() {
snippet_from_content(&full_content)
} else {
String::new()
};
let content = if field_mask.needs_content() {
full_content.clone()
} else {
String::new()
};
let content_hash =
stable_hit_hash(&full_content, &source_path, line_number, created_at);
let source_id = normalized_search_hit_source_id_parts(
raw_source_id.as_str(),
raw_origin_kind.as_deref().unwrap_or_default(),
origin_host.as_deref(),
);
let origin_kind =
normalized_search_hit_origin_kind(&source_id, raw_origin_kind.as_deref());
let hit = SearchHit {
title: if field_mask.wants_title() {
title.unwrap_or_default()
} else {
String::new()
},
snippet,
content,
content_hash,
conversation_id: Some(conversation_id),
score: 0.0,
source_path,
agent,
workspace: workspace.unwrap_or_default(),
workspace_original: None,
created_at,
line_number,
match_type: MatchType::Exact,
source_id,
origin_kind,
origin_host,
};
Ok((semantic_message_id_from_db(message_id)?, hit))
})?;
let mut hits_by_id = HashMap::new();
for (id, hit) in rows {
hits_by_id.insert(id, hit);
}
let mut ordered = Vec::new();
for result in results {
if let Some(mut hit) = hits_by_id.remove(&result.message_id) {
hit.score = result.score;
ordered.push((result.message_id, hit));
}
}
Ok(ordered)
}
fn overlay_progressive_lexical_hit(
&self,
hit: &mut SearchHit,
lexical: &ProgressiveLexicalHit,
field_mask: FieldMask,
) {
if field_mask.wants_title() && !lexical.title.is_empty() {
hit.title = lexical.title.clone();
}
if field_mask.wants_snippet() && !lexical.snippet.is_empty() {
hit.snippet = lexical.snippet.clone();
}
if field_mask.needs_content() && !lexical.content.is_empty() {
hit.content = lexical.content.clone();
}
hit.match_type = lexical.match_type;
hit.line_number = lexical.line_number.or(hit.line_number);
}
fn progressive_phase_to_result(
&self,
results: &[FsScoredResult],
ctx: ProgressivePhaseContext<'_>,
) -> Result<SearchResult> {
let collapsed = self.collapse_progressive_scored_results(results, ctx.fetch_limit);
let missing: Vec<VectorSearchResult> = collapsed
.iter()
.filter(|result| {
ctx.lexical_cache
.and_then(|cache| cache.hits_by_message.get(&result.message_id))
.is_none()
})
.map(|result| VectorSearchResult {
message_id: result.message_id,
chunk_idx: result.chunk_idx,
score: result.score,
})
.collect();
let mut hydrated_by_id: HashMap<u64, SearchHit> = self
.hydrate_semantic_hits_with_ids(&missing, ctx.field_mask)?
.into_iter()
.collect();
let mut hydrated: Vec<(u64, SearchHit)> = Vec::with_capacity(collapsed.len());
for result in &collapsed {
if let Some(cache) = ctx.lexical_cache
&& let Some(lexical) = cache.hits_by_message.get(&result.message_id)
{
hydrated.push((result.message_id, lexical.to_search_hit(result.score)));
continue;
}
if let Some(mut hit) = hydrated_by_id.remove(&result.message_id) {
if let Some(cache) = ctx.lexical_cache
&& let Some(lexical) = cache.hits_by_message.get(&result.message_id)
{
self.overlay_progressive_lexical_hit(&mut hit, lexical, ctx.field_mask);
}
hydrated.push((result.message_id, hit));
}
}
let mut hits: Vec<SearchHit> = hydrated.into_iter().map(|(_, hit)| hit).collect();
(_, hits) = self.postprocess_hits_page(hits, ctx.query, ctx.filters, ctx.limit, 0);
let (wildcard_fallback, suggestions) = ctx
.lexical_cache
.map(|cache| {
let suggestions = if hits.is_empty() {
cache.suggestions.clone()
} else {
Vec::new()
};
(cache.wildcard_fallback, suggestions)
})
.unwrap_or((false, Vec::new()));
Ok(SearchResult {
hits,
wildcard_fallback,
cache_stats: self.cache_stats(),
suggestions,
ann_stats: None,
total_count: None,
})
}
pub(crate) async fn search_progressive_with_callback(
self: &Arc<Self>,
request: ProgressiveSearchRequest<'_>,
mut on_event: impl FnMut(ProgressiveSearchEvent) + Send,
) -> Result<()> {
let ProgressiveSearchRequest {
cx,
query,
filters,
limit,
sparse_threshold,
field_mask,
mode,
} = request;
let field_mask = effective_field_mask(field_mask);
let limit = limit.max(1);
let fetch_limit = progressive_phase_fetch_limit(limit);
match mode {
SearchMode::Lexical => {
let started = Instant::now();
let result = self.search_with_fallback(
query,
filters,
limit,
0,
sparse_threshold,
field_mask,
)?;
on_event(ProgressiveSearchEvent::Phase {
kind: ProgressivePhaseKind::Initial,
elapsed_ms: started.elapsed().as_millis(),
result,
});
return Ok(());
}
SearchMode::Semantic | SearchMode::Hybrid => {}
}
let progressive_context = {
self.progressive_context()?
.ok_or_else(|| anyhow!("progressive two-tier context unavailable"))?
};
let progressive_context_token = Arc::clone(&progressive_context.context_token);
let lexical_cache: Arc<Mutex<ProgressiveLexicalSnapshot>> =
Arc::new(Mutex::new(Arc::new(ProgressiveLexicalCache::default())));
let text_cache: Arc<Mutex<HashMap<u64, String>>> = Arc::new(Mutex::new(HashMap::new()));
let text_client = Arc::clone(self);
let text_cache_for_lookup = Arc::clone(&text_cache);
let text_fn = move |doc_id: &str| -> Option<String> {
let parsed = parse_semantic_doc_id(doc_id)?;
if let Ok(cache) = text_cache_for_lookup.lock()
&& let Some(text) = cache.get(&parsed.message_id)
{
return Some(text.clone());
}
let loaded = text_client
.load_message_text_by_id(parsed.message_id)
.ok()
.flatten()?;
if let Ok(mut cache) = text_cache_for_lookup.lock() {
cache.insert(parsed.message_id, loaded.clone());
}
Some(loaded)
};
let mut searcher = FsTwoTierSearcher::new(
Arc::clone(&progressive_context.index),
Arc::clone(&progressive_context.fast_embedder),
frankensearch_two_tier_config(),
);
if let Some(quality_embedder) = progressive_context.quality_embedder.as_ref() {
searcher = searcher.with_quality_embedder(Arc::clone(quality_embedder));
}
if matches!(mode, SearchMode::Hybrid) {
let lexical = Arc::new(CassProgressiveLexicalAdapter::new(
Arc::clone(self),
filters.clone(),
field_mask,
sparse_threshold,
Arc::clone(&lexical_cache),
));
searcher = searcher.with_lexical(lexical);
}
let phase_client = Arc::clone(self);
let phase_filters = filters.clone();
let phase_cache = Arc::clone(&lexical_cache);
let mut phase_error: Option<anyhow::Error> = None;
let search_result = searcher
.search(cx, query, fetch_limit, text_fn, |phase| {
if phase_error.is_some() {
return;
}
match phase_client.semantic_context_matches(&progressive_context_token) {
Ok(true) => {}
Ok(false) => {
phase_error = Some(anyhow!(
"progressive search aborted: semantic context changed"
));
cx.set_cancel_requested(true);
return;
}
Err(err) => {
phase_error = Some(err);
cx.set_cancel_requested(true);
return;
}
}
let lexical_snapshot = phase_cache.lock().ok().map(|guard| Arc::clone(&guard));
let event_result = match phase {
FsSearchPhase::Initial {
results, latency, ..
} => phase_client
.progressive_phase_to_result(
&results,
ProgressivePhaseContext {
query,
filters: &phase_filters,
field_mask,
lexical_cache: lexical_snapshot.as_deref(),
limit,
fetch_limit,
},
)
.map(|result| ProgressiveSearchEvent::Phase {
kind: ProgressivePhaseKind::Initial,
elapsed_ms: latency.as_millis(),
result,
}),
FsSearchPhase::Refined {
results, latency, ..
} => phase_client
.progressive_phase_to_result(
&results,
ProgressivePhaseContext {
query,
filters: &phase_filters,
field_mask,
lexical_cache: lexical_snapshot.as_deref(),
limit,
fetch_limit,
},
)
.map(|result| ProgressiveSearchEvent::Phase {
kind: ProgressivePhaseKind::Refined,
elapsed_ms: latency.as_millis(),
result,
}),
FsSearchPhase::Reranked {
results, latency, ..
} => phase_client
.progressive_phase_to_result(
&results,
ProgressivePhaseContext {
query,
filters: &phase_filters,
field_mask,
lexical_cache: lexical_snapshot.as_deref(),
limit,
fetch_limit,
},
)
.map(|result| ProgressiveSearchEvent::Phase {
kind: ProgressivePhaseKind::Refined,
elapsed_ms: latency.as_millis(),
result,
}),
FsSearchPhase::RefinementFailed { error, latency, .. } => {
Ok(ProgressiveSearchEvent::RefinementFailed {
latency_ms: latency.as_millis(),
error: error.to_string(),
})
}
};
match event_result {
Ok(event) => on_event(event),
Err(err) => {
phase_error = Some(err);
cx.set_cancel_requested(true);
}
}
})
.await;
if let Some(err) = phase_error {
return Err(err);
}
search_result
.map(|_| ())
.map_err(|err| anyhow!("progressive search failed: {err}"))
}
pub fn search_semantic(
&self,
query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
field_mask: FieldMask,
approximate: bool,
) -> Result<(
Vec<SearchHit>,
Option<crate::search::ann_index::AnnSearchStats>,
)> {
self.search_semantic_with_tier(
query,
filters,
limit,
offset,
field_mask,
approximate,
SemanticTierMode::Single,
)
}
#[allow(clippy::too_many_arguments)]
pub fn search_semantic_with_tier(
&self,
query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
field_mask: FieldMask,
approximate: bool,
tier_mode: SemanticTierMode,
) -> Result<(
Vec<SearchHit>,
Option<crate::search::ann_index::AnnSearchStats>,
)> {
let field_mask = effective_field_mask(field_mask);
let canonical = canonicalize_for_embedding(query);
if canonical.trim().is_empty() {
return Ok((Vec::new(), None));
}
let limit = if limit == 0 {
self.total_docs().min(no_limit_result_cap()).max(1)
} else {
limit
};
let target_hits = limit.saturating_add(offset);
if target_hits == 0 {
return Ok((Vec::new(), None));
}
let initial_fetch_limit = target_hits;
let fallback_fetch_limit = target_hits.saturating_mul(3);
loop {
let (embedding, candidate_context, in_memory_two_tier_index, ann_index, context_token) = loop {
let embedding = self.semantic_query_embedding(&canonical)?;
let (candidate_context, context_token) = {
let guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_ref().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
(
SemanticCandidateContext {
fs_semantic_index: Arc::clone(&state.fs_semantic_index),
fs_semantic_indexes: Arc::clone(&state.fs_semantic_indexes),
filter_maps: state.filter_maps.clone(),
roles: state.roles.clone(),
},
Arc::clone(&state.context_token),
)
};
if !Arc::ptr_eq(&embedding.context_token, &context_token) {
continue;
}
let in_memory_two_tier_index = if tier_mode.wants_two_tier() && !approximate {
self.in_memory_two_tier_index(tier_mode)?
} else {
None
};
let ann_index = if approximate {
Some(self.ann_index()?)
} else {
None
};
let guard = self
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard.as_ref().ok_or_else(|| {
anyhow!("semantic search unavailable (no embedder or vector index)")
})?;
if !Arc::ptr_eq(&state.context_token, &context_token) {
continue;
}
break (
embedding.vector,
candidate_context,
in_memory_two_tier_index,
ann_index,
context_token,
);
};
let finalize_hits =
|results: &[VectorSearchResult]| -> Result<(usize, Vec<SearchHit>)> {
let hits = self.hydrate_semantic_hits(results, field_mask)?;
Ok(self.postprocess_hits_page(hits, query, &filters, limit, offset))
};
let (results, retry_state, mut ann_stats) = self.search_semantic_candidates(
&candidate_context,
&embedding,
&filters,
SemanticCandidateSearchRequest {
fetch_limit: initial_fetch_limit,
approximate,
tier_mode,
in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
ann_index: ann_index.as_ref(),
},
)?;
if !self.semantic_context_matches(&context_token)? {
tracing::debug!("semantic context changed during candidate search; retrying");
continue;
}
let (mut available_hits, mut paged_hits) = finalize_hits(&results)?;
let needs_retry = initial_fetch_limit < fallback_fetch_limit
&& ((available_hits < target_hits && retry_state.has_more_candidates)
|| retry_state.exact_window_may_omit_competitor);
if needs_retry {
tracing::debug!(
query = canonical,
target_hits,
available_hits,
initial_fetch_limit,
fallback_fetch_limit,
"retrying semantic fetch due to candidate-window shortfall"
);
let (retry_results, _, retry_ann_stats) = self.search_semantic_candidates(
&candidate_context,
&embedding,
&filters,
SemanticCandidateSearchRequest {
fetch_limit: fallback_fetch_limit,
approximate,
tier_mode,
in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
ann_index: ann_index.as_ref(),
},
)?;
if !self.semantic_context_matches(&context_token)? {
tracing::debug!("semantic context changed during retry fetch; retrying");
continue;
}
(available_hits, paged_hits) = finalize_hits(&retry_results)?;
ann_stats = retry_ann_stats;
}
tracing::trace!(
query = canonical,
target_hits,
available_hits,
returned = paged_hits.len(),
"semantic fetch complete"
);
return Ok((paged_hits, ann_stats));
}
}
fn hydrate_semantic_hits(
&self,
results: &[VectorSearchResult],
field_mask: FieldMask,
) -> Result<Vec<SearchHit>> {
self.hydrate_semantic_hits_with_ids(results, field_mask)
.map(|rows| rows.into_iter().map(|(_, hit)| hit).collect())
}
fn postprocess_hits_page(
&self,
hits: Vec<SearchHit>,
query: &str,
filters: &SearchFilters,
limit: usize,
offset: usize,
) -> (usize, Vec<SearchHit>) {
let mut hits = deduplicate_hits_with_query(hits, query);
if !filters.session_paths.is_empty() {
hits.retain(|hit| filters.session_paths.contains(&hit.source_path));
}
let available_hits = hits.len();
let paged_hits = hits.into_iter().skip(offset).take(limit).collect();
(available_hits, paged_hits)
}
pub fn search_with_fallback(
&self,
query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
sparse_threshold: usize,
field_mask: FieldMask,
) -> Result<SearchResult> {
let hits = self.search(query, filters.clone(), limit, offset, field_mask)?;
let baseline_stats = self.cache_stats();
let tantivy_total = self
.last_tantivy_total_count
.lock()
.ok()
.and_then(|guard| *guard);
let query_has_wildcards = query.contains('*');
let has_boolean_or_phrase = fs_cass_has_boolean_operators(query);
let is_sparse = should_try_wildcard_fallback(hits.len(), limit, offset, sparse_threshold);
if !is_sparse || query_has_wildcards || has_boolean_or_phrase || query.trim().is_empty() {
let suggestions = if hits.is_empty() && !query.trim().is_empty() {
self.generate_suggestions(query, &filters)
} else {
Vec::new()
};
return Ok(SearchResult {
hits,
wildcard_fallback: false,
cache_stats: baseline_stats,
suggestions,
ann_stats: None,
total_count: tantivy_total,
});
}
if should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(query, hits.len()) {
let suggestions = if hits.is_empty() {
self.generate_suggestions(query, &filters)
} else {
Vec::new()
};
return Ok(SearchResult {
hits,
wildcard_fallback: false,
cache_stats: baseline_stats,
suggestions,
ann_stats: None,
total_count: tantivy_total,
});
}
let wildcard_query = query
.split_whitespace()
.map(|term| format!("*{}*", term.trim_matches('*')))
.collect::<Vec<_>>()
.join(" ");
tracing::info!(
original_query = query,
wildcard_query = wildcard_query,
original_count = hits.len(),
"wildcard_fallback"
);
let mut fallback_hits =
self.search(&wildcard_query, filters.clone(), limit, offset, field_mask)?;
let fallback_stats = self.cache_stats();
let fallback_tantivy_total = self
.last_tantivy_total_count
.lock()
.ok()
.and_then(|guard| *guard);
if fallback_hits.len() > hits.len() {
for hit in &mut fallback_hits {
hit.match_type = MatchType::ImplicitWildcard;
}
let suggestions = if fallback_hits.is_empty() {
self.generate_suggestions(query, &filters)
} else {
Vec::new()
};
Ok(SearchResult {
hits: fallback_hits,
wildcard_fallback: true,
cache_stats: fallback_stats,
suggestions,
ann_stats: None,
total_count: fallback_tantivy_total,
})
} else {
let suggestions = if hits.is_empty() {
self.generate_suggestions(query, &filters)
} else {
Vec::new()
};
Ok(SearchResult {
hits,
wildcard_fallback: false,
cache_stats: baseline_stats,
suggestions,
ann_stats: None,
total_count: tantivy_total,
})
}
}
#[allow(clippy::too_many_arguments)]
pub fn search_hybrid(
&self,
lexical_query: &str,
semantic_query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
sparse_threshold: usize,
field_mask: FieldMask,
approximate: bool,
) -> Result<SearchResult> {
self.search_hybrid_with_tier(
lexical_query,
semantic_query,
filters,
limit,
offset,
sparse_threshold,
field_mask,
approximate,
SemanticTierMode::Single,
)
}
#[allow(clippy::too_many_arguments)]
pub fn search_hybrid_with_tier(
&self,
lexical_query: &str,
semantic_query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
sparse_threshold: usize,
field_mask: FieldMask,
approximate: bool,
semantic_tier_mode: SemanticTierMode,
) -> Result<SearchResult> {
let requested_limit = limit;
let total_docs = self.total_docs().max(1);
let limit = if requested_limit == 0 {
total_docs.min(no_limit_result_cap()).max(1)
} else {
requested_limit
};
let fetch = limit.saturating_add(offset);
if fetch == 0 {
return Ok(SearchResult {
hits: Vec::new(),
wildcard_fallback: false,
cache_stats: self.cache_stats(),
suggestions: Vec::new(),
ann_stats: None,
total_count: None,
});
}
if semantic_query.trim().is_empty() {
return self.search_with_fallback(
lexical_query,
filters,
limit,
offset,
sparse_threshold,
field_mask,
);
}
let budget =
hybrid_candidate_budget(semantic_query, requested_limit, limit, offset, total_docs);
let lexical = self.search_with_fallback(
lexical_query,
filters.clone(),
budget.lexical_candidates,
0,
sparse_threshold,
field_mask,
)?;
let (semantic_hits, semantic_ann_stats) = self.search_semantic_with_tier(
semantic_query,
filters,
budget.semantic_candidates,
0,
field_mask,
approximate,
semantic_tier_mode,
)?;
let fused = rrf_fuse_hits(&lexical.hits, &semantic_hits, semantic_query, limit, offset);
let suggestions = if fused.is_empty() {
lexical.suggestions.clone()
} else {
Vec::new()
};
Ok(SearchResult {
hits: fused,
wildcard_fallback: lexical.wildcard_fallback,
cache_stats: lexical.cache_stats,
suggestions,
ann_stats: semantic_ann_stats,
total_count: None,
})
}
fn generate_suggestions(&self, query: &str, filters: &SearchFilters) -> Vec<QuerySuggestion> {
let mut suggestions = Vec::new();
let query_lower = query.to_lowercase();
if !query.contains('*') && query.len() >= 2 {
suggestions.push(QuerySuggestion::wildcard(query).with_shortcut(1));
}
if !filters.agents.is_empty() {
let agents: Vec<&str> = filters
.agents
.iter()
.map(std::string::String::as_str)
.collect();
let agent_str = agents.join(", ");
suggestions
.push(QuerySuggestion::remove_agent_filter(&agent_str, filters).with_shortcut(2));
}
let known_agents = [
"codex",
"claude",
"claude_code",
"cline",
"gemini",
"amp",
"opencode",
];
for agent in &known_agents {
if levenshtein_distance(&query_lower, agent) <= 2 && query_lower != *agent {
suggestions.push(
QuerySuggestion::spelling(query, agent)
.with_shortcut(suggestions.len().min(2) as u8 + 1),
);
break; }
}
if filters.agents.is_empty()
&& let Ok(sqlite_guard) = self.sqlite.lock()
&& let Some(conn) = sqlite_guard.as_ref()
&& let Ok(rows) = conn.query_map_collect(
"SELECT a.slug
FROM conversations c
JOIN agents a ON c.agent_id = a.id
GROUP BY a.slug
ORDER BY MAX(c.id) DESC
LIMIT 3",
&[],
|row: &frankensqlite::Row| row.get_typed::<String>(0),
)
{
for row in rows {
if suggestions.len() < 3 {
suggestions.push(
QuerySuggestion::try_agent(&row)
.with_shortcut(suggestions.len().min(2) as u8 + 1),
);
}
}
}
suggestions.truncate(3);
for (i, sugg) in suggestions.iter_mut().enumerate() {
sugg.shortcut = Some((i + 1) as u8);
}
suggestions
}
fn searcher_for_thread(&self, reader: &IndexReader) -> Searcher {
let epoch = self.reload_epoch.load(Ordering::Relaxed);
let reader_key = reader as *const IndexReader as usize;
THREAD_SEARCHER.with(|slot| {
let mut slot = slot.borrow_mut();
if let Some(entry) = slot.as_ref()
&& entry.epoch == epoch
&& entry.reader_key == reader_key
{
return entry.searcher.clone();
}
let searcher = reader.searcher();
*slot = Some(SearcherCacheEntry {
epoch,
reader_key,
searcher: searcher.clone(),
});
searcher
})
}
fn federated_readers(&self) -> Option<Arc<Vec<FederatedIndexReader>>> {
FEDERATED_SEARCH_READERS
.read()
.get(&self.cache_namespace)
.cloned()
}
fn maybe_reload_federated_readers(
&self,
readers: &[FederatedIndexReader],
) -> Result<Option<u64>> {
if !self.reload_on_search || readers.is_empty() {
return Ok(None);
}
const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
let now = Instant::now();
let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
if guard
.map(|t| now.duration_since(t) < MIN_RELOAD_INTERVAL)
.unwrap_or(false)
{
let signature = self.federated_generation_signature(readers);
return Ok(Some(signature));
}
let reload_started = Instant::now();
for shard in readers {
shard.reader.reload()?;
}
let elapsed = reload_started.elapsed();
*guard = Some(now);
let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
self.metrics.record_reload(elapsed);
tracing::debug!(
duration_ms = elapsed.as_millis() as u64,
reload_epoch = epoch,
shards = readers.len(),
"tantivy_reader_reload_federated"
);
Ok(Some(self.federated_generation_signature(readers)))
}
fn federated_generation_signature(&self, readers: &[FederatedIndexReader]) -> u64 {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
readers.len().hash(&mut hasher);
for shard in readers {
self.searcher_for_thread(&shard.reader)
.generation()
.generation_id()
.hash(&mut hasher);
}
hasher.finish()
}
fn track_generation(&self, generation: u64) {
let mut guard = self
.last_generation
.lock()
.unwrap_or_else(|e| e.into_inner());
if let Some(prev) = *guard
&& prev != generation
&& let Ok(mut cache) = self.prefix_cache.lock()
{
cache.clear();
}
*guard = Some(generation);
}
fn hydrate_tantivy_hit_contents(
&self,
exact_keys: &[TantivyContentExactKey],
fallback_keys: &[TantivyContentFallbackKey],
) -> Result<TantivyHydratedContentMaps> {
if exact_keys.is_empty() && fallback_keys.is_empty() {
return Ok((HashMap::new(), HashMap::new()));
}
let sqlite_guard = match self.sqlite_guard() {
Ok(guard) => guard,
Err(_) => return Ok((HashMap::new(), HashMap::new())),
};
let Some(conn) = sqlite_guard.as_ref() else {
return Ok((HashMap::new(), HashMap::new()));
};
let mut hydrated_exact = HashMap::new();
let mut hydrated_fallback = HashMap::new();
const CHUNK_SIZE: usize = 300;
if !exact_keys.is_empty() {
let mut unique_exact_keys = Vec::with_capacity(exact_keys.len());
let mut seen = HashSet::with_capacity(exact_keys.len());
for key in exact_keys {
if seen.insert(*key) {
unique_exact_keys.push(*key);
}
}
hydrated_exact.extend(hydrate_message_content_by_conversation(
conn,
&unique_exact_keys,
)?);
}
if !fallback_keys.is_empty() {
let mut unique_fallback_keys = Vec::with_capacity(fallback_keys.len());
let mut seen = HashSet::with_capacity(fallback_keys.len());
for key in fallback_keys {
if seen.insert(key.clone()) {
unique_fallback_keys.push(key.clone());
}
}
let mut unique_source_paths = Vec::with_capacity(unique_fallback_keys.len());
let mut seen_source_paths = HashSet::with_capacity(unique_fallback_keys.len());
for (_, source_path, _) in &unique_fallback_keys {
if seen_source_paths.insert(source_path.clone()) {
unique_source_paths.push(source_path.clone());
}
}
let mut conversations_by_key: HashMap<(String, String), Vec<i64>> = HashMap::new();
for chunk in unique_source_paths.chunks(CHUNK_SIZE) {
let placeholders = sql_placeholders(chunk.len());
let sql = format!(
"SELECT c.id,
c.source_path,
COALESCE(c.source_id, ''),
COALESCE(c.origin_host, ''),
COALESCE(s.kind, '')
FROM conversations c
LEFT JOIN sources s ON c.source_id = s.id
WHERE c.source_path IN ({placeholders})
ORDER BY c.id"
);
let params = chunk
.iter()
.map(|source_path| ParamValue::from(source_path.clone()))
.collect::<Vec<_>>();
let rows: Vec<(i64, String, String, String, String)> =
franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
Ok((
row.get_typed(0)?,
row.get_typed(1)?,
row.get_typed(2)?,
row.get_typed(3)?,
row.get_typed(4)?,
))
})?;
for (conversation_id, source_path, raw_source_id, origin_host, origin_kind) in rows
{
let normalized_source_id = normalized_search_hit_source_id_parts(
&raw_source_id,
&origin_kind,
(!origin_host.trim().is_empty()).then_some(origin_host.as_str()),
);
conversations_by_key
.entry((normalized_source_id, source_path))
.or_default()
.push(conversation_id);
}
}
let mut message_requests = Vec::new();
let mut fallback_keys_by_exact: HashMap<
TantivyContentExactKey,
Vec<TantivyContentFallbackKey>,
> = HashMap::new();
let mut seen_message_requests = HashSet::new();
for (source_id, source_path, line_idx) in &unique_fallback_keys {
let key = (source_id.clone(), source_path.clone());
let Some(conversation_ids) = conversations_by_key.get(&key) else {
continue;
};
for &conversation_id in conversation_ids {
let exact_key = (conversation_id, *line_idx);
if seen_message_requests.insert(exact_key) {
message_requests.push(exact_key);
}
fallback_keys_by_exact.entry(exact_key).or_default().push((
source_id.clone(),
source_path.clone(),
*line_idx,
));
}
}
for ((conversation_id, line_idx), content) in
hydrate_message_content_by_conversation(conn, &message_requests)?
{
if let Some(fallback_keys) =
fallback_keys_by_exact.get(&(conversation_id, line_idx))
{
for fallback_key in fallback_keys {
hydrated_fallback.insert(fallback_key.clone(), content.clone());
}
}
}
}
Ok((hydrated_exact, hydrated_fallback))
}
#[allow(clippy::too_many_arguments)]
fn search_tantivy(
&self,
reader: &IndexReader,
fields: &FsCassFields,
raw_query: &str,
sanitized_query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
field_mask: FieldMask,
) -> Result<(Vec<SearchHit>, usize)> {
struct PendingTantivyHit {
score: f32,
doc: TantivyDocument,
title: String,
stored_content: String,
stored_preview: String,
agent: String,
source_path: String,
workspace: String,
workspace_original: Option<String>,
created_at: Option<i64>,
line_number: Option<usize>,
stored_preview_snippet: Option<String>,
source_id: String,
conversation_id: Option<i64>,
raw_origin_kind: Option<String>,
origin_host: Option<String>,
}
self.maybe_reload_reader(reader)?;
let searcher = self.searcher_for_thread(reader);
self.track_generation(searcher.generation().generation_id());
let wants_snippet = field_mask.wants_snippet();
let needs_content = field_mask.needs_content() || wants_snippet;
let fs_filters = FsCassQueryFilters {
agents: filters.agents.into_iter().collect(),
workspaces: filters.workspaces.into_iter().collect(),
created_from: filters.created_from,
created_to: filters.created_to,
source_filter: match filters.source_filter {
SourceFilter::All => FsCassSourceFilter::All,
SourceFilter::Local => FsCassSourceFilter::Local,
SourceFilter::Remote => FsCassSourceFilter::Remote,
SourceFilter::SourceId(id) => {
FsCassSourceFilter::SourceId(normalize_search_source_filter_value(&id))
}
},
};
let q: Box<dyn Query> = fs_cass_build_tantivy_query(raw_query, &fs_filters, fields);
let prefix_only = is_prefix_only(sanitized_query);
let top_docs = execute_query_with_lazy_exact_count(&searcher, &*q, limit, offset)?;
let tantivy_total_count = top_docs.total_count;
let query_match_type = dominant_match_type(sanitized_query);
let mut pending_hits = Vec::with_capacity(top_docs.hits.len());
let mut missing_exact_content_keys = Vec::new();
let mut missing_fallback_content_keys = Vec::new();
for ranked_hit in top_docs.hits {
let score = ranked_hit.bm25_score;
let doc: TantivyDocument = fs_load_doc(&searcher, ranked_hit.doc_address)?;
let title = if field_mask.wants_title() {
doc.get_first(fields.title)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string()
} else {
String::new()
};
let stored_content = doc
.get_first(fields.content)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let stored_preview = doc
.get_first(fields.preview)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let stored_preview_snippet = snippet_from_preview_without_full_content(
field_mask,
&stored_preview,
sanitized_query,
);
let agent = doc
.get_first(fields.agent)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let workspace = doc
.get_first(fields.workspace)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let workspace_original = doc
.get_first(fields.workspace_original)
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.map(String::from);
let created_at = doc.get_first(fields.created_at).and_then(|v| v.as_i64());
let line_number = doc
.get_first(fields.msg_idx)
.and_then(|v| v.as_u64())
.and_then(|i| usize::try_from(i).ok())
.map(|i| i.saturating_add(1));
let raw_source_id = doc
.get_first(fields.source_id)
.and_then(|v| v.as_str())
.unwrap_or_default()
.to_string();
let conversation_id = fields
.conversation_id
.and_then(|field| doc.get_first(field))
.and_then(|v| v.as_i64());
let source_path = doc
.get_first(fields.source_path)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let raw_origin_kind = doc
.get_first(fields.origin_kind)
.and_then(|v| v.as_str())
.map(str::to_string);
let origin_host = doc
.get_first(fields.origin_host)
.and_then(|v| v.as_str())
.filter(|s| !s.is_empty())
.map(String::from);
let source_id = normalized_search_hit_source_id_parts(
raw_source_id.as_str(),
raw_origin_kind.as_deref().unwrap_or_default(),
origin_host.as_deref(),
);
let preview_satisfies_bounded_content =
field_mask.preview_content_limit().is_some() && !stored_preview.is_empty();
let preview_satisfies_full_content = field_mask.needs_content()
&& field_mask.preview_content_limit().is_none()
&& stored_preview_is_complete_content(&stored_preview);
if needs_content
&& let Some(line_idx) = line_number
.and_then(|line| line.checked_sub(1))
.and_then(|line| i64::try_from(line).ok())
&& stored_content.is_empty()
&& !preview_satisfies_bounded_content
&& !preview_satisfies_full_content
&& stored_preview_snippet.is_none()
{
if let Some(conversation_id) = conversation_id {
missing_exact_content_keys.push((conversation_id, line_idx));
} else {
missing_fallback_content_keys.push((
source_id.clone(),
source_path.clone(),
line_idx,
));
}
}
pending_hits.push(PendingTantivyHit {
score,
doc,
title,
stored_content,
stored_preview,
agent,
source_path,
workspace,
workspace_original,
created_at,
line_number,
stored_preview_snippet,
source_id,
conversation_id,
raw_origin_kind,
origin_host,
});
}
let (hydrated_contents, hydrated_fallback_contents) = if needs_content
&& (!missing_exact_content_keys.is_empty() || !missing_fallback_content_keys.is_empty())
{
self.hydrate_tantivy_hit_contents(
&missing_exact_content_keys,
&missing_fallback_content_keys,
)?
} else {
(HashMap::new(), HashMap::new())
};
let needs_tantivy_snippet_generator = wants_snippet
&& !prefix_only
&& pending_hits
.iter()
.any(|pending| pending.stored_preview_snippet.is_none());
let snippet_generator = if needs_tantivy_snippet_generator {
let snippet_cfg = FsSnippetConfig {
max_chars: 160,
highlight_prefix: "<b>".to_string(),
highlight_postfix: "</b>".to_string(),
};
fs_try_build_snippet_generator(&searcher, &*q, fields.content, &snippet_cfg)
} else {
None
};
let mut hits = Vec::with_capacity(pending_hits.len());
for pending in pending_hits {
let hydrated_content = pending
.line_number
.and_then(|line| line.checked_sub(1))
.and_then(|line| i64::try_from(line).ok())
.and_then(|line_idx| {
if let Some(conversation_id) = pending.conversation_id {
hydrated_contents.get(&(conversation_id, line_idx)).cloned()
} else {
hydrated_fallback_contents
.get(&(
pending.source_id.clone(),
pending.source_path.clone(),
line_idx,
))
.cloned()
}
});
let preview_satisfies_effective_content = !pending.stored_preview.is_empty()
&& (field_mask.preview_content_limit().is_some()
|| (field_mask.needs_content()
&& field_mask.preview_content_limit().is_none()
&& stored_preview_is_complete_content(&pending.stored_preview)));
let effective_content = if !pending.stored_content.is_empty() {
pending.stored_content.clone()
} else if preview_satisfies_effective_content {
pending.stored_preview.clone()
} else if let Some(content) = hydrated_content {
content
} else {
pending.stored_preview.clone()
};
let snippet = if wants_snippet {
if let Some(snippet) = pending.stored_preview_snippet.clone() {
snippet
} else if let Some(r#gen) = &snippet_generator {
let rendered = if !pending.stored_content.is_empty() {
fs_render_snippet_html(r#gen, &pending.doc, "<b>", "</b>")
} else if !effective_content.is_empty() {
let mut snippet_doc = TantivyDocument::new();
snippet_doc.add_text(fields.content, &effective_content);
fs_render_snippet_html(r#gen, &snippet_doc, "<b>", "</b>")
} else {
None
};
rendered
.map(|html| html.replace("<b>", "**").replace("</b>", "**"))
.or_else(|| cached_prefix_snippet(&effective_content, sanitized_query, 160))
.unwrap_or_else(|| {
quick_prefix_snippet(&effective_content, sanitized_query, 160)
})
} else if let Some(sn) =
cached_prefix_snippet(&effective_content, sanitized_query, 160)
{
sn
} else {
quick_prefix_snippet(&effective_content, sanitized_query, 160)
}
} else {
String::new()
};
let content = if field_mask.needs_content() {
effective_content.clone()
} else {
String::new()
};
let content_hash = stable_hit_hash(
&effective_content,
&pending.source_path,
pending.line_number,
pending.created_at,
);
let origin_kind = normalized_search_hit_origin_kind(
&pending.source_id,
pending.raw_origin_kind.as_deref(),
)
.to_string();
hits.push(SearchHit {
title: pending.title,
snippet,
content,
content_hash,
conversation_id: pending.conversation_id,
score: pending.score,
source_path: pending.source_path,
agent: pending.agent,
workspace: pending.workspace,
workspace_original: pending.workspace_original,
created_at: pending.created_at,
line_number: pending.line_number,
match_type: query_match_type,
source_id: pending.source_id,
origin_kind,
origin_host: pending.origin_host,
});
}
Ok((hits, tantivy_total_count))
}
#[allow(clippy::too_many_arguments)]
fn search_tantivy_federated(
&self,
readers: &[FederatedIndexReader],
raw_query: &str,
sanitized_query: &str,
filters: SearchFilters,
limit: usize,
field_mask: FieldMask,
) -> Result<(Vec<SearchHit>, usize)> {
let mut ranked_hits = Vec::new();
let mut total_count = 0usize;
for (shard_index, shard) in readers.iter().enumerate() {
let (shard_hits, shard_total_count) = self.search_tantivy(
&shard.reader,
&shard.fields,
raw_query,
sanitized_query,
filters.clone(),
limit,
0,
field_mask,
)?;
total_count = total_count.saturating_add(shard_total_count);
for (shard_rank, hit) in shard_hits.into_iter().enumerate() {
ranked_hits.push(FederatedRankedHit {
hit,
shard_index,
shard_rank,
fused_score: federated_rrf_score(shard_rank),
});
}
}
let raw_hit_count = ranked_hits.len();
let generation_signature = self.federated_generation_signature(readers);
self.track_generation(generation_signature);
let combined_hits = merge_federated_ranked_hits(ranked_hits);
tracing::debug!(
generation_signature,
shard_count = readers.len(),
total_count,
raw_hit_count,
returned_hit_count = combined_hits.len(),
merge_policy = "rrf_rank_then_stable_hit_key",
"federated lexical search merged shard results"
);
Ok((combined_hits, total_count))
}
fn sqlite_fts_uses_message_id_column(conn: &Connection) -> Result<bool> {
let params: [ParamValue; 0] = [];
let ddl_rows: Vec<String> = franken_query_map_collect_retry(
conn,
"SELECT COALESCE(sql, '')
FROM sqlite_master
WHERE name = 'fts_messages'
ORDER BY rowid DESC
LIMIT 1",
¶ms,
|row: &frankensqlite::Row| row.get_typed::<String>(0),
)?;
Ok(ddl_rows
.first()
.map(|sql| sql.to_ascii_lowercase().contains("message_id"))
.unwrap_or(false))
}
fn sqlite_fts5_rank_query(
fts_query: &str,
filters: &SearchFilters,
limit: usize,
offset: usize,
uses_message_id: bool,
) -> (String, Vec<ParamValue>) {
let normalized_source_sql =
normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
let created_at_expr = "CAST(fts_messages.created_at AS INTEGER)";
let message_key_expr = if uses_message_id {
"CAST(fts_messages.message_id AS INTEGER)"
} else {
"fts_messages.rowid"
};
let mut sql = format!(
"SELECT fts_messages.rowid,
bm25(fts_messages)
FROM fts_messages
LEFT JOIN messages m ON {message_key_expr} = m.id
LEFT JOIN conversations c ON m.conversation_id = c.id
LEFT JOIN sources s ON c.source_id = s.id
WHERE fts_messages MATCH ?"
);
let mut params = Vec::with_capacity(filters.agents.len() + filters.workspaces.len() + 5);
params.push(ParamValue::from(fts_query));
if !filters.agents.is_empty() {
let placeholders = sql_placeholders(filters.agents.len());
sql.push_str(&format!(" AND fts_messages.agent IN ({placeholders})"));
for agent in &filters.agents {
params.push(ParamValue::from(agent.as_str()));
}
}
if !filters.workspaces.is_empty() {
let placeholders = sql_placeholders(filters.workspaces.len());
sql.push_str(&format!(
" AND COALESCE(fts_messages.workspace, '') IN ({placeholders})"
));
for workspace in &filters.workspaces {
params.push(ParamValue::from(workspace.as_str()));
}
}
if let Some(created_from) = filters.created_from {
sql.push_str(&format!(" AND {created_at_expr} >= ?"));
params.push(ParamValue::from(created_from));
}
if let Some(created_to) = filters.created_to {
sql.push_str(&format!(" AND {created_at_expr} <= ?"));
params.push(ParamValue::from(created_to));
}
match &filters.source_filter {
SourceFilter::All => {}
SourceFilter::Local => sql.push_str(&format!(
" AND {normalized_source_sql} = '{local}'",
local = crate::sources::provenance::LOCAL_SOURCE_ID,
)),
SourceFilter::Remote => sql.push_str(&format!(
" AND {normalized_source_sql} != '{local}'",
local = crate::sources::provenance::LOCAL_SOURCE_ID,
)),
SourceFilter::SourceId(id) => {
sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
params.push(ParamValue::from(normalize_search_source_filter_value(id)));
}
}
sql.push_str(&format!(
" ORDER BY bm25(fts_messages), {message_key_expr}, fts_messages.rowid LIMIT ? OFFSET ?"
));
params.push(ParamValue::from(limit as i64));
params.push(ParamValue::from(offset as i64));
(sql, params)
}
fn sqlite_fts5_hydrate_query(
row_count: usize,
field_mask: FieldMask,
uses_message_id: bool,
) -> String {
let title_expr = if field_mask.wants_title() {
"fts_messages.title"
} else {
"''"
};
let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
"fts_messages.content"
} else {
"''"
};
let normalized_source_sql =
normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
let created_at_expr = "CAST(fts_messages.created_at AS INTEGER)";
let message_key_expr = if uses_message_id {
"CAST(fts_messages.message_id AS INTEGER)"
} else {
"fts_messages.rowid"
};
let placeholders = sql_placeholders(row_count);
format!(
"SELECT fts_messages.rowid,
{title_expr},
{content_expr},
fts_messages.agent,
COALESCE(fts_messages.workspace, ''),
fts_messages.source_path,
{created_at_expr},
m.idx,
c.id,
{normalized_source_sql},
c.origin_host,
s.kind
FROM fts_messages
LEFT JOIN messages m ON {message_key_expr} = m.id
LEFT JOIN conversations c ON m.conversation_id = c.id
LEFT JOIN sources s ON c.source_id = s.id
WHERE fts_messages.rowid IN ({placeholders})"
)
}
fn sqlite_fts5_hydrate_row_chunks(
ranked_rows: &[(i64, f64)],
) -> impl Iterator<Item = &[(i64, f64)]> {
const _: () = assert!(SQLITE_FTS5_HYDRATE_PARAM_CHUNK <= SQLITE_MAX_VARIABLE_NUMBER);
ranked_rows.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK)
}
fn search_sqlite_fts5(
&self,
_db_path: &Path,
raw_query: &str,
filters: SearchFilters,
limit: usize,
offset: usize,
field_mask: FieldMask,
) -> Result<Vec<SearchHit>> {
let fts_query = match transpile_to_fts5(raw_query) {
Some(q) if !q.trim().is_empty() => q,
_ => return Ok(Vec::new()),
};
let sqlite_guard = self.sqlite_guard()?;
let Some(conn) = sqlite_guard.as_ref() else {
return Ok(Vec::new());
};
let empty_params: [ParamValue; 0] = [];
let has_fts = franken_query_map_collect_retry(
conn,
"SELECT 1 FROM sqlite_master WHERE name = 'fts_messages'",
&empty_params,
|row| row.get_typed::<i64>(0),
)
.map(|rows| !rows.is_empty())
.unwrap_or(false);
if !has_fts {
return Ok(Vec::new());
}
let query_match_type = dominant_match_type(raw_query);
let uses_message_id =
if let Ok(uses_message_id) = Self::sqlite_fts_uses_message_id_column(conn) {
uses_message_id
} else {
tracing::warn!(
"sqlite FTS fallback is present but not queryable; skipping fallback search"
);
return Ok(Vec::new());
};
let (rank_sql, rank_params) = Self::sqlite_fts5_rank_query(
fts_query.as_str(),
&filters,
limit,
offset,
uses_message_id,
);
let ranked_rows: Vec<(i64, f64)> =
match franken_query_map_collect_retry(conn, &rank_sql, &rank_params, |row| {
Ok((row.get_typed(0)?, row.get_typed(1)?))
}) {
Ok(rows) => rows,
Err(err) => {
tracing::warn!(
error = %err,
"sqlite FTS fallback rank query failed; returning no fallback hits"
);
return Ok(Vec::new());
}
};
if ranked_rows.is_empty() {
return Ok(Vec::new());
}
let bm25_by_rowid: HashMap<i64, f64> = ranked_rows.iter().copied().collect();
let mut hits_by_rowid = HashMap::with_capacity(ranked_rows.len());
for rank_chunk in Self::sqlite_fts5_hydrate_row_chunks(&ranked_rows) {
let hydrate_sql =
Self::sqlite_fts5_hydrate_query(rank_chunk.len(), field_mask, uses_message_id);
let hydrate_params = rank_chunk
.iter()
.map(|(fts_rowid, _)| ParamValue::from(*fts_rowid))
.collect::<Vec<_>>();
let rows: Vec<SqliteFtsHydratedRow> =
match franken_query_map_collect_retry(conn, &hydrate_sql, &hydrate_params, |row| {
Ok((
row.get_typed(0)?,
row.get_typed(1)?,
row.get_typed(2)?,
row.get_typed(3)?,
row.get_typed(4)?,
row.get_typed(5)?,
row.get_typed(6)?,
row.get_typed(7)?,
row.get_typed(8)?,
row.get_typed::<Option<String>>(9)?,
row.get_typed(10)?,
row.get_typed(11)?,
))
}) {
Ok(rows) => rows,
Err(err) => {
tracing::warn!(
error = %err,
"sqlite FTS fallback hydration query failed; returning no fallback hits"
);
return Ok(Vec::new());
}
};
for (
fts_rowid,
title,
raw_content,
agent,
workspace,
source_path,
created_at,
idx,
conversation_id,
raw_source_id,
origin_host,
raw_origin_kind,
) in rows
{
let Some(&bm25_score) = bm25_by_rowid.get(&fts_rowid) else {
continue;
};
let raw_source_id = raw_source_id.unwrap_or_else(default_source_id);
let source_id = normalized_search_hit_source_id_parts(
raw_source_id.as_str(),
raw_origin_kind.as_deref().unwrap_or_default(),
origin_host.as_deref(),
);
let origin_kind = normalized_search_hit_origin_kind(
source_id.as_str(),
raw_origin_kind.as_deref(),
)
.to_string();
let line_number = idx
.and_then(|i| usize::try_from(i).ok())
.map(|i| i.saturating_add(1));
let snippet = if field_mask.wants_snippet() {
snippet_from_content(&raw_content)
} else {
String::new()
};
let content = if field_mask.needs_content() {
raw_content
} else {
String::new()
};
let content_hash = if content.is_empty() {
stable_hit_hash(&snippet, &source_path, line_number, created_at)
} else {
stable_hit_hash(&content, &source_path, line_number, created_at)
};
let hit = SearchHit {
title,
snippet,
content,
content_hash,
conversation_id,
score: (-bm25_score) as f32,
source_path,
agent,
workspace,
workspace_original: None,
created_at,
line_number,
match_type: query_match_type,
source_id,
origin_kind,
origin_host,
};
hits_by_rowid.insert(fts_rowid, hit);
}
}
let mut hits = Vec::with_capacity(ranked_rows.len());
for (fts_rowid, _) in ranked_rows {
if let Some(hit) = hits_by_rowid.remove(&fts_rowid) {
hits.push(hit);
}
}
Ok(hits)
}
pub fn browse_by_date(
&self,
filters: SearchFilters,
limit: usize,
offset: usize,
newest_first: bool,
field_mask: FieldMask,
) -> Result<Vec<SearchHit>> {
let sqlite_guard = self.sqlite_guard()?;
if let Some(conn) = sqlite_guard.as_ref() {
self.browse_by_date_sqlite(conn, filters, limit, offset, newest_first, field_mask)
} else {
Ok(Vec::new())
}
}
fn browse_by_date_sqlite(
&self,
conn: &Connection,
filters: SearchFilters,
limit: usize,
offset: usize,
newest_first: bool,
field_mask: FieldMask,
) -> Result<Vec<SearchHit>> {
let order = if newest_first { "DESC" } else { "ASC" };
let title_expr = if field_mask.wants_title() {
"c.title"
} else {
"''"
};
let normalized_source_sql =
normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
let mut sql = format!(
"SELECT c.id, {title_expr}, m.content, \
COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'), \
w.path, c.source_path, m.created_at, m.idx, \
{normalized_source_sql}, c.origin_host, s.kind
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
LEFT JOIN workspaces w ON c.workspace_id = w.id
LEFT JOIN sources s ON c.source_id = s.id
WHERE 1=1"
);
let mut params: Vec<ParamValue> = Vec::new();
if !filters.agents.is_empty() {
let placeholders = sql_placeholders(filters.agents.len());
sql.push_str(&format!(
" AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug IN ({placeholders}))"
));
for a in &filters.agents {
params.push(ParamValue::from(a.as_str()));
}
}
if !filters.workspaces.is_empty() {
let placeholders = sql_placeholders(filters.workspaces.len());
sql.push_str(&format!(" AND COALESCE(w.path, '') IN ({placeholders})"));
for w in &filters.workspaces {
params.push(ParamValue::from(w.as_str()));
}
}
if let Some(created_from) = filters.created_from {
sql.push_str(" AND m.created_at >= ?");
params.push(ParamValue::from(created_from));
}
if let Some(created_to) = filters.created_to {
sql.push_str(" AND m.created_at <= ?");
params.push(ParamValue::from(created_to));
}
match &filters.source_filter {
SourceFilter::All => {}
SourceFilter::Local => sql.push_str(&format!(
" AND {normalized_source_sql} = '{local}'",
local = crate::sources::provenance::LOCAL_SOURCE_ID,
)),
SourceFilter::Remote => sql.push_str(&format!(
" AND {normalized_source_sql} != '{local}'",
local = crate::sources::provenance::LOCAL_SOURCE_ID,
)),
SourceFilter::SourceId(id) => {
sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
params.push(ParamValue::from(normalize_search_source_filter_value(id)));
}
}
sql.push_str(&format!(
" ORDER BY CASE WHEN m.created_at IS NULL THEN 1 ELSE 0 END, m.created_at {order}, m.id {order} LIMIT ? OFFSET ?"
));
params.push(ParamValue::from(limit as i64));
params.push(ParamValue::from(offset as i64));
let rows: Vec<SearchHit> =
conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
let conversation_id: i64 = row.get_typed(0)?;
let title: String = if field_mask.wants_title() {
row.get_typed::<Option<String>>(1)?.unwrap_or_default()
} else {
String::new()
};
let raw_content: String = row.get_typed(2)?;
let agent: String = row.get_typed(3)?;
let workspace: Option<String> = row.get_typed(4)?;
let source_path: String = row.get_typed(5)?;
let created_at: Option<i64> = row.get_typed(6)?;
let idx: Option<i64> = row.get_typed(7)?;
let raw_source_id: String = row
.get_typed::<Option<String>>(8)?
.unwrap_or_else(default_source_id);
let origin_host: Option<String> = row.get_typed(9)?;
let raw_origin_kind: Option<String> = row.get_typed(10)?;
let source_id = normalized_search_hit_source_id_parts(
raw_source_id.as_str(),
raw_origin_kind.as_deref().unwrap_or_default(),
origin_host.as_deref(),
);
let origin_kind = normalized_search_hit_origin_kind(
source_id.as_str(),
raw_origin_kind.as_deref(),
);
let line_number = idx
.and_then(|i| usize::try_from(i).ok())
.map(|i| i.saturating_add(1));
let snippet = if field_mask.wants_snippet() {
snippet_from_content(&raw_content)
} else {
String::new()
};
let content = if field_mask.needs_content() {
raw_content.clone()
} else {
String::new()
};
let content_hash =
stable_hit_hash(&raw_content, &source_path, line_number, created_at);
Ok(SearchHit {
title,
snippet,
content,
content_hash,
conversation_id: Some(conversation_id),
score: 0.0,
source_path,
agent,
workspace: workspace.unwrap_or_default(),
workspace_original: None,
created_at,
line_number,
match_type: MatchType::Exact,
source_id,
origin_kind,
origin_host,
})
})?;
Ok(rows)
}
}
#[doc(hidden)]
pub fn fuzz_transpile_to_fts5(raw_query: &str) -> Option<String> {
transpile_to_fts5(raw_query)
}
fn transpile_to_fts5(raw_query: &str) -> Option<String> {
let tokens = fs_cass_parse_boolean_query(raw_query);
if tokens.is_empty() {
return Some("".to_string());
}
let mut fts_clauses: Vec<(&str, String)> = Vec::new();
let mut pending_or_group: Vec<String> = Vec::new();
let mut next_op = "AND";
let mut in_or_sequence = false;
for token in tokens {
match token {
FsCassQueryToken::And => {
if !pending_or_group.is_empty() {
let group = if pending_or_group.len() > 1 {
format!("({})", pending_or_group.join(" OR "))
} else {
pending_or_group.pop().unwrap_or_default()
};
fts_clauses.push(("AND", group));
pending_or_group.clear();
}
in_or_sequence = false;
next_op = "AND";
}
FsCassQueryToken::Or => {
if fts_clauses.is_empty() && pending_or_group.is_empty() {
continue;
}
in_or_sequence = true;
}
FsCassQueryToken::Not => {
if in_or_sequence {
return None;
}
if fts_clauses.is_empty() && pending_or_group.is_empty() {
return None;
}
if !pending_or_group.is_empty() {
let group = if pending_or_group.len() > 1 {
format!("({})", pending_or_group.join(" OR "))
} else {
pending_or_group.pop().unwrap_or_default()
};
fts_clauses.push(("AND", group));
pending_or_group.clear();
}
in_or_sequence = false;
next_op = "NOT";
}
FsCassQueryToken::Term(t) => {
let raw_pattern = FsCassWildcardPattern::parse(&t);
if matches!(
raw_pattern,
FsCassWildcardPattern::Suffix(_)
| FsCassWildcardPattern::Substring(_)
| FsCassWildcardPattern::Complex(_)
) {
return None;
}
let term_parts = normalize_term_parts(&t);
if term_parts.is_empty() {
continue;
}
let mut rendered_parts = Vec::with_capacity(term_parts.len());
for part in &term_parts {
rendered_parts.push(render_fts5_term_part(part)?);
}
let fts_term = if rendered_parts.len() > 1 {
format!("({})", rendered_parts.join(" AND "))
} else {
rendered_parts[0].clone()
};
if in_or_sequence {
if pending_or_group.is_empty() {
let (op, _) = fts_clauses.last()?;
if *op != "AND" {
return None;
}
let (_, val) = fts_clauses.pop()?;
pending_or_group.push(val);
}
pending_or_group.push(fts_term);
in_or_sequence = true;
} else {
fts_clauses.push((next_op, fts_term));
}
next_op = "AND";
}
FsCassQueryToken::Phrase(p) => {
let phrase_parts = normalize_phrase_terms(&p);
if phrase_parts.is_empty() {
continue;
}
let fts_phrase = format!("\"{}\"", phrase_parts.join(" "));
if in_or_sequence {
if pending_or_group.is_empty() {
let (op, _) = fts_clauses.last()?;
if *op != "AND" {
return None;
}
let (_, val) = fts_clauses.pop()?;
pending_or_group.push(val);
}
pending_or_group.push(fts_phrase);
in_or_sequence = true;
} else {
fts_clauses.push((next_op, fts_phrase));
}
next_op = "AND";
}
}
}
if !pending_or_group.is_empty() {
let group = if pending_or_group.len() > 1 {
format!("({})", pending_or_group.join(" OR "))
} else {
pending_or_group.pop().unwrap_or_default()
};
fts_clauses.push((next_op, group));
}
if fts_clauses.is_empty() {
return Some("".to_string());
}
if fts_clauses.first().is_some_and(|(op, _)| *op == "NOT") {
return None;
}
let mut query = String::new();
for (i, (op, text)) in fts_clauses.into_iter().enumerate() {
if i > 0 {
query.push_str(&format!(" {} ", op));
}
query.push_str(&text);
}
Some(query)
}
#[derive(Default, Clone)]
struct Metrics {
cache_hits: Arc<AtomicU64>,
cache_miss: Arc<AtomicU64>,
cache_shortfall: Arc<AtomicU64>,
reloads: Arc<AtomicU64>,
reload_ms_total: Arc<AtomicU64>,
prewarm_scheduled: Arc<AtomicU64>,
prewarm_skipped_pressure: Arc<AtomicU64>,
}
impl Metrics {
fn inc_cache_hits(&self) {
self.cache_hits.fetch_add(1, Ordering::Relaxed);
}
fn inc_cache_miss(&self) {
self.cache_miss.fetch_add(1, Ordering::Relaxed);
}
fn inc_cache_shortfall(&self) {
self.cache_shortfall.fetch_add(1, Ordering::Relaxed);
}
fn inc_prewarm_scheduled(&self) {
self.prewarm_scheduled.fetch_add(1, Ordering::Relaxed);
}
fn inc_prewarm_skipped_pressure(&self) {
self.prewarm_skipped_pressure
.fetch_add(1, Ordering::Relaxed);
}
fn inc_reload(&self) {
self.reloads.fetch_add(1, Ordering::Relaxed);
}
fn record_reload(&self, duration: Duration) {
self.inc_reload();
self.reload_ms_total
.fetch_add(duration.as_millis() as u64, Ordering::Relaxed);
}
fn snapshot_all(&self) -> (u64, u64, u64, u64, u128) {
(
self.cache_hits.load(Ordering::Relaxed),
self.cache_miss.load(Ordering::Relaxed),
self.cache_shortfall.load(Ordering::Relaxed),
self.reloads.load(Ordering::Relaxed),
self.reload_ms_total.load(Ordering::Relaxed) as u128,
)
}
fn snapshot_prewarm(&self) -> (u64, u64) {
(
self.prewarm_scheduled.load(Ordering::Relaxed),
self.prewarm_skipped_pressure.load(Ordering::Relaxed),
)
}
#[cfg(test)]
#[allow(dead_code)]
fn reset(&self) {
self.cache_hits.store(0, Ordering::Relaxed);
self.cache_miss.store(0, Ordering::Relaxed);
self.cache_shortfall.store(0, Ordering::Relaxed);
self.reloads.store(0, Ordering::Relaxed);
self.reload_ms_total.store(0, Ordering::Relaxed);
self.prewarm_scheduled.store(0, Ordering::Relaxed);
self.prewarm_skipped_pressure.store(0, Ordering::Relaxed);
}
}
fn maybe_spawn_warm_worker(
reader: IndexReader,
fields: FsCassFields,
reload_epoch: Arc<AtomicU64>,
metrics: Metrics,
) -> Option<(mpsc::Sender<WarmJob>, std::thread::JoinHandle<()>)> {
let (tx, rx) = mpsc::unbounded::<WarmJob>();
let handle = std::thread::Builder::new()
.name("cass-warm-worker".into())
.spawn(move || {
let mut last_run = Instant::now();
while let Ok(job) = rx.recv() {
let now = Instant::now();
if now.duration_since(last_run) < Duration::from_millis(*WARM_DEBOUNCE_MS) {
continue;
}
last_run = now;
let reload_started = Instant::now();
if let Err(err) = reader.reload() {
tracing::warn!(error = ?err, "warm_worker_reload_failed");
continue;
}
let elapsed = reload_started.elapsed();
let epoch = reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
metrics.record_reload(elapsed);
tracing::debug!(
duration_ms = elapsed.as_millis() as u64,
reload_epoch = epoch,
filters = %job.filters_fingerprint,
shard = %job.shard_name,
"warm_worker_reload"
);
let searcher = reader.searcher();
let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
for term_str in job.query.split_whitespace() {
let term_lower = term_str.to_lowercase();
let term_shoulds: Vec<(Occur, Box<dyn Query>)> = vec![
(
Occur::Should,
Box::new(TermQuery::new(
Term::from_field_text(fields.title, &term_lower),
IndexRecordOption::WithFreqsAndPositions,
)),
),
(
Occur::Should,
Box::new(TermQuery::new(
Term::from_field_text(fields.content, &term_lower),
IndexRecordOption::WithFreqsAndPositions,
)),
),
];
clauses.push((Occur::Must, Box::new(BooleanQuery::new(term_shoulds))));
}
if !clauses.is_empty() {
let q: Box<dyn Query> = Box::new(BooleanQuery::new(clauses));
let _ = searcher.search(&q, &TopDocs::with_limit(1).order_by_score());
}
}
})
.ok()?;
Some((tx, handle))
}
fn cached_hit_from(hit: &SearchHit) -> CachedHit {
let cache_text = if hit.content.is_empty() {
hit.snippet.as_str()
} else {
hit.content.as_str()
};
let lc_content = cache_text.to_lowercase();
let lc_title = (!hit.title.is_empty()).then(|| hit.title.to_lowercase());
let bloom64 = bloom_from_text(&lc_content, &lc_title);
CachedHit {
hit: hit.clone(),
lc_content,
lc_title,
bloom64,
}
}
fn bloom_from_text(content: &str, title: &Option<String>) -> u64 {
let mut bits = 0u64;
for token in token_stream(content) {
bits |= hash_token(token);
}
if let Some(t) = title {
for token in token_stream(t) {
bits |= hash_token(token);
}
}
bits
}
fn token_stream(text: &str) -> impl Iterator<Item = &str> {
text.split(|c: char| !c.is_alphanumeric())
.filter(|s| !s.is_empty())
}
fn hash_token(tok: &str) -> u64 {
let mut h: u64 = 5381;
for b in tok.as_bytes() {
h = ((h << 5).wrapping_add(h)).wrapping_add(u64::from(*b));
}
1u64 << (h % 64)
}
struct QueryTermsLower {
query_lower: String,
token_ranges: Vec<(usize, usize)>,
bloom_mask: u64,
}
impl QueryTermsLower {
fn from_query(query: &str) -> Self {
if query.is_empty() {
return Self {
query_lower: String::new(),
token_ranges: Vec::new(),
bloom_mask: 0,
};
}
let query_lower = query.to_lowercase();
let mut token_ranges = Vec::new();
let mut bloom_mask = 0u64;
let mut start = None;
for (i, c) in query_lower.char_indices() {
if c.is_alphanumeric() {
if start.is_none() {
start = Some(i);
}
} else if let Some(s) = start.take() {
let token = &query_lower[s..i];
bloom_mask |= hash_token(token);
token_ranges.push((s, i));
}
}
if let Some(s) = start {
let token = &query_lower[s..];
bloom_mask |= hash_token(token);
token_ranges.push((s, query_lower.len()));
}
Self {
query_lower,
token_ranges,
bloom_mask,
}
}
#[inline]
fn is_empty(&self) -> bool {
self.token_ranges.is_empty()
}
#[inline]
fn tokens(&self) -> impl Iterator<Item = &str> {
self.token_ranges
.iter()
.map(|(s, e)| &self.query_lower[*s..*e])
}
#[inline]
fn bloom_mask(&self) -> u64 {
self.bloom_mask
}
}
fn hit_matches_query_cached_precomputed(hit: &CachedHit, terms: &QueryTermsLower) -> bool {
if terms.is_empty() {
return true;
}
if hit.bloom64 & terms.bloom_mask() != terms.bloom_mask() {
return false;
}
terms.tokens().all(|t| {
if token_stream(&hit.lc_content).any(|word| word.starts_with(t)) {
return true;
}
if let Some(title) = &hit.lc_title
&& token_stream(title).any(|word| word.starts_with(t))
{
return true;
}
false
})
}
#[cfg(test)]
fn hit_matches_query_cached(hit: &CachedHit, query: &str) -> bool {
let terms = QueryTermsLower::from_query(query);
hit_matches_query_cached_precomputed(hit, &terms)
}
fn is_prefix_only(query: &str) -> bool {
let tokens: Vec<&str> = query.split_whitespace().collect();
if tokens.len() != 1 {
return false;
}
tokens[0].chars().all(char::is_alphanumeric)
}
fn quick_prefix_snippet(content: &str, query: &str, max_chars: usize) -> String {
if query.is_empty() {
let mut chars = content.chars();
let snippet: String = chars.by_ref().take(max_chars).collect();
return if chars.next().is_some() {
format!("{snippet}…")
} else {
snippet
};
}
let lc_content = content.to_lowercase();
let lc_query = query.to_lowercase();
if let Some(pos) = lc_content.find(&lc_query) {
let match_start_char_idx = lc_content[..pos].chars().count();
let query_char_len = lc_query.chars().count();
let start_char = match_start_char_idx.saturating_sub(15);
let mut chars_iter = content.chars().skip(start_char);
let mut snippet = String::new();
let mut chars_taken = 0;
let mut current_idx = start_char;
while chars_taken < max_chars {
if current_idx == match_start_char_idx {
snippet.push_str("**");
for _ in 0..query_char_len {
if let Some(ch) = chars_iter.next() {
snippet.push(ch);
chars_taken += 1;
current_idx += 1;
}
}
snippet.push_str("**");
if chars_taken >= max_chars {
break;
}
continue;
}
if let Some(ch) = chars_iter.next() {
snippet.push(ch);
chars_taken += 1;
current_idx += 1;
} else {
break;
}
}
if chars_iter.next().is_some() {
format!("{snippet}…")
} else {
snippet
}
} else {
let mut chars = content.chars();
let snippet: String = chars.by_ref().take(max_chars).collect();
if chars.next().is_some() {
format!("{snippet}…")
} else {
snippet
}
}
}
fn cached_prefix_snippet(content: &str, query: &str, max_chars: usize) -> Option<String> {
if query.trim().is_empty() {
return None;
}
let lc_content = content.to_lowercase();
let lc_query = query.to_lowercase();
lc_content.find(&lc_query).map(|pos| {
let match_start_char_idx = lc_content[..pos].chars().count();
let query_char_len = lc_query.chars().count();
let start_char = match_start_char_idx.saturating_sub(15);
let mut chars_iter = content.chars().skip(start_char);
let mut snippet = String::new();
let mut chars_taken = 0;
let mut current_idx = start_char;
while chars_taken < max_chars {
if current_idx == match_start_char_idx {
snippet.push_str("**");
for _ in 0..query_char_len {
if let Some(ch) = chars_iter.next() {
snippet.push(ch);
chars_taken += 1;
current_idx += 1;
}
}
snippet.push_str("**");
if chars_taken >= max_chars {
break;
}
continue;
}
if let Some(ch) = chars_iter.next() {
snippet.push(ch);
chars_taken += 1;
current_idx += 1;
} else {
break;
}
}
if chars_iter.next().is_some() {
format!("{snippet}…")
} else {
snippet
}
})
}
fn filters_fingerprint(filters: &SearchFilters) -> String {
let mut parts = Vec::new();
if !filters.agents.is_empty() {
let mut v: Vec<_> = filters.agents.iter().cloned().collect();
v.sort();
parts.push(format!("a:{v:?}"));
}
if !filters.workspaces.is_empty() {
let mut v: Vec<_> = filters.workspaces.iter().cloned().collect();
v.sort();
parts.push(format!("w:{v:?}"));
}
if let Some(f) = filters.created_from {
parts.push(format!("from:{f}"));
}
if let Some(t) = filters.created_to {
parts.push(format!("to:{t}"));
}
if !matches!(
filters.source_filter,
crate::sources::provenance::SourceFilter::All
) {
parts.push(format!("src:{:?}", filters.source_filter));
}
if !filters.session_paths.is_empty() {
let mut v: Vec<_> = filters.session_paths.iter().cloned().collect();
v.sort();
parts.push(format!("sp:{v:?}"));
}
parts.join("|")
}
impl SearchClient {
pub fn total_docs(&self) -> usize {
if let Some((reader, _)) = &self.reader {
return reader.searcher().num_docs() as usize;
}
self.federated_readers()
.map(|readers| {
readers
.iter()
.map(|shard| shard.reader.searcher().num_docs() as usize)
.sum()
})
.unwrap_or(0)
}
pub fn has_tantivy(&self) -> bool {
self.reader.is_some() || self.federated_readers().is_some()
}
fn maybe_reload_reader(&self, reader: &IndexReader) -> Result<()> {
if !self.reload_on_search {
return Ok(());
}
const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
let now = Instant::now();
let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
if guard
.map(|t| now.duration_since(t) >= MIN_RELOAD_INTERVAL)
.unwrap_or(true)
{
let reload_started = Instant::now();
reader.reload()?;
let elapsed = reload_started.elapsed();
*guard = Some(now);
let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
self.metrics.record_reload(elapsed);
tracing::debug!(
duration_ms = elapsed.as_millis() as u64,
reload_epoch = epoch,
"tantivy_reader_reload"
);
}
Ok(())
}
fn maybe_log_cache_metrics(&self, event: &str) {
if !*CACHE_DEBUG_ENABLED {
return;
}
let stats = self.cache_stats();
tracing::debug!(
event = event,
hits = stats.cache_hits,
miss = stats.cache_miss,
shortfall = stats.cache_shortfall,
reloads = stats.reloads,
reload_ms_total = stats.reload_ms_total,
total_cap = stats.total_cap,
total_cost = stats.total_cost,
evictions = stats.eviction_count,
approx_bytes = stats.approx_bytes,
byte_cap = stats.byte_cap,
eviction_policy = stats.eviction_policy,
ghost_entries = stats.ghost_entries,
admission_rejects = stats.admission_rejects,
"cache_metrics"
);
}
fn cache_key(&self, query: &str, filters: &SearchFilters) -> Arc<str> {
let key_str = format!(
"{}|{}::{}",
self.cache_namespace,
query,
filters_fingerprint(filters)
);
intern_cache_key(&key_str)
}
fn shard_name(&self, filters: &SearchFilters) -> String {
if filters.agents.len() == 1 {
format!(
"agent:{}",
filters
.agents
.iter()
.next()
.cloned()
.unwrap_or_else(|| "global".into())
)
} else if filters.workspaces.len() == 1 {
format!(
"workspace:{}",
filters
.workspaces
.iter()
.next()
.cloned()
.unwrap_or_else(|| "global".into())
)
} else {
"global".into()
}
}
fn cached_prefix_key_exists_in_shard(
&self,
shard: &LruCache<Arc<str>, Vec<CachedHit>>,
query: &str,
filters: &SearchFilters,
) -> bool {
let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
byte_indices.push(query.len());
let query_len = query.len();
for &end in byte_indices.iter().rev() {
if end == 0 || end == query_len {
continue;
}
let key = self.cache_key(&query[..end], filters);
if shard.contains(&key) {
return true;
}
}
false
}
fn maybe_schedule_adaptive_query_prewarm(&self, query: &str, filters: &SearchFilters) {
if query.is_empty() {
return;
}
let Some(tx) = &self.warm_tx else {
return;
};
let shard_name = self.shard_name(filters);
let decision = match self.prefix_cache.lock() {
Ok(cache) => {
let hot_prefix = cache.shard_opt(&shard_name).is_some_and(|shard| {
self.cached_prefix_key_exists_in_shard(shard, query, filters)
});
if !hot_prefix {
AdaptivePrewarmDecision::SkipCold
} else if cache.prewarm_pressure() {
AdaptivePrewarmDecision::SkipPressure
} else {
AdaptivePrewarmDecision::Schedule
}
}
Err(_) => return,
};
if decision == AdaptivePrewarmDecision::SkipPressure {
self.metrics.inc_prewarm_skipped_pressure();
return;
}
if decision == AdaptivePrewarmDecision::SkipCold {
return;
}
if tx
.send(WarmJob {
query: query.to_string(),
filters_fingerprint: filters_fingerprint(filters),
shard_name,
})
.is_ok()
{
self.metrics.inc_prewarm_scheduled();
}
}
fn cached_prefix_hits(&self, query: &str, filters: &SearchFilters) -> Option<Vec<CachedHit>> {
if query.is_empty() {
return None;
}
let cache = self.prefix_cache.lock().ok()?;
let shard_name = self.shard_name(filters);
let shard = cache.shard_opt(&shard_name)?;
let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
byte_indices.push(query.len());
for &end in byte_indices.iter().rev() {
if end == 0 {
continue;
}
let key = self.cache_key(&query[..end], filters);
if let Some(hits) = shard.peek(&key) {
return Some(hits.clone());
}
}
None
}
fn put_cache(&self, query: &str, filters: &SearchFilters, hits: &[SearchHit]) {
if query.is_empty() || hits.is_empty() {
return;
}
if let Ok(mut cache) = self.prefix_cache.lock() {
let shard_name = self.shard_name(filters);
let key = self.cache_key(query, filters);
let cached_hits: Vec<CachedHit> = hits.iter().map(cached_hit_from).collect();
cache.put(&shard_name, key, cached_hits);
}
}
pub fn cache_stats(&self) -> CacheStats {
let (hits, miss, shortfall, reloads, reload_ms_total) = self.metrics.snapshot_all();
let (prewarm_scheduled, prewarm_skipped_pressure) = self.metrics.snapshot_prewarm();
let reader_generation = self.last_generation.lock().ok().and_then(|guard| *guard);
let (
total_cap,
total_cost,
eviction_count,
approx_bytes,
byte_cap,
eviction_policy,
ghost_entries,
admission_rejects,
) = if let Ok(cache) = self.prefix_cache.lock() {
(
cache.total_cap(),
cache.total_cost(),
cache.eviction_count(),
cache.total_bytes(),
cache.byte_cap(),
cache.policy_label(),
cache.ghost_entries(),
cache.admission_rejects(),
)
} else {
(0, 0, 0, 0, 0, "unknown", 0, 0)
};
CacheStats {
cache_hits: hits,
cache_miss: miss,
cache_shortfall: shortfall,
reloads,
reload_ms_total,
total_cap,
total_cost,
eviction_count,
approx_bytes,
byte_cap,
eviction_policy,
ghost_entries,
admission_rejects,
prewarm_scheduled,
prewarm_skipped_pressure,
reader_generation,
}
}
}
#[cfg(test)]
mod tests {
use super::*;
use crate::connectors::{NormalizedConversation, NormalizedMessage, NormalizedSnippet};
use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
use crate::search::tantivy::TantivyIndex;
use crate::storage::sqlite::FrankenStorage;
use frankensqlite::Connection as FrankenConnection;
use frankensqlite::compat::{ParamValue, params_from_iter};
use serde_json::json;
use tempfile::TempDir;
fn search_hit_key_doc_id_reference_v0(key: &SearchHitKey) -> String {
let sep = '\u{1f}';
format!(
"{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}",
key.source_id,
key.source_path,
key.conversation_id
.map(|v| v.to_string())
.unwrap_or_default(),
key.title,
key.line_number.map(|v| v.to_string()).unwrap_or_default(),
key.created_at.map(|v| v.to_string()).unwrap_or_default(),
key.content_hash,
)
}
fn stable_hit_hash_reference_v0(
content: &str,
source_path: &str,
line_number: Option<usize>,
created_at: Option<i64>,
) -> u64 {
use xxhash_rust::xxh3::Xxh3;
let mut hasher = Xxh3::new();
if !content.is_empty() {
hasher.update(&stable_content_hash(content).to_le_bytes());
}
hasher.update(b"|");
hasher.update(source_path.as_bytes());
hasher.update(b"|");
if let Some(line) = line_number {
hasher.update(line.to_string().as_bytes());
}
hasher.update(b"|");
if let Some(ts) = created_at {
hasher.update(ts.to_string().as_bytes());
}
hasher.digest()
}
fn vector_result(message_id: u64, score: f32) -> VectorSearchResult {
VectorSearchResult {
message_id,
chunk_idx: 0,
score,
}
}
#[test]
fn semantic_exact_candidate_limit_overfetches_chunks_without_full_scan() {
assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 1_000), 40);
assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 25), 25);
assert_eq!(SearchClient::semantic_exact_candidate_limit(0, 1_000), 0);
assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 0), 0);
}
#[test]
fn semantic_window_detects_possible_hidden_chunk_competitors() {
let complete = vec![
vector_result(1, 0.9),
vector_result(2, 0.8),
vector_result(3, 0.7),
];
assert!(
!SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.6)),
"strictly lower omitted chunks cannot alter the top message window"
);
assert!(
SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.7)),
"equal-score omitted chunks can still alter deterministic tie-breaking"
);
let duplicate_collapsed_shortfall = vec![vector_result(1, 0.9)];
assert!(
SearchClient::semantic_window_may_omit_competitor(
&duplicate_collapsed_shortfall,
3,
Some(0.2),
),
"a short collapsed window means high-scoring duplicate chunks may have hidden messages"
);
assert!(!SearchClient::semantic_window_may_omit_competitor(
&complete, 3, None
));
}
#[test]
fn stable_hit_hash_matches_reference_and_is_deterministic() {
let fixtures = [
("", "", None, None),
(
"same content\nnormalized",
"/tmp/session.jsonl",
Some(1),
Some(0),
),
(
"tool output with repeated whitespace",
"/tmp/path with spaces.jsonl",
Some(42),
Some(1_700_000_000_000),
),
(
"unicode stays in the content hash path: café",
"/remote/host/session.jsonl",
Some(usize::MAX),
Some(i64::MIN),
),
(
"negative timestamp fixture",
"/tmp/negative.jsonl",
None,
Some(-123_456),
),
];
for (content, source_path, line_number, created_at) in fixtures {
let optimized = stable_hit_hash(content, source_path, line_number, created_at);
let repeated = stable_hit_hash(content, source_path, line_number, created_at);
let reference =
stable_hit_hash_reference_v0(content, source_path, line_number, created_at);
assert_eq!(optimized, repeated);
assert_eq!(optimized, reference);
}
}
#[test]
fn semantic_message_id_from_db_rejects_negative_values() {
let err = semantic_message_id_from_db(-1).expect_err("negative DB ids must be rejected");
assert!(
err.to_string().contains("negative message_id"),
"unexpected error: {err}"
);
assert_eq!(semantic_message_id_from_db(42).expect("positive id"), 42);
}
#[test]
fn semantic_doc_component_id_from_db_clamps_bounds() {
assert_eq!(semantic_doc_component_id_from_db(None), 0);
assert_eq!(semantic_doc_component_id_from_db(Some(-7)), 0);
assert_eq!(semantic_doc_component_id_from_db(Some(0)), 0);
assert_eq!(semantic_doc_component_id_from_db(Some(7)), 7);
assert_eq!(
semantic_doc_component_id_from_db(Some(i64::from(u32::MAX) + 123)),
u32::MAX
);
}
#[test]
fn search_hit_key_doc_id_matches_reference_byte_for_byte() {
let fixtures = [
SearchHitKey {
source_id: "local".into(),
source_path: "/tmp/path.jsonl".into(),
conversation_id: Some(42),
title: "Demo chat".into(),
line_number: Some(7),
created_at: Some(1_700_000_000_000),
content_hash: 0xdead_beef_u64,
},
SearchHitKey {
source_id: "ssh:host".into(),
source_path: "/remote/path with spaces.jsonl".into(),
conversation_id: None,
title: String::new(),
line_number: None,
created_at: None,
content_hash: 0,
},
SearchHitKey {
source_id: String::new(),
source_path: String::new(),
conversation_id: Some(i64::MIN),
title: "unicode title — héllo".into(),
line_number: Some(usize::MAX),
created_at: Some(i64::MAX),
content_hash: u64::MAX,
},
SearchHitKey {
source_id: "a".into(),
source_path: "b".into(),
conversation_id: Some(0),
title: "c".into(),
line_number: Some(0),
created_at: Some(0),
content_hash: 0,
},
SearchHitKey {
source_id: "with\u{1f}separator".into(),
source_path: "with\u{1f}separator".into(),
conversation_id: Some(-1),
title: "with\u{1f}separator".into(),
line_number: None,
created_at: Some(-1),
content_hash: 1,
},
];
for (idx, key) in fixtures.iter().enumerate() {
let optimized = search_hit_key_doc_id(key);
let reference = search_hit_key_doc_id_reference_v0(key);
assert_eq!(
optimized, reference,
"fixture {idx} produced divergent doc_id; byte-exact dedup key is a contract"
);
}
let structural_key = SearchHitKey {
source_id: "clean".into(),
source_path: "/no/separators/here.jsonl".into(),
conversation_id: Some(1),
title: "plain title".into(),
line_number: Some(2),
created_at: Some(3),
content_hash: 4,
};
let encoded = search_hit_key_doc_id(&structural_key);
assert_eq!(
encoded.matches('\u{1f}').count(),
6,
"structural fixture must contain exactly six 0x1F separators; got {encoded:?}"
);
}
#[derive(Debug)]
struct FixedTestEmbedder {
id: String,
vector: Vec<f32>,
}
impl FixedTestEmbedder {
fn new(id: &str, vector: &[f32]) -> Self {
Self {
id: id.to_string(),
vector: vector.to_vec(),
}
}
}
#[derive(Debug)]
struct BlockingTestEmbedder {
id: String,
vector: Vec<f32>,
started_tx: Mutex<Option<std::sync::mpsc::Sender<()>>>,
unblock_rx: Mutex<std::sync::mpsc::Receiver<()>>,
}
impl BlockingTestEmbedder {
fn new(
id: &str,
vector: &[f32],
started_tx: std::sync::mpsc::Sender<()>,
unblock_rx: std::sync::mpsc::Receiver<()>,
) -> Self {
Self {
id: id.to_string(),
vector: vector.to_vec(),
started_tx: Mutex::new(Some(started_tx)),
unblock_rx: Mutex::new(unblock_rx),
}
}
}
impl crate::search::embedder::Embedder for BlockingTestEmbedder {
fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
if let Ok(mut guard) = self.started_tx.lock()
&& let Some(tx) = guard.take()
{
let _ = tx.send(());
}
self.unblock_rx
.lock()
.expect("blocking embedder receiver")
.recv()
.expect("blocking embedder unblock signal");
Ok(self.vector.clone())
}
fn dimension(&self) -> usize {
self.vector.len()
}
fn id(&self) -> &str {
&self.id
}
fn is_semantic(&self) -> bool {
false
}
fn category(&self) -> frankensearch::ModelCategory {
frankensearch::ModelCategory::HashEmbedder
}
}
impl crate::search::embedder::Embedder for FixedTestEmbedder {
fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
Ok(self.vector.clone())
}
fn dimension(&self) -> usize {
self.vector.len()
}
fn id(&self) -> &str {
&self.id
}
fn is_semantic(&self) -> bool {
false
}
fn category(&self) -> frankensearch::ModelCategory {
frankensearch::ModelCategory::HashEmbedder
}
}
struct SemanticTestFixture {
_dir: TempDir,
client: SearchClient,
doc_ids: Vec<String>,
source_paths: Vec<String>,
}
struct ProgressiveHybridFixture {
_dir: TempDir,
client: Arc<SearchClient>,
query: String,
}
fn projected_minimal_fields_search_hit(title: &str, source_path: &str) -> SearchHit {
SearchHit {
title: title.to_string(),
snippet: String::new(),
content: String::new(),
content_hash: 0,
conversation_id: Some(42),
score: 1.0,
source_path: source_path.to_string(),
agent: "test-agent".into(),
workspace: "/tmp/workspace".into(),
workspace_original: None,
created_at: Some(1_700_000_000_000),
line_number: Some(1),
match_type: MatchType::default(),
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
}
}
#[test]
fn hit_is_noise_returns_false_for_projected_minimal_fields_hit() {
let hit = projected_minimal_fields_search_hit(
"Demo conversation about authentication",
"/tmp/sessions/demo-auth.jsonl",
);
assert_eq!(hit.content, "");
assert_eq!(hit.snippet, "");
assert!(
!hit_is_noise(&hit, "authentication"),
"projected --fields minimal hit must NOT be classified as noise; \
doing so silently drops every real match (bead bd-q6xf9)"
);
}
#[test]
fn hit_is_noise_still_suppresses_real_tool_invocation_noise_when_content_present() {
let mut hit =
projected_minimal_fields_search_hit("Tool ping", "/tmp/sessions/tool-ping.jsonl");
hit.content =
"[tool_call]: {\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}".into();
let classified_as_noise_on_real_content =
hit_is_noise(&hit, "ls") || hit_is_noise(&hit, "bash");
let _ = classified_as_noise_on_real_content;
assert!(!hit.content.is_empty(), "precondition: content populated");
}
#[test]
fn hit_is_noise_uses_snippet_when_content_empty_but_snippet_populated() {
let mut hit = projected_minimal_fields_search_hit(
"Real authentication hit",
"/tmp/sessions/real-auth.jsonl",
);
hit.content = String::new();
hit.snippet = "The user asked about authentication flow options.".into();
assert!(
!hit_is_noise(&hit, "authentication"),
"snippet-only hits with real content must survive the noise filter"
);
}
#[test]
fn search_client_is_send_sync_without_phantom_filters() {
fn assert_send_sync<T: Send + Sync>() {}
assert_send_sync::<SearchClient>();
}
#[test]
fn semantic_embedding_releases_semantic_lock_while_embedding() -> Result<()> {
let fixture = build_semantic_test_fixture()?;
let client = Arc::new(fixture.client);
let (started_tx, started_rx) = std::sync::mpsc::channel();
let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();
{
let mut guard = client
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard
.as_mut()
.ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
state.embedder = Arc::new(BlockingTestEmbedder::new(
"test-fixed-2d",
&[1.0, 0.0],
started_tx,
unblock_rx,
));
state.query_cache = QueryCache::new(
"test-fixed-2d",
NonZeroUsize::new(100).expect("cache capacity"),
);
}
let search_client = Arc::clone(&client);
let search_handle = std::thread::spawn(move || {
search_client.search_semantic(
"lock scope regression",
SearchFilters::default(),
3,
0,
FieldMask::FULL,
false,
)
});
started_rx
.recv_timeout(Duration::from_secs(1))
.expect("embedder should start");
let clear_client = Arc::clone(&client);
let (clear_tx, clear_rx) = std::sync::mpsc::channel();
let clear_handle = std::thread::spawn(move || {
let _ = clear_tx.send(clear_client.clear_semantic_context());
});
clear_rx
.recv_timeout(Duration::from_millis(500))
.expect("semantic lock should not stay held during embed")?;
unblock_tx.send(()).expect("unblock embedder");
clear_handle.join().expect("clear thread join");
let search_result = search_handle.join().expect("search thread join");
assert!(
search_result.is_err(),
"search should observe semantic context cleared after embedding"
);
Ok(())
}
#[test]
fn semantic_embedding_ignores_stale_same_id_context_after_swap() -> Result<()> {
let fixture = build_semantic_test_fixture()?;
let client = Arc::new(fixture.client);
let (started_tx, started_rx) = std::sync::mpsc::channel();
let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();
{
let mut guard = client
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard
.as_mut()
.ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
state.embedder = Arc::new(BlockingTestEmbedder::new(
"test-fixed-2d",
&[1.0, 0.0],
started_tx,
unblock_rx,
));
state.query_cache = QueryCache::new(
"test-fixed-2d",
NonZeroUsize::new(100).expect("cache capacity"),
);
}
let embedding_client = Arc::clone(&client);
let handle =
std::thread::spawn(move || embedding_client.semantic_query_embedding("context-swap"));
started_rx
.recv_timeout(Duration::from_secs(1))
.expect("embedder should start");
{
let mut guard = client
.semantic
.lock()
.map_err(|_| anyhow!("semantic lock poisoned"))?;
let state = guard
.as_mut()
.ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
state.context_token = Arc::new(());
state.embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[0.0, 1.0]));
state.query_cache = QueryCache::new(
"test-fixed-2d",
NonZeroUsize::new(100).expect("cache capacity"),
);
}
unblock_tx.send(()).expect("unblock embedder");
let embedding = handle.join().expect("embedding thread join")?.vector;
assert_eq!(
embedding,
vec![0.0, 1.0],
"stale embedding from the previous same-id context must not leak across the swap"
);
Ok(())
}
#[test]
fn quality_mode_does_not_reuse_fast_only_two_tier_cache() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
let writer = VectorIndex::create_with_revision(
&fast_path,
embedder.id(),
"rev-fast-only",
embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
writer.finish()?;
client.set_semantic_context(
embedder,
VectorIndex::open(&fast_path)?,
SemanticFilterMaps::for_tests(
HashMap::new(),
HashMap::new(),
HashMap::new(),
HashSet::new(),
),
None,
Some(fast_path),
)?;
let fast_only_index = client
.in_memory_two_tier_index(SemanticTierMode::FastOnly)?
.expect("fast-only index should load");
assert!(
!fast_only_index.has_quality_index(),
"fixture should only provide the fast tier"
);
let quality_index = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
assert!(
quality_index.is_none(),
"quality mode must not reuse a cached fast-only two-tier index"
);
Ok(())
}
#[test]
fn failed_quality_probe_does_not_block_fast_only_two_tier_load() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
let writer = VectorIndex::create_with_revision(
&fast_path,
embedder.id(),
"rev-fast-only",
embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
writer.finish()?;
client.set_semantic_context(
embedder,
VectorIndex::open(&fast_path)?,
SemanticFilterMaps::for_tests(
HashMap::new(),
HashMap::new(),
HashMap::new(),
HashSet::new(),
),
None,
Some(fast_path),
)?;
assert!(
client
.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?
.is_none(),
"quality-only lookup should fail for a fast-only fixture"
);
let fast_only_index = client
.in_memory_two_tier_index(SemanticTierMode::FastOnly)?
.expect("a failed quality-only probe must not poison fast-only loads");
assert!(
!fast_only_index.has_quality_index(),
"fixture should still resolve to the fast-only tier"
);
Ok(())
}
#[test]
fn progressive_context_error_does_not_poison_future_attempts() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
let writer = VectorIndex::create_with_revision(
&fast_path,
embedder.id(),
"rev-progressive-error",
embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
writer.finish()?;
std::fs::write(dir.path().join("vector.fast.idx"), b"not-a-valid-index")?;
std::fs::write(dir.path().join("vector.quality.idx"), b"not-a-valid-index")?;
client.set_semantic_context(
embedder,
VectorIndex::open(&fast_path)?,
SemanticFilterMaps::for_tests(
HashMap::new(),
HashMap::new(),
HashMap::new(),
HashSet::new(),
),
None,
Some(fast_path),
)?;
let first_err = client
.progressive_context()
.err()
.expect("invalid progressive index files should fail to load");
assert!(
first_err
.to_string()
.contains("open fast-tier index failed"),
"unexpected first progressive-context error: {first_err}"
);
let second_err = client
.progressive_context()
.err()
.expect("a failed progressive load must not be memoized as None");
assert!(
second_err
.to_string()
.contains("open fast-tier index failed"),
"unexpected second progressive-context error: {second_err}"
);
Ok(())
}
fn build_semantic_test_fixture() -> Result<SemanticTestFixture> {
build_semantic_test_fixture_with_shards(false)
}
fn build_sharded_semantic_test_fixture() -> Result<SemanticTestFixture> {
build_semantic_test_fixture_with_shards(true)
}
fn build_semantic_test_fixture_with_shards(sharded: bool) -> Result<SemanticTestFixture> {
let dir = TempDir::new()?;
let db_path = dir.path().join("cass.db");
let storage = FrankenStorage::open(&db_path)?;
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: None,
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent)?;
let workspace_path = dir.path().join("workspace");
std::fs::create_dir_all(&workspace_path)?;
let workspace_id = storage.ensure_workspace(&workspace_path, None)?;
let documents = [
("session-a.jsonl", "top semantic match", [1.0_f32, 0.0_f32]),
(
"session-b.jsonl",
"middle semantic match",
[0.9_f32, 0.1_f32],
),
("session-c.jsonl", "late semantic match", [0.8_f32, 0.2_f32]),
];
let base_ts = 1_700_000_000_000_i64;
let mut doc_ids = Vec::with_capacity(documents.len());
let mut source_paths = Vec::with_capacity(documents.len());
for (idx, (name, content, _vector)) in documents.iter().enumerate() {
let source_path = dir.path().join(name);
source_paths.push(source_path.to_string_lossy().to_string());
let conversation = Conversation {
id: None,
agent_slug: agent.slug.clone(),
workspace: Some(workspace_path.clone()),
external_id: Some(format!("semantic-{idx}")),
title: Some(format!("semantic session {idx}")),
source_path,
started_at: Some(base_ts + idx as i64),
ended_at: Some(base_ts + idx as i64),
approx_tokens: Some(16),
metadata_json: json!({"fixture": "semantic_search"}),
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + idx as i64),
content: (*content).to_string(),
extra_json: json!({}),
snippets: Vec::new(),
}],
source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
origin_host: None,
};
storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
}
let message_rows: Vec<(u64, i64)> = storage.raw().query_map_collect(
"SELECT m.id, COALESCE(m.created_at, c.started_at, 0)
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
ORDER BY c.id",
&[],
|row: &frankensqlite::Row| {
let message_id: i64 = row.get_typed(0)?;
let created_at: i64 = row.get_typed(1)?;
Ok((u64::try_from(message_id).unwrap_or(u64::MAX), created_at))
},
)?;
assert_eq!(
message_rows.len(),
documents.len(),
"fixture should create 3 messages"
);
let filter_maps = SemanticFilterMaps::from_storage(&storage)?;
let embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[1.0, 0.0]));
let source_hash = crc32fast::hash(crate::sources::provenance::LOCAL_SOURCE_ID.as_bytes());
let vector_dir = dir.path().join("vector_index");
std::fs::create_dir_all(&vector_dir)?;
let mut vector_records = Vec::with_capacity(documents.len());
for ((message_id, created_at_ms), (_, _, vector)) in message_rows.iter().zip(documents) {
let doc_id = SemanticDocId {
message_id: *message_id,
chunk_idx: 0,
agent_id: u32::try_from(agent_id)?,
workspace_id: u32::try_from(workspace_id)?,
source_id: source_hash,
role: ROLE_USER,
created_at_ms: *created_at_ms,
content_hash: None,
}
.to_doc_id_string();
doc_ids.push(doc_id.clone());
vector_records.push((doc_id, vector));
}
let mut vector_indexes = Vec::new();
if sharded {
for (shard_index, chunk) in vector_records.chunks(2).enumerate() {
let vector_path = vector_dir.join(format!("shard-{shard_index}.fsvi"));
let mut writer = VectorIndex::create_with_revision(
&vector_path,
embedder.id(),
"rev-1",
embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
for (doc_id, vector) in chunk {
writer.write_record(doc_id, vector)?;
}
writer.finish()?;
vector_indexes.push(VectorIndex::open(&vector_path)?);
}
} else {
let vector_path = vector_dir.join("index-test-fixed-2d.fsvi");
let mut writer = VectorIndex::create_with_revision(
&vector_path,
embedder.id(),
"rev-1",
embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
for (doc_id, vector) in &vector_records {
writer.write_record(doc_id, vector)?;
}
writer.finish()?;
vector_indexes.push(VectorIndex::open(&vector_path)?);
}
drop(storage);
let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
client.set_semantic_indexes_context(embedder, vector_indexes, filter_maps, None, None)?;
Ok(SemanticTestFixture {
_dir: dir,
client,
doc_ids,
source_paths,
})
}
fn build_progressive_hybrid_fixture() -> Result<ProgressiveHybridFixture> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let workspace_path = dir.path().join("workspace");
std::fs::create_dir_all(&workspace_path)?;
let agent_id = 1_i64;
let workspace_id = 1_i64;
let source_id = crate::sources::provenance::LOCAL_SOURCE_ID;
let source_hash = crc32fast::hash(source_id.as_bytes());
let conn = Connection::open(":memory:")?;
conn.execute_batch(
r#"
CREATE TABLE agents (
id INTEGER PRIMARY KEY,
slug TEXT NOT NULL
);
CREATE TABLE workspaces (
id INTEGER PRIMARY KEY,
path TEXT NOT NULL
);
CREATE TABLE sources (
id TEXT PRIMARY KEY,
kind TEXT NOT NULL
);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
title TEXT,
source_path TEXT NOT NULL,
source_id TEXT NOT NULL,
origin_host TEXT,
started_at INTEGER
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
role TEXT NOT NULL,
created_at INTEGER,
content TEXT NOT NULL
);
"#,
)?;
conn.execute_compat(
"INSERT INTO agents (id, slug) VALUES (?1, ?2)",
params![agent_id, "codex"],
)?;
conn.execute_compat(
"INSERT INTO workspaces (id, path) VALUES (?1, ?2)",
params![workspace_id, workspace_path.to_string_lossy().to_string()],
)?;
conn.execute_compat(
"INSERT INTO sources (id, kind) VALUES (?1, ?2)",
params![source_id, "local"],
)?;
let query = "oauth refresh token middleware session cache".to_string();
let filler = " context window ranking provenance semantic upgrade lexical overlay";
let base_ts = 1_700_000_100_000_i64;
let doc_count = 64usize;
let mut message_rows = Vec::with_capacity(doc_count);
for idx in 0..doc_count {
let conversation_id = i64::try_from(idx + 1)?;
let message_id = u64::try_from(idx + 1)?;
let source_path = dir.path().join(format!("progressive-{idx:03}.jsonl"));
let repeated = filler.repeat(48);
let content = if idx % 4 == 0 {
format!(
"{query} hot path candidate {idx} with detailed search diagnostics.{repeated}"
)
} else if idx % 4 == 1 {
format!(
"search pipeline benchmark {idx} with lexical overlay and semantic ranking.{repeated}"
)
} else if idx % 4 == 2 {
format!(
"interactive typing debounce benchmark {idx} for hybrid two tier search.{repeated}"
)
} else {
format!(
"unrelated background chatter {idx} about build systems and formatting checks.{repeated}"
)
};
let created_at = base_ts + idx as i64;
let source_path_str = source_path.to_string_lossy().to_string();
let title = format!("progressive fixture {idx}");
conn.execute_compat(
"INSERT INTO conversations (
id, agent_id, workspace_id, title, source_path, source_id, origin_host, started_at
) VALUES (?1, ?2, ?3, ?4, ?5, ?6, NULL, ?7)",
params![
conversation_id,
agent_id,
workspace_id,
title,
source_path_str.clone(),
source_id,
created_at
],
)?;
conn.execute_compat(
"INSERT INTO messages (
id, conversation_id, idx, role, created_at, content
) VALUES (?1, ?2, 0, 'user', ?3, ?4)",
params![
i64::try_from(message_id)?,
conversation_id,
created_at,
content.clone()
],
)?;
message_rows.push((message_id, created_at, content.clone()));
let normalized = NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some(format!("progressive-{idx}")),
title: Some(format!("progressive fixture {idx}")),
workspace: Some(workspace_path.clone()),
source_path,
started_at: Some(created_at),
ended_at: Some(created_at),
metadata: json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("user".into()),
created_at: Some(created_at),
content,
extra: json!({}),
snippets: Vec::new(),
invocations: Vec::new(),
}],
};
index.add_conversation(&normalized)?;
}
index.commit()?;
assert_eq!(
message_rows.len(),
doc_count,
"fixture should create the requested number of messages"
);
let fast_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
let quality_embedder = crate::search::hash_embedder::HashEmbedder::new(384);
let filter_maps = SemanticFilterMaps::for_tests(
HashMap::from([("codex".to_string(), u32::try_from(agent_id)?)]),
HashMap::from([(
workspace_path.to_string_lossy().to_string(),
u32::try_from(workspace_id)?,
)]),
HashMap::from([(source_id.to_string(), source_hash)]),
HashSet::new(),
);
let fast_path = dir.path().join("vector.fast.idx");
let quality_path = dir.path().join("vector.quality.idx");
let mut fast_writer = VectorIndex::create_with_revision(
&fast_path,
fast_embedder.id(),
"rev-progressive-fast",
fast_embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
let mut quality_writer = VectorIndex::create_with_revision(
&quality_path,
quality_embedder.id(),
"rev-progressive-quality",
quality_embedder.dimension(),
frankensearch::index::Quantization::F16,
)?;
for (message_id, created_at_ms, content) in &message_rows {
let canonical = canonicalize_for_embedding(content);
let doc_id = SemanticDocId {
message_id: *message_id,
chunk_idx: 0,
agent_id: u32::try_from(agent_id)?,
workspace_id: u32::try_from(workspace_id)?,
source_id: source_hash,
role: ROLE_USER,
created_at_ms: *created_at_ms,
content_hash: Some(content_hash(&canonical)),
}
.to_doc_id_string();
let fast_vec = fast_embedder.embed_sync(content)?;
fast_writer.write_record(&doc_id, &fast_vec)?;
let quality_vec = quality_embedder.embed_sync(content)?;
quality_writer.write_record(&doc_id, &quality_vec)?;
}
fast_writer.finish()?;
quality_writer.finish()?;
let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
let client = SearchClient {
reader,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{}|schema:{}", CACHE_KEY_VERSION, FS_CASS_SCHEMA_HASH),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let semantic_embedder: Arc<dyn Embedder> = fast_embedder;
client.set_semantic_context(
semantic_embedder,
VectorIndex::open(&fast_path)?,
filter_maps,
None,
Some(fast_path),
)?;
Ok(ProgressiveHybridFixture {
_dir: dir,
client: Arc::new(client),
query,
})
}
fn sanitize_query(raw: &str) -> String {
nfc_sanitize_query(raw)
}
fn parse_boolean_query(query: &str) -> Vec<FsCassQueryToken> {
fs_cass_parse_boolean_query(query)
}
fn sqlite_master_name_count(db_path: &Path, name: &str) -> Result<i64> {
let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref())?;
Ok(conn.query_row_map(
"SELECT COUNT(*) FROM sqlite_master WHERE name = ?1",
&[ParamValue::from(name)],
|row| row.get_typed(0),
)?)
}
type QueryToken = FsCassQueryToken;
type WildcardPattern = FsCassWildcardPattern;
type QueryTokenList = Vec<QueryToken>;
#[test]
#[ignore = "profiling harness for live hybrid progressive search"]
fn progressive_hybrid_profile_harness() -> Result<()> {
let fixture = build_progressive_hybrid_fixture()?;
let runtime = asupersync::runtime::RuntimeBuilder::current_thread()
.build()
.map_err(|err| anyhow!("build test runtime failed: {err}"))?;
let iterations = 24usize;
runtime.block_on(async {
let cx = FsCx::for_request();
fixture
.client
.search_progressive_with_callback(
ProgressiveSearchRequest {
cx: &cx,
query: &fixture.query,
filters: SearchFilters::default(),
limit: 16,
sparse_threshold: 0,
field_mask: FieldMask::new(false, true, true, true),
mode: SearchMode::Hybrid,
},
|_| {},
)
.await
})?;
let mut initial_events = 0usize;
let mut refined_events = 0usize;
let mut total_hits = 0usize;
for _ in 0..iterations {
let mut refinement_error = None;
runtime.block_on(async {
let cx = FsCx::for_request();
fixture
.client
.search_progressive_with_callback(
ProgressiveSearchRequest {
cx: &cx,
query: &fixture.query,
filters: SearchFilters::default(),
limit: 16,
sparse_threshold: 0,
field_mask: FieldMask::new(false, true, true, true),
mode: SearchMode::Hybrid,
},
|event| match event {
ProgressiveSearchEvent::Phase { kind, result, .. } => {
assert!(
!result.hits.is_empty(),
"progressive harness expects non-empty hits for each phase"
);
total_hits += result.hits.len();
match kind {
ProgressivePhaseKind::Initial => initial_events += 1,
ProgressivePhaseKind::Refined => refined_events += 1,
}
}
ProgressiveSearchEvent::RefinementFailed { error, .. } => {
refinement_error = Some(error);
}
},
)
.await
})?;
if let Some(error) = refinement_error {
bail!("progressive harness refinement failed: {error}");
}
}
assert_eq!(initial_events, iterations);
assert_eq!(refined_events, iterations);
assert!(
total_hits >= iterations.saturating_mul(16),
"harness should observe a full page for each phase"
);
Ok(())
}
#[test]
fn interner_returns_same_arc_for_same_string() {
let interner = StringInterner::new(100);
let s1 = interner.intern("test_query");
let s2 = interner.intern("test_query");
assert!(Arc::ptr_eq(&s1, &s2));
assert_eq!(&*s1, "test_query");
}
#[test]
fn interner_different_strings_return_different_arcs() {
let interner = StringInterner::new(100);
let s1 = interner.intern("query1");
let s2 = interner.intern("query2");
assert!(!Arc::ptr_eq(&s1, &s2));
assert_eq!(&*s1, "query1");
assert_eq!(&*s2, "query2");
}
#[test]
fn interner_handles_empty_string() {
let interner = StringInterner::new(100);
let s1 = interner.intern("");
let s2 = interner.intern("");
assert!(Arc::ptr_eq(&s1, &s2));
assert_eq!(&*s1, "");
}
#[test]
fn interner_handles_unicode() {
let interner = StringInterner::new(100);
let s1 = interner.intern("测试查询");
let s2 = interner.intern("测试查询");
let s3 = interner.intern("emoji 🔍 search");
assert!(Arc::ptr_eq(&s1, &s2));
assert_eq!(&*s3, "emoji 🔍 search");
}
#[test]
fn interner_respects_lru_eviction() {
let interner = StringInterner::new(3);
let _s1 = interner.intern("query1");
let _s2 = interner.intern("query2");
let _s3 = interner.intern("query3");
assert_eq!(interner.len(), 3);
let _s4 = interner.intern("query4");
assert_eq!(interner.len(), 3);
let s1_new = interner.intern("query1");
assert_eq!(&*s1_new, "query1");
}
#[test]
fn interner_concurrent_access() {
use std::thread;
let interner = Arc::new(StringInterner::new(1000));
let queries: Vec<String> = (0..100).map(|i| format!("query_{}", i)).collect();
let handles: Vec<_> = (0..4)
.map(|_| {
let interner = Arc::clone(&interner);
let queries = queries.clone();
thread::spawn(move || {
for _ in 0..10 {
for query in &queries {
let _ = interner.intern(query);
}
}
})
})
.collect();
for handle in handles {
handle.join().unwrap();
}
for query in &queries {
let s1 = interner.intern(query);
let s2 = interner.intern(query);
assert!(Arc::ptr_eq(&s1, &s2));
}
}
#[test]
fn query_terms_lower_basic() {
let terms = QueryTermsLower::from_query("Hello World");
assert_eq!(terms.query_lower, "hello world");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["hello", "world"]);
}
#[test]
fn query_terms_lower_empty() {
let terms = QueryTermsLower::from_query("");
assert!(terms.is_empty());
assert_eq!(terms.tokens().count(), 0);
}
#[test]
fn query_terms_lower_single_term() {
let terms = QueryTermsLower::from_query("TEST");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["test"]);
}
#[test]
fn query_terms_lower_with_punctuation() {
let terms = QueryTermsLower::from_query("hello, world! how's it?");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["hello", "world", "how", "s", "it"]);
}
#[test]
fn query_terms_lower_unicode() {
let terms = QueryTermsLower::from_query("Héllo Wörld");
assert_eq!(terms.query_lower, "héllo wörld");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["héllo", "wörld"]);
}
#[test]
fn query_terms_lower_bloom_mask() {
let terms = QueryTermsLower::from_query("test");
assert_ne!(terms.bloom_mask(), 0);
let terms2 = QueryTermsLower::from_query("test");
assert_eq!(terms.bloom_mask(), terms2.bloom_mask());
}
#[test]
fn hit_matches_with_precomputed_terms() {
let hit = SearchHit {
title: "Test Title".into(),
snippet: "".into(),
content: "hello world content".into(),
content_hash: stable_content_hash("hello world content"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let cached = cached_hit_from(&hit);
let terms = QueryTermsLower::from_query("hello");
assert!(hit_matches_query_cached_precomputed(&cached, &terms));
let terms_miss = QueryTermsLower::from_query("missing");
assert!(!hit_matches_query_cached_precomputed(&cached, &terms_miss));
}
fn make_fused_hit(
id: &str,
rrf: f32,
lexical: Option<usize>,
semantic: Option<usize>,
) -> FusedHit {
FusedHit {
key: SearchHitKey {
source_id: "local".to_string(),
source_path: id.to_string(),
conversation_id: None,
title: String::new(),
line_number: None,
created_at: None,
content_hash: 0,
},
score: HybridScore {
rrf,
lexical_rank: lexical,
semantic_rank: semantic,
lexical_score: None,
semantic_score: None,
},
hit: SearchHit {
title: id.into(),
snippet: "".into(),
content: "".into(),
content_hash: 0,
score: rrf,
source_path: id.into(),
agent: "test".into(),
workspace: "test".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
}
}
fn make_federated_merge_hit(id: &str, agent: &str) -> SearchHit {
SearchHit {
title: id.into(),
snippet: String::new(),
content: id.into(),
content_hash: stable_content_hash(id),
score: 0.0,
source_path: format!("{id}.jsonl"),
agent: agent.into(),
workspace: "workspace".into(),
workspace_original: None,
created_at: Some(1_700_000_000_000),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
}
}
fn make_federated_ranked_hit(
shard_index: usize,
shard_rank: usize,
id: &str,
) -> FederatedRankedHit {
FederatedRankedHit {
hit: make_federated_merge_hit(id, &format!("shard-{shard_index}")),
shard_index,
shard_rank,
fused_score: federated_rrf_score(shard_rank),
}
}
#[test]
fn federated_merge_orders_equal_rank_hits_by_stable_hit_key() {
let merged = merge_federated_ranked_hits(vec![
make_federated_ranked_hit(2, 0, "zeta"),
make_federated_ranked_hit(0, 0, "bravo"),
make_federated_ranked_hit(1, 0, "alpha"),
]);
let paths = merged
.iter()
.map(|hit| hit.source_path.as_str())
.collect::<Vec<_>>();
assert_eq!(paths, vec!["alpha.jsonl", "bravo.jsonl", "zeta.jsonl"]);
assert!(
merged
.iter()
.all(|hit| (hit.score - federated_rrf_score(0)).abs() < f32::EPSILON),
"equal per-shard rank should produce equal RRF scores"
);
}
#[test]
fn federated_merge_keeps_rrf_rank_ahead_of_stable_key() {
let merged = merge_federated_ranked_hits(vec![
make_federated_ranked_hit(0, 1, "alpha"),
make_federated_ranked_hit(1, 0, "zeta"),
]);
let paths = merged
.iter()
.map(|hit| hit.source_path.as_str())
.collect::<Vec<_>>();
assert_eq!(paths, vec!["zeta.jsonl", "alpha.jsonl"]);
assert!(merged[0].score > merged[1].score);
}
#[test]
fn federated_merge_uses_shard_index_as_duplicate_final_tiebreak() {
let merged = merge_federated_ranked_hits(vec![
FederatedRankedHit {
hit: make_federated_merge_hit("same", "shard-2"),
shard_index: 2,
shard_rank: 0,
fused_score: federated_rrf_score(0),
},
FederatedRankedHit {
hit: make_federated_merge_hit("same", "shard-0"),
shard_index: 0,
shard_rank: 0,
fused_score: federated_rrf_score(0),
},
]);
assert_eq!(merged[0].agent, "shard-0");
assert_eq!(merged[1].agent, "shard-2");
}
#[test]
fn top_k_fused_basic() {
let hits = vec![
make_fused_hit("a", 1.0, Some(0), None),
make_fused_hit("b", 3.0, Some(1), None),
make_fused_hit("c", 2.0, Some(2), None),
make_fused_hit("d", 5.0, Some(3), None),
make_fused_hit("e", 4.0, Some(4), None),
];
let top = top_k_fused(hits, 3);
assert_eq!(top.len(), 3);
assert_eq!(top[0].key.source_path, "d"); assert_eq!(top[1].key.source_path, "e"); assert_eq!(top[2].key.source_path, "b"); }
#[test]
fn top_k_fused_empty() {
let hits: Vec<FusedHit> = vec![];
let top = top_k_fused(hits, 10);
assert!(top.is_empty());
}
#[test]
fn top_k_fused_k_zero() {
let hits = vec![
make_fused_hit("a", 1.0, Some(0), None),
make_fused_hit("b", 2.0, Some(1), None),
];
let top = top_k_fused(hits, 0);
assert!(top.is_empty());
}
#[test]
fn top_k_fused_k_larger_than_n() {
let hits = vec![
make_fused_hit("a", 1.0, Some(0), None),
make_fused_hit("b", 2.0, Some(1), None),
];
let top = top_k_fused(hits, 10);
assert_eq!(top.len(), 2);
assert_eq!(top[0].key.source_path, "b"); assert_eq!(top[1].key.source_path, "a"); }
#[test]
fn top_k_fused_k_equals_n() {
let hits = vec![
make_fused_hit("a", 3.0, Some(0), None),
make_fused_hit("b", 1.0, Some(1), None),
make_fused_hit("c", 2.0, Some(2), None),
];
let top = top_k_fused(hits, 3);
assert_eq!(top.len(), 3);
assert_eq!(top[0].key.source_path, "a"); assert_eq!(top[1].key.source_path, "c"); assert_eq!(top[2].key.source_path, "b"); }
#[test]
fn top_k_fused_k_one() {
let hits = vec![
make_fused_hit("a", 1.0, Some(0), None),
make_fused_hit("b", 3.0, Some(1), None),
make_fused_hit("c", 2.0, Some(2), None),
];
let top = top_k_fused(hits, 1);
assert_eq!(top.len(), 1);
assert_eq!(top[0].key.source_path, "b");
assert_eq!(top[0].score.rrf, 3.0);
}
#[test]
fn top_k_fused_duplicate_scores() {
let hits = vec![
make_fused_hit("a", 2.0, Some(0), None),
make_fused_hit("b", 2.0, Some(1), None),
make_fused_hit("c", 2.0, Some(2), None),
make_fused_hit("d", 1.0, Some(3), None),
];
let top = top_k_fused(hits, 2);
assert_eq!(top.len(), 2);
assert_eq!(top[0].score.rrf, 2.0);
assert_eq!(top[1].score.rrf, 2.0);
}
#[test]
fn top_k_fused_dual_source_tiebreaker() {
let hits = vec![
make_fused_hit("a", 2.0, Some(0), None), make_fused_hit("b", 2.0, Some(1), Some(0)), make_fused_hit("c", 2.0, None, Some(1)), ];
let top = top_k_fused(hits, 3);
assert_eq!(top.len(), 3);
assert_eq!(top[0].key.source_path, "b");
}
#[test]
fn top_k_fused_large_input_uses_quickselect() {
let hits: Vec<FusedHit> = (0..100)
.map(|i| make_fused_hit(&format!("hit_{}", i), i as f32, Some(i), None))
.collect();
let top = top_k_fused(hits, 10);
assert_eq!(top.len(), 10);
for (i, hit) in top.iter().enumerate() {
assert_eq!(hit.key.source_path, format!("hit_{}", 99 - i));
assert_eq!(hit.score.rrf, (99 - i) as f32);
}
}
#[test]
fn top_k_fused_equivalence_with_full_sort() {
for n in [10, 50, 100, 200] {
for k in [1, 5, 10, 25] {
if k > n {
continue;
}
let hits: Vec<FusedHit> = (0..n)
.map(|i| {
let score = ((i * 17 + 7) % 1000) as f32;
make_fused_hit(&format!("hit_{}", i), score, Some(i), None)
})
.collect();
let mut baseline = hits.clone();
baseline.sort_by(cmp_fused_hit_desc);
baseline.truncate(k);
let quickselect = top_k_fused(hits, k);
assert_eq!(quickselect.len(), baseline.len(), "n={}, k={}", n, k);
for (q, b) in quickselect.iter().zip(baseline.iter()) {
assert_eq!(
q.key.source_path, b.key.source_path,
"n={}, k={}: mismatch",
n, k
);
assert_eq!(q.score.rrf, b.score.rrf, "n={}, k={}: score mismatch", n, k);
}
}
}
}
#[test]
fn cmp_fused_hit_desc_basic_ordering() {
let a = make_fused_hit("a", 2.0, Some(0), None);
let b = make_fused_hit("b", 3.0, Some(1), None);
assert_eq!(cmp_fused_hit_desc(&a, &b), CmpOrdering::Greater);
assert_eq!(cmp_fused_hit_desc(&b, &a), CmpOrdering::Less);
assert_eq!(cmp_fused_hit_desc(&a, &a), CmpOrdering::Equal);
}
#[test]
fn cache_enforces_prefix_matching() {
let hit = SearchHit {
title: "test".into(),
snippet: "".into(),
content: "arrow".into(),
content_hash: stable_content_hash("arrow"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let cached = CachedHit {
hit: hit.clone(),
lc_content: "arrow".into(),
lc_title: Some("test".into()),
bloom64: u64::MAX, };
let matched = hit_matches_query_cached(&cached, "row");
assert!(
!matched,
"Query 'row' should NOT match content 'arrow' (prefix match required)"
);
}
#[test]
fn search_deduplication_across_pages_repro() {
let dir = TempDir::new().unwrap();
let index_path = dir.path();
let mut index = TantivyIndex::open_or_create(index_path).unwrap();
let msg1 = NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1000),
content: "duplicate content".into(),
extra: serde_json::json!({}),
snippets: Vec::new(),
invocations: Vec::new(),
};
let conv1 = NormalizedConversation {
agent_slug: "agent1".into(),
external_id: None,
title: None,
workspace: None,
source_path: "path/1".into(),
started_at: None,
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![msg1],
};
let msg2 = NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(2000), content: "duplicate content".into(), extra: serde_json::json!({}),
snippets: Vec::new(),
invocations: Vec::new(),
};
let conv2 = NormalizedConversation {
agent_slug: "agent1".into(),
external_id: None,
title: None,
workspace: None,
source_path: "path/2".into(), started_at: None,
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![msg2],
};
index.add_conversation(&conv1).unwrap();
index.add_conversation(&conv2).unwrap();
index.commit().unwrap();
let client = SearchClient::open(index_path, None).unwrap().unwrap();
let page1 = client
.search("duplicate", SearchFilters::default(), 1, 0, FieldMask::FULL)
.unwrap();
assert_eq!(page1.len(), 1);
let page2 = client
.search("duplicate", SearchFilters::default(), 1, 1, FieldMask::FULL)
.unwrap();
assert_eq!(page2.len(), 1);
assert_ne!(page1[0].source_path, page2[0].source_path);
}
#[test]
fn cache_skips_complex_queries() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let _ = client.search("foo*", SearchFilters::default(), 10, 0, FieldMask::FULL);
let stats = client.cache_stats();
assert_eq!(
stats.cache_miss, 0,
"Wildcard query should not trigger cache miss"
);
let _ = client.search(
"foo OR bar",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
);
let stats = client.cache_stats();
assert_eq!(
stats.cache_miss, 0,
"Boolean query should not trigger cache miss"
);
let _ = client.search("simple", SearchFilters::default(), 10, 0, FieldMask::FULL);
let stats = client.cache_stats();
assert_eq!(
stats.cache_miss, 1,
"Simple query should trigger cache miss"
);
}
#[test]
fn cache_prefix_lookup_handles_utf8_boundaries() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = vec![SearchHit {
title: "こんにちは".into(),
snippet: String::new(),
content: "こんにちは 世界".into(),
content_hash: stable_content_hash("こんにちは 世界"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
}];
client.put_cache("こん", &SearchFilters::default(), &hits);
let cached = client
.cached_prefix_hits("こんにちは", &SearchFilters::default())
.unwrap();
assert_eq!(cached.len(), 1);
assert_eq!(cached[0].hit.title, "こんにちは");
}
#[test]
fn bloom_gate_rejects_missing_terms() {
let hit = SearchHit {
title: "hello world".into(),
snippet: "hello world".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let cached = cached_hit_from(&hit);
assert!(hit_matches_query_cached(&cached, "hello"));
assert!(!hit_matches_query_cached(&cached, "missing"));
let metrics = Metrics::default();
metrics.inc_cache_hits();
metrics.inc_cache_miss();
metrics.inc_cache_shortfall();
metrics.inc_reload();
let (hits, miss, shortfall, reloads, _) = metrics.snapshot_all();
assert_eq!((hits, miss, shortfall, reloads), (1, 1, 1, 1));
}
#[test]
fn progressive_lexical_hit_omits_unused_content() {
let hit = SearchHit {
title: "hello world".into(),
snippet: "hello **world**".into(),
content: "hello world from a much larger conversation body".into(),
content_hash: stable_content_hash("hello world from a much larger conversation body"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: Some(3),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let snippet_only =
ProgressiveLexicalHit::from_search_hit(&hit, FieldMask::new(false, true, true, true));
assert_eq!(snippet_only.title, hit.title);
assert_eq!(snippet_only.snippet, hit.snippet);
assert!(
snippet_only.content.is_empty(),
"snippet-only progressive cache should not retain full content"
);
assert_eq!(snippet_only.match_type, hit.match_type);
assert_eq!(snippet_only.line_number, hit.line_number);
assert_eq!(snippet_only.source_path, hit.source_path);
assert_eq!(snippet_only.agent, hit.agent);
assert_eq!(snippet_only.workspace, hit.workspace);
let full =
ProgressiveLexicalHit::from_search_hit(&hit, FieldMask::new(true, true, true, true));
assert_eq!(full.content, hit.content);
}
#[test]
fn progressive_phase_reuses_lexical_cache_without_db_hydration() -> Result<()> {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let field_mask = FieldMask::new(false, true, true, true);
let lexical_hit = SearchHit {
title: "lexical title".into(),
snippet: "lexical snippet".into(),
content: "full lexical body".into(),
content_hash: stable_content_hash("full lexical body"),
score: 0.0,
source_path: "/tmp/session.jsonl".into(),
agent: "codex".into(),
workspace: "/tmp".into(),
workspace_original: Some("/original".into()),
created_at: Some(1_700_000_000_000),
line_number: Some(7),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let mut lexical_cache = ProgressiveLexicalCache::default();
lexical_cache.hits_by_message.insert(
42,
ProgressiveLexicalHit::from_search_hit(&lexical_hit, field_mask),
);
let hash_hex = "00".repeat(32);
let results = vec![FsScoredResult {
doc_id: format!("m|42|0|1|1|1|1|1700000000000|{hash_hex}"),
score: 0.91,
source: FsScoreSource::Lexical,
index: None,
fast_score: None,
quality_score: None,
lexical_score: Some(0.91),
rerank_score: None,
explanation: None,
metadata: None,
}];
let result = client.progressive_phase_to_result(
&results,
ProgressivePhaseContext {
query: "merged title",
filters: &SearchFilters::default(),
field_mask,
lexical_cache: Some(&lexical_cache),
limit: 1,
fetch_limit: 1,
},
)?;
assert_eq!(result.hits.len(), 1);
assert_eq!(result.hits[0].title, lexical_hit.title);
assert_eq!(result.hits[0].snippet, lexical_hit.snippet);
assert!(
result.hits[0].content.is_empty(),
"masked lexical cache should still avoid carrying full content"
);
assert_eq!(result.hits[0].source_path, lexical_hit.source_path);
assert_eq!(result.hits[0].score, 0.91);
Ok(())
}
#[test]
fn search_returns_results_with_filters_and_pagination() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("hello world convo".into()),
workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
source_path: dir.path().join("rollout-1.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("me".into()),
created_at: Some(1_700_000_000_000),
content: "hello rust world".into(),
extra: serde_json::json!({}),
snippets: vec![NormalizedSnippet {
file_path: None,
start_line: None,
end_line: None,
language: None,
snippet_text: None,
}],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.agents.insert("codex".into());
let hits = client.search("hello", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].agent, "codex");
assert!(hits[0].snippet.contains("hello"));
Ok(())
}
#[test]
fn search_honors_created_range_and_workspace() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv_a = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("needle one".into()),
workspace: Some(std::path::PathBuf::from("/ws/a")),
source_path: dir.path().join("a.jsonl"),
started_at: Some(10),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(10),
content: "alpha needle".into(),
extra: serde_json::json!({}),
snippets: vec![NormalizedSnippet {
file_path: None,
start_line: None,
end_line: None,
language: None,
snippet_text: None,
}],
invocations: Vec::new(),
}],
};
let conv_b = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("needle two".into()),
workspace: Some(std::path::PathBuf::from("/ws/b")),
source_path: dir.path().join("b.jsonl"),
started_at: Some(20),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(20),
content: "\nneedle second line".into(),
extra: serde_json::json!({}),
snippets: vec![NormalizedSnippet {
file_path: None,
start_line: None,
end_line: None,
language: None,
snippet_text: None,
}],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv_a)?;
index.add_conversation(&conv_b)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.workspaces.insert("/ws/b".into());
filters.created_from = Some(15);
filters.created_to = Some(25);
let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].workspace, "/ws/b");
assert!(hits[0].snippet.contains("second line"));
Ok(())
}
#[test]
fn pagination_skips_results() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
for i in 0..3 {
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some(format!("doc-{i}")),
workspace: Some(std::path::PathBuf::from("/ws/p")),
source_path: dir.path().join(format!("{i}.jsonl")),
started_at: Some(100 + i),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100 + i),
content: format!("pagination needle document number {i}"),
extra: serde_json::json!({}),
snippets: vec![NormalizedSnippet {
file_path: None,
start_line: None,
end_line: None,
language: None,
snippet_text: None,
}],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search(
"pagination",
SearchFilters::default(),
1,
1,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
Ok(())
}
#[test]
fn search_matches_hyphenated_term() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("cma-es notes".into()),
workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
source_path: dir.path().join("rollout-1.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("me".into()),
created_at: Some(1_700_000_000_000),
content: "Need CMA-ES strategy and CMA ES variants".into(),
extra: serde_json::json!({}),
snippets: vec![NormalizedSnippet {
file_path: None,
start_line: None,
end_line: None,
language: None,
snippet_text: None,
}],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("cma-es", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert!(hits[0].snippet.to_lowercase().contains("cma"));
Ok(())
}
#[test]
fn search_matches_prefix_edge_ngram() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("math logic".into()),
workspace: Some(std::path::PathBuf::from("/ws/m")),
source_path: dir.path().join("math.jsonl"),
started_at: Some(1000),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1000),
content: "please calculate the entropy".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("cal", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert!(hits[0].content.contains("calculate"));
let hits = client.search("entr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
Ok(())
}
#[test]
fn search_matches_snake_case() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("code".into()),
workspace: None,
source_path: dir.path().join("c.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "check the my_variable_name please".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("vari", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
let hits = client.search(
"my_variable",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
Ok(())
}
#[test]
fn search_matches_symbols_stripped() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("symbols".into()),
workspace: None,
source_path: dir.path().join("s.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "working with c++ and foo.bar today".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("c++", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
let hits = client.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
Ok(())
}
#[test]
fn search_sets_match_type_for_wildcards() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("handlers".into()),
workspace: None,
source_path: dir.path().join("h.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "the request handler delegates".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let exact = client.search("handler", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(exact[0].match_type, MatchType::Exact);
let prefix = client.search("hand*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(prefix[0].match_type, MatchType::Prefix);
let suffix = client.search("*handler", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(suffix[0].match_type, MatchType::Suffix);
let substring =
client.search("*andle*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(substring[0].match_type, MatchType::Substring);
Ok(())
}
#[test]
fn search_with_fallback_marks_implicit_wildcard() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("handlers".into()),
workspace: None,
source_path: dir.path().join("h2.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "the request handler delegates".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"andle",
SearchFilters::default(),
10,
0,
2,
FieldMask::FULL,
)?;
assert!(result.wildcard_fallback);
assert_eq!(result.hits.len(), 1);
assert_eq!(result.hits[0].match_type, MatchType::ImplicitWildcard);
Ok(())
}
#[test]
fn sqlite_backend_skips_wildcard_queries() -> Result<()> {
let conn = Connection::open(":memory:")?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.search("*handler", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
assert!(
hits.is_empty(),
"wildcard should skip sqlite fallback, not error"
);
Ok(())
}
#[test]
fn sqlite_backend_handles_null_workspace() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 't', '/tmp/session.jsonl')",
)?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
params![
1_i64,
"auth token failure",
"t",
"codex",
"/tmp/session.jsonl",
42_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].workspace, "");
assert_eq!(hits[0].line_number, Some(1));
assert_eq!(hits[0].source_id, "local");
assert_eq!(hits[0].origin_kind, "local");
Ok(())
}
#[test]
fn sqlite_backend_supports_legacy_fts_message_id_schema() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
message_id UNINDEXED,
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/legacy')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, 1, 'local', NULL, 'legacy title', '/tmp/legacy.jsonl')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(42, 1, 4, 'legacy auth token failure', 99)",
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at, message_id)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
params![
1_i64,
"legacy auth token failure",
"legacy title",
"codex",
"/legacy",
"/tmp/legacy.jsonl",
99_i64,
42_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].title, "legacy title");
assert_eq!(hits[0].source_path, "/tmp/legacy.jsonl");
assert_eq!(hits[0].workspace, "/legacy");
assert_eq!(hits[0].line_number, Some(5));
assert_eq!(hits[0].content, "legacy auth token failure");
Ok(())
}
#[test]
fn tantivy_reader_skips_sqlite_fallback_on_empty_lexical_results() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
index.commit()?;
let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
assert!(
reader.is_some(),
"test fixture should open a Tantivy reader even with an empty index"
);
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/sqlite-only')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, 1, 'local', NULL, 'sqlite fallback only', '/tmp/sqlite-only.jsonl')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(1, 1, 0, 'sqliteonlytoken overflow candidate', 42)",
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
1_i64,
"sqliteonlytoken overflow candidate",
"sqlite fallback only",
"codex",
"/sqlite-only",
"/tmp/sqlite-only.jsonl",
42_i64
],
)?;
let client = SearchClient {
reader,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let sqlite_hits = client.search_sqlite_fts5(
Path::new(":memory:"),
"sqliteonlytoken",
SearchFilters::default(),
5,
0,
FieldMask::FULL,
)?;
assert_eq!(
sqlite_hits.len(),
1,
"fixture should prove sqlite fallback would have produced a hit"
);
let tantivy_authoritative_hits = client.search(
"sqliteonlytoken",
SearchFilters::default(),
5,
0,
FieldMask::FULL,
)?;
assert!(
tantivy_authoritative_hits.is_empty(),
"a live Tantivy reader should prevent sqlite fallback from populating empty lexical results"
);
Ok(())
}
#[test]
fn sqlite_guard_does_not_repair_fts_when_generation_key_stale() -> Result<()> {
let temp_dir = TempDir::new()?;
let db_path = temp_dir.path().join("stale-gen-fts.db");
{
let storage = FrankenStorage::open(&db_path)?;
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: None,
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent)?;
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(PathBuf::from("/tmp/workspace")),
external_id: Some("stale-gen-fts".into()),
title: Some("Stale FTS generation".into()),
source_path: PathBuf::from("/tmp/stale-gen-fts.jsonl"),
started_at: Some(1_700_000_000_000),
ended_at: Some(1_700_000_000_100),
approx_tokens: Some(42),
metadata_json: serde_json::Value::Null,
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(1_700_000_000_050),
content: "message that should remain queryable".into(),
extra_json: serde_json::Value::Null,
snippets: Vec::new(),
}],
source_id: "local".into(),
origin_host: None,
};
storage.insert_conversation_tree(agent_id, None, &conversation)?;
}
let count_before = sqlite_master_name_count(&db_path, "fts_messages")
.context("count schema rows before generation key deletion")?;
{
let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned())?;
conn.execute_compat(
"DELETE FROM meta WHERE key = ?1",
&[ParamValue::from("fts_frankensqlite_rebuild_generation")],
)?;
}
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: Some(db_path.clone()),
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let guard = client
.sqlite_guard()
.context("open sqlite guard for stale generation fixture")?;
assert!(guard.is_some(), "sqlite guard should open the db");
let conn = guard
.as_ref()
.expect("sqlite guard should hold a connection");
let no_params: [ParamValue; 0] = [];
let cache_size: i64 =
conn.query_row_map("PRAGMA cache_size;", &no_params, |row| row.get_typed(0))?;
assert_eq!(
cache_size, -SEARCH_SQLITE_HYDRATION_CACHE_KIB,
"search hydration should not inherit the general storage cache profile"
);
drop(guard);
let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned())?;
let generation_after: Option<String> = conn
.query_row_map(
"SELECT value FROM meta WHERE key = ?1",
&[ParamValue::from("fts_frankensqlite_rebuild_generation")],
|row| row.get_typed(0),
)
.optional()?;
assert!(
generation_after.is_none(),
"search sqlite guard must not mutate FTS rebuild metadata"
);
let count_after = sqlite_master_name_count(&db_path, "fts_messages")
.context("count schema rows after sqlite guard reopen")?;
assert_eq!(
count_after, count_before,
"read-only reopen must leave FTS schema state unchanged"
);
Ok(())
}
#[test]
fn sqlite_path_rusqlite_fallback_matches_hyphenated_ids_with_workspace_filter() -> Result<()> {
let temp_dir = TempDir::new()?;
let db_path = temp_dir.path().join("hyphenated-rusqlite-fallback.db");
{
let storage = FrankenStorage::open(&db_path)?;
storage.ensure_search_fallback_fts_consistency()?;
let conn = storage.raw();
conn.execute(
"INSERT INTO agents(id, slug, name, kind, created_at, updated_at)
VALUES(1, 'codex', 'Codex', 'codex', 1, 1)",
)?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws/alpha')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/ws/beta')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, 1, 'local', NULL, 'alpha bead', '/tmp/alpha.jsonl')",
)?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(2, 1, 2, 'local', NULL, 'beta bead', '/tmp/beta.jsonl')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(11, 1, 0, 'user', 'Need follow-up on br-123 root cause', 100)",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(12, 2, 0, 'user', 'Need follow-up on br-123 user report', 101)",
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
&[
ParamValue::from(11_i64),
ParamValue::from("Need follow-up on br-123 root cause"),
ParamValue::from("alpha bead"),
ParamValue::from("codex"),
ParamValue::from("/ws/alpha"),
ParamValue::from("/tmp/alpha.jsonl"),
ParamValue::from(100_i64),
],
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
&[
ParamValue::from(12_i64),
ParamValue::from("Need follow-up on br-123 user report"),
ParamValue::from("beta bead"),
ParamValue::from("codex"),
ParamValue::from("/ws/beta"),
ParamValue::from("/tmp/beta.jsonl"),
ParamValue::from(101_i64),
],
)?;
let preclose_total_rows = conn.query("SELECT rowid FROM fts_messages")?;
assert_eq!(
preclose_total_rows.len(),
2,
"freshly seeded file-backed FTS should retain the inserted rows"
);
let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
let preclose_rows = conn.query_with_params(
"SELECT rowid FROM fts_messages WHERE fts_messages MATCH ?",
¶ms_from_iter(vec![ParamValue::from(transpiled.as_str())]),
)?;
assert_eq!(
preclose_rows.len(),
2,
"freshly seeded file-backed FTS should match the transpiled hyphenated query before reopen"
);
}
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: Some(db_path),
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let guard = client.sqlite_guard()?;
let conn = guard.as_ref().expect("sqlite guard should reopen file db");
let reopened_total_rows = conn.query("SELECT rowid FROM fts_messages")?;
assert_eq!(
reopened_total_rows.len(),
2,
"reopened file-backed FTS should still contain the seeded rows"
);
let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
let raw_rows = conn.query_with_params(
"SELECT rowid FROM fts_messages WHERE fts_messages MATCH ?",
¶ms_from_iter(vec![ParamValue::from(transpiled.as_str())]),
)?;
assert_eq!(
raw_rows.len(),
2,
"reopened file-backed FTS should still match the transpiled hyphenated query"
);
drop(guard);
let all_hits = client.search("br-123", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(all_hits.len(), 2);
assert!(
all_hits.iter().all(|hit| hit.content.contains("br-123")),
"hyphenated bead IDs should survive the file-backed sqlite fallback path"
);
let leading_or_hits = client.search(
"OR br-123",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(leading_or_hits.len(), 2);
let dotted_hits = client.search(
"br-123.jsonl",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(dotted_hits.len(), 2);
let dotted_prefix_hits = client.search(
"br-123.json*",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(dotted_prefix_hits.len(), 2);
let prefix_hits =
client.search("br-12*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(prefix_hits.len(), 2);
let filtered_hits = client.search(
"br-123",
SearchFilters {
workspaces: HashSet::from_iter(["/ws/beta".to_string()]),
..SearchFilters::default()
},
10,
0,
FieldMask::FULL,
)?;
assert_eq!(filtered_hits.len(), 1);
assert_eq!(filtered_hits[0].workspace, "/ws/beta");
assert_eq!(filtered_hits[0].source_path, "/tmp/beta.jsonl");
assert!(filtered_hits[0].content.contains("br-123"));
Ok(())
}
#[test]
fn sqlite_backend_orders_hits_by_bm25_score() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'best', '/tmp/best.jsonl')",
)?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'worse', '/tmp/worse.jsonl')",
)?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(7, 1, 0, 'auth auth auth failure', 42)")?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(8, 2, 0, 'auth failure', 43)")?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
7_i64,
"auth auth auth failure",
"best",
"codex",
"/ws",
"/tmp/best.jsonl",
42_i64
],
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
8_i64,
"auth failure",
"worse",
"codex",
"/ws",
"/tmp/worse.jsonl",
43_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let direct_hits = client.search_sqlite_fts5(
Path::new(":memory:"),
"auth",
SearchFilters::default(),
5,
0,
FieldMask::FULL,
)?;
assert_eq!(direct_hits.len(), 2);
let hits = client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 2);
assert_eq!(hits[0].title, "best");
assert_eq!(hits[1].title, "worse");
assert!(hits[0].score > hits[1].score);
Ok(())
}
#[test]
fn sqlite_fts5_ranked_phase_defers_content_decode_until_after_limit() {
let (rank_sql, params) =
SearchClient::sqlite_fts5_rank_query("auth", &SearchFilters::default(), 50, 0, false);
let hydrate_sql = SearchClient::sqlite_fts5_hydrate_query(
2,
FieldMask::new(true, true, true, true),
false,
);
assert!(
!rank_sql.contains("fts_messages.content"),
"rank query must not decode large content rows before LIMIT"
);
assert!(
hydrate_sql.contains("fts_messages.content"),
"hydration query should still provide requested content"
);
assert!(
rank_sql.contains("LIMIT ? OFFSET ?"),
"rank query must apply page bounds before hydration"
);
assert_eq!(params.len(), 3, "fts query plus limit and offset params");
}
#[test]
fn sqlite_fts5_hydration_chunks_stay_below_bind_variable_limit() {
let oversized_row_count = SQLITE_MAX_VARIABLE_NUMBER + 1;
let unchunked_sql = SearchClient::sqlite_fts5_hydrate_query(
oversized_row_count,
FieldMask::new(true, true, true, true),
false,
);
assert!(
unchunked_sql.matches('?').count() > SQLITE_MAX_VARIABLE_NUMBER,
"the pre-fix one-shot hydration query would exceed frankensqlite's bind limit"
);
let ranked_rows: Vec<(i64, f64)> = (0..(SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17))
.map(|idx| (idx as i64, idx as f64))
.collect();
let chunk_sizes: Vec<usize> = SearchClient::sqlite_fts5_hydrate_row_chunks(&ranked_rows)
.map(<[(i64, f64)]>::len)
.collect();
assert_eq!(
chunk_sizes,
vec![SQLITE_FTS5_HYDRATE_PARAM_CHUNK, 17],
"large fallback pages must hydrate in bounded chunks while preserving rank windows"
);
assert!(
chunk_sizes
.iter()
.all(|chunk_size| *chunk_size <= SQLITE_MAX_VARIABLE_NUMBER),
"every hydration chunk must fit under frankensqlite's bind-variable ceiling"
);
}
#[test]
fn tantivy_fallback_hydration_narrows_by_normalized_source_before_message_lookup() -> Result<()>
{
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
source_id TEXT,
origin_host TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
content TEXT NOT NULL,
UNIQUE(conversation_id, idx)
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute(
"INSERT INTO conversations(id, source_id, origin_host, source_path)
VALUES(1, '', 'devbox', '/tmp/shared-fallback.jsonl')",
)?;
conn.execute(
"INSERT INTO conversations(id, source_id, origin_host, source_path)
VALUES(2, 'local', NULL, '/tmp/shared-fallback.jsonl')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, content)
VALUES(10, 1, 2, 'remote fallback content')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, content)
VALUES(20, 2, 2, 'local content must not win')",
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let fallback_key = (
"devbox".to_string(),
"/tmp/shared-fallback.jsonl".to_string(),
2,
);
let (_, hydrated_fallback) =
client.hydrate_tantivy_hit_contents(&[], std::slice::from_ref(&fallback_key))?;
assert_eq!(
hydrated_fallback.get(&fallback_key).map(String::as_str),
Some("remote fallback content")
);
Ok(())
}
#[test]
fn exact_content_hydration_returns_only_requested_message_indices() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER NOT NULL,
content TEXT NOT NULL,
UNIQUE(conversation_id, idx)
);",
)?;
for idx in 0..8 {
conn.execute(&format!(
"INSERT INTO messages(conversation_id, idx, content)
VALUES(1, {idx}, 'conversation one row {idx}')"
))?;
}
conn.execute(
"INSERT INTO messages(conversation_id, idx, content)
VALUES(2, 0, 'conversation two row 0')",
)?;
let hydrated =
hydrate_message_content_by_conversation(&conn, &[(1, 6), (1, 2), (2, 0), (1, 99)])?;
assert_eq!(hydrated.len(), 3);
assert_eq!(
hydrated.get(&(1, 2)).map(String::as_str),
Some("conversation one row 2")
);
assert_eq!(
hydrated.get(&(1, 6)).map(String::as_str),
Some("conversation one row 6")
);
assert_eq!(
hydrated.get(&(2, 0)).map(String::as_str),
Some("conversation two row 0")
);
assert!(!hydrated.contains_key(&(1, 99)));
Ok(())
}
#[test]
fn sqlite_backend_generates_snippet_from_content() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'snippet title', '/tmp/snippet.jsonl')",
)?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'alpha beta gamma delta epsilon zeta eta theta', 42)")?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
1_i64,
"alpha beta gamma delta epsilon zeta eta theta",
"snippet title",
"codex",
"/ws",
"/tmp/snippet.jsonl",
42_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.search("delta", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].snippet, snippet_from_content(&hits[0].content));
assert!(hits[0].snippet.contains("delta"));
Ok(())
}
#[test]
fn sqlite_backend_respects_source_filter() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO sources(id, kind) VALUES('laptop', 'ssh')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/local')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/remote')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, ' local ', NULL, 'local title', '/tmp/local.jsonl')",
)?;
conn.execute("INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 2, 'laptop', 'dev@laptop', 'remote title', '/tmp/remote.jsonl')")?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
1_i64,
"auth token failure",
"local title",
"codex",
"/local",
"/tmp/local.jsonl",
42_i64
],
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
2_i64,
"auth token failure",
"remote title",
"codex",
"/remote",
"/tmp/remote.jsonl",
43_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let local_hits = client.browse_by_date(
SearchFilters {
source_filter: SourceFilter::Local,
..SearchFilters::default()
},
5,
0,
true,
FieldMask::FULL,
)?;
assert_eq!(local_hits.len(), 1);
assert_eq!(local_hits[0].source_id, "local");
let remote_hits = client.browse_by_date(
SearchFilters {
source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
..SearchFilters::default()
},
5,
0,
true,
FieldMask::FULL,
)?;
assert_eq!(remote_hits.len(), 1);
assert_eq!(remote_hits[0].source_id, "local");
assert_eq!(remote_hits[0].origin_kind, "local");
Ok(())
}
#[test]
fn sqlite_backend_remote_source_filter_matches_blank_source_id_with_origin_host() -> Result<()>
{
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, ' ', 'dev@laptop', 'remote title', '/tmp/remote-filter.jsonl')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(1, 1, 0, 'remote filter proof', 42)",
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
params![
1_i64,
"remote filter proof",
"remote title",
"codex",
"/tmp/remote-filter.jsonl",
42_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let remote_hits = client.search(
"remote",
SearchFilters {
source_filter: SourceFilter::Remote,
..Default::default()
},
5,
0,
FieldMask::FULL,
)?;
assert_eq!(remote_hits.len(), 1);
assert_eq!(remote_hits[0].source_id, "dev@laptop");
assert_eq!(remote_hits[0].origin_kind, "remote");
assert_eq!(remote_hits[0].origin_host.as_deref(), Some("dev@laptop"));
let source_hits = client.search(
"remote",
SearchFilters {
source_filter: SourceFilter::SourceId("dev@laptop".into()),
..Default::default()
},
5,
0,
FieldMask::FULL,
)?;
assert_eq!(source_hits.len(), 1);
assert_eq!(source_hits[0].source_id, "dev@laptop");
assert_eq!(source_hits[0].origin_kind, "remote");
Ok(())
}
#[test]
fn sqlite_backend_workspace_filter_matches_null_workspace_as_empty_string() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
content='',
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/named')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 'null workspace', '/tmp/null-workspace.jsonl')",
)?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'named workspace', '/tmp/named-workspace.jsonl')",
)?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
params![
1_i64,
"auth token failure",
"null workspace",
"codex",
"/tmp/null-workspace.jsonl",
42_i64
],
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
params![
2_i64,
"auth token failure",
"named workspace",
"codex",
"/named",
"/tmp/named-workspace.jsonl",
43_i64
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.search(
"auth",
SearchFilters {
workspaces: HashSet::from_iter([String::new()]),
..SearchFilters::default()
},
5,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].workspace, "");
assert_eq!(hits[0].source_path, "/tmp/null-workspace.jsonl");
Ok(())
}
#[test]
fn browse_by_date_treats_null_workspace_and_source_as_local() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, NULL, NULL, 'browse title', '/tmp/browse.jsonl')",
)?;
conn.execute(
"INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(1, 1, 0, 'browse auth token failure', 123)",
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.browse_by_date(
SearchFilters {
workspaces: HashSet::from_iter([String::new()]),
source_filter: SourceFilter::Local,
..SearchFilters::default()
},
5,
0,
true,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].workspace, "");
assert_eq!(hits[0].source_id, "local");
assert_eq!(hits[0].origin_kind, "local");
Ok(())
}
#[test]
fn hydrate_semantic_hits_with_ids_snippet_only_uses_full_content_for_snippets_and_identity()
-> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
VALUES(1, 1, NULL, 'local', NULL, 'semantic title', '/tmp/semantic.jsonl', 100)",
)?;
let shared_prefix = "shared-prefix ".repeat(32);
let first = format!("{shared_prefix}first unique semantic tail");
let second = format!("{shared_prefix}second unique semantic tail");
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
&[
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Integer(0),
fsqlite_types::value::SqliteValue::Text(first.clone().into()),
fsqlite_types::value::SqliteValue::Integer(101),
],
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
&[
fsqlite_types::value::SqliteValue::Integer(2),
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text(second.clone().into()),
fsqlite_types::value::SqliteValue::Integer(102),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.hydrate_semantic_hits_with_ids(
&[
VectorSearchResult {
message_id: 1,
chunk_idx: 0,
score: 0.9,
},
VectorSearchResult {
message_id: 2,
chunk_idx: 0,
score: 0.8,
},
],
FieldMask::new(false, true, true, true),
)?;
assert_eq!(hits.len(), 2);
assert!(hits.iter().all(|(_, hit)| hit.content.is_empty()));
assert!(hits.iter().all(|(_, hit)| !hit.snippet.is_empty()));
assert_ne!(hits[0].1.content_hash, hits[1].1.content_hash);
Ok(())
}
#[test]
fn hydrate_semantic_hits_with_ids_normalizes_trimmed_local_source_metadata() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
VALUES(1, 1, NULL, ' local ', NULL, 'trimmed local semantic', '/tmp/trimmed-local-semantic.jsonl', 100)",
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 101)",
&[
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text("trimmed local semantic body".into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.hydrate_semantic_hits_with_ids(
&[VectorSearchResult {
message_id: 1,
chunk_idx: 0,
score: 0.9,
}],
FieldMask::new(false, true, true, true),
)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].1.source_id, "local");
assert_eq!(hits[0].1.origin_kind, "local");
Ok(())
}
#[test]
fn hydrate_semantic_hits_with_ids_preserves_remote_origin_without_source_row() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
VALUES(1, 1, NULL, 'laptop', 'dev@laptop', 'remote semantic', '/tmp/remote-semantic.jsonl', 100)",
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 101)",
&[
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text("remote semantic body".into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.hydrate_semantic_hits_with_ids(
&[VectorSearchResult {
message_id: 1,
chunk_idx: 0,
score: 0.9,
}],
FieldMask::new(false, true, true, true),
)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].1.source_id, "laptop");
assert_eq!(hits[0].1.origin_kind, "remote");
assert_eq!(hits[0].1.origin_host.as_deref(), Some("dev@laptop"));
Ok(())
}
#[test]
fn resolve_semantic_doc_ids_for_hits_distinguishes_same_source_path_line_by_content_hash()
-> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
)?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
)?;
let first = "same prefix first tail".to_string();
let second = "same prefix second tail".to_string();
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(11),
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text(first.clone().into()),
],
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(22),
fsqlite_types::value::SqliteValue::Integer(2),
fsqlite_types::value::SqliteValue::Text(second.clone().into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let first_hit = SearchHit {
title: "Shared Session".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(
&first,
"/tmp/progressive-shared.jsonl",
Some(1),
Some(100),
),
score: 0.0,
source_path: "/tmp/progressive-shared.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let second_hit = SearchHit {
title: "Shared Session".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(
&second,
"/tmp/progressive-shared.jsonl",
Some(1),
Some(100),
),
score: 0.0,
source_path: "/tmp/progressive-shared.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
assert_eq!(resolved.len(), 2);
assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
assert_ne!(
resolved[0].as_ref().map(|hit| hit.doc_id.as_str()),
resolved[1].as_ref().map(|hit| hit.doc_id.as_str())
);
Ok(())
}
#[test]
fn hydrate_semantic_hits_with_ids_keeps_missing_title_empty() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL,
started_at INTEGER
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
VALUES(1, 1, NULL, 'local', NULL, NULL, '/tmp/untitled-semantic.jsonl', 100)",
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 101)",
&[
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text("untitled semantic body".into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.hydrate_semantic_hits_with_ids(
&[VectorSearchResult {
message_id: 1,
chunk_idx: 0,
score: 0.9,
}],
FieldMask::new(false, true, true, true),
)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].1.title, "");
Ok(())
}
#[test]
fn resolve_semantic_doc_ids_for_hits_prefers_conversation_id_over_ambiguous_provenance()
-> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
)?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
)?;
let content = "same ambiguous content".to_string();
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(11),
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text(content.clone().into()),
],
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(22),
fsqlite_types::value::SqliteValue::Integer(2),
fsqlite_types::value::SqliteValue::Text(content.clone().into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let first_hit = SearchHit {
title: "Shared Session".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(
&content,
"/tmp/progressive-conversation-id.jsonl",
Some(1),
Some(100),
),
score: 0.0,
source_path: "/tmp/progressive-conversation-id.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: Some(1),
};
let second_hit = SearchHit {
conversation_id: Some(2),
..first_hit.clone()
};
let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
assert_eq!(resolved.len(), 2);
assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
Ok(())
}
#[test]
fn resolve_semantic_doc_ids_for_hits_treats_null_source_as_local() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, NULL, NULL, 'Legacy Local', '/tmp/legacy-local.jsonl')",
)?;
let content = "legacy local semantic message".to_string();
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(11),
fsqlite_types::value::SqliteValue::Text(content.clone().into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "Legacy Local".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(&content, "/tmp/legacy-local.jsonl", Some(1), Some(100)),
score: 0.0,
source_path: "/tmp/legacy-local.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
assert_eq!(resolved.len(), 1);
assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
Ok(())
}
#[test]
fn resolve_semantic_doc_ids_for_hits_matches_trimmed_local_source_id() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, ' local ', NULL, 'Trimmed Local', '/tmp/trimmed-local.jsonl')",
)?;
let content = "trimmed local semantic message".to_string();
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(11),
fsqlite_types::value::SqliteValue::Text(content.clone().into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "Trimmed Local".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(&content, "/tmp/trimmed-local.jsonl", Some(1), Some(100)),
score: 0.0,
source_path: "/tmp/trimmed-local.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
assert_eq!(resolved.len(), 1);
assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
Ok(())
}
#[test]
fn resolve_semantic_doc_ids_for_hits_normalizes_blank_local_source_id() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, 'local', NULL, 'Blank Local', '/tmp/blank-local.jsonl')",
)?;
let content = "blank local semantic message".to_string();
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(11),
fsqlite_types::value::SqliteValue::Text(content.clone().into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "Blank Local".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(&content, "/tmp/blank-local.jsonl", Some(1), Some(100)),
score: 0.0,
source_path: "/tmp/blank-local.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: " ".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
assert_eq!(resolved.len(), 1);
assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
Ok(())
}
#[test]
fn resolve_semantic_doc_ids_for_hits_infers_remote_source_from_origin_host_when_source_id_blank()
-> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, ' ', 'dev@laptop', 'Legacy Remote', '/tmp/legacy-remote.jsonl')",
)?;
let content = "legacy remote semantic message".to_string();
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 100)",
&[
fsqlite_types::value::SqliteValue::Integer(11),
fsqlite_types::value::SqliteValue::Text(content.clone().into()),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "Legacy Remote".into(),
snippet: String::new(),
content: String::new(),
content_hash: stable_hit_hash(&content, "/tmp/legacy-remote.jsonl", Some(1), Some(100)),
score: 0.0,
source_path: "/tmp/legacy-remote.jsonl".into(),
agent: "codex".into(),
workspace: String::new(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "dev@laptop".into(),
origin_kind: "remote".into(),
origin_host: Some("dev@laptop".into()),
conversation_id: None,
};
let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
assert_eq!(resolved.len(), 1);
assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
Ok(())
}
#[test]
fn browse_by_date_snippet_only_uses_full_content_for_hit_identity() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
)?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, 'local', NULL, 'browse title', '/tmp/browse-shared.jsonl')",
)?;
let shared_prefix = "shared-prefix ".repeat(48);
let first = format!("{shared_prefix}first browse-only tail");
let second = format!("{shared_prefix}second browse-only tail");
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(?1, 1, ?2, ?3, ?4)",
&[
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Integer(0),
fsqlite_types::value::SqliteValue::Text(first.clone().into()),
fsqlite_types::value::SqliteValue::Integer(101),
],
)?;
conn.execute_with_params(
"INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(?1, 1, ?2, ?3, ?4)",
&[
fsqlite_types::value::SqliteValue::Integer(2),
fsqlite_types::value::SqliteValue::Integer(1),
fsqlite_types::value::SqliteValue::Text(second.clone().into()),
fsqlite_types::value::SqliteValue::Integer(102),
],
)?;
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.browse_by_date(
SearchFilters::default(),
10,
0,
true,
FieldMask::new(false, true, true, true),
)?;
assert_eq!(hits.len(), 2);
assert!(hits.iter().all(|hit| hit.content.is_empty()));
assert!(hits.iter().all(|hit| !hit.snippet.is_empty()));
assert_ne!(hits[0].content_hash, hits[1].content_hash);
Ok(())
}
#[test]
fn cache_invalidates_on_new_data() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv1 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("first".into()),
workspace: None,
source_path: dir.path().join("1.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "apple banana".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv1)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].content, "apple banana");
{
let cache = client.prefix_cache.lock().unwrap();
let shard = cache.shard_opt("global").unwrap();
assert!(shard.contains(&client.cache_key("app", &SearchFilters::default())));
}
let conv2 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("second".into()),
workspace: None,
source_path: dir.path().join("2.jsonl"),
started_at: Some(2),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(2),
content: "apricot".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv2)?;
index.commit()?;
std::thread::sleep(std::time::Duration::from_millis(350));
let _hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
let hits = client.search("apr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].content, "apricot");
Ok(())
}
#[test]
fn track_generation_clears_cache_on_change() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "hello world".into(),
snippet: "hello".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let hits = vec![hit];
client.put_cache("hello", &SearchFilters::default(), &hits);
{
let cache = client.prefix_cache.lock().unwrap();
assert!(!cache.shards.is_empty());
}
client.track_generation(1);
{
let cache = client.prefix_cache.lock().unwrap();
assert!(!cache.shards.is_empty());
}
client.track_generation(2);
{
let cache = client.prefix_cache.lock().unwrap();
assert!(cache.shards.is_empty());
}
}
#[test]
fn cache_total_cap_evicts_across_shards() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(2, 0)), reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "a".into(),
snippet: "a".into(),
content: "a".into(),
content_hash: stable_content_hash("a"),
score: 1.0,
source_path: "p".into(),
agent: "agent1".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let hits = vec![hit.clone()];
let mut filters = SearchFilters::default();
filters.agents.insert("agent1".into());
client.put_cache("a", &filters, &hits);
filters.agents.clear();
filters.agents.insert("agent2".into());
client.put_cache("b", &filters, &hits);
filters.agents.clear();
filters.agents.insert("agent3".into());
client.put_cache("c", &filters, &hits);
let stats = client.cache_stats();
assert!(stats.total_cost <= stats.total_cap);
assert_eq!(stats.total_cap, 2);
}
#[test]
fn cache_stats_reflect_metrics() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
client.metrics.inc_cache_hits();
client.metrics.inc_cache_miss();
client.metrics.inc_cache_shortfall();
client.metrics.record_reload(Duration::from_millis(10));
let stats = client.cache_stats();
assert_eq!(stats.cache_hits, 1);
assert_eq!(stats.cache_miss, 1);
assert_eq!(stats.cache_shortfall, 1);
assert_eq!(stats.reloads, 1);
assert_eq!(stats.reload_ms_total, 10);
assert_eq!(stats.total_cap, *CACHE_TOTAL_CAP);
assert_eq!(stats.eviction_policy, "lru");
assert_eq!(stats.prewarm_scheduled, 0);
assert_eq!(stats.prewarm_skipped_pressure, 0);
assert_eq!(CacheStats::default().eviction_policy, "unknown");
}
#[test]
fn adaptive_query_prewarm_schedules_only_after_hot_prefix_cache_entry() {
let (tx, rx) = mpsc::unbounded();
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(10, 0)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: Some(tx),
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let mut filters = SearchFilters::default();
filters.workspaces.insert("/tmp/cass-workspace".into());
client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
assert!(
rx.try_recv().is_err(),
"cold prefixes should not schedule adaptive prewarm"
);
let mut hit = projected_minimal_fields_search_hit("hello title", "p");
hit.snippet = "hello".into();
hit.content = "hello world".into();
hit.content_hash = stable_content_hash(&hit.content);
client.put_cache("hel", &filters, std::slice::from_ref(&hit));
let total_cost_before = client.cache_stats().total_cost;
client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
assert!(
rx.try_recv().is_err(),
"an exact cached query should not schedule redundant prewarm"
);
client.maybe_schedule_adaptive_query_prewarm("hello", &filters);
let job = rx
.try_recv()
.expect("hot prefix should schedule adaptive prewarm");
assert_eq!(job.query, "hello");
assert_eq!(job.shard_name, "workspace:/tmp/cass-workspace");
assert_eq!(job.filters_fingerprint, filters_fingerprint(&filters));
let stats = client.cache_stats();
assert_eq!(stats.prewarm_scheduled, 1);
assert_eq!(stats.prewarm_skipped_pressure, 0);
assert_eq!(
stats.total_cost, total_cost_before,
"prewarm scheduling should not mutate result-cache contents"
);
}
#[test]
fn adaptive_query_prewarm_skips_when_cache_byte_cap_is_under_pressure() {
let mut hit = projected_minimal_fields_search_hit("hello title", "p");
hit.snippet = "hello".into();
hit.content = "hello world with enough content to consume the small byte budget".into();
hit.content_hash = stable_content_hash(&hit.content);
let byte_cap = cached_hit_from(&hit).approx_bytes();
let (tx, rx) = mpsc::unbounded();
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(10, byte_cap)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: Some(tx),
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let filters = SearchFilters::default();
client.put_cache("hel", &filters, std::slice::from_ref(&hit));
client.maybe_schedule_adaptive_query_prewarm("zebra", &filters);
assert_eq!(
client.cache_stats().prewarm_skipped_pressure,
0,
"cold queries should not be counted as pressure-skipped prewarm jobs"
);
client.maybe_schedule_adaptive_query_prewarm("hello", &filters);
assert!(
rx.try_recv().is_err(),
"prewarm should be disabled while cache byte pressure is high"
);
let stats = client.cache_stats();
assert_eq!(stats.prewarm_scheduled, 0);
assert_eq!(stats.prewarm_skipped_pressure, 1);
assert!(stats.approx_bytes <= stats.byte_cap);
}
#[test]
fn cache_eviction_count_tracks_evictions() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(2, 0)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hit = SearchHit {
title: "test".into(),
snippet: "snippet".into(),
content: "content".into(),
content_hash: stable_content_hash("content"),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
client.put_cache(
"query1",
&SearchFilters::default(),
std::slice::from_ref(&hit),
);
client.put_cache(
"query2",
&SearchFilters::default(),
std::slice::from_ref(&hit),
);
client.put_cache(
"query3",
&SearchFilters::default(),
std::slice::from_ref(&hit),
);
let stats = client.cache_stats();
assert!(
stats.eviction_count >= 1,
"should have evicted at least 1 entry"
);
assert!(stats.total_cost <= 2, "should be at or below cap");
assert!(stats.approx_bytes > 0, "should track bytes used");
}
#[test]
fn default_cache_byte_cap_scales_with_available_memory() {
let gib = 1024_u64 * 1024 * 1024;
assert_eq!(
default_cache_byte_cap_for_available(None),
DEFAULT_CACHE_BYTE_CAP_FALLBACK
);
assert_eq!(
default_cache_byte_cap_for_available(Some(2 * gib)),
DEFAULT_CACHE_BYTE_CAP_FALLBACK,
"small hosts keep a conservative cache byte budget"
);
assert_eq!(
default_cache_byte_cap_for_available(Some(64 * gib)),
512 * 1024 * 1024,
"larger hosts get a proportionally larger cache byte budget"
);
assert_eq!(
default_cache_byte_cap_for_available(Some(256 * gib)),
usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX),
"large swarm hosts still have a bounded default cache budget"
);
}
#[test]
fn malformed_cache_byte_cap_env_uses_default_instead_of_disabling_guard() {
let gib = 1024_u64 * 1024 * 1024;
assert_eq!(cache_byte_cap_from_env_value(Some("0"), Some(64 * gib)), 0);
assert_eq!(
cache_byte_cap_from_env_value(Some("not-a-number"), Some(64 * gib)),
default_cache_byte_cap_for_available(Some(64 * gib)),
"malformed env should keep the default memory guard active"
);
assert_eq!(
cache_byte_cap_from_env_value(None, Some(64 * gib)),
default_cache_byte_cap_for_available(Some(64 * gib))
);
}
#[test]
fn cache_eviction_policy_env_defaults_to_lru_and_accepts_s3_fifo() {
assert_eq!(
cache_eviction_policy_from_env_value(None),
CacheEvictionPolicy::Lru
);
assert_eq!(
cache_eviction_policy_from_env_value(Some("not-a-policy")),
CacheEvictionPolicy::Lru,
"malformed env keeps the current LRU behavior"
);
assert_eq!(
cache_eviction_policy_from_env_value(Some("s3-fifo")),
CacheEvictionPolicy::S3Fifo
);
assert_eq!(
cache_eviction_policy_from_env_value(Some("s3_fifo")),
CacheEvictionPolicy::S3Fifo
);
}
#[test]
fn s3_fifo_admission_rejects_one_off_byte_heavy_entries_then_admits_ghost_replay() {
let content = "large".repeat(1_000);
let hit = SearchHit {
title: "large".into(),
snippet: "large".into(),
content: content.clone(),
content_hash: stable_content_hash(&content),
score: 1.0,
source_path: "large-path".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let cached = cached_hit_from(&hit);
let byte_cap = cached.approx_bytes() + 1_024;
assert!(
cached.approx_bytes() > byte_cap.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR)
);
let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::S3Fifo);
let key = Arc::<str>::from("large-query");
cache.put("global", key.clone(), vec![cached.clone()]);
assert_eq!(
cache.total_cost(),
0,
"first one-off large entry is not admitted"
);
assert_eq!(cache.ghost_entries(), 1);
assert_eq!(cache.admission_rejects(), 1);
cache.put("global", key, vec![cached]);
assert_eq!(
cache.total_cost(),
1,
"ghost replay admits the repeated query"
);
assert_eq!(cache.ghost_entries(), 0);
assert!(cache.ghost_keys.is_empty());
assert_eq!(cache.admission_rejects(), 1);
assert!(cache.total_bytes() <= cache.byte_cap());
}
#[test]
fn lru_policy_keeps_admitting_large_entries_under_existing_caps() {
let content = "large".repeat(1_000);
let hit = SearchHit {
title: "large".into(),
snippet: "large".into(),
content: content.clone(),
content_hash: stable_content_hash(&content),
score: 1.0,
source_path: "large-path".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let cached = cached_hit_from(&hit);
let byte_cap = cached.approx_bytes() + 1_024;
let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::Lru);
cache.put("global", Arc::<str>::from("large-query"), vec![cached]);
assert_eq!(cache.total_cost(), 1);
assert_eq!(cache.ghost_entries(), 0);
assert_eq!(cache.admission_rejects(), 0);
assert_eq!(cache.policy_label(), "lru");
}
#[test]
fn cache_byte_cap_triggers_eviction() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(1000, 100)), reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let content = "c".repeat(100);
let hit = SearchHit {
title: "a".repeat(50),
snippet: "b".repeat(50),
content: content.clone(), content_hash: stable_content_hash(&content),
score: 1.0,
source_path: "p".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
client.put_cache("q1", &SearchFilters::default(), std::slice::from_ref(&hit));
client.put_cache("q2", &SearchFilters::default(), std::slice::from_ref(&hit));
client.put_cache("q3", &SearchFilters::default(), std::slice::from_ref(&hit));
let stats = client.cache_stats();
assert!(
stats.eviction_count >= 1,
"byte cap should trigger evictions"
);
assert_eq!(stats.byte_cap, 100, "byte cap should be reported");
}
#[test]
fn cache_byte_pressure_evicts_byte_heavy_shard_before_small_entries() {
let small_hit = SearchHit {
title: "small".into(),
snippet: "small".into(),
content: "small".into(),
content_hash: stable_content_hash("small"),
score: 1.0,
source_path: "small-path".into(),
agent: "a".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let large_content = "large".repeat(2_000);
let large_hit = SearchHit {
title: "large".into(),
snippet: "large".into(),
content: large_content.clone(),
content_hash: stable_content_hash(&large_content),
score: 1.0,
source_path: "large-path".into(),
agent: "b".into(),
workspace: "w".into(),
workspace_original: None,
created_at: None,
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let mut cache = CacheShards::new(100, 1_024);
cache.put(
"small",
Arc::<str>::from("small-1"),
vec![cached_hit_from(&small_hit)],
);
cache.put(
"small",
Arc::<str>::from("small-2"),
vec![cached_hit_from(&small_hit)],
);
cache.put(
"large",
Arc::<str>::from("large-1"),
vec![cached_hit_from(&large_hit)],
);
assert_eq!(
cache.shard_opt("small").map(LruCache::len),
Some(2),
"byte pressure should preserve the small shard"
);
assert!(
cache.shard_opt("large").is_none_or(LruCache::is_empty),
"oversized shard should be evicted first under byte pressure"
);
assert!(cache.total_bytes() <= cache.byte_cap());
}
#[test]
fn wildcard_pattern_parse_exact() {
assert_eq!(
FsCassWildcardPattern::parse("hello"),
FsCassWildcardPattern::Exact("hello".into())
);
assert_eq!(
FsCassWildcardPattern::parse("HELLO"),
FsCassWildcardPattern::Exact("hello".into()) );
assert_eq!(
FsCassWildcardPattern::parse("FooBar123"),
FsCassWildcardPattern::Exact("foobar123".into())
);
}
#[test]
fn wildcard_pattern_parse_prefix() {
assert_eq!(
FsCassWildcardPattern::parse("foo*"),
FsCassWildcardPattern::Prefix("foo".into())
);
assert_eq!(
FsCassWildcardPattern::parse("CONFIG*"),
FsCassWildcardPattern::Prefix("config".into())
);
assert_eq!(
FsCassWildcardPattern::parse("test*"),
FsCassWildcardPattern::Prefix("test".into())
);
}
#[test]
fn wildcard_pattern_parse_suffix() {
assert_eq!(
FsCassWildcardPattern::parse("*foo"),
FsCassWildcardPattern::Suffix("foo".into())
);
assert_eq!(
FsCassWildcardPattern::parse("*Error"),
FsCassWildcardPattern::Suffix("error".into())
);
assert_eq!(
FsCassWildcardPattern::parse("*Handler"),
FsCassWildcardPattern::Suffix("handler".into())
);
}
#[test]
fn wildcard_pattern_parse_substring() {
assert_eq!(
FsCassWildcardPattern::parse("*foo*"),
FsCassWildcardPattern::Substring("foo".into())
);
assert_eq!(
FsCassWildcardPattern::parse("*CONFIG*"),
FsCassWildcardPattern::Substring("config".into())
);
assert_eq!(
FsCassWildcardPattern::parse("*test*"),
FsCassWildcardPattern::Substring("test".into())
);
}
#[test]
fn wildcard_pattern_parse_edge_cases() {
assert_eq!(
FsCassWildcardPattern::parse("*"),
FsCassWildcardPattern::Exact(String::new())
);
assert_eq!(
FsCassWildcardPattern::parse("**"),
FsCassWildcardPattern::Exact(String::new())
);
assert_eq!(
FsCassWildcardPattern::parse("***"),
FsCassWildcardPattern::Exact(String::new())
);
assert_eq!(
FsCassWildcardPattern::parse("*a*"),
FsCassWildcardPattern::Substring("a".into())
);
assert_eq!(
FsCassWildcardPattern::parse("a*"),
FsCassWildcardPattern::Prefix("a".into())
);
assert_eq!(
FsCassWildcardPattern::parse("*a"),
FsCassWildcardPattern::Suffix("a".into())
);
assert_eq!(
FsCassWildcardPattern::parse("***foo***"),
FsCassWildcardPattern::Substring("foo".into())
);
}
#[test]
fn wildcard_pattern_to_regex_suffix() {
let pattern = FsCassWildcardPattern::Suffix("foo".into());
assert_eq!(pattern.to_regex(), Some(".*foo$".into()));
}
#[test]
fn wildcard_pattern_to_regex_substring() {
let pattern = FsCassWildcardPattern::Substring("bar".into());
assert_eq!(pattern.to_regex(), Some(".*bar.*".into()));
}
#[test]
fn wildcard_pattern_to_regex_exact_prefix_none() {
let exact = FsCassWildcardPattern::Exact("foo".into());
assert_eq!(exact.to_regex(), None);
let prefix = FsCassWildcardPattern::Prefix("bar".into());
assert_eq!(prefix.to_regex(), None);
}
#[test]
fn match_type_quality_factors() {
assert_eq!(MatchType::Exact.quality_factor(), 1.0);
assert_eq!(MatchType::Prefix.quality_factor(), 0.9);
assert_eq!(MatchType::Suffix.quality_factor(), 0.8);
assert_eq!(MatchType::Substring.quality_factor(), 0.7);
assert_eq!(MatchType::ImplicitWildcard.quality_factor(), 0.6);
}
#[test]
fn dominant_match_type_single_terms() {
assert_eq!(dominant_match_type("hello"), MatchType::Exact);
assert_eq!(dominant_match_type("hello*"), MatchType::Prefix);
assert_eq!(dominant_match_type("*hello"), MatchType::Suffix);
assert_eq!(dominant_match_type("*hello*"), MatchType::Substring);
}
#[test]
fn dominant_match_type_multiple_terms() {
assert_eq!(dominant_match_type("foo bar"), MatchType::Exact);
assert_eq!(dominant_match_type("foo bar*"), MatchType::Prefix);
assert_eq!(dominant_match_type("foo *bar"), MatchType::Suffix);
assert_eq!(dominant_match_type("foo* *bar*"), MatchType::Substring);
assert_eq!(dominant_match_type("foo *bar* baz"), MatchType::Substring);
}
#[test]
fn dominant_match_type_empty_query() {
assert_eq!(dominant_match_type(""), MatchType::Exact);
assert_eq!(dominant_match_type(" "), MatchType::Exact);
}
#[test]
fn wildcard_pattern_to_regex_escapes_special_chars() {
assert_eq!(
FsCassWildcardPattern::Suffix("foo.bar".into()).to_regex(),
Some(".*foo\\.bar$".into())
);
assert_eq!(
FsCassWildcardPattern::Substring("a+b*c?".into()).to_regex(),
Some(".*a\\+b\\*c\\?.*".into())
);
}
#[test]
fn wildcard_pattern_to_regex_escapes_complex_patterns() {
assert_eq!(
FsCassWildcardPattern::Suffix("test[0-9]+".into()).to_regex(),
Some(".*test\\[0-9\\]\\+$".into())
);
assert_eq!(
FsCassWildcardPattern::Substring("(a|b)".into()).to_regex(),
Some(".*\\(a\\|b\\).*".into())
);
assert_eq!(
FsCassWildcardPattern::Substring("end$".into()).to_regex(),
Some(".*end\\$.*".into())
);
assert_eq!(
FsCassWildcardPattern::Substring("^start".into()).to_regex(),
Some(".*\\^start.*".into())
);
}
#[test]
fn is_tool_invocation_noise_detects_noise() {
assert!(!is_tool_invocation_noise("[Tool: Bash]"));
assert!(!is_tool_invocation_noise("[Tool: Read]"));
assert!(is_tool_invocation_noise("[Tool:]"));
assert!(is_tool_invocation_noise("[Tool: ]"));
assert!(!is_tool_invocation_noise("[Tool: Bash - Check status]"));
assert!(!is_tool_invocation_noise(" [Tool: Grep - Search files] "));
assert!(is_tool_invocation_noise("[tool]"));
assert!(is_tool_invocation_noise("tool: Bash"));
}
#[test]
fn is_tool_invocation_noise_allows_useful_content() {
assert!(!is_tool_invocation_noise("[Tool: Read - src/main.rs]"));
assert!(!is_tool_invocation_noise("[Tool: Bash - cargo test --lib]"));
}
#[test]
fn is_tool_invocation_noise_detects_tool_markers() {
assert!(!is_tool_invocation_noise("[Tool: Bash]"));
assert!(!is_tool_invocation_noise("[Tool: Read]"));
assert!(is_tool_invocation_noise("[Tool:]"));
assert!(!is_tool_invocation_noise("[Tool: Bash - Check status]"));
assert!(!is_tool_invocation_noise(" [Tool: Write - description] "));
}
#[test]
fn deduplicate_hits_removes_exact_dupes() {
let hits = vec![
SearchHit {
title: "title1".into(),
snippet: "snip1".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 1.0,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title1".into(),
snippet: "snip2".into(),
content: "hello world".into(), content_hash: stable_content_hash("hello world"),
score: 0.5, source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(), origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 1);
assert_eq!(deduped[0].score, 1.0); assert_eq!(deduped[0].title, "title1");
}
#[test]
fn deduplicate_hits_keeps_higher_score() {
let hits = vec![
SearchHit {
title: "title1".into(),
snippet: "snip1".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 0.3, source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title1".into(),
snippet: "snip2".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 0.9, source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 1);
assert_eq!(deduped[0].score, 0.9); assert_eq!(deduped[0].title, "title1");
}
#[test]
fn deduplicate_hits_keeps_repeated_same_content_at_different_lines() {
let first = SearchHit {
title: "Shared Session".into(),
snippet: String::new(),
content: "repeat me".into(),
content_hash: stable_content_hash("repeat me"),
score: 10.0,
source_path: "/shared/session.jsonl".into(),
agent: "codex".into(),
workspace: "/ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
let mut second = first.clone();
second.line_number = Some(2);
second.created_at = Some(200);
second.score = 9.0;
let deduped = deduplicate_hits(vec![first, second]);
assert_eq!(deduped.len(), 2);
}
#[test]
fn deduplicate_hits_keeps_distinct_conversation_ids_with_same_title_path_and_content() {
let mut first = make_test_hit("same", 1.0);
first.title = "Shared Session".into();
first.source_path = "/shared/session.jsonl".into();
first.content = "identical body".into();
first.content_hash = stable_content_hash("identical body");
first.conversation_id = Some(1);
let mut second = first.clone();
second.conversation_id = Some(2);
second.score = 0.9;
let deduped = deduplicate_hits(vec![first, second]);
assert_eq!(deduped.len(), 2);
assert!(deduped.iter().any(|hit| hit.conversation_id == Some(1)));
assert!(deduped.iter().any(|hit| hit.conversation_id == Some(2)));
}
#[test]
fn deduplicate_hits_coalesces_same_conversation_id_despite_title_drift() {
let mut first = make_test_hit("same", 1.0);
first.title = "Morning Session".into();
first.source_path = "/shared/session.jsonl".into();
first.content = "identical body".into();
first.content_hash = stable_content_hash("identical body");
first.conversation_id = Some(7);
let mut second = first.clone();
second.title = "Evening Session".into();
second.score = 0.9;
let deduped = deduplicate_hits(vec![first, second]);
assert_eq!(deduped.len(), 1);
assert_eq!(deduped[0].conversation_id, Some(7));
}
#[test]
fn deduplicate_hits_keeps_distinct_titles_with_same_source_path_and_content() {
let hits = vec![
SearchHit {
title: "Morning Session".into(),
snippet: "snip1".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 0.9,
source_path: "shared.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: None,
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "Evening Session".into(),
snippet: "snip2".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 0.8,
source_path: "shared.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: None,
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 2);
assert!(deduped.iter().any(|hit| hit.title == "Morning Session"));
assert!(deduped.iter().any(|hit| hit.title == "Evening Session"));
}
#[test]
fn deduplicate_hits_normalizes_whitespace() {
let hits = vec![
SearchHit {
title: "title1".into(),
snippet: "snip1".into(),
content: "hello world".into(), content_hash: stable_content_hash("hello world"),
score: 1.0,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title1".into(),
snippet: "snip2".into(),
content: "hello world".into(), content_hash: stable_content_hash("hello world"),
score: 0.5,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 1); }
#[test]
fn deduplicate_hits_normalizes_blank_local_source_id() {
let hits = vec![
SearchHit {
title: "title1".into(),
snippet: "snip1".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 1.0,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title1".into(),
snippet: "snip2".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 0.5,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: " ".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 1);
assert_eq!(deduped[0].source_id, "local");
}
#[test]
fn deduplicate_hits_filters_tool_noise() {
let hits = vec![
SearchHit {
title: "title1".into(),
snippet: "snip1".into(),
content: "[Tool:]".into(), content_hash: stable_content_hash("[Tool:]"),
score: 1.0,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title2".into(),
snippet: "snip2".into(),
content: "This is real content about testing".into(),
content_hash: stable_content_hash("This is real content about testing"),
score: 0.5,
source_path: "b.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(200),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 1);
assert!(deduped[0].content.contains("real content"));
}
#[test]
fn deduplicate_hits_filters_acknowledgement_noise() {
let hits = vec![
SearchHit {
title: "ack".into(),
snippet: "ack".into(),
content: "Acknowledged.".into(),
content_hash: stable_content_hash("Acknowledged."),
score: 1.0,
source_path: "ack.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "real".into(),
snippet: "real".into(),
content: "Authentication refresh logic changed".into(),
content_hash: stable_content_hash("Authentication refresh logic changed"),
score: 0.5,
source_path: "real.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(200),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits_with_query(hits, "authentication");
assert_eq!(deduped.len(), 1);
assert_eq!(deduped[0].title, "real");
}
#[test]
fn deduplicate_hits_hides_system_prompts_unless_query_requests_them() {
let prompt_hit = SearchHit {
title: "prompt".into(),
snippet: "prompt".into(),
content:
"# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly."
.into(),
content_hash: stable_content_hash(
"# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly.",
),
score: 1.0,
source_path: "prompt.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
};
assert!(
deduplicate_hits_with_query(vec![prompt_hit.clone()], "coding assistant").is_empty()
);
let kept = deduplicate_hits_with_query(vec![prompt_hit], "AGENTS.md instructions");
assert_eq!(kept.len(), 1);
assert_eq!(kept[0].title, "prompt");
}
#[test]
fn deduplicate_hits_preserves_unique_content() {
let hits = vec![
SearchHit {
title: "title1".into(),
snippet: "snip1".into(),
content: "first message".into(),
content_hash: stable_content_hash("first message"),
score: 1.0,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title2".into(),
snippet: "snip2".into(),
content: "second message".into(),
content_hash: stable_content_hash("second message"),
score: 0.8,
source_path: "b.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(200),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "title3".into(),
snippet: "snip3".into(),
content: "third message".into(),
content_hash: stable_content_hash("third message"),
score: 0.6,
source_path: "c.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(300),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(deduped.len(), 3); }
#[test]
fn deduplicate_hits_respects_source_boundaries() {
let hits = vec![
SearchHit {
title: "local title".into(),
snippet: "snip".into(),
content: "hello world".into(),
content_hash: stable_content_hash("hello world"),
score: 1.0,
source_path: "a.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(100),
line_number: None,
match_type: MatchType::Exact,
source_id: "local".into(),
origin_kind: "local".into(),
origin_host: None,
conversation_id: None,
},
SearchHit {
title: "remote title".into(),
snippet: "snip".into(),
content: "hello world".into(), content_hash: stable_content_hash("hello world"),
score: 0.9,
source_path: "b.jsonl".into(),
agent: "agent".into(),
workspace: "ws".into(),
workspace_original: None,
created_at: Some(200),
line_number: None,
match_type: MatchType::Exact,
source_id: "work-laptop".into(), origin_kind: "ssh".into(),
origin_host: Some("work-laptop.local".into()),
conversation_id: None,
},
];
let deduped = deduplicate_hits(hits);
assert_eq!(
deduped.len(),
2,
"same content from different sources should not dedupe"
);
assert!(deduped.iter().any(|h| h.source_id == "local"));
assert!(deduped.iter().any(|h| h.source_id == "work-laptop"));
}
#[test]
fn wildcard_fallback_sparse_check_uses_effective_limit() {
assert!(
!should_try_wildcard_fallback(1, 1, 0, 3),
"a filled one-result page is not sparse for fallback purposes"
);
assert!(
!should_try_wildcard_fallback(2, 2, 0, 3),
"a filled two-result page is not sparse for fallback purposes"
);
assert!(
should_try_wildcard_fallback(0, 1, 0, 3),
"zero hits should still trigger fallback even for tiny pages"
);
assert!(
should_try_wildcard_fallback(1, 2, 0, 3),
"a partially filled page should still trigger fallback"
);
assert!(
!should_try_wildcard_fallback(0, 5, 10, 3),
"pagination should not trigger wildcard fallback"
);
assert!(
should_try_wildcard_fallback(1, 0, 0, 3),
"limit zero preserves the legacy sparse-threshold semantics"
);
}
#[test]
fn snippet_preview_fast_path_requires_snippet_only_match() {
let snippet_only = FieldMask::new(false, true, false, false);
let snippet = snippet_from_preview_without_full_content(
snippet_only,
"migration checks the database constraint before writing",
"database",
)
.expect("preview should satisfy a snippet-only request when it contains the query");
assert!(snippet.contains("**database**"));
assert!(
snippet_from_preview_without_full_content(
FieldMask::FULL,
"migration checks the database constraint before writing",
"database",
)
.is_none(),
"full-content requests must keep the sqlite hydration path"
);
assert!(
snippet_from_preview_without_full_content(
snippet_only,
"migration checks constraints before writing",
"database",
)
.is_none(),
"snippet-only requests hydrate when the preview cannot show the match"
);
}
#[test]
fn search_with_fallback_returns_exact_when_sufficient() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
for i in 0..5 {
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some(format!("doc-{i}")),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join(format!("{i}.jsonl")),
started_at: Some(100 + i),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100 + i),
content: format!("apple fruit number {i} is delicious and healthy"),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"apple",
SearchFilters::default(),
10,
0,
3, FieldMask::FULL,
)?;
assert!(!result.wildcard_fallback);
assert!(result.hits.len() >= 3);
Ok(())
}
#[test]
fn search_with_fallback_triggers_on_sparse_results() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("substring test".into()),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join("test.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "configuration management system".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"config",
SearchFilters::default(),
10,
0,
5, FieldMask::FULL,
)?;
assert!(!result.hits.is_empty());
Ok(())
}
#[test]
fn search_with_fallback_skips_when_query_has_wildcards() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("test".into()),
workspace: None,
source_path: dir.path().join("test.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "testing data".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"*test*",
SearchFilters::default(),
10,
0,
10, FieldMask::FULL,
)?;
assert!(!result.wildcard_fallback); Ok(())
}
#[test]
fn search_with_fallback_prefers_wildcards_when_they_add_hits() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
for (i, body) in [
"alphabet soup for coders",
"mapping the alphabet city blocks",
]
.iter()
.enumerate()
{
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some(format!("alpha-{i}")),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join(format!("alpha-{i}.jsonl")),
started_at: Some(100 + i as i64),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100 + i as i64),
content: body.to_string(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"bet",
SearchFilters::default(),
10,
0,
2,
FieldMask::FULL,
)?;
assert!(
result.wildcard_fallback,
"should switch to wildcard fallback when it yields more hits"
);
assert_eq!(
result.hits.len(),
2,
"fallback should surface all alphabet docs"
);
assert!(
result
.hits
.iter()
.all(|h| h.match_type == MatchType::ImplicitWildcard)
);
assert!(result.hits.iter().all(|h| h.content.contains("alphabet")));
Ok(())
}
#[test]
fn automatic_wildcard_fallback_skips_long_zero_hit_token() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("fruit".into()),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join("fruit.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "apple pear banana".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"zzzzzzunlikelyterm",
SearchFilters::default(),
10,
0,
1,
FieldMask::FULL,
)?;
assert!(result.hits.is_empty());
assert!(!result.wildcard_fallback);
assert!(
result
.suggestions
.iter()
.any(|s| matches!(s.kind, SuggestionKind::WildcardQuery)),
"manual wildcard suggestion should remain available"
);
let short_result = client.search_with_fallback(
"pple",
SearchFilters::default(),
10,
0,
1,
FieldMask::FULL,
)?;
assert!(short_result.wildcard_fallback);
assert_eq!(short_result.hits.len(), 1);
assert_eq!(short_result.hits[0].match_type, MatchType::ImplicitWildcard);
Ok(())
}
#[test]
fn nohit_suggestions_do_not_lazy_open_sqlite_when_tantivy_is_present() -> Result<()> {
let dir = TempDir::new()?;
let index_path = dir.path().join("index");
let db_path = dir.path().join("cass.db");
let storage = FrankenStorage::open(&db_path)?;
storage.close()?;
let mut index = TantivyIndex::open_or_create(&index_path)?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("fruit".into()),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join("fruit.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "apple pear banana".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(&index_path, Some(&db_path))?.expect("index present");
assert!(
client
.sqlite
.lock()
.map(|guard| guard.is_none())
.unwrap_or(false),
"sqlite should start closed"
);
let result = client.search_with_fallback(
"zzzzzzunlikelyterm",
SearchFilters::default(),
10,
0,
1,
FieldMask::FULL,
)?;
assert!(result.hits.is_empty());
assert!(
result
.suggestions
.iter()
.any(|s| matches!(s.kind, SuggestionKind::WildcardQuery)),
"manual wildcard suggestion should remain available"
);
assert!(
result
.suggestions
.iter()
.all(|s| !matches!(s.kind, SuggestionKind::AlternateAgent)),
"alternate-agent suggestions should not force a SQLite open"
);
assert!(
client
.sqlite
.lock()
.map(|guard| guard.is_none())
.unwrap_or(false),
"sqlite should stay closed after Tantivy no-hit suggestions"
);
Ok(())
}
#[test]
fn search_with_fallback_emits_wildcard_suggestion_on_zero_hits() -> Result<()> {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: "vtest|schema:none".into(),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let result = client.search_with_fallback(
"ghost",
SearchFilters::default(),
5,
0,
3,
FieldMask::FULL,
)?;
assert!(
result.hits.is_empty(),
"no index/db means no hits should be returned"
);
assert!(
!result.wildcard_fallback,
"with zero baseline and fallback hits, we should keep baseline and mark fallback=false"
);
let wildcard = result
.suggestions
.iter()
.find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
.expect("should suggest adding wildcards");
assert_eq!(wildcard.suggested_query.as_deref(), Some("*ghost*"));
Ok(())
}
#[test]
fn search_with_fallback_skips_empty_query() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("test".into()),
workspace: None,
source_path: dir.path().join("test.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "testing data".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
" ",
SearchFilters::default(),
10,
0,
10,
FieldMask::FULL,
)?;
assert!(!result.wildcard_fallback);
Ok(())
}
#[test]
fn search_with_fallback_skips_for_nonzero_offset() -> Result<()> {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: "vtest|schema:none".into(),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let result = client.search_with_fallback(
"ghost",
SearchFilters::default(),
5,
10,
3,
FieldMask::FULL,
)?;
assert!(
!result.wildcard_fallback,
"fallback should not run on paginated searches"
);
let wildcard = result
.suggestions
.iter()
.find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
.expect("wildcard suggestion present");
assert_eq!(wildcard.suggested_query.as_deref(), Some("*ghost*"));
Ok(())
}
#[test]
fn generate_suggestions_limits_and_sets_shortcuts() -> Result<()> {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: "vtest|schema:none".into(),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let mut filters = SearchFilters::default();
filters.agents.insert("codex".into());
let result = client.search_with_fallback("claud", filters, 5, 0, 3, FieldMask::FULL)?;
assert_eq!(
result.suggestions.len(),
3,
"should truncate to 3 suggestions"
);
for (idx, sugg) in result.suggestions.iter().enumerate() {
assert_eq!(
sugg.shortcut,
Some((idx + 1) as u8),
"shortcut should match position (1-based)"
);
}
assert!(
result
.suggestions
.iter()
.any(|s| matches!(s.kind, SuggestionKind::WildcardQuery)),
"should suggest wildcard search"
);
assert!(
result
.suggestions
.iter()
.any(|s| matches!(s.kind, SuggestionKind::RemoveFilter)),
"should suggest removing agent filter"
);
assert!(
result
.suggestions
.iter()
.any(|s| matches!(s.kind, SuggestionKind::SpellingFix)),
"should suggest spelling fix for nearby agent name"
);
Ok(())
}
#[test]
fn generate_suggestions_includes_recent_alternate_agents() -> Result<()> {
let dir = TempDir::new()?;
let db_path = dir.path().join("cass.db");
let storage = FrankenStorage::open(&db_path)?;
let workspace_id = storage.ensure_workspace(dir.path(), None)?;
let base_ts = 1_700_000_010_000_i64;
for (idx, slug) in ["claude_code", "codex"].iter().enumerate() {
let agent = Agent {
id: None,
slug: (*slug).to_string(),
name: (*slug).to_string(),
version: None,
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent)?;
let conversation = Conversation {
id: None,
agent_slug: (*slug).to_string(),
workspace: Some(dir.path().to_path_buf()),
external_id: Some(format!("alt-agent-{idx}")),
title: Some(format!("alternate agent {idx}")),
source_path: dir.path().join(format!("{slug}.jsonl")),
started_at: Some(base_ts + idx as i64),
ended_at: Some(base_ts + idx as i64),
approx_tokens: Some(8),
metadata_json: json!({}),
messages: vec![Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(base_ts + idx as i64),
content: format!("content from {slug}"),
extra_json: json!({}),
snippets: Vec::new(),
}],
source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
origin_host: None,
};
storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
}
drop(storage);
let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
let result = client.search_with_fallback(
"ghost",
SearchFilters::default(),
5,
0,
3,
FieldMask::FULL,
)?;
let alternate_agents: HashSet<String> = result
.suggestions
.iter()
.filter(|suggestion| matches!(suggestion.kind, SuggestionKind::AlternateAgent))
.filter_map(|suggestion| suggestion.suggested_filters.as_ref())
.flat_map(|filters| filters.agents.iter().cloned())
.collect();
assert!(
alternate_agents.contains("claude_code"),
"should suggest claude_code from normalized conversations schema"
);
assert!(
alternate_agents.contains("codex"),
"should suggest codex from normalized conversations schema"
);
Ok(())
}
#[test]
fn sanitize_query_preserves_wildcards() {
assert_eq!(fs_cass_sanitize_query("*foo*"), "*foo*");
assert_eq!(fs_cass_sanitize_query("foo*"), "foo*");
assert_eq!(fs_cass_sanitize_query("*bar"), "*bar");
assert_eq!(fs_cass_sanitize_query("*config*"), "*config*");
}
#[test]
fn sanitize_query_strips_other_special_chars() {
assert_eq!(fs_cass_sanitize_query("foo.bar"), "foo bar");
assert_eq!(fs_cass_sanitize_query("c++"), "c ");
assert_eq!(fs_cass_sanitize_query("foo-bar"), "foo-bar");
assert_eq!(fs_cass_sanitize_query("test_case"), "test case");
}
#[test]
fn sanitize_query_combined() {
assert_eq!(fs_cass_sanitize_query("*foo.bar*"), "*foo bar*");
assert_eq!(fs_cass_sanitize_query("test-*"), "test-*");
assert_eq!(fs_cass_sanitize_query("*c++*"), "*c *");
}
#[test]
fn parse_boolean_query_simple_terms() {
let tokens = fs_cass_parse_boolean_query("foo bar baz");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
assert_eq!(tokens[1], FsCassQueryToken::Term("bar".to_string()));
assert_eq!(tokens[2], FsCassQueryToken::Term("baz".to_string()));
}
#[test]
fn parse_boolean_query_and_operator() {
let tokens = fs_cass_parse_boolean_query("foo AND bar");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
assert_eq!(tokens[1], FsCassQueryToken::And);
assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
let tokens2 = fs_cass_parse_boolean_query("foo && bar");
assert_eq!(tokens2.len(), 3);
assert_eq!(tokens2[1], FsCassQueryToken::And);
}
#[test]
fn parse_boolean_query_or_operator() {
let tokens = fs_cass_parse_boolean_query("foo OR bar");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
assert_eq!(tokens[1], FsCassQueryToken::Or);
assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
let tokens2 = fs_cass_parse_boolean_query("foo || bar");
assert_eq!(tokens2.len(), 3);
assert_eq!(tokens2[1], FsCassQueryToken::Or);
}
#[test]
fn parse_boolean_query_not_operator() {
let tokens = fs_cass_parse_boolean_query("foo NOT bar");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
assert_eq!(tokens[1], FsCassQueryToken::Not);
assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
}
#[test]
fn parse_boolean_query_quoted_phrase() {
let tokens = fs_cass_parse_boolean_query(r#"foo "exact phrase" bar"#);
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
assert_eq!(
tokens[1],
FsCassQueryToken::Phrase("exact phrase".to_string())
);
assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
}
#[test]
fn parse_boolean_query_complex() {
let tokens = fs_cass_parse_boolean_query(r#"error OR warning NOT "false positive""#);
assert_eq!(tokens.len(), 5);
assert_eq!(tokens[0], FsCassQueryToken::Term("error".to_string()));
assert_eq!(tokens[1], FsCassQueryToken::Or);
assert_eq!(tokens[2], FsCassQueryToken::Term("warning".to_string()));
assert_eq!(tokens[3], FsCassQueryToken::Not);
assert_eq!(
tokens[4],
FsCassQueryToken::Phrase("false positive".to_string())
);
}
#[test]
fn has_boolean_operators_detection() {
assert!(!fs_cass_has_boolean_operators("foo bar"));
assert!(fs_cass_has_boolean_operators("foo AND bar"));
assert!(fs_cass_has_boolean_operators("foo OR bar"));
assert!(fs_cass_has_boolean_operators("foo NOT bar"));
assert!(fs_cass_has_boolean_operators(r#""exact phrase""#));
assert!(fs_cass_has_boolean_operators("foo && bar"));
assert!(fs_cass_has_boolean_operators("foo || bar"));
}
#[test]
fn parse_boolean_query_case_insensitive_operators() {
let tokens = fs_cass_parse_boolean_query("foo and bar or baz not qux");
assert_eq!(tokens.len(), 7);
assert_eq!(tokens[1], FsCassQueryToken::And);
assert_eq!(tokens[3], FsCassQueryToken::Or);
assert_eq!(tokens[5], FsCassQueryToken::Not);
}
#[test]
fn parse_boolean_query_with_wildcards() {
let tokens = fs_cass_parse_boolean_query("*config* OR env*");
assert_eq!(tokens.len(), 3);
assert_eq!(tokens[0], FsCassQueryToken::Term("*config*".to_string()));
assert_eq!(tokens[1], FsCassQueryToken::Or);
assert_eq!(tokens[2], FsCassQueryToken::Term("env*".to_string()));
}
#[test]
fn tantivy_search_hydrates_long_content_when_content_field_is_not_stored() -> Result<()> {
let dir = TempDir::new()?;
let db_path = dir.path().join("cass.db");
let storage = FrankenStorage::open(&db_path)?;
let workspace_id = storage.ensure_workspace(dir.path(), None)?;
let agent = Agent {
id: None,
slug: "codex".into(),
name: "Codex".into(),
version: None,
kind: AgentKind::Cli,
};
let agent_id = storage.ensure_agent(&agent)?;
let long_content = format!(
"{}needle appears past the preview boundary for hydration proof",
"padding ".repeat(70)
);
let short_content = "shortneedle fits entirely inside the stored preview".to_string();
let conversation = Conversation {
id: None,
agent_slug: "codex".into(),
workspace: Some(dir.path().to_path_buf()),
external_id: Some("hydrate-long-content".into()),
title: Some("hydrated lexical doc".into()),
source_path: dir.path().join("hydrate.jsonl"),
started_at: Some(1_700_000_123_000),
ended_at: Some(1_700_000_123_000),
approx_tokens: Some(32),
metadata_json: json!({}),
messages: vec![
Message {
id: None,
idx: 0,
role: MessageRole::User,
author: Some("user".into()),
created_at: Some(1_700_000_123_000),
content: long_content.clone(),
extra_json: json!({}),
snippets: Vec::new(),
},
Message {
id: None,
idx: 1,
role: MessageRole::Agent,
author: Some("assistant".into()),
created_at: Some(1_700_000_124_000),
content: short_content.clone(),
extra_json: json!({}),
snippets: Vec::new(),
},
],
source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
origin_host: None,
};
storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
storage.close()?;
let index_path = dir.path().join("search-index");
let mut index = TantivyIndex::open_or_create(&index_path)?;
let normalized = NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some("hydrate-long-content".into()),
title: Some("hydrated lexical doc".into()),
workspace: Some(dir.path().to_path_buf()),
source_path: dir.path().join("hydrate.jsonl"),
started_at: Some(1_700_000_123_000),
ended_at: Some(1_700_000_123_000),
metadata: json!({}),
messages: vec![
NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("user".into()),
created_at: Some(1_700_000_123_000),
content: long_content.clone(),
extra: json!({}),
snippets: vec![],
invocations: Vec::new(),
},
NormalizedMessage {
idx: 1,
role: "assistant".into(),
author: Some("assistant".into()),
created_at: Some(1_700_000_124_000),
content: short_content.clone(),
extra: json!({}),
snippets: vec![],
invocations: Vec::new(),
},
],
};
index.add_conversation(&normalized)?;
index.commit()?;
let client = SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
let hits = client.search("needle", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1, "expected one lexical hit");
assert_eq!(hits[0].title, "hydrated lexical doc");
assert!(
hits[0]
.content
.contains("needle appears past the preview boundary"),
"lexical hit should hydrate full content from sqlite when Tantivy content is not stored"
);
assert!(
hits[0].snippet.to_lowercase().contains("needle"),
"snippet should still be rendered from hydrated content"
);
let bounded_hits = client.search(
"needle",
SearchFilters::default(),
5,
0,
FieldMask::FULL.with_preview_content_limit(Some(200)),
)?;
assert_eq!(bounded_hits.len(), 1, "expected one lexical hit");
assert!(
bounded_hits[0].content.starts_with("padding padding"),
"bounded content may be served from the stored preview prefix"
);
assert!(
!bounded_hits[0]
.content
.contains("needle appears past the preview boundary"),
"bounded preview content should not hydrate the full sqlite row"
);
let short_client =
SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
assert!(
short_client
.sqlite
.lock()
.map(|guard| guard.is_none())
.unwrap_or(false),
"sqlite should start closed for short preview hit"
);
let short_hits = short_client.search(
"shortneedle",
SearchFilters::default(),
5,
0,
FieldMask::FULL,
)?;
assert_eq!(short_hits.len(), 1, "expected one short lexical hit");
assert_eq!(
short_hits[0].content, short_content,
"untruncated stored preview is exact full content"
);
assert!(
short_client
.sqlite
.lock()
.map(|guard| guard.is_none())
.unwrap_or(false),
"short full-content hit should not lazy-open sqlite"
);
Ok(())
}
#[test]
fn filter_fidelity_agent_filter_respected() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv_a = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("alpha doc".into()),
workspace: None,
source_path: dir.path().join("a.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "hello world findme alpha".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv_b = NormalizedConversation {
agent_slug: "claude".into(),
external_id: None,
title: Some("beta doc".into()),
workspace: None,
source_path: dir.path().join("b.jsonl"),
started_at: Some(200),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(200),
content: "hello world findme beta".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv_a)?;
index.add_conversation(&conv_b)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.agents.insert("codex".into());
let hits = client.search("findme", filters.clone(), 10, 0, FieldMask::FULL)?;
for hit in &hits {
assert_eq!(
hit.agent, "codex",
"Agent filter violated: got agent '{}' instead of 'codex'",
hit.agent
);
}
assert!(!hits.is_empty(), "Should have found results");
let cached_hits = client.search("findme", filters, 10, 0, FieldMask::FULL)?;
for hit in &cached_hits {
assert_eq!(hit.agent, "codex", "Cached search violated agent filter");
}
Ok(())
}
#[test]
fn filter_fidelity_workspace_filter_respected() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv_a = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("ws_a doc".into()),
workspace: Some(std::path::PathBuf::from("/workspace/alpha")),
source_path: dir.path().join("a.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "workspace test needle".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv_b = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("ws_b doc".into()),
workspace: Some(std::path::PathBuf::from("/workspace/beta")),
source_path: dir.path().join("b.jsonl"),
started_at: Some(200),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(200),
content: "workspace test needle".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv_a)?;
index.add_conversation(&conv_b)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.workspaces.insert("/workspace/beta".into());
let hits = client.search("needle", filters.clone(), 10, 0, FieldMask::FULL)?;
for hit in &hits {
assert_eq!(
hit.workspace, "/workspace/beta",
"Workspace filter violated: got '{}' instead of '/workspace/beta'",
hit.workspace
);
}
assert!(!hits.is_empty(), "Should have found results");
let cached_hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
for hit in &cached_hits {
assert_eq!(
hit.workspace, "/workspace/beta",
"Cached search violated workspace filter"
);
}
Ok(())
}
#[test]
fn filter_fidelity_date_range_respected() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv_early = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("early".into()),
workspace: None,
source_path: dir.path().join("early.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "date range test".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv_middle = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("middle".into()),
workspace: None,
source_path: dir.path().join("middle.jsonl"),
started_at: Some(500),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(500),
content: "date range test".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv_late = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("late".into()),
workspace: None,
source_path: dir.path().join("late.jsonl"),
started_at: Some(900),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(900),
content: "date range test".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv_early)?;
index.add_conversation(&conv_middle)?;
index.add_conversation(&conv_late)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let filters = SearchFilters {
created_from: Some(400),
created_to: Some(600),
..Default::default()
};
let hits = client.search("range", filters.clone(), 10, 0, FieldMask::FULL)?;
for hit in &hits {
if let Some(ts) = hit.created_at {
assert!(
(400..=600).contains(&ts),
"Date range filter violated: got ts={ts} outside [400, 600]"
);
}
}
assert_eq!(hits.len(), 1, "Should find exactly 1 doc in range");
let cached_hits = client.search("range", filters, 10, 0, FieldMask::FULL)?;
for hit in &cached_hits {
if let Some(ts) = hit.created_at {
assert!(
(400..=600).contains(&ts),
"Cached search violated date range filter"
);
}
}
Ok(())
}
#[test]
fn filter_fidelity_combined_filters_respected() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let combinations = [
("codex", "/ws/prod", 100), ("claude", "/ws/prod", 500), ("claude", "/ws/dev", 500), ("claude", "/ws/prod", 900), ];
for (i, (agent, ws, ts)) in combinations.iter().enumerate() {
let conv = NormalizedConversation {
agent_slug: (*agent).into(),
external_id: None,
title: Some(format!("combo-{i}")),
workspace: Some(std::path::PathBuf::from(*ws)),
source_path: dir.path().join(format!("{i}.jsonl")),
started_at: Some(*ts),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(*ts),
content: "hello world combotest query".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.agents.insert("claude".into());
filters.workspaces.insert("/ws/prod".into());
filters.created_from = Some(400);
filters.created_to = Some(600);
let hits = client.search("combotest", filters.clone(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1, "Combined filter should match exactly 1 doc");
for hit in &hits {
assert_eq!(hit.agent, "claude", "Agent filter violated");
assert_eq!(hit.workspace, "/ws/prod", "Workspace filter violated");
if let Some(ts) = hit.created_at {
assert!((400..=600).contains(&ts), "Date filter violated: ts={ts}");
}
}
let cached = client.search("combotest", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(cached.len(), 1, "Cached result count mismatch");
Ok(())
}
#[test]
fn lexical_hits_normalize_trimmed_local_source_metadata() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("trimmed local doc".into()),
workspace: None,
source_path: dir.path().join("trimmed-local.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({
"cass": {
"origin": {
"source_id": " LOCAL ",
"kind": "local"
}
}
}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "trimmed local lexical".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("trimmed", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].source_id, "local");
assert_eq!(hits[0].origin_kind, "local");
Ok(())
}
#[test]
fn lexical_hits_normalize_remote_origin_kind_without_source_id() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("remote lexical doc".into()),
workspace: None,
source_path: dir.path().join("remote-lexical.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({
"cass": {
"origin": {
"source_id": " ",
"kind": "ssh",
"host": "dev@laptop"
}
}
}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "remote lexical".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("remote", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].source_id, "dev@laptop");
assert_eq!(hits[0].origin_kind, "remote");
assert_eq!(hits[0].origin_host.as_deref(), Some("dev@laptop"));
Ok(())
}
#[test]
fn lexical_hits_infer_remote_origin_from_host_without_kind() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("legacy host-only lexical doc".into()),
workspace: None,
source_path: dir.path().join("legacy-host-only-lexical.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({
"cass": {
"origin": {
"source_id": " ",
"host": "dev@laptop"
}
}
}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "legacy remote lexical".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("legacy", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].source_id, "dev@laptop");
assert_eq!(hits[0].origin_kind, "remote");
assert_eq!(hits[0].origin_host.as_deref(), Some("dev@laptop"));
Ok(())
}
#[test]
fn filter_fidelity_source_filter_respected() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv_local = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("local doc".into()),
workspace: None,
source_path: dir.path().join("local.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "source filter test local".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv_local)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let filters = SearchFilters {
source_filter: SourceFilter::Local,
..Default::default()
};
let hits = client.search("source", filters.clone(), 10, 0, FieldMask::FULL)?;
for hit in &hits {
assert_eq!(
hit.source_id, "local",
"Source filter violated: got source_id '{}' instead of 'local'",
hit.source_id
);
}
assert!(!hits.is_empty(), "Should have found local results");
let filters_id = SearchFilters {
source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
..Default::default()
};
let hits_id = client.search("source", filters_id, 10, 0, FieldMask::FULL)?;
for hit in &hits_id {
assert_eq!(
hit.source_id, "local",
"SourceId filter violated: got '{}' instead of 'local'",
hit.source_id
);
}
assert!(
!hits_id.is_empty(),
"Should have found results for source_id=local"
);
Ok(())
}
#[test]
fn filter_fidelity_cache_key_isolation() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let filters_empty = SearchFilters::default();
let mut filters_agent = SearchFilters::default();
filters_agent.agents.insert("codex".into());
let mut filters_ws = SearchFilters::default();
filters_ws.workspaces.insert("/ws".into());
let key_empty = client.cache_key("test", &filters_empty);
let key_agent = client.cache_key("test", &filters_agent);
let key_ws = client.cache_key("test", &filters_ws);
assert_ne!(
key_empty, key_agent,
"Empty vs agent filter keys should differ"
);
assert_ne!(
key_empty, key_ws,
"Empty vs workspace filter keys should differ"
);
assert_ne!(
key_agent, key_ws,
"Agent vs workspace filter keys should differ"
);
let mut filters_agent2 = SearchFilters::default();
filters_agent2.agents.insert("codex".into());
let key_agent2 = client.cache_key("test", &filters_agent2);
assert_eq!(key_agent, key_agent2, "Same filter should produce same key");
}
#[test]
fn sanitize_query_preserves_unicode_alphanumeric() {
assert_eq!(fs_cass_sanitize_query("こんにちは"), "こんにちは");
assert_eq!(fs_cass_sanitize_query("café"), "café");
assert_eq!(fs_cass_sanitize_query("日本語123"), "日本語123");
}
#[test]
fn sanitize_query_handles_multiple_consecutive_special_chars() {
assert_eq!(fs_cass_sanitize_query("foo---bar"), "foo---bar");
assert_eq!(fs_cass_sanitize_query("a!@#$%^&()b"), "a b");
}
#[test]
fn wildcard_pattern_empty_after_trim_returns_exact_empty() {
assert_eq!(
FsCassWildcardPattern::parse("*"),
FsCassWildcardPattern::Exact(String::new())
);
assert_eq!(
FsCassWildcardPattern::parse("**"),
FsCassWildcardPattern::Exact(String::new())
);
assert_eq!(
FsCassWildcardPattern::parse("***"),
FsCassWildcardPattern::Exact(String::new())
);
}
#[test]
fn wildcard_pattern_to_regex_generation() {
assert_eq!(FsCassWildcardPattern::Exact("foo".into()).to_regex(), None);
assert_eq!(FsCassWildcardPattern::Prefix("foo".into()).to_regex(), None);
assert_eq!(
FsCassWildcardPattern::Suffix("foo".into()).to_regex(),
Some(".*foo$".into())
);
assert_eq!(
FsCassWildcardPattern::Substring("foo".into()).to_regex(),
Some(".*foo.*".into())
);
}
#[test]
fn parse_boolean_query_prefix_minus_not() {
let tokens = fs_cass_parse_boolean_query("-world");
let expected = vec![
FsCassQueryToken::Not,
FsCassQueryToken::Term("world".into()),
];
assert_eq!(tokens, expected);
let tokens = fs_cass_parse_boolean_query("hello -world");
let expected = vec![
FsCassQueryToken::Term("hello".into()),
FsCassQueryToken::Not,
FsCassQueryToken::Term("world".into()),
];
assert_eq!(tokens, expected);
}
#[test]
fn parse_boolean_query_empty_quoted_phrase_ignored() {
let tokens = parse_boolean_query("\"\"");
assert!(tokens.is_empty());
let tokens = parse_boolean_query("foo \"\" bar");
let expected: QueryTokenList = vec![
QueryToken::Term("foo".into()),
QueryToken::Term("bar".into()),
];
assert_eq!(tokens, expected);
}
#[test]
fn parse_boolean_query_unclosed_quote() {
let tokens = parse_boolean_query("\"hello world");
let expected: QueryTokenList = vec![QueryToken::Phrase("hello world".into())];
assert_eq!(tokens, expected);
}
#[test]
fn transpile_to_fts5_rejects_leading_unary_not_queries() {
assert_eq!(transpile_to_fts5("NOT foo"), None);
assert_eq!(transpile_to_fts5("-foo"), None);
}
#[test]
fn transpile_to_fts5_rejects_or_not_forms_it_cannot_represent() {
assert_eq!(transpile_to_fts5("foo OR NOT bar"), None);
assert_eq!(transpile_to_fts5("foo NOT bar OR baz"), None);
}
#[test]
fn transpile_to_fts5_ignores_leading_or() {
assert_eq!(transpile_to_fts5("OR test"), Some("test".to_string()));
assert_eq!(
transpile_to_fts5("OR foo-bar"),
Some("(foo AND bar)".to_string())
);
}
#[test]
fn transpile_to_fts5_splits_hyphenated_subterms_for_sqlite_fts() {
assert_eq!(
transpile_to_fts5("br-123.jsonl"),
Some("(br AND 123 AND jsonl)".to_string())
);
assert_eq!(
transpile_to_fts5("br-123.json*"),
Some("(br AND 123 AND json*)".to_string())
);
}
#[test]
fn transpile_to_fts5_preserves_supported_binary_not() {
assert_eq!(
transpile_to_fts5("foo NOT bar").as_deref(),
Some("foo NOT bar")
);
assert_eq!(
transpile_to_fts5("foo NOT bar-baz"),
Some("foo NOT (bar AND baz)".to_string())
);
}
#[test]
fn search_sqlite_fts5_returns_empty_when_sqlite_is_unavailable() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: false,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: "fts5-disabled".to_string(),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let hits = client.search_sqlite_fts5(
Path::new("/nonexistent"),
"test query",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
);
assert!(hits.is_ok(), "disabled FTS5 path should stay non-fatal");
assert!(
hits.unwrap().is_empty(),
"unavailable SQLite fallback should keep returning an empty result set"
);
}
#[test]
fn search_sqlite_fts5_rank_and_hydrate_split_preserves_limit_prefix_invariant() -> Result<()> {
let conn = Connection::open(":memory:")?;
conn.execute_batch(
"CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER,
idx INTEGER,
content TEXT,
created_at INTEGER
);
CREATE VIRTUAL TABLE fts_messages USING fts5(
content,
title,
agent,
workspace,
source_path,
created_at UNINDEXED,
message_id UNINDEXED,
tokenize='porter'
);",
)?;
conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/tmp/k0e5p')")?;
for (i, repeats) in (1..=6_i64).enumerate() {
let conv_id = i as i64 + 1;
let msg_id = (i as i64 + 1) * 10;
conn.execute_compat(
"INSERT INTO conversations(id, agent_id, workspace_id, source_id, \
origin_host, title, source_path) \
VALUES(?1, 1, 1, 'local', NULL, ?2, ?3)",
params![
conv_id,
format!("k0e5p-{}", i),
format!("/tmp/k0e5p/{}.jsonl", i),
],
)?;
let content = "rankprobe ".repeat(repeats as usize);
conn.execute_compat(
"INSERT INTO messages(id, conversation_id, idx, content, created_at) \
VALUES(?1, ?2, ?3, ?4, ?5)",
params![
msg_id,
conv_id,
i as i64,
content.as_str(),
1_700_000_000_i64 + i as i64
],
)?;
conn.execute_compat(
"INSERT INTO fts_messages(rowid, content, title, agent, workspace, \
source_path, created_at, message_id) \
VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
params![
msg_id,
content.as_str(),
format!("k0e5p-{}", i),
"codex",
"/tmp/k0e5p",
format!("/tmp/k0e5p/{}.jsonl", i),
1_700_000_000_i64 + i as i64,
msg_id,
],
)?;
}
let client = SearchClient {
reader: None,
sqlite: Mutex::new(Some(SendConnection(conn))),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: false,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:k0e5p"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
fn hit_keys(hits: &[SearchHit]) -> Vec<(String, Option<usize>)> {
hits.iter()
.map(|h| (h.source_path.clone(), h.line_number))
.collect()
}
let large_hits = client.search_sqlite_fts5(
Path::new(":memory:"),
"rankprobe",
SearchFilters::default(),
6,
0,
FieldMask::FULL,
)?;
assert_eq!(
large_hits.len(),
6,
"limit=N must return all N candidates when the corpus has exactly N matches"
);
let small_hits = client.search_sqlite_fts5(
Path::new(":memory:"),
"rankprobe",
SearchFilters::default(),
3,
0,
FieldMask::FULL,
)?;
assert_eq!(small_hits.len(), 3, "limit=3 must return exactly 3 hits");
let large_keys = hit_keys(&large_hits);
let small_keys = hit_keys(&small_hits);
assert_eq!(
small_keys,
large_keys[..3],
"limit=3 hit keys MUST be the first 3 of limit=6 hit keys (rank+hydrate \
split must not re-order or re-filter); small={small_keys:?} \
large_prefix={:?}",
&large_keys[..3]
);
for (idx, (small, large)) in small_hits.iter().zip(large_hits.iter()).enumerate() {
assert_eq!(
small.content, large.content,
"hit[{idx}] content must agree across limit=3 and limit=6: \
small={:?} large={:?}",
small.content, large.content
);
assert_eq!(
small.title, large.title,
"hit[{idx}] title must agree across limit=3 and limit=6"
);
}
let zero_hits = client.search_sqlite_fts5(
Path::new(":memory:"),
"rankprobe",
SearchFilters::default(),
0,
0,
FieldMask::FULL,
)?;
assert!(
zero_hits.is_empty(),
"limit=0 must return zero hits even though the rank phase has candidates; \
got {} hits",
zero_hits.len()
);
Ok(())
}
#[test]
fn levenshtein_distance_identical_strings() {
assert_eq!(levenshtein_distance("hello", "hello"), 0);
assert_eq!(levenshtein_distance("", ""), 0);
}
#[test]
fn levenshtein_distance_insertions() {
assert_eq!(levenshtein_distance("", "abc"), 3);
assert_eq!(levenshtein_distance("cat", "cats"), 1);
}
#[test]
fn levenshtein_distance_deletions() {
assert_eq!(levenshtein_distance("abc", ""), 3);
assert_eq!(levenshtein_distance("cats", "cat"), 1);
}
#[test]
fn levenshtein_distance_substitutions() {
assert_eq!(levenshtein_distance("cat", "bat"), 1);
assert_eq!(levenshtein_distance("kitten", "sitten"), 1);
}
#[test]
fn levenshtein_distance_mixed_operations() {
assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
}
#[test]
fn is_tool_invocation_noise_allows_real_content() {
assert!(!is_tool_invocation_noise("This is a normal message"));
assert!(!is_tool_invocation_noise(
"Let me use the Tool feature to accomplish this task. Here is the implementation..."
));
let long_content = "[Tool: Read] Now here is a lot of useful content that explains the implementation details and provides context for the changes being made to the codebase.";
assert!(!is_tool_invocation_noise(long_content));
}
#[test]
fn is_tool_invocation_noise_handles_short_tool_markers() {
assert!(is_tool_invocation_noise("[tool: x]"));
assert!(is_tool_invocation_noise("tool: bash"));
}
#[test]
fn search_boolean_and_filters_results() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv1 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc1".into()),
workspace: None,
source_path: dir.path().join("1.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "alpha beta gamma".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv2 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc2".into()),
workspace: None,
source_path: dir.path().join("2.jsonl"),
started_at: Some(2),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(2),
content: "alpha delta".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv1)?;
index.add_conversation(&conv2)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search(
"alpha AND beta",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert!(hits[0].content.contains("gamma"));
let hits = client.search(
"alpha AND delta",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert!(hits[0].content.contains("delta"));
Ok(())
}
#[test]
fn search_boolean_or_expands_results() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv1 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc1".into()),
workspace: None,
source_path: dir.path().join("1.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "unique xyzzy term".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv2 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc2".into()),
workspace: None,
source_path: dir.path().join("2.jsonl"),
started_at: Some(2),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(2),
content: "unique plugh term".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv1)?;
index.add_conversation(&conv2)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search(
"xyzzy OR plugh",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 2);
Ok(())
}
#[test]
fn search_boolean_not_excludes_results() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv1 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc1".into()),
workspace: None,
source_path: dir.path().join("1.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "nottest keep this".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv2 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc2".into()),
workspace: None,
source_path: dir.path().join("2.jsonl"),
started_at: Some(2),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(2),
content: "nottest exclude this".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv1)?;
index.add_conversation(&conv2)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search(
"nottest NOT exclude",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert!(
!hits[0].content.contains("exclude"),
"NOT exclude should filter out doc with 'exclude'"
);
let hits = client.search(
"nottest -exclude",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert!(
!hits[0].content.contains("exclude"),
"Prefix -exclude should filter out doc with 'exclude'"
);
Ok(())
}
#[test]
fn search_phrase_query_matches_exact_sequence() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv1 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc1".into()),
workspace: None,
source_path: dir.path().join("1.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "the quick brown fox".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv2 = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc2".into()),
workspace: None,
source_path: dir.path().join("2.jsonl"),
started_at: Some(2),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(2),
content: "the brown quick fox".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv1)?;
index.add_conversation(&conv2)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search(
"quick brown",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 2);
let hits = client.search(
"\"quick brown\"",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 1);
assert!(hits[0].content.contains("quick brown"));
Ok(())
}
#[test]
fn search_dot_punctuation_splits_terms_but_hyphens_preserve_compound_semantics() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("doc".into()),
workspace: None,
source_path: dir.path().join("3.jsonl"),
started_at: Some(1),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1),
content: "foo bar baz".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
let hits = client.search("foo-bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 0);
Ok(())
}
#[test]
fn explanation_classifies_simple_query() {
let exp = QueryExplanation::analyze("hello", &SearchFilters::default());
assert_eq!(exp.query_type, QueryType::Simple);
assert_eq!(exp.index_strategy, IndexStrategy::EdgeNgram);
assert_eq!(exp.estimated_cost, QueryCost::Low);
assert!(exp.parsed.terms.len() == 1);
assert_eq!(exp.parsed.terms[0].text, "hello");
assert!(!exp.parsed.terms[0].subterms.is_empty());
assert_eq!(exp.parsed.terms[0].subterms[0].pattern, "exact");
}
#[test]
fn explanation_classifies_wildcard_query() {
let exp = QueryExplanation::analyze("*handler*", &SearchFilters::default());
assert_eq!(exp.query_type, QueryType::Wildcard);
assert_eq!(exp.index_strategy, IndexStrategy::RegexScan);
assert_eq!(exp.estimated_cost, QueryCost::High);
assert!(!exp.parsed.terms[0].subterms.is_empty());
assert!(
exp.parsed.terms[0].subterms[0]
.pattern
.contains("substring")
);
assert!(exp.warnings.iter().any(|w| w.contains("regex scan")));
}
#[test]
fn explanation_classifies_boolean_query() {
let exp = QueryExplanation::analyze("foo AND bar", &SearchFilters::default());
assert_eq!(exp.query_type, QueryType::Boolean);
assert_eq!(exp.index_strategy, IndexStrategy::BooleanCombination);
assert!(exp.parsed.operators.contains(&"AND".to_string()));
}
#[test]
fn explanation_classifies_phrase_query() {
let exp = QueryExplanation::analyze("\"exact phrase\"", &SearchFilters::default());
assert_eq!(exp.query_type, QueryType::Phrase);
assert!(exp.parsed.phrases.contains(&"exact phrase".to_string()));
}
#[test]
fn explanation_handles_filtered_query() {
let mut filters = SearchFilters::default();
filters.agents.insert("codex".to_string());
let exp = QueryExplanation::analyze("test", &filters);
assert_eq!(exp.query_type, QueryType::Filtered);
assert_eq!(exp.filters_summary.agent_count, 1);
assert!(
exp.filters_summary
.description
.as_ref()
.unwrap()
.contains("1 agent")
);
assert!(exp.warnings.iter().any(|w| w.contains("codex")));
}
#[test]
fn explanation_handles_empty_query() {
let exp = QueryExplanation::analyze("", &SearchFilters::default());
assert_eq!(exp.query_type, QueryType::Empty);
assert_eq!(exp.index_strategy, IndexStrategy::FullScan);
assert_eq!(exp.estimated_cost, QueryCost::High);
assert!(exp.warnings.iter().any(|w| w.contains("Empty query")));
}
#[test]
fn explanation_warns_short_terms() {
let exp = QueryExplanation::analyze("a", &SearchFilters::default());
assert!(exp.warnings.iter().any(|w| w.contains("Very short term")));
}
#[test]
fn explanation_with_wildcard_fallback() {
let exp = QueryExplanation::analyze("test", &SearchFilters::default())
.with_wildcard_fallback(true);
assert!(exp.wildcard_applied);
assert!(exp.warnings.iter().any(|w| w.contains("Wildcard fallback")));
}
#[test]
fn explanation_complex_query_has_higher_cost() {
let exp = QueryExplanation::analyze(
"foo AND bar OR baz NOT qux AND \"phrase here\"",
&SearchFilters::default(),
);
assert_eq!(exp.query_type, QueryType::Boolean);
assert!(matches!(
exp.estimated_cost,
QueryCost::Medium | QueryCost::High
));
}
#[test]
fn explanation_preserves_original_query() {
let exp = QueryExplanation::analyze("Hello World!", &SearchFilters::default());
assert_eq!(exp.original_query, "Hello World!");
assert!(exp.sanitized_query.contains("Hello"));
assert!(!exp.sanitized_query.contains("!"));
}
#[test]
fn explanation_detects_not_operator() {
let exp = QueryExplanation::analyze("foo NOT bar", &SearchFilters::default());
assert!(exp.parsed.operators.contains(&"NOT".to_string()));
assert!(
exp.parsed
.terms
.iter()
.any(|t| t.negated && t.text == "bar")
);
}
#[test]
fn explanation_implicit_and() {
let exp = QueryExplanation::analyze("foo bar", &SearchFilters::default());
assert!(exp.parsed.implicit_and);
assert_eq!(exp.parsed.terms.len(), 2);
}
#[test]
fn explanation_serializes_to_json() {
let exp = QueryExplanation::analyze("test query", &SearchFilters::default());
let json = serde_json::to_value(&exp).expect("should serialize");
assert!(json["original_query"].is_string());
assert!(json["query_type"].is_string());
assert!(json["index_strategy"].is_string());
assert!(json["estimated_cost"].is_string());
assert!(json["parsed"]["terms"].is_array());
}
#[test]
fn search_multi_filter_agent_workspace_time() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let convs = [
("codex", "/ws/alpha", 100, "needle alpha codex"),
("claude", "/ws/alpha", 200, "needle alpha claude"),
("codex", "/ws/beta", 150, "needle beta codex"),
("codex", "/ws/alpha", 300, "needle alpha codex late"),
];
for (i, (agent, ws, ts, content)) in convs.iter().enumerate() {
let conv = NormalizedConversation {
agent_slug: (*agent).into(),
external_id: None,
title: Some(format!("conv-{i}")),
workspace: Some(std::path::PathBuf::from(*ws)),
source_path: dir.path().join(format!("{i}.jsonl")),
started_at: Some(*ts),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(*ts),
content: (*content).into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.agents.insert("codex".into());
filters.workspaces.insert("/ws/alpha".into());
filters.created_from = Some(50);
filters.created_to = Some(250);
let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(
hits.len(),
1,
"Should match only one conv (codex + alpha + ts=100)"
);
assert_eq!(hits[0].agent, "codex");
assert_eq!(hits[0].workspace, "/ws/alpha");
assert!(hits[0].content.contains("alpha codex"));
assert!(!hits[0].content.contains("late"));
Ok(())
}
#[test]
fn search_multi_agent_filter() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
for agent in ["codex", "claude", "cline", "gemini"] {
let conv = NormalizedConversation {
agent_slug: agent.into(),
external_id: None,
title: Some(format!("{agent}-conv")),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join(format!("{agent}.jsonl")),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: format!("needle from {agent}"),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.agents.insert("codex".into());
filters.agents.insert("claude".into());
let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 2);
let agents: Vec<_> = hits.iter().map(|h| h.agent.as_str()).collect();
assert!(agents.contains(&"codex"));
assert!(agents.contains(&"claude"));
assert!(!agents.contains(&"cline"));
assert!(!agents.contains(&"gemini"));
Ok(())
}
#[test]
fn cache_metrics_incremented_on_operations() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let (hits, miss, shortfall, reloads, _) = client.metrics.snapshot_all();
assert_eq!((hits, miss, shortfall, reloads), (0, 0, 0, 0));
client.metrics.inc_cache_hits();
client.metrics.inc_cache_hits();
client.metrics.inc_cache_miss();
client.metrics.inc_cache_shortfall();
client.metrics.inc_reload();
let (hits, miss, shortfall, reloads, _) = client.metrics.snapshot_all();
assert_eq!(hits, 2);
assert_eq!(miss, 1);
assert_eq!(shortfall, 1);
assert_eq!(reloads, 1);
}
#[test]
fn cache_shard_name_deterministic() {
let client = SearchClient {
reader: None,
sqlite: Mutex::new(None),
sqlite_path: None,
prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
reload_on_search: true,
last_reload: Mutex::new(None),
last_generation: Mutex::new(None),
reload_epoch: Arc::new(AtomicU64::new(0)),
warm_tx: None,
_warm_handle: None,
metrics: Metrics::default(),
cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
semantic: Mutex::new(None),
last_tantivy_total_count: Mutex::new(None),
};
let filters1 = SearchFilters::default();
let mut filters2 = SearchFilters::default();
filters2.agents.insert("codex".into());
let mut filters3 = SearchFilters::default();
filters3.workspaces.insert("/tmp/cass-workspace".into());
let shard1_first = client.shard_name(&filters1);
let shard1_second = client.shard_name(&filters1);
assert_eq!(
shard1_first, shard1_second,
"Same filters should produce same shard name"
);
let shard2 = client.shard_name(&filters2);
assert_ne!(
shard1_first, shard2,
"Different filters should produce different shard names"
);
assert_eq!(shard2, client.shard_name(&filters2));
assert_eq!(
client.shard_name(&filters3),
"workspace:/tmp/cass-workspace"
);
}
#[test]
fn wildcard_fallback_respects_filter_constraints() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv_match = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("match".into()),
workspace: Some(std::path::PathBuf::from("/target")),
source_path: dir.path().join("match.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "unique specific term here".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
let conv_other = NormalizedConversation {
agent_slug: "claude".into(),
external_id: None,
title: Some("other".into()),
workspace: Some(std::path::PathBuf::from("/other")),
source_path: dir.path().join("other.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "unique specific also here".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv_match)?;
index.add_conversation(&conv_other)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters.agents.insert("codex".into());
let result =
client.search_with_fallback("unique", filters.clone(), 10, 0, 100, FieldMask::FULL)?;
assert!(result.hits.iter().all(|h| h.agent == "codex"));
Ok(())
}
#[test]
fn wildcard_fallback_short_query_triggers_prefix() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some("test".into()),
workspace: None,
source_path: dir.path().join("test.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "authentication authorization oauth".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"auth",
SearchFilters::default(),
10,
0,
100,
FieldMask::FULL,
)?;
assert!(
!result.hits.is_empty(),
"Short prefix should match via prefix search"
);
assert!(result.hits[0].content.contains("auth"));
Ok(())
}
#[test]
fn search_real_fixture_multiple_messages() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "claude_code".into(),
external_id: Some("conv-123".into()),
title: Some("Implementing authentication".into()),
workspace: Some(std::path::PathBuf::from("/home/user/project")),
source_path: dir.path().join("session-1.jsonl"),
started_at: Some(1700000000000),
ended_at: Some(1700000060000),
metadata: serde_json::json!({
"model": "claude-3-sonnet",
"tokens": 1500
}),
messages: vec![
NormalizedMessage {
idx: 0,
role: "user".into(),
author: Some("developer".into()),
created_at: Some(1700000000000),
content: "Help me implement JWT authentication for my Express API".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
},
NormalizedMessage {
idx: 1,
role: "assistant".into(),
author: Some("claude".into()),
created_at: Some(1700000010000),
content: "I'll help you implement JWT authentication. First, let's install the required packages.".into(),
extra: serde_json::json!({}),
snippets: vec![NormalizedSnippet {
file_path: Some("package.json".into()),
start_line: Some(1),
end_line: Some(5),
language: Some("json".into()),
snippet_text: Some(r#"{"dependencies":{"jsonwebtoken":"^9.0.0"}}"#.into()),
}],
invocations: Vec::new(),
},
NormalizedMessage {
idx: 2,
role: "user".into(),
author: Some("developer".into()),
created_at: Some(1700000030000),
content: "Can you also add refresh token support?".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
},
],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits = client.search(
"JWT authentication",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert!(!hits.is_empty(), "Should find JWT authentication");
assert!(hits.iter().any(|h| h.agent == "claude_code"));
assert!(
hits.iter()
.any(|h| h.snippet.contains("JWT") || h.snippet.contains("authentication"))
);
let hits = client.search(
"required packages",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert!(
!hits.is_empty(),
"Should find 'required packages' in assistant response"
);
let hits = client.search(
"refresh token",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert!(!hits.is_empty(), "Should find refresh token");
assert!(hits.iter().any(|h| h.content.contains("refresh")));
Ok(())
}
#[test]
fn search_deduplication_with_similar_content() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
for i in 0..2 {
let conv = NormalizedConversation {
agent_slug: "codex".into(),
external_id: None,
title: Some(format!("similar-{i}")),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join(format!("similar-{i}.jsonl")),
started_at: Some(100 + i),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100 + i),
content: "implement the sorting algorithm".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let result = client.search_with_fallback(
"sorting algorithm",
SearchFilters::default(),
10,
0,
100,
FieldMask::FULL,
)?;
assert!(!result.hits.is_empty());
Ok(())
}
#[test]
fn search_session_paths_filter() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let paths = [
dir.path().join("session-a.jsonl"),
dir.path().join("session-b.jsonl"),
dir.path().join("session-c.jsonl"),
];
for (i, path) in paths.iter().enumerate() {
let conv = NormalizedConversation {
agent_slug: "claude".into(),
external_id: None,
title: Some(format!("session-{}", i)),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: path.clone(),
started_at: Some(100 + i as i64),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100 + i as i64),
content: format!("needle content for session {}", i),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let hits_all = client.search("needle", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
assert_eq!(hits_all.len(), 3, "Should find all 3 sessions");
let mut filters = SearchFilters::default();
filters
.session_paths
.insert(paths[0].to_string_lossy().to_string());
filters
.session_paths
.insert(paths[2].to_string_lossy().to_string());
let hits_filtered = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(
hits_filtered.len(),
2,
"Should find only 2 sessions (A and C)"
);
let filtered_paths: HashSet<&str> = hits_filtered
.iter()
.map(|h| h.source_path.as_str())
.collect();
assert!(filtered_paths.contains(paths[0].to_string_lossy().as_ref()));
assert!(filtered_paths.contains(paths[2].to_string_lossy().as_ref()));
assert!(!filtered_paths.contains(paths[1].to_string_lossy().as_ref()));
Ok(())
}
#[test]
fn lexical_session_paths_filter_retries_past_initial_page() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let requested_path = dir.path().join("requested-session.jsonl");
for i in 0..4 {
let conv = NormalizedConversation {
agent_slug: "claude".into(),
external_id: None,
title: Some(format!("distractor-{i}")),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join(format!("distractor-{i}.jsonl")),
started_at: Some(100 + i as i64),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100 + i as i64),
content: "needle needle needle high ranking distractor".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
}
let requested = NormalizedConversation {
agent_slug: "claude".into(),
external_id: None,
title: Some("requested".into()),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: requested_path.clone(),
started_at: Some(200),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(200),
content: "needle requested session should survive post-filter paging".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&requested)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let mut filters = SearchFilters::default();
filters
.session_paths
.insert(requested_path.to_string_lossy().to_string());
let hits = client.search("needle", filters, 1, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
assert_eq!(hits[0].source_path, requested_path.to_string_lossy());
Ok(())
}
#[test]
fn search_session_paths_empty_filter_returns_all() -> Result<()> {
let dir = TempDir::new()?;
let mut index = TantivyIndex::open_or_create(dir.path())?;
let conv = NormalizedConversation {
agent_slug: "claude".into(),
external_id: None,
title: Some("test".into()),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: dir.path().join("test.jsonl"),
started_at: Some(100),
ended_at: None,
metadata: serde_json::json!({}),
messages: vec![NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(100),
content: "needle content".into(),
extra: serde_json::json!({}),
snippets: vec![],
invocations: Vec::new(),
}],
};
index.add_conversation(&conv)?;
index.commit()?;
let client = SearchClient::open(dir.path(), None)?.expect("index present");
let filters = SearchFilters::default();
assert!(filters.session_paths.is_empty());
let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
assert_eq!(hits.len(), 1);
Ok(())
}
#[test]
fn search_client_reads_federated_lexical_bundle_as_one_corpus() -> Result<()> {
let root = TempDir::new()?;
let shard_a = root.path().join("shard-a");
let shard_b = root.path().join("shard-b");
let published = root.path().join("published");
let mut shard_a_index = TantivyIndex::open_or_create(&shard_a)?;
let mut shard_b_index = TantivyIndex::open_or_create(&shard_b)?;
let make_conv =
|external_id: &str, title: &str, source_path: &str, tag: &str| NormalizedConversation {
agent_slug: "codex".into(),
external_id: Some(external_id.into()),
title: Some(title.into()),
workspace: Some(std::path::PathBuf::from("/ws")),
source_path: std::path::PathBuf::from(source_path),
started_at: Some(1_700_000_100_000),
ended_at: Some(1_700_000_100_100),
metadata: json!({}),
messages: vec![
NormalizedMessage {
idx: 0,
role: "user".into(),
author: None,
created_at: Some(1_700_000_100_010),
content: format!("shared federated needle {tag} user"),
extra: json!({}),
snippets: vec![],
invocations: Vec::new(),
},
NormalizedMessage {
idx: 1,
role: "assistant".into(),
author: None,
created_at: Some(1_700_000_100_020),
content: format!("shared federated needle {tag} assistant"),
extra: json!({}),
snippets: vec![],
invocations: Vec::new(),
},
],
};
let conv_a = make_conv(
"fed-query-a",
"Fed Query A",
"/tmp/fed-query-a.jsonl",
"alpha",
);
let conv_b = make_conv(
"fed-query-b",
"Fed Query B",
"/tmp/fed-query-b.jsonl",
"beta",
);
shard_a_index.add_conversation(&conv_a)?;
shard_b_index.add_conversation(&conv_b)?;
shard_a_index.commit()?;
shard_b_index.commit()?;
drop(shard_a_index);
drop(shard_b_index);
crate::search::tantivy::publish_federated_searchable_index_directories(
&published,
&[&shard_a, &shard_b],
)?;
let client = SearchClient::open(&published, None)?.expect("federated index present");
assert!(client.has_tantivy());
assert_eq!(client.total_docs(), 4);
let hits = client.search(
"shared federated needle",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
assert_eq!(hits.len(), 4);
let observed_order = hits
.iter()
.map(|hit| {
(
hit.source_path.clone(),
hit.line_number,
hit.content.clone(),
hit.score.to_bits(),
)
})
.collect::<Vec<_>>();
let hit_paths = hits
.iter()
.map(|hit| hit.source_path.as_str())
.collect::<std::collections::HashSet<_>>();
assert!(hit_paths.contains("/tmp/fed-query-a.jsonl"));
assert!(hit_paths.contains("/tmp/fed-query-b.jsonl"));
for attempt in 0..3 {
let repeated = client.search(
"shared federated needle",
SearchFilters::default(),
10,
0,
FieldMask::FULL,
)?;
let repeated_order = repeated
.iter()
.map(|hit| {
(
hit.source_path.clone(),
hit.line_number,
hit.content.clone(),
hit.score.to_bits(),
)
})
.collect::<Vec<_>>();
assert_eq!(
repeated_order, observed_order,
"federated lexical query order drifted on repeated attempt {attempt}"
);
}
Ok(())
}
#[test]
fn semantic_search_session_paths_filter_retries_past_initial_candidates() -> Result<()> {
let fixture = build_semantic_test_fixture()?;
let mut filters = SearchFilters::default();
filters
.session_paths
.insert(fixture.source_paths[2].clone());
let (hits, ann_stats) = fixture.client.search_semantic(
"semantic fixture query",
filters,
1,
0,
FieldMask::FULL,
false,
)?;
assert!(
ann_stats.is_none(),
"exact search should not emit ANN stats"
);
assert_eq!(
hits.len(),
1,
"filtered semantic search should still return a hit"
);
assert_eq!(
hits[0].source_path, fixture.source_paths[2],
"semantic search should keep searching until it finds the requested session path"
);
Ok(())
}
#[test]
fn semantic_search_offsets_after_session_paths_filtering() -> Result<()> {
let fixture = build_semantic_test_fixture()?;
let mut filters = SearchFilters::default();
filters
.session_paths
.insert(fixture.source_paths[1].clone());
filters
.session_paths
.insert(fixture.source_paths[2].clone());
let (hits, _) = fixture.client.search_semantic(
"semantic fixture query",
filters,
1,
1,
FieldMask::FULL,
false,
)?;
assert_eq!(
hits.len(),
1,
"second filtered page should still return one hit"
);
assert_eq!(
hits[0].source_path, fixture.source_paths[2],
"offset must apply after semantic deduplication and session path filtering"
);
Ok(())
}
#[test]
fn semantic_search_merges_sharded_vector_indexes() -> Result<()> {
let fixture = build_sharded_semantic_test_fixture()?;
let (hits, ann_stats) = fixture.client.search_semantic(
"semantic fixture query",
SearchFilters::default(),
3,
0,
FieldMask::FULL,
false,
)?;
assert!(
ann_stats.is_none(),
"sharded exact search should not emit ANN stats"
);
assert_eq!(hits.len(), 3);
assert_eq!(hits[0].source_path, fixture.source_paths[0]);
assert_eq!(hits[1].source_path, fixture.source_paths[1]);
assert_eq!(hits[2].source_path, fixture.source_paths[2]);
Ok(())
}
#[test]
fn progressive_phase_overfetches_before_session_paths_filtering() -> Result<()> {
let fixture = build_semantic_test_fixture()?;
let mut filters = SearchFilters::default();
filters
.session_paths
.insert(fixture.source_paths[2].clone());
let results = vec![
FsScoredResult {
doc_id: fixture.doc_ids[0].clone(),
score: 1.0,
source: FsScoreSource::SemanticFast,
index: None,
fast_score: Some(1.0),
quality_score: None,
lexical_score: None,
rerank_score: None,
explanation: None,
metadata: None,
},
FsScoredResult {
doc_id: fixture.doc_ids[1].clone(),
score: 0.9,
source: FsScoreSource::SemanticFast,
index: None,
fast_score: Some(0.9),
quality_score: None,
lexical_score: None,
rerank_score: None,
explanation: None,
metadata: None,
},
FsScoredResult {
doc_id: fixture.doc_ids[2].clone(),
score: 0.8,
source: FsScoreSource::SemanticFast,
index: None,
fast_score: Some(0.8),
quality_score: None,
lexical_score: None,
rerank_score: None,
explanation: None,
metadata: None,
},
];
let result = fixture.client.progressive_phase_to_result(
&results,
ProgressivePhaseContext {
query: "session path filter",
filters: &filters,
field_mask: FieldMask::FULL,
lexical_cache: None,
limit: 1,
fetch_limit: 3,
},
)?;
assert_eq!(
result.hits.len(),
1,
"progressive phase should retain enough overfetched hits to satisfy post-search session path filtering"
);
assert_eq!(
result.hits[0].source_path, fixture.source_paths[2],
"progressive phase should page after session path filtering"
);
Ok(())
}
#[test]
fn sql_placeholders_empty() {
assert_eq!(sql_placeholders(0), "");
}
#[test]
fn sql_placeholders_single() {
assert_eq!(sql_placeholders(1), "?");
}
#[test]
fn sql_placeholders_multiple() {
assert_eq!(sql_placeholders(3), "?,?,?");
assert_eq!(sql_placeholders(5), "?,?,?,?,?");
}
#[test]
fn sql_placeholders_capacity_efficient() {
let result = sql_placeholders(3);
assert_eq!(result.len(), 5);
assert!(result.capacity() >= 5);
let result = sql_placeholders(10);
assert_eq!(result.len(), 19);
assert!(result.capacity() >= 19);
}
#[test]
fn sql_placeholders_large_count() {
let result = sql_placeholders(100);
assert_eq!(result.len(), 199); assert_eq!(result.chars().filter(|c| *c == '?').count(), 100);
assert_eq!(result.chars().filter(|c| *c == ',').count(), 99);
}
#[test]
fn hybrid_budget_identifier_biases_lexical() {
let budget = hybrid_candidate_budget("src/main.rs", 20, 20, 5, 10_000);
assert!(
budget.lexical_candidates > budget.semantic_candidates,
"identifier queries should allocate more lexical than semantic fanout"
);
assert!(budget.lexical_candidates >= 25);
}
#[test]
fn hybrid_budget_natural_language_biases_semantic() {
let budget = hybrid_candidate_budget(
"how do we fix authentication middleware latency",
20,
20,
5,
10_000,
);
assert!(
budget.semantic_candidates > budget.lexical_candidates,
"natural language queries should allocate more semantic than lexical fanout"
);
}
#[test]
fn hybrid_budget_no_limit_caps_both_lexical_and_semantic() {
let total_docs = 2_000_000;
let budget =
hybrid_candidate_budget("authentication middleware", 0, total_docs, 0, total_docs);
let cap = no_limit_result_cap();
assert!(
budget.lexical_candidates <= cap,
"lexical fanout must respect no_limit_result_cap() = {cap}; got {}",
budget.lexical_candidates
);
assert!(
budget.lexical_candidates <= NO_LIMIT_RESULT_MAX,
"lexical fanout must respect the absolute NO_LIMIT_RESULT_MAX; got {}",
budget.lexical_candidates
);
assert!(budget.semantic_candidates <= HYBRID_NO_LIMIT_SEMANTIC_CAP);
assert!(
budget.semantic_candidates <= budget.lexical_candidates,
"semantic ({}) must not exceed lexical ({}) fanout",
budget.semantic_candidates,
budget.lexical_candidates
);
}
#[test]
fn compute_no_limit_result_cap_clamps_explicit_over_ceiling_env_override() {
let cap = compute_no_limit_result_cap_from(Some("999999999999".to_string()), None, None);
assert!(
cap <= NO_LIMIT_RESULT_MAX,
"explicit override must still clamp to ceiling; got {cap} > {NO_LIMIT_RESULT_MAX}"
);
assert!(cap >= NO_LIMIT_RESULT_MIN);
}
#[test]
fn compute_no_limit_result_cap_clamps_tiny_explicit_override_up_to_floor() {
let cap = compute_no_limit_result_cap_from(Some("1".to_string()), None, None);
assert_eq!(cap, NO_LIMIT_RESULT_MIN);
}
#[test]
fn compute_no_limit_result_cap_uses_meminfo_when_no_env_override() {
let cap = compute_no_limit_result_cap_from(None, None, Some(128u64 * 1024 * 1024 * 1024));
assert!(cap >= NO_LIMIT_RESULT_MIN, "cap {cap} below floor");
assert!(cap <= NO_LIMIT_RESULT_MAX, "cap {cap} above ceiling");
assert!(cap > NO_LIMIT_RESULT_MIN * 10);
}
#[test]
fn compute_no_limit_result_cap_falls_back_to_floor_when_meminfo_unavailable() {
let cap = compute_no_limit_result_cap_from(None, None, None);
assert!(cap >= NO_LIMIT_RESULT_MIN);
assert!(cap <= NO_LIMIT_RESULT_MAX);
}
#[test]
fn compute_no_limit_result_cap_bytes_env_takes_priority_over_meminfo() {
let four_gib = (4u64 * 1024 * 1024 * 1024).to_string();
let cap = compute_no_limit_result_cap_from(
None,
Some(four_gib),
Some(1024u64 * 1024 * 1024 * 1024), );
let expected_hits = ((4u64 * 1024 * 1024 * 1024) / AVG_HIT_BYTES) as usize;
let expected = expected_hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
assert_eq!(cap, expected, "bytes env must win over meminfo");
}
#[test]
fn no_limit_budget_bytes_preserves_fallback_priority() {
let huge_meminfo = Some(1024u64 * 1024 * 1024 * 1024);
let four_gib = 4u64 * 1024 * 1024 * 1024;
assert_eq!(
no_limit_budget_bytes(Some(four_gib.to_string()), huge_meminfo),
four_gib
);
assert_eq!(
no_limit_budget_bytes(Some("0".to_string()), huge_meminfo),
NO_LIMIT_BYTES_CEILING
);
assert_eq!(no_limit_budget_bytes(None, None), NO_LIMIT_BYTES_FLOOR);
}
#[test]
fn compute_no_limit_result_cap_ignores_malformed_env() {
for bad in ["", "abc", "0", "-1"] {
let cap = compute_no_limit_result_cap_from(
Some(bad.to_string()),
Some(bad.to_string()),
None,
);
assert!(cap >= NO_LIMIT_RESULT_MIN, "bad={bad:?} cap={cap}");
assert!(cap <= NO_LIMIT_RESULT_MAX, "bad={bad:?} cap={cap}");
}
}
fn make_test_hit(id: &str, score: f32) -> SearchHit {
SearchHit {
title: id.to_string(),
snippet: String::new(),
content: id.to_string(),
content_hash: stable_content_hash(id),
score,
source_path: format!("/path/{}.jsonl", id),
agent: "test".to_string(),
workspace: "/workspace".to_string(),
workspace_original: None,
created_at: Some(1_700_000_000_000),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".to_string(),
origin_kind: "local".to_string(),
origin_host: None,
conversation_id: None,
}
}
#[test]
fn test_rrf_fusion_ordering() {
let lexical = vec![
make_test_hit("A", 10.0),
make_test_hit("B", 8.0),
make_test_hit("C", 6.0),
];
let semantic = vec![
make_test_hit("A", 0.9),
make_test_hit("B", 0.7),
make_test_hit("D", 0.5),
];
let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
assert_eq!(fused.len(), 4);
assert_eq!(fused[0].title, "A"); assert_eq!(fused[1].title, "B"); }
#[test]
fn test_rrf_handles_disjoint_sets() {
let lexical = vec![make_test_hit("A", 10.0), make_test_hit("B", 8.0)];
let semantic = vec![make_test_hit("C", 0.9), make_test_hit("D", 0.7)];
let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
assert_eq!(fused.len(), 4);
let titles: Vec<&str> = fused.iter().map(|h| h.title.as_str()).collect();
assert!(titles.contains(&"A"));
assert!(titles.contains(&"B"));
assert!(titles.contains(&"C"));
assert!(titles.contains(&"D"));
}
#[test]
fn test_rrf_tie_breaking_deterministic() {
let lexical = vec![
make_test_hit("X", 5.0),
make_test_hit("Y", 5.0),
make_test_hit("Z", 5.0),
];
let semantic = vec![];
let fused1 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
let fused2 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
let fused3 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
assert_eq!(fused1.len(), fused2.len());
assert_eq!(fused2.len(), fused3.len());
for i in 0..fused1.len() {
assert_eq!(fused1[i].title, fused2[i].title, "Mismatch at index {}", i);
assert_eq!(fused2[i].title, fused3[i].title, "Mismatch at index {}", i);
}
}
#[test]
fn test_rrf_both_lists_bonus() {
let lexical = vec![
make_test_hit("solo_lex", 10.0), make_test_hit("both", 5.0), ];
let semantic = vec![
make_test_hit("solo_sem", 0.9), make_test_hit("both", 0.5), ];
let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
assert_eq!(
fused[0].title, "both",
"Doc in both lists should rank first"
);
}
#[test]
fn test_rrf_respects_limit_and_offset() {
let lexical = vec![
make_test_hit("A", 10.0),
make_test_hit("B", 8.0),
make_test_hit("C", 6.0),
];
let semantic = vec![];
let fused = rrf_fuse_hits(&lexical, &semantic, "", 2, 0);
assert_eq!(fused.len(), 2);
let fused_offset = rrf_fuse_hits(&lexical, &semantic, "", 10, 1);
assert_eq!(fused_offset.len(), 2);
let fused_empty = rrf_fuse_hits(&lexical, &semantic, "", 0, 0);
assert!(fused_empty.is_empty());
}
#[test]
fn test_rrf_empty_inputs() {
let empty: Vec<SearchHit> = vec![];
let non_empty = vec![make_test_hit("A", 10.0)];
assert!(rrf_fuse_hits(&empty, &empty, "", 10, 0).is_empty());
let fused = rrf_fuse_hits(&empty, &non_empty, "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].title, "A");
let fused = rrf_fuse_hits(&non_empty, &empty, "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].title, "A");
}
#[test]
fn test_rrf_coalesces_empty_title_hits_across_search_modes() {
let mut lexical = make_test_hit("shared", 10.0);
lexical.title.clear();
lexical.source_path = "/shared/untitled.jsonl".into();
lexical.content = "same untitled body".into();
lexical.content_hash = stable_content_hash("same untitled body");
let mut semantic = lexical.clone();
semantic.score = 0.9;
let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].title, "");
}
#[test]
fn test_rrf_coalesces_blank_local_source_id_hits_across_search_modes() {
let mut lexical = make_test_hit("shared-local", 10.0);
lexical.source_path = "/shared/local.jsonl".into();
lexical.content = "same local body".into();
lexical.content_hash = stable_content_hash("same local body");
lexical.source_id = "local".into();
lexical.origin_kind = "local".into();
let mut semantic = lexical.clone();
semantic.source_id = " ".into();
semantic.origin_kind = "local".into();
semantic.score = 0.9;
let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].source_id, "local");
}
#[test]
fn test_rrf_keeps_repeated_same_content_at_different_lines() {
let mut first = make_test_hit("same", 10.0);
first.title = "Shared Session".into();
first.source_path = "/shared/session.jsonl".into();
first.content = "repeat me".into();
first.content_hash = stable_content_hash("repeat me");
first.line_number = Some(1);
first.created_at = Some(100);
let mut second = first.clone();
second.line_number = Some(2);
second.created_at = Some(200);
second.score = 0.9;
let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
assert_eq!(fused.len(), 2);
assert_eq!(fused[0].line_number, Some(1));
assert_eq!(fused[1].line_number, Some(2));
}
#[test]
fn test_rrf_coalesces_present_and_missing_conversation_id_for_same_message() {
let mut lexical = make_test_hit("same", 10.0);
lexical.title = "Shared Session".into();
lexical.source_path = "/shared/session.jsonl".into();
lexical.content = "identical body".into();
lexical.content_hash = stable_content_hash("identical body");
lexical.created_at = Some(100);
lexical.line_number = Some(1);
lexical.conversation_id = None;
let mut semantic = lexical.clone();
semantic.conversation_id = Some(42);
semantic.score = 0.9;
let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].conversation_id, Some(42));
}
#[test]
fn test_rrf_coalesces_present_and_missing_conversation_id_despite_blank_local_source_id() {
let mut lexical = make_test_hit("same", 10.0);
lexical.title = "Shared Session".into();
lexical.source_path = "/shared/session.jsonl".into();
lexical.content = "identical body".into();
lexical.content_hash = stable_content_hash("identical body");
lexical.created_at = Some(100);
lexical.line_number = Some(1);
lexical.conversation_id = None;
lexical.source_id = "local".into();
lexical.origin_kind = "local".into();
let mut semantic = lexical.clone();
semantic.conversation_id = Some(42);
semantic.source_id = " ".into();
semantic.origin_kind = "local".into();
semantic.score = 0.9;
let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].conversation_id, Some(42));
}
#[test]
fn test_rrf_keeps_distinct_conversation_ids_for_shared_path_and_content() {
let mut first = make_test_hit("same", 10.0);
first.title = "Shared Session".into();
first.source_path = "/shared/session.jsonl".into();
first.content = "identical body".into();
first.content_hash = stable_content_hash("identical body");
first.conversation_id = Some(1);
let mut second = first.clone();
second.conversation_id = Some(2);
second.score = 0.9;
let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
assert_eq!(fused.len(), 2);
assert!(fused.iter().any(|hit| hit.conversation_id == Some(1)));
assert!(fused.iter().any(|hit| hit.conversation_id == Some(2)));
}
#[test]
fn test_rrf_coalesces_same_conversation_id_despite_title_drift() {
let mut lexical = make_test_hit("same", 10.0);
lexical.title = "Morning Session".into();
lexical.source_path = "/shared/session.jsonl".into();
lexical.content = "identical body".into();
lexical.content_hash = stable_content_hash("identical body");
lexical.conversation_id = Some(9);
let mut semantic = lexical.clone();
semantic.title = "Evening Session".into();
semantic.score = 0.9;
let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
assert_eq!(fused.len(), 1);
assert_eq!(fused[0].conversation_id, Some(9));
}
#[test]
fn test_rrf_keeps_distinct_titles_for_shared_path_and_content() {
let mut morning = make_test_hit("same", 10.0);
morning.title = "Morning Session".into();
morning.source_path = "/shared/session.jsonl".into();
morning.content = "identical body".into();
morning.content_hash = stable_content_hash("identical body");
morning.created_at = None;
let mut evening = morning.clone();
evening.title = "Evening Session".into();
evening.score = 0.9;
let fused = rrf_fuse_hits(&[morning], &[evening], "", 10, 0);
assert_eq!(fused.len(), 2);
assert!(fused.iter().any(|hit| hit.title == "Morning Session"));
assert!(fused.iter().any(|hit| hit.title == "Evening Session"));
}
#[test]
fn test_rrf_candidate_depth() {
let lexical: Vec<_> = (0..50)
.map(|i| make_test_hit(&format!("L{}", i), 100.0 - i as f32))
.collect();
let semantic: Vec<_> = (0..50)
.map(|i| make_test_hit(&format!("S{}", i), 1.0 - 0.01 * i as f32))
.collect();
let fused = rrf_fuse_hits(&lexical, &semantic, "", 20, 0);
assert_eq!(fused.len(), 20);
let mut seen = std::collections::HashSet::new();
for hit in &fused {
assert!(seen.insert(&hit.title), "Duplicate hit: {}", hit.title);
}
}
#[test]
fn query_token_list_parses_small_queries() {
let cases = [
("hello", 1),
("hello world", 2),
("hello AND world", 3),
("hello world foo bar", 4),
];
for (query, expected_len) in cases {
let tokens = parse_boolean_query(query);
assert_eq!(tokens.len(), expected_len, "{query}");
}
}
#[test]
fn query_token_list_parses_large_queries() {
let tokens = parse_boolean_query("a b c d e f g h i");
assert_eq!(tokens.len(), 9);
}
#[test]
fn query_token_list_handles_quoted_phrases() {
let tokens = parse_boolean_query("\"hello world\" test");
assert_eq!(tokens.len(), 2);
assert!(
matches!(&tokens[0], QueryToken::Phrase(phrase) if phrase == "hello world"),
"Expected Phrase token"
);
}
#[test]
fn query_token_list_handles_operators() {
let tokens = parse_boolean_query("foo AND bar OR baz");
assert_eq!(tokens.len(), 5);
assert_eq!(tokens[1], QueryToken::And);
assert_eq!(tokens[3], QueryToken::Or);
}
#[test]
fn query_token_list_empty_query() {
let tokens = parse_boolean_query("");
assert!(tokens.is_empty());
}
#[test]
fn query_token_list_iteration_works() {
let tokens = parse_boolean_query("a b c");
let terms: Vec<_> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.as_str()),
_ => None,
})
.collect();
assert_eq!(terms, vec!["a", "b", "c"]);
}
#[test]
fn unicode_emoji_treated_as_separator() {
let sanitized = sanitize_query("🚀 launch");
assert_eq!(sanitized, " launch", "Emoji should become space");
}
#[test]
fn unicode_emoji_splits_terms() {
let sanitized = sanitize_query("hot🔥code");
assert_eq!(sanitized, "hot code", "Emoji between words splits them");
}
#[test]
fn unicode_multiple_emoji_become_spaces() {
let sanitized = sanitize_query("🚀🔥💻");
assert_eq!(
sanitized.trim(),
"",
"All-emoji query sanitizes to whitespace"
);
}
#[test]
fn unicode_emoji_query_parses_without_panic() {
let tokens = parse_boolean_query("🚀 launch code 🔥");
let terms: Vec<_> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.clone()),
_ => None,
})
.collect();
assert!(
terms
.iter()
.any(|t| t.contains("launch") || t.contains("code"))
);
}
#[test]
fn unicode_emoji_query_terms_lower() {
let terms = QueryTermsLower::from_query("🚀 LAUNCH");
let tokens: Vec<&str> = terms.tokens().collect();
assert!(
tokens.contains(&"launch"),
"Should extract 'launch' from emoji query"
);
}
#[test]
fn unicode_cjk_chinese_preserved() {
assert_eq!(sanitize_query("测试代码"), "测试代码");
assert_eq!(sanitize_query("测试 代码"), "测试 代码");
}
#[test]
fn unicode_cjk_japanese_preserved() {
assert_eq!(sanitize_query("テスト"), "テスト");
assert_eq!(sanitize_query("こんにちは世界"), "こんにちは世界");
}
#[test]
fn unicode_cjk_korean_preserved() {
assert_eq!(sanitize_query("테스트"), "테스트");
assert_eq!(sanitize_query("안녕하세요"), "안녕하세요");
}
#[test]
fn unicode_cjk_parsed_as_terms() {
let tokens = parse_boolean_query("测试 代码 search");
let terms: Vec<_> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.as_str()),
_ => None,
})
.collect();
assert_eq!(terms, vec!["测试", "代码", "search"]);
}
#[test]
fn unicode_cjk_query_terms_lower() {
let terms = QueryTermsLower::from_query("测试 代码");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["测试", "代码"]);
}
#[test]
fn unicode_hebrew_preserved() {
assert_eq!(sanitize_query("שלום עולם"), "שלום עולם");
}
#[test]
fn unicode_arabic_preserved() {
assert_eq!(sanitize_query("مرحبا"), "مرحبا");
}
#[test]
fn unicode_hebrew_parsed_as_terms() {
let tokens = parse_boolean_query("שלום עולם");
let terms: Vec<_> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.as_str()),
_ => None,
})
.collect();
assert_eq!(terms, vec!["שלום", "עולם"]);
}
#[test]
fn unicode_arabic_query_terms_lower() {
let terms = QueryTermsLower::from_query("مرحبا بالعالم");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["مرحبا", "بالعالم"]);
}
#[test]
fn unicode_mixed_scripts_preserved() {
let sanitized = sanitize_query("Hello 世界 мир");
assert_eq!(sanitized, "Hello 世界 мир");
}
#[test]
fn unicode_mixed_scripts_parsed() {
let tokens = parse_boolean_query("Hello 世界 мир");
let terms: Vec<_> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.as_str()),
_ => None,
})
.collect();
assert_eq!(terms, vec!["Hello", "世界", "мир"]);
}
#[test]
fn unicode_mixed_scripts_with_emoji() {
let sanitized = sanitize_query("Hello 🌍 世界");
assert_eq!(sanitized, "Hello 世界");
}
#[test]
fn unicode_latin_cyrillic_arabic_query() {
let terms = QueryTermsLower::from_query("Hello Мир مرحبا");
let tokens: Vec<&str> = terms.tokens().collect();
assert_eq!(tokens, vec!["hello", "мир", "مرحبا"]);
}
#[test]
fn unicode_zero_width_joiner_removed() {
let sanitized = sanitize_query("test\u{200D}query");
assert_eq!(sanitized, "test query");
}
#[test]
fn unicode_zero_width_non_joiner_removed() {
let sanitized = sanitize_query("test\u{200C}query");
assert_eq!(sanitized, "test query");
}
#[test]
fn unicode_zero_width_space_removed() {
let sanitized = sanitize_query("test\u{200B}query");
assert_eq!(sanitized, "test query");
}
#[test]
fn unicode_bom_removed() {
let sanitized = sanitize_query("\u{FEFF}test");
assert_eq!(sanitized, " test");
}
#[test]
fn unicode_precomposed_accent_preserved() {
let sanitized = sanitize_query("café");
assert_eq!(sanitized, "café");
}
#[test]
fn unicode_combining_accent_becomes_separator() {
let input = "cafe\u{0301}";
let sanitized = sanitize_query(input);
assert_eq!(sanitized, "caf\u{00e9}");
}
#[test]
fn unicode_nfc_and_nfd_produce_same_sanitized_query() {
let nfc = "caf\u{00E9}";
let nfd = "cafe\u{0301}";
let san_nfc = sanitize_query(nfc);
let san_nfd = sanitize_query(nfd);
assert_eq!(san_nfc, "café");
assert_eq!(san_nfd, "café");
assert_eq!(san_nfc, san_nfd);
}
#[test]
fn unicode_combining_marks_do_not_panic() {
let zalgo = "t\u{0301}\u{0302}\u{0303}e\u{0304}\u{0305}st";
let sanitized = sanitize_query(zalgo);
assert!(sanitized.contains('t'));
assert!(sanitized.contains('s'));
}
#[test]
fn unicode_mathematical_bold_letters_preserved() {
let input = "\u{1D400}\u{1D401}\u{1D402}";
let sanitized = sanitize_query(input);
assert_eq!(
sanitized, input,
"Mathematical bold letters are alphanumeric"
);
}
#[test]
fn unicode_supplementary_ideograph_preserved() {
let input = "\u{20000}";
let sanitized = sanitize_query(input);
assert_eq!(
sanitized, input,
"Supplementary CJK ideographs are alphanumeric"
);
}
#[test]
fn unicode_supplementary_emoji_removed() {
let input = "test\u{1F600}query";
let sanitized = sanitize_query(input);
assert_eq!(sanitized, "test query");
}
#[test]
fn unicode_bidi_mixed_ltr_rtl_no_panic() {
let input = "hello שלום world עולם";
let tokens = parse_boolean_query(input);
let terms: Vec<_> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.as_str()),
_ => None,
})
.collect();
assert_eq!(terms.len(), 4);
assert!(terms.contains(&"hello"));
assert!(terms.contains(&"שלום"));
assert!(terms.contains(&"world"));
assert!(terms.contains(&"עולם"));
}
#[test]
fn unicode_bidi_override_chars_removed() {
let input = "test\u{202D}content\u{202C}end";
let sanitized = sanitize_query(input);
assert_eq!(sanitized, "test content end");
}
#[test]
fn unicode_bidi_rtl_mark_removed() {
let input = "test\u{200F}content";
let sanitized = sanitize_query(input);
assert_eq!(sanitized, "test content");
}
#[test]
fn unicode_full_pipeline_cjk_query() {
let explanation = QueryExplanation::analyze("测试 代码", &SearchFilters::default());
assert_eq!(explanation.parsed.terms.len(), 2);
assert!(!explanation.parsed.terms[0].text.is_empty());
assert!(!explanation.parsed.terms[1].text.is_empty());
}
#[test]
fn unicode_full_pipeline_mixed_script_boolean() {
let explanation =
QueryExplanation::analyze("Hello AND 世界 OR مرحبا", &SearchFilters::default());
assert!(
explanation.parsed.operators.iter().any(|op| op == "AND"),
"AND operator should be recognized in mixed-script query"
);
}
#[test]
fn unicode_full_pipeline_emoji_query_type() {
let explanation = QueryExplanation::analyze("🚀🔥💻", &SearchFilters::default());
assert!(
explanation.parsed.terms.is_empty()
|| explanation
.parsed
.terms
.iter()
.all(|t| t.subterms.is_empty()),
"All-emoji query should produce no meaningful terms"
);
}
#[test]
fn unicode_full_pipeline_phrase_with_cjk() {
let explanation = QueryExplanation::analyze("\"测试代码\"", &SearchFilters::default());
assert!(
!explanation.parsed.phrases.is_empty(),
"CJK phrase should be recognized"
);
}
#[test]
fn unicode_full_pipeline_wildcard_with_unicode() {
let explanation = QueryExplanation::analyze("*测试*", &SearchFilters::default());
assert!(
!explanation.parsed.terms.is_empty(),
"Wildcard with CJK should produce terms"
);
if let Some(term) = explanation.parsed.terms.first() {
assert!(
term.subterms
.iter()
.any(|s| s.pattern.contains("*") || s.pattern == "exact"),
"CJK wildcard should produce wildcard or exact pattern"
);
}
}
#[test]
fn unicode_query_terms_lower_case_folding() {
let terms = QueryTermsLower::from_query("STRAßE");
assert_eq!(terms.query_lower, "straße");
let terms2 = QueryTermsLower::from_query("HELLO");
assert_eq!(terms2.query_lower, "hello");
}
#[test]
fn unicode_normalize_term_parts_cjk() {
let parts = normalize_term_parts("测试 代码");
assert_eq!(parts, vec!["测试", "代码"]);
}
#[test]
fn unicode_normalize_term_parts_strips_emoji() {
let parts = normalize_term_parts("🚀launch🔥code");
assert!(parts.contains(&"launch".to_string()));
assert!(parts.contains(&"code".to_string()));
}
#[test]
fn special_char_unbalanced_quote_no_panic() {
let tokens = parse_boolean_query("\"hello world");
assert!(
tokens
.iter()
.any(|t| matches!(t, QueryToken::Phrase(p) if p.contains("hello"))),
"Unbalanced quote should still produce a phrase: {tokens:?}"
);
}
#[test]
fn special_char_unbalanced_trailing_quote() {
let tokens = parse_boolean_query("test\"");
assert!(
tokens
.iter()
.any(|t| matches!(t, QueryToken::Term(w) if w == "test")),
"Text before trailing quote should parse as term: {tokens:?}"
);
}
#[test]
fn special_char_multiple_unbalanced_quotes() {
let tokens = parse_boolean_query("\"foo \"bar");
assert!(
!tokens.is_empty(),
"Should parse despite odd quotes: {tokens:?}"
);
}
#[test]
fn special_char_empty_quotes() {
let tokens = parse_boolean_query("\"\" test");
assert!(
tokens
.iter()
.any(|t| matches!(t, QueryToken::Term(w) if w == "test")),
"Empty quotes should be skipped: {tokens:?}"
);
}
#[test]
fn special_char_unbalanced_via_sanitize() {
let sanitized = sanitize_query("\"hello world");
assert!(
sanitized.contains('"'),
"Quotes preserved by sanitize_query"
);
}
#[test]
fn special_char_backslash_quote_sanitize() {
let sanitized = sanitize_query("\\\"test\\\"");
assert!(sanitized.contains('"'));
assert!(!sanitized.contains('\\'), "Backslash should be stripped");
}
#[test]
fn special_char_backslash_quote_parse() {
let tokens = parse_boolean_query("\\\"test\\\"");
assert!(!tokens.is_empty(), "Should parse without panic: {tokens:?}");
}
#[test]
fn special_char_inner_escaped_quotes() {
let tokens = parse_boolean_query("\"test \\\"inner\\\" test\"");
assert!(
!tokens.is_empty(),
"Nested escaped quotes should not panic: {tokens:?}"
);
}
#[test]
fn special_char_windows_path_sanitize() {
let sanitized = sanitize_query("C:\\Users\\test");
assert_eq!(sanitized, "C Users test");
}
#[test]
fn special_char_unc_path_sanitize() {
let sanitized = sanitize_query("\\\\server\\share");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"server"));
assert!(parts.contains(&"share"));
}
#[test]
fn special_char_windows_path_terms() {
let parts = normalize_term_parts("C:\\Users\\test\\file.rs");
assert!(parts.contains(&"C".to_string()));
assert!(parts.contains(&"Users".to_string()));
assert!(parts.contains(&"test".to_string()));
assert!(parts.contains(&"file".to_string()));
assert!(parts.contains(&"rs".to_string()));
}
#[test]
fn special_char_regex_dot_star() {
let sanitized = sanitize_query("foo.*bar");
assert_eq!(sanitized, "foo *bar");
}
#[test]
fn special_char_regex_char_class() {
let sanitized = sanitize_query("[a-z]+");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["a-z"]);
assert_eq!(normalize_term_parts("[a-z]+"), vec!["a", "z"]);
}
#[test]
fn special_char_regex_anchors() {
let sanitized = sanitize_query("^start$");
assert_eq!(sanitized.trim(), "start");
}
#[test]
fn special_char_regex_pipe_groups() {
let sanitized = sanitize_query("(foo|bar)");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["foo", "bar"]);
}
#[test]
fn special_char_sql_injection_or() {
let sanitized = sanitize_query("'OR 1=1--");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"OR"));
assert!(parts.contains(&"1"));
assert!(!sanitized.contains('\''));
assert!(!sanitized.contains('='));
}
#[test]
fn special_char_sql_injection_drop() {
let sanitized = sanitize_query("; DROP TABLE users;--");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"DROP"));
assert!(parts.contains(&"TABLE"));
assert!(parts.contains(&"users"));
assert!(!sanitized.contains(';'));
}
#[test]
fn special_char_sql_injection_union() {
let sanitized = sanitize_query("' UNION SELECT * FROM passwords --");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"UNION"));
assert!(parts.contains(&"SELECT"));
assert!(parts.contains(&"*"));
assert!(parts.contains(&"FROM"));
assert!(parts.contains(&"passwords"));
}
#[test]
fn special_char_sql_parse_as_literal() {
let tokens = parse_boolean_query("OR 1=1");
assert!(
tokens.iter().any(|t| matches!(t, QueryToken::Or)),
"OR should be parsed as Or operator: {tokens:?}"
);
}
#[test]
fn special_char_shell_subshell() {
let sanitized = sanitize_query("$(cmd)");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["cmd"]);
}
#[test]
fn special_char_shell_backticks() {
let sanitized = sanitize_query("`cmd`");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["cmd"]);
}
#[test]
fn special_char_shell_pipe_rm() {
let sanitized = sanitize_query("| rm -rf /");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"rm"));
assert!(parts.contains(&"-rf"));
assert_eq!(normalize_term_parts("| rm -rf /"), vec!["rm", "rf"]);
assert!(!sanitized.contains('|'));
assert!(!sanitized.contains('/'));
}
#[test]
fn special_char_shell_semicolon_chain() {
let sanitized = sanitize_query("test; echo pwned; cat /etc/passwd");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"test"));
assert!(parts.contains(&"echo"));
assert!(parts.contains(&"pwned"));
assert!(!sanitized.contains(';'));
}
#[test]
fn special_char_null_byte_mid_string() {
let sanitized = sanitize_query("test\x00hidden");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["test", "hidden"]);
}
#[test]
fn special_char_null_byte_leading() {
let sanitized = sanitize_query("\x00\x00attack");
assert_eq!(sanitized.trim(), "attack");
}
#[test]
fn special_char_null_byte_trailing() {
let sanitized = sanitize_query("query\x00\x00\x00");
assert_eq!(sanitized.trim(), "query");
}
#[test]
fn special_char_null_byte_parse() {
let tokens = parse_boolean_query("test\x00hidden");
assert!(
!tokens.is_empty(),
"Null bytes should not prevent parsing: {tokens:?}"
);
}
#[test]
fn special_char_control_newline() {
let sanitized = sanitize_query("line1\nline2");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["line1", "line2"]);
}
#[test]
fn special_char_control_tab_cr() {
let sanitized = sanitize_query("tab\there\r\nend");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["tab", "here", "end"]);
}
#[test]
fn special_char_control_parse_whitespace() {
let tokens = parse_boolean_query("hello\tworld\ntest");
let terms: Vec<&str> = tokens
.iter()
.filter_map(|t| match t {
QueryToken::Term(s) => Some(s.as_str()),
_ => None,
})
.collect();
assert_eq!(terms, vec!["hello", "world", "test"]);
}
#[test]
fn special_char_control_bell_escape() {
let sanitized = sanitize_query("test\x07\x1b[31mred");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"test"));
assert!(parts.contains(&"31mred"));
}
#[test]
fn special_char_html_entity_lt() {
let sanitized = sanitize_query("<script>");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["lt", "script", "gt"]);
}
#[test]
fn special_char_html_numeric_entity() {
let sanitized = sanitize_query("<script>");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"x3C"));
assert!(parts.contains(&"script"));
assert!(parts.contains(&"x3E"));
}
#[test]
fn special_char_html_tags_stripped() {
let sanitized = sanitize_query("<script>alert('xss')</script>");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"script"));
assert!(parts.contains(&"alert"));
assert!(parts.contains(&"xss"));
}
#[test]
fn special_char_html_attribute() {
let sanitized = sanitize_query("<img src=\"evil.js\" onerror=\"alert(1)\">");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert!(parts.contains(&"img"));
assert!(parts.contains(&"src"));
assert!(parts.contains(&"onerror"));
}
#[test]
fn special_char_url_percent_encoding() {
let sanitized = sanitize_query("%20space%2Fslash");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["20space", "2Fslash"]);
}
#[test]
fn special_char_url_null_byte_encoded() {
let sanitized = sanitize_query("test%00hidden");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["test", "00hidden"]);
}
#[test]
fn special_char_url_full_query_string() {
let sanitized = sanitize_query("search?q=hello&lang=en");
let parts: Vec<&str> = sanitized.split_whitespace().collect();
assert_eq!(parts, vec!["search", "q", "hello", "lang", "en"]);
}
#[test]
fn special_char_explain_sql_injection() {
let filters = SearchFilters::default();
let explanation = QueryExplanation::analyze("'OR 1=1--", &filters);
assert!(
!explanation.parsed.terms.is_empty() || !explanation.parsed.phrases.is_empty(),
"SQL injection should produce parseable terms"
);
}
#[test]
fn special_char_explain_shell_injection() {
let filters = SearchFilters::default();
let explanation = QueryExplanation::analyze("$(rm -rf /)", &filters);
assert!(
!explanation.parsed.terms.is_empty(),
"Shell injection should produce parseable terms"
);
}
#[test]
fn special_char_explain_html_xss() {
let filters = SearchFilters::default();
let explanation = QueryExplanation::analyze("<script>alert('xss')</script>", &filters);
assert!(
!explanation.parsed.terms.is_empty(),
"XSS payload should produce parseable terms"
);
}
#[test]
fn special_char_terms_lower_injection() {
let qt = QueryTermsLower::from_query("'; DROP TABLE--");
let tokens: Vec<&str> = qt.tokens().collect();
for token in &tokens {
assert!(
token.chars().all(|c| c.is_alphanumeric()),
"Token should only contain alphanumeric characters: {token}"
);
}
}
#[test]
fn special_char_terms_lower_null_bytes() {
let qt = QueryTermsLower::from_query("test\x00hidden");
let tokens: Vec<&str> = qt.tokens().collect();
assert!(tokens.contains(&"test"));
assert!(tokens.contains(&"hidden"));
}
#[test]
fn special_char_boolean_with_injection() {
let tokens = parse_boolean_query("search AND 'OR 1=1-- NOT drop");
assert!(
tokens.iter().any(|t| matches!(t, QueryToken::And)),
"Boolean AND should still be recognized: {tokens:?}"
);
assert!(
tokens.iter().any(|t| matches!(t, QueryToken::Not)),
"Boolean NOT should still be recognized: {tokens:?}"
);
}
#[test]
fn stress_query_100k_chars_completes_quickly() {
let long_query = "a ".repeat(50000);
assert_eq!(long_query.len(), 100000);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&long_query);
let elapsed_sanitize = start.elapsed();
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&sanitized);
let elapsed_parse = start.elapsed();
assert!(
elapsed_sanitize < std::time::Duration::from_secs(1),
"sanitize_query with 100k chars took {:?} (>1s)",
elapsed_sanitize
);
assert!(
elapsed_parse < std::time::Duration::from_secs(1),
"parse_boolean_query with 100k chars took {:?} (>1s)",
elapsed_parse
);
assert!(!tokens.is_empty(), "100k char query should produce tokens");
}
#[test]
fn stress_query_1000_terms() {
let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
let query = words.join(" ");
let start = std::time::Instant::now();
let sanitized = sanitize_query(&query);
let tokens = parse_boolean_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"1000 terms query took {:?} (>1s)",
elapsed
);
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert!(
term_count >= 900,
"Expected ~1000 terms, got {} terms",
term_count
);
}
#[test]
fn stress_query_1000_identical_terms() {
let query = "test ".repeat(1000);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&query);
let tokens = parse_boolean_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"1000 identical terms query took {:?} (>1s)",
elapsed
);
let parsed_term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(parsed_term_count, 1000, "Parser should produce 1000 terms");
let qt = QueryTermsLower::from_query(&query);
let tokens_lower: Vec<&str> = qt.tokens().collect();
assert_eq!(
tokens_lower.len(),
1000,
"All 1000 identical terms should be preserved"
);
assert!(
tokens_lower.iter().all(|t| *t == "test"),
"All tokens should be 'test'"
);
}
#[test]
fn stress_query_10k_char_single_term() {
let long_term = "a".repeat(10000);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&long_term);
let tokens = parse_boolean_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"10k char single term took {:?} (>1s)",
elapsed
);
assert_eq!(tokens.len(), 1, "Should produce exactly one token");
assert!(
matches!(&tokens[0], QueryToken::Term(t) if t.len() == 10000),
"Expected Term token"
);
}
#[test]
fn stress_deeply_nested_parentheses() {
let open_parens = "(".repeat(100);
let close_parens = ")".repeat(100);
let query = format!("{}test{}", open_parens, close_parens);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&query);
let tokens = parse_boolean_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_millis(100),
"Deeply nested parens took {:?} (>100ms)",
elapsed
);
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(term_count, 1, "Should have 1 term after sanitizing parens");
}
#[test]
fn stress_many_boolean_operators() {
let terms: Vec<String> = (0..101).map(|i| format!("term{}", i)).collect();
let query = terms.join(" AND ");
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&query);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"100+ boolean ops took {:?} (>1s)",
elapsed
);
let and_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::And))
.count();
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(and_count, 100, "Should have 100 AND operators");
assert_eq!(term_count, 101, "Should have 101 terms");
}
#[test]
fn stress_many_or_operators() {
let terms: Vec<String> = (0..101).map(|i| format!("opt{}", i)).collect();
let query = terms.join(" OR ");
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&query);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"100+ OR ops took {:?} (>1s)",
elapsed
);
let or_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Or))
.count();
assert_eq!(or_count, 100, "Should have 100 OR operators");
}
#[test]
fn stress_mixed_boolean_operators() {
let query = "a AND b OR c NOT d AND e OR f NOT g ".repeat(50);
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&query);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"Mixed boolean ops took {:?} (>1s)",
elapsed
);
assert!(
!tokens.is_empty(),
"Complex boolean query should produce tokens"
);
}
#[test]
fn stress_memory_bounds_large_query() {
let large_query = "x".repeat(100000);
let sanitized = sanitize_query(&large_query);
let tokens = parse_boolean_query(&sanitized);
assert!(
sanitized.len() <= large_query.len(),
"Sanitized output should not exceed input size"
);
assert_eq!(tokens.len(), 1);
let qt = QueryTermsLower::from_query(&large_query);
let token_count = qt.tokens().count();
assert_eq!(token_count, 1, "Should be 1 token of 100k chars");
}
#[test]
fn stress_concurrent_queries() {
use std::thread;
let queries: Vec<String> = (0..100)
.map(|i| format!("concurrent_query_{} test search", i))
.collect();
let handles: Vec<_> = queries
.into_iter()
.map(|query| {
thread::spawn(move || {
let sanitized = sanitize_query(&query);
let tokens = parse_boolean_query(&sanitized);
let qt = QueryTermsLower::from_query(&query);
(tokens.len(), qt.tokens().count())
})
})
.collect();
for (i, handle) in handles.into_iter().enumerate() {
let (token_len, qt_len) = handle.join().expect("Thread panicked");
assert!(token_len > 0, "Query {} should produce tokens", i);
assert!(qt_len > 0, "Query {} QueryTermsLower should have tokens", i);
}
}
#[test]
fn stress_many_quoted_phrases() {
let phrases: Vec<String> = (0..50)
.map(|i| format!("\"phrase number {}\"", i))
.collect();
let query = phrases.join(" AND ");
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&query);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"50 quoted phrases took {:?} (>1s)",
elapsed
);
let phrase_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Phrase(_)))
.count();
assert_eq!(phrase_count, 50, "Should have 50 phrases");
}
#[test]
fn stress_alternating_quotes() {
let parts: Vec<String> = (0..100)
.map(|i| {
if i % 2 == 0 {
format!("\"word{}\"", i)
} else {
format!("word{}", i)
}
})
.collect();
let query = parts.join(" ");
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&query);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"100 alternating quotes took {:?} (>1s)",
elapsed
);
let phrase_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Phrase(_)))
.count();
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(phrase_count, 50, "Should have 50 phrases");
assert_eq!(term_count, 50, "Should have 50 terms");
}
#[test]
fn stress_many_wildcards() {
let patterns: Vec<&str> = vec!["pre*", "*suf", "*sub*", "a*b", "test*", "*ing", "*tion*"];
let query = patterns
.iter()
.cycle()
.take(100)
.cloned()
.collect::<Vec<_>>()
.join(" ");
let start = std::time::Instant::now();
let sanitized = sanitize_query(&query);
let tokens = parse_boolean_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"100 wildcards took {:?} (>1s)",
elapsed
);
assert!(!tokens.is_empty());
}
#[test]
fn stress_query_explanation_large_query() {
let words: Vec<String> = (0..100).map(|i| format!("term{}", i)).collect();
let query = words.join(" ");
let filters = SearchFilters::default();
let start = std::time::Instant::now();
let explanation = QueryExplanation::analyze(&query, &filters);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(2),
"QueryExplanation for 100 terms took {:?} (>2s)",
elapsed
);
assert!(
!explanation.parsed.terms.is_empty(),
"Should parse terms successfully"
);
}
#[test]
fn stress_very_long_single_quoted_phrase() {
let words: Vec<String> = (0..500).map(|i| format!("word{}", i)).collect();
let phrase = format!("\"{}\"", words.join(" "));
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&phrase);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"500-word phrase took {:?} (>1s)",
elapsed
);
let phrase_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Phrase(_)))
.count();
assert_eq!(phrase_count, 1, "Should have exactly 1 phrase");
}
#[test]
fn stress_not_prefix_many() {
let terms: Vec<String> = (0..100).map(|i| format!("-term{}", i)).collect();
let query = terms.join(" ");
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&query);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"100 NOT prefixes took {:?} (>1s)",
elapsed
);
let not_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Not))
.count();
assert_eq!(not_count, 100, "Should have 100 NOT operators");
}
#[test]
fn stress_unicode_large_cjk_query() {
let cjk_chars = "中文日本語한국어".repeat(1000);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&cjk_chars);
let qt = QueryTermsLower::from_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"Large CJK query took {:?} (>1s)",
elapsed
);
assert!(!qt.is_empty(), "CJK query should produce tokens");
}
#[test]
fn stress_unicode_many_emoji() {
let emoji_query = "🚀 🔍 📝 💻 🎯 ".repeat(500);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&emoji_query);
let tokens = parse_boolean_query(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"Emoji query took {:?} (>1s)",
elapsed
);
assert!(
tokens.is_empty(),
"Emoji-only query should produce no tokens"
);
}
#[test]
fn stress_mixed_content_large() {
let mixed = r#"
function test() { return x + y; }
SELECT * FROM users WHERE id = 1;
The quick brown fox 狐狸 jumps over lazy dog
Error: "undefined is not a function" at line 42
https://example.com/path?query=value&other=123
"#
.repeat(100);
let start = std::time::Instant::now();
let sanitized = sanitize_query(&mixed);
let tokens = parse_boolean_query(&sanitized);
let qt = QueryTermsLower::from_query(&mixed);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(2),
"Mixed content query took {:?} (>2s)",
elapsed
);
assert!(!tokens.is_empty());
assert!(!qt.is_empty());
}
#[test]
fn unicode_emoji_mixed_with_alphanumeric() {
let tokens = parse_boolean_query("rocket🚀launch");
assert_eq!(tokens.len(), 1);
let sanitized = sanitize_query("rocket🚀launch");
assert_eq!(sanitized, "rocket launch");
let sanitized2 = sanitize_query("test🔥🎯code");
assert_eq!(sanitized2, "test code");
}
#[test]
fn unicode_emoji_with_boolean_operators() {
let tokens = parse_boolean_query("🚀code AND test");
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert!(term_count >= 1, "Should have at least one term");
let tokens_or = parse_boolean_query("deploy OR 🎯target");
let has_or = tokens_or.iter().any(|t| matches!(t, QueryToken::Or));
assert!(has_or, "Should detect OR operator");
}
#[test]
fn unicode_emoji_at_word_boundaries() {
let sanitized_start = sanitize_query("🔍search");
assert_eq!(sanitized_start, " search");
let sanitized_end = sanitize_query("complete✅");
assert_eq!(sanitized_end, "complete ");
let sanitized_only = sanitize_query("🎉🎊🎁");
assert!(
sanitized_only.trim().is_empty(),
"Emoji-only should be empty after trimming"
);
}
#[test]
fn unicode_arabic_text_preserved() {
let arabic = "مرحبا بالعالم"; let sanitized = sanitize_query(arabic);
assert_eq!(
sanitized, arabic,
"Arabic alphanumeric chars should be preserved"
);
let tokens = parse_boolean_query(arabic);
assert!(!tokens.is_empty(), "Arabic query should produce tokens");
}
#[test]
fn unicode_hebrew_text_preserved() {
let hebrew = "שלום עולם"; let sanitized = sanitize_query(hebrew);
assert_eq!(
sanitized, hebrew,
"Hebrew alphanumeric chars should be preserved"
);
let tokens = parse_boolean_query(hebrew);
assert!(!tokens.is_empty(), "Hebrew query should produce tokens");
}
#[test]
fn unicode_mixed_rtl_and_ltr() {
let mixed = "hello مرحبا world";
let sanitized = sanitize_query(mixed);
assert_eq!(sanitized, mixed, "Mixed RTL/LTR should be preserved");
let tokens = parse_boolean_query(mixed);
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(term_count, 3, "Should have 3 terms");
}
#[test]
fn unicode_rtl_with_boolean_operators() {
let hebrew_and = "שלום AND עולם";
let tokens = parse_boolean_query(hebrew_and);
let has_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
assert!(has_and, "Should detect AND operator in Hebrew query");
let arabic_not = "مرحبا NOT بالعالم";
let tokens_not = parse_boolean_query(arabic_not);
let has_not = tokens_not.iter().any(|t| matches!(t, QueryToken::Not));
assert!(has_not, "Should detect NOT operator in Arabic query");
}
#[test]
fn special_chars_backslash_stripped() {
let query = r"path\to\file";
let sanitized = sanitize_query(query);
assert_eq!(sanitized, "path to file");
}
#[test]
fn special_chars_escaped_quotes_handling() {
let query = r#"say \"hello\""#;
let sanitized = sanitize_query(query);
assert!(sanitized.contains('"'), "Quotes should be preserved");
}
#[test]
fn special_chars_windows_paths() {
let path = r"C:\Users\test\Documents";
let sanitized = sanitize_query(path);
assert_eq!(sanitized, "C Users test Documents");
}
#[test]
fn boolean_deeply_nested_operators() {
let query = "a AND b OR c NOT d AND e";
let tokens = parse_boolean_query(query);
let mut and_count = 0;
let mut or_count = 0;
let mut not_count = 0;
for token in &tokens {
match token {
QueryToken::And => and_count += 1,
QueryToken::Or => or_count += 1,
QueryToken::Not => not_count += 1,
_ => {}
}
}
assert_eq!(and_count, 2, "Should have 2 AND operators");
assert_eq!(or_count, 1, "Should have 1 OR operator");
assert_eq!(not_count, 1, "Should have 1 NOT operator");
}
#[test]
fn boolean_consecutive_operators_degenerate() {
let tokens = parse_boolean_query("foo AND AND bar");
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert!(
term_count >= 2,
"Should have at least 2 terms (foo and bar)"
);
}
#[test]
fn boolean_operator_at_start() {
let tokens = parse_boolean_query("AND foo");
let has_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
assert!(has_and, "Leading AND should be detected");
let tokens_or = parse_boolean_query("OR test");
let has_or = tokens_or.iter().any(|t| matches!(t, QueryToken::Or));
assert!(has_or, "Leading OR should be detected");
}
#[test]
fn boolean_operator_at_end() {
let tokens = parse_boolean_query("foo AND");
let has_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
assert!(has_and, "Trailing AND should be detected");
}
#[test]
fn numeric_query_digits_only() {
let tokens = parse_boolean_query("12345");
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], QueryToken::Term("12345".to_string()));
let sanitized = sanitize_query("12345");
assert_eq!(sanitized, "12345");
}
#[test]
fn numeric_query_with_text() {
let tokens = parse_boolean_query("error 404 not found");
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert!(term_count >= 3, "Should have at least 3 terms");
}
#[test]
fn numeric_versions_with_dots() {
let sanitized = sanitize_query("version 1.2.3");
assert_eq!(sanitized, "version 1 2 3"); }
#[test]
fn whitespace_tabs_treated_as_separators() {
let tokens = parse_boolean_query("foo\tbar\tbaz");
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(term_count, 3, "Tabs should separate terms");
}
#[test]
fn whitespace_newlines_treated_as_separators() {
let tokens = parse_boolean_query("foo\nbar\nbaz");
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(term_count, 3, "Newlines should separate terms");
}
#[test]
fn whitespace_mixed_types() {
let tokens = parse_boolean_query("a \t b \n c d");
let term_count = tokens
.iter()
.filter(|t| matches!(t, QueryToken::Term(_)))
.count();
assert_eq!(term_count, 4, "Mixed whitespace should separate properly");
}
#[test]
fn stress_very_long_single_term() {
let long_term = "a".repeat(10_000);
let start = std::time::Instant::now();
let tokens = parse_boolean_query(&long_term);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"10K char term took {:?} (>1s)",
elapsed
);
assert_eq!(tokens.len(), 1);
assert!(
matches!(tokens.first(), Some(QueryToken::Term(t)) if t.len() == 10_000),
"Expected 10K Term token, got {tokens:?}"
);
}
#[test]
fn stress_very_long_term_with_wildcard() {
let long_pattern = format!("{}*", "prefix".repeat(1000));
let start = std::time::Instant::now();
let sanitized = sanitize_query(&long_pattern);
let pattern = WildcardPattern::parse(&sanitized);
let elapsed = start.elapsed();
assert!(
elapsed < std::time::Duration::from_secs(1),
"Long wildcard pattern took {:?} (>1s)",
elapsed
);
assert!(
matches!(pattern, WildcardPattern::Prefix(_)),
"Should parse as prefix pattern"
);
}
#[test]
fn query_explanation_empty_query() {
let explanation = QueryExplanation::analyze("", &SearchFilters::default());
assert_eq!(explanation.query_type, QueryType::Empty);
}
#[test]
fn search_mode_default_is_hybrid_preferred() {
assert_eq!(SearchMode::default(), SearchMode::Hybrid);
}
#[test]
fn query_explanation_whitespace_only_query() {
let explanation = QueryExplanation::analyze(" \t\n ", &SearchFilters::default());
assert_eq!(explanation.query_type, QueryType::Empty);
}
#[test]
fn query_explanation_unicode_query() {
let explanation = QueryExplanation::analyze("日本語 search", &SearchFilters::default());
assert!(!explanation.parsed.terms.is_empty());
}
#[test]
fn query_terms_lower_unicode_normalization() {
let terms = QueryTermsLower::from_query("CAFÉ RÉSUMÉ");
assert_eq!(terms.query_lower, "café résumé");
}
#[test]
fn query_terms_lower_mixed_case_unicode() {
let terms = QueryTermsLower::from_query("Hello日本語World");
assert!(terms.query_lower.contains("hello"));
assert!(terms.query_lower.contains("world"));
}
#[test]
fn query_terms_lower_preserves_numbers() {
let terms = QueryTermsLower::from_query("ABC123XYZ");
assert_eq!(terms.query_lower, "abc123xyz");
}
#[test]
fn wildcard_pattern_internal_asterisk() {
let pattern = WildcardPattern::parse("f*o");
assert!(
matches!(pattern, WildcardPattern::Complex(_)),
"Internal asterisk should be Complex"
);
}
#[test]
fn wildcard_pattern_multiple_internal_asterisks() {
let pattern = WildcardPattern::parse("a*b*c");
assert!(
matches!(pattern, WildcardPattern::Complex(_)),
"Multiple internal asterisks should be Complex"
);
}
#[test]
fn wildcard_pattern_regex_escapes_special_chars() {
let pattern = WildcardPattern::parse("*foo.bar*");
if let Some(regex) = pattern.to_regex() {
assert!(
regex.contains("\\."),
"Dot should be escaped in regex: {}",
regex
);
}
}
#[test]
fn wildcard_pattern_complex_regex_generation() {
let pattern = WildcardPattern::parse("f*o*o");
if let Some(regex) = pattern.to_regex() {
assert!(
regex.contains(".*"),
"Should have .* for internal wildcards: {}",
regex
);
}
}
#[test]
fn test_transpile_to_fts5() {
assert_eq!(
transpile_to_fts5("foo bar"),
Some("foo AND bar".to_string())
);
assert_eq!(
transpile_to_fts5("foo AND bar"),
Some("foo AND bar".to_string())
);
assert_eq!(
transpile_to_fts5("foo OR bar"),
Some("(foo OR bar)".to_string())
);
assert_eq!(transpile_to_fts5("OR foo"), Some("foo".to_string()));
assert_eq!(transpile_to_fts5("NOT foo"), None);
assert_eq!(
transpile_to_fts5("A AND B OR C"),
Some("A AND (B OR C)".to_string())
);
assert_eq!(
transpile_to_fts5("A OR B AND C"),
Some("(A OR B) AND C".to_string())
);
assert_eq!(
transpile_to_fts5("A OR B OR C"),
Some("(A OR B OR C)".to_string())
);
assert_eq!(
transpile_to_fts5("\"foo bar\""),
Some("\"foo bar\"".to_string())
);
assert_eq!(transpile_to_fts5("foo*"), Some("foo*".to_string()));
assert_eq!(transpile_to_fts5("*foo"), None);
assert_eq!(transpile_to_fts5("f*o"), None);
assert_eq!(
transpile_to_fts5("foo-bar"),
Some("(foo AND bar)".to_string())
);
assert_eq!(
transpile_to_fts5("foo-bar*"),
Some("(foo AND bar*)".to_string())
);
assert_eq!(
transpile_to_fts5("br-123.jsonl"),
Some("(br AND 123 AND jsonl)".to_string())
);
assert_eq!(
transpile_to_fts5("br-123.json*"),
Some("(br AND 123 AND json*)".to_string())
);
assert_eq!(transpile_to_fts5("NOT A OR B"), None);
}
#[test]
fn semantic_doc_id_roundtrip_from_query() {
let hash_hex = "00".repeat(32);
let doc_id = format!("m|42|2|3|7|11|1|1700000000000|{hash_hex}");
let parsed = parse_semantic_doc_id(&doc_id).expect("roundtrip parse");
assert_eq!(parsed.message_id, 42);
assert_eq!(parsed.chunk_idx, 2);
assert_eq!(parsed.agent_id, 3);
assert_eq!(parsed.workspace_id, 7);
assert_eq!(parsed.source_id, 11);
assert_eq!(parsed.role, 1);
assert_eq!(parsed.created_at_ms, 1_700_000_000_000);
}
#[test]
fn semantic_filter_applies_all_constraints() {
use frankensearch::core::filter::SearchFilter;
let filter = SemanticFilter {
agents: Some(HashSet::from([3])),
workspaces: Some(HashSet::from([7])),
sources: Some(HashSet::from([11])),
roles: Some(HashSet::from([1])),
created_from: Some(1_700_000_000_000),
created_to: Some(1_700_000_000_100),
};
assert!(filter.matches("m|42|2|3|7|11|1|1700000000001", None));
assert!(!filter.matches("m|42|2|99|7|11|1|1700000000001", None));
assert!(!filter.matches("m|42|2|3|7|11|1|1699999999999", None));
assert!(!filter.matches("not-a-doc-id", None));
}
#[test]
fn fs_semantic_index_runs_filtered_search() -> Result<()> {
let temp = TempDir::new()?;
let index_path = crate::search::vector_index::vector_index_path(temp.path(), "embed-fast");
if let Some(parent) = index_path.parent() {
std::fs::create_dir_all(parent)?;
}
let hash_a = "00".repeat(32);
let hash_b = "11".repeat(32);
let doc_a = format!("m|101|0|1|10|100|1|1700000000001|{hash_a}");
let doc_b = format!("m|202|0|2|20|200|1|1700000000002|{hash_b}");
let mut writer = VectorIndex::create_with_revision(
&index_path,
"embed-fast",
"rev-1",
2,
frankensearch::index::Quantization::F16,
)
.map_err(|err| anyhow!("create fsvi index failed: {err}"))?;
writer
.write_record(&doc_a, &[1.0, 0.0])
.map_err(|err| anyhow!("write_record failed: {err}"))?;
writer
.write_record(&doc_b, &[0.0, 1.0])
.map_err(|err| anyhow!("write_record failed: {err}"))?;
writer
.finish()
.map_err(|err| anyhow!("finish fsvi index failed: {err}"))?;
let fs_index =
VectorIndex::open(&index_path).map_err(|err| anyhow!("open fsvi failed: {err}"))?;
let filter = SemanticFilter {
agents: Some(HashSet::from([1])),
workspaces: None,
sources: None,
roles: None,
created_from: None,
created_to: None,
};
let fs_filter = semantic_filter_as_search_filter(&filter).expect("expected active filter");
let hits = fs_index
.search_top_k(&[1.0, 0.0], 5, Some(fs_filter))
.map_err(|err| anyhow!("frankensearch search failed: {err}"))?;
assert_eq!(hits.len(), 1);
let parsed = parse_semantic_doc_id(&hits[0].doc_id).expect("parse bridged doc_id");
assert_eq!(parsed.message_id, 101);
assert_eq!(parsed.agent_id, 1);
Ok(())
}
#[test]
fn hit_is_noise_returns_false_when_content_and_snippet_both_empty() {
let hit = SearchHit {
title: String::new(),
snippet: String::new(),
content: String::new(),
content_hash: 0,
conversation_id: Some(1),
score: 1.0,
source_path: "/tmp/session.jsonl".to_string(),
agent: "codex".to_string(),
workspace: String::new(),
workspace_original: None,
created_at: Some(1700000000000),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".to_string(),
origin_kind: "local".to_string(),
origin_host: None,
};
assert!(
!hit_is_noise(&hit, "anything"),
"hit with empty content AND snippet (projection-only) must NOT be classified as noise"
);
assert!(
!hit_is_noise(&hit, ""),
"noise classifier must not treat an empty-query projection-only hit as noise"
);
}
#[test]
fn hit_is_noise_still_drops_tool_acknowledgement_when_content_present() {
let hit = SearchHit {
title: String::new(),
snippet: String::new(),
content: "ok".to_string(),
content_hash: 0,
conversation_id: Some(1),
score: 1.0,
source_path: "/tmp/session.jsonl".to_string(),
agent: "codex".to_string(),
workspace: String::new(),
workspace_original: None,
created_at: Some(1700000000000),
line_number: Some(1),
match_type: MatchType::Exact,
source_id: "local".to_string(),
origin_kind: "local".to_string(),
origin_host: None,
};
assert!(
hit_is_noise(&hit, ""),
"bare tool-ack 'ok' with content present should still be dropped as noise"
);
}
}