1use anyhow::{Context, Result, anyhow, bail};
2use crossbeam_channel as mpsc;
3use frankensearch::lexical::{
4 BooleanQuery, CASS_SCHEMA_HASH as FS_CASS_SCHEMA_HASH, CassFields as FsCassFields,
5 CassQueryFilters as FsCassQueryFilters, CassQueryToken as FsCassQueryToken,
6 CassSourceFilter as FsCassSourceFilter, CassWildcardPattern as FsCassWildcardPattern, Count,
7 IndexReader, IndexRecordOption, LexicalDocHit as FsLexicalDocHit,
8 LexicalSearchResult as FsLexicalSearchResult, Occur, Query, ReloadPolicy, Searcher,
9 SnippetConfig as FsSnippetConfig, TantivyDocument, Term, TermQuery, TopDocs, Value,
10 cass_build_tantivy_query as fs_cass_build_tantivy_query,
11 cass_has_boolean_operators as fs_cass_has_boolean_operators,
12 cass_open_search_reader as fs_cass_open_search_reader,
13 cass_parse_boolean_query as fs_cass_parse_boolean_query,
14 cass_sanitize_query as fs_cass_sanitize_query, load_doc as fs_load_doc,
15 render_snippet_html as fs_render_snippet_html,
16 try_build_snippet_generator as fs_try_build_snippet_generator,
17};
18use frankensearch::{
19 Cx as FsCx, InMemoryTwoTierIndex as FsInMemoryTwoTierIndex,
20 InMemoryVectorIndex as FsInMemoryVectorIndex, LexicalSearch as FsLexicalSearch,
21 QueryClass as FsQueryClass, RrfConfig as FsRrfConfig, ScoreSource as FsScoreSource,
22 ScoredResult as FsScoredResult, SearchError as FsSearchError, SearchFuture as FsSearchFuture,
23 SearchPhase as FsSearchPhase, SyncEmbedderAdapter as FsSyncEmbedderAdapter,
24 SyncTwoTierSearcher as FsSyncTwoTierSearcher, TwoTierConfig as FsTwoTierConfig,
25 TwoTierIndex as FsTwoTierIndex, TwoTierSearcher as FsTwoTierSearcher, VectorHit as FsVectorHit,
26 candidate_count as fs_candidate_count,
27 core::filter::SearchFilter as FsSearchFilter,
28 index::{
29 HNSW_DEFAULT_EF_SEARCH as FS_HNSW_DEFAULT_EF_SEARCH, HnswIndex as FsHnswIndex,
30 VectorIndex as FsVectorIndex,
31 },
32 rrf_fuse as fs_rrf_fuse,
33};
34use lru::LruCache;
35use once_cell::sync::Lazy;
36use parking_lot::RwLock;
37use std::cell::RefCell;
38use std::cmp::Ordering as CmpOrdering;
39use std::collections::{HashMap, HashSet, VecDeque};
40use std::hash::{Hash, Hasher};
41use std::num::NonZeroUsize;
42use std::path::{Path, PathBuf};
43use std::sync::atomic::{AtomicU64, Ordering};
44use std::sync::{Arc, Mutex};
45use std::time::{Duration, Instant};
46
47use frankensqlite::Connection;
48#[cfg(test)]
49use frankensqlite::compat::OptionalExtension;
50use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
51#[cfg(test)]
52use frankensqlite::params;
53
/// Newtype wrapper around a [`Connection`].
///
/// The wrapper exists so that `Send` can be asserted for the wrapped
/// connection (see the `unsafe impl Send` for this type) while still exposing
/// the connection API through `Deref`.
struct SendConnection(Connection);
62
/// Exact lookup key for message content: `(conversation_id, line_idx)`, as
/// destructured by `hydrate_message_content_by_conversation`.
type TantivyContentExactKey = (i64, i64);
/// Fallback lookup key used when no exact key is available.
// NOTE(review): the meaning of the two strings and the integer is not visible
// in this part of the file — confirm against the code that builds these keys.
type TantivyContentFallbackKey = (String, String, i64);
/// Pair of content maps: one keyed by exact key, one keyed by fallback key.
type TantivyHydratedContentMaps = (
    HashMap<TantivyContentExactKey, String>,
    HashMap<TantivyContentFallbackKey, String>,
);
/// One row of the SQLite FTS hydration query, as a positional tuple.
// NOTE(review): column meanings are defined by the query that produces the
// row, which is not visible in this part of the file.
type SqliteFtsHydratedRow = (
    i64,
    String,
    String,
    String,
    String,
    String,
    Option<i64>,
    Option<i64>,
    Option<i64>,
    Option<String>,
    Option<String>,
    Option<String>,
);
83
/// Number of bound parameters per chunk when hydrating FTS5 results.
/// Kept below [`SQLITE_MAX_VARIABLE_NUMBER`].
const SQLITE_FTS5_HYDRATE_PARAM_CHUNK: usize = 30_000;
/// Upper bound on bound parameters in a single SQLite statement.
// NOTE(review): 32_766 matches SQLite's documented default for
// SQLITE_MAX_VARIABLE_NUMBER since 3.32.0 — confirm the engine in use honours it.
const SQLITE_MAX_VARIABLE_NUMBER: usize = 32_766;
/// Page-cache size, in KiB, applied to hydration connections via
/// `PRAGMA cache_size = -<KiB>` (a negative value means "size in KiB").
const SEARCH_SQLITE_HYDRATION_CACHE_KIB: i64 = 4_096;
90
// SAFETY: NOTE(review) — this asserts that moving a `Connection` to another
// thread is sound. Nothing visible in this part of the file proves that;
// presumably each `SendConnection` is only ever used by one thread at a time.
// Confirm against the call sites and the `frankensqlite` threading contract.
unsafe impl Send for SendConnection {}
94
95impl std::ops::Deref for SendConnection {
96 type Target = Connection;
97 fn deref(&self) -> &Connection {
98 &self.0
99 }
100}
101
102fn open_search_hydration_sqlite(path: &Path, timeout: Duration) -> Result<Connection> {
103 let conn =
104 crate::storage::sqlite::open_franken_raw_readonly_connection_with_timeout(path, timeout)?;
105 conn.execute("PRAGMA query_only = 1;")
106 .with_context(|| "setting search hydration query_only")?;
107 conn.execute("PRAGMA busy_timeout = 5000;")
108 .with_context(|| "setting search hydration busy_timeout")?;
109 conn.execute(&format!(
110 "PRAGMA cache_size = -{SEARCH_SQLITE_HYDRATION_CACHE_KIB};"
111 ))
112 .with_context(|| "setting search hydration cache_size")?;
113 Ok(conn)
114}
115
116fn nfc_sanitize_query(raw: &str) -> String {
120 use unicode_normalization::UnicodeNormalization;
121 let nfc: String = raw.nfc().collect();
122 fs_cass_sanitize_query(&nfc)
123}
124
125fn franken_query_map_collect_retry<T, F>(
126 conn: &Connection,
127 sql: &str,
128 params: &[ParamValue],
129 map: F,
130) -> Result<Vec<T>, frankensqlite::FrankenError>
131where
132 F: Copy + Fn(&frankensqlite::Row) -> Result<T, frankensqlite::FrankenError>,
133{
134 let deadline = Instant::now() + Duration::from_secs(2);
135 let mut backoff = Duration::from_millis(4);
136 loop {
137 match conn.query_map_collect(sql, params, |row| map(row)) {
138 Ok(values) => return Ok(values),
139 Err(err) if crate::storage::sqlite::retryable_franken_error(&err) => {
140 let now = Instant::now();
141 if now >= deadline {
142 return Err(err);
143 }
144 let remaining = deadline.saturating_duration_since(now);
145 crate::storage::sqlite::sleep_with_franken_retry_backoff(
146 &mut backoff,
147 remaining,
148 Duration::from_millis(64),
149 );
150 }
151 Err(err) => return Err(err),
152 }
153 }
154}
155
156fn hydrate_message_content_by_conversation(
157 conn: &Connection,
158 requests: &[TantivyContentExactKey],
159) -> Result<HashMap<TantivyContentExactKey, String>> {
160 if requests.is_empty() {
161 return Ok(HashMap::new());
162 }
163
164 let mut wanted_by_conversation: HashMap<i64, HashSet<i64>> = HashMap::new();
165 for &(conversation_id, line_idx) in requests {
166 wanted_by_conversation
167 .entry(conversation_id)
168 .or_default()
169 .insert(line_idx);
170 }
171
172 let mut conversation_ids = wanted_by_conversation.keys().copied().collect::<Vec<_>>();
173 conversation_ids.sort_unstable();
174 let mut hydrated = HashMap::with_capacity(requests.len());
175
176 for conversation_id in conversation_ids {
177 let Some(wanted_indices) = wanted_by_conversation.get(&conversation_id) else {
178 continue;
179 };
180 let mut wanted_indices = wanted_indices.iter().copied().collect::<Vec<_>>();
181 wanted_indices.sort_unstable();
182 let placeholders = sql_placeholders(wanted_indices.len());
183 let sql = format!(
184 "SELECT m.conversation_id, m.idx, m.content
185 FROM messages m INDEXED BY sqlite_autoindex_messages_1
186 WHERE m.conversation_id = ? AND m.idx IN ({placeholders})
187 ORDER BY m.idx"
188 );
189 let mut params = Vec::with_capacity(wanted_indices.len() + 1);
190 params.push(ParamValue::from(conversation_id));
191 params.extend(wanted_indices.iter().copied().map(ParamValue::from));
192 let rows: Vec<(i64, i64, String)> =
193 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
194 Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?))
195 })?;
196 for (conversation_id, line_idx, content) in rows {
197 hydrated.insert((conversation_id, line_idx), content);
198 }
199 }
200
201 Ok(hydrated)
202}
203
/// Converts a database `message_id` into the unsigned id used for semantic
/// documents.
///
/// # Errors
/// Returns an `io::Error` with the message `negative message_id` when the
/// value is below zero.
fn semantic_message_id_from_db(message_id: i64) -> std::io::Result<u64> {
    match u64::try_from(message_id) {
        Ok(id) => Ok(id),
        Err(_) => Err(std::io::Error::other("negative message_id")),
    }
}
207
/// Converts an optional database integer into a `u32` component id.
///
/// `None` and non-positive values map to `0`; values above `u32::MAX`
/// saturate to `u32::MAX`.
fn semantic_doc_component_id_from_db(raw: Option<i64>) -> u32 {
    match raw {
        None => 0,
        Some(value) if value <= 0 => 0,
        Some(value) => u32::try_from(value).unwrap_or(u32::MAX),
    }
}
212
213use crate::search::canonicalize::{canonicalize_for_embedding, content_hash, is_search_noise_text};
214use crate::search::embedder::Embedder;
215use crate::search::vector_index::{
216 ROLE_USER, SemanticDocId, SemanticFilter, SemanticFilterMaps, VectorIndex, VectorSearchResult,
217 parse_semantic_doc_id, role_code_from_str,
218};
219use crate::sources::provenance::SourceFilter;
220
221pub struct StringInterner {
232 cache: RwLock<LruCache<Arc<str>, Arc<str>>>,
233}
234
235impl StringInterner {
236 pub fn new(capacity: usize) -> Self {
238 Self {
239 cache: RwLock::new(LruCache::new(
240 NonZeroUsize::new(capacity).expect("capacity must be > 0"),
241 )),
242 }
243 }
244
245 pub fn intern(&self, s: &str) -> Arc<str> {
251 {
253 let cache = self.cache.read();
254 if let Some(arc) = cache.peek(s) {
257 return Arc::clone(arc);
258 }
259 }
260
261 let mut cache = self.cache.write();
263
264 if let Some(arc) = cache.get(s) {
267 return Arc::clone(arc);
268 }
269
270 let arc: Arc<str> = Arc::from(s);
272 cache.put(Arc::clone(&arc), Arc::clone(&arc));
273 arc
274 }
275
276 #[allow(dead_code)]
278 pub fn len(&self) -> usize {
279 self.cache.read().len()
280 }
281
282 #[allow(dead_code)]
284 pub fn is_empty(&self) -> bool {
285 self.cache.read().is_empty()
286 }
287}
288
/// Process-wide interner used for cache keys, created lazily on first use
/// with room for 10,000 entries.
static CACHE_KEY_INTERNER: Lazy<StringInterner> = Lazy::new(|| StringInterner::new(10_000));
292
293#[inline]
295fn intern_cache_key(s: &str) -> Arc<str> {
296 CACHE_KEY_INTERNER.intern(s)
297}
298
/// Builds a comma-separated list of `count` SQL `?` placeholders.
///
/// Returns an empty string for `count == 0`; otherwise `"?,?,...,?"` with no
/// trailing comma and no spaces.
#[inline]
pub fn sql_placeholders(count: usize) -> String {
    let Some(separators) = count.checked_sub(1) else {
        return String::new();
    };

    // One byte per `?` plus one per separating comma.
    let mut out = String::with_capacity(count.saturating_add(separators));
    out.push('?');
    for _ in 0..separators {
        out.push_str(",?");
    }
    out
}
330
/// Filters that narrow a search. The default value applies no filtering.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct SearchFilters {
    /// Agents to restrict to; empty means no agent restriction.
    pub agents: HashSet<String>,
    /// Workspaces to restrict to; empty means no workspace restriction.
    pub workspaces: HashSet<String>,
    /// Lower bound on creation time, if any.
    // NOTE(review): timestamp unit and inclusivity are not visible here.
    pub created_from: Option<i64>,
    /// Upper bound on creation time, if any.
    pub created_to: Option<i64>,
    /// Source filter; omitted from serialized output when it matches all sources.
    #[serde(skip_serializing_if = "SourceFilter::is_all")]
    pub source_filter: SourceFilter,
    /// Session paths to restrict to; omitted from serialized output when empty.
    #[serde(skip_serializing_if = "HashSet::is_empty")]
    pub session_paths: HashSet<String>,
}
344
// Which retrieval strategy a search uses. Serialized in snake_case and also
// usable as a CLI value enum. `Hybrid` is the default.
//
// Plain `//` comments are used on purpose: doc comments on a
// `clap::ValueEnum` are picked up as CLI help text.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, clap::ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum SearchMode {
    // Lexical search only.
    Lexical,
    // Semantic search only.
    Semantic,
    // Lexical and semantic combined (default).
    #[default]
    Hybrid,
}
356
357impl SearchMode {
358 pub fn next(self) -> Self {
359 match self {
360 SearchMode::Lexical => SearchMode::Semantic,
361 SearchMode::Semantic => SearchMode::Hybrid,
362 SearchMode::Hybrid => SearchMode::Lexical,
363 }
364 }
365}
366
/// How the semantic tiers are used for a search. Serialized in snake_case;
/// `Single` is the default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SemanticTierMode {
    /// Single-tier search; the only mode that does not request two-tier search.
    #[default]
    Single,
    /// Two-tier search with the base two-tier configuration unchanged.
    Progressive,
    /// Two-tier search with the `fast_only` option enabled.
    FastOnly,
    /// Two-tier search with `fast_only` disabled and `quality_weight` set to 1.0.
    QualityOnly,
}
382
383impl SemanticTierMode {
384 const fn wants_two_tier(self) -> bool {
385 !matches!(self, Self::Single)
386 }
387
388 fn to_frankensearch_config(self) -> FsTwoTierConfig {
389 let mut config = frankensearch_two_tier_config();
390 match self {
391 Self::Single | Self::Progressive => {}
392 Self::FastOnly => {
393 config.fast_only = true;
394 }
395 Self::QualityOnly => {
396 config.fast_only = false;
397 config.quality_weight = 1.0;
398 }
399 }
400 config
401 }
402}
403
/// Capacity of the progressive-search embedding cache.
// NOTE(review): the cache itself is not visible in this part of the file.
const PROGRESSIVE_EMBEDDING_CACHE_CAPACITY: usize = 64;
/// Multiplier applied to the ANN candidate count.
// NOTE(review): usage is not visible in this part of the file.
const ANN_CANDIDATE_MULTIPLIER: usize = 4;
/// Minimum planning window for the semantic stage of a hybrid search that was
/// requested with no limit (see `hybrid_candidate_budget`).
const HYBRID_NO_LIMIT_PLANNING_WINDOW: usize = 64;
/// Baseline ceiling on semantic candidates for a no-limit hybrid search; the
/// effective ceiling grows when the offset plus planning window exceeds it.
const HYBRID_NO_LIMIT_SEMANTIC_CAP: usize = 2048;
/// Maximum token length, in characters, for automatic wildcard fallback.
// NOTE(review): usage is not visible in this part of the file.
const AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS: usize = 16;
409
/// Smallest value the no-limit result cap can take.
pub const NO_LIMIT_RESULT_MIN: usize = 1_000;
/// Largest value the no-limit result cap can take.
pub const NO_LIMIT_RESULT_MAX: usize = 1_000_000;

/// Per-hit memory estimate (80 KiB) used to turn a byte budget into a hit count.
const AVG_HIT_BYTES: u64 = 80 * 1024;

/// Upper clamp (16 GiB) on the byte budget derived from available memory.
const NO_LIMIT_BYTES_CEILING: u64 = 16 * 1024 * 1024 * 1024;

/// Lower clamp (256 MiB) on the byte budget derived from available memory;
/// also the budget used when available memory cannot be determined.
const NO_LIMIT_BYTES_FLOOR: u64 = 256 * 1024 * 1024;

/// Available memory is divided by this value to obtain the byte budget.
const NO_LIMIT_RAM_DIVISOR: u64 = 16;
455
/// Reads the `MemAvailable` value from `/proc/meminfo` and returns it in bytes.
///
/// Returns `None` if the file cannot be read, has no `MemAvailable:` line, or
/// the first such line does not start with an unsigned integer. The value is
/// treated as KiB and multiplied by 1024 with saturation.
fn available_memory_bytes() -> Option<u64> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    let rest = meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemAvailable:"))?;
    let kib = rest.split_whitespace().next()?.parse::<u64>().ok()?;
    Some(kib.saturating_mul(1024))
}
466
467fn no_limit_result_cap() -> usize {
468 static CAP: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
469 *CAP.get_or_init(|| {
470 compute_no_limit_result_cap_from(
471 std::env::var("CASS_SEARCH_NO_LIMIT_CAP").ok(),
472 std::env::var("CASS_SEARCH_NO_LIMIT_BYTES").ok(),
473 available_memory_bytes(),
474 )
475 })
476}
477
478fn compute_no_limit_result_cap_from(
485 cap_env: Option<String>,
486 bytes_env: Option<String>,
487 available_bytes: Option<u64>,
488) -> usize {
489 if let Some(hits) = cap_env
493 .and_then(|v| v.parse::<usize>().ok())
494 .filter(|v| *v > 0)
495 {
496 return hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
497 }
498
499 let budget_bytes = no_limit_budget_bytes(bytes_env, available_bytes);
500 let hits = (budget_bytes / AVG_HIT_BYTES) as usize;
501 hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX)
502}
503
504fn no_limit_budget_bytes(bytes_env: Option<String>, available_bytes: Option<u64>) -> u64 {
505 bytes_env
506 .and_then(|v| v.parse::<u64>().ok())
507 .filter(|v| *v > 0)
508 .or_else(|| no_limit_available_memory_budget(available_bytes))
509 .unwrap_or(NO_LIMIT_BYTES_FLOOR)
510}
511
512fn no_limit_available_memory_budget(available_bytes: Option<u64>) -> Option<u64> {
513 available_bytes.map(|avail| {
514 (avail / NO_LIMIT_RAM_DIVISOR).clamp(NO_LIMIT_BYTES_FLOOR, NO_LIMIT_BYTES_CEILING)
515 })
516}
517
/// Base two-tier configuration, built lazily on first use from
/// `FsTwoTierConfig::optimized()` with environment overrides applied.
static FRANKENSEARCH_TWO_TIER_CONFIG: Lazy<FsTwoTierConfig> =
    Lazy::new(|| FsTwoTierConfig::optimized().with_env_overrides());
520
521fn frankensearch_two_tier_config() -> FsTwoTierConfig {
522 FRANKENSEARCH_TWO_TIER_CONFIG.clone()
523}
524
/// Number of results to fetch per progressive phase: three times `limit`,
/// saturating, with a `limit` of zero treated as one.
#[inline]
const fn progressive_phase_fetch_limit(limit: usize) -> usize {
    match limit {
        0 => 3,
        requested => requested.saturating_mul(3),
    }
}
530
/// Number of candidates to request from each stage of a hybrid search.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct HybridCandidateBudget {
    /// Candidates to request from the lexical stage.
    lexical_candidates: usize,
    /// Candidates to request from the semantic stage.
    semantic_candidates: usize,
}
536
537#[inline]
538const fn hybrid_stage_multipliers(query_class: FsQueryClass) -> (usize, usize) {
539 match query_class {
540 FsQueryClass::Identifier => (6, 2),
542 FsQueryClass::ShortKeyword => (4, 4),
544 FsQueryClass::NaturalLanguage => (2, 8),
546 FsQueryClass::Empty => (0, 0),
548 }
549}
550
551#[inline]
552fn hybrid_candidate_budget(
553 query: &str,
554 requested_limit: usize,
555 effective_limit: usize,
556 offset: usize,
557 total_docs: usize,
558) -> HybridCandidateBudget {
559 let query_class = FsQueryClass::classify(query);
560 let (lex_mult, sem_mult) = hybrid_stage_multipliers(query_class);
561 let total_docs = total_docs.max(1);
562
563 if requested_limit == 0 {
566 let planning_window = HYBRID_NO_LIMIT_PLANNING_WINDOW.max(offset.saturating_add(1));
567 let lexical = effective_limit.min(total_docs).min(no_limit_result_cap());
572 let semantic = fs_candidate_count(planning_window, 0, sem_mult)
580 .max(planning_window)
581 .min(HYBRID_NO_LIMIT_SEMANTIC_CAP.max(offset.saturating_add(planning_window)))
582 .min(total_docs)
583 .min(lexical);
584 return HybridCandidateBudget {
585 lexical_candidates: lexical,
586 semantic_candidates: semantic,
587 };
588 }
589
590 let lexical = fs_candidate_count(requested_limit, offset, lex_mult.max(1))
591 .max(requested_limit.saturating_add(offset))
592 .min(total_docs);
593 let semantic = fs_candidate_count(requested_limit, offset, sem_mult.max(1))
594 .max(requested_limit.saturating_add(offset))
595 .min(total_docs);
596
597 HybridCandidateBudget {
598 lexical_candidates: lexical,
599 semantic_candidates: semantic,
600 }
601}
602
/// High-level classification of a query, as assigned by
/// `QueryExplanation::classify_query`. Serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryType {
    /// Plain terms with no operators, phrases, wildcards or filters.
    Simple,
    /// Contains at least one quoted phrase.
    Phrase,
    /// Contains at least one explicit boolean operator.
    Boolean,
    /// Contains at least one non-exact (wildcard) subterm.
    Wildcard,
    /// At least one agent, workspace, time or source filter is active.
    Filtered,
    /// The sanitized query is empty.
    Empty,
}
624
/// Index access strategy reported for a query, as chosen by
/// `QueryExplanation::determine_strategy`. Serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum IndexStrategy {
    /// Default for a single plain term.
    EdgeNgram,
    /// Chosen when any subterm has a leading wildcard (suffix or substring).
    RegexScan,
    /// Chosen for operators, multiple terms, phrases or compound terms.
    BooleanCombination,
    /// Range-based scan.
    // NOTE(review): `determine_strategy` never produces this variant in the
    // visible code.
    RangeScan,
    /// Chosen when the sanitized query is empty.
    FullScan,
}
640
/// Coarse cost estimate for a query, as computed by
/// `QueryExplanation::estimate_cost`. Serialized in snake_case.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryCost {
    /// Complexity score of 2 or less and no time filter.
    Low,
    /// Complexity score between 3 and 6 and no time filter.
    Medium,
    /// Regex or full scan, a time filter, or a complexity score above 6.
    High,
}
652
/// One whitespace-separated piece of a sanitized query term.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedSubTerm {
    /// The sanitized text of this piece.
    pub text: String,
    /// Label of the wildcard pattern: `"exact"`, `"prefix (*)"`,
    /// `"suffix (*)"`, `"substring (*)"` or `"complex (*)"`.
    pub pattern: String,
}
659
/// A term token from the parsed query.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedTerm {
    /// The term text as produced by the boolean-query parser (not sanitized).
    pub text: String,
    /// Whether a `NOT` operator preceded this term.
    pub negated: bool,
    /// The sanitized, whitespace-separated pieces of the term.
    pub subterms: Vec<ParsedSubTerm>,
}
670
/// Structured view of a query, as built by `QueryExplanation::analyze`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct ParsedQuery {
    /// Term tokens, in query order.
    pub terms: Vec<ParsedTerm>,
    /// Quoted phrases, sanitized, lowercased and space-joined.
    pub phrases: Vec<String>,
    /// Explicit operators in query order: `"AND"`, `"OR"` or `"NOT"`.
    pub operators: Vec<String>,
    /// True when there is more than one term and no explicit operator.
    pub implicit_and: bool,
}
683
/// Explanation of how a query will be interpreted, built by
/// `QueryExplanation::analyze`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QueryExplanation {
    /// The query exactly as supplied.
    pub original_query: String,
    /// The query after NFC normalization and sanitization.
    pub sanitized_query: String,
    /// Parsed terms, phrases and operators.
    pub parsed: ParsedQuery,
    /// High-level classification of the query.
    pub query_type: QueryType,
    /// Index access strategy chosen for the query.
    pub index_strategy: IndexStrategy,
    /// Whether automatic wildcard fallback was applied; `analyze` sets this
    /// to `false` and `with_wildcard_fallback` updates it.
    pub wildcard_applied: bool,
    /// Coarse cost estimate.
    pub estimated_cost: QueryCost,
    /// Summary of the active filters.
    pub filters_summary: FiltersSummary,
    /// Human-readable warnings about the query.
    pub warnings: Vec<String>,
}
706
/// Summary of the filters active for a query.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct FiltersSummary {
    /// Number of agents in the agent filter.
    pub agent_count: usize,
    /// Number of workspaces in the workspace filter.
    pub workspace_count: usize,
    /// Whether a lower or upper creation-time bound is set.
    pub has_time_filter: bool,
    /// `"Filtering by: ..."` text, or `None` when none of the above is active.
    pub description: Option<String>,
}
719
720impl QueryExplanation {
721 pub fn analyze(query: &str, filters: &SearchFilters) -> Self {
723 let sanitized = nfc_sanitize_query(query);
724 let tokens = fs_cass_parse_boolean_query(query);
726
727 let mut parsed = ParsedQuery::default();
729 let mut has_explicit_operator = false;
730 let mut next_negated = false;
731
732 for token in &tokens {
733 match token {
734 FsCassQueryToken::Term(t) => {
735 let parts: Vec<String> = nfc_sanitize_query(t)
736 .split_whitespace()
737 .map(|s| s.to_string())
738 .collect();
739 if parts.is_empty() {
740 next_negated = false;
741 continue;
742 }
743 let mut subterms = Vec::new();
744 for part in parts {
745 let pattern = FsCassWildcardPattern::parse(&part);
746 let pattern_str = match &pattern {
747 FsCassWildcardPattern::Exact(_) => "exact",
748 FsCassWildcardPattern::Prefix(_) => "prefix (*)",
749 FsCassWildcardPattern::Suffix(_) => "suffix (*)",
750 FsCassWildcardPattern::Substring(_) => "substring (*)",
751 FsCassWildcardPattern::Complex(_) => "complex (*)",
752 };
753 subterms.push(ParsedSubTerm {
754 text: part,
755 pattern: pattern_str.to_string(),
756 });
757 }
758 parsed.terms.push(ParsedTerm {
759 text: t.clone(),
760 negated: next_negated,
761 subterms,
762 });
763 next_negated = false;
764 }
765 FsCassQueryToken::Phrase(p) => {
766 let parts: Vec<String> = nfc_sanitize_query(p)
767 .split_whitespace()
768 .map(|s| s.trim_matches('*').to_lowercase())
769 .filter(|s| !s.is_empty())
770 .collect();
771 if !parts.is_empty() {
772 parsed.phrases.push(parts.join(" "));
773 }
774 next_negated = false;
775 }
776 FsCassQueryToken::And => {
777 parsed.operators.push("AND".to_string());
778 has_explicit_operator = true;
779 }
780 FsCassQueryToken::Or => {
781 parsed.operators.push("OR".to_string());
782 has_explicit_operator = true;
783 }
784 FsCassQueryToken::Not => {
785 parsed.operators.push("NOT".to_string());
786 has_explicit_operator = true;
787 next_negated = true;
788 }
789 }
790 }
791
792 parsed.implicit_and = !has_explicit_operator && parsed.terms.len() > 1;
794
795 let query_type = Self::classify_query(&parsed, filters, &sanitized);
797
798 let index_strategy = Self::determine_strategy(&parsed, &sanitized);
800
801 let estimated_cost = Self::estimate_cost(&parsed, &index_strategy, filters);
803
804 let filters_summary = Self::summarize_filters(filters);
806
807 let warnings = Self::generate_warnings(&parsed, &sanitized, filters);
809
810 Self {
811 original_query: query.to_string(),
812 sanitized_query: sanitized,
813 parsed,
814 query_type,
815 index_strategy,
816 wildcard_applied: false, estimated_cost,
818 filters_summary,
819 warnings,
820 }
821 }
822
823 fn classify_query(parsed: &ParsedQuery, filters: &SearchFilters, sanitized: &str) -> QueryType {
824 if sanitized.trim().is_empty() {
825 return QueryType::Empty;
826 }
827
828 let has_filters = !filters.agents.is_empty()
830 || !filters.workspaces.is_empty()
831 || filters.created_from.is_some()
832 || filters.created_to.is_some()
833 || !filters.source_filter.is_all();
834
835 if has_filters {
836 return QueryType::Filtered;
837 }
838
839 if !parsed.operators.is_empty() {
841 return QueryType::Boolean;
842 }
843
844 if !parsed.phrases.is_empty() {
846 return QueryType::Phrase;
847 }
848
849 let has_wildcards = parsed
851 .terms
852 .iter()
853 .flat_map(|t| &t.subterms)
854 .any(|t| t.pattern != "exact");
855 if has_wildcards {
856 return QueryType::Wildcard;
857 }
858
859 QueryType::Simple
860 }
861
862 fn determine_strategy(parsed: &ParsedQuery, sanitized: &str) -> IndexStrategy {
863 if sanitized.trim().is_empty() {
864 return IndexStrategy::FullScan;
865 }
866
867 let has_leading_wildcard = parsed
869 .terms
870 .iter()
871 .flat_map(|t| &t.subterms)
872 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
873
874 if has_leading_wildcard {
875 return IndexStrategy::RegexScan;
876 }
877
878 let has_compound_terms = parsed.terms.iter().any(|t| t.subterms.len() > 1);
881
882 if !parsed.operators.is_empty()
883 || parsed.terms.len() > 1
884 || !parsed.phrases.is_empty()
885 || has_compound_terms
886 {
887 return IndexStrategy::BooleanCombination;
888 }
889
890 IndexStrategy::EdgeNgram
892 }
893
894 fn estimate_cost(
895 parsed: &ParsedQuery,
896 strategy: &IndexStrategy,
897 filters: &SearchFilters,
898 ) -> QueryCost {
899 if matches!(strategy, IndexStrategy::RegexScan) {
901 return QueryCost::High;
902 }
903
904 if matches!(strategy, IndexStrategy::FullScan) {
906 return QueryCost::High;
907 }
908
909 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
911
912 let term_count: usize = parsed.terms.iter().map(|t| t.subterms.len()).sum();
914 let operator_count = parsed.operators.len();
915 let phrase_count = parsed.phrases.len();
916
917 let complexity = term_count + operator_count * 2 + phrase_count * 2;
918
919 if complexity > 6 || has_time_filter {
920 QueryCost::High
921 } else if complexity > 2 {
922 QueryCost::Medium
923 } else {
924 QueryCost::Low
925 }
926 }
927
928 fn summarize_filters(filters: &SearchFilters) -> FiltersSummary {
929 let agent_count = filters.agents.len();
930 let workspace_count = filters.workspaces.len();
931 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
932
933 let mut parts = Vec::new();
934 if agent_count > 0 {
935 parts.push(format!(
936 "{} agent{}",
937 agent_count,
938 if agent_count > 1 { "s" } else { "" }
939 ));
940 }
941 if workspace_count > 0 {
942 parts.push(format!(
943 "{} workspace{}",
944 workspace_count,
945 if workspace_count > 1 { "s" } else { "" }
946 ));
947 }
948 if has_time_filter {
949 parts.push("time range".to_string());
950 }
951
952 let description = if parts.is_empty() {
953 None
954 } else {
955 Some(format!("Filtering by: {}", parts.join(", ")))
956 };
957
958 FiltersSummary {
959 agent_count,
960 workspace_count,
961 has_time_filter,
962 description,
963 }
964 }
965
966 fn generate_warnings(
967 parsed: &ParsedQuery,
968 sanitized: &str,
969 filters: &SearchFilters,
970 ) -> Vec<String> {
971 let mut warnings = Vec::new();
972
973 let has_leading_wildcard = parsed
975 .terms
976 .iter()
977 .flat_map(|t| &t.subterms)
978 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
979 if has_leading_wildcard {
980 warnings.push(
981 "Leading wildcards (*foo) require regex scan and may be slow on large indexes"
982 .to_string(),
983 );
984 }
985
986 for term in &parsed.terms {
988 for sub in &term.subterms {
989 if sub.text.trim_matches('*').len() < 2 {
990 warnings.push(format!(
991 "Very short term '{}' may match many documents",
992 sub.text
993 ));
994 }
995 }
996 }
997
998 if sanitized.trim().is_empty() {
1000 warnings.push("Empty query will return all documents (expensive)".to_string());
1001 }
1002
1003 if parsed.operators.len() > 3 {
1005 warnings.push("Complex boolean query may have unexpected precedence".to_string());
1006 }
1007
1008 if let Some(agent) = filters.agents.iter().next()
1010 && filters.agents.len() == 1
1011 && filters.workspaces.is_empty()
1012 {
1013 warnings.push(format!(
1014 "Searching only in agent '{}' - results from other agents will be excluded",
1015 agent
1016 ));
1017 }
1018
1019 warnings
1020 }
1021
1022 pub fn with_wildcard_fallback(mut self, applied: bool) -> Self {
1024 self.wildcard_applied = applied;
1025 if applied
1026 && !self
1027 .warnings
1028 .iter()
1029 .any(|w| w.contains("wildcard fallback"))
1030 {
1031 self.warnings.push(
1032 "Wildcard fallback was applied automatically due to sparse exact matches"
1033 .to_string(),
1034 );
1035 }
1036 self
1037 }
1038}
1039
/// How a hit matched the query. Serialized in snake_case; `Exact` is the
/// default.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum MatchType {
    /// Exact term match (quality factor 1.0).
    #[default]
    Exact,
    /// Prefix match (quality factor 0.9).
    Prefix,
    /// Suffix match (quality factor 0.8).
    Suffix,
    /// Substring match (quality factor 0.7).
    Substring,
    /// Other wildcard match (quality factor 0.65).
    Wildcard,
    /// Match found through a wildcard the user did not type (quality factor 0.6).
    // NOTE(review): presumably produced by automatic wildcard fallback — confirm.
    ImplicitWildcard,
}
1059
1060impl MatchType {
1061 pub fn quality_factor(self) -> f32 {
1063 match self {
1064 MatchType::Exact => 1.0,
1065 MatchType::Prefix => 0.9,
1066 MatchType::Suffix => 0.8,
1067 MatchType::Substring => 0.7,
1068 MatchType::Wildcard => 0.65,
1069 MatchType::ImplicitWildcard => 0.6,
1070 }
1071 }
1072}
1073
/// Category of a query suggestion. Serialized in snake_case.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SuggestionKind {
    /// A corrected spelling of the query.
    SpellingFix,
    /// The query wrapped in wildcards for a broader match.
    WildcardQuery,
    /// Removing an active filter.
    RemoveFilter,
    /// Searching within a different agent.
    AlternateAgent,
    /// Widening the date range.
    // NOTE(review): no constructor for this kind is visible in this part of
    // the file.
    BroaderDateRange,
}
1089
/// A suggestion offered to the user for changing a search.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QuerySuggestion {
    /// Category of the suggestion.
    pub kind: SuggestionKind,
    /// Human-readable message.
    pub message: String,
    /// Replacement query text, when the suggestion changes the query.
    pub suggested_query: Option<String>,
    /// Replacement filters, when the suggestion changes the filters.
    pub suggested_filters: Option<SearchFilters>,
    /// Optional shortcut key assigned through `with_shortcut`.
    pub shortcut: Option<u8>,
}
1104
1105impl QuerySuggestion {
1106 fn spelling(_query: &str, corrected: &str) -> Self {
1107 Self {
1108 kind: SuggestionKind::SpellingFix,
1109 message: format!("Did you mean: \"{corrected}\"?"),
1110 suggested_query: Some(corrected.to_string()),
1111 suggested_filters: None,
1112 shortcut: None,
1113 }
1114 }
1115
1116 fn wildcard(query: &str) -> Self {
1117 let wildcard_query = format!("*{}*", query.trim_matches('*'));
1118 Self {
1119 kind: SuggestionKind::WildcardQuery,
1120 message: format!("Try broader search: \"{wildcard_query}\""),
1121 suggested_query: Some(wildcard_query),
1122 suggested_filters: None,
1123 shortcut: None,
1124 }
1125 }
1126
1127 fn remove_agent_filter(current_agent: &str, current_filters: &SearchFilters) -> Self {
1128 let mut filters = current_filters.clone();
1131 filters.agents.clear();
1132 Self {
1133 kind: SuggestionKind::RemoveFilter,
1134 message: format!("Remove agent filter (currently: {current_agent})"),
1135 suggested_query: None,
1136 suggested_filters: Some(filters),
1137 shortcut: None,
1138 }
1139 }
1140
1141 fn try_agent(agent_slug: &str) -> Self {
1142 let mut filters = SearchFilters::default();
1143 filters.agents.insert(agent_slug.to_string());
1144 Self {
1145 kind: SuggestionKind::AlternateAgent,
1146 message: format!("Try searching in: {agent_slug}"),
1147 suggested_query: None,
1148 suggested_filters: Some(filters),
1149 shortcut: None,
1150 }
1151 }
1152
1153 fn with_shortcut(mut self, key: u8) -> Self {
1154 self.shortcut = Some(key);
1155 self
1156 }
1157}
1158
/// Selects which parts of a search hit should be populated, and whether the
/// result may be cached.
#[derive(Debug, Clone, Copy)]
pub struct FieldMask {
    flags: u8,
    preview_content_chars: Option<usize>,
}

impl FieldMask {
    const CONTENT: u8 = 1 << 0;
    const SNIPPET: u8 = 1 << 1;
    const TITLE: u8 = 1 << 2;
    const CACHE: u8 = 1 << 3;

    /// Mask with every field requested, caching allowed and no preview limit.
    pub const FULL: Self = Self {
        flags: Self::CONTENT | Self::SNIPPET | Self::TITLE | Self::CACHE,
        preview_content_chars: None,
    };

    /// Builds a mask from individual switches, with no preview limit.
    pub fn new(
        wants_content: bool,
        wants_snippet: bool,
        wants_title: bool,
        allows_cache: bool,
    ) -> Self {
        let flags = [
            (wants_content, Self::CONTENT),
            (wants_snippet, Self::SNIPPET),
            (wants_title, Self::TITLE),
            (allows_cache, Self::CACHE),
        ]
        .into_iter()
        .filter(|(enabled, _)| *enabled)
        .fold(0u8, |acc, (_, bit)| acc | bit);

        Self {
            flags,
            preview_content_chars: None,
        }
    }

    /// Sets the preview content limit. Supplying `Some(_)` also clears the
    /// cache permission; supplying `None` leaves the flags untouched.
    pub fn with_preview_content_limit(self, max_chars: Option<usize>) -> Self {
        let flags = match max_chars {
            Some(_) => self.flags & !Self::CACHE,
            None => self.flags,
        };
        Self {
            flags,
            preview_content_chars: max_chars,
        }
    }

    /// Whether `bit` is set in this mask.
    const fn has(self, bit: u8) -> bool {
        (self.flags & bit) != 0
    }

    /// Whether hit content is requested.
    pub fn needs_content(self) -> bool {
        self.has(Self::CONTENT)
    }

    /// Whether a snippet is requested.
    pub fn wants_snippet(self) -> bool {
        self.has(Self::SNIPPET)
    }

    /// Whether the title is requested.
    pub fn wants_title(self) -> bool {
        self.has(Self::TITLE)
    }

    /// Whether results built with this mask may be cached.
    pub fn allows_cache(self) -> bool {
        self.has(Self::CACHE)
    }

    /// The preview content limit, if one is set.
    pub fn preview_content_limit(self) -> Option<usize> {
        self.preview_content_chars
    }
}
1229
/// A single search result.
// NOTE(review): the `#[serde(default ...)]` attributes below only matter for
// deserialization, yet this type derives `Serialize` only — confirm whether
// they are still needed.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchHit {
    /// Title associated with the hit.
    pub title: String,
    /// Snippet text for the hit.
    pub snippet: String,
    /// Content of the hit.
    pub content: String,
    /// Hash of the content; not serialized.
    #[serde(skip_serializing)]
    pub content_hash: u64,
    /// Database conversation id, when known; not serialized.
    #[serde(skip_serializing)]
    pub conversation_id: Option<i64>,
    /// Relevance score.
    pub score: f32,
    /// Path of the source the hit came from.
    pub source_path: String,
    /// Agent the hit belongs to.
    pub agent: String,
    /// Workspace the hit belongs to.
    pub workspace: String,
    /// Original workspace value; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workspace_original: Option<String>,
    /// Creation timestamp, when known.
    pub created_at: Option<i64>,
    /// Line number of the hit, when known.
    pub line_number: Option<usize>,
    /// How the hit matched the query.
    #[serde(default)]
    pub match_type: MatchType,
    /// Identifier of the source the hit came from.
    #[serde(default = "default_source_id")]
    pub source_id: String,
    /// Kind of origin for the hit.
    #[serde(default = "default_source_id")]
    pub origin_kind: String,
    /// Origin host; omitted from serialized output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub origin_host: Option<String>,
}
1263
1264static LAZY_FIELDS_ENABLED: Lazy<bool> = Lazy::new(|| {
1265 dotenvy::var("CASS_LAZY_FIELDS")
1266 .ok()
1267 .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
1268 .unwrap_or(true)
1269});
1270
/// Default value for serde-defaulted source fields: `"local"`.
fn default_source_id() -> String {
    String::from("local")
}
1274
1275fn effective_field_mask(field_mask: FieldMask) -> FieldMask {
1276 if *LAZY_FIELDS_ENABLED {
1277 field_mask
1278 } else {
1279 FieldMask::FULL
1280 }
1281}
1282
1283fn execute_query_with_lazy_exact_count(
1284 searcher: &Searcher,
1285 query: &dyn Query,
1286 limit: usize,
1287 offset: usize,
1288) -> Result<FsLexicalSearchResult> {
1289 let top_docs = searcher.search(
1290 query,
1291 &TopDocs::with_limit(limit)
1292 .and_offset(offset)
1293 .order_by_score(),
1294 )?;
1295 let page_saturated = top_docs.len() == limit;
1296 let total_count = if page_saturated {
1297 searcher.search(query, &Count)?
1298 } else {
1299 offset.saturating_add(top_docs.len())
1300 };
1301 let hits = top_docs
1302 .into_iter()
1303 .enumerate()
1304 .map(|(rank, (bm25_score, doc_address))| FsLexicalDocHit {
1305 bm25_score,
1306 rank,
1307 doc_address,
1308 })
1309 .collect();
1310
1311 Ok(FsLexicalSearchResult { hits, total_count })
1312}
1313
/// Outcome of a search.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The hits returned for the request.
    pub hits: Vec<SearchHit>,
    /// Whether automatic wildcard fallback was used.
    pub wildcard_fallback: bool,
    /// Cache statistics for the search.
    pub cache_stats: CacheStats,
    /// Suggestions for changing the query or filters.
    pub suggestions: Vec<QuerySuggestion>,
    /// ANN search statistics, when available.
    pub ann_stats: Option<crate::search::ann_index::AnnSearchStats>,
    /// Total number of matches, when known.
    pub total_count: Option<usize>,
}
1334
/// Which phase of a progressive search a result belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProgressivePhaseKind {
    /// The first set of results.
    Initial,
    /// A refined set of results delivered after the initial one.
    Refined,
}
1340
/// Event emitted while a progressive search runs.
// The lint is silenced because `Phase` carries a whole `SearchResult` and is
// therefore much larger than `RefinementFailed`.
// NOTE(review): presumably left unboxed on purpose — confirm.
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone)]
pub enum ProgressiveSearchEvent {
    /// A phase finished and produced results.
    Phase {
        /// Which phase produced the result.
        kind: ProgressivePhaseKind,
        /// The results of the phase.
        result: SearchResult,
        /// Elapsed time in milliseconds.
        elapsed_ms: u128,
    },
    /// The refinement phase failed.
    RefinementFailed {
        /// Latency in milliseconds at the point of failure.
        latency_ms: u128,
        /// Description of the failure.
        error: String,
    },
}
1356
/// Parameters for one progressive search, borrowing the context and query
/// text from the caller.
#[derive(Debug, Clone)]
pub(crate) struct ProgressiveSearchRequest<'a> {
    /// Search context supplied by the caller.
    pub(crate) cx: &'a FsCx,
    /// Query text.
    pub(crate) query: &'a str,
    /// Filters to apply.
    pub(crate) filters: SearchFilters,
    /// Maximum number of results requested.
    pub(crate) limit: usize,
    /// Threshold for treating results as sparse.
    // NOTE(review): exact semantics are not visible in this part of the file.
    pub(crate) sparse_threshold: usize,
    /// Which hit fields to populate.
    pub(crate) field_mask: FieldMask,
    /// Retrieval mode.
    pub(crate) mode: SearchMode,
}
1367
/// Hashable identity of a search hit, built from the listed hit fields.
// NOTE(review): presumably used to deduplicate or merge hits — the consumer
// is not visible in this part of the file.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SearchHitKey {
    source_id: String,
    source_path: String,
    conversation_id: Option<i64>,
    title: String,
    line_number: Option<usize>,
    created_at: Option<i64>,
    content_hash: u64,
}
1378
/// Builds a SQL `CASE` expression that yields the normalized source id for a row.
///
/// The three arguments are spliced into the SQL text verbatim as column
/// expressions. The generated expression resolves, in order:
/// 1. a non-blank source id (trimmed; any casing of the local id collapses to the local id),
/// 2. for origin kinds `ssh`/`remote`: the trimmed origin host, or `'remote'` when the host is blank,
/// 3. for origin kind equal to the local id: the local id,
/// 4. any non-blank origin host,
/// 5. otherwise the local id.
// NOTE(review): the column arguments are interpolated without quoting; callers must pass
// trusted column names, never user input.
fn normalized_search_source_id_sql_expr(
    source_id_column: &str,
    origin_kind_column: &str,
    origin_host_column: &str,
) -> String {
    format!(
        "CASE \
         WHEN TRIM(COALESCE({source_id_column}, '')) != '' THEN \
         CASE \
         WHEN LOWER(TRIM(COALESCE({source_id_column}, ''))) = '{local}' THEN '{local}' \
         ELSE TRIM(COALESCE({source_id_column}, '')) \
         END \
         WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) IN ('ssh', 'remote') THEN \
         CASE \
         WHEN TRIM(COALESCE({origin_host_column}, '')) = '' THEN 'remote' \
         ELSE TRIM(COALESCE({origin_host_column}, '')) \
         END \
         WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) = '{local}' THEN '{local}' \
         WHEN TRIM(COALESCE({origin_host_column}, '')) != '' THEN TRIM(COALESCE({origin_host_column}, '')) \
         ELSE '{local}' \
         END",
        local = crate::sources::provenance::LOCAL_SOURCE_ID,
    )
}
1403
1404fn normalize_search_source_filter_value(source_id: &str) -> String {
1405 let trimmed = source_id.trim();
1406 if trimmed.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1407 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1408 } else {
1409 trimmed.to_string()
1410 }
1411}
1412
1413fn normalized_search_hit_source_id_parts(
1414 source_id: &str,
1415 origin_kind: &str,
1416 origin_host: Option<&str>,
1417) -> String {
1418 let trimmed_source_id = source_id.trim();
1419 if !trimmed_source_id.is_empty() {
1420 if trimmed_source_id.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1421 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1422 }
1423 return trimmed_source_id.to_string();
1424 }
1425
1426 let trimmed_origin_host = origin_host.map(str::trim).filter(|value| !value.is_empty());
1427 let trimmed_origin_kind = origin_kind.trim();
1428 if trimmed_origin_kind.eq_ignore_ascii_case("ssh")
1429 || trimmed_origin_kind.eq_ignore_ascii_case("remote")
1430 {
1431 return trimmed_origin_host.unwrap_or("remote").to_string();
1432 }
1433 if let Some(origin_host) = trimmed_origin_host {
1434 return origin_host.to_string();
1435 }
1436
1437 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1438}
1439
1440fn normalized_search_hit_origin_kind(source_id: &str, origin_kind: Option<&str>) -> String {
1441 if let Some(kind) = origin_kind.map(str::trim).filter(|value| !value.is_empty()) {
1442 if kind.eq_ignore_ascii_case("local") {
1443 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1444 }
1445 if kind.eq_ignore_ascii_case("ssh") || kind.eq_ignore_ascii_case("remote") {
1446 return "remote".to_string();
1447 }
1448 return kind.to_ascii_lowercase();
1449 }
1450
1451 if source_id == crate::sources::provenance::LOCAL_SOURCE_ID {
1452 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1453 } else {
1454 "remote".to_string()
1455 }
1456}
1457
1458fn normalized_search_hit_source_id(hit: &SearchHit) -> String {
1459 normalized_search_hit_source_id_parts(
1460 hit.source_id.as_str(),
1461 hit.origin_kind.as_str(),
1462 hit.origin_host.as_deref(),
1463 )
1464}
1465
1466impl SearchHitKey {
1467 fn from_hit(hit: &SearchHit) -> Self {
1468 Self {
1469 source_id: normalized_search_hit_source_id(hit),
1470 source_path: hit.source_path.clone(),
1471 conversation_id: hit.conversation_id,
1472 title: if hit.conversation_id.is_some() {
1473 String::new()
1474 } else {
1475 hit.title.trim().to_string()
1476 },
1477 line_number: hit.line_number,
1478 created_at: hit.created_at,
1479 content_hash: hit.content_hash,
1480 }
1481 }
1482}
1483
1484impl Ord for SearchHitKey {
1485 fn cmp(&self, other: &Self) -> CmpOrdering {
1486 self.source_id
1487 .cmp(&other.source_id)
1488 .then_with(|| self.source_path.cmp(&other.source_path))
1489 .then_with(|| self.conversation_id.cmp(&other.conversation_id))
1490 .then_with(|| self.title.cmp(&other.title))
1491 .then_with(|| self.line_number.cmp(&other.line_number))
1492 .then_with(|| self.created_at.cmp(&other.created_at))
1493 .then_with(|| self.content_hash.cmp(&other.content_hash))
1494 }
1495}
1496
impl PartialOrd for SearchHitKey {
    /// Always `Some`: delegates to the total order defined by `Ord::cmp`.
    fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
        Some(self.cmp(other))
    }
}
1502
/// Smoothing constant `k` in the federated reciprocal-rank score
/// `1 / (k + rank + 1)` computed by `federated_rrf_score`.
const FEDERATED_RRF_K: f32 = 60.0;
1504
/// A hit from one shard together with the ranking data used when merging shards.
#[derive(Debug)]
struct FederatedRankedHit {
    /// The hit itself; its `score` is overwritten with `fused_score` on merge.
    hit: SearchHit,
    /// Index of the shard that produced the hit (last tie-breaker when merging).
    shard_index: usize,
    /// Rank of the hit within its shard (lower sorts first on score ties).
    shard_rank: usize,
    /// Fused score used as the primary, descending sort key.
    fused_score: f32,
}
1512
1513fn federated_rrf_score(shard_rank: usize) -> f32 {
1514 1.0 / (FEDERATED_RRF_K + shard_rank as f32 + 1.0)
1515}
1516
1517fn merge_federated_ranked_hits(mut ranked_hits: Vec<FederatedRankedHit>) -> Vec<SearchHit> {
1518 ranked_hits.sort_by(|a, b| {
1519 b.fused_score
1520 .total_cmp(&a.fused_score)
1521 .then_with(|| a.shard_rank.cmp(&b.shard_rank))
1522 .then_with(|| SearchHitKey::from_hit(&a.hit).cmp(&SearchHitKey::from_hit(&b.hit)))
1523 .then_with(|| a.shard_index.cmp(&b.shard_index))
1524 });
1525 ranked_hits
1526 .into_iter()
1527 .map(|mut ranked| {
1528 ranked.hit.score = ranked.fused_score;
1529 ranked.hit
1530 })
1531 .collect()
1532}
1533
/// Test-only score record for a fused hit: the RRF score plus the per-source
/// ranks and scores it was derived from.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Default, Clone)]
struct HybridScore {
    /// Fused RRF score (primary sort key in `cmp_fused_hit_desc`).
    rrf: f32,
    /// Rank in the lexical list, if the hit appeared there.
    lexical_rank: Option<usize>,
    /// Rank in the semantic list, if the hit appeared there.
    semantic_rank: Option<usize>,
    /// Lexical score, if any.
    lexical_score: Option<f32>,
    /// Semantic score, if any.
    semantic_score: Option<f32>,
}
1544
/// Test-only pairing of a hit with its identity key and hybrid score.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct FusedHit {
    /// Identity key (final tie-breaker in `cmp_fused_hit_desc`).
    key: SearchHitKey,
    /// Score data for the hit.
    score: HybridScore,
    /// The hit itself.
    hit: SearchHit,
}
1553
1554pub(crate) fn stable_content_hash(content: &str) -> u64 {
1564 use xxhash_rust::xxh3::Xxh3;
1565 let mut hasher = Xxh3::new();
1566 let mut first = true;
1567 for token in content.split_whitespace() {
1568 if !first {
1569 hasher.update(b" ");
1570 }
1571 hasher.update(token.as_bytes());
1572 first = false;
1573 }
1574 hasher.digest()
1575}
1576
1577fn stable_hit_hash(
1578 content: &str,
1579 source_path: &str,
1580 line_number: Option<usize>,
1581 created_at: Option<i64>,
1582) -> u64 {
1583 use xxhash_rust::xxh3::Xxh3;
1584 let mut hasher = Xxh3::new();
1585 if !content.is_empty() {
1588 hasher.update(&stable_content_hash(content).to_le_bytes());
1589 }
1590 hasher.update(b"|");
1591 hasher.update(source_path.as_bytes());
1592 hasher.update(b"|");
1593 if let Some(line) = line_number {
1594 let mut buf = itoa::Buffer::new();
1595 hasher.update(buf.format(line).as_bytes());
1596 }
1597 hasher.update(b"|");
1598 if let Some(ts) = created_at {
1599 let mut buf = itoa::Buffer::new();
1600 hasher.update(buf.format(ts).as_bytes());
1601 }
1602 hasher.digest()
1603}
1604
1605fn search_hit_key_doc_id(key: &SearchHitKey) -> String {
1606 use std::fmt::Write as _;
1614 const SEP: char = '\u{1f}';
1615 let capacity = key.source_id.len()
1617 + key.source_path.len()
1618 + key.title.len()
1619 + 6 + 3 * 20 + 20; let mut out = String::with_capacity(capacity);
1623 out.push_str(&key.source_id);
1624 out.push(SEP);
1625 out.push_str(&key.source_path);
1626 out.push(SEP);
1627 if let Some(v) = key.conversation_id {
1628 let _ = write!(out, "{v}");
1629 }
1630 out.push(SEP);
1631 out.push_str(&key.title);
1632 out.push(SEP);
1633 if let Some(v) = key.line_number {
1634 let _ = write!(out, "{v}");
1635 }
1636 out.push(SEP);
1637 if let Some(v) = key.created_at {
1638 let _ = write!(out, "{v}");
1639 }
1640 out.push(SEP);
1641 let _ = write!(out, "{}", key.content_hash);
1642 out
1643}
1644
1645fn search_hit_doc_id(hit: &SearchHit) -> String {
1646 search_hit_key_doc_id(&SearchHitKey::from_hit(hit))
1647}
1648
/// Test-only comparator that sorts fused hits best-first.
///
/// Order: RRF score descending; on ties, hits ranked by both the lexical and
/// the semantic list come before hits ranked by only one; remaining ties are
/// broken by key ascending.
#[cfg(test)]
fn cmp_fused_hit_desc(a: &FusedHit, b: &FusedHit) -> CmpOrdering {
    let ranked_by_both =
        |hit: &FusedHit| hit.score.lexical_rank.is_some() && hit.score.semantic_rank.is_some();

    b.score
        .rrf
        .total_cmp(&a.score.rrf)
        // `true > false`, so comparing b against a puts "both" hits first.
        .then_with(|| ranked_by_both(b).cmp(&ranked_by_both(a)))
        .then_with(|| a.key.cmp(&b.key))
}
1666
/// Test-only: input sizes below this are fully sorted by `top_k_fused`
/// instead of being partitioned with `select_nth_unstable_by` first.
#[cfg(test)]
#[allow(dead_code)]
const QUICKSELECT_THRESHOLD: usize = 64;
1671
/// Test-only: returns the best `k` fused hits, sorted with `cmp_fused_hit_desc`.
///
/// Returns an empty vector when `hits` is empty or `k` is 0, and all hits
/// (sorted) when `k` is at least the input length. Inputs of at least
/// `QUICKSELECT_THRESHOLD` elements are partitioned around the k-th element
/// first so only the kept prefix needs sorting.
#[cfg(test)]
#[allow(dead_code)]
fn top_k_fused(mut hits: Vec<FusedHit>, k: usize) -> Vec<FusedHit> {
    let total = hits.len();
    if k == 0 || total == 0 {
        return Vec::new();
    }

    let use_quickselect = k < total && total >= QUICKSELECT_THRESHOLD;
    if use_quickselect {
        // Partition so the best k are in front, drop the rest, then order the prefix.
        hits.select_nth_unstable_by(k - 1, cmp_fused_hit_desc);
        hits.truncate(k);
        hits.sort_by(cmp_fused_hit_desc);
    } else {
        // Small input (or k covers everything): sort all, then cut to k.
        hits.sort_by(cmp_fused_hit_desc);
        hits.truncate(k);
    }
    hits
}
1712
/// Fuses lexical and semantic hit lists with reciprocal-rank fusion and
/// returns one page of deduplicated hits.
///
/// Steps, in order:
/// 1. Every hit is keyed by `search_hit_doc_id` and passed to `fs_rrf_fuse`
///    (asking for all candidates at offset 0).
/// 2. Fused entries are mapped back to `SearchHit`s; hits rejected by
///    `hit_is_noise` for `query` are dropped.
/// 3. Hits are deduplicated under an exact key and a title-based fallback
///    key; when a duplicate is merged its RRF score is added to the kept hit.
/// 4. Survivors are sorted by score descending (ties broken by
///    `SearchHitKey`) and paged with `offset` / `limit`.
///
/// Returns an empty vector when `limit` is 0 or both inputs are empty.
pub fn rrf_fuse_hits(
    lexical: &[SearchHit],
    semantic: &[SearchHit],
    query: &str,
    limit: usize,
    offset: usize,
) -> Vec<SearchHit> {
    if limit == 0 {
        return Vec::new();
    }
    let total_candidates = lexical.len().saturating_add(semantic.len());
    if total_candidates == 0 {
        return Vec::new();
    }

    let mut lexical_scored = Vec::with_capacity(lexical.len());
    let mut semantic_scored = Vec::with_capacity(semantic.len());
    // doc id -> hit payload, used to turn fused doc ids back into hits.
    let mut hit_by_doc_id: HashMap<String, SearchHit> = HashMap::with_capacity(total_candidates);

    for hit in lexical {
        let doc_id = search_hit_doc_id(hit);
        // `insert` replaces any earlier lexical hit that produced the same doc id.
        hit_by_doc_id.insert(doc_id.clone(), hit.clone());
        lexical_scored.push(FsScoredResult {
            doc_id,
            score: hit.score,
            source: FsScoreSource::Lexical,
            index: None,
            fast_score: None,
            quality_score: None,
            lexical_score: Some(hit.score),
            rerank_score: None,
            explanation: None,
            metadata: None,
        });
    }

    for (idx, hit) in semantic.iter().enumerate() {
        let doc_id = search_hit_doc_id(hit);
        // Keep whatever payload is already stored for this doc id (lexical wins).
        hit_by_doc_id
            .entry(doc_id.clone())
            .or_insert_with(|| hit.clone());
        semantic_scored.push(FsVectorHit {
            // Positions beyond u32 range are clamped to u32::MAX.
            index: u32::try_from(idx).unwrap_or(u32::MAX),
            score: hit.score,
            doc_id,
        });
    }

    // Ask for every candidate at offset 0; paging happens after dedup below.
    let fused = fs_rrf_fuse(
        &lexical_scored,
        &semantic_scored,
        total_candidates,
        0,
        &FsRrfConfig::default(),
    );

    // Bookkeeping for one fallback key.
    #[derive(Clone, Copy)]
    struct CompatSlot {
        // Position of the kept hit in `unique_hits`.
        index: usize,
        // Conversation id observed for this fallback key, if any.
        conversation_id: Option<i64>,
        // Set once two different conversation ids were seen; disables fallback merging.
        ambiguous: bool,
    }

    // Intern source ids, paths and titles as small integers so the dedup keys
    // are cheap tuples rather than owned strings.
    let mut source_ids: HashMap<String, u32> = HashMap::new();
    let mut path_ids: HashMap<String, u32> = HashMap::new();
    let mut title_ids: HashMap<String, u32> = HashMap::new();
    let mut next_source_id: u32 = 0;
    let mut next_path_id: u32 = 0;
    let mut next_title_id: u32 = 0;
    // Exact key: (source, path, conversation id, title-if-no-conversation, line, created_at, content hash).
    type CompatExactKey = (
        u32,
        u32,
        Option<i64>,
        Option<u32>,
        Option<usize>,
        Option<i64>,
        u64,
    );
    // Fallback key: (source, path, title, line, created_at, content hash) — ignores the conversation id.
    type CompatFallbackKey = (u32, u32, u32, Option<usize>, Option<i64>, u64);

    let mut exact_seen: HashMap<CompatExactKey, usize> = HashMap::with_capacity(fused.len());
    let mut fallback_seen: HashMap<CompatFallbackKey, CompatSlot> =
        HashMap::with_capacity(fused.len());
    let mut unique_hits: Vec<SearchHit> = Vec::with_capacity(fused.len());

    // Records a newly seen conversation id on a slot, or marks the slot
    // ambiguous when it conflicts with the one already recorded.
    let update_slot = |slot: &mut CompatSlot, conversation_id: Option<i64>| {
        if slot.ambiguous {
            return;
        }
        match (slot.conversation_id, conversation_id) {
            (Some(existing), Some(current)) if existing != current => slot.ambiguous = true,
            (None, Some(current)) => slot.conversation_id = Some(current),
            _ => {}
        }
    };

    for fused_hit in fused {
        // Each stored payload is consumed at most once.
        let mut hit = match hit_by_doc_id.remove(&fused_hit.doc_id) {
            Some(hit) => hit,
            None => continue,
        };
        if hit_is_noise(&hit, query) {
            continue;
        }

        let normalized_source_id = normalized_search_hit_source_id(&hit);
        let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
            *id
        } else {
            let id = next_source_id;
            next_source_id = next_source_id.saturating_add(1);
            source_ids.insert(normalized_source_id, id);
            id
        };
        let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
            *id
        } else {
            let id = next_path_id;
            next_path_id = next_path_id.saturating_add(1);
            path_ids.insert(hit.source_path.clone(), id);
            id
        };
        let normalized_title = hit.title.trim();
        let fallback_title_key = if let Some(id) = title_ids.get(normalized_title) {
            *id
        } else {
            let id = next_title_id;
            next_title_id = next_title_id.saturating_add(1);
            title_ids.insert(normalized_title.to_string(), id);
            id
        };
        // Mirrors `SearchHitKey::from_hit`: the title only counts when there is no conversation id.
        let exact_title_key = if hit.conversation_id.is_some() {
            None
        } else {
            Some(fallback_title_key)
        };
        let exact_key = (
            source_key,
            path_key,
            hit.conversation_id,
            exact_title_key,
            hit.line_number,
            hit.created_at,
            hit.content_hash,
        );
        let fallback_key = (
            source_key,
            path_key,
            fallback_title_key,
            hit.line_number,
            hit.created_at,
            hit.content_hash,
        );

        // Prefer an exact match; otherwise use the fallback slot unless it is
        // ambiguous or carries a different conversation id.
        let merged_idx = exact_seen.get(&exact_key).copied().or_else(|| {
            fallback_seen.get(&fallback_key).and_then(|slot| {
                if slot.ambiguous {
                    return None;
                }
                match (slot.conversation_id, hit.conversation_id) {
                    (Some(existing), Some(current)) if existing != current => None,
                    _ => Some(slot.index),
                }
            })
        });

        if let Some(existing_idx) = merged_idx {
            // Duplicate: fold it into the hit that is already kept.
            exact_seen.insert(exact_key, existing_idx);
            let slot = fallback_seen.entry(fallback_key).or_insert(CompatSlot {
                index: existing_idx,
                conversation_id: hit.conversation_id,
                ambiguous: false,
            });
            update_slot(slot, hit.conversation_id);
            // Backfill a missing conversation id on the kept hit.
            if unique_hits[existing_idx].conversation_id.is_none() && hit.conversation_id.is_some()
            {
                unique_hits[existing_idx].conversation_id = hit.conversation_id;
            }
            // Merged duplicates contribute their RRF score to the kept hit.
            unique_hits[existing_idx].score += fused_hit.rrf_score as f32;
            continue;
        }

        // First occurrence: keep the hit with its RRF score and register both keys.
        hit.score = fused_hit.rrf_score as f32;
        let index = unique_hits.len();
        unique_hits.push(hit);
        exact_seen.insert(exact_key, index);
        match fallback_seen.get_mut(&fallback_key) {
            Some(slot) => update_slot(slot, unique_hits[index].conversation_id),
            None => {
                fallback_seen.insert(
                    fallback_key,
                    CompatSlot {
                        index,
                        conversation_id: unique_hits[index].conversation_id,
                        ambiguous: false,
                    },
                );
            }
        }
    }

    // Merging can change scores, so re-sort: score descending, key ascending on ties.
    unique_hits.sort_by(|a, b| {
        b.score
            .total_cmp(&a.score)
            .then_with(|| SearchHitKey::from_hit(a).cmp(&SearchHitKey::from_hit(b)))
    });

    let start = offset.min(unique_hits.len());
    unique_hits.into_iter().skip(start).take(limit).collect()
}
1932
/// LRU cache of query embeddings, valid for a single embedder at a time.
struct QueryCache {
    /// Id of the embedder whose embeddings are currently cached.
    embedder_id: String,
    /// Cached embeddings keyed by canonical query text.
    embeddings: LruCache<String, Vec<f32>>,
}
1937
1938impl QueryCache {
1939 fn new(embedder_id: &str, capacity: NonZeroUsize) -> Self {
1940 Self {
1941 embedder_id: embedder_id.to_string(),
1942 embeddings: LruCache::new(capacity),
1943 }
1944 }
1945
1946 fn align_embedder(&mut self, embedder: &dyn Embedder) {
1947 if self.embedder_id != embedder.id() {
1948 self.embedder_id = embedder.id().to_string();
1949 self.embeddings.clear();
1950 }
1951 }
1952
1953 fn get_cached(&mut self, embedder: &dyn Embedder, canonical: &str) -> Option<Vec<f32>> {
1954 self.align_embedder(embedder);
1955 self.embeddings.get(canonical).cloned()
1956 }
1957
1958 fn store(&mut self, embedder: &dyn Embedder, canonical: &str, embedding: Vec<f32>) {
1959 self.align_embedder(embedder);
1960 self.embeddings.put(canonical.to_string(), embedding);
1961 }
1962}
1963
1964fn semantic_filter_as_search_filter(filter: &SemanticFilter) -> Option<&dyn FsSearchFilter> {
1967 let unrestricted = filter.agents.is_none()
1968 && filter.workspaces.is_none()
1969 && filter.sources.is_none()
1970 && filter.roles.is_none()
1971 && filter.created_from.is_none()
1972 && filter.created_to.is_none();
1973 if unrestricted { None } else { Some(filter) }
1974}
1975
1976fn open_fs_semantic_ann_index(fs_index: &FsVectorIndex, ann_path: &Path) -> Result<FsHnswIndex> {
1977 if !ann_path.is_file() {
1978 bail!(
1979 "approximate search unavailable: HNSW index not found at {}",
1980 ann_path.display()
1981 );
1982 }
1983
1984 let ann = FsHnswIndex::load(ann_path, fs_index)
1985 .map_err(|err| anyhow!("open HNSW index failed: {err}"))?;
1986 let matches = ann
1987 .matches_vector_index(fs_index)
1988 .map_err(|err| anyhow!("validate HNSW index failed: {err}"))?;
1989 if !matches {
1990 bail!(
1991 "approximate search unavailable: HNSW index at {} is stale for current semantic index (run 'cass index --semantic --build-hnsw')",
1992 ann_path.display()
1993 );
1994 }
1995
1996 Ok(ann)
1997}
1998
/// Everything the client keeps while semantic search is configured
/// (stored in `SearchClient::semantic`).
struct SemanticSearchState {
    /// Shared token identifying this semantic context.
    // NOTE(review): presumably compared by pointer to detect a swapped context — confirm at the use sites.
    context_token: Arc<()>,
    /// Embedder used for queries.
    embedder: Arc<dyn Embedder>,
    /// A single vector index handle.
    // NOTE(review): relationship to `fs_semantic_indexes` (primary vs. shards?) is not visible here.
    fs_semantic_index: Arc<FsVectorIndex>,
    /// List of vector index handles.
    fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
    /// HNSW index for approximate search, when one is loaded.
    fs_ann_index: Option<Arc<FsHnswIndex>>,
    /// Path of the HNSW index file, when known.
    ann_path: Option<PathBuf>,
    /// In-memory two-tier index, when one is loaded.
    fs_in_memory_two_tier_index: Option<Arc<FsInMemoryTwoTierIndex>>,
    /// Per-tier-mode record of in-memory two-tier indexes known to be unavailable.
    in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable,
    /// Progressive two-tier context, when one is available.
    progressive_context: Option<Arc<ProgressiveTwoTierContext>>,
    /// Flag recording that the progressive context is unavailable.
    progressive_context_unavailable: bool,
    /// Lookup maps used when applying semantic filters.
    filter_maps: SemanticFilterMaps,
    /// Optional set of role codes.
    roles: Option<HashSet<u8>>,
    /// Cache of query embeddings.
    query_cache: QueryCache,
}
2014
/// Remembers for which tier modes the in-memory two-tier index is already
/// known to be unavailable.
#[derive(Debug, Clone, Copy, Default)]
struct InMemoryTwoTierUnavailable {
    /// Set for `SemanticTierMode::FastOnly`.
    fast_only: bool,
    /// Set for `SemanticTierMode::Progressive` and `SemanticTierMode::QualityOnly`.
    quality: bool,
}
2020
2021impl InMemoryTwoTierUnavailable {
2022 fn is_known_unavailable(self, tier_mode: SemanticTierMode) -> bool {
2023 match tier_mode {
2024 SemanticTierMode::Single => false,
2025 SemanticTierMode::FastOnly => self.fast_only,
2026 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => self.quality,
2027 }
2028 }
2029
2030 fn mark_unavailable(&mut self, tier_mode: SemanticTierMode) {
2031 match tier_mode {
2032 SemanticTierMode::Single => {}
2033 SemanticTierMode::FastOnly => {
2034 self.fast_only = true;
2035 }
2036 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => {
2037 self.quality = true;
2038 }
2039 }
2040 }
2041}
2042
/// Resources needed to run a progressive two-tier search.
struct ProgressiveTwoTierContext {
    /// Token tying this context to a semantic context.
    // NOTE(review): presumably matches `SemanticSearchState::context_token` — confirm.
    context_token: Arc<()>,
    /// The two-tier index searched.
    index: Arc<FsTwoTierIndex>,
    /// Embedder for the fast tier.
    fast_embedder: Arc<dyn frankensearch::Embedder>,
    /// Embedder for the quality tier, when one exists.
    quality_embedder: Option<Arc<dyn frankensearch::Embedder>>,
}
2049
/// Cloneable bundle of the index handles and filter data used for semantic
/// candidate search; its fields mirror the same-named fields of `SemanticSearchState`.
#[derive(Clone)]
struct SemanticCandidateContext {
    /// A single vector index handle.
    fs_semantic_index: Arc<FsVectorIndex>,
    /// List of vector index handles.
    fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
    /// Lookup maps used when applying semantic filters.
    filter_maps: SemanticFilterMaps,
    /// Optional set of role codes.
    roles: Option<HashSet<u8>>,
}
2057
/// Parameters for one semantic candidate search.
struct SemanticCandidateSearchRequest<'a> {
    /// Number of candidates to fetch.
    fetch_limit: usize,
    /// Whether approximate search was requested.
    approximate: bool,
    /// Tier mode to run.
    tier_mode: SemanticTierMode,
    /// Borrowed in-memory two-tier index, when available.
    in_memory_two_tier_index: Option<&'a Arc<FsInMemoryTwoTierIndex>>,
    /// Borrowed HNSW index, when available.
    ann_index: Option<&'a Arc<FsHnswIndex>>,
}
2065
/// A query embedding together with the token of the context it belongs to.
struct SemanticQueryEmbedding {
    /// Token of the semantic context associated with `vector`.
    // NOTE(review): presumably used to reject embeddings from a replaced context — confirm.
    context_token: Arc<()>,
    /// The embedding values.
    vector: Vec<f32>,
}
2070
/// Embedder wrapper that puts an LRU cache in front of single-text embedding calls.
struct SharedCassSyncEmbedder {
    /// The wrapped embedder; every trait method ultimately delegates to it.
    inner: Arc<dyn Embedder>,
    /// Embeddings keyed by input text (consulted by `embed_sync` only).
    cache: Mutex<LruCache<String, Vec<f32>>>,
}
2075
2076impl SharedCassSyncEmbedder {
2077 fn new(inner: Arc<dyn Embedder>) -> Self {
2078 let cache_capacity =
2079 NonZeroUsize::new(PROGRESSIVE_EMBEDDING_CACHE_CAPACITY).expect("cache capacity > 0");
2080 Self {
2081 inner,
2082 cache: Mutex::new(LruCache::new(cache_capacity)),
2083 }
2084 }
2085}
2086
/// `Embedder` implementation that caches `embed_sync` results and forwards
/// everything else to the wrapped embedder.
impl Embedder for SharedCassSyncEmbedder {
    /// Returns the cached embedding for `text` when present; otherwise embeds
    /// via the inner embedder and stores the result.
    ///
    /// If the cache mutex cannot be locked, the cache is simply skipped.
    /// Errors from the inner embedder are propagated and nothing is cached.
    fn embed_sync(&self, text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
        if let Ok(mut cache) = self.cache.lock()
            && let Some(embedding) = cache.get(text).cloned()
        {
            return Ok(embedding);
        }

        // The lock is not held while the inner embedder runs.
        let embedding = self.inner.embed_sync(text)?;
        if let Ok(mut cache) = self.cache.lock() {
            cache.put(text.to_owned(), embedding.clone());
        }
        Ok(embedding)
    }

    /// Forwards to the inner embedder; batch calls neither read nor fill the cache.
    fn embed_batch_sync(
        &self,
        texts: &[&str],
    ) -> crate::search::embedder::EmbedderResult<Vec<Vec<f32>>> {
        self.inner.embed_batch_sync(texts)
    }

    /// Forwards to the inner embedder.
    fn dimension(&self) -> usize {
        self.inner.dimension()
    }

    /// Forwards to the inner embedder.
    fn id(&self) -> &str {
        self.inner.id()
    }

    /// Forwards to the inner embedder.
    fn model_name(&self) -> &str {
        self.inner.model_name()
    }

    /// Forwards to the inner embedder.
    fn is_ready(&self) -> bool {
        self.inner.is_ready()
    }

    /// Forwards to the inner embedder.
    fn is_semantic(&self) -> bool {
        self.inner.is_semantic()
    }

    /// Forwards to the inner embedder.
    fn category(&self) -> frankensearch::ModelCategory {
        self.inner.category()
    }

    /// Forwards to the inner embedder.
    fn tier(&self) -> frankensearch::ModelTier {
        self.inner.tier()
    }

    /// Forwards to the inner embedder.
    fn supports_mrl(&self) -> bool {
        self.inner.supports_mrl()
    }
}
2141
2142fn build_in_memory_two_tier_index(
2143 ann_path: Option<PathBuf>,
2144 embedder_id: &str,
2145 tier_mode: SemanticTierMode,
2146) -> Option<Arc<FsInMemoryTwoTierIndex>> {
2147 let index_dir = ann_path
2148 .as_ref()
2149 .and_then(|path| path.parent().map(Path::to_path_buf));
2150 let Some(index_dir) = index_dir else {
2151 tracing::debug!("two-tier semantic unavailable: ann/index directory path missing");
2152 return None;
2153 };
2154
2155 match FsInMemoryTwoTierIndex::from_dir(&index_dir) {
2156 Ok(index) => return Some(Arc::new(index)),
2157 Err(err) => {
2158 tracing::debug!(
2159 dir = %index_dir.display(),
2160 error = %err,
2161 "two-tier semantic index load failed; considering fallback"
2162 );
2163 }
2164 }
2165
2166 if !matches!(tier_mode, SemanticTierMode::FastOnly) {
2167 return None;
2168 }
2169
2170 let fallback_fast = index_dir.join(format!("index-{embedder_id}.fsvi"));
2171 if !fallback_fast.is_file() {
2172 return None;
2173 }
2174
2175 match FsInMemoryVectorIndex::from_fsvi(&fallback_fast) {
2176 Ok(fast) => Some(Arc::new(FsInMemoryTwoTierIndex::new(fast, None))),
2177 Err(err) => {
2178 tracing::debug!(
2179 path = %fallback_fast.display(),
2180 error = %err,
2181 "fast-only semantic fallback index load failed"
2182 );
2183 None
2184 }
2185 }
2186}
2187
2188fn two_tier_index_supports_mode(
2189 index: &FsInMemoryTwoTierIndex,
2190 tier_mode: SemanticTierMode,
2191) -> bool {
2192 !matches!(
2193 tier_mode,
2194 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly
2195 ) || index.has_quality_index()
2196}
2197
/// A lexical hit resolved to its semantic-index identity.
#[derive(Debug, Clone)]
struct ResolvedSemanticDocId {
    /// Message id the hit resolved to (used as the key of `ProgressiveLexicalCache::hits_by_message`).
    message_id: u64,
    /// Doc id handed to the fusion layer for this hit.
    doc_id: String,
}
2203
// Tuple aliases used when resolving lexical hits to semantic doc ids.
// NOTE(review): the meaning of each tuple position is defined where these aliases are
// built and consumed, which is outside this section — confirm before relying on field order.

/// Lookup key for a resolved hit.
type ProgressiveLookupKey = (String, String, Option<i64>, String, i64, Option<i64>, u64);
/// Key for the exact-match query path.
type ProgressiveExactQueryKey = (i64, i64);
/// Key for the fallback query path.
type ProgressiveFallbackQueryKey = (String, String, i64);
/// One resolved row: lookup key plus resolved doc id, or `None` when the row did not resolve.
type ResolvedSemanticLookupRow = Option<(ProgressiveLookupKey, ResolvedSemanticDocId)>;
2208
/// Score-less copy of a `SearchHit`, cached from the lexical phase of a
/// progressive search.
///
/// Holds every `SearchHit` field except `score`; `to_search_hit` re-attaches
/// a score. `title`, `snippet` and `content` are empty when the field mask
/// used in `from_search_hit` did not request them.
#[derive(Debug, Clone)]
struct ProgressiveLexicalHit {
    title: String,
    snippet: String,
    content: String,
    content_hash: u64,
    conversation_id: Option<i64>,
    source_path: String,
    agent: String,
    workspace: String,
    workspace_original: Option<String>,
    created_at: Option<i64>,
    match_type: MatchType,
    line_number: Option<usize>,
    source_id: String,
    origin_kind: String,
    origin_host: Option<String>,
}
2227
2228impl ProgressiveLexicalHit {
2229 fn from_search_hit(hit: &SearchHit, field_mask: FieldMask) -> Self {
2230 Self {
2231 title: if field_mask.wants_title() {
2232 hit.title.clone()
2233 } else {
2234 String::new()
2235 },
2236 snippet: if field_mask.wants_snippet() {
2237 hit.snippet.clone()
2238 } else {
2239 String::new()
2240 },
2241 content: if field_mask.needs_content() {
2242 hit.content.clone()
2243 } else {
2244 String::new()
2245 },
2246 content_hash: hit.content_hash,
2247 conversation_id: hit.conversation_id,
2248 source_path: hit.source_path.clone(),
2249 agent: hit.agent.clone(),
2250 workspace: hit.workspace.clone(),
2251 workspace_original: hit.workspace_original.clone(),
2252 created_at: hit.created_at,
2253 match_type: hit.match_type,
2254 line_number: hit.line_number,
2255 source_id: hit.source_id.clone(),
2256 origin_kind: hit.origin_kind.clone(),
2257 origin_host: hit.origin_host.clone(),
2258 }
2259 }
2260
2261 fn to_search_hit(&self, score: f32) -> SearchHit {
2262 SearchHit {
2263 title: self.title.clone(),
2264 snippet: self.snippet.clone(),
2265 content: self.content.clone(),
2266 content_hash: self.content_hash,
2267 conversation_id: self.conversation_id,
2268 score,
2269 source_path: self.source_path.clone(),
2270 agent: self.agent.clone(),
2271 workspace: self.workspace.clone(),
2272 workspace_original: self.workspace_original.clone(),
2273 created_at: self.created_at,
2274 line_number: self.line_number,
2275 match_type: self.match_type,
2276 source_id: self.source_id.clone(),
2277 origin_kind: self.origin_kind.clone(),
2278 origin_host: self.origin_host.clone(),
2279 }
2280 }
2281}
2282
/// Snapshot of the lexical phase of a progressive search, published by
/// `CassProgressiveLexicalAdapter::search`.
#[derive(Debug, Default)]
struct ProgressiveLexicalCache {
    /// Cached lexical hits keyed by resolved message id (first hit per message wins).
    hits_by_message: HashMap<u64, ProgressiveLexicalHit>,
    /// `wildcard_fallback` flag copied from the lexical search result.
    wildcard_fallback: bool,
    /// Suggestions copied from the lexical search result.
    suggestions: Vec<QuerySuggestion>,
}
2289
/// Borrowed, copyable inputs shared while building a progressive phase result.
#[derive(Clone, Copy)]
struct ProgressivePhaseContext<'a> {
    /// Query text.
    query: &'a str,
    /// Filters of the request.
    filters: &'a SearchFilters,
    /// Which hit fields the caller wants populated.
    field_mask: FieldMask,
    /// Lexical snapshot, when one is available.
    lexical_cache: Option<&'a ProgressiveLexicalCache>,
    /// Number of hits requested.
    limit: usize,
    /// Number of candidates fetched.
    // NOTE(review): relation to `limit` (over-fetch factor?) is decided by the caller — confirm.
    fetch_limit: usize,
}
2299
/// Shared, immutable handle to a `ProgressiveLexicalCache`.
type ProgressiveLexicalSnapshot = Arc<ProgressiveLexicalCache>;
2301
/// Adapter that lets frankensearch drive cass's lexical search through the
/// `FsLexicalSearch` trait.
struct CassProgressiveLexicalAdapter {
    /// Client that runs the lexical search and resolves doc ids.
    client: Arc<SearchClient>,
    /// Filters applied to every search issued through the adapter.
    filters: SearchFilters,
    /// Field mask passed to the search and used when caching hits.
    field_mask: FieldMask,
    /// Sparse threshold forwarded to `SearchClient::search_with_fallback`.
    sparse_threshold: usize,
    /// Slot where the latest lexical snapshot is published.
    shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
}

impl CassProgressiveLexicalAdapter {
    /// Creates an adapter from its parts; no work is done until `search` is called.
    fn new(
        client: Arc<SearchClient>,
        filters: SearchFilters,
        field_mask: FieldMask,
        sparse_threshold: usize,
        shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
    ) -> Self {
        Self {
            client,
            filters,
            field_mask,
            sparse_threshold,
            shared,
        }
    }
}
2327
/// Read-only `FsLexicalSearch` implementation backed by `SearchClient`.
impl FsLexicalSearch for CassProgressiveLexicalAdapter {
    /// Runs the lexical search for `query` and returns scored results keyed
    /// by resolved semantic doc id.
    ///
    /// Hits that cannot be resolved to a semantic doc id are skipped. As a
    /// side effect the shared snapshot is replaced with the hits, wildcard
    /// flag and suggestions of this search.
    ///
    /// # Errors
    /// `Cancelled` when cancellation was already requested on `cx`;
    /// `SubsystemError` when the search or the doc-id resolution fails.
    fn search<'a>(
        &'a self,
        cx: &'a FsCx,
        query: &'a str,
        limit: usize,
    ) -> FsSearchFuture<'a, Vec<FsScoredResult>> {
        Box::pin(async move {
            // Cancellation is checked once, before any work starts.
            if cx.is_cancel_requested() {
                return Err(FsSearchError::Cancelled {
                    phase: "lexical".to_string(),
                    reason: "cancel requested".to_string(),
                });
            }

            // Always fetch from offset 0; paging is not done at this layer.
            let result = self
                .client
                .search_with_fallback(
                    query,
                    self.filters.clone(),
                    limit,
                    0,
                    self.sparse_threshold,
                    self.field_mask,
                )
                .map_err(|err| FsSearchError::SubsystemError {
                    subsystem: "cass_lexical_adapter",
                    source: Box::new(std::io::Error::other(err.to_string())),
                })?;

            let resolved = self
                .client
                .resolve_semantic_doc_ids_for_hits(&result.hits)
                .map_err(|err| FsSearchError::SubsystemError {
                    subsystem: "cass_lexical_adapter",
                    source: Box::new(std::io::Error::other(err.to_string())),
                })?;

            let mut scored = Vec::with_capacity(result.hits.len());
            let mut hits_by_message = HashMap::with_capacity(result.hits.len());

            // Hits and their resolutions are paired positionally.
            for (hit, resolved_doc) in result.hits.iter().zip(resolved) {
                let Some(resolved_doc) = resolved_doc else {
                    continue;
                };
                // Keep the first hit seen for each message id.
                hits_by_message
                    .entry(resolved_doc.message_id)
                    .or_insert_with(|| {
                        ProgressiveLexicalHit::from_search_hit(hit, self.field_mask)
                    });
                scored.push(FsScoredResult {
                    doc_id: resolved_doc.doc_id,
                    score: hit.score,
                    source: FsScoreSource::Lexical,
                    index: None,
                    fast_score: None,
                    quality_score: None,
                    lexical_score: Some(hit.score),
                    rerank_score: None,
                    explanation: None,
                    metadata: None,
                });
            }

            // Publish the snapshot; if the mutex cannot be locked the previous snapshot stays.
            if let Ok(mut guard) = self.shared.lock() {
                *guard = Arc::new(ProgressiveLexicalCache {
                    hits_by_message,
                    wildcard_fallback: result.wildcard_fallback,
                    suggestions: result.suggestions,
                });
            }

            Ok(scored)
        })
    }

    /// Always fails: the adapter is read-only.
    fn index_document<'a>(
        &'a self,
        _cx: &'a FsCx,
        _doc: &'a frankensearch::IndexableDocument,
    ) -> FsSearchFuture<'a, ()> {
        Box::pin(async move {
            Err(FsSearchError::SubsystemError {
                subsystem: "cass_lexical_adapter",
                source: Box::new(std::io::Error::other("cass lexical adapter is read-only")),
            })
        })
    }

    /// No-op that always succeeds; there is nothing to commit.
    fn commit<'a>(&'a self, _cx: &'a FsCx) -> FsSearchFuture<'a, ()> {
        Box::pin(async move { Ok(()) })
    }

    /// Document count as reported by `SearchClient::total_docs`.
    fn doc_count(&self) -> usize {
        self.client.total_docs()
    }
}
2425
/// Search entry point bundling the lexical index reader, an optional SQLite
/// connection, result caches and optional semantic state.
// NOTE(review): field notes below are inferred from names and types only; the code that
// reads and writes them is outside this section — confirm before relying on them.
pub struct SearchClient {
    /// Index reader with its field handles, when a lexical index is open.
    reader: Option<(IndexReader, FsCassFields)>,
    /// SQLite connection slot.
    sqlite: Mutex<Option<SendConnection>>,
    /// Path of the SQLite database, when known.
    sqlite_path: Option<PathBuf>,
    /// Sharded prefix cache.
    prefix_cache: Mutex<CacheShards>,
    /// Whether the reader should be reloaded on search.
    reload_on_search: bool,
    /// Time of the last reload, if any.
    last_reload: Mutex<Option<Instant>>,
    /// Last observed generation, if any.
    last_generation: Mutex<Option<u64>>,
    /// Shared reload epoch counter.
    reload_epoch: Arc<AtomicU64>,
    /// Sender for warm-up jobs, when warming is enabled.
    warm_tx: Option<mpsc::Sender<WarmJob>>,
    /// Join handle of the warm-up thread, kept for ownership only.
    _warm_handle: Option<std::thread::JoinHandle<()>>,
    /// Client metrics.
    metrics: Metrics,
    /// Cache namespace; also the key removed from `FEDERATED_SEARCH_READERS` on drop.
    cache_namespace: String,
    /// Semantic search state, when configured.
    semantic: Mutex<Option<SemanticSearchState>>,
    /// Last total count recorded from the lexical index, if any.
    last_tantivy_total_count: Mutex<Option<usize>>,
}
2445
/// Switches controlling optional `SearchClient` behavior; both default to `true`.
#[derive(Debug, Clone, Copy)]
pub struct SearchClientOptions {
    /// Enables reader reloads.
    pub enable_reload: bool,
    /// Enables cache warming.
    pub enable_warm: bool,
}
2451
2452impl Default for SearchClientOptions {
2453 fn default() -> Self {
2454 Self {
2455 enable_reload: true,
2456 enable_warm: true,
2457 }
2458 }
2459}
2460
2461impl Drop for SearchClient {
2462 fn drop(&mut self) {
2463 FEDERATED_SEARCH_READERS
2464 .write()
2465 .remove(&self.cache_namespace);
2466 }
2467}
2468
/// Snapshot of cache counters and configuration reported with search results.
// NOTE(review): field notes are inferred from names; the code that fills them is outside
// this section — confirm units and exact meaning at the producer.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheStats {
    /// Number of cache hits.
    pub cache_hits: u64,
    /// Number of cache misses.
    pub cache_miss: u64,
    /// Number of cache shortfalls.
    pub cache_shortfall: u64,
    /// Number of reader reloads.
    pub reloads: u64,
    /// Total time spent reloading, in milliseconds.
    pub reload_ms_total: u128,
    /// Total entry capacity.
    pub total_cap: usize,
    /// Total cost currently held.
    pub total_cost: usize,
    /// Number of evictions.
    pub eviction_count: u64,
    /// Approximate bytes held by the cache.
    pub approx_bytes: usize,
    /// Byte capacity of the cache.
    pub byte_cap: usize,
    /// Label of the active eviction policy (`"unknown"` in `Default`).
    pub eviction_policy: &'static str,
    /// Number of ghost entries.
    pub ghost_entries: usize,
    /// Number of admission rejections.
    pub admission_rejects: u64,
    /// Number of prewarm jobs scheduled.
    pub prewarm_scheduled: u64,
    /// Number of prewarm jobs skipped because of cache pressure.
    pub prewarm_skipped_pressure: u64,
    /// Reader generation, when known.
    pub reader_generation: Option<u64>,
}
2497
impl Default for CacheStats {
    /// All counters zero, no reader generation, and the eviction policy
    /// labelled `"unknown"`.
    // Written by hand rather than derived because `eviction_policy` defaults
    // to "unknown", not the empty string.
    fn default() -> Self {
        Self {
            cache_hits: 0,
            cache_miss: 0,
            cache_shortfall: 0,
            reloads: 0,
            reload_ms_total: 0,
            total_cap: 0,
            total_cost: 0,
            eviction_count: 0,
            approx_bytes: 0,
            byte_cap: 0,
            eviction_policy: "unknown",
            ghost_entries: 0,
            admission_rejects: 0,
            prewarm_scheduled: 0,
            prewarm_skipped_pressure: 0,
            reader_generation: None,
        }
    }
}
2520
/// Per-shard cache capacity, read once from `CASS_CACHE_SHARD_CAP`.
/// Falls back to 256 when the variable is unset, unparsable or zero.
static CACHE_SHARD_CAP: Lazy<usize> = Lazy::new(|| {
    dotenvy::var("CASS_CACHE_SHARD_CAP")
        .ok()
        .and_then(|v| v.parse::<usize>().ok())
        .filter(|v| *v > 0)
        .unwrap_or(256)
});

/// Total cache capacity, read once from `CASS_CACHE_TOTAL_CAP`.
/// Falls back to 2048 when the variable is unset, unparsable or zero.
static CACHE_TOTAL_CAP: Lazy<usize> = Lazy::new(|| {
    dotenvy::var("CASS_CACHE_TOTAL_CAP")
        .ok()
        .and_then(|v| v.parse::<usize>().ok())
        .filter(|v| *v > 0)
        .unwrap_or(2048)
});

/// Cache-metrics debug switch, read once from `CASS_DEBUG_CACHE_METRICS`.
/// Enabled only for the values `1` or `true` (any casing); off otherwise.
static CACHE_DEBUG_ENABLED: Lazy<bool> = Lazy::new(|| {
    dotenvy::var("CASS_DEBUG_CACHE_METRICS")
        .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
        .unwrap_or(false)
});

/// Cache byte budget, read once from `CASS_CACHE_BYTE_CAP`.
/// A set value goes through `cache_byte_cap_from_env_value`; an unset
/// variable uses `default_cache_byte_cap`.
static CACHE_BYTE_CAP: Lazy<usize> = Lazy::new(|| match dotenvy::var("CASS_CACHE_BYTE_CAP") {
    Ok(value) => cache_byte_cap_from_env_value(Some(&value), available_memory_bytes()),
    Err(_) => default_cache_byte_cap(),
});

/// Cache eviction policy, read once from `CASS_CACHE_EVICTION_POLICY` and
/// interpreted by `cache_eviction_policy_from_env_value`.
static CACHE_EVICTION_POLICY: Lazy<CacheEvictionPolicy> = Lazy::new(|| {
    cache_eviction_policy_from_env_value(dotenvy::var("CASS_CACHE_EVICTION_POLICY").ok().as_deref())
});
2556
/// Byte cap used when available memory is unknown; also the lower bound of the
/// memory-derived default (64 MiB).
const DEFAULT_CACHE_BYTE_CAP_FALLBACK: usize = 64 * 1024 * 1024;
/// The default byte cap is `available_memory / 128` before clamping.
const DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR: u64 = 128;
/// Upper bound of the memory-derived default byte cap (2 GiB).
const DEFAULT_CACHE_BYTE_CAP_CEILING: u64 = 2 * 1024 * 1024 * 1024;
/// The ghost list may hold up to `total_cap * 2` keys before the oldest is dropped.
const S3_FIFO_GHOST_CAP_MULTIPLIER: usize = 2;
/// Under S3-FIFO an entry counts as "large" when it exceeds a quarter
/// (rounded up) of the entry cap or of the byte cap.
const S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR: usize = 4;
/// Entry pressure is reported once `total_cost` reaches 9/10 of `total_cap`.
const PREWARM_ENTRY_PRESSURE_NUMERATOR: usize = 9;
const PREWARM_ENTRY_PRESSURE_DENOMINATOR: usize = 10;
/// Byte pressure is reported once `total_bytes` reaches 4/5 of `byte_cap`.
const PREWARM_BYTE_PRESSURE_NUMERATOR: usize = 4;
const PREWARM_BYTE_PRESSURE_DENOMINATOR: usize = 5;

/// Version tag embedded at the front of every client's cache namespace string.
const CACHE_KEY_VERSION: &str = "1";
2568
/// Debounce interval in milliseconds, read once from `CASS_WARM_DEBOUNCE_MS`;
/// a missing, unparsable, or zero value falls back to `120`.
// NOTE(review): presumably consumed by the warm worker to coalesce warm
// requests — the consumer is not visible here, confirm at the use site.
static WARM_DEBOUNCE_MS: Lazy<u64> = Lazy::new(|| {
    dotenvy::var("CASS_WARM_DEBOUNCE_MS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .filter(|v| *v > 0)
        .unwrap_or(120)
});
2577
2578fn default_cache_byte_cap() -> usize {
2579 default_cache_byte_cap_for_available(available_memory_bytes())
2580}
2581
2582fn cache_byte_cap_from_env_value(value: Option<&str>, available_bytes: Option<u64>) -> usize {
2583 let Some(raw) = value else {
2584 return default_cache_byte_cap_for_available(available_bytes);
2585 };
2586 raw.parse::<usize>()
2587 .unwrap_or_else(|_| default_cache_byte_cap_for_available(available_bytes))
2588}
2589
2590fn default_cache_byte_cap_for_available(available_bytes: Option<u64>) -> usize {
2591 let Some(available_bytes) = available_bytes else {
2592 return DEFAULT_CACHE_BYTE_CAP_FALLBACK;
2593 };
2594 let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
2595 let budget = available_bytes / DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR;
2596 let budget = budget.min(DEFAULT_CACHE_BYTE_CAP_CEILING);
2597 let budget = usize::try_from(budget).unwrap_or(ceiling);
2598 budget.clamp(DEFAULT_CACHE_BYTE_CAP_FALLBACK, ceiling)
2599}
2600
/// Eviction/admission strategy used by `CacheShards`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CacheEvictionPolicy {
    /// Every entry is admitted and no ghost keys are recorded.
    Lru,
    /// Large first-time entries are rejected unless their key is on the ghost
    /// list; rejected and evicted keys are remembered as ghosts.
    S3Fifo,
}
2606
2607impl CacheEvictionPolicy {
2608 fn label(self) -> &'static str {
2609 match self {
2610 CacheEvictionPolicy::Lru => "lru",
2611 CacheEvictionPolicy::S3Fifo => "s3-fifo",
2612 }
2613 }
2614}
2615
2616fn cache_eviction_policy_from_env_value(value: Option<&str>) -> CacheEvictionPolicy {
2617 match value.map(str::trim).filter(|value| !value.is_empty()) {
2618 Some(value) if value.eq_ignore_ascii_case("s3-fifo") => CacheEvictionPolicy::S3Fifo,
2619 Some(value) if value.eq_ignore_ascii_case("s3fifo") => CacheEvictionPolicy::S3Fifo,
2620 Some(value) if value.eq_ignore_ascii_case("s3_fifo") => CacheEvictionPolicy::S3Fifo,
2621 _ => CacheEvictionPolicy::Lru,
2622 }
2623}
2624
/// One search hit as stored in the prefix cache, plus precomputed data kept
/// alongside it.
#[derive(Clone)]
struct CachedHit {
    /// The hit that is cloned and returned to callers on a cache hit.
    hit: SearchHit,
    // NOTE(review): presumably a lowercased copy of the hit content used when
    // re-filtering cached hits — confirm where `CachedHit` is built.
    lc_content: String,
    // NOTE(review): presumably the lowercased title, when one exists — confirm.
    lc_title: Option<String>,
    // NOTE(review): looks like a 64-bit bloom-style fingerprint for fast
    // rejection during cached matching — confirm against the matcher.
    bloom64: u64,
}
2632
2633impl CachedHit {
2634 fn approx_bytes(&self) -> usize {
2637 let base = std::mem::size_of::<Self>();
2639 let hit_strings = self.hit.title.len()
2641 + self.hit.snippet.len()
2642 + self.hit.content.len()
2643 + self.hit.source_path.len()
2644 + self.hit.agent.len()
2645 + self.hit.workspace.len()
2646 + self
2647 .hit
2648 .workspace_original
2649 .as_ref()
2650 .map_or(0, std::string::String::len)
2651 + self.hit.source_id.len()
2652 + self.hit.origin_kind.len()
2653 + self
2654 .hit
2655 .origin_host
2656 .as_ref()
2657 .map_or(0, std::string::String::len);
2658 let lc_strings =
2660 self.lc_content.len() + self.lc_title.as_ref().map_or(0, std::string::String::len);
2661 base + hit_strings + lc_strings
2662 }
2663}
2664
/// Sharded prefix cache with a global entry budget, a global byte budget, and
/// optional S3-FIFO-style admission control.
struct CacheShards {
    /// Shard name → LRU map from cache key to the cached hits for that key.
    shards: HashMap<Arc<str>, LruCache<Arc<str>, Vec<CachedHit>>>,
    /// Upper bound for `total_cost`; always at least 1.
    total_cap: usize,
    /// Running sum of the number of hits stored across all shards.
    total_cost: usize,
    /// Number of entries evicted so far; not reset by `clear`.
    eviction_count: u64,
    /// Running sum of `CachedHit::approx_bytes` over all stored hits.
    total_bytes: usize,
    /// Upper bound for `total_bytes`; `0` disables byte-based limits.
    byte_cap: usize,
    /// Admission/eviction policy in effect for this cache.
    policy: CacheEvictionPolicy,
    /// Ghost keys in insertion order (oldest at the front); S3-FIFO only.
    ghost_keys: VecDeque<Arc<str>>,
    /// Membership set mirroring `ghost_keys`.
    ghost_set: HashSet<Arc<str>>,
    /// Number of inserts refused by the admission check.
    admission_rejects: u64,
}
2683
2684impl CacheShards {
2685 fn new(total_cap: usize, byte_cap: usize) -> Self {
2686 Self::new_with_policy(total_cap, byte_cap, *CACHE_EVICTION_POLICY)
2687 }
2688
2689 fn new_with_policy(total_cap: usize, byte_cap: usize, policy: CacheEvictionPolicy) -> Self {
2690 Self {
2691 shards: HashMap::new(),
2692 total_cap: total_cap.max(1),
2693 total_cost: 0,
2694 eviction_count: 0,
2695 total_bytes: 0,
2696 byte_cap,
2697 policy,
2698 ghost_keys: VecDeque::new(),
2699 ghost_set: HashSet::new(),
2700 admission_rejects: 0,
2701 }
2702 }
2703
2704 fn shard_mut(&mut self, name: &str) -> &mut LruCache<Arc<str>, Vec<CachedHit>> {
2705 let interned_name = intern_cache_key(name);
2707 self.shards
2708 .entry(interned_name)
2709 .or_insert_with(|| LruCache::new(NonZeroUsize::new(*CACHE_SHARD_CAP).unwrap()))
2710 }
2711
2712 fn shard_opt(&self, name: &str) -> Option<&LruCache<Arc<str>, Vec<CachedHit>>> {
2713 self.shards.get(name)
2715 }
2716
2717 fn put(&mut self, shard_name: &str, key: Arc<str>, value: Vec<CachedHit>) {
2718 let new_cost = value.len();
2719 let new_bytes: usize = value.iter().map(CachedHit::approx_bytes).sum();
2720 let replacing = self
2721 .shard_opt(shard_name)
2722 .is_some_and(|shard| shard.contains(&key));
2723
2724 if !replacing && !self.should_admit(&key, new_cost, new_bytes) {
2725 self.admission_rejects += 1;
2726 self.record_ghost(key);
2727 return;
2728 }
2729
2730 self.remove_ghost(&key);
2731
2732 let shard = self.shard_mut(shard_name);
2733 let old_val = shard.put(key, value);
2734 let (old_cost, old_bytes) = old_val.as_ref().map_or((0, 0), |v| {
2735 (v.len(), v.iter().map(CachedHit::approx_bytes).sum())
2736 });
2737
2738 self.total_cost = self
2739 .total_cost
2740 .saturating_add(new_cost)
2741 .saturating_sub(old_cost);
2742 self.total_bytes = self
2743 .total_bytes
2744 .saturating_add(new_bytes)
2745 .saturating_sub(old_bytes);
2746 self.evict_until_within_cap();
2747 }
2748
2749 fn evict_until_within_cap(&mut self) {
2750 while self.total_cost > self.total_cap
2752 || (self.byte_cap > 0 && self.total_bytes > self.byte_cap)
2753 {
2754 let byte_pressure = self.byte_cap > 0 && self.total_bytes > self.byte_cap;
2759 let mut largest_shard_key = None;
2760 let mut max_score = 0usize;
2761 for (k, v) in self.shards.iter() {
2762 let score = if byte_pressure {
2763 shard_cached_bytes(v)
2764 } else {
2765 v.len()
2766 };
2767 if score > max_score {
2768 max_score = score;
2769 largest_shard_key = Some(k.clone());
2770 }
2771 }
2772
2773 if let Some(key) = largest_shard_key {
2774 if let Some(shard) = self.shards.get_mut(&key)
2775 && let Some((evicted_key, v)) = shard.pop_lru()
2776 {
2777 let evicted_bytes: usize = v.iter().map(CachedHit::approx_bytes).sum();
2778 self.total_cost = self.total_cost.saturating_sub(v.len());
2779 self.total_bytes = self.total_bytes.saturating_sub(evicted_bytes);
2780 self.eviction_count += 1;
2781 self.record_ghost(evicted_key);
2782 }
2783 } else {
2784 break; }
2786 }
2787 }
2788
2789 fn should_admit(&self, key: &Arc<str>, cost: usize, bytes: usize) -> bool {
2790 if self.policy == CacheEvictionPolicy::Lru || self.ghost_set.contains(key) {
2791 return true;
2792 }
2793 !self.is_s3_fifo_large_candidate(cost, bytes)
2794 }
2795
2796 fn is_s3_fifo_large_candidate(&self, cost: usize, bytes: usize) -> bool {
2797 let entry_heavy = cost
2798 > self
2799 .total_cap
2800 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2801 let byte_heavy = self.byte_cap > 0
2802 && bytes
2803 > self
2804 .byte_cap
2805 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2806 entry_heavy || byte_heavy
2807 }
2808
2809 fn record_ghost(&mut self, key: Arc<str>) {
2810 if self.policy != CacheEvictionPolicy::S3Fifo {
2811 return;
2812 }
2813 if self.ghost_set.insert(key.clone()) {
2814 self.ghost_keys.push_back(key);
2815 }
2816 let cap = self
2817 .total_cap
2818 .saturating_mul(S3_FIFO_GHOST_CAP_MULTIPLIER)
2819 .max(1);
2820 while self.ghost_set.len() > cap {
2821 if let Some(old) = self.ghost_keys.pop_front() {
2822 self.ghost_set.remove(&old);
2823 } else {
2824 break;
2825 }
2826 }
2827 }
2828
2829 fn remove_ghost(&mut self, key: &Arc<str>) {
2830 self.ghost_set.remove(key);
2831 self.ghost_keys.retain(|candidate| candidate != key);
2832 }
2833
2834 fn clear(&mut self) {
2835 self.shards.clear();
2836 self.total_cost = 0;
2837 self.total_bytes = 0;
2838 self.ghost_keys.clear();
2839 self.ghost_set.clear();
2840 }
2842
2843 fn total_cost(&self) -> usize {
2844 self.total_cost
2845 }
2846
2847 fn total_cap(&self) -> usize {
2848 self.total_cap
2849 }
2850
2851 fn eviction_count(&self) -> u64 {
2852 self.eviction_count
2853 }
2854
2855 fn total_bytes(&self) -> usize {
2856 self.total_bytes
2857 }
2858
2859 fn byte_cap(&self) -> usize {
2860 self.byte_cap
2861 }
2862
2863 fn policy_label(&self) -> &'static str {
2864 self.policy.label()
2865 }
2866
2867 fn ghost_entries(&self) -> usize {
2868 self.ghost_set.len()
2869 }
2870
2871 fn admission_rejects(&self) -> u64 {
2872 self.admission_rejects
2873 }
2874
2875 fn prewarm_pressure(&self) -> bool {
2876 let entry_pressure = self
2877 .total_cost
2878 .saturating_mul(PREWARM_ENTRY_PRESSURE_DENOMINATOR)
2879 >= self
2880 .total_cap
2881 .saturating_mul(PREWARM_ENTRY_PRESSURE_NUMERATOR);
2882 let byte_pressure = self.byte_cap > 0
2883 && self
2884 .total_bytes
2885 .saturating_mul(PREWARM_BYTE_PRESSURE_DENOMINATOR)
2886 >= self
2887 .byte_cap
2888 .saturating_mul(PREWARM_BYTE_PRESSURE_NUMERATOR);
2889 entry_pressure || byte_pressure
2890 }
2891}
2892
2893fn shard_cached_bytes(shard: &LruCache<Arc<str>, Vec<CachedHit>>) -> usize {
2894 shard
2895 .iter()
2896 .map(|(_key, hits)| hits.iter().map(CachedHit::approx_bytes).sum::<usize>())
2897 .sum()
2898}
2899
/// A unit of warm-up work described by a query, a filter fingerprint, and a
/// cache shard name.
// NOTE(review): presumably sent to the background warm worker over `warm_tx`
// — the producer and consumer are not visible here, confirm.
#[derive(Clone)]
struct WarmJob {
    /// Query text to warm.
    query: String,
    // NOTE(review): presumably a stable string digest of the active search
    // filters — confirm how it is produced.
    filters_fingerprint: String,
    /// Name of the cache shard the job relates to.
    shard_name: String,
}
2906
/// Outcome of deciding whether to pre-warm for a query.
// NOTE(review): variant meanings below are inferred from their names — confirm
// against the code that produces this value.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AdaptivePrewarmDecision {
    /// Pre-warming should be scheduled.
    Schedule,
    /// Skipped; presumably because the query is not considered hot enough.
    SkipCold,
    /// Skipped; presumably because the cache reports pressure.
    SkipPressure,
}
2913
/// A `Searcher` stored together with the identifying values it was cached under.
// NOTE(review): presumably the entry is reused only while `epoch` and
// `reader_key` still match the current reader state — confirm in
// `searcher_for_thread`.
#[derive(Clone)]
struct SearcherCacheEntry {
    /// Epoch value recorded with the searcher.
    epoch: u64,
    /// Key identifying the reader the searcher came from.
    reader_key: usize,
    /// The cached searcher.
    searcher: Searcher,
}
2920
thread_local! {
    /// Per-thread slot holding at most one cached searcher entry; starts empty.
    static THREAD_SEARCHER: RefCell<Option<SearcherCacheEntry>> = const { RefCell::new(None) };
}
2924
/// One member of a federated (multi-index) search: an index reader paired
/// with the field handles for that index.
#[derive(Clone)]
struct FederatedIndexReader {
    /// Reader for this index.
    reader: IndexReader,
    /// Field handles belonging to this index.
    fields: FsCassFields,
}
2930
/// Process-wide registry of federated reader sets, keyed by a client's cache
/// namespace string.
static FEDERATED_SEARCH_READERS: Lazy<RwLock<HashMap<String, Arc<Vec<FederatedIndexReader>>>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
/// Source of per-client ids; starts at 1 and is bumped once for each
/// `SearchClient::open_with_options` call.
static SEARCH_CLIENT_INSTANCE_COUNTER: AtomicU64 = AtomicU64::new(1);
2934
/// Levenshtein edit distance between `a` and `b`, counted in Unicode scalar
/// values (`char`s): the minimum number of single-character insertions,
/// deletions, and substitutions that turn one string into the other.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    // Distance to or from the empty string is the other string's length.
    match (source.is_empty(), target.is_empty()) {
        (true, _) => return target.len(),
        (_, true) => return source.len(),
        _ => {}
    }

    // Two-row dynamic programme: `above` is the finished previous row and
    // `row` is the one being filled in.
    let width = target.len() + 1;
    let mut above: Vec<usize> = (0..width).collect();
    let mut row: Vec<usize> = vec![0; width];

    for (row_idx, &src_ch) in source.iter().enumerate() {
        row[0] = row_idx + 1;
        for (col_idx, &dst_ch) in target.iter().enumerate() {
            let substitution = above[col_idx] + usize::from(src_ch != dst_ch);
            let deletion = above[col_idx + 1] + 1;
            let insertion = row[col_idx] + 1;
            row[col_idx + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut above, &mut row);
    }

    // After the final swap the completed row lives in `above`.
    above[target.len()]
}
2967
2968fn normalize_term_parts(raw: &str) -> Vec<String> {
2973 let mut parts = Vec::new();
2974 for token in nfc_sanitize_query(raw).split_whitespace() {
2975 let mut current = String::new();
2976 let mut chars = token.chars().peekable();
2977 while let Some(ch) = chars.next() {
2978 let trailing_wildcard = ch == '*' && chars.peek().is_none() && !current.is_empty();
2979 if ch.is_alphanumeric() || ch == '_' || trailing_wildcard {
2980 current.push(ch);
2981 continue;
2982 }
2983
2984 if !current.is_empty() {
2985 parts.push(std::mem::take(&mut current));
2986 }
2987 }
2988
2989 if !current.is_empty() {
2990 parts.push(current);
2991 }
2992 }
2993 parts
2994}
2995
2996fn normalize_phrase_terms(raw: &str) -> Vec<String> {
2998 normalize_term_parts(raw)
2999 .into_iter()
3000 .map(|s| s.trim_matches('*').to_lowercase())
3001 .filter(|s| !s.is_empty())
3002 .collect()
3003}
3004
3005fn render_fts5_term_part(part: &str) -> Option<String> {
3006 let pattern = FsCassWildcardPattern::parse(part);
3007 if matches!(
3008 pattern,
3009 FsCassWildcardPattern::Suffix(_)
3010 | FsCassWildcardPattern::Substring(_)
3011 | FsCassWildcardPattern::Complex(_)
3012 ) {
3013 return None;
3014 }
3015
3016 Some(part.to_string())
3017}
3018
3019fn dominant_match_type(query: &str) -> MatchType {
3022 let mut worst = MatchType::Exact;
3023 for term in query.split_whitespace() {
3024 let pattern = FsCassWildcardPattern::parse(term);
3025 let mt = match pattern {
3026 FsCassWildcardPattern::Exact(_) => MatchType::Exact,
3027 FsCassWildcardPattern::Prefix(_) => MatchType::Prefix,
3028 FsCassWildcardPattern::Suffix(_) => MatchType::Suffix,
3029 FsCassWildcardPattern::Substring(_) => MatchType::Substring,
3030 FsCassWildcardPattern::Complex(_) => MatchType::Wildcard,
3031 };
3032 if mt.quality_factor() < worst.quality_factor() {
3034 worst = mt;
3035 }
3036 }
3037 worst
3038}
3039
/// Returns `true` when `content` is a bare tool-invocation marker with no
/// useful text.
///
/// After trimming:
/// * text starting with `[Tool:` is noise when the bracket is never closed,
///   or when the bracket body is blank and nothing follows the `]`;
/// * otherwise, text shorter than 20 bytes is noise when it starts with
///   `[tool` or `tool:` (case-insensitive).
pub(crate) fn is_tool_invocation_noise(content: &str) -> bool {
    const TOOL_MARKER: &str = "[Tool:";
    let text = content.trim();

    if let Some(rest) = text.strip_prefix(TOOL_MARKER) {
        return match rest.find(']') {
            // Unterminated marker: nothing meaningful can follow.
            None => true,
            Some(close) => {
                let inside = &rest[..close];
                let trailing = &rest[close + 1..];
                trailing.trim().is_empty() && inside.trim().is_empty()
            }
        };
    }

    text.len() < 20 && {
        let lowered = text.to_lowercase();
        lowered.starts_with("[tool") || lowered.starts_with("tool:")
    }
}
3076
3077fn hit_content_for_noise_check(hit: &SearchHit) -> &str {
3078 if hit.content.is_empty() {
3079 &hit.snippet
3080 } else {
3081 &hit.content
3082 }
3083}
3084
3085fn hit_is_noise(hit: &SearchHit, query: &str) -> bool {
3086 let content_to_check = hit_content_for_noise_check(hit);
3087 if content_to_check.is_empty() {
3097 return false;
3098 }
3099 is_search_noise_text(content_to_check, query) || is_tool_invocation_noise(content_to_check)
3100}
3101
/// Builds a preview of `content`: the trimmed text, cut to its first 200
/// characters with `...` appended when anything was cut off.
fn snippet_from_content(content: &str) -> String {
    let body = content.trim();
    // Byte offset of the 201st character, if the text is that long.
    match body.char_indices().nth(200) {
        Some((cut, _)) => format!("{}...", &body[..cut]),
        None => body.to_string(),
    }
}
3112
/// Test-only convenience: deduplicates hits with an empty query.
#[cfg(test)]
pub(crate) fn deduplicate_hits(hits: Vec<SearchHit>) -> Vec<SearchHit> {
    let no_query = "";
    deduplicate_hits_with_query(hits, no_query)
}
3124
3125pub(crate) fn deduplicate_hits_with_query(hits: Vec<SearchHit>, query: &str) -> Vec<SearchHit> {
3126 let mut source_ids: HashMap<String, u32> = HashMap::new();
3133 let mut path_ids: HashMap<String, u32> = HashMap::new();
3134 let mut title_ids: HashMap<String, u32> = HashMap::new();
3135 let mut next_source_id: u32 = 0;
3136 let mut next_path_id: u32 = 0;
3137 let mut next_title_id: u32 = 0;
3138 type DedupKey = (
3139 u32,
3140 u32,
3141 Option<i64>,
3142 Option<u32>,
3143 Option<usize>,
3144 Option<i64>,
3145 u64,
3146 );
3147
3148 let mut seen: HashMap<DedupKey, usize> = HashMap::new();
3149 let mut deduped: Vec<SearchHit> = Vec::new();
3150
3151 for hit in hits {
3152 if hit_is_noise(&hit, query) {
3153 continue;
3154 }
3155
3156 let normalized_source_id = normalized_search_hit_source_id(&hit);
3159 let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
3160 *id
3161 } else {
3162 let id = next_source_id;
3163 next_source_id = next_source_id.saturating_add(1);
3164 source_ids.insert(normalized_source_id, id);
3165 id
3166 };
3167 let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
3168 *id
3169 } else {
3170 let id = next_path_id;
3171 next_path_id = next_path_id.saturating_add(1);
3172 path_ids.insert(hit.source_path.clone(), id);
3173 id
3174 };
3175 let title_key = if hit.conversation_id.is_some() {
3176 None
3177 } else {
3178 let normalized_title = hit.title.trim();
3179 Some(if let Some(id) = title_ids.get(normalized_title) {
3180 *id
3181 } else {
3182 let id = next_title_id;
3183 next_title_id = next_title_id.saturating_add(1);
3184 title_ids.insert(normalized_title.to_string(), id);
3185 id
3186 })
3187 };
3188 let key = (
3189 source_key,
3190 path_key,
3191 hit.conversation_id,
3192 title_key,
3193 hit.line_number,
3194 hit.created_at,
3195 hit.content_hash,
3196 );
3197
3198 if let Some(&existing_idx) = seen.get(&key) {
3199 if deduped[existing_idx].score < hit.score {
3201 deduped[existing_idx] = hit;
3202 }
3203 } else {
3205 seen.insert(key, deduped.len());
3206 deduped.push(hit);
3207 }
3208 }
3209
3210 deduped
3211}
3212
/// Decides whether a sparse result set warrants a wildcard fallback query.
///
/// Only the first page (`offset == 0`) qualifies. Results are sparse when
/// fewer than `sparse_threshold` hits came back; a non-zero `limit` lowers
/// that threshold to at most `limit`.
fn should_try_wildcard_fallback(
    returned_hits: usize,
    limit: usize,
    offset: usize,
    sparse_threshold: usize,
) -> bool {
    if offset != 0 {
        return false;
    }

    let threshold = match limit {
        0 => sparse_threshold,
        requested => sparse_threshold.min(requested),
    };
    returned_hits < threshold
}
3231
3232fn should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(
3233 query: &str,
3234 returned_hits: usize,
3235) -> bool {
3236 if returned_hits != 0 {
3237 return false;
3238 }
3239
3240 for token in normalize_phrase_terms(query) {
3241 if token.chars().count() > AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS {
3242 return true;
3243 }
3244 }
3245
3246 false
3247}
3248
3249fn snippet_from_preview_without_full_content(
3250 field_mask: FieldMask,
3251 stored_preview: &str,
3252 query: &str,
3253) -> Option<String> {
3254 if field_mask.needs_content() || !field_mask.wants_snippet() || stored_preview.is_empty() {
3255 return None;
3256 }
3257
3258 cached_prefix_snippet(stored_preview, query, 160)
3259}
3260
/// Returns `true` when the stored preview is non-empty and does not end with
/// the ellipsis character `…`.
fn stored_preview_is_complete_content(stored_preview: &str) -> bool {
    let truncated_or_missing = stored_preview.is_empty() || stored_preview.ends_with('…');
    !truncated_or_missing
}
3266
3267impl SearchClient {
3268 pub fn open(index_path: &Path, db_path: Option<&Path>) -> Result<Option<Self>> {
3269 Self::open_with_options(index_path, db_path, SearchClientOptions::default())
3270 }
3271
    /// Opens a search client over the index at `index_path`, optionally backed
    /// by the SQLite database at `db_path`.
    ///
    /// Backends are probed in this order: a single Tantivy reader, then (only
    /// if that fails) a set of federated readers, then the SQLite file if it
    /// exists on disk. Returns `Ok(None)` when none of the three is available.
    ///
    /// Side effects: bumps the global client-id counter, may spawn the warm
    /// worker, and updates the global `FEDERATED_SEARCH_READERS` registry under
    /// this client's cache namespace.
    pub fn open_with_options(
        index_path: &Path,
        db_path: Option<&Path>,
        options: SearchClientOptions,
    ) -> Result<Option<Self>> {
        // A failure to open the primary index is not fatal; it only selects
        // the fallback backends below.
        let tantivy = fs_cass_open_search_reader(index_path, ReloadPolicy::Manual).ok();
        let client_id = SEARCH_CLIENT_INSTANCE_COUNTER.fetch_add(1, Ordering::Relaxed);
        // The namespace embeds the per-process client id, so two clients never
        // share a namespace even when they open the same index path.
        let cache_namespace = format!(
            "v{}|schema:{}|client:{}|index:{}",
            CACHE_KEY_VERSION,
            FS_CASS_SCHEMA_HASH,
            client_id,
            index_path.display()
        );
        // Federated readers are only attempted when the single reader is
        // unavailable; errors and empty reader sets both collapse to `None`.
        let federated_readers = if tantivy.is_none() {
            crate::search::tantivy::open_federated_search_readers(index_path, ReloadPolicy::Manual)
                .ok()
                .flatten()
                .filter(|readers| !readers.is_empty())
                .map(|readers| {
                    Arc::new(
                        readers
                            .into_iter()
                            .map(|(reader, fields)| FederatedIndexReader { reader, fields })
                            .collect::<Vec<_>>(),
                    )
                })
        } else {
            None
        };

        // Only keep the SQLite path when the file actually exists.
        let sqlite_path = db_path.map(Path::to_path_buf).filter(|path| path.exists());

        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_some() {
            tracing::warn!(
                index_path = %index_path.display(),
                "Tantivy search index not found or incompatible. \
                 Search results will be degraded. \
                 Run `cass index --full` to rebuild the index."
            );
        }

        // No backend at all: signal "nothing to search" rather than an error.
        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_none() {
            return Ok(None);
        }

        let reload_epoch = Arc::new(AtomicU64::new(0));
        let metrics = Metrics::default();

        // The warm worker is only started for the single-reader backend and
        // only when enabled in the options.
        let warm_pair = if options.enable_warm
            && let Some((reader, fields)) = &tantivy
        {
            maybe_spawn_warm_worker(
                reader.clone(),
                *fields,
                reload_epoch.clone(),
                metrics.clone(),
            )
        } else {
            None
        };

        // Publish (or clear) this client's federated readers in the global
        // registry under its namespace.
        if let Some(readers) = &federated_readers {
            FEDERATED_SEARCH_READERS
                .write()
                .insert(cache_namespace.clone(), Arc::clone(readers));
        } else {
            FEDERATED_SEARCH_READERS.write().remove(&cache_namespace);
        }

        Ok(Some(Self {
            reader: tantivy,
            // The SQLite connection is opened lazily by `sqlite_guard`.
            sqlite: Mutex::new(None),
            sqlite_path,
            prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
            reload_on_search: options.enable_reload,
            last_reload: Mutex::new(None),
            last_generation: Mutex::new(None),
            reload_epoch,
            warm_tx: warm_pair.as_ref().map(|(tx, _)| tx.clone()),
            _warm_handle: warm_pair.map(|(_, h)| h),
            metrics,
            cache_namespace,
            semantic: Mutex::new(None),
            last_tantivy_total_count: Mutex::new(None),
        }))
    }
3359
3360 fn sqlite_guard(&self) -> Result<std::sync::MutexGuard<'_, Option<SendConnection>>> {
3361 let mut guard = self
3362 .sqlite
3363 .lock()
3364 .map_err(|_| anyhow!("sqlite lock poisoned"))?;
3365
3366 if guard.is_none()
3367 && let Some(path) = &self.sqlite_path
3368 {
3369 match open_search_hydration_sqlite(path, std::time::Duration::from_secs(1)) {
3370 Ok(conn) => {
3371 *guard = Some(SendConnection(conn));
3372 }
3373 Err(err) => {
3374 tracing::debug!(
3375 error = %err,
3376 path = %path.display(),
3377 "readonly sqlite open failed for search client"
3378 );
3379 }
3380 }
3381 }
3382
3383 Ok(guard)
3384 }
3385
    /// Runs a lexical search and returns one page of hits.
    ///
    /// Flow:
    /// 1. NFC-normalize and sanitize the query; resolve `limit == 0` to
    ///    "all documents", bounded by `no_limit_result_cap()` and at least 1.
    /// 2. Refresh the reader (single or federated) and record its generation.
    /// 3. For first-page queries without `*` or boolean operators, try the
    ///    prefix cache and return straight from it when it holds at least
    ///    `limit` matching hits.
    /// 4. Otherwise query the first available backend — single Tantivy reader,
    ///    federated readers, then SQLite FTS5 — always from offset 0 with an
    ///    over-fetched limit; `offset`/`limit` are applied by
    ///    `postprocess_hits_page`.
    ///
    /// When a Tantivy backend (single or federated) exists and returns zero
    /// hits, an empty result is returned without consulting SQLite.
    ///
    /// # Errors
    /// Propagates failures from reader reloads and backend searches, and
    /// fails when the SQLite mutex is poisoned.
    pub fn search(
        &self,
        query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<Vec<SearchHit>> {
        use unicode_normalization::UnicodeNormalization;
        // Normalize to NFC first; every later step works on the normalized text.
        let query: String = query.nfc().collect();
        let query: &str = &query;
        let sanitized = nfc_sanitize_query(query);
        let field_mask = effective_field_mask(field_mask);
        // `limit == 0` means "no limit": use the document count, bounded by the
        // global no-limit cap, and never less than 1.
        let limit = if limit == 0 {
            self.total_docs().min(no_limit_result_cap()).max(1)
        } else {
            limit
        };
        // The cache is only consulted/filled when the mask allows caching and
        // the caller wants content or a snippet.
        let can_use_cache =
            field_mask.allows_cache() && (field_mask.needs_content() || field_mask.wants_snippet());

        // Refresh the active reader(s) and record the generation in use.
        if let Some((reader, _)) = &self.reader {
            self.maybe_reload_reader(reader)?;
            let searcher = self.searcher_for_thread(reader);
            self.track_generation(searcher.generation().generation_id());
        } else if let Some(readers) = self.federated_readers()
            && let Some(signature) = self.maybe_reload_federated_readers(readers.as_ref())?
        {
            self.track_generation(signature);
        }

        // Cache fast path: first page only, and only for queries without
        // wildcards or boolean operators.
        if can_use_cache
            && offset == 0
            && !query.contains('*')
            && !fs_cass_has_boolean_operators(query)
        {
            self.maybe_schedule_adaptive_query_prewarm(&sanitized, &filters);
            if let Some(cached) = self.cached_prefix_hits(&sanitized, &filters) {
                let query_terms = QueryTermsLower::from_query(&sanitized);
                // Re-filter the cached hits against the current query terms.
                let mut filtered: Vec<SearchHit> = cached
                    .into_iter()
                    .filter(|h| hit_matches_query_cached_precomputed(h, &query_terms))
                    .map(|c| c.hit.clone())
                    .collect();
                if filtered.len() >= limit {
                    filtered.truncate(limit);
                    self.metrics.inc_cache_hits();
                    self.maybe_log_cache_metrics("hit");
                    return Ok(filtered);
                }
                // The cache had an entry but too few matching hits; fall
                // through to a real search.
                self.metrics.inc_cache_shortfall();
                self.maybe_log_cache_metrics("shortfall");
            } else {
                self.metrics.inc_cache_miss();
                self.maybe_log_cache_metrics("miss");
            }
        }

        // Backends are always queried from offset 0, so enough rows must be
        // fetched to cover `offset + limit` after post-processing.
        let target_hits = offset.saturating_add(limit);
        // Over-fetch: 2x for small pages, 1.5x (rounded up) for larger ones.
        let initial_fetch_limit = if target_hits <= 16 {
            target_hits.saturating_mul(2)
        } else {
            target_hits.saturating_mul(3).div_ceil(2)
        };
        let session_path_filter_active = !filters.session_paths.is_empty();
        // Retry/SQLite fetch size: 3x the target, widened to the (capped)
        // document count when session-path filters are active.
        let fallback_fetch_limit = if session_path_filter_active {
            self.total_docs()
                .min(no_limit_result_cap())
                .max(target_hits.saturating_mul(3))
                .max(1)
        } else {
            target_hits.saturating_mul(3)
        };

        if let Some((reader, fields)) = &self.reader {
            // Backend 1: single Tantivy reader.
            tracing::info!(
                backend = "tantivy",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                "search_start"
            );
            let (hits, tantivy_total_count) = self.search_tantivy(
                reader,
                fields,
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                0, // backend offset; paging happens in `postprocess_hits_page`
                field_mask,
            )?;
            // Best effort: a poisoned lock just skips recording the count.
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);

                // Retry with the larger limit only when post-processing left
                // too few hits, the first fetch was filled to its limit, and
                // the fallback limit is actually larger.
                let needs_retry = deduped_len < target_hits
                    && initial_hit_count == initial_fetch_limit
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        session_path_filter_active,
                        "retrying lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy(
                        reader,
                        fields,
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        0,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    // Keep the first page if the retry came back empty.
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    "lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                "tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        } else if let Some(readers) = self.federated_readers() {
            // Backend 2: federated Tantivy readers.
            tracing::info!(
                backend = "tantivy-federated",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                shards = readers.len(),
                "search_start"
            );
            let (hits, tantivy_total_count) = self.search_tantivy_federated(
                readers.as_ref(),
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);
                // "First fetch was full" is judged against limit x reader
                // count, or against the smaller of that and the plain limit
                // when session-path filters are active.
                let expected_federated_capacity = initial_fetch_limit.saturating_mul(readers.len());
                let federated_initial_capacity_reached = if session_path_filter_active {
                    initial_hit_count >= initial_fetch_limit.min(expected_federated_capacity)
                } else {
                    initial_hit_count == expected_federated_capacity
                };
                let needs_retry = deduped_len < target_hits
                    && federated_initial_capacity_reached
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        shards = readers.len(),
                        session_path_filter_active,
                        "retrying federated lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy_federated(
                        readers.as_ref(),
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    shards = readers.len(),
                    "federated lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                shards = readers.len(),
                "federated tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        }

        // Backend 3: SQLite FTS5. Any `*` other than a run of trailing stars
        // is treated as unsupported here and yields no results.
        let unsupported_wildcards = sanitized.split_whitespace().any(|t| {
            let core = t.trim_end_matches('*');
            core.contains('*')
        });

        if unsupported_wildcards {
            return Ok(Vec::new());
        }

        // SQLite counts as available when a connection is already open or a
        // database path is configured.
        let has_sqlite_backend = {
            let sqlite_guard = self
                .sqlite
                .lock()
                .map_err(|_| anyhow!("sqlite lock poisoned"))?;
            sqlite_guard.is_some() || self.sqlite_path.is_some()
        };

        if has_sqlite_backend {
            tracing::info!(
                backend = "sqlite-fts5",
                query = sanitized,
                limit = fallback_fetch_limit,
                offset = 0,
                "search_start"
            );
            let hits = self.search_sqlite_fts5(
                self.sqlite_path
                    .as_deref()
                    .unwrap_or_else(|| Path::new(":memory:")),
                query,
                filters.clone(),
                fallback_fetch_limit,
                0, // backend offset; paging happens in `postprocess_hits_page`
                field_mask,
            )?;
            let (_, paged_hits) =
                self.postprocess_hits_page(hits, &sanitized, &filters, limit, offset);

            if can_use_cache && offset == 0 {
                self.put_cache(&sanitized, &filters, &paged_hits);
            }
            return Ok(paged_hits);
        }

        // No backend is available at all.
        tracing::info!(backend = "none", query = query, "search_start");
        Ok(Vec::new())
    }
3693
3694 pub fn set_semantic_context(
3695 &self,
3696 embedder: Arc<dyn Embedder>,
3697 fs_semantic_index: VectorIndex,
3698 filter_maps: SemanticFilterMaps,
3699 roles: Option<HashSet<u8>>,
3700 ann_path: Option<PathBuf>,
3701 ) -> Result<()> {
3702 self.set_semantic_indexes_context(
3703 embedder,
3704 vec![fs_semantic_index],
3705 filter_maps,
3706 roles,
3707 ann_path,
3708 )
3709 }
3710
3711 pub fn set_semantic_indexes_context(
3712 &self,
3713 embedder: Arc<dyn Embedder>,
3714 fs_semantic_indexes: Vec<VectorIndex>,
3715 filter_maps: SemanticFilterMaps,
3716 roles: Option<HashSet<u8>>,
3717 ann_path: Option<PathBuf>,
3718 ) -> Result<()> {
3719 if fs_semantic_indexes.is_empty() {
3720 bail!("semantic context requires at least one vector index");
3721 }
3722
3723 let fs_semantic_indexes = fs_semantic_indexes
3724 .into_iter()
3725 .map(|index| {
3726 let embedder_id = index.embedder_id().to_string();
3727 let dimension = index.dimension();
3728 if embedder_id != embedder.id() {
3729 bail!(
3730 "embedder mismatch: index uses {}, embedder is {}",
3731 embedder_id,
3732 embedder.id()
3733 );
3734 }
3735 if dimension != embedder.dimension() {
3736 bail!(
3737 "embedder dimension mismatch: index uses {}, embedder is {}",
3738 dimension,
3739 embedder.dimension()
3740 );
3741 }
3742 Ok(Arc::new(index))
3743 })
3744 .collect::<Result<Vec<_>>>()?;
3745 let fs_semantic_index = Arc::clone(&fs_semantic_indexes[0]);
3746 let shard_count = fs_semantic_indexes.len();
3747 let ann_path = if shard_count == 1 { ann_path } else { None };
3748 let embedder_id = fs_semantic_index.embedder_id().to_string();
3749 let dimension = fs_semantic_index.dimension();
3750 let fs_semantic_indexes = Arc::new(fs_semantic_indexes);
3751
3752 let capacity = NonZeroUsize::new(100).ok_or_else(|| anyhow!("invalid cache size"))?;
3753 let context_token = Arc::new(());
3754 let mut state_guard = self
3755 .semantic
3756 .lock()
3757 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3758 *state_guard = Some(SemanticSearchState {
3759 context_token,
3760 embedder,
3761 fs_semantic_index,
3762 fs_semantic_indexes,
3763 fs_ann_index: None,
3764 ann_path,
3765 fs_in_memory_two_tier_index: None,
3766 in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable::default(),
3767 progressive_context: None,
3768 progressive_context_unavailable: false,
3769 filter_maps,
3770 roles,
3771 query_cache: QueryCache::new(embedder_id.as_str(), capacity),
3772 });
3773 if shard_count > 1 {
3774 tracing::info!(
3775 shard_count,
3776 dimension,
3777 embedder = embedder_id,
3778 "semantic search context loaded sharded vector generation"
3779 );
3780 }
3781 Ok(())
3782 }
3783
3784 pub fn clear_semantic_context(&self) -> Result<()> {
3785 let mut guard = self
3786 .semantic
3787 .lock()
3788 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3789 *guard = None;
3790 Ok(())
3791 }
3792
3793 fn semantic_context_matches(&self, context_token: &Arc<()>) -> Result<bool> {
3794 let guard = self
3795 .semantic
3796 .lock()
3797 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3798 Ok(guard
3799 .as_ref()
3800 .is_some_and(|state| Arc::ptr_eq(&state.context_token, context_token)))
3801 }
3802
3803 fn semantic_query_embedding(&self, canonical: &str) -> Result<SemanticQueryEmbedding> {
3804 loop {
3805 let (embedder, context_token) = {
3806 let mut guard = self
3807 .semantic
3808 .lock()
3809 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3810 let state = guard.as_mut().ok_or_else(|| {
3811 anyhow!("semantic search unavailable (no embedder or vector index)")
3812 })?;
3813 if let Some(hit) = state
3814 .query_cache
3815 .get_cached(state.embedder.as_ref(), canonical)
3816 {
3817 return Ok(SemanticQueryEmbedding {
3818 context_token: Arc::clone(&state.context_token),
3819 vector: hit,
3820 });
3821 }
3822 (
3823 Arc::clone(&state.embedder),
3824 Arc::clone(&state.context_token),
3825 )
3826 };
3827
3828 let embedding = embedder
3829 .embed_sync(canonical)
3830 .map_err(|e| anyhow!("embedding failed: {e}"))?;
3831
3832 let mut guard = self
3833 .semantic
3834 .lock()
3835 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3836 let state = guard.as_mut().ok_or_else(|| {
3837 anyhow!("semantic search unavailable (no embedder or vector index)")
3838 })?;
3839 if !Arc::ptr_eq(&state.context_token, &context_token) {
3840 continue;
3841 }
3842 if let Some(hit) = state
3843 .query_cache
3844 .get_cached(state.embedder.as_ref(), canonical)
3845 {
3846 return Ok(SemanticQueryEmbedding {
3847 context_token,
3848 vector: hit,
3849 });
3850 }
3851 state
3852 .query_cache
3853 .store(state.embedder.as_ref(), canonical, embedding.clone());
3854 return Ok(SemanticQueryEmbedding {
3855 context_token,
3856 vector: embedding,
3857 });
3858 }
3859 }
3860
3861 fn in_memory_two_tier_index(
3862 &self,
3863 tier_mode: SemanticTierMode,
3864 ) -> Result<Option<Arc<FsInMemoryTwoTierIndex>>> {
3865 loop {
3866 let (ann_path, embedder_id, context_token) = {
3867 let mut guard = self
3868 .semantic
3869 .lock()
3870 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3871 let state = guard.as_mut().ok_or_else(|| {
3872 anyhow!("semantic search unavailable (no embedder or vector index)")
3873 })?;
3874 if let Some(index) = state.fs_in_memory_two_tier_index.as_ref()
3875 && two_tier_index_supports_mode(index.as_ref(), tier_mode)
3876 {
3877 return Ok(Some(Arc::clone(index)));
3878 }
3879 if state
3880 .in_memory_two_tier_unavailable
3881 .is_known_unavailable(tier_mode)
3882 {
3883 return Ok(None);
3884 }
3885 (
3886 state.ann_path.clone(),
3887 state.embedder.id().to_string(),
3888 Arc::clone(&state.context_token),
3889 )
3890 };
3891
3892 let index = build_in_memory_two_tier_index(ann_path.clone(), &embedder_id, tier_mode);
3893
3894 let mut guard = self
3895 .semantic
3896 .lock()
3897 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3898 let state = guard.as_mut().ok_or_else(|| {
3899 anyhow!("semantic search unavailable (no embedder or vector index)")
3900 })?;
3901 if let Some(existing) = state.fs_in_memory_two_tier_index.as_ref()
3902 && two_tier_index_supports_mode(existing.as_ref(), tier_mode)
3903 {
3904 return Ok(Some(Arc::clone(existing)));
3905 }
3906 if !Arc::ptr_eq(&state.context_token, &context_token) {
3907 continue;
3908 }
3909 let Some(index) = index else {
3910 state
3911 .in_memory_two_tier_unavailable
3912 .mark_unavailable(tier_mode);
3913 return Ok(None);
3914 };
3915 if !two_tier_index_supports_mode(index.as_ref(), tier_mode) {
3916 state
3917 .in_memory_two_tier_unavailable
3918 .mark_unavailable(tier_mode);
3919 return Ok(None);
3920 }
3921 state.fs_in_memory_two_tier_index = Some(Arc::clone(&index));
3922 if index.has_quality_index() {
3923 state.in_memory_two_tier_unavailable = InMemoryTwoTierUnavailable::default();
3924 } else {
3925 state.in_memory_two_tier_unavailable.fast_only = false;
3926 }
3927 return Ok(Some(index));
3928 }
3929 }
3930
3931 fn ann_index(&self) -> Result<Arc<FsHnswIndex>> {
3932 loop {
3933 let (ann_path, fs_semantic_index) = {
3934 let mut guard = self
3935 .semantic
3936 .lock()
3937 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3938 let state = guard.as_mut().ok_or_else(|| {
3939 anyhow!("semantic search unavailable (no embedder or vector index)")
3940 })?;
3941 if let Some(index) = state.fs_ann_index.as_ref() {
3942 return Ok(Arc::clone(index));
3943 }
3944 let ann_path = state.ann_path.clone().ok_or_else(|| {
3945 anyhow!(
3946 "approximate search unavailable: HNSW index missing (run 'cass index --semantic --build-hnsw')"
3947 )
3948 })?;
3949 (ann_path, Arc::clone(&state.fs_semantic_index))
3950 };
3951
3952 let ann = Arc::new(open_fs_semantic_ann_index(
3953 fs_semantic_index.as_ref(),
3954 &ann_path,
3955 )?);
3956
3957 let mut guard = self
3958 .semantic
3959 .lock()
3960 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3961 let state = guard.as_mut().ok_or_else(|| {
3962 anyhow!("semantic search unavailable (no embedder or vector index)")
3963 })?;
3964 if let Some(existing) = state.fs_ann_index.as_ref() {
3965 return Ok(Arc::clone(existing));
3966 }
3967 if state.ann_path.as_ref() != Some(&ann_path)
3968 || !Arc::ptr_eq(&state.fs_semantic_index, &fs_semantic_index)
3969 {
3970 continue;
3971 }
3972 state.fs_ann_index = Some(Arc::clone(&ann));
3973 return Ok(ann);
3974 }
3975 }
3976
3977 fn collapse_semantic_results(
3978 best_by_message: HashMap<u64, VectorSearchResult>,
3979 fetch_limit: usize,
3980 ) -> Vec<VectorSearchResult> {
3981 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
3982 collapsed.sort_by(|a, b| {
3983 b.score
3984 .total_cmp(&a.score)
3985 .then_with(|| a.message_id.cmp(&b.message_id))
3986 });
3987 if collapsed.len() > fetch_limit {
3988 collapsed.truncate(fetch_limit);
3989 }
3990 collapsed
3991 }
3992
3993 fn record_fs_semantic_hit(
3994 best_by_message: &mut HashMap<u64, VectorSearchResult>,
3995 hit: &FsVectorHit,
3996 ) {
3997 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
3998 return;
3999 };
4000 best_by_message
4001 .entry(parsed.message_id)
4002 .and_modify(|entry| {
4003 if hit.score > entry.score {
4004 entry.score = hit.score;
4005 entry.chunk_idx = parsed.chunk_idx;
4006 }
4007 })
4008 .or_insert(VectorSearchResult {
4009 message_id: parsed.message_id,
4010 chunk_idx: parsed.chunk_idx,
4011 score: hit.score,
4012 });
4013 }
4014
4015 fn search_exact_semantic_indexes(
4016 context: &SemanticCandidateContext,
4017 embedding: &[f32],
4018 fetch_limit: usize,
4019 fs_filter: Option<&dyn FsSearchFilter>,
4020 ) -> Result<(Vec<VectorSearchResult>, bool)> {
4021 if context.fs_semantic_indexes.len() == 1 {
4022 let fs_hits = context
4023 .fs_semantic_index
4024 .search_top_k(embedding, fetch_limit, fs_filter)
4025 .map_err(|err| anyhow!("frankensearch semantic search failed: {err}"))?;
4026 let mut best_by_message = HashMap::with_capacity(fs_hits.len());
4027 for hit in &fs_hits {
4028 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4029 }
4030 return Ok((
4031 Self::collapse_semantic_results(best_by_message, fetch_limit),
4032 fs_hits.len() >= fetch_limit,
4033 ));
4034 }
4035
4036 let mut best_by_message = HashMap::new();
4037 let mut raw_hits = 0usize;
4038 for index in context.fs_semantic_indexes.iter() {
4039 let shard_limit = index.record_count();
4040 if shard_limit == 0 {
4041 continue;
4042 }
4043 let fs_hits = index
4044 .search_top_k(embedding, shard_limit, fs_filter)
4045 .map_err(|err| anyhow!("frankensearch sharded semantic search failed: {err}"))?;
4046 raw_hits = raw_hits.saturating_add(fs_hits.len());
4047 best_by_message.reserve(fs_hits.len());
4048 for hit in &fs_hits {
4049 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4050 }
4051 }
4052 let collapsed = Self::collapse_semantic_results(best_by_message, fetch_limit);
4053 tracing::debug!(
4054 shard_count = context.fs_semantic_indexes.len(),
4055 raw_hits,
4056 returned = collapsed.len(),
4057 "semantic sharded exact merge complete"
4058 );
4059 Ok((collapsed, false))
4060 }
4061
4062 fn search_semantic_candidates(
4063 &self,
4064 context: &SemanticCandidateContext,
4065 embedding: &[f32],
4066 filters: &SearchFilters,
4067 request: SemanticCandidateSearchRequest<'_>,
4068 ) -> Result<(
4069 Vec<VectorSearchResult>,
4070 bool,
4071 Option<crate::search::ann_index::AnnSearchStats>,
4072 )> {
4073 let mut semantic_filter =
4074 SemanticFilter::from_search_filters(filters, &context.filter_maps)?;
4075 if let Some(roles) = context.roles.clone() {
4076 semantic_filter = semantic_filter.with_roles(Some(roles));
4077 }
4078
4079 if request.tier_mode.wants_two_tier() && !request.approximate {
4080 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4081 if let Some(two_tier_index) = request.in_memory_two_tier_index {
4082 let config = request.tier_mode.to_frankensearch_config();
4083 let searcher = FsSyncTwoTierSearcher::new(Arc::clone(two_tier_index), config);
4084 let (tier_hits, metrics) = searcher
4085 .search_collect_with_filter(embedding, request.fetch_limit, fs_filter)
4086 .map_err(|err| {
4087 anyhow!("frankensearch two-tier semantic search failed: {err}")
4088 })?;
4089
4090 tracing::debug!(
4091 tier_mode = ?request.tier_mode,
4092 phase1_ms = metrics.phase1_total_ms,
4093 phase2_ms = metrics.phase2_total_ms,
4094 skip_reason = ?metrics.skip_reason,
4095 returned = tier_hits.len(),
4096 "semantic two-tier search executed"
4097 );
4098
4099 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4100 HashMap::with_capacity(tier_hits.len());
4101 for hit in tier_hits.iter() {
4102 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4103 continue;
4104 };
4105 best_by_message
4106 .entry(parsed.message_id)
4107 .and_modify(|entry| {
4108 if hit.score > entry.score {
4109 entry.score = hit.score;
4110 entry.chunk_idx = parsed.chunk_idx;
4111 }
4112 })
4113 .or_insert(VectorSearchResult {
4114 message_id: parsed.message_id,
4115 chunk_idx: parsed.chunk_idx,
4116 score: hit.score,
4117 });
4118 }
4119
4120 return Ok((
4121 Self::collapse_semantic_results(best_by_message, request.fetch_limit),
4122 tier_hits.len() >= request.fetch_limit,
4123 None,
4124 ));
4125 }
4126
4127 tracing::debug!(
4128 tier_mode = ?request.tier_mode,
4129 "two-tier semantic unavailable; falling back to exact single-tier search"
4130 );
4131
4132 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4133 let (results, truncated) = Self::search_exact_semantic_indexes(
4134 context,
4135 embedding,
4136 request.fetch_limit,
4137 fs_filter,
4138 )?;
4139 return Ok((results, truncated, None));
4140 }
4141
4142 if request.approximate {
4143 if request.tier_mode.wants_two_tier() {
4144 tracing::debug!(
4145 tier_mode = ?request.tier_mode,
4146 "approximate search requested; bypassing two-tier mode"
4147 );
4148 }
4149
4150 let ann = request
4151 .ann_index
4152 .ok_or_else(|| anyhow!("HNSW index failed to initialize"))?;
4153 let candidate = request
4154 .fetch_limit
4155 .saturating_mul(ANN_CANDIDATE_MULTIPLIER)
4156 .max(request.fetch_limit);
4157 let ef = FS_HNSW_DEFAULT_EF_SEARCH.max(candidate);
4158 let (ann_results, search_stats) =
4159 ann.knn_search_with_stats(embedding, candidate, ef)
4160 .map_err(|err| anyhow!("frankensearch approximate search failed: {err}"))?;
4161 let ann_stats = Some(crate::search::ann_index::AnnSearchStats {
4162 index_size: search_stats.index_size,
4163 dimension: search_stats.dimension,
4164 ef_search: search_stats.ef_search,
4165 k_requested: search_stats.k_requested,
4166 k_returned: search_stats.k_returned,
4167 search_time_us: search_stats.search_time_us,
4168 estimated_recall: search_stats.estimated_recall as f32,
4169 is_approximate: search_stats.is_approximate,
4170 });
4171
4172 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4173
4174 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4175 HashMap::with_capacity(ann_results.len());
4176 for hit in ann_results.iter() {
4177 if let Some(filter) = fs_filter
4178 && !filter.matches(&hit.doc_id, None)
4179 {
4180 continue;
4181 }
4182 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4183 continue;
4184 };
4185 best_by_message
4186 .entry(parsed.message_id)
4187 .and_modify(|entry| {
4188 if hit.score > entry.score {
4189 entry.score = hit.score;
4190 entry.chunk_idx = parsed.chunk_idx;
4191 }
4192 })
4193 .or_insert(VectorSearchResult {
4194 message_id: parsed.message_id,
4195 chunk_idx: parsed.chunk_idx,
4196 score: hit.score,
4197 });
4198 }
4199
4200 return Ok((
4201 Self::collapse_semantic_results(best_by_message, request.fetch_limit),
4202 ann_results.len() >= candidate,
4203 ann_stats,
4204 ));
4205 }
4206
4207 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4208 let (results, truncated) = Self::search_exact_semantic_indexes(
4209 context,
4210 embedding,
4211 request.fetch_limit,
4212 fs_filter,
4213 )?;
4214 Ok((results, truncated, None))
4215 }
4216
4217 pub fn can_progressively_refine(&self) -> bool {
4218 self.progressive_context()
4219 .map(|context| {
4220 context.as_ref().is_some_and(|ctx| {
4221 ctx.quality_embedder.is_some() && ctx.index.has_quality_index()
4222 })
4223 })
4224 .unwrap_or(false)
4225 }
4226
4227 fn progressive_context(&self) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4228 loop {
4229 let (ann_path, embedder, context_token) = {
4230 let mut guard = self
4231 .semantic
4232 .lock()
4233 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4234 let state = guard.as_mut().ok_or_else(|| {
4235 anyhow!("semantic search unavailable (no embedder or vector index)")
4236 })?;
4237 if let Some(context) = state.progressive_context.as_ref() {
4238 return Ok(Some(Arc::clone(context)));
4239 }
4240 if state.progressive_context_unavailable {
4241 return Ok(None);
4242 }
4243 (
4244 state.ann_path.clone(),
4245 Arc::clone(&state.embedder),
4246 Arc::clone(&state.context_token),
4247 )
4248 };
4249
4250 let context = match self.build_progressive_context(
4251 ann_path.clone(),
4252 embedder,
4253 Arc::clone(&context_token),
4254 ) {
4255 Ok(context) => context,
4256 Err(err) => {
4257 let mut guard = self
4258 .semantic
4259 .lock()
4260 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4261 let state = guard.as_mut().ok_or_else(|| {
4262 anyhow!("semantic search unavailable (no embedder or vector index)")
4263 })?;
4264 if let Some(existing) = state.progressive_context.as_ref() {
4265 return Ok(Some(Arc::clone(existing)));
4266 }
4267 if !Arc::ptr_eq(&state.context_token, &context_token) {
4268 continue;
4269 }
4270 return Err(err);
4271 }
4272 };
4273
4274 let Some(context) = context else {
4275 let mut guard = self
4276 .semantic
4277 .lock()
4278 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4279 let state = guard.as_mut().ok_or_else(|| {
4280 anyhow!("semantic search unavailable (no embedder or vector index)")
4281 })?;
4282 if let Some(existing) = state.progressive_context.as_ref() {
4283 return Ok(Some(Arc::clone(existing)));
4284 }
4285 if !Arc::ptr_eq(&state.context_token, &context_token) {
4286 continue;
4287 }
4288 state.progressive_context_unavailable = true;
4289 return Ok(None);
4290 };
4291
4292 let mut guard = self
4293 .semantic
4294 .lock()
4295 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4296 let state = guard.as_mut().ok_or_else(|| {
4297 anyhow!("semantic search unavailable (no embedder or vector index)")
4298 })?;
4299 if let Some(existing) = state.progressive_context.as_ref() {
4300 return Ok(Some(Arc::clone(existing)));
4301 }
4302 if !Arc::ptr_eq(&state.context_token, &context_token) {
4303 continue;
4304 }
4305 state.progressive_context_unavailable = false;
4306 state.progressive_context = Some(Arc::clone(&context));
4307 return Ok(Some(context));
4308 }
4309 }
4310
4311 fn build_progressive_context(
4312 &self,
4313 ann_path: Option<PathBuf>,
4314 embedder: Arc<dyn Embedder>,
4315 context_token: Arc<()>,
4316 ) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4317 let Some(index_dir) = ann_path
4318 .as_ref()
4319 .and_then(|path| path.parent().map(Path::to_path_buf))
4320 else {
4321 return Ok(None);
4322 };
4323
4324 let fast_path = {
4325 let explicit = index_dir.join("vector.fast.idx");
4326 if explicit.is_file() {
4327 explicit
4328 } else {
4329 let fallback = index_dir.join("vector.idx");
4330 if fallback.is_file() {
4331 fallback
4332 } else {
4333 return Ok(None);
4334 }
4335 }
4336 };
4337 let quality_path = index_dir.join("vector.quality.idx");
4338 if !quality_path.is_file() {
4339 return Ok(None);
4340 }
4341
4342 let fast_index = FsVectorIndex::open(&fast_path)
4343 .map_err(|err| anyhow!("open fast-tier index failed: {err}"))?;
4344 let quality_index = FsVectorIndex::open(&quality_path)
4345 .map_err(|err| anyhow!("open quality-tier index failed: {err}"))?;
4346 let index = Arc::new(
4347 FsTwoTierIndex::open(&index_dir, frankensearch_two_tier_config())
4348 .map_err(|err| anyhow!("open progressive two-tier index failed: {err}"))?,
4349 );
4350
4351 let fast_embedder = self.load_embedder_for_progressive_id(
4352 &embedder,
4353 fast_index.embedder_id(),
4354 fast_index.dimension(),
4355 )?;
4356 let fast_embedder: Arc<dyn frankensearch::Embedder> = Arc::new(FsSyncEmbedderAdapter(
4357 SharedCassSyncEmbedder::new(fast_embedder),
4358 ));
4359 let quality_embedder = Some(self.load_embedder_for_progressive_id(
4360 &embedder,
4361 quality_index.embedder_id(),
4362 quality_index.dimension(),
4363 )?);
4364 let quality_embedder = quality_embedder.map(|embedder| {
4365 Arc::new(FsSyncEmbedderAdapter(SharedCassSyncEmbedder::new(embedder)))
4366 as Arc<dyn frankensearch::Embedder>
4367 });
4368
4369 Ok(Some(Arc::new(ProgressiveTwoTierContext {
4370 context_token,
4371 index,
4372 fast_embedder,
4373 quality_embedder,
4374 })))
4375 }
4376
4377 fn load_embedder_for_progressive_id(
4378 &self,
4379 current_embedder: &Arc<dyn Embedder>,
4380 embedder_id: &str,
4381 dimension: usize,
4382 ) -> Result<Arc<dyn Embedder>> {
4383 if current_embedder.id() == embedder_id {
4384 return Ok(Arc::clone(current_embedder));
4385 }
4386
4387 if let Some(dim) = embedder_id.strip_prefix("fnv1a-")
4388 && let Ok(parsed) = dim.parse::<usize>()
4389 {
4390 return Ok(Arc::new(crate::search::hash_embedder::HashEmbedder::new(
4391 parsed.max(dimension),
4392 )));
4393 }
4394
4395 if embedder_id == crate::search::fastembed_embedder::FastEmbedder::embedder_id_static() {
4396 let data_dir = self
4397 .sqlite_path
4398 .as_ref()
4399 .and_then(|path| path.parent())
4400 .ok_or_else(|| anyhow!("cannot resolve data dir for progressive embedder load"))?;
4401 let model_dir =
4402 crate::search::fastembed_embedder::FastEmbedder::default_model_dir(data_dir);
4403 let embedder =
4404 crate::search::fastembed_embedder::FastEmbedder::load_from_dir(&model_dir)
4405 .with_context(|| {
4406 format!("loading FastEmbed model from {}", model_dir.display())
4407 })?;
4408 if embedder.dimension() != dimension {
4409 bail!(
4410 "progressive embedder dimension mismatch: {} index expects {}, model has {}",
4411 embedder_id,
4412 dimension,
4413 embedder.dimension()
4414 );
4415 }
4416 return Ok(Arc::new(embedder));
4417 }
4418
4419 bail!("unsupported progressive embedder id: {embedder_id}");
4420 }
4421
4422 fn resolve_semantic_doc_ids_for_hits(
4423 &self,
4424 hits: &[SearchHit],
4425 ) -> Result<Vec<Option<ResolvedSemanticDocId>>> {
4426 if hits.is_empty() {
4427 return Ok(Vec::new());
4428 }
4429
4430 let lookup_keys: Vec<Option<ProgressiveLookupKey>> = hits
4431 .iter()
4432 .map(|hit| {
4433 let idx = hit
4434 .line_number
4435 .and_then(|line| line.checked_sub(1))
4436 .map(i64::try_from)
4437 .transpose()
4438 .ok()
4439 .flatten()?;
4440 Some((
4441 normalized_search_hit_source_id(hit),
4442 hit.source_path.clone(),
4443 hit.conversation_id,
4444 hit.title.trim().to_string(),
4445 idx,
4446 hit.created_at,
4447 hit.content_hash,
4448 ))
4449 })
4450 .collect();
4451
4452 let mut seen_exact = HashSet::new();
4453 let mut exact_query_keys = Vec::new();
4454 let mut seen_fallback = HashSet::new();
4455 let mut fallback_query_keys = Vec::new();
4456 for (source_id, source_path, conversation_id, _title, idx, _created_at, _content_hash) in
4457 lookup_keys.iter().flatten()
4458 {
4459 if let Some(conversation_id) = conversation_id {
4460 let query_key: ProgressiveExactQueryKey = (*conversation_id, *idx);
4461 if seen_exact.insert(query_key) {
4462 exact_query_keys.push(query_key);
4463 }
4464 } else {
4465 let query_key: ProgressiveFallbackQueryKey =
4466 (source_id.clone(), source_path.clone(), *idx);
4467 if seen_fallback.insert(query_key.clone()) {
4468 fallback_query_keys.push(query_key);
4469 }
4470 }
4471 }
4472
4473 if exact_query_keys.is_empty() && fallback_query_keys.is_empty() {
4474 return Ok(vec![None; hits.len()]);
4475 }
4476
4477 let sqlite_guard = self.sqlite_guard()?;
4478 let conn = sqlite_guard
4479 .as_ref()
4480 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4481
4482 let mut resolved_by_key = HashMap::new();
4483 let normalized_source_sql =
4484 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4485
4486 const CHUNK_SIZE: usize = 300;
4487 for chunk in exact_query_keys.chunks(CHUNK_SIZE) {
4488 let mut sql = String::from("SELECT c.id, ");
4489 sql.push_str(&normalized_source_sql);
4490 sql.push_str(
4491 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4492 FROM messages m
4493 JOIN conversations c ON m.conversation_id = c.id
4494 LEFT JOIN sources s ON c.source_id = s.id
4495 WHERE ",
4496 );
4497 let mut params = Vec::with_capacity(chunk.len().saturating_mul(2));
4498 for (idx, (conversation_id, line_idx)) in chunk.iter().enumerate() {
4499 if idx > 0 {
4500 sql.push_str(" OR ");
4501 }
4502 sql.push_str("(c.id = ? AND m.idx = ?)");
4503 params.push(ParamValue::from(*conversation_id));
4504 params.push(ParamValue::from(*line_idx));
4505 }
4506
4507 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4508 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4509 let conversation_id: i64 = row.get_typed(0)?;
4510 let source_id: String = row.get_typed(1)?;
4511 let source_path: String = row.get_typed(2)?;
4512 let idx: i64 = row.get_typed(3)?;
4513 let message_id_raw: i64 = row.get_typed(4)?;
4514 let agent_id_raw: Option<i64> = row.get_typed(5)?;
4517 let workspace_id_raw: Option<i64> = row.get_typed(6)?;
4518 let role_raw: String = row.get_typed(7)?;
4519 let created_at_ms: Option<i64> = row.get_typed(8)?;
4520 let content: String = row.get_typed(9)?;
4521 let title: Option<String> = row.get_typed(10)?;
4522
4523 let canonical = canonicalize_for_embedding(&content);
4524 if canonical.is_empty() {
4525 return Ok(None);
4526 }
4527
4528 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4529 std::io::Error::other("message id out of range for progressive doc_id")
4530 })?;
4531 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4532 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4533 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4534 let doc_id = SemanticDocId {
4535 message_id,
4536 chunk_idx: 0,
4537 agent_id,
4538 workspace_id,
4539 source_id: crc32fast::hash(source_id.as_bytes()),
4540 role,
4541 created_at_ms: created_at_ms.unwrap_or(0),
4542 content_hash: Some(content_hash(&canonical)),
4543 }
4544 .to_doc_id_string();
4545 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4546 let lookup_key = (
4547 source_id,
4548 source_path.clone(),
4549 Some(conversation_id),
4550 title.unwrap_or_default().trim().to_string(),
4551 idx,
4552 created_at_ms,
4553 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4554 );
4555
4556 Ok(Some((
4557 lookup_key,
4558 ResolvedSemanticDocId { message_id, doc_id },
4559 )))
4560 })?;
4561
4562 for row in chunk_rows.into_iter().flatten() {
4563 resolved_by_key.insert(row.0, row.1);
4564 }
4565 }
4566
4567 for chunk in fallback_query_keys.chunks(CHUNK_SIZE) {
4568 let mut sql = String::from("SELECT ");
4569 sql.push_str(&normalized_source_sql);
4570 sql.push_str(
4571 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4572 FROM messages m
4573 JOIN conversations c ON m.conversation_id = c.id
4574 LEFT JOIN sources s ON c.source_id = s.id
4575 WHERE ",
4576 );
4577 let mut params = Vec::with_capacity(chunk.len().saturating_mul(3));
4578 for (idx, (source_id, source_path, line_idx)) in chunk.iter().enumerate() {
4579 if idx > 0 {
4580 sql.push_str(" OR ");
4581 }
4582 sql.push_str(&format!(
4583 "({normalized_source_sql} = ? AND c.source_path = ? AND m.idx = ?)"
4584 ));
4585 params.push(ParamValue::from(normalize_search_source_filter_value(
4586 source_id,
4587 )));
4588 params.push(ParamValue::from(source_path.clone()));
4589 params.push(ParamValue::from(*line_idx));
4590 }
4591
4592 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4593 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4594 let source_id: String = row.get_typed(0)?;
4595 let source_path: String = row.get_typed(1)?;
4596 let idx: i64 = row.get_typed(2)?;
4597 let message_id_raw: i64 = row.get_typed(3)?;
4598 let agent_id_raw: Option<i64> = row.get_typed(4)?;
4601 let workspace_id_raw: Option<i64> = row.get_typed(5)?;
4602 let role_raw: String = row.get_typed(6)?;
4603 let created_at_ms: Option<i64> = row.get_typed(7)?;
4604 let content: String = row.get_typed(8)?;
4605 let title: Option<String> = row.get_typed(9)?;
4606
4607 let canonical = canonicalize_for_embedding(&content);
4608 if canonical.is_empty() {
4609 return Ok(None);
4610 }
4611
4612 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4613 std::io::Error::other("message id out of range for progressive doc_id")
4614 })?;
4615 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4616 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4617 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4618 let doc_id = SemanticDocId {
4619 message_id,
4620 chunk_idx: 0,
4621 agent_id,
4622 workspace_id,
4623 source_id: crc32fast::hash(source_id.as_bytes()),
4624 role,
4625 created_at_ms: created_at_ms.unwrap_or(0),
4626 content_hash: Some(content_hash(&canonical)),
4627 }
4628 .to_doc_id_string();
4629 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4630 let lookup_key = (
4631 source_id,
4632 source_path.clone(),
4633 None,
4634 title.unwrap_or_default().trim().to_string(),
4635 idx,
4636 created_at_ms,
4637 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4638 );
4639
4640 Ok(Some((
4641 lookup_key,
4642 ResolvedSemanticDocId { message_id, doc_id },
4643 )))
4644 })?;
4645
4646 for row in chunk_rows.into_iter().flatten() {
4647 resolved_by_key.insert(row.0, row.1);
4648 }
4649 }
4650
4651 Ok(lookup_keys
4652 .into_iter()
4653 .map(|key| key.and_then(|lookup| resolved_by_key.get(&lookup).cloned()))
4654 .collect())
4655 }
4656
4657 fn load_message_text_by_id(&self, message_id: u64) -> Result<Option<String>> {
4658 let sqlite_guard = self.sqlite_guard()?;
4659 let conn = sqlite_guard
4660 .as_ref()
4661 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4662 let rows: Vec<String> = conn.query_map_collect(
4663 "SELECT content FROM messages WHERE id = ?",
4664 &[ParamValue::from(i64::try_from(message_id)?)],
4665 |row: &frankensqlite::Row| row.get_typed(0),
4666 )?;
4667 Ok(rows.into_iter().next())
4668 }
4669
4670 fn collapse_progressive_scored_results(
4671 &self,
4672 results: &[FsScoredResult],
4673 fetch_limit: usize,
4674 ) -> Vec<VectorSearchResult> {
4675 let fetch = fetch_limit.max(1);
4676 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4677 HashMap::with_capacity(results.len());
4678 for hit in results {
4679 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4680 continue;
4681 };
4682 best_by_message
4683 .entry(parsed.message_id)
4684 .and_modify(|entry| {
4685 if hit.score > entry.score {
4686 entry.score = hit.score;
4687 entry.chunk_idx = parsed.chunk_idx;
4688 }
4689 })
4690 .or_insert(VectorSearchResult {
4691 message_id: parsed.message_id,
4692 chunk_idx: parsed.chunk_idx,
4693 score: hit.score,
4694 });
4695 }
4696 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
4697 collapsed.sort_by(|a, b| {
4698 b.score
4699 .total_cmp(&a.score)
4700 .then_with(|| a.message_id.cmp(&b.message_id))
4701 });
4702 if collapsed.len() > fetch {
4703 collapsed.truncate(fetch);
4704 }
4705 collapsed
4706 }
4707
4708 fn hydrate_semantic_hits_with_ids(
4709 &self,
4710 results: &[VectorSearchResult],
4711 field_mask: FieldMask,
4712 ) -> Result<Vec<(u64, SearchHit)>> {
4713 if results.is_empty() {
4714 return Ok(Vec::new());
4715 }
4716 let sqlite_guard = self.sqlite_guard()?;
4717 let conn = sqlite_guard
4718 .as_ref()
4719 .ok_or_else(|| anyhow!("semantic search requires database connection"))?;
4720
4721 let placeholder_capacity = results.len().saturating_mul(2).saturating_sub(1);
4722 let mut placeholders = String::with_capacity(placeholder_capacity);
4723 let mut params: Vec<ParamValue> = Vec::with_capacity(results.len());
4724 for (idx, result) in results.iter().enumerate() {
4725 if idx > 0 {
4726 placeholders.push(',');
4727 }
4728 placeholders.push('?');
4729 params.push(ParamValue::from(i64::try_from(result.message_id)?));
4730 }
4731
4732 let title_expr = if field_mask.wants_title() {
4733 "c.title"
4734 } else {
4735 "''"
4736 };
4737 let normalized_source_sql =
4738 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4739 let sql = format!(
4744 "SELECT m.id, c.id, m.content, m.created_at, m.idx, m.role, {title_expr}, c.source_path, {normalized_source_sql}, c.origin_host, COALESCE(a.slug, 'unknown'), w.path, s.kind, c.started_at
4745 FROM messages m
4746 JOIN conversations c ON m.conversation_id = c.id
4747 LEFT JOIN agents a ON c.agent_id = a.id
4748 LEFT JOIN workspaces w ON c.workspace_id = w.id
4749 LEFT JOIN sources s ON c.source_id = s.id
4750 WHERE m.id IN ({placeholders})"
4751 );
4752
4753 let rows: Vec<(u64, SearchHit)> =
4754 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4755 let message_id: i64 = row.get_typed(0)?;
4756 let conversation_id: i64 = row.get_typed(1)?;
4757 let full_content: String = row.get_typed(2)?;
4758 let msg_created_at: Option<i64> = row.get_typed(3)?;
4759 let idx: Option<i64> = row.get_typed(4)?;
4760 let title: Option<String> = if field_mask.wants_title() {
4761 row.get_typed(6)?
4762 } else {
4763 None
4764 };
4765 let source_path: String = row.get_typed(7)?;
4766 let raw_source_id: String = row.get_typed(8)?;
4767 let origin_host: Option<String> = row.get_typed(9)?;
4768 let agent: String = row.get_typed(10)?;
4769 let workspace: Option<String> = row.get_typed(11)?;
4770 let raw_origin_kind: Option<String> = row.get_typed(12)?;
4771 let started_at: Option<i64> = row.get_typed(13)?;
4772
4773 let created_at = msg_created_at.or(started_at);
4774 let line_number = idx
4775 .and_then(|i| usize::try_from(i).ok())
4776 .map(|i| i.saturating_add(1));
4777 let snippet = if field_mask.wants_snippet() {
4778 snippet_from_content(&full_content)
4779 } else {
4780 String::new()
4781 };
4782 let content = if field_mask.needs_content() {
4783 full_content.clone()
4784 } else {
4785 String::new()
4786 };
4787 let content_hash =
4788 stable_hit_hash(&full_content, &source_path, line_number, created_at);
4789 let source_id = normalized_search_hit_source_id_parts(
4790 raw_source_id.as_str(),
4791 raw_origin_kind.as_deref().unwrap_or_default(),
4792 origin_host.as_deref(),
4793 );
4794 let origin_kind =
4795 normalized_search_hit_origin_kind(&source_id, raw_origin_kind.as_deref());
4796
4797 let hit = SearchHit {
4798 title: if field_mask.wants_title() {
4799 title.unwrap_or_default()
4800 } else {
4801 String::new()
4802 },
4803 snippet,
4804 content,
4805 content_hash,
4806 conversation_id: Some(conversation_id),
4807 score: 0.0,
4808 source_path,
4809 agent,
4810 workspace: workspace.unwrap_or_default(),
4811 workspace_original: None,
4812 created_at,
4813 line_number,
4814 match_type: MatchType::Exact,
4815 source_id,
4816 origin_kind,
4817 origin_host,
4818 };
4819
4820 Ok((semantic_message_id_from_db(message_id)?, hit))
4821 })?;
4822
4823 let mut hits_by_id = HashMap::new();
4824 for (id, hit) in rows {
4825 hits_by_id.insert(id, hit);
4826 }
4827
4828 let mut ordered = Vec::new();
4829 for result in results {
4830 if let Some(mut hit) = hits_by_id.remove(&result.message_id) {
4831 hit.score = result.score;
4832 ordered.push((result.message_id, hit));
4833 }
4834 }
4835
4836 Ok(ordered)
4837 }
4838
4839 fn overlay_progressive_lexical_hit(
4840 &self,
4841 hit: &mut SearchHit,
4842 lexical: &ProgressiveLexicalHit,
4843 field_mask: FieldMask,
4844 ) {
4845 if field_mask.wants_title() && !lexical.title.is_empty() {
4846 hit.title = lexical.title.clone();
4847 }
4848 if field_mask.wants_snippet() && !lexical.snippet.is_empty() {
4849 hit.snippet = lexical.snippet.clone();
4850 }
4851 if field_mask.needs_content() && !lexical.content.is_empty() {
4852 hit.content = lexical.content.clone();
4853 }
4854 hit.match_type = lexical.match_type;
4855 hit.line_number = lexical.line_number.or(hit.line_number);
4856 }
4857
4858 fn progressive_phase_to_result(
4859 &self,
4860 results: &[FsScoredResult],
4861 ctx: ProgressivePhaseContext<'_>,
4862 ) -> Result<SearchResult> {
4863 let collapsed = self.collapse_progressive_scored_results(results, ctx.fetch_limit);
4864 let missing: Vec<VectorSearchResult> = collapsed
4865 .iter()
4866 .filter(|result| {
4867 ctx.lexical_cache
4868 .and_then(|cache| cache.hits_by_message.get(&result.message_id))
4869 .is_none()
4870 })
4871 .map(|result| VectorSearchResult {
4872 message_id: result.message_id,
4873 chunk_idx: result.chunk_idx,
4874 score: result.score,
4875 })
4876 .collect();
4877 let mut hydrated_by_id: HashMap<u64, SearchHit> = self
4878 .hydrate_semantic_hits_with_ids(&missing, ctx.field_mask)?
4879 .into_iter()
4880 .collect();
4881
4882 let mut hydrated: Vec<(u64, SearchHit)> = Vec::with_capacity(collapsed.len());
4883 for result in &collapsed {
4884 if let Some(cache) = ctx.lexical_cache
4885 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
4886 {
4887 hydrated.push((result.message_id, lexical.to_search_hit(result.score)));
4888 continue;
4889 }
4890 if let Some(mut hit) = hydrated_by_id.remove(&result.message_id) {
4891 if let Some(cache) = ctx.lexical_cache
4892 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
4893 {
4894 self.overlay_progressive_lexical_hit(&mut hit, lexical, ctx.field_mask);
4895 }
4896 hydrated.push((result.message_id, hit));
4897 }
4898 }
4899
4900 let mut hits: Vec<SearchHit> = hydrated.into_iter().map(|(_, hit)| hit).collect();
4901 (_, hits) = self.postprocess_hits_page(hits, ctx.query, ctx.filters, ctx.limit, 0);
4902
4903 let (wildcard_fallback, suggestions) = ctx
4904 .lexical_cache
4905 .map(|cache| {
4906 let suggestions = if hits.is_empty() {
4907 cache.suggestions.clone()
4908 } else {
4909 Vec::new()
4910 };
4911 (cache.wildcard_fallback, suggestions)
4912 })
4913 .unwrap_or((false, Vec::new()));
4914
4915 Ok(SearchResult {
4916 hits,
4917 wildcard_fallback,
4918 cache_stats: self.cache_stats(),
4919 suggestions,
4920 ann_stats: None,
4921 total_count: None,
4922 })
4923 }
4924
4925 pub(crate) async fn search_progressive_with_callback(
4926 self: &Arc<Self>,
4927 request: ProgressiveSearchRequest<'_>,
4928 mut on_event: impl FnMut(ProgressiveSearchEvent) + Send,
4929 ) -> Result<()> {
4930 let ProgressiveSearchRequest {
4931 cx,
4932 query,
4933 filters,
4934 limit,
4935 sparse_threshold,
4936 field_mask,
4937 mode,
4938 } = request;
4939 let field_mask = effective_field_mask(field_mask);
4940 let limit = limit.max(1);
4941 let fetch_limit = progressive_phase_fetch_limit(limit);
4942
4943 match mode {
4944 SearchMode::Lexical => {
4945 let started = Instant::now();
4946 let result = self.search_with_fallback(
4947 query,
4948 filters,
4949 limit,
4950 0,
4951 sparse_threshold,
4952 field_mask,
4953 )?;
4954 on_event(ProgressiveSearchEvent::Phase {
4955 kind: ProgressivePhaseKind::Initial,
4956 elapsed_ms: started.elapsed().as_millis(),
4957 result,
4958 });
4959 return Ok(());
4960 }
4961 SearchMode::Semantic | SearchMode::Hybrid => {}
4962 }
4963
4964 let progressive_context = {
4965 self.progressive_context()?
4966 .ok_or_else(|| anyhow!("progressive two-tier context unavailable"))?
4967 };
4968 let progressive_context_token = Arc::clone(&progressive_context.context_token);
4969
4970 let lexical_cache: Arc<Mutex<ProgressiveLexicalSnapshot>> =
4971 Arc::new(Mutex::new(Arc::new(ProgressiveLexicalCache::default())));
4972 let text_cache: Arc<Mutex<HashMap<u64, String>>> = Arc::new(Mutex::new(HashMap::new()));
4973 let text_client = Arc::clone(self);
4974 let text_cache_for_lookup = Arc::clone(&text_cache);
4975 let text_fn = move |doc_id: &str| -> Option<String> {
4976 let parsed = parse_semantic_doc_id(doc_id)?;
4977 if let Ok(cache) = text_cache_for_lookup.lock()
4978 && let Some(text) = cache.get(&parsed.message_id)
4979 {
4980 return Some(text.clone());
4981 }
4982 let loaded = text_client
4983 .load_message_text_by_id(parsed.message_id)
4984 .ok()
4985 .flatten()?;
4986 if let Ok(mut cache) = text_cache_for_lookup.lock() {
4987 cache.insert(parsed.message_id, loaded.clone());
4988 }
4989 Some(loaded)
4990 };
4991
4992 let mut searcher = FsTwoTierSearcher::new(
4993 Arc::clone(&progressive_context.index),
4994 Arc::clone(&progressive_context.fast_embedder),
4995 frankensearch_two_tier_config(),
4996 );
4997
4998 if let Some(quality_embedder) = progressive_context.quality_embedder.as_ref() {
4999 searcher = searcher.with_quality_embedder(Arc::clone(quality_embedder));
5000 }
5001
5002 if matches!(mode, SearchMode::Hybrid) {
5003 let lexical = Arc::new(CassProgressiveLexicalAdapter::new(
5004 Arc::clone(self),
5005 filters.clone(),
5006 field_mask,
5007 sparse_threshold,
5008 Arc::clone(&lexical_cache),
5009 ));
5010 searcher = searcher.with_lexical(lexical);
5011 }
5012
5013 let phase_client = Arc::clone(self);
5014 let phase_filters = filters.clone();
5015 let phase_cache = Arc::clone(&lexical_cache);
5016 let mut phase_error: Option<anyhow::Error> = None;
5017
5018 let search_result = searcher
5019 .search(cx, query, fetch_limit, text_fn, |phase| {
5020 if phase_error.is_some() {
5021 return;
5022 }
5023 match phase_client.semantic_context_matches(&progressive_context_token) {
5024 Ok(true) => {}
5025 Ok(false) => {
5026 phase_error = Some(anyhow!(
5027 "progressive search aborted: semantic context changed"
5028 ));
5029 cx.set_cancel_requested(true);
5030 return;
5031 }
5032 Err(err) => {
5033 phase_error = Some(err);
5034 cx.set_cancel_requested(true);
5035 return;
5036 }
5037 }
5038 let lexical_snapshot = phase_cache.lock().ok().map(|guard| Arc::clone(&guard));
5039 let event_result = match phase {
5040 FsSearchPhase::Initial {
5041 results, latency, ..
5042 } => phase_client
5043 .progressive_phase_to_result(
5044 &results,
5045 ProgressivePhaseContext {
5046 query,
5047 filters: &phase_filters,
5048 field_mask,
5049 lexical_cache: lexical_snapshot.as_deref(),
5050 limit,
5051 fetch_limit,
5052 },
5053 )
5054 .map(|result| ProgressiveSearchEvent::Phase {
5055 kind: ProgressivePhaseKind::Initial,
5056 elapsed_ms: latency.as_millis(),
5057 result,
5058 }),
5059 FsSearchPhase::Refined {
5060 results, latency, ..
5061 } => phase_client
5062 .progressive_phase_to_result(
5063 &results,
5064 ProgressivePhaseContext {
5065 query,
5066 filters: &phase_filters,
5067 field_mask,
5068 lexical_cache: lexical_snapshot.as_deref(),
5069 limit,
5070 fetch_limit,
5071 },
5072 )
5073 .map(|result| ProgressiveSearchEvent::Phase {
5074 kind: ProgressivePhaseKind::Refined,
5075 elapsed_ms: latency.as_millis(),
5076 result,
5077 }),
5078 FsSearchPhase::Reranked {
5084 results, latency, ..
5085 } => phase_client
5086 .progressive_phase_to_result(
5087 &results,
5088 ProgressivePhaseContext {
5089 query,
5090 filters: &phase_filters,
5091 field_mask,
5092 lexical_cache: lexical_snapshot.as_deref(),
5093 limit,
5094 fetch_limit,
5095 },
5096 )
5097 .map(|result| ProgressiveSearchEvent::Phase {
5098 kind: ProgressivePhaseKind::Refined,
5099 elapsed_ms: latency.as_millis(),
5100 result,
5101 }),
5102 FsSearchPhase::RefinementFailed { error, latency, .. } => {
5103 Ok(ProgressiveSearchEvent::RefinementFailed {
5104 latency_ms: latency.as_millis(),
5105 error: error.to_string(),
5106 })
5107 }
5108 };
5109
5110 match event_result {
5111 Ok(event) => on_event(event),
5112 Err(err) => {
5113 phase_error = Some(err);
5114 cx.set_cancel_requested(true);
5115 }
5116 }
5117 })
5118 .await;
5119
5120 if let Some(err) = phase_error {
5121 return Err(err);
5122 }
5123
5124 search_result
5125 .map(|_| ())
5126 .map_err(|err| anyhow!("progressive search failed: {err}"))
5127 }
5128
5129 pub fn search_semantic(
5131 &self,
5132 query: &str,
5133 filters: SearchFilters,
5134 limit: usize,
5135 offset: usize,
5136 field_mask: FieldMask,
5137 approximate: bool,
5138 ) -> Result<(
5139 Vec<SearchHit>,
5140 Option<crate::search::ann_index::AnnSearchStats>,
5141 )> {
5142 self.search_semantic_with_tier(
5143 query,
5144 filters,
5145 limit,
5146 offset,
5147 field_mask,
5148 approximate,
5149 SemanticTierMode::Single,
5150 )
5151 }
5152
5153 #[allow(clippy::too_many_arguments)]
5155 pub fn search_semantic_with_tier(
5156 &self,
5157 query: &str,
5158 filters: SearchFilters,
5159 limit: usize,
5160 offset: usize,
5161 field_mask: FieldMask,
5162 approximate: bool,
5163 tier_mode: SemanticTierMode,
5164 ) -> Result<(
5165 Vec<SearchHit>,
5166 Option<crate::search::ann_index::AnnSearchStats>,
5167 )> {
5168 let field_mask = effective_field_mask(field_mask);
5169 let canonical = canonicalize_for_embedding(query);
5170 if canonical.trim().is_empty() {
5171 return Ok((Vec::new(), None));
5172 }
5173 let limit = if limit == 0 {
5174 self.total_docs().min(no_limit_result_cap()).max(1)
5175 } else {
5176 limit
5177 };
5178 let target_hits = limit.saturating_add(offset);
5179 if target_hits == 0 {
5180 return Ok((Vec::new(), None));
5181 }
5182 let initial_fetch_limit = target_hits;
5183 let fallback_fetch_limit = target_hits.saturating_mul(3);
5184 loop {
5185 let (embedding, candidate_context, in_memory_two_tier_index, ann_index, context_token) = loop {
5186 let embedding = self.semantic_query_embedding(&canonical)?;
5187 let (candidate_context, context_token) = {
5188 let guard = self
5189 .semantic
5190 .lock()
5191 .map_err(|_| anyhow!("semantic lock poisoned"))?;
5192 let state = guard.as_ref().ok_or_else(|| {
5193 anyhow!("semantic search unavailable (no embedder or vector index)")
5194 })?;
5195 (
5196 SemanticCandidateContext {
5197 fs_semantic_index: Arc::clone(&state.fs_semantic_index),
5198 fs_semantic_indexes: Arc::clone(&state.fs_semantic_indexes),
5199 filter_maps: state.filter_maps.clone(),
5200 roles: state.roles.clone(),
5201 },
5202 Arc::clone(&state.context_token),
5203 )
5204 };
5205 if !Arc::ptr_eq(&embedding.context_token, &context_token) {
5206 continue;
5207 }
5208 let in_memory_two_tier_index = if tier_mode.wants_two_tier() && !approximate {
5209 self.in_memory_two_tier_index(tier_mode)?
5210 } else {
5211 None
5212 };
5213 let ann_index = if approximate {
5214 Some(self.ann_index()?)
5215 } else {
5216 None
5217 };
5218
5219 let guard = self
5220 .semantic
5221 .lock()
5222 .map_err(|_| anyhow!("semantic lock poisoned"))?;
5223 let state = guard.as_ref().ok_or_else(|| {
5224 anyhow!("semantic search unavailable (no embedder or vector index)")
5225 })?;
5226 if !Arc::ptr_eq(&state.context_token, &context_token) {
5227 continue;
5228 }
5229 break (
5230 embedding.vector,
5231 candidate_context,
5232 in_memory_two_tier_index,
5233 ann_index,
5234 context_token,
5235 );
5236 };
5237
5238 let finalize_hits =
5239 |results: &[VectorSearchResult]| -> Result<(usize, Vec<SearchHit>)> {
5240 let hits = self.hydrate_semantic_hits(results, field_mask)?;
5241 Ok(self.postprocess_hits_page(hits, query, &filters, limit, offset))
5242 };
5243
5244 let (results, search_was_truncated, mut ann_stats) = self.search_semantic_candidates(
5245 &candidate_context,
5246 &embedding,
5247 &filters,
5248 SemanticCandidateSearchRequest {
5249 fetch_limit: initial_fetch_limit,
5250 approximate,
5251 tier_mode,
5252 in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
5253 ann_index: ann_index.as_ref(),
5254 },
5255 )?;
5256 if !self.semantic_context_matches(&context_token)? {
5257 tracing::debug!("semantic context changed during candidate search; retrying");
5258 continue;
5259 }
5260 let (mut available_hits, mut paged_hits) = finalize_hits(&results)?;
5261
5262 let needs_retry = available_hits < target_hits
5263 && search_was_truncated
5264 && initial_fetch_limit < fallback_fetch_limit;
5265
5266 if needs_retry {
5267 tracing::debug!(
5268 query = canonical,
5269 target_hits,
5270 available_hits,
5271 initial_fetch_limit,
5272 fallback_fetch_limit,
5273 "retrying semantic fetch due to post-filter shortfall"
5274 );
5275 let (retry_results, _, retry_ann_stats) = self.search_semantic_candidates(
5276 &candidate_context,
5277 &embedding,
5278 &filters,
5279 SemanticCandidateSearchRequest {
5280 fetch_limit: fallback_fetch_limit,
5281 approximate,
5282 tier_mode,
5283 in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
5284 ann_index: ann_index.as_ref(),
5285 },
5286 )?;
5287 if !self.semantic_context_matches(&context_token)? {
5288 tracing::debug!("semantic context changed during retry fetch; retrying");
5289 continue;
5290 }
5291 (available_hits, paged_hits) = finalize_hits(&retry_results)?;
5292 ann_stats = retry_ann_stats;
5293 }
5294
5295 tracing::trace!(
5296 query = canonical,
5297 target_hits,
5298 available_hits,
5299 returned = paged_hits.len(),
5300 "semantic fetch complete"
5301 );
5302
5303 return Ok((paged_hits, ann_stats));
5304 }
5305 }
5306
5307 fn hydrate_semantic_hits(
5308 &self,
5309 results: &[VectorSearchResult],
5310 field_mask: FieldMask,
5311 ) -> Result<Vec<SearchHit>> {
5312 self.hydrate_semantic_hits_with_ids(results, field_mask)
5313 .map(|rows| rows.into_iter().map(|(_, hit)| hit).collect())
5314 }
5315
5316 fn postprocess_hits_page(
5317 &self,
5318 hits: Vec<SearchHit>,
5319 query: &str,
5320 filters: &SearchFilters,
5321 limit: usize,
5322 offset: usize,
5323 ) -> (usize, Vec<SearchHit>) {
5324 let mut hits = deduplicate_hits_with_query(hits, query);
5325 if !filters.session_paths.is_empty() {
5326 hits.retain(|hit| filters.session_paths.contains(&hit.source_path));
5327 }
5328 let available_hits = hits.len();
5329 let paged_hits = hits.into_iter().skip(offset).take(limit).collect();
5330 (available_hits, paged_hits)
5331 }
5332
5333 pub fn search_with_fallback(
5337 &self,
5338 query: &str,
5339 filters: SearchFilters,
5340 limit: usize,
5341 offset: usize,
5342 sparse_threshold: usize,
5343 field_mask: FieldMask,
5344 ) -> Result<SearchResult> {
5345 let hits = self.search(query, filters.clone(), limit, offset, field_mask)?;
5347 let baseline_stats = self.cache_stats();
5348 let tantivy_total = self
5350 .last_tantivy_total_count
5351 .lock()
5352 .ok()
5353 .and_then(|guard| *guard);
5354
5355 let query_has_wildcards = query.contains('*');
5357 let has_boolean_or_phrase = fs_cass_has_boolean_operators(query);
5358 let is_sparse = should_try_wildcard_fallback(hits.len(), limit, offset, sparse_threshold);
5359
5360 if !is_sparse || query_has_wildcards || has_boolean_or_phrase || query.trim().is_empty() {
5361 let suggestions = if hits.is_empty() && !query.trim().is_empty() {
5365 self.generate_suggestions(query, &filters)
5366 } else {
5367 Vec::new()
5368 };
5369 return Ok(SearchResult {
5370 hits,
5371 wildcard_fallback: false,
5372 cache_stats: baseline_stats,
5373 suggestions,
5374 ann_stats: None,
5375 total_count: tantivy_total,
5376 });
5377 }
5378
5379 if should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(query, hits.len()) {
5380 let suggestions = if hits.is_empty() {
5381 self.generate_suggestions(query, &filters)
5382 } else {
5383 Vec::new()
5384 };
5385 return Ok(SearchResult {
5386 hits,
5387 wildcard_fallback: false,
5388 cache_stats: baseline_stats,
5389 suggestions,
5390 ann_stats: None,
5391 total_count: tantivy_total,
5392 });
5393 }
5394
5395 let wildcard_query = query
5397 .split_whitespace()
5398 .map(|term| format!("*{}*", term.trim_matches('*')))
5399 .collect::<Vec<_>>()
5400 .join(" ");
5401
5402 tracing::info!(
5403 original_query = query,
5404 wildcard_query = wildcard_query,
5405 original_count = hits.len(),
5406 "wildcard_fallback"
5407 );
5408
5409 let mut fallback_hits =
5410 self.search(&wildcard_query, filters.clone(), limit, offset, field_mask)?;
5411 let fallback_stats = self.cache_stats();
5412 let fallback_tantivy_total = self
5414 .last_tantivy_total_count
5415 .lock()
5416 .ok()
5417 .and_then(|guard| *guard);
5418
5419 if fallback_hits.len() > hits.len() {
5421 for hit in &mut fallback_hits {
5423 hit.match_type = MatchType::ImplicitWildcard;
5424 }
5425 let suggestions = if fallback_hits.is_empty() {
5427 self.generate_suggestions(query, &filters)
5428 } else {
5429 Vec::new()
5430 };
5431 Ok(SearchResult {
5432 hits: fallback_hits,
5433 wildcard_fallback: true,
5434 cache_stats: fallback_stats,
5435 suggestions,
5436 ann_stats: None,
5437 total_count: fallback_tantivy_total,
5438 })
5439 } else {
5440 let suggestions = if hits.is_empty() {
5443 self.generate_suggestions(query, &filters)
5444 } else {
5445 Vec::new()
5446 };
5447 Ok(SearchResult {
5448 hits,
5449 wildcard_fallback: false,
5450 cache_stats: baseline_stats,
5451 suggestions,
5452 ann_stats: None,
5453 total_count: tantivy_total,
5454 })
5455 }
5456 }
5457
5458 #[allow(clippy::too_many_arguments)]
5460 pub fn search_hybrid(
5461 &self,
5462 lexical_query: &str,
5463 semantic_query: &str,
5464 filters: SearchFilters,
5465 limit: usize,
5466 offset: usize,
5467 sparse_threshold: usize,
5468 field_mask: FieldMask,
5469 approximate: bool,
5470 ) -> Result<SearchResult> {
5471 self.search_hybrid_with_tier(
5472 lexical_query,
5473 semantic_query,
5474 filters,
5475 limit,
5476 offset,
5477 sparse_threshold,
5478 field_mask,
5479 approximate,
5480 SemanticTierMode::Single,
5481 )
5482 }
5483
5484 #[allow(clippy::too_many_arguments)]
5487 pub fn search_hybrid_with_tier(
5488 &self,
5489 lexical_query: &str,
5490 semantic_query: &str,
5491 filters: SearchFilters,
5492 limit: usize,
5493 offset: usize,
5494 sparse_threshold: usize,
5495 field_mask: FieldMask,
5496 approximate: bool,
5497 semantic_tier_mode: SemanticTierMode,
5498 ) -> Result<SearchResult> {
5499 let requested_limit = limit;
5500 let total_docs = self.total_docs().max(1);
5501 let limit = if requested_limit == 0 {
5502 total_docs.min(no_limit_result_cap()).max(1)
5503 } else {
5504 requested_limit
5505 };
5506 let fetch = limit.saturating_add(offset);
5507 if fetch == 0 {
5508 return Ok(SearchResult {
5509 hits: Vec::new(),
5510 wildcard_fallback: false,
5511 cache_stats: self.cache_stats(),
5512 suggestions: Vec::new(),
5513 ann_stats: None,
5514 total_count: None,
5515 });
5516 }
5517
5518 if semantic_query.trim().is_empty() {
5519 return self.search_with_fallback(
5520 lexical_query,
5521 filters,
5522 limit,
5523 offset,
5524 sparse_threshold,
5525 field_mask,
5526 );
5527 }
5528
5529 let budget =
5530 hybrid_candidate_budget(semantic_query, requested_limit, limit, offset, total_docs);
5531 let lexical = self.search_with_fallback(
5532 lexical_query,
5533 filters.clone(),
5534 budget.lexical_candidates,
5535 0,
5536 sparse_threshold,
5537 field_mask,
5538 )?;
5539 let (semantic_hits, semantic_ann_stats) = self.search_semantic_with_tier(
5540 semantic_query,
5541 filters,
5542 budget.semantic_candidates,
5543 0,
5544 field_mask,
5545 approximate,
5546 semantic_tier_mode,
5547 )?;
5548 let fused = rrf_fuse_hits(&lexical.hits, &semantic_hits, semantic_query, limit, offset);
5549 let suggestions = if fused.is_empty() {
5550 lexical.suggestions.clone()
5551 } else {
5552 Vec::new()
5553 };
5554 Ok(SearchResult {
5555 hits: fused,
5556 wildcard_fallback: lexical.wildcard_fallback,
5557 cache_stats: lexical.cache_stats,
5558 suggestions,
5559 ann_stats: semantic_ann_stats,
5560 total_count: None,
5561 })
5562 }
5563
5564 fn generate_suggestions(&self, query: &str, filters: &SearchFilters) -> Vec<QuerySuggestion> {
5566 let mut suggestions = Vec::new();
5567 let query_lower = query.to_lowercase();
5568
5569 if !query.contains('*') && query.len() >= 2 {
5571 suggestions.push(QuerySuggestion::wildcard(query).with_shortcut(1));
5572 }
5573
5574 if !filters.agents.is_empty() {
5576 let agents: Vec<&str> = filters
5577 .agents
5578 .iter()
5579 .map(std::string::String::as_str)
5580 .collect();
5581 let agent_str = agents.join(", ");
5582 suggestions
5583 .push(QuerySuggestion::remove_agent_filter(&agent_str, filters).with_shortcut(2));
5584 }
5585
5586 let known_agents = [
5588 "codex",
5589 "claude",
5590 "claude_code",
5591 "cline",
5592 "gemini",
5593 "amp",
5594 "opencode",
5595 ];
5596 for agent in &known_agents {
5597 if levenshtein_distance(&query_lower, agent) <= 2 && query_lower != *agent {
5598 suggestions.push(
5599 QuerySuggestion::spelling(query, agent)
5600 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5601 );
5602 break; }
5604 }
5605
5606 if filters.agents.is_empty()
5610 && let Ok(sqlite_guard) = self.sqlite.lock()
5611 && let Some(conn) = sqlite_guard.as_ref()
5612 && let Ok(rows) = conn.query_map_collect(
5613 "SELECT a.slug
5614 FROM conversations c
5615 JOIN agents a ON c.agent_id = a.id
5616 GROUP BY a.slug
5617 ORDER BY MAX(c.id) DESC
5618 LIMIT 3",
5619 &[],
5620 |row: &frankensqlite::Row| row.get_typed::<String>(0),
5621 )
5622 {
5623 for row in rows {
5624 if suggestions.len() < 3 {
5625 suggestions.push(
5626 QuerySuggestion::try_agent(&row)
5627 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5628 );
5629 }
5630 }
5631 }
5632
5633 suggestions.truncate(3);
5635 for (i, sugg) in suggestions.iter_mut().enumerate() {
5636 sugg.shortcut = Some((i + 1) as u8);
5637 }
5638
5639 suggestions
5640 }
5641
5642 fn searcher_for_thread(&self, reader: &IndexReader) -> Searcher {
5643 let epoch = self.reload_epoch.load(Ordering::Relaxed);
5644 let reader_key = reader as *const IndexReader as usize;
5645 THREAD_SEARCHER.with(|slot| {
5646 let mut slot = slot.borrow_mut();
5647 if let Some(entry) = slot.as_ref()
5648 && entry.epoch == epoch
5649 && entry.reader_key == reader_key
5650 {
5651 return entry.searcher.clone();
5652 }
5653 let searcher = reader.searcher();
5654 *slot = Some(SearcherCacheEntry {
5655 epoch,
5656 reader_key,
5657 searcher: searcher.clone(),
5658 });
5659 searcher
5660 })
5661 }
5662
5663 fn federated_readers(&self) -> Option<Arc<Vec<FederatedIndexReader>>> {
5664 FEDERATED_SEARCH_READERS
5665 .read()
5666 .get(&self.cache_namespace)
5667 .cloned()
5668 }
5669
5670 fn maybe_reload_federated_readers(
5671 &self,
5672 readers: &[FederatedIndexReader],
5673 ) -> Result<Option<u64>> {
5674 if !self.reload_on_search || readers.is_empty() {
5675 return Ok(None);
5676 }
5677 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
5678 let now = Instant::now();
5679 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
5680 if guard
5681 .map(|t| now.duration_since(t) < MIN_RELOAD_INTERVAL)
5682 .unwrap_or(false)
5683 {
5684 let signature = self.federated_generation_signature(readers);
5685 return Ok(Some(signature));
5686 }
5687
5688 let reload_started = Instant::now();
5689 for shard in readers {
5690 shard.reader.reload()?;
5691 }
5692 let elapsed = reload_started.elapsed();
5693 *guard = Some(now);
5694 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
5695 self.metrics.record_reload(elapsed);
5696 tracing::debug!(
5697 duration_ms = elapsed.as_millis() as u64,
5698 reload_epoch = epoch,
5699 shards = readers.len(),
5700 "tantivy_reader_reload_federated"
5701 );
5702 Ok(Some(self.federated_generation_signature(readers)))
5703 }
5704
5705 fn federated_generation_signature(&self, readers: &[FederatedIndexReader]) -> u64 {
5706 let mut hasher = std::collections::hash_map::DefaultHasher::new();
5707 readers.len().hash(&mut hasher);
5708 for shard in readers {
5709 self.searcher_for_thread(&shard.reader)
5710 .generation()
5711 .generation_id()
5712 .hash(&mut hasher);
5713 }
5714 hasher.finish()
5715 }
5716
5717 fn track_generation(&self, generation: u64) {
5718 let mut guard = self
5719 .last_generation
5720 .lock()
5721 .unwrap_or_else(|e| e.into_inner());
5722 if let Some(prev) = *guard
5723 && prev != generation
5724 && let Ok(mut cache) = self.prefix_cache.lock()
5725 {
5726 cache.clear();
5727 }
5728 *guard = Some(generation);
5729 }
5730
5731 fn hydrate_tantivy_hit_contents(
5732 &self,
5733 exact_keys: &[TantivyContentExactKey],
5734 fallback_keys: &[TantivyContentFallbackKey],
5735 ) -> Result<TantivyHydratedContentMaps> {
5736 if exact_keys.is_empty() && fallback_keys.is_empty() {
5737 return Ok((HashMap::new(), HashMap::new()));
5738 }
5739
5740 let sqlite_guard = match self.sqlite_guard() {
5741 Ok(guard) => guard,
5742 Err(_) => return Ok((HashMap::new(), HashMap::new())),
5743 };
5744 let Some(conn) = sqlite_guard.as_ref() else {
5745 return Ok((HashMap::new(), HashMap::new()));
5746 };
5747
5748 let mut hydrated_exact = HashMap::new();
5749 let mut hydrated_fallback = HashMap::new();
5750 const CHUNK_SIZE: usize = 300;
5751
5752 if !exact_keys.is_empty() {
5753 let mut unique_exact_keys = Vec::with_capacity(exact_keys.len());
5754 let mut seen = HashSet::with_capacity(exact_keys.len());
5755 for key in exact_keys {
5756 if seen.insert(*key) {
5757 unique_exact_keys.push(*key);
5758 }
5759 }
5760
5761 hydrated_exact.extend(hydrate_message_content_by_conversation(
5762 conn,
5763 &unique_exact_keys,
5764 )?);
5765 }
5766
5767 if !fallback_keys.is_empty() {
5768 let mut unique_fallback_keys = Vec::with_capacity(fallback_keys.len());
5769 let mut seen = HashSet::with_capacity(fallback_keys.len());
5770 for key in fallback_keys {
5771 if seen.insert(key.clone()) {
5772 unique_fallback_keys.push(key.clone());
5773 }
5774 }
5775
5776 let mut unique_source_paths = Vec::with_capacity(unique_fallback_keys.len());
5777 let mut seen_source_paths = HashSet::with_capacity(unique_fallback_keys.len());
5778 for (_, source_path, _) in &unique_fallback_keys {
5779 if seen_source_paths.insert(source_path.clone()) {
5780 unique_source_paths.push(source_path.clone());
5781 }
5782 }
5783
5784 let mut conversations_by_key: HashMap<(String, String), Vec<i64>> = HashMap::new();
5785 for chunk in unique_source_paths.chunks(CHUNK_SIZE) {
5786 let placeholders = sql_placeholders(chunk.len());
5787 let sql = format!(
5788 "SELECT c.id,
5789 c.source_path,
5790 COALESCE(c.source_id, ''),
5791 COALESCE(c.origin_host, ''),
5792 COALESCE(s.kind, '')
5793 FROM conversations c
5794 LEFT JOIN sources s ON c.source_id = s.id
5795 WHERE c.source_path IN ({placeholders})
5796 ORDER BY c.id"
5797 );
5798 let params = chunk
5799 .iter()
5800 .map(|source_path| ParamValue::from(source_path.clone()))
5801 .collect::<Vec<_>>();
5802 let rows: Vec<(i64, String, String, String, String)> =
5803 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
5804 Ok((
5805 row.get_typed(0)?,
5806 row.get_typed(1)?,
5807 row.get_typed(2)?,
5808 row.get_typed(3)?,
5809 row.get_typed(4)?,
5810 ))
5811 })?;
5812
5813 for (conversation_id, source_path, raw_source_id, origin_host, origin_kind) in rows
5814 {
5815 let normalized_source_id = normalized_search_hit_source_id_parts(
5816 &raw_source_id,
5817 &origin_kind,
5818 (!origin_host.trim().is_empty()).then_some(origin_host.as_str()),
5819 );
5820 conversations_by_key
5821 .entry((normalized_source_id, source_path))
5822 .or_default()
5823 .push(conversation_id);
5824 }
5825 }
5826
5827 let mut message_requests = Vec::new();
5828 let mut fallback_keys_by_exact: HashMap<
5829 TantivyContentExactKey,
5830 Vec<TantivyContentFallbackKey>,
5831 > = HashMap::new();
5832 let mut seen_message_requests = HashSet::new();
5833 for (source_id, source_path, line_idx) in &unique_fallback_keys {
5834 let key = (source_id.clone(), source_path.clone());
5835 let Some(conversation_ids) = conversations_by_key.get(&key) else {
5836 continue;
5837 };
5838 for &conversation_id in conversation_ids {
5839 let exact_key = (conversation_id, *line_idx);
5840 if seen_message_requests.insert(exact_key) {
5841 message_requests.push(exact_key);
5842 }
5843 fallback_keys_by_exact.entry(exact_key).or_default().push((
5844 source_id.clone(),
5845 source_path.clone(),
5846 *line_idx,
5847 ));
5848 }
5849 }
5850
5851 for ((conversation_id, line_idx), content) in
5852 hydrate_message_content_by_conversation(conn, &message_requests)?
5853 {
5854 if let Some(fallback_keys) =
5855 fallback_keys_by_exact.get(&(conversation_id, line_idx))
5856 {
5857 for fallback_key in fallback_keys {
5858 hydrated_fallback.insert(fallback_key.clone(), content.clone());
5859 }
5860 }
5861 }
5862 }
5863
5864 Ok((hydrated_exact, hydrated_fallback))
5865 }
5866
    /// Runs a lexical query against one Tantivy reader and materializes the requested page.
    ///
    /// The Tantivy query is built from `raw_query` and `filters`; `sanitized_query` is used
    /// only for prefix detection, match-type classification and snippet rendering.
    /// `field_mask` decides which of title / content / snippet are populated on each hit.
    ///
    /// Returns the hits for the `limit`/`offset` window together with the total count
    /// reported by `execute_query_with_lazy_exact_count`.
    ///
    /// # Errors
    ///
    /// Propagates failures from reloading the reader, executing the query, loading a stored
    /// document, and hydrating content that is missing from the index.
    #[allow(clippy::too_many_arguments)]
    fn search_tantivy(
        &self,
        reader: &IndexReader,
        fields: &FsCassFields,
        raw_query: &str,
        sanitized_query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<(Vec<SearchHit>, usize)> {
        // Stored fields of one ranked document, parked until missing content for the whole
        // page has been hydrated in a single batch.
        struct PendingTantivyHit {
            score: f32,
            doc: TantivyDocument,
            title: String,
            stored_content: String,
            stored_preview: String,
            agent: String,
            source_path: String,
            workspace: String,
            workspace_original: Option<String>,
            created_at: Option<i64>,
            line_number: Option<usize>,
            stored_preview_snippet: Option<String>,
            source_id: String,
            conversation_id: Option<i64>,
            raw_origin_kind: Option<String>,
            origin_host: Option<String>,
        }

        self.maybe_reload_reader(reader)?;
        let searcher = self.searcher_for_thread(reader);
        self.track_generation(searcher.generation().generation_id());

        // A snippet is rendered from content, so asking for a snippet implies needing content.
        let wants_snippet = field_mask.wants_snippet();
        let needs_content = field_mask.needs_content() || wants_snippet;

        // Translate the caller-facing filters into the query builder's filter type.
        let fs_filters = FsCassQueryFilters {
            agents: filters.agents.into_iter().collect(),
            workspaces: filters.workspaces.into_iter().collect(),
            created_from: filters.created_from,
            created_to: filters.created_to,
            source_filter: match filters.source_filter {
                SourceFilter::All => FsCassSourceFilter::All,
                SourceFilter::Local => FsCassSourceFilter::Local,
                SourceFilter::Remote => FsCassSourceFilter::Remote,
                SourceFilter::SourceId(id) => {
                    FsCassSourceFilter::SourceId(normalize_search_source_filter_value(&id))
                }
            },
        };

        let q: Box<dyn Query> = fs_cass_build_tantivy_query(raw_query, &fs_filters, fields);

        let prefix_only = is_prefix_only(sanitized_query);
        let top_docs = execute_query_with_lazy_exact_count(&searcher, &*q, limit, offset)?;
        let tantivy_total_count = top_docs.total_count;
        let query_match_type = dominant_match_type(sanitized_query);
        let mut pending_hits = Vec::with_capacity(top_docs.hits.len());
        // Keys of hits whose content must be fetched outside the index: by
        // (conversation id, message index) when the conversation id is stored, otherwise by
        // (source id, source path, message index).
        let mut missing_exact_content_keys = Vec::new();
        let mut missing_fallback_content_keys = Vec::new();

        // Pass 1: read the stored fields of every ranked document and note which hits still
        // lack usable content.
        for ranked_hit in top_docs.hits {
            let score = ranked_hit.bm25_score;
            let doc: TantivyDocument = fs_load_doc(&searcher, ranked_hit.doc_address)?;
            let title = if field_mask.wants_title() {
                doc.get_first(fields.title)
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string()
            } else {
                String::new()
            };
            let stored_content = doc
                .get_first(fields.content)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let stored_preview = doc
                .get_first(fields.preview)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            // `Some` when a snippet can be produced from the preview alone.
            let stored_preview_snippet = snippet_from_preview_without_full_content(
                field_mask,
                &stored_preview,
                sanitized_query,
            );
            let agent = doc
                .get_first(fields.agent)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let workspace = doc
                .get_first(fields.workspace)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let workspace_original = doc
                .get_first(fields.workspace_original)
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(String::from);
            let created_at = doc.get_first(fields.created_at).and_then(|v| v.as_i64());
            // The stored message index is shifted by one to form the reported line number.
            let line_number = doc
                .get_first(fields.msg_idx)
                .and_then(|v| v.as_u64())
                .and_then(|i| usize::try_from(i).ok())
                .map(|i| i.saturating_add(1));
            let raw_source_id = doc
                .get_first(fields.source_id)
                .and_then(|v| v.as_str())
                .unwrap_or_default()
                .to_string();
            // `conversation_id` is an optional schema field, hence the extra `and_then`.
            let conversation_id = fields
                .conversation_id
                .and_then(|field| doc.get_first(field))
                .and_then(|v| v.as_i64());
            let source_path = doc
                .get_first(fields.source_path)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let raw_origin_kind = doc
                .get_first(fields.origin_kind)
                .and_then(|v| v.as_str())
                .map(str::to_string);
            let origin_host = doc
                .get_first(fields.origin_host)
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(String::from);
            let source_id = normalized_search_hit_source_id_parts(
                raw_source_id.as_str(),
                raw_origin_kind.as_deref().unwrap_or_default(),
                origin_host.as_deref(),
            );

            // The preview can stand in for content either when the mask only asks for
            // bounded content, or when full content is wanted and the preview is complete.
            let preview_satisfies_bounded_content =
                field_mask.preview_content_limit().is_some() && !stored_preview.is_empty();
            let preview_satisfies_full_content = field_mask.needs_content()
                && field_mask.preview_content_limit().is_none()
                && stored_preview_is_complete_content(&stored_preview);
            // Request hydration only when content is needed, the message index is known,
            // and neither stored content, the preview, nor a preview snippet covers it.
            if needs_content
                && let Some(line_idx) = line_number
                    .and_then(|line| line.checked_sub(1))
                    .and_then(|line| i64::try_from(line).ok())
                && stored_content.is_empty()
                && !preview_satisfies_bounded_content
                && !preview_satisfies_full_content
                && stored_preview_snippet.is_none()
            {
                if let Some(conversation_id) = conversation_id {
                    missing_exact_content_keys.push((conversation_id, line_idx));
                } else {
                    missing_fallback_content_keys.push((
                        source_id.clone(),
                        source_path.clone(),
                        line_idx,
                    ));
                }
            }

            pending_hits.push(PendingTantivyHit {
                score,
                doc,
                title,
                stored_content,
                stored_preview,
                agent,
                source_path,
                workspace,
                workspace_original,
                created_at,
                line_number,
                stored_preview_snippet,
                source_id,
                conversation_id,
                raw_origin_kind,
                origin_host,
            });
        }

        // One batched hydration call for the whole page; skipped when nothing is missing.
        let (hydrated_contents, hydrated_fallback_contents) = if needs_content
            && (!missing_exact_content_keys.is_empty() || !missing_fallback_content_keys.is_empty())
        {
            self.hydrate_tantivy_hit_contents(
                &missing_exact_content_keys,
                &missing_fallback_content_keys,
            )?
        } else {
            (HashMap::new(), HashMap::new())
        };
        // The Tantivy snippet generator is only built when a snippet is wanted, the query is
        // not prefix-only, and at least one hit has no preview-derived snippet.
        let needs_tantivy_snippet_generator = wants_snippet
            && !prefix_only
            && pending_hits
                .iter()
                .any(|pending| pending.stored_preview_snippet.is_none());
        let snippet_generator = if needs_tantivy_snippet_generator {
            let snippet_cfg = FsSnippetConfig {
                max_chars: 160,
                highlight_prefix: "<b>".to_string(),
                highlight_postfix: "</b>".to_string(),
            };
            fs_try_build_snippet_generator(&searcher, &*q, fields.content, &snippet_cfg)
        } else {
            None
        };
        // Pass 2: pick the content for each hit, render its snippet and build the SearchHit.
        let mut hits = Vec::with_capacity(pending_hits.len());
        for pending in pending_hits {
            // Look up hydrated content using the same key shape chosen in pass 1.
            let hydrated_content = pending
                .line_number
                .and_then(|line| line.checked_sub(1))
                .and_then(|line| i64::try_from(line).ok())
                .and_then(|line_idx| {
                    if let Some(conversation_id) = pending.conversation_id {
                        hydrated_contents.get(&(conversation_id, line_idx)).cloned()
                    } else {
                        hydrated_fallback_contents
                            .get(&(
                                pending.source_id.clone(),
                                pending.source_path.clone(),
                                line_idx,
                            ))
                            .cloned()
                    }
                });
            let preview_satisfies_effective_content = !pending.stored_preview.is_empty()
                && (field_mask.preview_content_limit().is_some()
                    || (field_mask.needs_content()
                        && field_mask.preview_content_limit().is_none()
                        && stored_preview_is_complete_content(&pending.stored_preview)));
            // Preference order: stored content, an acceptable preview, hydrated content,
            // and finally whatever preview exists (possibly empty).
            let effective_content = if !pending.stored_content.is_empty() {
                pending.stored_content.clone()
            } else if preview_satisfies_effective_content {
                pending.stored_preview.clone()
            } else if let Some(content) = hydrated_content {
                content
            } else {
                pending.stored_preview.clone()
            };
            let snippet = if wants_snippet {
                if let Some(snippet) = pending.stored_preview_snippet.clone() {
                    snippet
                } else if let Some(r#gen) = &snippet_generator {
                    // Render from the indexed document when it stores content; otherwise
                    // wrap the effective content in a throwaway document for the generator.
                    let rendered = if !pending.stored_content.is_empty() {
                        fs_render_snippet_html(r#gen, &pending.doc, "<b>", "</b>")
                    } else if !effective_content.is_empty() {
                        let mut snippet_doc = TantivyDocument::new();
                        snippet_doc.add_text(fields.content, &effective_content);
                        fs_render_snippet_html(r#gen, &snippet_doc, "<b>", "</b>")
                    } else {
                        None
                    };
                    // Convert the HTML highlight tags to `**` markers; fall back to the
                    // prefix-snippet helpers when the generator produced nothing.
                    rendered
                        .map(|html| html.replace("<b>", "**").replace("</b>", "**"))
                        .or_else(|| cached_prefix_snippet(&effective_content, sanitized_query, 160))
                        .unwrap_or_else(|| {
                            quick_prefix_snippet(&effective_content, sanitized_query, 160)
                        })
                } else if let Some(sn) =
                    cached_prefix_snippet(&effective_content, sanitized_query, 160)
                {
                    sn
                } else {
                    quick_prefix_snippet(&effective_content, sanitized_query, 160)
                }
            } else {
                String::new()
            };
            let content = if field_mask.needs_content() {
                effective_content.clone()
            } else {
                String::new()
            };
            // The hash is taken over the effective content even when `content` is withheld.
            let content_hash = stable_hit_hash(
                &effective_content,
                &pending.source_path,
                pending.line_number,
                pending.created_at,
            );
            let origin_kind = normalized_search_hit_origin_kind(
                &pending.source_id,
                pending.raw_origin_kind.as_deref(),
            )
            .to_string();
            hits.push(SearchHit {
                title: pending.title,
                snippet,
                content,
                content_hash,
                conversation_id: pending.conversation_id,
                score: pending.score,
                source_path: pending.source_path,
                agent: pending.agent,
                workspace: pending.workspace,
                workspace_original: pending.workspace_original,
                created_at: pending.created_at,
                line_number: pending.line_number,
                match_type: query_match_type,
                source_id: pending.source_id,
                origin_kind,
                origin_host: pending.origin_host,
            });
        }
        Ok((hits, tantivy_total_count))
    }
6179
6180 #[allow(clippy::too_many_arguments)]
6181 fn search_tantivy_federated(
6182 &self,
6183 readers: &[FederatedIndexReader],
6184 raw_query: &str,
6185 sanitized_query: &str,
6186 filters: SearchFilters,
6187 limit: usize,
6188 field_mask: FieldMask,
6189 ) -> Result<(Vec<SearchHit>, usize)> {
6190 let mut ranked_hits = Vec::new();
6191 let mut total_count = 0usize;
6192
6193 for (shard_index, shard) in readers.iter().enumerate() {
6194 let (shard_hits, shard_total_count) = self.search_tantivy(
6195 &shard.reader,
6196 &shard.fields,
6197 raw_query,
6198 sanitized_query,
6199 filters.clone(),
6200 limit,
6201 0,
6202 field_mask,
6203 )?;
6204 total_count = total_count.saturating_add(shard_total_count);
6205 for (shard_rank, hit) in shard_hits.into_iter().enumerate() {
6206 ranked_hits.push(FederatedRankedHit {
6207 hit,
6208 shard_index,
6209 shard_rank,
6210 fused_score: federated_rrf_score(shard_rank),
6211 });
6212 }
6213 }
6214
6215 let raw_hit_count = ranked_hits.len();
6216 let generation_signature = self.federated_generation_signature(readers);
6217 self.track_generation(generation_signature);
6218 let combined_hits = merge_federated_ranked_hits(ranked_hits);
6219 tracing::debug!(
6220 generation_signature,
6221 shard_count = readers.len(),
6222 total_count,
6223 raw_hit_count,
6224 returned_hit_count = combined_hits.len(),
6225 merge_policy = "rrf_rank_then_stable_hit_key",
6226 "federated lexical search merged shard results"
6227 );
6228
6229 Ok((combined_hits, total_count))
6230 }
6231
6232 fn sqlite_fts_uses_message_id_column(conn: &Connection) -> Result<bool> {
6233 let params: [ParamValue; 0] = [];
6234 let ddl_rows: Vec<String> = franken_query_map_collect_retry(
6235 conn,
6236 "SELECT COALESCE(sql, '')
6237 FROM sqlite_master
6238 WHERE name = 'fts_messages'
6239 ORDER BY rowid DESC
6240 LIMIT 1",
6241 ¶ms,
6242 |row: &frankensqlite::Row| row.get_typed::<String>(0),
6243 )?;
6244 Ok(ddl_rows
6245 .first()
6246 .map(|sql| sql.to_ascii_lowercase().contains("message_id"))
6247 .unwrap_or(false))
6248 }
6249
6250 fn sqlite_fts5_rank_query(
6251 fts_query: &str,
6252 filters: &SearchFilters,
6253 limit: usize,
6254 offset: usize,
6255 uses_message_id: bool,
6256 ) -> (String, Vec<ParamValue>) {
6257 let normalized_source_sql =
6258 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6259 let created_at_expr = "CAST(fts_messages.created_at AS INTEGER)";
6260 let message_key_expr = if uses_message_id {
6261 "CAST(fts_messages.message_id AS INTEGER)"
6262 } else {
6263 "fts_messages.rowid"
6264 };
6265
6266 let mut sql = format!(
6267 "SELECT fts_messages.rowid,
6268 bm25(fts_messages)
6269 FROM fts_messages
6270 LEFT JOIN messages m ON {message_key_expr} = m.id
6271 LEFT JOIN conversations c ON m.conversation_id = c.id
6272 LEFT JOIN sources s ON c.source_id = s.id
6273 WHERE fts_messages MATCH ?"
6274 );
6275 let mut params = Vec::with_capacity(filters.agents.len() + filters.workspaces.len() + 5);
6276 params.push(ParamValue::from(fts_query));
6277
6278 if !filters.agents.is_empty() {
6279 let placeholders = sql_placeholders(filters.agents.len());
6280 sql.push_str(&format!(" AND fts_messages.agent IN ({placeholders})"));
6281 for agent in &filters.agents {
6282 params.push(ParamValue::from(agent.as_str()));
6283 }
6284 }
6285
6286 if !filters.workspaces.is_empty() {
6287 let placeholders = sql_placeholders(filters.workspaces.len());
6288 sql.push_str(&format!(
6289 " AND COALESCE(fts_messages.workspace, '') IN ({placeholders})"
6290 ));
6291 for workspace in &filters.workspaces {
6292 params.push(ParamValue::from(workspace.as_str()));
6293 }
6294 }
6295
6296 if let Some(created_from) = filters.created_from {
6297 sql.push_str(&format!(" AND {created_at_expr} >= ?"));
6298 params.push(ParamValue::from(created_from));
6299 }
6300 if let Some(created_to) = filters.created_to {
6301 sql.push_str(&format!(" AND {created_at_expr} <= ?"));
6302 params.push(ParamValue::from(created_to));
6303 }
6304
6305 match &filters.source_filter {
6306 SourceFilter::All => {}
6307 SourceFilter::Local => sql.push_str(&format!(
6308 " AND {normalized_source_sql} = '{local}'",
6309 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6310 )),
6311 SourceFilter::Remote => sql.push_str(&format!(
6312 " AND {normalized_source_sql} != '{local}'",
6313 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6314 )),
6315 SourceFilter::SourceId(id) => {
6316 sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
6317 params.push(ParamValue::from(normalize_search_source_filter_value(id)));
6318 }
6319 }
6320
6321 sql.push_str(&format!(
6322 " ORDER BY bm25(fts_messages), {message_key_expr}, fts_messages.rowid LIMIT ? OFFSET ?"
6323 ));
6324 params.push(ParamValue::from(limit as i64));
6325 params.push(ParamValue::from(offset as i64));
6326
6327 (sql, params)
6328 }
6329
6330 fn sqlite_fts5_hydrate_query(
6331 row_count: usize,
6332 field_mask: FieldMask,
6333 uses_message_id: bool,
6334 ) -> String {
6335 let title_expr = if field_mask.wants_title() {
6336 "fts_messages.title"
6337 } else {
6338 "''"
6339 };
6340 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6341 "fts_messages.content"
6342 } else {
6343 "''"
6344 };
6345 let normalized_source_sql =
6346 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6347 let created_at_expr = "CAST(fts_messages.created_at AS INTEGER)";
6348 let message_key_expr = if uses_message_id {
6349 "CAST(fts_messages.message_id AS INTEGER)"
6350 } else {
6351 "fts_messages.rowid"
6352 };
6353 let placeholders = sql_placeholders(row_count);
6354
6355 format!(
6356 "SELECT fts_messages.rowid,
6357 {title_expr},
6358 {content_expr},
6359 fts_messages.agent,
6360 COALESCE(fts_messages.workspace, ''),
6361 fts_messages.source_path,
6362 {created_at_expr},
6363 m.idx,
6364 c.id,
6365 {normalized_source_sql},
6366 c.origin_host,
6367 s.kind
6368 FROM fts_messages
6369 LEFT JOIN messages m ON {message_key_expr} = m.id
6370 LEFT JOIN conversations c ON m.conversation_id = c.id
6371 LEFT JOIN sources s ON c.source_id = s.id
6372 WHERE fts_messages.rowid IN ({placeholders})"
6373 )
6374 }
6375
6376 fn sqlite_fts5_hydrate_row_chunks(
6377 ranked_rows: &[(i64, f64)],
6378 ) -> impl Iterator<Item = &[(i64, f64)]> {
6379 const _: () = assert!(SQLITE_FTS5_HYDRATE_PARAM_CHUNK <= SQLITE_MAX_VARIABLE_NUMBER);
6380 ranked_rows.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK)
6381 }
6382
    /// Searches the SQLite `fts_messages` table and returns hits in rank order.
    ///
    /// The search runs in two steps: a rank query that yields `(rowid, bm25)` pairs for the
    /// requested `limit`/`offset` window, then chunked hydration queries that load the
    /// display columns for those rowids.
    ///
    /// This path degrades rather than fails. It returns an empty list when the query cannot
    /// be transpiled to FTS5 syntax, when no SQLite connection is open, when the FTS table is
    /// absent or not inspectable, and when the rank or hydration query errors (the latter
    /// two are logged at warn level).
    ///
    /// # Errors
    ///
    /// Only a failure to obtain the SQLite guard is propagated.
    fn search_sqlite_fts5(
        &self,
        _db_path: &Path,
        raw_query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<Vec<SearchHit>> {
        // Queries that cannot be transpiled, or that transpile to blank text, match nothing.
        let fts_query = match transpile_to_fts5(raw_query) {
            Some(q) if !q.trim().is_empty() => q,
            _ => return Ok(Vec::new()),
        };

        let sqlite_guard = self.sqlite_guard()?;
        let Some(conn) = sqlite_guard.as_ref() else {
            return Ok(Vec::new());
        };

        // A failed catalog probe is treated the same as a missing FTS table.
        let empty_params: [ParamValue; 0] = [];
        let has_fts = franken_query_map_collect_retry(
            conn,
            "SELECT 1 FROM sqlite_master WHERE name = 'fts_messages'",
            &empty_params,
            |row| row.get_typed::<i64>(0),
        )
        .map(|rows| !rows.is_empty())
        .unwrap_or(false);
        if !has_fts {
            return Ok(Vec::new());
        }

        let query_match_type = dominant_match_type(raw_query);
        let uses_message_id =
            if let Ok(uses_message_id) = Self::sqlite_fts_uses_message_id_column(conn) {
                uses_message_id
            } else {
                tracing::warn!(
                    "sqlite FTS fallback is present but not queryable; skipping fallback search"
                );
                return Ok(Vec::new());
            };
        // Step 1: rank. Yields (fts rowid, bm25) pairs in result order.
        let (rank_sql, rank_params) = Self::sqlite_fts5_rank_query(
            fts_query.as_str(),
            &filters,
            limit,
            offset,
            uses_message_id,
        );
        let ranked_rows: Vec<(i64, f64)> =
            match franken_query_map_collect_retry(conn, &rank_sql, &rank_params, |row| {
                Ok((row.get_typed(0)?, row.get_typed(1)?))
            }) {
                Ok(rows) => rows,
                Err(err) => {
                    tracing::warn!(
                        error = %err,
                        "sqlite FTS fallback rank query failed; returning no fallback hits"
                    );
                    return Ok(Vec::new());
                }
            };
        if ranked_rows.is_empty() {
            return Ok(Vec::new());
        }

        // Step 2: hydrate in chunks, keyed by rowid so rank order can be restored afterwards.
        let bm25_by_rowid: HashMap<i64, f64> = ranked_rows.iter().copied().collect();
        let mut hits_by_rowid = HashMap::with_capacity(ranked_rows.len());
        for rank_chunk in Self::sqlite_fts5_hydrate_row_chunks(&ranked_rows) {
            let hydrate_sql =
                Self::sqlite_fts5_hydrate_query(rank_chunk.len(), field_mask, uses_message_id);
            let hydrate_params = rank_chunk
                .iter()
                .map(|(fts_rowid, _)| ParamValue::from(*fts_rowid))
                .collect::<Vec<_>>();
            // Column order here must match the SELECT list of `sqlite_fts5_hydrate_query`.
            let rows: Vec<SqliteFtsHydratedRow> =
                match franken_query_map_collect_retry(conn, &hydrate_sql, &hydrate_params, |row| {
                    Ok((
                        row.get_typed(0)?,
                        row.get_typed(1)?,
                        row.get_typed(2)?,
                        row.get_typed(3)?,
                        row.get_typed(4)?,
                        row.get_typed(5)?,
                        row.get_typed(6)?,
                        row.get_typed(7)?,
                        row.get_typed(8)?,
                        row.get_typed::<Option<String>>(9)?,
                        row.get_typed(10)?,
                        row.get_typed(11)?,
                    ))
                }) {
                    Ok(rows) => rows,
                    Err(err) => {
                        // A failure in any chunk discards the hits gathered so far.
                        tracing::warn!(
                            error = %err,
                            "sqlite FTS fallback hydration query failed; returning no fallback hits"
                        );
                        return Ok(Vec::new());
                    }
                };
            for (
                fts_rowid,
                title,
                raw_content,
                agent,
                workspace,
                source_path,
                created_at,
                idx,
                conversation_id,
                raw_source_id,
                origin_host,
                raw_origin_kind,
            ) in rows
            {
                // Skip rows whose rowid was not part of the rank result.
                let Some(&bm25_score) = bm25_by_rowid.get(&fts_rowid) else {
                    continue;
                };
                let raw_source_id = raw_source_id.unwrap_or_else(default_source_id);

                let source_id = normalized_search_hit_source_id_parts(
                    raw_source_id.as_str(),
                    raw_origin_kind.as_deref().unwrap_or_default(),
                    origin_host.as_deref(),
                );
                let origin_kind = normalized_search_hit_origin_kind(
                    source_id.as_str(),
                    raw_origin_kind.as_deref(),
                )
                .to_string();
                // The message index is shifted by one to form the reported line number.
                let line_number = idx
                    .and_then(|i| usize::try_from(i).ok())
                    .map(|i| i.saturating_add(1));
                let snippet = if field_mask.wants_snippet() {
                    snippet_from_content(&raw_content)
                } else {
                    String::new()
                };
                let content = if field_mask.needs_content() {
                    raw_content
                } else {
                    String::new()
                };
                // Hash the snippet instead when content is withheld or empty.
                let content_hash = if content.is_empty() {
                    stable_hit_hash(&snippet, &source_path, line_number, created_at)
                } else {
                    stable_hit_hash(&content, &source_path, line_number, created_at)
                };

                let hit = SearchHit {
                    title,
                    snippet,
                    content,
                    content_hash,
                    conversation_id,
                    // SQLite's bm25() is smaller for better matches (the rank query sorts it
                    // ascending), so it is negated to make larger scores better.
                    score: (-bm25_score) as f32,
                    source_path,
                    agent,
                    workspace,
                    workspace_original: None,
                    created_at,
                    line_number,
                    match_type: query_match_type,
                    source_id,
                    origin_kind,
                    origin_host,
                };
                hits_by_rowid.insert(fts_rowid, hit);
            }
        }

        // Emit hits in rank order; rowids that were not hydrated are dropped.
        let mut hits = Vec::with_capacity(ranked_rows.len());
        for (fts_rowid, _) in ranked_rows {
            if let Some(hit) = hits_by_rowid.remove(&fts_rowid) {
                hits.push(hit);
            }
        }
        Ok(hits)
    }
6563
6564 pub fn browse_by_date(
6571 &self,
6572 filters: SearchFilters,
6573 limit: usize,
6574 offset: usize,
6575 newest_first: bool,
6576 field_mask: FieldMask,
6577 ) -> Result<Vec<SearchHit>> {
6578 let sqlite_guard = self.sqlite_guard()?;
6579 if let Some(conn) = sqlite_guard.as_ref() {
6580 self.browse_by_date_sqlite(conn, filters, limit, offset, newest_first, field_mask)
6581 } else {
6582 Ok(Vec::new())
6583 }
6584 }
6585
6586 fn browse_by_date_sqlite(
6587 &self,
6588 conn: &Connection,
6589 filters: SearchFilters,
6590 limit: usize,
6591 offset: usize,
6592 newest_first: bool,
6593 field_mask: FieldMask,
6594 ) -> Result<Vec<SearchHit>> {
6595 let order = if newest_first { "DESC" } else { "ASC" };
6596 let title_expr = if field_mask.wants_title() {
6597 "c.title"
6598 } else {
6599 "''"
6600 };
6601 let normalized_source_sql =
6609 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6610 let mut sql = format!(
6611 "SELECT c.id, {title_expr}, m.content, \
6612 COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'), \
6613 w.path, c.source_path, m.created_at, m.idx, \
6614 {normalized_source_sql}, c.origin_host, s.kind
6615 FROM messages m
6616 JOIN conversations c ON m.conversation_id = c.id
6617 LEFT JOIN workspaces w ON c.workspace_id = w.id
6618 LEFT JOIN sources s ON c.source_id = s.id
6619 WHERE 1=1"
6620 );
6621 let mut params: Vec<ParamValue> = Vec::new();
6622
6623 if !filters.agents.is_empty() {
6624 let placeholders = sql_placeholders(filters.agents.len());
6625 sql.push_str(&format!(
6626 " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug IN ({placeholders}))"
6627 ));
6628 for a in &filters.agents {
6629 params.push(ParamValue::from(a.as_str()));
6630 }
6631 }
6632
6633 if !filters.workspaces.is_empty() {
6634 let placeholders = sql_placeholders(filters.workspaces.len());
6635 sql.push_str(&format!(" AND COALESCE(w.path, '') IN ({placeholders})"));
6636 for w in &filters.workspaces {
6637 params.push(ParamValue::from(w.as_str()));
6638 }
6639 }
6640
6641 if let Some(created_from) = filters.created_from {
6642 sql.push_str(" AND m.created_at >= ?");
6643 params.push(ParamValue::from(created_from));
6644 }
6645 if let Some(created_to) = filters.created_to {
6646 sql.push_str(" AND m.created_at <= ?");
6647 params.push(ParamValue::from(created_to));
6648 }
6649
6650 match &filters.source_filter {
6652 SourceFilter::All => {}
6653 SourceFilter::Local => sql.push_str(&format!(
6654 " AND {normalized_source_sql} = '{local}'",
6655 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6656 )),
6657 SourceFilter::Remote => sql.push_str(&format!(
6658 " AND {normalized_source_sql} != '{local}'",
6659 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6660 )),
6661 SourceFilter::SourceId(id) => {
6662 sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
6663 params.push(ParamValue::from(normalize_search_source_filter_value(id)));
6664 }
6665 }
6666
6667 sql.push_str(&format!(
6668 " ORDER BY CASE WHEN m.created_at IS NULL THEN 1 ELSE 0 END, m.created_at {order}, m.id {order} LIMIT ? OFFSET ?"
6669 ));
6670 params.push(ParamValue::from(limit as i64));
6671 params.push(ParamValue::from(offset as i64));
6672
6673 let rows: Vec<SearchHit> =
6674 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
6675 let conversation_id: i64 = row.get_typed(0)?;
6676 let title: String = if field_mask.wants_title() {
6677 row.get_typed::<Option<String>>(1)?.unwrap_or_default()
6678 } else {
6679 String::new()
6680 };
6681 let raw_content: String = row.get_typed(2)?;
6682 let agent: String = row.get_typed(3)?;
6683 let workspace: Option<String> = row.get_typed(4)?;
6684 let source_path: String = row.get_typed(5)?;
6685 let created_at: Option<i64> = row.get_typed(6)?;
6686 let idx: Option<i64> = row.get_typed(7)?;
6687 let raw_source_id: String = row
6688 .get_typed::<Option<String>>(8)?
6689 .unwrap_or_else(default_source_id);
6690 let origin_host: Option<String> = row.get_typed(9)?;
6691 let raw_origin_kind: Option<String> = row.get_typed(10)?;
6692 let source_id = normalized_search_hit_source_id_parts(
6693 raw_source_id.as_str(),
6694 raw_origin_kind.as_deref().unwrap_or_default(),
6695 origin_host.as_deref(),
6696 );
6697 let origin_kind = normalized_search_hit_origin_kind(
6698 source_id.as_str(),
6699 raw_origin_kind.as_deref(),
6700 );
6701 let line_number = idx
6702 .and_then(|i| usize::try_from(i).ok())
6703 .map(|i| i.saturating_add(1));
6704 let snippet = if field_mask.wants_snippet() {
6705 snippet_from_content(&raw_content)
6706 } else {
6707 String::new()
6708 };
6709 let content = if field_mask.needs_content() {
6710 raw_content.clone()
6711 } else {
6712 String::new()
6713 };
6714 let content_hash =
6715 stable_hit_hash(&raw_content, &source_path, line_number, created_at);
6716 Ok(SearchHit {
6717 title,
6718 snippet,
6719 content,
6720 content_hash,
6721 conversation_id: Some(conversation_id),
6722 score: 0.0,
6723 source_path,
6724 agent,
6725 workspace: workspace.unwrap_or_default(),
6726 workspace_original: None,
6727 created_at,
6728 line_number,
6729 match_type: MatchType::Exact,
6730 source_id,
6731 origin_kind,
6732 origin_host,
6733 })
6734 })?;
6735 Ok(rows)
6736 }
6737}
6738
/// Public, doc-hidden passthrough to the private `transpile_to_fts5`.
///
/// Returns exactly what `transpile_to_fts5` returns for `raw_query`.
// NOTE(review): the name suggests this exists so fuzz targets outside the module can reach
// the transpiler — confirm against the fuzz harness.
#[doc(hidden)]
pub fn fuzz_transpile_to_fts5(raw_query: &str) -> Option<String> {
    transpile_to_fts5(raw_query)
}
6749
6750fn transpile_to_fts5(raw_query: &str) -> Option<String> {
6754 let tokens = fs_cass_parse_boolean_query(raw_query);
6755 if tokens.is_empty() {
6756 return Some("".to_string());
6757 }
6758
6759 let mut fts_clauses: Vec<(&str, String)> = Vec::new();
6760 let mut pending_or_group: Vec<String> = Vec::new();
6761 let mut next_op = "AND";
6762 let mut in_or_sequence = false;
6763 for token in tokens {
6764 match token {
6765 FsCassQueryToken::And => {
6766 if !pending_or_group.is_empty() {
6767 let group = if pending_or_group.len() > 1 {
6768 format!("({})", pending_or_group.join(" OR "))
6769 } else {
6770 pending_or_group.pop().unwrap_or_default()
6771 };
6772 fts_clauses.push(("AND", group));
6773 pending_or_group.clear();
6774 }
6775 in_or_sequence = false;
6776 next_op = "AND";
6777 }
6778 FsCassQueryToken::Or => {
6779 if fts_clauses.is_empty() && pending_or_group.is_empty() {
6780 continue;
6784 }
6785 in_or_sequence = true;
6788 }
6789 FsCassQueryToken::Not => {
6790 if in_or_sequence {
6794 return None;
6795 }
6796
6797 if fts_clauses.is_empty() && pending_or_group.is_empty() {
6798 return None;
6799 }
6800
6801 if !pending_or_group.is_empty() {
6802 let group = if pending_or_group.len() > 1 {
6803 format!("({})", pending_or_group.join(" OR "))
6804 } else {
6805 pending_or_group.pop().unwrap_or_default()
6806 };
6807 fts_clauses.push(("AND", group));
6808 pending_or_group.clear();
6809 }
6810 in_or_sequence = false;
6811 next_op = "NOT";
6812 }
6813 FsCassQueryToken::Term(t) => {
6814 let raw_pattern = FsCassWildcardPattern::parse(&t);
6815 if matches!(
6816 raw_pattern,
6817 FsCassWildcardPattern::Suffix(_)
6818 | FsCassWildcardPattern::Substring(_)
6819 | FsCassWildcardPattern::Complex(_)
6820 ) {
6821 return None;
6822 }
6823
6824 let term_parts = normalize_term_parts(&t);
6828 if term_parts.is_empty() {
6829 continue;
6830 }
6831
6832 let mut rendered_parts = Vec::with_capacity(term_parts.len());
6833 for part in &term_parts {
6834 rendered_parts.push(render_fts5_term_part(part)?);
6835 }
6836
6837 let fts_term = if rendered_parts.len() > 1 {
6840 format!("({})", rendered_parts.join(" AND "))
6841 } else {
6842 rendered_parts[0].clone()
6843 };
6844
6845 if in_or_sequence {
6846 if pending_or_group.is_empty() {
6847 let (op, _) = fts_clauses.last()?;
6848 if *op != "AND" {
6849 return None;
6852 }
6853 let (_, val) = fts_clauses.pop()?;
6854 pending_or_group.push(val);
6855 }
6856 pending_or_group.push(fts_term);
6857 in_or_sequence = true;
6858 } else {
6859 fts_clauses.push((next_op, fts_term));
6860 }
6861 next_op = "AND";
6862 }
6863 FsCassQueryToken::Phrase(p) => {
6864 let phrase_parts = normalize_phrase_terms(&p);
6865 if phrase_parts.is_empty() {
6866 continue;
6867 }
6868 let fts_phrase = format!("\"{}\"", phrase_parts.join(" "));
6869
6870 if in_or_sequence {
6871 if pending_or_group.is_empty() {
6872 let (op, _) = fts_clauses.last()?;
6873 if *op != "AND" {
6874 return None;
6877 }
6878 let (_, val) = fts_clauses.pop()?;
6879 pending_or_group.push(val);
6880 }
6881 pending_or_group.push(fts_phrase);
6882 in_or_sequence = true;
6883 } else {
6884 fts_clauses.push((next_op, fts_phrase));
6885 }
6886 next_op = "AND";
6887 }
6888 }
6889 }
6890
6891 if !pending_or_group.is_empty() {
6892 let group = if pending_or_group.len() > 1 {
6893 format!("({})", pending_or_group.join(" OR "))
6894 } else {
6895 pending_or_group.pop().unwrap_or_default()
6896 };
6897 fts_clauses.push((next_op, group));
6898 }
6899
6900 if fts_clauses.is_empty() {
6901 return Some("".to_string());
6902 }
6903
6904 if fts_clauses.first().is_some_and(|(op, _)| *op == "NOT") {
6907 return None;
6908 }
6909
6910 let mut query = String::new();
6912 for (i, (op, text)) in fts_clauses.into_iter().enumerate() {
6913 if i > 0 {
6914 query.push_str(&format!(" {} ", op));
6915 }
6916 query.push_str(&text);
6917 }
6918
6919 Some(query)
6920}
6921
/// Counters for cache behavior, reader reloads and prewarm scheduling.
///
/// Every counter sits behind an `Arc`, so clones of a `Metrics` value update and observe
/// the same underlying numbers. All accesses use `Ordering::Relaxed`.
#[derive(Default, Clone)]
struct Metrics {
    // Incremented by `inc_cache_hits`.
    cache_hits: Arc<AtomicU64>,
    // Incremented by `inc_cache_miss`.
    cache_miss: Arc<AtomicU64>,
    // Incremented by `inc_cache_shortfall`.
    cache_shortfall: Arc<AtomicU64>,
    // Number of reloads recorded.
    reloads: Arc<AtomicU64>,
    // Sum of recorded reload durations, in whole milliseconds.
    reload_ms_total: Arc<AtomicU64>,
    // Incremented by `inc_prewarm_scheduled`.
    prewarm_scheduled: Arc<AtomicU64>,
    // Incremented by `inc_prewarm_skipped_pressure`.
    prewarm_skipped_pressure: Arc<AtomicU64>,
}

impl Metrics {
    /// Adds one to the cache-hit counter.
    fn inc_cache_hits(&self) {
        self.cache_hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the cache-miss counter.
    fn inc_cache_miss(&self) {
        self.cache_miss.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the cache-shortfall counter.
    fn inc_cache_shortfall(&self) {
        self.cache_shortfall.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the prewarm-scheduled counter.
    fn inc_prewarm_scheduled(&self) {
        self.prewarm_scheduled.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the prewarm-skipped-pressure counter.
    fn inc_prewarm_skipped_pressure(&self) {
        self.prewarm_skipped_pressure
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the reload counter without recording a duration.
    fn inc_reload(&self) {
        self.reloads.fetch_add(1, Ordering::Relaxed);
    }

    /// Counts one reload and adds its duration, in whole milliseconds, to the running total.
    fn record_reload(&self, duration: Duration) {
        self.inc_reload();
        // `as_millis` yields a u128; saturate instead of silently dropping the high bits.
        let millis = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
        self.reload_ms_total.fetch_add(millis, Ordering::Relaxed);
    }

    /// Returns `(cache_hits, cache_miss, cache_shortfall, reloads, reload_ms_total)`.
    ///
    /// Each counter is loaded separately, so the tuple is not an atomic snapshot.
    fn snapshot_all(&self) -> (u64, u64, u64, u64, u128) {
        (
            self.cache_hits.load(Ordering::Relaxed),
            self.cache_miss.load(Ordering::Relaxed),
            self.cache_shortfall.load(Ordering::Relaxed),
            self.reloads.load(Ordering::Relaxed),
            u128::from(self.reload_ms_total.load(Ordering::Relaxed)),
        )
    }

    /// Returns `(prewarm_scheduled, prewarm_skipped_pressure)`.
    fn snapshot_prewarm(&self) -> (u64, u64) {
        (
            self.prewarm_scheduled.load(Ordering::Relaxed),
            self.prewarm_skipped_pressure.load(Ordering::Relaxed),
        )
    }

    /// Zeroes every counter. Compiled for tests only.
    #[cfg(test)]
    #[allow(dead_code)]
    fn reset(&self) {
        self.cache_hits.store(0, Ordering::Relaxed);
        self.cache_miss.store(0, Ordering::Relaxed);
        self.cache_shortfall.store(0, Ordering::Relaxed);
        self.reloads.store(0, Ordering::Relaxed);
        self.reload_ms_total.store(0, Ordering::Relaxed);
        self.prewarm_scheduled.store(0, Ordering::Relaxed);
        self.prewarm_skipped_pressure.store(0, Ordering::Relaxed);
    }
}
6988
/// Spawns the background `cass-warm-worker` thread and returns the job sender together
/// with the thread's join handle.
///
/// For every accepted [`WarmJob`] the worker reloads `reader`, bumps `reload_epoch`,
/// records the reload duration in `metrics`, and then runs a one-hit search built from
/// the job's query terms whose result is discarded.
///
/// Returns `None` only when the thread could not be spawned.
fn maybe_spawn_warm_worker(
    reader: IndexReader,
    fields: FsCassFields,
    reload_epoch: Arc<AtomicU64>,
    metrics: Metrics,
) -> Option<(mpsc::Sender<WarmJob>, std::thread::JoinHandle<()>)> {
    // Unbounded channel: sending a job never blocks on the worker.
    let (tx, rx) = mpsc::unbounded::<WarmJob>();
    let handle = std::thread::Builder::new()
        .name("cass-warm-worker".into())
        .spawn(move || {
            // The debounce clock starts when the thread starts, so jobs that arrive
            // within the first debounce window after spawn are dropped as well.
            let mut last_run = Instant::now();
            // `recv` fails once every sender is gone, which ends the loop and the thread.
            while let Ok(job) = rx.recv() {
                let now = Instant::now();
                // Too soon after the previously accepted job: drop this one.
                if now.duration_since(last_run) < Duration::from_millis(*WARM_DEBOUNCE_MS) {
                    continue;
                }
                // Updated before the reload, so a failed reload still restarts the window.
                last_run = now;
                let reload_started = Instant::now();
                if let Err(err) = reader.reload() {
                    tracing::warn!(error = ?err, "warm_worker_reload_failed");
                    continue;
                }
                let elapsed = reload_started.elapsed();
                // `fetch_add` returns the previous value; `+ 1` is the epoch after this reload.
                let epoch = reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
                metrics.record_reload(elapsed);
                tracing::debug!(
                    duration_ms = elapsed.as_millis() as u64,
                    reload_epoch = epoch,
                    filters = %job.filters_fingerprint,
                    shard = %job.shard_name,
                    "warm_worker_reload"
                );
                let searcher = reader.searcher();
                // One MUST clause per whitespace-separated term; each clause matches the
                // lowercased term in either the title or the content field.
                let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
                for term_str in job.query.split_whitespace() {
                    let term_lower = term_str.to_lowercase();
                    let term_shoulds: Vec<(Occur, Box<dyn Query>)> = vec![
                        (
                            Occur::Should,
                            Box::new(TermQuery::new(
                                Term::from_field_text(fields.title, &term_lower),
                                IndexRecordOption::WithFreqsAndPositions,
                            )),
                        ),
                        (
                            Occur::Should,
                            Box::new(TermQuery::new(
                                Term::from_field_text(fields.content, &term_lower),
                                IndexRecordOption::WithFreqsAndPositions,
                            )),
                        ),
                    ];
                    clauses.push((Occur::Must, Box::new(BooleanQuery::new(term_shoulds))));
                }
                if !clauses.is_empty() {
                    let q: Box<dyn Query> = Box::new(BooleanQuery::new(clauses));
                    // The result (and any error) is ignored; presumably the search only
                    // exists to touch index data for these terms — TODO confirm.
                    let _ = searcher.search(&q, &TopDocs::with_limit(1).order_by_score());
                }
            }
        })
        // A spawn failure is swallowed and reported to the caller as `None`.
        .ok()?;
    Some((tx, handle))
}
7055
7056fn cached_hit_from(hit: &SearchHit) -> CachedHit {
7057 let cache_text = if hit.content.is_empty() {
7058 hit.snippet.as_str()
7059 } else {
7060 hit.content.as_str()
7061 };
7062 let lc_content = cache_text.to_lowercase();
7063 let lc_title = (!hit.title.is_empty()).then(|| hit.title.to_lowercase());
7064 let bloom64 = bloom_from_text(&lc_content, &lc_title);
7066 CachedHit {
7067 hit: hit.clone(),
7068 lc_content,
7069 lc_title,
7070 bloom64,
7071 }
7072}
7073
7074fn bloom_from_text(content: &str, title: &Option<String>) -> u64 {
7075 let mut bits = 0u64;
7076 for token in token_stream(content) {
7077 bits |= hash_token(token);
7078 }
7079 if let Some(t) = title {
7080 for token in token_stream(t) {
7081 bits |= hash_token(token);
7082 }
7083 }
7084 bits
7085}
7086
/// Lazily splits `text` into maximal runs of alphanumeric characters.
///
/// Every non-alphanumeric character acts as a separator and empty pieces are skipped.
fn token_stream(text: &str) -> impl Iterator<Item = &str> {
    fn is_separator(ch: char) -> bool {
        !ch.is_alphanumeric()
    }

    text.split(is_separator).filter(|piece| !piece.is_empty())
}
7091
/// Maps a token to a single bit of a 64-bit bloom mask.
///
/// The bytes of `tok` are folded as `h = h * 33 + byte` (wrapping), starting from 5381;
/// the bit position is the result modulo 64.
fn hash_token(tok: &str) -> u64 {
    let digest = tok.bytes().fold(5381u64, |acc, byte| {
        acc.wrapping_mul(33).wrapping_add(u64::from(byte))
    });
    1u64 << (digest % 64)
}
7100
7101struct QueryTermsLower {
7111 query_lower: String,
7113 token_ranges: Vec<(usize, usize)>,
7115 bloom_mask: u64,
7117}
7118
7119impl QueryTermsLower {
7120 fn from_query(query: &str) -> Self {
7122 if query.is_empty() {
7123 return Self {
7124 query_lower: String::new(),
7125 token_ranges: Vec::new(),
7126 bloom_mask: 0,
7127 };
7128 }
7129
7130 let query_lower = query.to_lowercase();
7131 let mut token_ranges = Vec::new();
7132 let mut bloom_mask = 0u64;
7133
7134 let mut start = None;
7136 for (i, c) in query_lower.char_indices() {
7137 if c.is_alphanumeric() {
7138 if start.is_none() {
7139 start = Some(i);
7140 }
7141 } else if let Some(s) = start.take() {
7142 let token = &query_lower[s..i];
7143 bloom_mask |= hash_token(token);
7144 token_ranges.push((s, i));
7145 }
7146 }
7147 if let Some(s) = start {
7149 let token = &query_lower[s..];
7150 bloom_mask |= hash_token(token);
7151 token_ranges.push((s, query_lower.len()));
7152 }
7153
7154 Self {
7155 query_lower,
7156 token_ranges,
7157 bloom_mask,
7158 }
7159 }
7160
7161 #[inline]
7163 fn is_empty(&self) -> bool {
7164 self.token_ranges.is_empty()
7165 }
7166
7167 #[inline]
7169 fn tokens(&self) -> impl Iterator<Item = &str> {
7170 self.token_ranges
7171 .iter()
7172 .map(|(s, e)| &self.query_lower[*s..*e])
7173 }
7174
7175 #[inline]
7177 fn bloom_mask(&self) -> u64 {
7178 self.bloom_mask
7179 }
7180}
7181
7182fn hit_matches_query_cached_precomputed(hit: &CachedHit, terms: &QueryTermsLower) -> bool {
7185 if terms.is_empty() {
7186 return true;
7187 }
7188
7189 if hit.bloom64 & terms.bloom_mask() != terms.bloom_mask() {
7191 return false;
7192 }
7193
7194 terms.tokens().all(|t| {
7196 if token_stream(&hit.lc_content).any(|word| word.starts_with(t)) {
7198 return true;
7199 }
7200 if let Some(title) = &hit.lc_title
7202 && token_stream(title).any(|word| word.starts_with(t))
7203 {
7204 return true;
7205 }
7206 false
7207 })
7208}
7209
/// Test-only convenience wrapper: tokenizes `query` and delegates to
/// `hit_matches_query_cached_precomputed`.
#[cfg(test)]
fn hit_matches_query_cached(hit: &CachedHit, query: &str) -> bool {
    hit_matches_query_cached_precomputed(hit, &QueryTermsLower::from_query(query))
}
7217
/// Returns `true` when `query` is exactly one whitespace-delimited word made up solely
/// of alphanumeric characters.
fn is_prefix_only(query: &str) -> bool {
    let mut words = query.split_whitespace();
    match (words.next(), words.next()) {
        (Some(only), None) => only.chars().all(char::is_alphanumeric),
        _ => false,
    }
}
7227
/// Builds a short snippet of `content` with the first case-insensitive occurrence of
/// `query` wrapped in `**`.
///
/// * Empty `query`, or no occurrence: the first `max_chars` characters of `content`.
/// * Otherwise: the snippet starts up to 15 characters before the match and stops once
///   at least `max_chars` characters have been emitted. A match that has been reached
///   is always emitted whole, even if that exceeds `max_chars`.
///
/// In every case `…` is appended when `content` continues past the snippet.
fn quick_prefix_snippet(content: &str, query: &str, max_chars: usize) -> String {
    if query.is_empty() {
        return prefix_snippet_head(content, max_chars);
    }
    prefix_snippet_highlighted(content, query, max_chars)
        .unwrap_or_else(|| prefix_snippet_head(content, max_chars))
}

/// Like [`quick_prefix_snippet`], but returns `None` instead of an unhighlighted
/// fallback when `query` is blank or does not occur in `content`.
fn cached_prefix_snippet(content: &str, query: &str, max_chars: usize) -> Option<String> {
    if query.trim().is_empty() {
        return None;
    }
    prefix_snippet_highlighted(content, query, max_chars)
}

/// Returns the first `max_chars` characters of `content`, followed by `…` when more
/// text remains.
fn prefix_snippet_head(content: &str, max_chars: usize) -> String {
    let mut chars = content.chars();
    let mut head: String = chars.by_ref().take(max_chars).collect();
    if chars.next().is_some() {
        head.push('…');
    }
    head
}

/// Finds the first case-insensitive occurrence of `query` in `content` and returns it
/// as a half-open range of character indices into the *original* `content`.
///
/// The search runs on lowercased copies, but lowercasing can change how many
/// characters a string has (for example `İ` lowercases to two characters). The match
/// is therefore mapped back by walking the original characters and accumulating the
/// byte length of each one's lowercase form. This relies on `str::to_lowercase`
/// producing, per character, the same number of bytes as `char::to_lowercase`; the
/// only special case in `str::to_lowercase` (final sigma) swaps one two-byte letter
/// for another.
fn prefix_snippet_match_span(content: &str, query: &str) -> Option<(usize, usize)> {
    if query.is_empty() {
        return None;
    }
    let lc_content = content.to_lowercase();
    let lc_query = query.to_lowercase();
    let lc_start = lc_content.find(&lc_query)?;
    let lc_end = lc_start + lc_query.len();

    let mut lc_offset = 0usize;
    let mut span_start = 0usize;
    let mut char_count = 0usize;
    for (idx, ch) in content.chars().enumerate() {
        if lc_offset >= lc_end {
            // First original character that begins at or after the end of the match.
            return Some((span_start, idx));
        }
        if lc_offset <= lc_start {
            // Last original character that begins at or before the start of the match.
            span_start = idx;
        }
        lc_offset += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        char_count = idx + 1;
    }
    Some((span_start, char_count))
}

/// Shared implementation of the highlighted snippet; `None` when `query` does not
/// occur in `content`.
fn prefix_snippet_highlighted(content: &str, query: &str, max_chars: usize) -> Option<String> {
    /// Characters of context kept in front of the match.
    const LEAD_CONTEXT_CHARS: usize = 15;

    let (match_start, match_end) = prefix_snippet_match_span(content, query)?;
    let window_start = match_start.saturating_sub(LEAD_CONTEXT_CHARS);
    let mut remaining = content.chars().skip(window_start);
    let mut snippet = String::new();
    let mut chars_taken = 0usize;
    let mut current_idx = window_start;

    while chars_taken < max_chars {
        if current_idx == match_start {
            // The span lies inside `content` and is non-empty, so this always consumes
            // at least one character and the loop makes progress.
            snippet.push_str("**");
            for ch in remaining.by_ref().take(match_end - match_start) {
                snippet.push(ch);
                chars_taken += 1;
                current_idx += 1;
            }
            snippet.push_str("**");
            continue;
        }

        let Some(ch) = remaining.next() else {
            break;
        };
        snippet.push(ch);
        chars_taken += 1;
        current_idx += 1;
    }

    if remaining.next().is_some() {
        snippet.push('…');
    }
    Some(snippet)
}
7346
7347fn filters_fingerprint(filters: &SearchFilters) -> String {
7348 let mut parts = Vec::new();
7349 if !filters.agents.is_empty() {
7350 let mut v: Vec<_> = filters.agents.iter().cloned().collect();
7351 v.sort();
7352 parts.push(format!("a:{v:?}"));
7353 }
7354 if !filters.workspaces.is_empty() {
7355 let mut v: Vec<_> = filters.workspaces.iter().cloned().collect();
7356 v.sort();
7357 parts.push(format!("w:{v:?}"));
7358 }
7359 if let Some(f) = filters.created_from {
7360 parts.push(format!("from:{f}"));
7361 }
7362 if let Some(t) = filters.created_to {
7363 parts.push(format!("to:{t}"));
7364 }
7365 if !matches!(
7367 filters.source_filter,
7368 crate::sources::provenance::SourceFilter::All
7369 ) {
7370 parts.push(format!("src:{:?}", filters.source_filter));
7371 }
7372 if !filters.session_paths.is_empty() {
7374 let mut v: Vec<_> = filters.session_paths.iter().cloned().collect();
7375 v.sort();
7376 parts.push(format!("sp:{v:?}"));
7377 }
7378 parts.join("|")
7379}
7380
7381impl SearchClient {
7382 pub fn total_docs(&self) -> usize {
7384 if let Some((reader, _)) = &self.reader {
7385 return reader.searcher().num_docs() as usize;
7386 }
7387 self.federated_readers()
7388 .map(|readers| {
7389 readers
7390 .iter()
7391 .map(|shard| shard.reader.searcher().num_docs() as usize)
7392 .sum()
7393 })
7394 .unwrap_or(0)
7395 }
7396
7397 pub fn has_tantivy(&self) -> bool {
7399 self.reader.is_some() || self.federated_readers().is_some()
7400 }
7401
7402 fn maybe_reload_reader(&self, reader: &IndexReader) -> Result<()> {
7403 if !self.reload_on_search {
7404 return Ok(());
7405 }
7406 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
7407 let now = Instant::now();
7408 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
7409 if guard
7410 .map(|t| now.duration_since(t) >= MIN_RELOAD_INTERVAL)
7411 .unwrap_or(true)
7412 {
7413 let reload_started = Instant::now();
7414 reader.reload()?;
7415 let elapsed = reload_started.elapsed();
7416 *guard = Some(now);
7417 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
7418 self.metrics.record_reload(elapsed);
7419 tracing::debug!(
7420 duration_ms = elapsed.as_millis() as u64,
7421 reload_epoch = epoch,
7422 "tantivy_reader_reload"
7423 );
7424 }
7425 Ok(())
7426 }
7427
7428 fn maybe_log_cache_metrics(&self, event: &str) {
7429 if !*CACHE_DEBUG_ENABLED {
7430 return;
7431 }
7432 let stats = self.cache_stats();
7433 tracing::debug!(
7434 event = event,
7435 hits = stats.cache_hits,
7436 miss = stats.cache_miss,
7437 shortfall = stats.cache_shortfall,
7438 reloads = stats.reloads,
7439 reload_ms_total = stats.reload_ms_total,
7440 total_cap = stats.total_cap,
7441 total_cost = stats.total_cost,
7442 evictions = stats.eviction_count,
7443 approx_bytes = stats.approx_bytes,
7444 byte_cap = stats.byte_cap,
7445 eviction_policy = stats.eviction_policy,
7446 ghost_entries = stats.ghost_entries,
7447 admission_rejects = stats.admission_rejects,
7448 "cache_metrics"
7449 );
7450 }
7451
7452 fn cache_key(&self, query: &str, filters: &SearchFilters) -> Arc<str> {
7455 let key_str = format!(
7456 "{}|{}::{}",
7457 self.cache_namespace,
7458 query,
7459 filters_fingerprint(filters)
7460 );
7461 intern_cache_key(&key_str)
7462 }
7463
7464 fn shard_name(&self, filters: &SearchFilters) -> String {
7465 if filters.agents.len() == 1 {
7466 format!(
7467 "agent:{}",
7468 filters
7469 .agents
7470 .iter()
7471 .next()
7472 .cloned()
7473 .unwrap_or_else(|| "global".into())
7474 )
7475 } else if filters.workspaces.len() == 1 {
7476 format!(
7477 "workspace:{}",
7478 filters
7479 .workspaces
7480 .iter()
7481 .next()
7482 .cloned()
7483 .unwrap_or_else(|| "global".into())
7484 )
7485 } else {
7486 "global".into()
7487 }
7488 }
7489 fn cached_prefix_key_exists_in_shard(
7490 &self,
7491 shard: &LruCache<Arc<str>, Vec<CachedHit>>,
7492 query: &str,
7493 filters: &SearchFilters,
7494 ) -> bool {
7495 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
7496 byte_indices.push(query.len());
7497 let query_len = query.len();
7498 for &end in byte_indices.iter().rev() {
7499 if end == 0 || end == query_len {
7500 continue;
7501 }
7502 let key = self.cache_key(&query[..end], filters);
7503 if shard.contains(&key) {
7504 return true;
7505 }
7506 }
7507 false
7508 }
7509
7510 fn maybe_schedule_adaptive_query_prewarm(&self, query: &str, filters: &SearchFilters) {
7511 if query.is_empty() {
7512 return;
7513 }
7514 let Some(tx) = &self.warm_tx else {
7515 return;
7516 };
7517
7518 let shard_name = self.shard_name(filters);
7519 let decision = match self.prefix_cache.lock() {
7520 Ok(cache) => {
7521 let hot_prefix = cache.shard_opt(&shard_name).is_some_and(|shard| {
7522 self.cached_prefix_key_exists_in_shard(shard, query, filters)
7523 });
7524 if !hot_prefix {
7525 AdaptivePrewarmDecision::SkipCold
7526 } else if cache.prewarm_pressure() {
7527 AdaptivePrewarmDecision::SkipPressure
7528 } else {
7529 AdaptivePrewarmDecision::Schedule
7530 }
7531 }
7532 Err(_) => return,
7533 };
7534
7535 if decision == AdaptivePrewarmDecision::SkipPressure {
7536 self.metrics.inc_prewarm_skipped_pressure();
7537 return;
7538 }
7539 if decision == AdaptivePrewarmDecision::SkipCold {
7540 return;
7541 }
7542
7543 if tx
7544 .send(WarmJob {
7545 query: query.to_string(),
7546 filters_fingerprint: filters_fingerprint(filters),
7547 shard_name,
7548 })
7549 .is_ok()
7550 {
7551 self.metrics.inc_prewarm_scheduled();
7552 }
7553 }
7554
7555 fn cached_prefix_hits(&self, query: &str, filters: &SearchFilters) -> Option<Vec<CachedHit>> {
7556 if query.is_empty() {
7557 return None;
7558 }
7559 let cache = self.prefix_cache.lock().ok()?;
7560 let shard_name = self.shard_name(filters);
7561 let shard = cache.shard_opt(&shard_name)?;
7562 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
7564 byte_indices.push(query.len());
7565 for &end in byte_indices.iter().rev() {
7566 if end == 0 {
7567 continue;
7568 }
7569 let key = self.cache_key(&query[..end], filters);
7570 if let Some(hits) = shard.peek(&key) {
7572 return Some(hits.clone());
7573 }
7574 }
7575 None
7576 }
7577
7578 fn put_cache(&self, query: &str, filters: &SearchFilters, hits: &[SearchHit]) {
7579 if query.is_empty() || hits.is_empty() {
7580 return;
7581 }
7582 if let Ok(mut cache) = self.prefix_cache.lock() {
7583 let shard_name = self.shard_name(filters);
7584 let key = self.cache_key(query, filters);
7585 let cached_hits: Vec<CachedHit> = hits.iter().map(cached_hit_from).collect();
7586 cache.put(&shard_name, key, cached_hits);
7587 }
7588 }
7589
7590 pub fn cache_stats(&self) -> CacheStats {
7591 let (hits, miss, shortfall, reloads, reload_ms_total) = self.metrics.snapshot_all();
7592 let (prewarm_scheduled, prewarm_skipped_pressure) = self.metrics.snapshot_prewarm();
7593 let reader_generation = self.last_generation.lock().ok().and_then(|guard| *guard);
7594 let (
7595 total_cap,
7596 total_cost,
7597 eviction_count,
7598 approx_bytes,
7599 byte_cap,
7600 eviction_policy,
7601 ghost_entries,
7602 admission_rejects,
7603 ) = if let Ok(cache) = self.prefix_cache.lock() {
7604 (
7605 cache.total_cap(),
7606 cache.total_cost(),
7607 cache.eviction_count(),
7608 cache.total_bytes(),
7609 cache.byte_cap(),
7610 cache.policy_label(),
7611 cache.ghost_entries(),
7612 cache.admission_rejects(),
7613 )
7614 } else {
7615 (0, 0, 0, 0, 0, "unknown", 0, 0)
7616 };
7617 CacheStats {
7618 cache_hits: hits,
7619 cache_miss: miss,
7620 cache_shortfall: shortfall,
7621 reloads,
7622 reload_ms_total,
7623 total_cap,
7624 total_cost,
7625 eviction_count,
7626 approx_bytes,
7627 byte_cap,
7628 eviction_policy,
7629 ghost_entries,
7630 admission_rejects,
7631 prewarm_scheduled,
7632 prewarm_skipped_pressure,
7633 reader_generation,
7634 }
7635 }
7636}
7637
7638#[cfg(test)]
7639mod tests {
7640 use super::*;
7641 use crate::connectors::{NormalizedConversation, NormalizedMessage, NormalizedSnippet};
7642 use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
7643 use crate::search::tantivy::TantivyIndex;
7644 use crate::storage::sqlite::FrankenStorage;
7645 use frankensqlite::Connection as FrankenConnection;
7646 use frankensqlite::compat::{ParamValue, params_from_iter};
7647 use serde_json::json;
7648 use tempfile::TempDir;
7649
    /// Reference encoding of a [`SearchHitKey`] as a doc id, used as the oracle for
    /// `search_hit_key_doc_id`.
    ///
    /// The seven key fields are written in declaration order below, separated by the
    /// unit separator `U+001F`; absent optional fields contribute an empty string.
    /// Kept deliberately simple so it can serve as a byte-exact baseline.
    fn search_hit_key_doc_id_reference_v0(key: &SearchHitKey) -> String {
        let sep = '\u{1f}';
        format!(
            "{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}",
            key.source_id,
            key.source_path,
            key.conversation_id
                .map(|v| v.to_string())
                .unwrap_or_default(),
            key.title,
            key.line_number.map(|v| v.to_string()).unwrap_or_default(),
            key.created_at.map(|v| v.to_string()).unwrap_or_default(),
            key.content_hash,
        )
    }
7668
    /// Reference implementation of the stable hit hash, used as the oracle for
    /// `stable_hit_hash`.
    ///
    /// Feeds an XXH3 hasher with, in order: the little-endian bytes of
    /// `stable_content_hash(content)` (skipped for empty content), `|`, the source
    /// path, `|`, the decimal line number if present, `|`, and the decimal timestamp
    /// if present.
    fn stable_hit_hash_reference_v0(
        content: &str,
        source_path: &str,
        line_number: Option<usize>,
        created_at: Option<i64>,
    ) -> u64 {
        use xxhash_rust::xxh3::Xxh3;

        let mut hasher = Xxh3::new();
        // Empty content contributes nothing before the first separator.
        if !content.is_empty() {
            hasher.update(&stable_content_hash(content).to_le_bytes());
        }
        hasher.update(b"|");
        hasher.update(source_path.as_bytes());
        hasher.update(b"|");
        if let Some(line) = line_number {
            hasher.update(line.to_string().as_bytes());
        }
        hasher.update(b"|");
        if let Some(ts) = created_at {
            hasher.update(ts.to_string().as_bytes());
        }
        hasher.digest()
    }
7693
7694 #[test]
7695 fn stable_hit_hash_matches_reference_and_is_deterministic() {
7696 let fixtures = [
7697 ("", "", None, None),
7698 (
7699 "same content\nnormalized",
7700 "/tmp/session.jsonl",
7701 Some(1),
7702 Some(0),
7703 ),
7704 (
7705 "tool output with repeated whitespace",
7706 "/tmp/path with spaces.jsonl",
7707 Some(42),
7708 Some(1_700_000_000_000),
7709 ),
7710 (
7711 "unicode stays in the content hash path: café",
7712 "/remote/host/session.jsonl",
7713 Some(usize::MAX),
7714 Some(i64::MIN),
7715 ),
7716 (
7717 "negative timestamp fixture",
7718 "/tmp/negative.jsonl",
7719 None,
7720 Some(-123_456),
7721 ),
7722 ];
7723
7724 for (content, source_path, line_number, created_at) in fixtures {
7725 let optimized = stable_hit_hash(content, source_path, line_number, created_at);
7726 let repeated = stable_hit_hash(content, source_path, line_number, created_at);
7727 let reference =
7728 stable_hit_hash_reference_v0(content, source_path, line_number, created_at);
7729
7730 assert_eq!(optimized, repeated);
7731 assert_eq!(optimized, reference);
7732 }
7733 }
7734
7735 #[test]
7736 fn semantic_message_id_from_db_rejects_negative_values() {
7737 let err = semantic_message_id_from_db(-1).expect_err("negative DB ids must be rejected");
7738 assert!(
7739 err.to_string().contains("negative message_id"),
7740 "unexpected error: {err}"
7741 );
7742 assert_eq!(semantic_message_id_from_db(42).expect("positive id"), 42);
7743 }
7744
7745 #[test]
7746 fn semantic_doc_component_id_from_db_clamps_bounds() {
7747 assert_eq!(semantic_doc_component_id_from_db(None), 0);
7748 assert_eq!(semantic_doc_component_id_from_db(Some(-7)), 0);
7749 assert_eq!(semantic_doc_component_id_from_db(Some(0)), 0);
7750 assert_eq!(semantic_doc_component_id_from_db(Some(7)), 7);
7751 assert_eq!(
7752 semantic_doc_component_id_from_db(Some(i64::from(u32::MAX) + 123)),
7753 u32::MAX
7754 );
7755 }
7756
    /// `search_hit_key_doc_id` must produce exactly the same string as the reference
    /// encoder for every fixture, and a key without embedded separators must encode to
    /// exactly six `U+001F` separators (seven fields).
    #[test]
    fn search_hit_key_doc_id_matches_reference_byte_for_byte() {
        let fixtures = [
            // Fully populated, ordinary values.
            SearchHitKey {
                source_id: "local".into(),
                source_path: "/tmp/path.jsonl".into(),
                conversation_id: Some(42),
                title: "Demo chat".into(),
                line_number: Some(7),
                created_at: Some(1_700_000_000_000),
                content_hash: 0xdead_beef_u64,
            },
            // Every optional field absent, empty title, zero hash.
            SearchHitKey {
                source_id: "ssh:host".into(),
                source_path: "/remote/path with spaces.jsonl".into(),
                conversation_id: None,
                title: String::new(),
                line_number: None,
                created_at: None,
                content_hash: 0,
            },
            // Empty strings, extreme numeric values, non-ASCII title.
            SearchHitKey {
                source_id: String::new(),
                source_path: String::new(),
                conversation_id: Some(i64::MIN),
                title: "unicode title — héllo".into(),
                line_number: Some(usize::MAX),
                created_at: Some(i64::MAX),
                content_hash: u64::MAX,
            },
            // Minimal single-character fields and zeros.
            SearchHitKey {
                source_id: "a".into(),
                source_path: "b".into(),
                conversation_id: Some(0),
                title: "c".into(),
                line_number: Some(0),
                created_at: Some(0),
                content_hash: 0,
            },
            // Field values that themselves contain the separator character.
            SearchHitKey {
                source_id: "with\u{1f}separator".into(),
                source_path: "with\u{1f}separator".into(),
                conversation_id: Some(-1),
                title: "with\u{1f}separator".into(),
                line_number: None,
                created_at: Some(-1),
                content_hash: 1,
            },
        ];
        for (idx, key) in fixtures.iter().enumerate() {
            let optimized = search_hit_key_doc_id(key);
            let reference = search_hit_key_doc_id_reference_v0(key);
            assert_eq!(
                optimized, reference,
                "fixture {idx} produced divergent doc_id; byte-exact dedup key is a contract"
            );
        }

        // Structural check on a key whose fields contain no separator of their own.
        let structural_key = SearchHitKey {
            source_id: "clean".into(),
            source_path: "/no/separators/here.jsonl".into(),
            conversation_id: Some(1),
            title: "plain title".into(),
            line_number: Some(2),
            created_at: Some(3),
            content_hash: 4,
        };
        let encoded = search_hit_key_doc_id(&structural_key);
        assert_eq!(
            encoded.matches('\u{1f}').count(),
            6,
            "structural fixture must contain exactly six 0x1F separators; got {encoded:?}"
        );
    }
7835
7836 #[derive(Debug)]
7837 struct FixedTestEmbedder {
7838 id: String,
7839 vector: Vec<f32>,
7840 }
7841
7842 impl FixedTestEmbedder {
7843 fn new(id: &str, vector: &[f32]) -> Self {
7844 Self {
7845 id: id.to_string(),
7846 vector: vector.to_vec(),
7847 }
7848 }
7849 }
7850
7851 #[derive(Debug)]
7852 struct BlockingTestEmbedder {
7853 id: String,
7854 vector: Vec<f32>,
7855 started_tx: Mutex<Option<std::sync::mpsc::Sender<()>>>,
7856 unblock_rx: Mutex<std::sync::mpsc::Receiver<()>>,
7857 }
7858
7859 impl BlockingTestEmbedder {
7860 fn new(
7861 id: &str,
7862 vector: &[f32],
7863 started_tx: std::sync::mpsc::Sender<()>,
7864 unblock_rx: std::sync::mpsc::Receiver<()>,
7865 ) -> Self {
7866 Self {
7867 id: id.to_string(),
7868 vector: vector.to_vec(),
7869 started_tx: Mutex::new(Some(started_tx)),
7870 unblock_rx: Mutex::new(unblock_rx),
7871 }
7872 }
7873 }
7874
    /// Embedder implementation whose `embed_sync` announces that it has started and
    /// then blocks until the test releases it.
    impl crate::search::embedder::Embedder for BlockingTestEmbedder {
        /// Signals "started" (first call only), waits for the unblock signal, and then
        /// returns a copy of the fixed vector.
        ///
        /// Panics if the receiver lock is poisoned or the unblock sender has been
        /// dropped without sending.
        fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
            // The sender is taken out of the `Option`, so only the first call signals;
            // a failed send (receiver gone) is ignored.
            if let Ok(mut guard) = self.started_tx.lock()
                && let Some(tx) = guard.take()
            {
                let _ = tx.send(());
            }
            // Block here until the test sends the unblock signal.
            self.unblock_rx
                .lock()
                .expect("blocking embedder receiver")
                .recv()
                .expect("blocking embedder unblock signal");
            Ok(self.vector.clone())
        }

        /// Length of the fixed vector.
        fn dimension(&self) -> usize {
            self.vector.len()
        }

        /// The id supplied at construction.
        fn id(&self) -> &str {
            &self.id
        }

        /// Always `false` for this test double.
        fn is_semantic(&self) -> bool {
            false
        }

        /// Reports itself in the hash-embedder category.
        fn category(&self) -> frankensearch::ModelCategory {
            frankensearch::ModelCategory::HashEmbedder
        }
    }
7906
7907 impl crate::search::embedder::Embedder for FixedTestEmbedder {
7908 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
7909 Ok(self.vector.clone())
7910 }
7911
7912 fn dimension(&self) -> usize {
7913 self.vector.len()
7914 }
7915
7916 fn id(&self) -> &str {
7917 &self.id
7918 }
7919
7920 fn is_semantic(&self) -> bool {
7921 false
7922 }
7923
7924 fn category(&self) -> frankensearch::ModelCategory {
7925 frankensearch::ModelCategory::HashEmbedder
7926 }
7927 }
7928
    /// Bundle returned by the semantic test-fixture builder.
    struct SemanticTestFixture {
        /// Temporary directory owned by the fixture; presumably held only so the
        /// on-disk index outlives `client` — confirm against the builder.
        _dir: TempDir,
        /// Search client under test.
        client: SearchClient,
        /// Document ids associated with the fixture (populated by the builder, which
        /// is outside this view).
        doc_ids: Vec<String>,
        /// Source paths associated with the fixture (populated by the builder, which
        /// is outside this view).
        source_paths: Vec<String>,
    }
7935
    /// Bundle used by the progressive hybrid search tests.
    struct ProgressiveHybridFixture {
        /// Temporary directory owned by the fixture; presumably held only so the
        /// on-disk index outlives `client` — confirm against the builder.
        _dir: TempDir,
        /// Shared handle to the search client under test.
        client: Arc<SearchClient>,
        /// Query string that goes with this fixture.
        query: String,
    }
7941
7942 fn projected_minimal_fields_search_hit(title: &str, source_path: &str) -> SearchHit {
7948 SearchHit {
7949 title: title.to_string(),
7950 snippet: String::new(),
7951 content: String::new(),
7952 content_hash: 0,
7953 conversation_id: Some(42),
7954 score: 1.0,
7955 source_path: source_path.to_string(),
7956 agent: "test-agent".into(),
7957 workspace: "/tmp/workspace".into(),
7958 workspace_original: None,
7959 created_at: Some(1_700_000_000_000),
7960 line_number: Some(1),
7961 match_type: MatchType::default(),
7962 source_id: "local".into(),
7963 origin_kind: "local".into(),
7964 origin_host: None,
7965 }
7966 }
7967
7968 #[test]
7978 fn hit_is_noise_returns_false_for_projected_minimal_fields_hit() {
7979 let hit = projected_minimal_fields_search_hit(
7980 "Demo conversation about authentication",
7981 "/tmp/sessions/demo-auth.jsonl",
7982 );
7983 assert_eq!(hit.content, "");
7984 assert_eq!(hit.snippet, "");
7985 assert!(
7986 !hit_is_noise(&hit, "authentication"),
7987 "projected --fields minimal hit must NOT be classified as noise; \
7988 doing so silently drops every real match (bead bd-q6xf9)"
7989 );
7990 }
7991
    /// Runs `hit_is_noise` against a hit whose content is a tool-call payload.
    ///
    /// NOTE(review): despite the test name, the classification result is computed and
    /// then discarded; the only assertion is the precondition that `content` is
    /// non-empty. Presumably this is a deliberate smoke test (the calls must not
    /// panic) rather than a check of the verdict — confirm, or assert the verdict.
    #[test]
    fn hit_is_noise_still_suppresses_real_tool_invocation_noise_when_content_present() {
        let mut hit =
            projected_minimal_fields_search_hit("Tool ping", "/tmp/sessions/tool-ping.jsonl");
        hit.content =
            "[tool_call]: {\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}".into();
        // `||` short-circuits: the second query is only evaluated when the first
        // returns `false`.
        let classified_as_noise_on_real_content =
            hit_is_noise(&hit, "ls") || hit_is_noise(&hit, "bash");
        // Result intentionally unused; see the review note above.
        let _ = classified_as_noise_on_real_content;
        assert!(!hit.content.is_empty(), "precondition: content populated");
    }
8017
8018 #[test]
8025 fn hit_is_noise_uses_snippet_when_content_empty_but_snippet_populated() {
8026 let mut hit = projected_minimal_fields_search_hit(
8027 "Real authentication hit",
8028 "/tmp/sessions/real-auth.jsonl",
8029 );
8030 hit.content = String::new();
8031 hit.snippet = "The user asked about authentication flow options.".into();
8032 assert!(
8035 !hit_is_noise(&hit, "authentication"),
8036 "snippet-only hits with real content must survive the noise filter"
8037 );
8038 }
8039
8040 #[test]
8041 fn search_client_is_send_sync_without_phantom_filters() {
8042 fn assert_send_sync<T: Send + Sync>() {}
8043 assert_send_sync::<SearchClient>();
8044 }
8045
    /// While a semantic search is blocked inside the embedder, another thread must
    /// still be able to clear the semantic context, i.e. the semantic lock is not held
    /// across the embed call.
    #[test]
    fn semantic_embedding_releases_semantic_lock_while_embedding() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Swap in the blocking embedder and a fresh query cache; the guard is dropped
        // at the end of this block.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        // Thread 1: semantic search that will park inside `embed_sync`.
        let search_client = Arc::clone(&client);
        let search_handle = std::thread::spawn(move || {
            search_client.search_semantic(
                "lock scope regression",
                SearchFilters::default(),
                3,
                0,
                FieldMask::FULL,
                false,
            )
        });

        // Wait until the embedder reports that it has been entered.
        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        // Thread 2: clear the semantic context while thread 1 is still blocked.
        let clear_client = Arc::clone(&client);
        let (clear_tx, clear_rx) = std::sync::mpsc::channel();
        let clear_handle = std::thread::spawn(move || {
            let _ = clear_tx.send(clear_client.clear_semantic_context());
        });

        // The clear must complete within the timeout even though the embedder has not
        // been released yet.
        clear_rx
            .recv_timeout(Duration::from_millis(500))
            .expect("semantic lock should not stay held during embed")?;

        // Release the embedder and collect both threads.
        unblock_tx.send(()).expect("unblock embedder");
        clear_handle.join().expect("clear thread join");
        let search_result = search_handle.join().expect("search thread join");
        assert!(
            search_result.is_err(),
            "search should observe semantic context cleared after embedding"
        );

        Ok(())
    }
8109
    /// If the semantic context is replaced (same embedder id, new context token) while
    /// an embedding is in flight, the caller must receive the embedding of the *new*
    /// context, not the stale one.
    #[test]
    fn semantic_embedding_ignores_stale_same_id_context_after_swap() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Install the blocking embedder, which would return `[1.0, 0.0]`.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        // Start the embedding on another thread; it parks inside `embed_sync`.
        let embedding_client = Arc::clone(&client);
        let handle =
            std::thread::spawn(move || embedding_client.semantic_query_embedding("context-swap"));

        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        // While the first embed is blocked, swap in a new context token and an
        // embedder with the same id that returns `[0.0, 1.0]`.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.context_token = Arc::new(());
            state.embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[0.0, 1.0]));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        // Let the original (now stale) embed call finish.
        unblock_tx.send(()).expect("unblock embedder");

        let embedding = handle.join().expect("embedding thread join")?.vector;
        assert_eq!(
            embedding,
            vec![0.0, 1.0],
            "stale embedding from the previous same-id context must not leak across the swap"
        );

        Ok(())
    }
8172
8173 #[test]
8174 fn quality_mode_does_not_reuse_fast_only_two_tier_cache() -> Result<()> {
8175 let dir = TempDir::new()?;
8176 let mut index = TantivyIndex::open_or_create(dir.path())?;
8177 index.commit()?;
8178
8179 let client = SearchClient::open(dir.path(), None)?.expect("index present");
8180 let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
8181 let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
8182 let writer = VectorIndex::create_with_revision(
8183 &fast_path,
8184 embedder.id(),
8185 "rev-fast-only",
8186 embedder.dimension(),
8187 frankensearch::index::Quantization::F16,
8188 )?;
8189 writer.finish()?;
8190
8191 client.set_semantic_context(
8192 embedder,
8193 VectorIndex::open(&fast_path)?,
8194 SemanticFilterMaps::for_tests(
8195 HashMap::new(),
8196 HashMap::new(),
8197 HashMap::new(),
8198 HashSet::new(),
8199 ),
8200 None,
8201 Some(fast_path),
8202 )?;
8203
8204 let fast_only_index = client
8205 .in_memory_two_tier_index(SemanticTierMode::FastOnly)?
8206 .expect("fast-only index should load");
8207 assert!(
8208 !fast_only_index.has_quality_index(),
8209 "fixture should only provide the fast tier"
8210 );
8211
8212 let quality_index = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
8213 assert!(
8214 quality_index.is_none(),
8215 "quality mode must not reuse a cached fast-only two-tier index"
8216 );
8217
8218 Ok(())
8219 }
8220
/// Regression test: a `QualityOnly` lookup that yields nothing must not stop a later
/// `FastOnly` lookup from loading the fast tier.
///
/// The fixture registers a single, empty fast-tier vector index and no quality tier.
#[test]
fn failed_quality_probe_does_not_block_fast_only_two_tier_load() -> Result<()> {
    let tmp = TempDir::new()?;
    let root = tmp.path();

    // Commit an empty lexical index so the client below has something to open.
    let mut lexical = TantivyIndex::open_or_create(root)?;
    lexical.commit()?;
    let client = SearchClient::open(root, None)?.expect("index present");

    // Write an empty fast-tier vector index whose file name embeds the embedder id.
    let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let fast_path = root.join(format!("index-{}.fsvi", embedder.id()));
    let fast_writer = VectorIndex::create_with_revision(
        &fast_path,
        embedder.id(),
        "rev-fast-only",
        embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    fast_writer.finish()?;

    // Register the fast tier only, with empty filter maps.
    let fast_index = VectorIndex::open(&fast_path)?;
    let empty_filters = SemanticFilterMaps::for_tests(
        HashMap::new(),
        HashMap::new(),
        HashMap::new(),
        HashSet::new(),
    );
    client.set_semantic_context(embedder, fast_index, empty_filters, None, Some(fast_path))?;

    // Probe the quality tier first; the fixture has none, so this must come back empty.
    let quality_probe = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
    assert!(
        quality_probe.is_none(),
        "quality-only lookup should fail for a fast-only fixture"
    );

    // The earlier miss must not be remembered in a way that blocks the fast-only load.
    let fast_only_index = client
        .in_memory_two_tier_index(SemanticTierMode::FastOnly)?
        .expect("a failed quality-only probe must not poison fast-only loads");
    let has_quality_tier = fast_only_index.has_quality_index();
    assert!(
        !has_quality_tier,
        "fixture should still resolve to the fast-only tier"
    );

    Ok(())
}
8269
/// Regression test: when loading the progressive context fails, a second attempt must
/// surface the same error again instead of a memoized "nothing available" result.
///
/// The index directory contains deliberately invalid `vector.fast.idx` and
/// `vector.quality.idx` files so that the load fails.
#[test]
fn progressive_context_error_does_not_poison_future_attempts() -> Result<()> {
    let tmp = TempDir::new()?;
    let root = tmp.path();

    // Commit an empty lexical index so the client below has something to open.
    let mut lexical = TantivyIndex::open_or_create(root)?;
    lexical.commit()?;
    let client = SearchClient::open(root, None)?.expect("index present");

    // A valid (empty) fast-tier index for the regular semantic context.
    let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let fast_path = root.join(format!("index-{}.fsvi", embedder.id()));
    let fast_writer = VectorIndex::create_with_revision(
        &fast_path,
        embedder.id(),
        "rev-progressive-error",
        embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    fast_writer.finish()?;

    // Corrupt progressive tier files: these are what the load is expected to choke on.
    for file_name in ["vector.fast.idx", "vector.quality.idx"] {
        std::fs::write(root.join(file_name), b"not-a-valid-index")?;
    }

    let fast_index = VectorIndex::open(&fast_path)?;
    let empty_filters = SemanticFilterMaps::for_tests(
        HashMap::new(),
        HashMap::new(),
        HashMap::new(),
        HashSet::new(),
    );
    client.set_semantic_context(embedder, fast_index, empty_filters, None, Some(fast_path))?;

    // First attempt: must fail with the fast-tier open error.
    let first_err = client
        .progressive_context()
        .err()
        .expect("invalid progressive index files should fail to load");
    let first_text = first_err.to_string();
    assert!(
        first_text.contains("open fast-tier index failed"),
        "unexpected first progressive-context error: {first_err}"
    );

    // Second attempt: must fail the same way rather than silently succeeding.
    let second_err = client
        .progressive_context()
        .err()
        .expect("a failed progressive load must not be memoized as None");
    let second_text = second_err.to_string();
    assert!(
        second_text.contains("open fast-tier index failed"),
        "unexpected second progressive-context error: {second_err}"
    );

    Ok(())
}
8327
/// Builds the semantic-search test fixture in its unsharded form.
///
/// Delegates to `build_semantic_test_fixture_with_shards(false)`.
fn build_semantic_test_fixture() -> Result<SemanticTestFixture> {
    build_semantic_test_fixture_with_shards(false)
}
8331
/// Builds the semantic-search test fixture in its sharded form.
///
/// Delegates to `build_semantic_test_fixture_with_shards(true)`.
fn build_sharded_semantic_test_fixture() -> Result<SemanticTestFixture> {
    build_semantic_test_fixture_with_shards(true)
}
8335
/// Builds a database-backed semantic-search fixture containing three single-message
/// conversations and their 2-dimensional vectors.
///
/// When `sharded` is `false`, all vectors go into one index file
/// (`index-test-fixed-2d.fsvi`). When it is `true`, the records are split into chunks of
/// at most two and each chunk is written to its own `shard-{n}.fsvi` file.
///
/// Returns the fixture holding the temp directory, the configured client, the generated
/// document ids, and the conversation source paths.
///
/// # Errors
///
/// Propagates any failure from the filesystem, storage, vector index, or client setup.
fn build_semantic_test_fixture_with_shards(sharded: bool) -> Result<SemanticTestFixture> {
    let dir = TempDir::new()?;
    let db_path = dir.path().join("cass.db");
    let storage = FrankenStorage::open(&db_path)?;

    // Register the agent and workspace that every fixture conversation belongs to.
    let agent = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent)?;
    let workspace_path = dir.path().join("workspace");
    std::fs::create_dir_all(&workspace_path)?;
    let workspace_id = storage.ensure_workspace(&workspace_path, None)?;

    // (source file name, message content, vector stored for that message)
    let documents = [
        ("session-a.jsonl", "top semantic match", [1.0_f32, 0.0_f32]),
        (
            "session-b.jsonl",
            "middle semantic match",
            [0.9_f32, 0.1_f32],
        ),
        ("session-c.jsonl", "late semantic match", [0.8_f32, 0.2_f32]),
    ];
    let base_ts = 1_700_000_000_000_i64;
    let mut doc_ids = Vec::with_capacity(documents.len());
    let mut source_paths = Vec::with_capacity(documents.len());

    // Insert one conversation per document; timestamps are offset by the document index.
    for (idx, (name, content, _vector)) in documents.iter().enumerate() {
        let source_path = dir.path().join(name);
        source_paths.push(source_path.to_string_lossy().to_string());

        let conversation = Conversation {
            id: None,
            agent_slug: agent.slug.clone(),
            workspace: Some(workspace_path.clone()),
            external_id: Some(format!("semantic-{idx}")),
            title: Some(format!("semantic session {idx}")),
            source_path,
            started_at: Some(base_ts + idx as i64),
            ended_at: Some(base_ts + idx as i64),
            approx_tokens: Some(16),
            metadata_json: json!({"fixture": "semantic_search"}),
            messages: vec![Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(base_ts + idx as i64),
                content: (*content).to_string(),
                extra_json: json!({}),
                snippets: Vec::new(),
            }],
            source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
            origin_host: None,
        };

        storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    }

    // Read back the stored message ids and timestamps, ordered by conversation id.
    let message_rows: Vec<(u64, i64)> = storage.raw().query_map_collect(
        "SELECT m.id, COALESCE(m.created_at, c.started_at, 0)
         FROM messages m
         JOIN conversations c ON m.conversation_id = c.id
         ORDER BY c.id",
        &[],
        |row: &frankensqlite::Row| {
            let message_id: i64 = row.get_typed(0)?;
            let created_at: i64 = row.get_typed(1)?;
            // A negative id falls back to `u64::MAX` rather than failing the query.
            Ok((u64::try_from(message_id).unwrap_or(u64::MAX), created_at))
        },
    )?;
    assert_eq!(
        message_rows.len(),
        documents.len(),
        "fixture should create 3 messages"
    );

    let filter_maps = SemanticFilterMaps::from_storage(&storage)?;
    let embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[1.0, 0.0]));
    let source_hash = crc32fast::hash(crate::sources::provenance::LOCAL_SOURCE_ID.as_bytes());
    let vector_dir = dir.path().join("vector_index");
    std::fs::create_dir_all(&vector_dir)?;
    let mut vector_records = Vec::with_capacity(documents.len());

    // Pair each stored message with its document vector.
    // NOTE(review): the zip assumes conversation ids ascend in insertion order — confirm.
    for ((message_id, created_at_ms), (_, _, vector)) in message_rows.iter().zip(documents) {
        let doc_id = SemanticDocId {
            message_id: *message_id,
            chunk_idx: 0,
            agent_id: u32::try_from(agent_id)?,
            workspace_id: u32::try_from(workspace_id)?,
            source_id: source_hash,
            role: ROLE_USER,
            created_at_ms: *created_at_ms,
            content_hash: None,
        }
        .to_doc_id_string();
        doc_ids.push(doc_id.clone());
        vector_records.push((doc_id, vector));
    }

    let mut vector_indexes = Vec::new();
    if sharded {
        // Three records in chunks of two yield one shard of two and one shard of one.
        for (shard_index, chunk) in vector_records.chunks(2).enumerate() {
            let vector_path = vector_dir.join(format!("shard-{shard_index}.fsvi"));
            let mut writer = VectorIndex::create_with_revision(
                &vector_path,
                embedder.id(),
                "rev-1",
                embedder.dimension(),
                frankensearch::index::Quantization::F16,
            )?;
            for (doc_id, vector) in chunk {
                writer.write_record(doc_id, vector)?;
            }
            writer.finish()?;
            vector_indexes.push(VectorIndex::open(&vector_path)?);
        }
    } else {
        // Single index file holding every record.
        let vector_path = vector_dir.join("index-test-fixed-2d.fsvi");
        let mut writer = VectorIndex::create_with_revision(
            &vector_path,
            embedder.id(),
            "rev-1",
            embedder.dimension(),
            frankensearch::index::Quantization::F16,
        )?;
        for (doc_id, vector) in &vector_records {
            writer.write_record(doc_id, vector)?;
        }
        writer.finish()?;
        vector_indexes.push(VectorIndex::open(&vector_path)?);
    }
    // Release the fixture's storage handle before the client opens the same database.
    // NOTE(review): presumably required to avoid two live handles — confirm.
    drop(storage);

    let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
    client.set_semantic_indexes_context(embedder, vector_indexes, filter_maps, None, None)?;

    Ok(SemanticTestFixture {
        _dir: dir,
        client,
        doc_ids,
        source_paths,
    })
}
8483
/// Builds the fixture used for progressive hybrid search: 64 single-message
/// conversations indexed lexically, mirrored into an in-memory SQLite database, and
/// embedded into a fast and a quality vector index.
///
/// Every fourth document (`idx % 4 == 0`) contains the fixture query text verbatim; the
/// other three content variants do not.
///
/// Returns the fixture holding the temp directory, the shared client, and the query.
///
/// # Errors
///
/// Propagates any failure from the filesystem, SQLite, lexical index, embedders, or
/// vector index writers.
fn build_progressive_hybrid_fixture() -> Result<ProgressiveHybridFixture> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;
    let workspace_path = dir.path().join("workspace");
    std::fs::create_dir_all(&workspace_path)?;
    let agent_id = 1_i64;
    let workspace_id = 1_i64;
    let source_id = crate::sources::provenance::LOCAL_SOURCE_ID;
    let source_hash = crc32fast::hash(source_id.as_bytes());
    // Minimal hand-written schema in an in-memory database; only the tables and columns
    // created here are available to the client built at the end of this function.
    let conn = Connection::open(":memory:")?;
    conn.execute_batch(
        r#"
        CREATE TABLE agents (
            id INTEGER PRIMARY KEY,
            slug TEXT NOT NULL
        );
        CREATE TABLE workspaces (
            id INTEGER PRIMARY KEY,
            path TEXT NOT NULL
        );
        CREATE TABLE sources (
            id TEXT PRIMARY KEY,
            kind TEXT NOT NULL
        );
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER NOT NULL,
            workspace_id INTEGER,
            title TEXT,
            source_path TEXT NOT NULL,
            source_id TEXT NOT NULL,
            origin_host TEXT,
            started_at INTEGER
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER NOT NULL,
            role TEXT NOT NULL,
            created_at INTEGER,
            content TEXT NOT NULL
        );
        "#,
    )?;
    conn.execute_compat(
        "INSERT INTO agents (id, slug) VALUES (?1, ?2)",
        params![agent_id, "codex"],
    )?;
    conn.execute_compat(
        "INSERT INTO workspaces (id, path) VALUES (?1, ?2)",
        params![workspace_id, workspace_path.to_string_lossy().to_string()],
    )?;
    conn.execute_compat(
        "INSERT INTO sources (id, kind) VALUES (?1, ?2)",
        params![source_id, "local"],
    )?;

    let query = "oauth refresh token middleware session cache".to_string();
    // Padding text repeated 48 times per document to make each message long.
    let filler = " context window ranking provenance semantic upgrade lexical overlay";
    let base_ts = 1_700_000_100_000_i64;
    let doc_count = 64usize;
    let mut message_rows = Vec::with_capacity(doc_count);

    for idx in 0..doc_count {
        // Conversation and message ids are both `idx + 1`.
        let conversation_id = i64::try_from(idx + 1)?;
        let message_id = u64::try_from(idx + 1)?;
        let source_path = dir.path().join(format!("progressive-{idx:03}.jsonl"));
        let repeated = filler.repeat(48);
        // Four content variants cycle by index; only the first embeds the query text.
        let content = if idx % 4 == 0 {
            format!(
                "{query} hot path candidate {idx} with detailed search diagnostics.{repeated}"
            )
        } else if idx % 4 == 1 {
            format!(
                "search pipeline benchmark {idx} with lexical overlay and semantic ranking.{repeated}"
            )
        } else if idx % 4 == 2 {
            format!(
                "interactive typing debounce benchmark {idx} for hybrid two tier search.{repeated}"
            )
        } else {
            format!(
                "unrelated background chatter {idx} about build systems and formatting checks.{repeated}"
            )
        };
        let created_at = base_ts + idx as i64;
        let source_path_str = source_path.to_string_lossy().to_string();
        let title = format!("progressive fixture {idx}");

        // Mirror the conversation and its single message into SQLite.
        conn.execute_compat(
            "INSERT INTO conversations (
                id, agent_id, workspace_id, title, source_path, source_id, origin_host, started_at
             ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, NULL, ?7)",
            params![
                conversation_id,
                agent_id,
                workspace_id,
                title,
                source_path_str.clone(),
                source_id,
                created_at
            ],
        )?;
        conn.execute_compat(
            "INSERT INTO messages (
                id, conversation_id, idx, role, created_at, content
             ) VALUES (?1, ?2, 0, 'user', ?3, ?4)",
            params![
                i64::try_from(message_id)?,
                conversation_id,
                created_at,
                content.clone()
            ],
        )?;
        message_rows.push((message_id, created_at, content.clone()));

        // Add the same conversation to the lexical index.
        let normalized = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: Some(format!("progressive-{idx}")),
            title: Some(format!("progressive fixture {idx}")),
            workspace: Some(workspace_path.clone()),
            source_path,
            started_at: Some(created_at),
            ended_at: Some(created_at),
            metadata: json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: Some("user".into()),
                created_at: Some(created_at),
                content,
                extra: json!({}),
                snippets: Vec::new(),
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&normalized)?;
    }
    index.commit()?;

    assert_eq!(
        message_rows.len(),
        doc_count,
        "fixture should create the requested number of messages"
    );

    // Two hash embedders constructed with different sizes, one per tier.
    // NOTE(review): the numeric argument is presumably the embedding dimension — confirm.
    let fast_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let quality_embedder = crate::search::hash_embedder::HashEmbedder::new(384);
    let filter_maps = SemanticFilterMaps::for_tests(
        HashMap::from([("codex".to_string(), u32::try_from(agent_id)?)]),
        HashMap::from([(
            workspace_path.to_string_lossy().to_string(),
            u32::try_from(workspace_id)?,
        )]),
        HashMap::from([(source_id.to_string(), source_hash)]),
        HashSet::new(),
    );
    let fast_path = dir.path().join("vector.fast.idx");
    let quality_path = dir.path().join("vector.quality.idx");

    let mut fast_writer = VectorIndex::create_with_revision(
        &fast_path,
        fast_embedder.id(),
        "rev-progressive-fast",
        fast_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    let mut quality_writer = VectorIndex::create_with_revision(
        &quality_path,
        quality_embedder.id(),
        "rev-progressive-quality",
        quality_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;

    // Write every message into both tiers under the same document id.
    for (message_id, created_at_ms, content) in &message_rows {
        // The content hash is taken over the canonicalized text; the embeddings below
        // are computed from the raw content.
        let canonical = canonicalize_for_embedding(content);
        let doc_id = SemanticDocId {
            message_id: *message_id,
            chunk_idx: 0,
            agent_id: u32::try_from(agent_id)?,
            workspace_id: u32::try_from(workspace_id)?,
            source_id: source_hash,
            role: ROLE_USER,
            created_at_ms: *created_at_ms,
            content_hash: Some(content_hash(&canonical)),
        }
        .to_doc_id_string();

        let fast_vec = fast_embedder.embed_sync(content)?;
        fast_writer.write_record(&doc_id, &fast_vec)?;
        let quality_vec = quality_embedder.embed_sync(content)?;
        quality_writer.write_record(&doc_id, &quality_vec)?;
    }
    fast_writer.finish()?;
    quality_writer.finish()?;

    // Assemble the client field by field so it can use the in-memory connection above;
    // a reader that fails to open is tolerated and stored as `None`.
    let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
    let client = SearchClient {
        reader,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{}|schema:{}", CACHE_KEY_VERSION, FS_CASS_SCHEMA_HASH),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    // Only the fast tier is registered explicitly; the quality index is left on disk.
    // NOTE(review): presumably the progressive context finds it by file name — confirm.
    let semantic_embedder: Arc<dyn Embedder> = fast_embedder;
    client.set_semantic_context(
        semantic_embedder,
        VectorIndex::open(&fast_path)?,
        filter_maps,
        None,
        Some(fast_path),
    )?;

    Ok(ProgressiveHybridFixture {
        _dir: dir,
        client: Arc::new(client),
        query,
    })
}
8713
/// Test-local shorthand that forwards `raw` to `nfc_sanitize_query` and returns its result.
fn sanitize_query(raw: &str) -> String {
    nfc_sanitize_query(raw)
}
8717
/// Test-local shorthand that forwards `query` to frankensearch's
/// `cass_parse_boolean_query` and returns the parsed tokens.
fn parse_boolean_query(query: &str) -> Vec<FsCassQueryToken> {
    fs_cass_parse_boolean_query(query)
}
8721
8722 fn sqlite_master_name_count(db_path: &Path, name: &str) -> Result<i64> {
8723 let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref())?;
8724 Ok(conn.query_row_map(
8725 "SELECT COUNT(*) FROM sqlite_master WHERE name = ?1",
8726 &[ParamValue::from(name)],
8727 |row| row.get_typed(0),
8728 )?)
8729 }
8730
// Local aliases for the frankensearch query-token and wildcard-pattern types.
type QueryToken = FsCassQueryToken;
type WildcardPattern = FsCassWildcardPattern;
// A parsed query expressed as a list of tokens.
type QueryTokenList = Vec<QueryToken>;
8734
/// Profiling harness for progressive hybrid search; ignored in normal test runs.
///
/// Runs one search with a no-op callback, then 24 searches that count the `Initial`
/// and `Refined` phase events and the hits they carry. A reported refinement failure
/// aborts the harness.
#[test]
#[ignore = "profiling harness for live hybrid progressive search"]
fn progressive_hybrid_profile_harness() -> Result<()> {
    let fixture = build_progressive_hybrid_fixture()?;
    let runtime = asupersync::runtime::RuntimeBuilder::current_thread()
        .build()
        .map_err(|err| anyhow!("build test runtime failed: {err}"))?;
    let iterations = 24usize;

    // First run: same request as the loop below, but its events are discarded.
    runtime.block_on(async {
        let cx = FsCx::for_request();
        fixture
            .client
            .search_progressive_with_callback(
                ProgressiveSearchRequest {
                    cx: &cx,
                    query: &fixture.query,
                    filters: SearchFilters::default(),
                    limit: 16,
                    sparse_threshold: 0,
                    field_mask: FieldMask::new(false, true, true, true),
                    mode: SearchMode::Hybrid,
                },
                |_| {},
            )
            .await
    })?;

    let mut initial_events = 0usize;
    let mut refined_events = 0usize;
    let mut total_hits = 0usize;
    for _ in 0..iterations {
        // Captured by the callback; checked after the search finishes.
        let mut refinement_error = None;
        runtime.block_on(async {
            let cx = FsCx::for_request();
            fixture
                .client
                .search_progressive_with_callback(
                    ProgressiveSearchRequest {
                        cx: &cx,
                        query: &fixture.query,
                        filters: SearchFilters::default(),
                        limit: 16,
                        sparse_threshold: 0,
                        field_mask: FieldMask::new(false, true, true, true),
                        mode: SearchMode::Hybrid,
                    },
                    |event| match event {
                        ProgressiveSearchEvent::Phase { kind, result, .. } => {
                            assert!(
                                !result.hits.is_empty(),
                                "progressive harness expects non-empty hits for each phase"
                            );
                            total_hits += result.hits.len();
                            match kind {
                                ProgressivePhaseKind::Initial => initial_events += 1,
                                ProgressivePhaseKind::Refined => refined_events += 1,
                            }
                        }
                        ProgressiveSearchEvent::RefinementFailed { error, .. } => {
                            refinement_error = Some(error);
                        }
                    },
                )
                .await
        })?;
        if let Some(error) = refinement_error {
            bail!("progressive harness refinement failed: {error}");
        }
    }

    // Exactly one initial and one refined phase per measured iteration.
    assert_eq!(initial_events, iterations);
    assert_eq!(refined_events, iterations);
    // Lower bound only: at least `limit` (16) hits per iteration across all phases.
    assert!(
        total_hits >= iterations.saturating_mul(16),
        "harness should observe a full page for each phase"
    );

    Ok(())
}
8815
/// Interning the same text twice must hand back the very same allocation.
#[test]
fn interner_returns_same_arc_for_same_string() {
    let interner = StringInterner::new(100);

    let first = interner.intern("test_query");
    let second = interner.intern("test_query");

    let shares_allocation = Arc::ptr_eq(&first, &second);
    assert!(shares_allocation);
    assert_eq!(&*first, "test_query");
}
8831
/// Distinct texts must be interned into distinct allocations, each keeping its own content.
#[test]
fn interner_different_strings_return_different_arcs() {
    let interner = StringInterner::new(100);

    let left = interner.intern("query1");
    let right = interner.intern("query2");

    let shares_allocation = Arc::ptr_eq(&left, &right);
    assert!(!shares_allocation);
    assert_eq!(&*left, "query1");
    assert_eq!(&*right, "query2");
}
8843
/// The empty string is interned like any other value: same allocation, empty content.
#[test]
fn interner_handles_empty_string() {
    let interner = StringInterner::new(100);

    let first = interner.intern("");
    let second = interner.intern("");

    let shares_allocation = Arc::ptr_eq(&first, &second);
    assert!(shares_allocation);
    assert_eq!(&*first, "");
}
8854
/// Non-ASCII text (CJK and emoji) is deduplicated and returned unchanged.
#[test]
fn interner_handles_unicode() {
    let interner = StringInterner::new(100);

    let cjk_first = interner.intern("测试查询");
    let cjk_second = interner.intern("测试查询");
    let emoji = interner.intern("emoji 🔍 search");

    let shares_allocation = Arc::ptr_eq(&cjk_first, &cjk_second);
    assert!(shares_allocation);
    assert_eq!(&*emoji, "emoji 🔍 search");
}
8866
/// With capacity 3, interning a fourth distinct string keeps the entry count at 3, and a
/// string interned earlier can still be interned again with the right content.
#[test]
fn interner_respects_lru_eviction() {
    let interner = StringInterner::new(3);

    // Fill the interner to capacity; the handles stay alive for the whole test.
    let _initial: Vec<_> = ["query1", "query2", "query3"]
        .iter()
        .map(|&text| interner.intern(text))
        .collect();
    assert_eq!(interner.len(), 3);

    // One more distinct entry must not grow the interner past its capacity.
    let _overflow = interner.intern("query4");
    assert_eq!(interner.len(), 3);

    let reinterned = interner.intern("query1");
    assert_eq!(&*reinterned, "query1");
}
8886
/// Four threads intern the same 100 strings ten times each; afterwards, interning any of
/// those strings twice in a row must still yield the same allocation.
#[test]
fn interner_concurrent_access() {
    use std::thread;

    let interner = Arc::new(StringInterner::new(1000));
    let queries: Vec<String> = (0..100).map(|i| format!("query_{}", i)).collect();

    // Start every worker before joining any of them.
    let mut workers = Vec::with_capacity(4);
    for _ in 0..4 {
        let shared = Arc::clone(&interner);
        let batch = queries.clone();
        workers.push(thread::spawn(move || {
            for _pass in 0..10 {
                for text in &batch {
                    let _ = shared.intern(text);
                }
            }
        }));
    }
    for worker in workers {
        worker.join().unwrap();
    }

    for text in &queries {
        let first = interner.intern(text);
        let second = interner.intern(text);
        assert!(Arc::ptr_eq(&first, &second));
    }
}
8920
/// A mixed-case two-word query is lowercased and split into its two tokens.
#[test]
fn query_terms_lower_basic() {
    let lowered = QueryTermsLower::from_query("Hello World");

    assert_eq!(lowered.query_lower, "hello world");
    let tokens = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(tokens, ["hello", "world"]);
}
8933
/// An empty query reports itself as empty and yields no tokens.
#[test]
fn query_terms_lower_empty() {
    let lowered = QueryTermsLower::from_query("");

    assert!(lowered.is_empty());
    let token_count = lowered.tokens().count();
    assert_eq!(token_count, 0);
}
8941
/// A single upper-case word produces exactly one lowercased token.
#[test]
fn query_terms_lower_single_term() {
    let lowered = QueryTermsLower::from_query("TEST");

    let tokens = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(tokens, ["test"]);
}
8949
/// Punctuation separates tokens: the apostrophe in "how's" splits it into "how" and "s".
#[test]
fn query_terms_lower_with_punctuation() {
    let lowered = QueryTermsLower::from_query("hello, world! how's it?");

    let tokens = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(tokens, ["hello", "world", "how", "s", "it"]);
}
8957
/// Accented letters are lowercased and kept inside their tokens.
#[test]
fn query_terms_lower_unicode() {
    let lowered = QueryTermsLower::from_query("Héllo Wörld");

    assert_eq!(lowered.query_lower, "héllo wörld");
    let tokens = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(tokens, ["héllo", "wörld"]);
}
8966
/// The bloom mask of a non-empty query is non-zero and identical for identical queries.
#[test]
fn query_terms_lower_bloom_mask() {
    let original = QueryTermsLower::from_query("test");
    assert_ne!(original.bloom_mask(), 0);

    // Building the terms again from the same text must reproduce the same mask.
    let rebuilt = QueryTermsLower::from_query("test");
    assert_eq!(original.bloom_mask(), rebuilt.bloom_mask());
}
8978
/// A cached hit matches precomputed terms for a word in its content and rejects terms
/// for a word that appears nowhere in it.
#[test]
fn hit_matches_with_precomputed_terms() {
    let body = "hello world content";
    let hit = SearchHit {
        title: "Test Title".into(),
        snippet: "".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let cached = cached_hit_from(&hit);

    // Present in the content.
    let present = QueryTermsLower::from_query("hello");
    let present_matches = hit_matches_query_cached_precomputed(&cached, &present);
    assert!(present_matches);

    // Absent from both title and content.
    let absent = QueryTermsLower::from_query("missing");
    let absent_matches = hit_matches_query_cached_precomputed(&cached, &absent);
    assert!(!absent_matches);
}
9008
9009 fn make_fused_hit(
9014 id: &str,
9015 rrf: f32,
9016 lexical: Option<usize>,
9017 semantic: Option<usize>,
9018 ) -> FusedHit {
9019 FusedHit {
9020 key: SearchHitKey {
9021 source_id: "local".to_string(),
9022 source_path: id.to_string(),
9023 conversation_id: None,
9024 title: String::new(),
9025 line_number: None,
9026 created_at: None,
9027 content_hash: 0,
9028 },
9029 score: HybridScore {
9030 rrf,
9031 lexical_rank: lexical,
9032 semantic_rank: semantic,
9033 lexical_score: None,
9034 semantic_score: None,
9035 },
9036 hit: SearchHit {
9037 title: id.into(),
9038 snippet: "".into(),
9039 content: "".into(),
9040 content_hash: 0,
9041 score: rrf,
9042 source_path: id.into(),
9043 agent: "test".into(),
9044 workspace: "test".into(),
9045 workspace_original: None,
9046 created_at: None,
9047 line_number: None,
9048 match_type: MatchType::Exact,
9049 source_id: "local".into(),
9050 origin_kind: "local".into(),
9051 origin_host: None,
9052 conversation_id: None,
9053 },
9054 }
9055 }
9056
9057 fn make_federated_merge_hit(id: &str, agent: &str) -> SearchHit {
9058 SearchHit {
9059 title: id.into(),
9060 snippet: String::new(),
9061 content: id.into(),
9062 content_hash: stable_content_hash(id),
9063 score: 0.0,
9064 source_path: format!("{id}.jsonl"),
9065 agent: agent.into(),
9066 workspace: "workspace".into(),
9067 workspace_original: None,
9068 created_at: Some(1_700_000_000_000),
9069 line_number: Some(1),
9070 match_type: MatchType::Exact,
9071 source_id: "local".into(),
9072 origin_kind: "local".into(),
9073 origin_host: None,
9074 conversation_id: None,
9075 }
9076 }
9077
9078 fn make_federated_ranked_hit(
9079 shard_index: usize,
9080 shard_rank: usize,
9081 id: &str,
9082 ) -> FederatedRankedHit {
9083 FederatedRankedHit {
9084 hit: make_federated_merge_hit(id, &format!("shard-{shard_index}")),
9085 shard_index,
9086 shard_rank,
9087 fused_score: federated_rrf_score(shard_rank),
9088 }
9089 }
9090
/// Hits with the same per-shard rank come out ordered by source path, independent of
/// input order, and all carry the rank-0 RRF score.
#[test]
fn federated_merge_orders_equal_rank_hits_by_stable_hit_key() {
    let ranked_inputs = vec![
        make_federated_ranked_hit(2, 0, "zeta"),
        make_federated_ranked_hit(0, 0, "bravo"),
        make_federated_ranked_hit(1, 0, "alpha"),
    ];
    let merged = merge_federated_ranked_hits(ranked_inputs);

    let paths: Vec<&str> = merged
        .iter()
        .map(|hit| hit.source_path.as_str())
        .collect();
    assert_eq!(paths, vec!["alpha.jsonl", "bravo.jsonl", "zeta.jsonl"]);

    let all_scores_equal = merged
        .iter()
        .all(|hit| (hit.score - federated_rrf_score(0)).abs() < f32::EPSILON);
    assert!(
        all_scores_equal,
        "equal per-shard rank should produce equal RRF scores"
    );
}
9111
/// A better per-shard rank wins over alphabetical key order: "zeta" at rank 0 precedes
/// "alpha" at rank 1.
#[test]
fn federated_merge_keeps_rrf_rank_ahead_of_stable_key() {
    let ranked_inputs = vec![
        make_federated_ranked_hit(0, 1, "alpha"),
        make_federated_ranked_hit(1, 0, "zeta"),
    ];
    let merged = merge_federated_ranked_hits(ranked_inputs);

    let paths: Vec<&str> = merged
        .iter()
        .map(|hit| hit.source_path.as_str())
        .collect();
    assert_eq!(paths, vec!["zeta.jsonl", "alpha.jsonl"]);

    let (winner, runner_up) = (&merged[0], &merged[1]);
    assert!(winner.score > runner_up.score);
}
9126
/// Two hits that differ only in shard (and agent label) are ordered by shard index.
#[test]
fn federated_merge_uses_shard_index_as_duplicate_final_tiebreak() {
    // Same id and rank for both entries; only the shard index and agent label vary.
    let duplicate_from = |shard_index: usize, agent: &str| FederatedRankedHit {
        hit: make_federated_merge_hit("same", agent),
        shard_index,
        shard_rank: 0,
        fused_score: federated_rrf_score(0),
    };

    let merged = merge_federated_ranked_hits(vec![
        duplicate_from(2, "shard-2"),
        duplicate_from(0, "shard-0"),
    ]);

    assert_eq!(merged[0].agent, "shard-0");
    assert_eq!(merged[1].agent, "shard-2");
}
9147
/// Selecting the top 3 of 5 hits returns the three highest RRF scores in descending order.
#[test]
fn top_k_fused_basic() {
    // (id, rrf score, lexical rank)
    let specs: [(&str, f32, usize); 5] = [
        ("a", 1.0, 0),
        ("b", 3.0, 1),
        ("c", 2.0, 2),
        ("d", 5.0, 3),
        ("e", 4.0, 4),
    ];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, rrf, rank)| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 3);

    assert_eq!(top.len(), 3);
    let order: Vec<&str> = top
        .iter()
        .map(|hit| hit.key.source_path.as_str())
        .collect();
    assert_eq!(order, ["d", "e", "b"]);
}
9165
/// An empty input yields an empty result regardless of `k`.
#[test]
fn top_k_fused_empty() {
    let no_hits = Vec::<FusedHit>::new();

    let top = top_k_fused(no_hits, 10);

    assert!(top.is_empty());
}
9172
/// Asking for zero results yields an empty result even when hits are available.
#[test]
fn top_k_fused_k_zero() {
    let specs: [(&str, f32, usize); 2] = [("a", 1.0, 0), ("b", 2.0, 1)];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, rrf, rank)| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 0);

    assert!(top.is_empty());
}
9182
/// When `k` exceeds the number of hits, every hit is returned, best score first.
#[test]
fn top_k_fused_k_larger_than_n() {
    let specs: [(&str, f32, usize); 2] = [("a", 1.0, 0), ("b", 2.0, 1)];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, rrf, rank)| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 10);

    assert_eq!(top.len(), 2);
    let order: Vec<&str> = top
        .iter()
        .map(|hit| hit.key.source_path.as_str())
        .collect();
    assert_eq!(order, ["b", "a"]);
}
9196
/// When `k` equals the number of hits, the result is the full input sorted by score.
#[test]
fn top_k_fused_k_equals_n() {
    let specs: [(&str, f32, usize); 3] = [("a", 3.0, 0), ("b", 1.0, 1), ("c", 2.0, 2)];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, rrf, rank)| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 3);

    assert_eq!(top.len(), 3);
    let order: Vec<&str> = top
        .iter()
        .map(|hit| hit.key.source_path.as_str())
        .collect();
    assert_eq!(order, ["a", "c", "b"]);
}
9212
/// With `k = 1`, only the single highest-scoring hit is returned.
#[test]
fn top_k_fused_k_one() {
    let specs: [(&str, f32, usize); 3] = [("a", 1.0, 0), ("b", 3.0, 1), ("c", 2.0, 2)];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, rrf, rank)| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 1);

    assert_eq!(top.len(), 1);
    let best = &top[0];
    assert_eq!(best.key.source_path, "b");
    assert_eq!(best.score.rrf, 3.0);
}
9227
/// With three hits tied at 2.0 and one at 1.0, the top 2 are both drawn from the tie.
#[test]
fn top_k_fused_duplicate_scores() {
    let specs: [(&str, f32, usize); 4] = [
        ("a", 2.0, 0),
        ("b", 2.0, 1),
        ("c", 2.0, 2),
        ("d", 1.0, 3),
    ];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, rrf, rank)| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 2);

    assert_eq!(top.len(), 2);
    // Which of the tied hits are chosen is not asserted, only their score.
    for hit in &top {
        assert_eq!(hit.score.rrf, 2.0);
    }
}
9244
/// Among hits with equal RRF scores, the one ranked by both the lexical and the semantic
/// source comes first.
#[test]
fn top_k_fused_dual_source_tiebreaker() {
    // (id, lexical rank, semantic rank); every hit shares the same RRF score.
    let specs: [(&str, Option<usize>, Option<usize>); 3] = [
        ("a", Some(0), None),
        ("b", Some(1), Some(0)),
        ("c", None, Some(1)),
    ];
    let hits: Vec<FusedHit> = specs
        .iter()
        .map(|&(id, lexical, semantic)| make_fused_hit(id, 2.0, lexical, semantic))
        .collect();

    let top = top_k_fused(hits, 3);

    assert_eq!(top.len(), 3);
    let first = &top[0];
    assert_eq!(first.key.source_path, "b");
}
9260
/// Top 10 of 100 hits with scores 0..=99 must be hits 99 down to 90, in that order.
///
/// The test name indicates this input size is meant to exercise the quickselect path.
#[test]
fn top_k_fused_large_input_uses_quickselect() {
    let hits: Vec<FusedHit> = (0..100usize)
        .map(|i| make_fused_hit(&format!("hit_{}", i), i as f32, Some(i), None))
        .collect();

    let top = top_k_fused(hits, 10);

    assert_eq!(top.len(), 10);
    // Walk the result alongside the expected ids 99, 98, ... in descending order.
    for (hit, expected) in top.iter().zip((0..100usize).rev()) {
        assert_eq!(hit.key.source_path, format!("hit_{}", expected));
        assert_eq!(hit.score.rrf, expected as f32);
    }
}
9277
/// For several input sizes and values of `k`, `top_k_fused` must agree element by
/// element with a full sort by `cmp_fused_hit_desc` truncated to `k`.
#[test]
fn top_k_fused_equivalence_with_full_sort() {
    for n in [10usize, 50, 100, 200] {
        // Only values of `k` that do not exceed the input size are checked.
        for k in [1usize, 5, 10, 25].iter().copied().filter(|&k| k <= n) {
            let hits: Vec<FusedHit> = (0..n)
                .map(|i| {
                    // Deterministic scores that are not monotonic in the index.
                    let score = ((i * 17 + 7) % 1000) as f32;
                    make_fused_hit(&format!("hit_{}", i), score, Some(i), None)
                })
                .collect();

            // Reference answer: sort everything, keep the first `k`.
            let mut expected = hits.clone();
            expected.sort_by(cmp_fused_hit_desc);
            expected.truncate(k);

            let actual = top_k_fused(hits, k);

            assert_eq!(actual.len(), expected.len(), "n={}, k={}", n, k);

            for (got, want) in actual.iter().zip(expected.iter()) {
                assert_eq!(
                    got.key.source_path, want.key.source_path,
                    "n={}, k={}: mismatch",
                    n, k
                );
                assert_eq!(
                    got.score.rrf, want.score.rrf,
                    "n={}, k={}: score mismatch",
                    n, k
                );
            }
        }
    }
}
9318
/// `cmp_fused_hit_desc` orders descending by score: the lower-scored hit compares as
/// `Greater`, the higher-scored one as `Less`, and a hit equals itself.
#[test]
fn cmp_fused_hit_desc_basic_ordering() {
    let lower = make_fused_hit("a", 2.0, Some(0), None);
    let higher = make_fused_hit("b", 3.0, Some(1), None);

    let cases = [
        (&lower, &higher, CmpOrdering::Greater),
        (&higher, &lower, CmpOrdering::Less),
        (&lower, &lower, CmpOrdering::Equal),
    ];
    for (lhs, rhs, expected) in cases {
        assert_eq!(cmp_fused_hit_desc(lhs, rhs), expected);
    }
}
9329
/// Cached-hit matching is prefix based: the query "row" must not match the content
/// "arrow", where it occurs only in the middle of the word.
#[test]
fn cache_enforces_prefix_matching() {
    let word = "arrow";
    let hit = SearchHit {
        title: "test".into(),
        snippet: "".into(),
        content: word.into(),
        content_hash: stable_content_hash(word),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // Every bloom bit is set, so the decision rests on the text comparison alone.
    // NOTE(review): assumes the bloom mask is only a pre-filter — confirm.
    let cached = CachedHit {
        hit: hit.clone(),
        lc_content: "arrow".into(),
        lc_title: Some("test".into()),
        bloom64: u64::MAX,
    };

    let matched = hit_matches_query_cached(&cached, "row");

    assert!(
        !matched,
        "Query 'row' should NOT match content 'arrow' (prefix match required)"
    );
}
9372
/// Indexes two conversations whose single message has identical content but
/// a different source path and timestamp, then requests two consecutive
/// pages of size one. Each page must hold exactly one hit and the two pages
/// must come from different source paths.
#[test]
fn search_deduplication_across_pages_repro() {
    let dir = TempDir::new().unwrap();
    let index_path = dir.path();
    let mut index = TantivyIndex::open_or_create(index_path).unwrap();

    // The fixtures differ only in source path and message timestamp.
    for (path, created_at) in [("path/1", 1000), ("path/2", 2000)] {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(created_at),
            content: "duplicate content".into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "agent1".into(),
            external_id: None,
            title: None,
            workspace: None,
            source_path: path.into(),
            started_at: None,
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation).unwrap();
    }
    index.commit().unwrap();

    let client = SearchClient::open(index_path, None).unwrap().unwrap();

    // Page one: limit 1, offset 0.
    let first_page = client
        .search("duplicate", SearchFilters::default(), 1, 0, FieldMask::FULL)
        .unwrap();
    assert_eq!(first_page.len(), 1);

    // Page two: limit 1, offset 1.
    let second_page = client
        .search("duplicate", SearchFilters::default(), 1, 1, FieldMask::FULL)
        .unwrap();
    assert_eq!(second_page.len(), 1);

    assert_ne!(first_page[0].source_path, second_page[0].source_path);
}
9450
/// Wildcard and boolean queries must leave the cache-miss counter at zero,
/// whereas a plain single-term query must record exactly one miss. The
/// client has no reader and no SQLite connection, and search results are
/// deliberately ignored; only the cache statistics are inspected.
#[test]
fn cache_skips_complex_queries() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let uncached_queries = [
        ("foo*", "Wildcard query should not trigger cache miss"),
        ("foo OR bar", "Boolean query should not trigger cache miss"),
    ];
    for (query, reason) in uncached_queries {
        let _ = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL);
        let after = client.cache_stats();
        assert_eq!(after.cache_miss, 0, "{}", reason);
    }

    let _ = client.search("simple", SearchFilters::default(), 10, 0, FieldMask::FULL);
    let after_simple = client.cache_stats();
    assert_eq!(
        after_simple.cache_miss, 1,
        "Simple query should trigger cache miss"
    );
}
9500
/// Stores hits under a multi-byte (Japanese) prefix and looks them up with a
/// longer multi-byte query. The lookup must return the stored hit, which
/// exercises prefix handling on non-ASCII character boundaries.
#[test]
fn cache_prefix_lookup_handles_utf8_boundaries() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let greeting_hit = SearchHit {
        title: "こんにちは".into(),
        snippet: String::new(),
        content: "こんにちは 世界".into(),
        content_hash: stable_content_hash("こんにちは 世界"),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let stored = vec![greeting_hit];

    // Seed the cache under the two-character prefix.
    client.put_cache("こん", &SearchFilters::default(), &stored);

    // Look up with the full five-character word.
    let found = client
        .cached_prefix_hits("こんにちは", &SearchFilters::default())
        .unwrap();
    assert_eq!(found.len(), 1);
    assert_eq!(found[0].hit.title, "こんにちは");
}
9547
/// Two checks in one test:
/// 1. a cached hit built by `cached_hit_from` matches a term present in its
///    content and rejects a term that is absent;
/// 2. each `Metrics` increment is reflected once in `snapshot_all`.
#[test]
fn bloom_gate_rejects_missing_terms() {
    let source_hit = SearchHit {
        title: "hello world".into(),
        snippet: "hello world".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let cached_entry = cached_hit_from(&source_hit);

    let present = hit_matches_query_cached(&cached_entry, "hello");
    assert!(present);
    let absent = hit_matches_query_cached(&cached_entry, "missing");
    assert!(!absent);

    // Bump every counter exactly once and read them back.
    let counters = Metrics::default();
    counters.inc_cache_hits();
    counters.inc_cache_miss();
    counters.inc_cache_shortfall();
    counters.inc_reload();

    // The fifth element of the snapshot is not checked here.
    let (hit_count, miss_count, shortfall_count, reload_count, _) = counters.snapshot_all();
    assert_eq!(
        (hit_count, miss_count, shortfall_count, reload_count),
        (1, 1, 1, 1)
    );
}
9580
/// `ProgressiveLexicalHit::from_search_hit` must drop the full content when
/// the field mask's first flag is `false` while still copying title, snippet
/// and the other metadata, and must keep the content when that flag is
/// `true`.
#[test]
fn progressive_lexical_hit_omits_unused_content() {
    let body = "hello world from a much larger conversation body";
    let original = SearchHit {
        title: "hello world".into(),
        snippet: "hello **world**".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: Some(3),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // Mask with the first flag off: content is expected to be left out.
    let without_content_mask = FieldMask::new(false, true, true, true);
    let trimmed = ProgressiveLexicalHit::from_search_hit(&original, without_content_mask);
    assert_eq!(trimmed.title, original.title);
    assert_eq!(trimmed.snippet, original.snippet);
    assert!(
        trimmed.content.is_empty(),
        "snippet-only progressive cache should not retain full content"
    );
    assert_eq!(trimmed.match_type, original.match_type);
    assert_eq!(trimmed.line_number, original.line_number);
    assert_eq!(trimmed.source_path, original.source_path);
    assert_eq!(trimmed.agent, original.agent);
    assert_eq!(trimmed.workspace, original.workspace);

    // Mask with every flag on: content must survive.
    let everything_mask = FieldMask::new(true, true, true, true);
    let complete = ProgressiveLexicalHit::from_search_hit(&original, everything_mask);
    assert_eq!(complete.content, original.content);
}
9620
/// With no reader and no SQLite connection on the client,
/// `progressive_phase_to_result` must still produce a hit for message 42
/// from the supplied lexical cache: title, snippet and source path come from
/// the cached entry, content stays empty under the mask, and the score is
/// taken from the scored result.
#[test]
fn progressive_phase_reuses_lexical_cache_without_db_hydration() -> Result<()> {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let field_mask = FieldMask::new(false, true, true, true);
    let cached_source = SearchHit {
        title: "lexical title".into(),
        snippet: "lexical snippet".into(),
        content: "full lexical body".into(),
        content_hash: stable_content_hash("full lexical body"),
        score: 0.0,
        source_path: "/tmp/session.jsonl".into(),
        agent: "codex".into(),
        workspace: "/tmp".into(),
        workspace_original: Some("/original".into()),
        created_at: Some(1_700_000_000_000),
        line_number: Some(7),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // Register the masked hit under message id 42.
    let mut lexical_cache = ProgressiveLexicalCache::default();
    let cached_entry = ProgressiveLexicalHit::from_search_hit(&cached_source, field_mask);
    lexical_cache.hits_by_message.insert(42, cached_entry);

    // Scored result whose doc id refers to message 42.
    let hash_hex = "00".repeat(32);
    let scored = FsScoredResult {
        doc_id: format!("m|42|0|1|1|1|1|1700000000000|{hash_hex}"),
        score: 0.91,
        source: FsScoreSource::Lexical,
        index: None,
        fast_score: None,
        quality_score: None,
        lexical_score: Some(0.91),
        rerank_score: None,
        explanation: None,
        metadata: None,
    };
    let results = vec![scored];

    let outcome = client.progressive_phase_to_result(
        &results,
        ProgressivePhaseContext {
            query: "merged title",
            filters: &SearchFilters::default(),
            field_mask,
            lexical_cache: Some(&lexical_cache),
            limit: 1,
            fetch_limit: 1,
        },
    )?;

    assert_eq!(outcome.hits.len(), 1);
    let produced = &outcome.hits[0];
    assert_eq!(produced.title, cached_source.title);
    assert_eq!(produced.snippet, cached_source.snippet);
    assert!(
        produced.content.is_empty(),
        "masked lexical cache should still avoid carrying full content"
    );
    assert_eq!(produced.source_path, cached_source.source_path);
    assert_eq!(produced.score, 0.91);

    Ok(())
}
9702
/// Indexes one "codex" conversation and searches for "hello" with an agent
/// filter set to "codex". Exactly one hit is expected, attributed to that
/// agent, with a snippet that contains the query term.
#[test]
fn search_returns_results_with_filters_and_pagination() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let empty_snippet = NormalizedSnippet {
        file_path: None,
        start_line: None,
        end_line: None,
        language: None,
        snippet_text: None,
    };
    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "hello rust world".into(),
        extra: serde_json::json!({}),
        snippets: vec![empty_snippet],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("hello world convo".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: dir.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut agent_filter = SearchFilters::default();
    agent_filter.agents.insert("codex".into());

    let found = client.search("hello", agent_filter, 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert_eq!(only.agent, "codex");
    assert!(only.snippet.contains("hello"));
    Ok(())
}
9746
/// Indexes two "needle" conversations in different workspaces with
/// timestamps 10 and 20, then searches with a workspace filter of "/ws/b"
/// and a created range of 15..=25. Only the second conversation may match.
#[test]
fn search_honors_created_range_and_workspace() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, workspace, file name, timestamp, message content)
    let fixtures = [
        ("needle one", "/ws/a", "a.jsonl", 10, "alpha needle"),
        ("needle two", "/ws/b", "b.jsonl", 20, "\nneedle second line"),
    ];
    for (title, workspace, file_name, timestamp, content) in fixtures {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: content.into(),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: dir.path().join(file_name),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut narrowed = SearchFilters::default();
    narrowed.workspaces.insert("/ws/b".into());
    narrowed.created_from = Some(15);
    narrowed.created_to = Some(25);

    let found = client.search("needle", narrowed, 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert_eq!(only.workspace, "/ws/b");
    assert!(only.snippet.contains("second line"));
    Ok(())
}
9820
/// Indexes three documents that all match "pagination" and requests a page
/// with limit 1 and offset 1. Exactly one hit must be returned.
#[test]
fn pagination_skips_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    for i in 0..3 {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100 + i),
            content: format!("pagination needle document number {i}"),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        };
        let document = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{i}")),
            workspace: Some(std::path::PathBuf::from("/ws/p")),
            source_path: dir.path().join(format!("{i}.jsonl")),
            started_at: Some(100 + i),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&document)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // limit = 1, offset = 1
    let page = client.search(
        "pagination",
        SearchFilters::default(),
        1,
        1,
        FieldMask::FULL,
    )?;
    assert_eq!(page.len(), 1);
    Ok(())
}
9868
/// A query containing a hyphen ("cma-es") must find the single indexed
/// message that mentions "CMA-ES", and the returned snippet must contain
/// "cma" when lower-cased.
#[test]
fn search_matches_hyphenated_term() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "Need CMA-ES strategy and CMA ES variants".into(),
        extra: serde_json::json!({}),
        snippets: vec![NormalizedSnippet {
            file_path: None,
            start_line: None,
            end_line: None,
            language: None,
            snippet_text: None,
        }],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("cma-es notes".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: dir.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let found = client.search("cma-es", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);

    let lowered_snippet = found[0].snippet.to_lowercase();
    assert!(lowered_snippet.contains("cma"));
    Ok(())
}
9908
/// Short word prefixes must match the indexed message: "cal" has to find the
/// message containing "calculate", and "entr" has to find the same message
/// through "entropy".
#[test]
fn search_matches_prefix_edge_ngram() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1000),
        content: "please calculate the entropy".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("math logic".into()),
        workspace: Some(std::path::PathBuf::from("/ws/m")),
        source_path: dir.path().join("math.jsonl"),
        started_at: Some(1000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Prefix of "calculate".
    let by_cal = client.search("cal", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_cal.len(), 1);
    assert!(by_cal[0].content.contains("calculate"));

    // Prefix of "entropy".
    let by_entr = client.search("entr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_entr.len(), 1);

    Ok(())
}
9949
/// An identifier written in snake_case ("my_variable_name") must be
/// reachable both through a prefix of one of its parts ("vari") and through
/// a query that itself contains an underscore ("my_variable").
#[test]
fn search_matches_snake_case() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "check the my_variable_name please".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("code".into()),
        workspace: None,
        source_path: dir.path().join("c.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let by_part_prefix =
        client.search("vari", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_part_prefix.len(), 1);

    let by_underscored = client.search(
        "my_variable",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(by_underscored.len(), 1);

    Ok(())
}
9995
/// Queries that carry punctuation ("c++" and "foo.bar") must each find the
/// single indexed message whose content contains those same tokens.
#[test]
fn search_matches_symbols_stripped() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "working with c++ and foo.bar today".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("symbols".into()),
        workspace: None,
        source_path: dir.path().join("s.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let plus_plus = client.search("c++", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(plus_plus.len(), 1);

    let dotted = client.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(dotted.len(), 1);

    Ok(())
}
10035
/// Verifies the `match_type` reported on the first hit for four query
/// shapes against a single indexed message ("the request handler
/// delegates"):
///
/// * `handler`  -> `MatchType::Exact`
/// * `hand*`    -> `MatchType::Prefix`
/// * `*handler` -> `MatchType::Suffix`
/// * `*andle*`  -> `MatchType::Substring`
///
/// Each result set is asserted to be non-empty before its first element is
/// inspected, so a query that unexpectedly returns nothing fails with a
/// message naming the query instead of an index-out-of-bounds panic.
#[test]
fn search_sets_match_type_for_wildcards() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: dir.path().join("h.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1),
            content: "the request handler delegates".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let exact = client.search("handler", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert!(!exact.is_empty(), "query 'handler' returned no hits");
    assert_eq!(exact[0].match_type, MatchType::Exact);

    let prefix = client.search("hand*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert!(!prefix.is_empty(), "query 'hand*' returned no hits");
    assert_eq!(prefix[0].match_type, MatchType::Prefix);

    let suffix = client.search("*handler", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert!(!suffix.is_empty(), "query '*handler' returned no hits");
    assert_eq!(suffix[0].match_type, MatchType::Suffix);

    let substring =
        client.search("*andle*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert!(!substring.is_empty(), "query '*andle*' returned no hits");
    assert_eq!(substring[0].match_type, MatchType::Substring);

    Ok(())
}
10081
/// Searching for the inner fragment "andle" through `search_with_fallback`
/// must report that the wildcard fallback was used and return the single
/// indexed message tagged `MatchType::ImplicitWildcard`.
#[test]
fn search_with_fallback_marks_implicit_wildcard() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "the request handler delegates".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: dir.path().join("h2.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let outcome = client.search_with_fallback(
        "andle",
        SearchFilters::default(),
        10,
        0,
        2,
        FieldMask::FULL,
    )?;

    assert!(outcome.wildcard_fallback);
    assert_eq!(outcome.hits.len(), 1);
    let only = &outcome.hits[0];
    assert_eq!(only.match_type, MatchType::ImplicitWildcard);

    Ok(())
}
10127
/// With only an in-memory SQLite connection (no tables, no reader), a
/// leading-wildcard query must return an empty result set rather than an
/// error.
#[test]
fn sqlite_backend_skips_wildcard_queries() -> Result<()> {
    let bare_connection = Connection::open(":memory:")?;
    let sqlite_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(bare_connection))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let wildcard_hits =
        sqlite_only_client.search("*handler", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert!(
        wildcard_hits.is_empty(),
        "wildcard should skip sqlite fallback, not error"
    );

    Ok(())
}
10157
/// Builds an in-memory SQLite database whose only conversation has a NULL
/// workspace, then searches through a client that has no Tantivy reader.
/// The hit must come back with an empty workspace string, line number 1 and
/// the "local" source id / origin kind.
#[test]
fn sqlite_backend_handles_null_workspace() -> Result<()> {
    let conn = Connection::open(":memory:")?;

    let schema = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
             CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
             CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
             CREATE TABLE conversations (
                id INTEGER PRIMARY KEY,
                agent_id INTEGER,
                workspace_id INTEGER,
                source_id TEXT,
                origin_host TEXT,
                title TEXT,
                source_path TEXT
             );
             CREATE TABLE messages (
                id INTEGER PRIMARY KEY,
                conversation_id INTEGER,
                idx INTEGER,
                content TEXT,
                created_at INTEGER
             );
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                content,
                title,
                agent,
                workspace,
                source_path,
                created_at UNINDEXED,
                content='',
                tokenize='porter'
             );";
    conn.execute_batch(schema)?;

    // Seed rows; the conversation deliberately has workspace_id = NULL.
    let seed_statements = [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 't', '/tmp/session.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)",
    ];
    for statement in seed_statements {
        conn.execute(statement)?;
    }

    conn.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
        params![
            1_i64,
            "auth token failure",
            "t",
            "codex",
            "/tmp/session.jsonl",
            42_i64
        ],
    )?;

    let sqlite_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let found =
        sqlite_only_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert_eq!(only.workspace, "");
    assert_eq!(only.line_number, Some(1));
    assert_eq!(only.source_id, "local");
    assert_eq!(only.origin_kind, "local");
    Ok(())
}
10236
/// Builds an in-memory SQLite database whose `fts_messages` table carries a
/// `message_id UNINDEXED` column (and no `content=''` option), with FTS
/// rowid 1 pointing at message id 42 (idx 4). A search through a client
/// without a Tantivy reader must resolve title, source path, workspace,
/// content, and line number 5 for that message.
#[test]
fn sqlite_backend_supports_legacy_fts_message_id_schema() -> Result<()> {
    let conn = Connection::open(":memory:")?;

    let legacy_schema = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
             CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
             CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
             CREATE TABLE conversations (
                id INTEGER PRIMARY KEY,
                agent_id INTEGER,
                workspace_id INTEGER,
                source_id TEXT,
                origin_host TEXT,
                title TEXT,
                source_path TEXT
             );
             CREATE TABLE messages (
                id INTEGER PRIMARY KEY,
                conversation_id INTEGER,
                idx INTEGER,
                content TEXT,
                created_at INTEGER
             );
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                content,
                title,
                agent,
                workspace,
                source_path,
                created_at UNINDEXED,
                message_id UNINDEXED,
                tokenize='porter'
             );";
    conn.execute_batch(legacy_schema)?;

    let seed_statements = [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/legacy')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'legacy title', '/tmp/legacy.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
             VALUES(42, 1, 4, 'legacy auth token failure', 99)",
    ];
    for statement in seed_statements {
        conn.execute(statement)?;
    }

    // The FTS rowid (1) differs from the message id (42) stored in message_id.
    conn.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at, message_id)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
        params![
            1_i64,
            "legacy auth token failure",
            "legacy title",
            "codex",
            "/legacy",
            "/tmp/legacy.jsonl",
            99_i64,
            42_i64
        ],
    )?;

    let sqlite_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let found =
        sqlite_only_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert_eq!(only.title, "legacy title");
    assert_eq!(only.source_path, "/tmp/legacy.jsonl");
    assert_eq!(only.workspace, "/legacy");
    assert_eq!(only.line_number, Some(5));
    assert_eq!(only.content, "legacy auth token failure");
    Ok(())
}
10323
/// Pairs an empty-but-openable Tantivy index with a SQLite database that
/// does contain a matching message. Calling `search_sqlite_fts5` directly
/// must find that message, while the regular `search` entry point must
/// return nothing because a Tantivy reader is present.
#[test]
fn tantivy_reader_skips_sqlite_fallback_on_empty_lexical_results() -> Result<()> {
    // Empty Tantivy index: committed with no documents.
    let dir = TempDir::new()?;
    let mut empty_index = TantivyIndex::open_or_create(dir.path())?;
    empty_index.commit()?;
    let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
    assert!(
        reader.is_some(),
        "test fixture should open a Tantivy reader even with an empty index"
    );

    // SQLite side: one conversation with one message that only lives here.
    let conn = Connection::open(":memory:")?;
    let schema = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
             CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
             CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
             CREATE TABLE conversations (
                id INTEGER PRIMARY KEY,
                agent_id INTEGER,
                workspace_id INTEGER,
                source_id TEXT,
                origin_host TEXT,
                title TEXT,
                source_path TEXT
             );
             CREATE TABLE messages (
                id INTEGER PRIMARY KEY,
                conversation_id INTEGER,
                idx INTEGER,
                content TEXT,
                created_at INTEGER
             );
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                content,
                title,
                agent,
                workspace,
                source_path,
                created_at UNINDEXED,
                content='',
                tokenize='porter'
             );";
    conn.execute_batch(schema)?;

    let seed_statements = [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/sqlite-only')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'sqlite fallback only', '/tmp/sqlite-only.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
             VALUES(1, 1, 0, 'sqliteonlytoken overflow candidate', 42)",
    ];
    for statement in seed_statements {
        conn.execute(statement)?;
    }

    conn.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            1_i64,
            "sqliteonlytoken overflow candidate",
            "sqlite fallback only",
            "codex",
            "/sqlite-only",
            "/tmp/sqlite-only.jsonl",
            42_i64
        ],
    )?;

    let client = SearchClient {
        reader,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Sanity check: the SQLite path on its own does find the message.
    let direct_sqlite_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(
        direct_sqlite_hits.len(),
        1,
        "fixture should prove sqlite fallback would have produced a hit"
    );

    // The public entry point must not fall back to SQLite here.
    let public_hits = client.search(
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        public_hits.is_empty(),
        "a live Tantivy reader should prevent sqlite fallback from populating empty lexical results"
    );
    Ok(())
}
10436
/// Opening the search-side SQLite guard must not touch FTS rebuild bookkeeping.
///
/// The fixture seeds a file-backed database through `FrankenStorage`, deletes the
/// `fts_frankensqlite_rebuild_generation` row from `meta`, and then asks a
/// `SearchClient` that only knows the database path for its SQLite guard.
/// Afterwards the meta row must still be absent and the number of
/// `fts_messages` schema rows must equal the count taken before the deletion.
/// The guard connection must also report the search-specific `PRAGMA cache_size`.
#[test]
fn sqlite_guard_does_not_repair_fts_when_generation_key_stale() -> Result<()> {
    const GENERATION_KEY: &str = "fts_frankensqlite_rebuild_generation";

    let scratch = TempDir::new()?;
    let db_path = scratch.path().join("stale-gen-fts.db");
    let db_location = db_path.to_string_lossy().into_owned();

    // Seed one conversation holding a single message; the storage handle goes
    // out of scope before the file is inspected.
    {
        let storage = FrankenStorage::open(&db_path)?;
        let agent_id = storage.ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        })?;
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_050),
            content: "message that should remain queryable".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("stale-gen-fts".into()),
            title: Some("Stale FTS generation".into()),
            source_path: PathBuf::from("/tmp/stale-gen-fts.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: "local".into(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, None, &conversation)?;
    }

    let schema_rows_before = sqlite_master_name_count(&db_path, "fts_messages")
        .context("count schema rows before generation key deletion")?;

    // Remove the generation marker so the database looks stale to any code
    // that consults it.
    {
        let raw = FrankenConnection::open(db_location.clone())?;
        raw.execute_compat(
            "DELETE FROM meta WHERE key = ?1",
            &[ParamValue::from(GENERATION_KEY)],
        )?;
    }

    // No reader and no pre-opened connection: only the path is provided.
    let client = SearchClient {
        reader: None,
        sqlite_path: Some(db_path.clone()),
        sqlite: Mutex::new(None),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let guard = client
        .sqlite_guard()
        .context("open sqlite guard for stale generation fixture")?;
    assert!(guard.is_some(), "sqlite guard should open the db");
    let guarded_conn = guard
        .as_ref()
        .expect("sqlite guard should hold a connection");
    let empty_params: [ParamValue; 0] = [];
    let cache_size: i64 =
        guarded_conn.query_row_map("PRAGMA cache_size;", &empty_params, |row| {
            row.get_typed(0)
        })?;
    assert_eq!(
        cache_size, -SEARCH_SQLITE_HYDRATION_CACHE_KIB,
        "search hydration should not inherit the general storage cache profile"
    );
    drop(guard);

    // Inspect the file through a separate connection once the guard is released.
    let verify_conn = FrankenConnection::open(db_location)?;
    let generation_after: Option<String> = verify_conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            &[ParamValue::from(GENERATION_KEY)],
            |row| row.get_typed(0),
        )
        .optional()?;
    assert!(
        generation_after.is_none(),
        "search sqlite guard must not mutate FTS rebuild metadata"
    );

    let schema_rows_after = sqlite_master_name_count(&db_path, "fts_messages")
        .context("count schema rows after sqlite guard reopen")?;
    assert_eq!(
        schema_rows_after, schema_rows_before,
        "read-only reopen must leave FTS schema state unchanged"
    );

    Ok(())
}
10553
/// The file-backed SQLite fallback must match hyphenated identifiers such as
/// `br-123`, both unfiltered and combined with a workspace filter.
///
/// Two conversations in different workspaces each contain `br-123`. Rows are
/// inserted directly, including their `fts_messages` entries, and the FTS
/// table is checked twice: once on the seeding connection and once through
/// `SearchClient::sqlite_guard` after the storage handle has been dropped.
/// `SearchClient::search` is then exercised with plain, leading-`OR`, dotted,
/// dotted-prefix, and prefix-wildcard spellings, and finally with a workspace
/// filter that must narrow the result to the `/ws/beta` conversation.
#[test]
fn sqlite_path_rusqlite_fallback_matches_hyphenated_ids_with_workspace_filter() -> Result<()> {
    const FTS_INSERT_SQL: &str =
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)";
    const FTS_MATCH_SQL: &str = "SELECT rowid FROM fts_messages WHERE fts_messages MATCH ?";

    let temp_dir = TempDir::new()?;
    let db_path = temp_dir.path().join("hyphenated-rusqlite-fallback.db");

    {
        let storage = FrankenStorage::open(&db_path)?;
        storage.ensure_search_fallback_fts_consistency()?;
        let conn = storage.raw();
        conn.execute(
            "INSERT INTO agents(id, slug, name, kind, created_at, updated_at)
             VALUES(1, 'codex', 'Codex', 'codex', 1, 1)",
        )?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws/alpha')")?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/ws/beta')")?;
        conn.execute(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'alpha bead', '/tmp/alpha.jsonl')",
        )?;
        conn.execute(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(2, 1, 2, 'local', NULL, 'beta bead', '/tmp/beta.jsonl')",
        )?;
        conn.execute(
            "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
             VALUES(11, 1, 0, 'user', 'Need follow-up on br-123 root cause', 100)",
        )?;
        conn.execute(
            "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
             VALUES(12, 2, 0, 'user', 'Need follow-up on br-123 user report', 101)",
        )?;
        // The FTS rows mirror the two messages above, keyed by message id.
        conn.execute_compat(
            FTS_INSERT_SQL,
            &[
                ParamValue::from(11_i64),
                ParamValue::from("Need follow-up on br-123 root cause"),
                ParamValue::from("alpha bead"),
                ParamValue::from("codex"),
                ParamValue::from("/ws/alpha"),
                ParamValue::from("/tmp/alpha.jsonl"),
                ParamValue::from(100_i64),
            ],
        )?;
        conn.execute_compat(
            FTS_INSERT_SQL,
            &[
                ParamValue::from(12_i64),
                ParamValue::from("Need follow-up on br-123 user report"),
                ParamValue::from("beta bead"),
                ParamValue::from("codex"),
                ParamValue::from("/ws/beta"),
                ParamValue::from("/tmp/beta.jsonl"),
                ParamValue::from(101_i64),
            ],
        )?;
        let preclose_total_rows = conn.query("SELECT rowid FROM fts_messages")?;
        assert_eq!(
            preclose_total_rows.len(),
            2,
            "freshly seeded file-backed FTS should retain the inserted rows"
        );
        let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
        // The parameter list is passed by reference; the `&` had been lost to
        // an HTML-entity decoding of `&para`.
        let preclose_rows = conn.query_with_params(
            FTS_MATCH_SQL,
            &params_from_iter(vec![ParamValue::from(transpiled.as_str())]),
        )?;
        assert_eq!(
            preclose_rows.len(),
            2,
            "freshly seeded file-backed FTS should match the transpiled hyphenated query before reopen"
        );
    }

    // Only the database path is supplied: no reader, no pre-opened connection.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: Some(db_path),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Repeat the raw FTS checks on the connection the client opens itself.
    let guard = client.sqlite_guard()?;
    let conn = guard.as_ref().expect("sqlite guard should reopen file db");
    let reopened_total_rows = conn.query("SELECT rowid FROM fts_messages")?;
    assert_eq!(
        reopened_total_rows.len(),
        2,
        "reopened file-backed FTS should still contain the seeded rows"
    );
    let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
    let raw_rows = conn.query_with_params(
        FTS_MATCH_SQL,
        &params_from_iter(vec![ParamValue::from(transpiled.as_str())]),
    )?;
    assert_eq!(
        raw_rows.len(),
        2,
        "reopened file-backed FTS should still match the transpiled hyphenated query"
    );
    drop(guard);

    let all_hits = client.search("br-123", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(all_hits.len(), 2);
    assert!(
        all_hits.iter().all(|hit| hit.content.contains("br-123")),
        "hyphenated bead IDs should survive the file-backed sqlite fallback path"
    );

    let leading_or_hits = client.search(
        "OR br-123",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(leading_or_hits.len(), 2);

    let dotted_hits = client.search(
        "br-123.jsonl",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(dotted_hits.len(), 2);

    let dotted_prefix_hits = client.search(
        "br-123.json*",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(dotted_prefix_hits.len(), 2);

    let prefix_hits =
        client.search("br-12*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(prefix_hits.len(), 2);

    // Restricting to one workspace must leave only that workspace's message.
    let filtered_hits = client.search(
        "br-123",
        SearchFilters {
            workspaces: HashSet::from_iter(["/ws/beta".to_string()]),
            ..SearchFilters::default()
        },
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(filtered_hits.len(), 1);
    assert_eq!(filtered_hits[0].workspace, "/ws/beta");
    assert_eq!(filtered_hits[0].source_path, "/tmp/beta.jsonl");
    assert!(filtered_hits[0].content.contains("br-123"));

    Ok(())
}
10723
/// With no Tantivy reader, `search` must return SQLite FTS hits ordered by
/// score.
///
/// Two messages match `auth`; the one stored under the `best` title repeats
/// the term three times and is expected first, with a strictly higher score
/// than the `worse` one.
#[test]
fn sqlite_backend_orders_hits_by_bm25_score() -> Result<()> {
    const SCHEMA_SQL: &str = "CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";
    const FTS_INSERT_SQL: &str =
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA_SQL)?;

    // Lookup rows shared by both conversations.
    db.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws')")?;

    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'best', '/tmp/best.jsonl')",
    )?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'worse', '/tmp/worse.jsonl')",
    )?;
    db.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(7, 1, 0, 'auth auth auth failure', 42)")?;
    db.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(8, 2, 0, 'auth failure', 43)")?;

    // FTS rows use the message ids as rowids.
    db.execute_compat(
        FTS_INSERT_SQL,
        params![
            7_i64,
            "auth auth auth failure",
            "best",
            "codex",
            "/ws",
            "/tmp/best.jsonl",
            42_i64
        ],
    )?;
    db.execute_compat(
        FTS_INSERT_SQL,
        params![
            8_i64,
            "auth failure",
            "worse",
            "codex",
            "/ws",
            "/tmp/worse.jsonl",
            43_i64
        ],
    )?;

    let client = SearchClient {
        reader: None,
        sqlite_path: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    // The FTS5 path is first called directly to confirm both rows match.
    let fts_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "auth",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(fts_hits.len(), 2);

    let ranked = client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(ranked.len(), 2);
    let (top, runner_up) = (&ranked[0], &ranked[1]);
    assert_eq!(top.title, "best");
    assert_eq!(runner_up.title, "worse");
    assert!(top.score > runner_up.score);

    Ok(())
}
10829
/// The FTS5 rank query must apply its page bounds without selecting message
/// content; content is requested only by the separate hydration query.
#[test]
fn sqlite_fts5_ranked_phase_defers_content_decode_until_after_limit() {
    const CONTENT_COLUMN: &str = "fts_messages.content";

    let default_filters = SearchFilters::default();
    let (ranking_sql, bind_params) =
        SearchClient::sqlite_fts5_rank_query("auth", &default_filters, 50, 0, false);

    let every_field = FieldMask::new(true, true, true, true);
    let hydration_sql = SearchClient::sqlite_fts5_hydrate_query(2, every_field, false);

    assert!(
        !ranking_sql.contains(CONTENT_COLUMN),
        "rank query must not decode large content rows before LIMIT"
    );
    assert!(
        hydration_sql.contains(CONTENT_COLUMN),
        "hydration query should still provide requested content"
    );
    assert!(
        ranking_sql.contains("LIMIT ? OFFSET ?"),
        "rank query must apply page bounds before hydration"
    );
    assert_eq!(
        bind_params.len(),
        3,
        "fts query plus limit and offset params"
    );
}
10854
/// Hydration of ranked FTS5 rows must be split into chunks that each fit under
/// `SQLITE_MAX_VARIABLE_NUMBER`.
///
/// The first half shows that a single hydration query built for
/// `SQLITE_MAX_VARIABLE_NUMBER + 1` rows contains too many `?` placeholders.
/// The second half feeds `SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17` ranked rows to
/// the chunker and expects one full chunk followed by a 17-row remainder.
#[test]
fn sqlite_fts5_hydration_chunks_stay_below_bind_variable_limit() {
    let every_field = FieldMask::new(true, true, true, true);
    let one_shot_sql = SearchClient::sqlite_fts5_hydrate_query(
        SQLITE_MAX_VARIABLE_NUMBER + 1,
        every_field,
        false,
    );
    let placeholder_count = one_shot_sql.matches('?').count();
    assert!(
        placeholder_count > SQLITE_MAX_VARIABLE_NUMBER,
        "the pre-fix one-shot hydration query would exceed frankensqlite's bind limit"
    );

    // One full chunk plus a 17-row tail.
    let row_total = SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17;
    let ranked_rows: Vec<(i64, f64)> = (0..row_total)
        .map(|idx| (idx as i64, idx as f64))
        .collect();
    let chunk_sizes: Vec<usize> = SearchClient::sqlite_fts5_hydrate_row_chunks(&ranked_rows)
        .map(|chunk| chunk.len())
        .collect();

    assert_eq!(
        chunk_sizes,
        vec![SQLITE_FTS5_HYDRATE_PARAM_CHUNK, 17],
        "large fallback pages must hydrate in bounded chunks while preserving rank windows"
    );
    assert!(
        !chunk_sizes
            .iter()
            .any(|&len| len > SQLITE_MAX_VARIABLE_NUMBER),
        "every hydration chunk must fit under frankensqlite's bind-variable ceiling"
    );
}
10887
/// Fallback content hydration must pick the conversation that matches the
/// requested source, not merely the source path.
///
/// Two conversations share `/tmp/shared-fallback.jsonl` and both have a
/// message at index 2. One has an empty `source_id` with origin host `devbox`;
/// the other is `local`. Hydrating the key `("devbox", path, 2)` must return
/// the content stored for the `devbox` conversation.
#[test]
fn tantivy_fallback_hydration_narrows_by_normalized_source_before_message_lookup() -> Result<()>
{
    const SHARED_PATH: &str = "/tmp/shared-fallback.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            source_id TEXT,
            origin_host TEXT,
            source_path TEXT NOT NULL
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER NOT NULL,
            content TEXT NOT NULL,
            UNIQUE(conversation_id, idx)
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
    )?;

    // Conversation 1: blank source id, identified only by its origin host.
    db.execute(
        "INSERT INTO conversations(id, source_id, origin_host, source_path)
         VALUES(1, '', 'devbox', '/tmp/shared-fallback.jsonl')",
    )?;
    // Conversation 2: same path, but a local source.
    db.execute(
        "INSERT INTO conversations(id, source_id, origin_host, source_path)
         VALUES(2, 'local', NULL, '/tmp/shared-fallback.jsonl')",
    )?;
    db.execute(
        "INSERT INTO messages(id, conversation_id, idx, content)
         VALUES(10, 1, 2, 'remote fallback content')",
    )?;
    db.execute(
        "INSERT INTO messages(id, conversation_id, idx, content)
         VALUES(20, 2, 2, 'local content must not win')",
    )?;

    let client = SearchClient {
        reader: None,
        sqlite_path: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let fallback_key = (String::from("devbox"), String::from(SHARED_PATH), 2);
    let (_, by_fallback_key) =
        client.hydrate_tantivy_hit_contents(&[], std::slice::from_ref(&fallback_key))?;

    let hydrated_content = by_fallback_key.get(&fallback_key).map(String::as_str);
    assert_eq!(hydrated_content, Some("remote fallback content"));

    Ok(())
}
10957
/// `hydrate_message_content_by_conversation` must return content only for the
/// `(conversation_id, idx)` pairs it was asked for.
///
/// Conversation 1 holds eight messages and conversation 2 holds one. Four
/// pairs are requested, one of which (`(1, 99)`) does not exist, so exactly
/// three entries are expected back.
#[test]
fn exact_content_hydration_returns_only_requested_message_indices() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER NOT NULL,
            content TEXT NOT NULL,
            UNIQUE(conversation_id, idx)
        );",
    )?;

    // Eight rows for conversation 1, more than the test will request.
    for idx in 0..8 {
        let insert_sql = format!(
            "INSERT INTO messages(conversation_id, idx, content)
             VALUES(1, {idx}, 'conversation one row {idx}')"
        );
        db.execute(&insert_sql)?;
    }
    db.execute(
        "INSERT INTO messages(conversation_id, idx, content)
         VALUES(2, 0, 'conversation two row 0')",
    )?;

    // Unordered request spanning both conversations, plus one missing index.
    let requested = [(1, 6), (1, 2), (2, 0), (1, 99)];
    let by_key = hydrate_message_content_by_conversation(&db, &requested)?;

    assert_eq!(by_key.len(), 3);
    assert_eq!(
        by_key.get(&(1, 2)).map(String::as_str),
        Some("conversation one row 2")
    );
    assert_eq!(
        by_key.get(&(1, 6)).map(String::as_str),
        Some("conversation one row 6")
    );
    assert_eq!(
        by_key.get(&(2, 0)).map(String::as_str),
        Some("conversation two row 0")
    );
    assert!(!by_key.contains_key(&(1, 99)));

    Ok(())
}
11002
/// With no Tantivy reader, a SQLite-backed hit must carry a snippet equal to
/// `snippet_from_content` applied to the hit's own content.
///
/// A single message is indexed; searching for `delta` must return it with a
/// snippet that contains the matched word.
#[test]
fn sqlite_backend_generates_snippet_from_content() -> Result<()> {
    const SCHEMA_SQL: &str = "CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA_SQL)?;

    db.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'snippet title', '/tmp/snippet.jsonl')",
    )?;
    db.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'alpha beta gamma delta epsilon zeta eta theta', 42)")?;

    // The FTS row carries the same text as the message row.
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            1_i64,
            "alpha beta gamma delta epsilon zeta eta theta",
            "snippet title",
            "codex",
            "/ws",
            "/tmp/snippet.jsonl",
            42_i64
        ],
    )?;

    let client = SearchClient {
        reader: None,
        sqlite_path: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let found = client.search("delta", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only_hit = &found[0];
    assert_eq!(only_hit.snippet, snippet_from_content(&only_hit.content));
    assert!(only_hit.snippet.contains("delta"));

    Ok(())
}
11083
/// `browse_by_date` on the SQLite backend must honour the source filter.
///
/// One conversation is stored with the padded source id `' local '`, the other
/// belongs to the `laptop` source with an origin host. Both
/// `SourceFilter::Local` and `SourceFilter::SourceId(" LOCAL ")` are expected
/// to return exactly one hit, reported with source id `local`.
#[test]
fn sqlite_backend_respects_source_filter() -> Result<()> {
    const FTS_INSERT_SQL: &str =
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)";

    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );",
    )?;

    db.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    db.execute("INSERT INTO sources(id, kind) VALUES('laptop', 'ssh')")?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute("INSERT INTO workspaces(id, path) VALUES(1, '/local')")?;
    db.execute("INSERT INTO workspaces(id, path) VALUES(2, '/remote')")?;

    // The local conversation's source id is deliberately padded with spaces.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, ' local ', NULL, 'local title', '/tmp/local.jsonl')",
    )?;
    db.execute("INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 2, 'laptop', 'dev@laptop', 'remote title', '/tmp/remote.jsonl')")?;
    db.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
    db.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;

    db.execute_compat(
        FTS_INSERT_SQL,
        params![
            1_i64,
            "auth token failure",
            "local title",
            "codex",
            "/local",
            "/tmp/local.jsonl",
            42_i64
        ],
    )?;
    db.execute_compat(
        FTS_INSERT_SQL,
        params![
            2_i64,
            "auth token failure",
            "remote title",
            "codex",
            "/remote",
            "/tmp/remote.jsonl",
            43_i64
        ],
    )?;

    let client = SearchClient {
        reader: None,
        sqlite_path: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let local_only = SearchFilters {
        source_filter: SourceFilter::Local,
        ..SearchFilters::default()
    };
    let local_hits = client.browse_by_date(local_only, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(local_hits.len(), 1);
    assert_eq!(local_hits[0].source_id, "local");

    // An explicit source id with different case and padding must resolve to
    // the same local conversation.
    let padded_id = SearchFilters {
        source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
        ..SearchFilters::default()
    };
    let source_id_hits = client.browse_by_date(padded_id, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(source_id_hits.len(), 1);
    assert_eq!(source_id_hits[0].source_id, "local");
    assert_eq!(source_id_hits[0].origin_kind, "local");

    Ok(())
}
11202
/// A conversation with a blank `source_id` but a populated `origin_host` must
/// be treated as remote by the SQLite backend.
///
/// The only conversation has source id `' '` and origin host `dev@laptop`.
/// Searching with `SourceFilter::Remote` and with
/// `SourceFilter::SourceId("dev@laptop")` must each return that one hit,
/// reported with source id `dev@laptop` and origin kind `remote`.
#[test]
fn sqlite_backend_remote_source_filter_matches_blank_source_id_with_origin_host() -> Result<()>
{
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );",
    )?;

    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // No `sources` or `workspaces` rows: the conversation has a blank source
    // id and a NULL workspace.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, ' ', 'dev@laptop', 'remote title', '/tmp/remote-filter.jsonl')",
    )?;
    db.execute(
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(1, 1, 0, 'remote filter proof', 42)",
    )?;
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
        params![
            1_i64,
            "remote filter proof",
            "remote title",
            "codex",
            "/tmp/remote-filter.jsonl",
            42_i64
        ],
    )?;

    let client = SearchClient {
        reader: None,
        sqlite_path: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let remote_only = SearchFilters {
        source_filter: SourceFilter::Remote,
        ..Default::default()
    };
    let remote_hits = client.search("remote", remote_only, 5, 0, FieldMask::FULL)?;
    assert_eq!(remote_hits.len(), 1);
    let remote_hit = &remote_hits[0];
    assert_eq!(remote_hit.source_id, "dev@laptop");
    assert_eq!(remote_hit.origin_kind, "remote");
    assert_eq!(remote_hit.origin_host.as_deref(), Some("dev@laptop"));

    // Filtering by the origin host as a source id must find the same row.
    let by_host_id = SearchFilters {
        source_filter: SourceFilter::SourceId("dev@laptop".into()),
        ..Default::default()
    };
    let source_hits = client.search("remote", by_host_id, 5, 0, FieldMask::FULL)?;
    assert_eq!(source_hits.len(), 1);
    let source_hit = &source_hits[0];
    assert_eq!(source_hit.source_id, "dev@laptop");
    assert_eq!(source_hit.origin_kind, "remote");

    Ok(())
}
11308
// Checks that a workspace filter containing only the empty string selects the
// conversation whose workspace_id is NULL and leaves out the conversation bound
// to the '/named' workspace. Everything runs against an in-memory SQLite database.
11309 #[test]
11310 fn sqlite_backend_workspace_filter_matches_null_workspace_as_empty_string() -> Result<()> {
11311 let conn = Connection::open(":memory:")?;
// Schema: metadata tables plus an FTS5 virtual table declared with content=''
// and the porter tokenizer.
11312 conn.execute_batch(
11313 "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11314 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
11315 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
11316 CREATE TABLE conversations (
11317 id INTEGER PRIMARY KEY,
11318 agent_id INTEGER,
11319 workspace_id INTEGER,
11320 source_id TEXT,
11321 origin_host TEXT,
11322 title TEXT,
11323 source_path TEXT
11324 );
11325 CREATE TABLE messages (
11326 id INTEGER PRIMARY KEY,
11327 conversation_id INTEGER,
11328 idx INTEGER,
11329 content TEXT,
11330 created_at INTEGER
11331 );
11332 CREATE VIRTUAL TABLE fts_messages USING fts5(
11333 content,
11334 title,
11335 agent,
11336 workspace,
11337 source_path,
11338 created_at UNINDEXED,
11339 content='',
11340 tokenize='porter'
11341 );",
11342 )?;
11343 conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
11344 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11345 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/named')")?;
// Conversation 1 has no workspace (workspace_id NULL).
11346 conn.execute(
11348 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 'null workspace', '/tmp/null-workspace.jsonl')",
11349 )?;
// Conversation 2 references workspace 1 ('/named').
11350 conn.execute(
11352 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'named workspace', '/tmp/named-workspace.jsonl')",
11353 )?;
// Both messages carry identical content; they differ only in conversation and timestamp.
11354 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
11355 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;
// FTS row for message 1: the workspace column is written as a literal NULL.
11356 conn.execute_compat(
11357 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11358 VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
11359 params![
11360 1_i64,
11361 "auth token failure",
11362 "null workspace",
11363 "codex",
11364 "/tmp/null-workspace.jsonl",
11365 42_i64
11366 ],
11367 )?;
// FTS row for message 2: the workspace column is bound to '/named'.
11368 conn.execute_compat(
11369 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11370 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11371 params![
11372 2_i64,
11373 "auth token failure",
11374 "named workspace",
11375 "codex",
11376 "/named",
11377 "/tmp/named-workspace.jsonl",
11378 43_i64
11379 ],
11380 )?;
11381
// Client built by hand with no index reader (reader: None); only the SQLite
// connection is populated, which per the test name is the backend under test.
11382 let client = SearchClient {
11383 reader: None,
11384 sqlite: Mutex::new(Some(SendConnection(conn))),
11385 sqlite_path: None,
11386 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11387 reload_on_search: true,
11388 last_reload: Mutex::new(None),
11389 last_generation: Mutex::new(None),
11390 reload_epoch: Arc::new(AtomicU64::new(0)),
11391 warm_tx: None,
11392 _warm_handle: None,
11393 metrics: Metrics::default(),
11394 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11395 semantic: Mutex::new(None),
11396 last_tantivy_total_count: Mutex::new(None),
11397 };
11398
// Search for "auth" restricted to the empty-string workspace.
// NOTE(review): the numeric arguments 5 and 0 are presumably limit and offset — confirm against `search`.
11399 let hits = client.search(
11400 "auth",
11401 SearchFilters {
11402 workspaces: HashSet::from_iter([String::new()]),
11403 ..SearchFilters::default()
11404 },
11405 5,
11406 0,
11407 FieldMask::FULL,
11408 )?;
// Only the NULL-workspace conversation may come back, reported with workspace "".
11409 assert_eq!(hits.len(), 1);
11410 assert_eq!(hits[0].workspace, "");
11411 assert_eq!(hits[0].source_path, "/tmp/null-workspace.jsonl");
11412
11413 Ok(())
11414 }
11415
// Checks that `browse_by_date` returns a conversation whose workspace_id and
// source_id are both NULL when filtering for the empty workspace and the local
// source, and that the hit is reported as workspace "" / source "local".
11416 #[test]
11417 fn browse_by_date_treats_null_workspace_and_source_as_local() -> Result<()> {
11418 let conn = Connection::open(":memory:")?;
// Schema without any FTS table; the workspaces and sources tables are created but stay empty.
11419 conn.execute_batch(
11420 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11421 CREATE TABLE conversations (
11422 id INTEGER PRIMARY KEY,
11423 agent_id INTEGER NOT NULL,
11424 workspace_id INTEGER,
11425 source_id TEXT,
11426 origin_host TEXT,
11427 title TEXT,
11428 source_path TEXT NOT NULL
11429 );
11430 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11431 CREATE TABLE messages (
11432 id INTEGER PRIMARY KEY,
11433 conversation_id INTEGER NOT NULL,
11434 idx INTEGER,
11435 content TEXT NOT NULL,
11436 created_at INTEGER
11437 );
11438 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11439 )?;
11440 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// The single conversation has NULL workspace_id, NULL source_id and NULL origin_host.
11441 conn.execute(
11442 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11443 VALUES(1, 1, NULL, NULL, NULL, 'browse title', '/tmp/browse.jsonl')",
11444 )?;
11445 conn.execute(
11446 "INSERT INTO messages(id, conversation_id, idx, content, created_at)
11447 VALUES(1, 1, 0, 'browse auth token failure', 123)",
11448 )?;
11449
// Client built by hand with no index reader; only the SQLite connection is populated.
11450 let client = SearchClient {
11451 reader: None,
11452 sqlite: Mutex::new(Some(SendConnection(conn))),
11453 sqlite_path: None,
11454 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11455 reload_on_search: true,
11456 last_reload: Mutex::new(None),
11457 last_generation: Mutex::new(None),
11458 reload_epoch: Arc::new(AtomicU64::new(0)),
11459 warm_tx: None,
11460 _warm_handle: None,
11461 metrics: Metrics::default(),
11462 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11463 semantic: Mutex::new(None),
11464 last_tantivy_total_count: Mutex::new(None),
11465 };
11466
// Filter on the empty-string workspace and the local source at the same time.
// NOTE(review): the positional arguments 5, 0 and true are presumably limit,
// offset and a sort-direction flag — confirm against `browse_by_date`.
11467 let hits = client.browse_by_date(
11468 SearchFilters {
11469 workspaces: HashSet::from_iter([String::new()]),
11470 source_filter: SourceFilter::Local,
11471 ..SearchFilters::default()
11472 },
11473 5,
11474 0,
11475 true,
11476 FieldMask::FULL,
11477 )?;
// The NULL columns must surface as workspace "" and source/origin "local".
11478 assert_eq!(hits.len(), 1);
11479 assert_eq!(hits[0].workspace, "");
11480 assert_eq!(hits[0].source_id, "local");
11481 assert_eq!(hits[0].origin_kind, "local");
11482
11483 Ok(())
11484 }
11485
// Checks that hydrating semantic hits with a field mask that omits content still
// yields a non-empty snippet per hit and distinct content hashes for two messages
// that share a long common prefix.
11486 #[test]
11487 fn hydrate_semantic_hits_with_ids_snippet_only_uses_full_content_for_snippets_and_identity()
11488 -> Result<()> {
11489 let conn = Connection::open(":memory:")?;
// Schema for the hydration queries; the workspaces and sources tables stay empty.
11490 conn.execute_batch(
11491 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11492 CREATE TABLE conversations (
11493 id INTEGER PRIMARY KEY,
11494 agent_id INTEGER NOT NULL,
11495 workspace_id INTEGER,
11496 source_id TEXT,
11497 origin_host TEXT,
11498 title TEXT,
11499 source_path TEXT NOT NULL,
11500 started_at INTEGER
11501 );
11502 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11503 CREATE TABLE messages (
11504 id INTEGER PRIMARY KEY,
11505 conversation_id INTEGER NOT NULL,
11506 idx INTEGER,
11507 role TEXT,
11508 content TEXT NOT NULL,
11509 created_at INTEGER
11510 );
11511 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11512 )?;
11513 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11514 conn.execute(
11515 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11516 VALUES(1, 1, NULL, 'local', NULL, 'semantic title', '/tmp/semantic.jsonl', 100)",
11517 )?;
// Two message bodies that are identical for the first 32 repetitions of
// "shared-prefix " and differ only in the tail.
// NOTE(review): the prefix is presumably long enough to exceed any snippet or
// preview truncation, so a hash over truncated text would collide — confirm.
11518 let shared_prefix = "shared-prefix ".repeat(32);
11519 let first = format!("{shared_prefix}first unique semantic tail");
11520 let second = format!("{shared_prefix}second unique semantic tail");
// Message 1 (idx 0, created_at 101) in conversation 1.
11521 conn.execute_with_params(
11522 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11523 VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
11524 &[
11525 fsqlite_types::value::SqliteValue::Integer(1),
11526 fsqlite_types::value::SqliteValue::Integer(0),
11527 fsqlite_types::value::SqliteValue::Text(first.clone().into()),
11528 fsqlite_types::value::SqliteValue::Integer(101),
11529 ],
11530 )?;
// Message 2 (idx 1, created_at 102) in the same conversation.
11531 conn.execute_with_params(
11532 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11533 VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
11534 &[
11535 fsqlite_types::value::SqliteValue::Integer(2),
11536 fsqlite_types::value::SqliteValue::Integer(1),
11537 fsqlite_types::value::SqliteValue::Text(second.clone().into()),
11538 fsqlite_types::value::SqliteValue::Integer(102),
11539 ],
11540 )?;
11541
// Client built by hand with no index reader; only the SQLite connection is populated.
11542 let client = SearchClient {
11543 reader: None,
11544 sqlite: Mutex::new(Some(SendConnection(conn))),
11545 sqlite_path: None,
11546 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11547 reload_on_search: true,
11548 last_reload: Mutex::new(None),
11549 last_generation: Mutex::new(None),
11550 reload_epoch: Arc::new(AtomicU64::new(0)),
11551 warm_tx: None,
11552 _warm_handle: None,
11553 metrics: Metrics::default(),
11554 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11555 semantic: Mutex::new(None),
11556 last_tantivy_total_count: Mutex::new(None),
11557 };
11558
// Hydrate both messages from synthetic vector results.
// NOTE(review): the leading `false` in FieldMask::new presumably switches the
// content field off while keeping snippets on — confirm against FieldMask::new.
11559 let hits = client.hydrate_semantic_hits_with_ids(
11560 &[
11561 VectorSearchResult {
11562 message_id: 1,
11563 chunk_idx: 0,
11564 score: 0.9,
11565 },
11566 VectorSearchResult {
11567 message_id: 2,
11568 chunk_idx: 0,
11569 score: 0.8,
11570 },
11571 ],
11572 FieldMask::new(false, true, true, true),
11573 )?;
// Content is withheld, snippets are present, and the two hits keep distinct hashes.
11574 assert_eq!(hits.len(), 2);
11575 assert!(hits.iter().all(|(_, hit)| hit.content.is_empty()));
11576 assert!(hits.iter().all(|(_, hit)| !hit.snippet.is_empty()));
11577 assert_ne!(hits[0].1.content_hash, hits[1].1.content_hash);
11578
11579 Ok(())
11580 }
11581
// Checks that a conversation whose stored source_id is ' local ' (with
// surrounding spaces) is hydrated with source_id "local" and origin_kind "local".
11582 #[test]
11583 fn hydrate_semantic_hits_with_ids_normalizes_trimmed_local_source_metadata() -> Result<()> {
11584 let conn = Connection::open(":memory:")?;
// Schema for the hydration queries; the workspaces and sources tables stay empty.
11585 conn.execute_batch(
11586 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11587 CREATE TABLE conversations (
11588 id INTEGER PRIMARY KEY,
11589 agent_id INTEGER NOT NULL,
11590 workspace_id INTEGER,
11591 source_id TEXT,
11592 origin_host TEXT,
11593 title TEXT,
11594 source_path TEXT NOT NULL,
11595 started_at INTEGER
11596 );
11597 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11598 CREATE TABLE messages (
11599 id INTEGER PRIMARY KEY,
11600 conversation_id INTEGER NOT NULL,
11601 idx INTEGER,
11602 role TEXT,
11603 content TEXT NOT NULL,
11604 created_at INTEGER
11605 );
11606 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11607 )?;
11608 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// The stored source_id carries leading and trailing whitespace; origin_host is NULL.
11609 conn.execute(
11610 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11611 VALUES(1, 1, NULL, ' local ', NULL, 'trimmed local semantic', '/tmp/trimmed-local-semantic.jsonl', 100)",
11612 )?;
11613 conn.execute_with_params(
11614 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11615 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
11616 &[
11617 fsqlite_types::value::SqliteValue::Integer(1),
11618 fsqlite_types::value::SqliteValue::Text("trimmed local semantic body".into()),
11619 ],
11620 )?;
11621
// Client built by hand with no index reader; only the SQLite connection is populated.
11622 let client = SearchClient {
11623 reader: None,
11624 sqlite: Mutex::new(Some(SendConnection(conn))),
11625 sqlite_path: None,
11626 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11627 reload_on_search: true,
11628 last_reload: Mutex::new(None),
11629 last_generation: Mutex::new(None),
11630 reload_epoch: Arc::new(AtomicU64::new(0)),
11631 warm_tx: None,
11632 _warm_handle: None,
11633 metrics: Metrics::default(),
11634 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11635 semantic: Mutex::new(None),
11636 last_tantivy_total_count: Mutex::new(None),
11637 };
11638
// Hydrate message 1 from a single synthetic vector result.
11639 let hits = client.hydrate_semantic_hits_with_ids(
11640 &[VectorSearchResult {
11641 message_id: 1,
11642 chunk_idx: 0,
11643 score: 0.9,
11644 }],
11645 FieldMask::new(false, true, true, true),
11646 )?;
// The padded source id must come back as plain "local".
11647 assert_eq!(hits.len(), 1);
11648 assert_eq!(hits[0].1.source_id, "local");
11649 assert_eq!(hits[0].1.origin_kind, "local");
11650
11651 Ok(())
11652 }
11653
// Checks that a conversation with source_id 'laptop' and origin_host 'dev@laptop'
// is hydrated as a remote hit even though the sources table has no row for it.
11654 #[test]
11655 fn hydrate_semantic_hits_with_ids_preserves_remote_origin_without_source_row() -> Result<()> {
11656 let conn = Connection::open(":memory:")?;
// Schema for the hydration queries; the sources table is created but never populated.
11657 conn.execute_batch(
11658 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11659 CREATE TABLE conversations (
11660 id INTEGER PRIMARY KEY,
11661 agent_id INTEGER NOT NULL,
11662 workspace_id INTEGER,
11663 source_id TEXT,
11664 origin_host TEXT,
11665 title TEXT,
11666 source_path TEXT NOT NULL,
11667 started_at INTEGER
11668 );
11669 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11670 CREATE TABLE messages (
11671 id INTEGER PRIMARY KEY,
11672 conversation_id INTEGER NOT NULL,
11673 idx INTEGER,
11674 role TEXT,
11675 content TEXT NOT NULL,
11676 created_at INTEGER
11677 );
11678 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11679 )?;
11680 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// Conversation with a non-local source id and an explicit origin host.
11681 conn.execute(
11682 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11683 VALUES(1, 1, NULL, 'laptop', 'dev@laptop', 'remote semantic', '/tmp/remote-semantic.jsonl', 100)",
11684 )?;
11685 conn.execute_with_params(
11686 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11687 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
11688 &[
11689 fsqlite_types::value::SqliteValue::Integer(1),
11690 fsqlite_types::value::SqliteValue::Text("remote semantic body".into()),
11691 ],
11692 )?;
11693
// Client built by hand with no index reader; only the SQLite connection is populated.
11694 let client = SearchClient {
11695 reader: None,
11696 sqlite: Mutex::new(Some(SendConnection(conn))),
11697 sqlite_path: None,
11698 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11699 reload_on_search: true,
11700 last_reload: Mutex::new(None),
11701 last_generation: Mutex::new(None),
11702 reload_epoch: Arc::new(AtomicU64::new(0)),
11703 warm_tx: None,
11704 _warm_handle: None,
11705 metrics: Metrics::default(),
11706 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11707 semantic: Mutex::new(None),
11708 last_tantivy_total_count: Mutex::new(None),
11709 };
11710
// Hydrate message 1 from a single synthetic vector result.
11711 let hits = client.hydrate_semantic_hits_with_ids(
11712 &[VectorSearchResult {
11713 message_id: 1,
11714 chunk_idx: 0,
11715 score: 0.9,
11716 }],
11717 FieldMask::new(false, true, true, true),
11718 )?;
// Source id and host are kept as stored, and the origin kind is reported as "remote".
11719 assert_eq!(hits.len(), 1);
11720 assert_eq!(hits[0].1.source_id, "laptop");
11721 assert_eq!(hits[0].1.origin_kind, "remote");
11722 assert_eq!(hits[0].1.origin_host.as_deref(), Some("dev@laptop"));
11723
11724 Ok(())
11725 }
11726
// Checks that two hits sharing title, source path, line number and timestamp are
// resolved to different messages when only their content hashes differ.
11727 #[test]
11728 fn resolve_semantic_doc_ids_for_hits_distinguishes_same_source_path_line_by_content_hash()
11729 -> Result<()> {
11730 let conn = Connection::open(":memory:")?;
// Schema without a workspaces table; the sources table stays empty.
11731 conn.execute_batch(
11732 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11733 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11734 CREATE TABLE conversations (
11735 id INTEGER PRIMARY KEY,
11736 agent_id INTEGER NOT NULL,
11737 workspace_id INTEGER,
11738 source_id TEXT,
11739 origin_host TEXT,
11740 title TEXT,
11741 source_path TEXT NOT NULL
11742 );
11743 CREATE TABLE messages (
11744 id INTEGER PRIMARY KEY,
11745 conversation_id INTEGER NOT NULL,
11746 idx INTEGER,
11747 role TEXT,
11748 content TEXT NOT NULL,
11749 created_at INTEGER
11750 );",
11751 )?;
11752 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// Two conversations that are identical apart from their ids (same title, same source_path).
11753 conn.execute(
11754 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11755 VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
11756 )?;
11757 conn.execute(
11758 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11759 VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
11760 )?;
// Each conversation gets one message at idx 0 / created_at 100; only the text differs.
11761 let first = "same prefix first tail".to_string();
11762 let second = "same prefix second tail".to_string();
11763 conn.execute_with_params(
11764 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11765 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
11766 &[
11767 fsqlite_types::value::SqliteValue::Integer(11),
11768 fsqlite_types::value::SqliteValue::Integer(1),
11769 fsqlite_types::value::SqliteValue::Text(first.clone().into()),
11770 ],
11771 )?;
11772 conn.execute_with_params(
11773 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11774 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
11775 &[
11776 fsqlite_types::value::SqliteValue::Integer(22),
11777 fsqlite_types::value::SqliteValue::Integer(2),
11778 fsqlite_types::value::SqliteValue::Text(second.clone().into()),
11779 ],
11780 )?;
11781
// Client built by hand with no index reader; only the SQLite connection is populated.
11782 let client = SearchClient {
11783 reader: None,
11784 sqlite: Mutex::new(Some(SendConnection(conn))),
11785 sqlite_path: None,
11786 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11787 reload_on_search: true,
11788 last_reload: Mutex::new(None),
11789 last_generation: Mutex::new(None),
11790 reload_epoch: Arc::new(AtomicU64::new(0)),
11791 warm_tx: None,
11792 _warm_handle: None,
11793 metrics: Metrics::default(),
11794 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11795 semantic: Mutex::new(None),
11796 last_tantivy_total_count: Mutex::new(None),
11797 };
11798
// Hit for the first message; the content hash is derived from the first body.
// NOTE(review): the row is stored with idx 0 while the hit uses line_number
// Some(1) — presumably line numbers are idx + 1; confirm against the resolver.
11799 let first_hit = SearchHit {
11800 title: "Shared Session".into(),
11801 snippet: String::new(),
11802 content: String::new(),
11803 content_hash: stable_hit_hash(
11804 &first,
11805 "/tmp/progressive-shared.jsonl",
11806 Some(1),
11807 Some(100),
11808 ),
11809 score: 0.0,
11810 source_path: "/tmp/progressive-shared.jsonl".into(),
11811 agent: "codex".into(),
11812 workspace: String::new(),
11813 workspace_original: None,
11814 created_at: Some(100),
11815 line_number: Some(1),
11816 match_type: MatchType::Exact,
11817 source_id: "local".into(),
11818 origin_kind: "local".into(),
11819 origin_host: None,
11820 conversation_id: None,
11821 };
// Hit for the second message: same fields, but hashed from the second body.
11822 let second_hit = SearchHit {
11823 title: "Shared Session".into(),
11824 snippet: String::new(),
11825 content: String::new(),
11826 content_hash: stable_hit_hash(
11827 &second,
11828 "/tmp/progressive-shared.jsonl",
11829 Some(1),
11830 Some(100),
11831 ),
11832 score: 0.0,
11833 source_path: "/tmp/progressive-shared.jsonl".into(),
11834 agent: "codex".into(),
11835 workspace: String::new(),
11836 workspace_original: None,
11837 created_at: Some(100),
11838 line_number: Some(1),
11839 match_type: MatchType::Exact,
11840 source_id: "local".into(),
11841 origin_kind: "local".into(),
11842 origin_host: None,
11843 conversation_id: None,
11844 };
11845
// Results are expected in input order, each mapped to its own message and doc id.
11846 let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
11847 assert_eq!(resolved.len(), 2);
11848 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
11849 assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
11850 assert_ne!(
11851 resolved[0].as_ref().map(|hit| hit.doc_id.as_str()),
11852 resolved[1].as_ref().map(|hit| hit.doc_id.as_str())
11853 );
11854
11855 Ok(())
11856 }
11857
// Checks that a conversation stored with a NULL title is hydrated with an empty
// title string.
11858 #[test]
11859 fn hydrate_semantic_hits_with_ids_keeps_missing_title_empty() -> Result<()> {
11860 let conn = Connection::open(":memory:")?;
// Schema for the hydration queries; the workspaces and sources tables stay empty.
11861 conn.execute_batch(
11862 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11863 CREATE TABLE conversations (
11864 id INTEGER PRIMARY KEY,
11865 agent_id INTEGER NOT NULL,
11866 workspace_id INTEGER,
11867 source_id TEXT,
11868 origin_host TEXT,
11869 title TEXT,
11870 source_path TEXT NOT NULL,
11871 started_at INTEGER
11872 );
11873 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11874 CREATE TABLE messages (
11875 id INTEGER PRIMARY KEY,
11876 conversation_id INTEGER NOT NULL,
11877 idx INTEGER,
11878 role TEXT,
11879 content TEXT NOT NULL,
11880 created_at INTEGER
11881 );
11882 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11883 )?;
11884 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// The conversation's title column is NULL.
11885 conn.execute(
11886 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11887 VALUES(1, 1, NULL, 'local', NULL, NULL, '/tmp/untitled-semantic.jsonl', 100)",
11888 )?;
11889 conn.execute_with_params(
11890 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11891 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
11892 &[
11893 fsqlite_types::value::SqliteValue::Integer(1),
11894 fsqlite_types::value::SqliteValue::Text("untitled semantic body".into()),
11895 ],
11896 )?;
11897
// Client built by hand with no index reader; only the SQLite connection is populated.
11898 let client = SearchClient {
11899 reader: None,
11900 sqlite: Mutex::new(Some(SendConnection(conn))),
11901 sqlite_path: None,
11902 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11903 reload_on_search: true,
11904 last_reload: Mutex::new(None),
11905 last_generation: Mutex::new(None),
11906 reload_epoch: Arc::new(AtomicU64::new(0)),
11907 warm_tx: None,
11908 _warm_handle: None,
11909 metrics: Metrics::default(),
11910 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11911 semantic: Mutex::new(None),
11912 last_tantivy_total_count: Mutex::new(None),
11913 };
11914
// Hydrate message 1 from a single synthetic vector result.
11915 let hits = client.hydrate_semantic_hits_with_ids(
11916 &[VectorSearchResult {
11917 message_id: 1,
11918 chunk_idx: 0,
11919 score: 0.9,
11920 }],
11921 FieldMask::new(false, true, true, true),
11922 )?;
// The NULL title must surface as "" rather than any substitute text.
11923 assert_eq!(hits.len(), 1);
11924 assert_eq!(hits[0].1.title, "");
11925
11926 Ok(())
11927 }
11928
// Checks that when two hits are identical in every provenance field, including
// the content hash, the conversation_id on each hit decides which message it
// resolves to.
11929 #[test]
11930 fn resolve_semantic_doc_ids_for_hits_prefers_conversation_id_over_ambiguous_provenance()
11931 -> Result<()> {
11932 let conn = Connection::open(":memory:")?;
// Schema without a workspaces table; the sources table stays empty.
11933 conn.execute_batch(
11934 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11935 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11936 CREATE TABLE conversations (
11937 id INTEGER PRIMARY KEY,
11938 agent_id INTEGER NOT NULL,
11939 workspace_id INTEGER,
11940 source_id TEXT,
11941 origin_host TEXT,
11942 title TEXT,
11943 source_path TEXT NOT NULL
11944 );
11945 CREATE TABLE messages (
11946 id INTEGER PRIMARY KEY,
11947 conversation_id INTEGER NOT NULL,
11948 idx INTEGER,
11949 role TEXT,
11950 content TEXT NOT NULL,
11951 created_at INTEGER
11952 );",
11953 )?;
11954 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// Two conversations that are identical apart from their ids.
11955 conn.execute(
11956 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11957 VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
11958 )?;
11959 conn.execute(
11960 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11961 VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
11962 )?;
// Both messages store the very same text, idx and created_at, so nothing but
// the owning conversation distinguishes them.
11963 let content = "same ambiguous content".to_string();
11964 conn.execute_with_params(
11965 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11966 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
11967 &[
11968 fsqlite_types::value::SqliteValue::Integer(11),
11969 fsqlite_types::value::SqliteValue::Integer(1),
11970 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
11971 ],
11972 )?;
11973 conn.execute_with_params(
11974 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11975 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
11976 &[
11977 fsqlite_types::value::SqliteValue::Integer(22),
11978 fsqlite_types::value::SqliteValue::Integer(2),
11979 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
11980 ],
11981 )?;
11982
// Client built by hand with no index reader; only the SQLite connection is populated.
11983 let client = SearchClient {
11984 reader: None,
11985 sqlite: Mutex::new(Some(SendConnection(conn))),
11986 sqlite_path: None,
11987 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11988 reload_on_search: true,
11989 last_reload: Mutex::new(None),
11990 last_generation: Mutex::new(None),
11991 reload_epoch: Arc::new(AtomicU64::new(0)),
11992 warm_tx: None,
11993 _warm_handle: None,
11994 metrics: Metrics::default(),
11995 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11996 semantic: Mutex::new(None),
11997 last_tantivy_total_count: Mutex::new(None),
11998 };
11999
// First hit is pinned to conversation 1.
12000 let first_hit = SearchHit {
12001 title: "Shared Session".into(),
12002 snippet: String::new(),
12003 content: String::new(),
12004 content_hash: stable_hit_hash(
12005 &content,
12006 "/tmp/progressive-conversation-id.jsonl",
12007 Some(1),
12008 Some(100),
12009 ),
12010 score: 0.0,
12011 source_path: "/tmp/progressive-conversation-id.jsonl".into(),
12012 agent: "codex".into(),
12013 workspace: String::new(),
12014 workspace_original: None,
12015 created_at: Some(100),
12016 line_number: Some(1),
12017 match_type: MatchType::Exact,
12018 source_id: "local".into(),
12019 origin_kind: "local".into(),
12020 origin_host: None,
12021 conversation_id: Some(1),
12022 };
// Second hit is a clone of the first with only conversation_id changed to 2.
12023 let second_hit = SearchHit {
12024 conversation_id: Some(2),
12025 ..first_hit.clone()
12026 };
12027
// Each hit must resolve to the message owned by its own conversation.
12028 let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
12029 assert_eq!(resolved.len(), 2);
12030 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
12031 assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
12032
12033 Ok(())
12034 }
12035
// Checks that a hit carrying source_id "local" resolves to a message whose
// conversation row stores a NULL source_id.
12036 #[test]
12037 fn resolve_semantic_doc_ids_for_hits_treats_null_source_as_local() -> Result<()> {
12038 let conn = Connection::open(":memory:")?;
// Schema without a workspaces table; the sources table stays empty.
12039 conn.execute_batch(
12040 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12041 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12042 CREATE TABLE conversations (
12043 id INTEGER PRIMARY KEY,
12044 agent_id INTEGER NOT NULL,
12045 workspace_id INTEGER,
12046 source_id TEXT,
12047 origin_host TEXT,
12048 title TEXT,
12049 source_path TEXT NOT NULL
12050 );
12051 CREATE TABLE messages (
12052 id INTEGER PRIMARY KEY,
12053 conversation_id INTEGER NOT NULL,
12054 idx INTEGER,
12055 role TEXT,
12056 content TEXT NOT NULL,
12057 created_at INTEGER
12058 );",
12059 )?;
12060 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// The conversation row has NULL for both source_id and origin_host.
12061 conn.execute(
12062 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12063 VALUES(1, 1, NULL, NULL, NULL, 'Legacy Local', '/tmp/legacy-local.jsonl')",
12064 )?;
12065 let content = "legacy local semantic message".to_string();
12066 conn.execute_with_params(
12067 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12068 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
12069 &[
12070 fsqlite_types::value::SqliteValue::Integer(11),
12071 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12072 ],
12073 )?;
12074
// Client built by hand with no index reader; only the SQLite connection is populated.
12075 let client = SearchClient {
12076 reader: None,
12077 sqlite: Mutex::new(Some(SendConnection(conn))),
12078 sqlite_path: None,
12079 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12080 reload_on_search: true,
12081 last_reload: Mutex::new(None),
12082 last_generation: Mutex::new(None),
12083 reload_epoch: Arc::new(AtomicU64::new(0)),
12084 warm_tx: None,
12085 _warm_handle: None,
12086 metrics: Metrics::default(),
12087 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12088 semantic: Mutex::new(None),
12089 last_tantivy_total_count: Mutex::new(None),
12090 };
12091
// The hit names the source as "local" and carries no conversation_id, so
// resolution has to go through the provenance fields.
12092 let hit = SearchHit {
12093 title: "Legacy Local".into(),
12094 snippet: String::new(),
12095 content: String::new(),
12096 content_hash: stable_hit_hash(&content, "/tmp/legacy-local.jsonl", Some(1), Some(100)),
12097 score: 0.0,
12098 source_path: "/tmp/legacy-local.jsonl".into(),
12099 agent: "codex".into(),
12100 workspace: String::new(),
12101 workspace_original: None,
12102 created_at: Some(100),
12103 line_number: Some(1),
12104 match_type: MatchType::Exact,
12105 source_id: "local".into(),
12106 origin_kind: "local".into(),
12107 origin_host: None,
12108 conversation_id: None,
12109 };
12110
// The NULL-source row must still be found.
12111 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
12112 assert_eq!(resolved.len(), 1);
12113 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
12114
12115 Ok(())
12116 }
12117
// Checks that a hit carrying source_id "local" resolves to a message whose
// conversation row stores the padded value ' local '.
12118 #[test]
12119 fn resolve_semantic_doc_ids_for_hits_matches_trimmed_local_source_id() -> Result<()> {
12120 let conn = Connection::open(":memory:")?;
// Schema without a workspaces table; the sources table stays empty.
12121 conn.execute_batch(
12122 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12123 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12124 CREATE TABLE conversations (
12125 id INTEGER PRIMARY KEY,
12126 agent_id INTEGER NOT NULL,
12127 workspace_id INTEGER,
12128 source_id TEXT,
12129 origin_host TEXT,
12130 title TEXT,
12131 source_path TEXT NOT NULL
12132 );
12133 CREATE TABLE messages (
12134 id INTEGER PRIMARY KEY,
12135 conversation_id INTEGER NOT NULL,
12136 idx INTEGER,
12137 role TEXT,
12138 content TEXT NOT NULL,
12139 created_at INTEGER
12140 );",
12141 )?;
12142 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// The stored source_id has surrounding whitespace.
12143 conn.execute(
12144 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12145 VALUES(1, 1, NULL, ' local ', NULL, 'Trimmed Local', '/tmp/trimmed-local.jsonl')",
12146 )?;
12147 let content = "trimmed local semantic message".to_string();
12148 conn.execute_with_params(
12149 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12150 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
12151 &[
12152 fsqlite_types::value::SqliteValue::Integer(11),
12153 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12154 ],
12155 )?;
12156
// Client built by hand with no index reader; only the SQLite connection is populated.
12157 let client = SearchClient {
12158 reader: None,
12159 sqlite: Mutex::new(Some(SendConnection(conn))),
12160 sqlite_path: None,
12161 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12162 reload_on_search: true,
12163 last_reload: Mutex::new(None),
12164 last_generation: Mutex::new(None),
12165 reload_epoch: Arc::new(AtomicU64::new(0)),
12166 warm_tx: None,
12167 _warm_handle: None,
12168 metrics: Metrics::default(),
12169 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12170 semantic: Mutex::new(None),
12171 last_tantivy_total_count: Mutex::new(None),
12172 };
12173
// The hit uses the unpadded "local" source id and carries no conversation_id.
12174 let hit = SearchHit {
12175 title: "Trimmed Local".into(),
12176 snippet: String::new(),
12177 content: String::new(),
12178 content_hash: stable_hit_hash(&content, "/tmp/trimmed-local.jsonl", Some(1), Some(100)),
12179 score: 0.0,
12180 source_path: "/tmp/trimmed-local.jsonl".into(),
12181 agent: "codex".into(),
12182 workspace: String::new(),
12183 workspace_original: None,
12184 created_at: Some(100),
12185 line_number: Some(1),
12186 match_type: MatchType::Exact,
12187 source_id: "local".into(),
12188 origin_kind: "local".into(),
12189 origin_host: None,
12190 conversation_id: None,
12191 };
12192
// The padded row must still match the unpadded hit.
12193 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
12194 assert_eq!(resolved.len(), 1);
12195 assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
12196
12197 Ok(())
12198 }
12199
// Checks the reverse of a padded database value: the conversation row stores
// 'local', while the hit carries a whitespace-only source_id with origin_kind
// "local". The hit must still resolve to the stored message.
12200 #[test]
12201 fn resolve_semantic_doc_ids_for_hits_normalizes_blank_local_source_id() -> Result<()> {
12202 let conn = Connection::open(":memory:")?;
// Schema without a workspaces table; the sources table stays empty.
12203 conn.execute_batch(
12204 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12205 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12206 CREATE TABLE conversations (
12207 id INTEGER PRIMARY KEY,
12208 agent_id INTEGER NOT NULL,
12209 workspace_id INTEGER,
12210 source_id TEXT,
12211 origin_host TEXT,
12212 title TEXT,
12213 source_path TEXT NOT NULL
12214 );
12215 CREATE TABLE messages (
12216 id INTEGER PRIMARY KEY,
12217 conversation_id INTEGER NOT NULL,
12218 idx INTEGER,
12219 role TEXT,
12220 content TEXT NOT NULL,
12221 created_at INTEGER
12222 );",
12223 )?;
12224 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
// The stored source_id is the plain value 'local'.
12225 conn.execute(
12226 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12227 VALUES(1, 1, NULL, 'local', NULL, 'Blank Local', '/tmp/blank-local.jsonl')",
12228 )?;
12229 let content = "blank local semantic message".to_string();
12230 conn.execute_with_params(
12231 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12232 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
12233 &[
12234 fsqlite_types::value::SqliteValue::Integer(11),
12235 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12236 ],
12237 )?;
12238
// Client built by hand with no index reader; only the SQLite connection is populated.
12239 let client = SearchClient {
12240 reader: None,
12241 sqlite: Mutex::new(Some(SendConnection(conn))),
12242 sqlite_path: None,
12243 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12244 reload_on_search: true,
12245 last_reload: Mutex::new(None),
12246 last_generation: Mutex::new(None),
12247 reload_epoch: Arc::new(AtomicU64::new(0)),
12248 warm_tx: None,
12249 _warm_handle: None,
12250 metrics: Metrics::default(),
12251 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12252 semantic: Mutex::new(None),
12253 last_tantivy_total_count: Mutex::new(None),
12254 };
12255
// Here the blank value sits on the hit side: source_id is whitespace only,
// origin_kind is "local", and there is no origin host or conversation_id.
12256 let hit = SearchHit {
12257 title: "Blank Local".into(),
12258 snippet: String::new(),
12259 content: String::new(),
12260 content_hash: stable_hit_hash(&content, "/tmp/blank-local.jsonl", Some(1), Some(100)),
12261 score: 0.0,
12262 source_path: "/tmp/blank-local.jsonl".into(),
12263 agent: "codex".into(),
12264 workspace: String::new(),
12265 workspace_original: None,
12266 created_at: Some(100),
12267 line_number: Some(1),
12268 match_type: MatchType::Exact,
12269 source_id: " ".into(),
12270 origin_kind: "local".into(),
12271 origin_host: None,
12272 conversation_id: None,
12273 };
12274
// The blank source id on the hit must be matched against the 'local' row.
12275 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
12276 assert_eq!(resolved.len(), 1);
12277 assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
12278
12279 Ok(())
12280 }
12281
/// Checks that `resolve_semantic_doc_ids_for_hits` maps a remote hit onto its message row
/// when the stored conversation row has a blank `source_id`.
///
/// The fixture stores one conversation with `source_id = ' '` and
/// `origin_host = 'dev@laptop'`, plus one message with id 11. The hit identifies itself with
/// `source_id = "dev@laptop"`, and the test expects message id 11 to be resolved.
/// NOTE(review): per the test name, the resolver is expected to fall back to `origin_host`
/// when the stored source id is blank — confirm against the resolver implementation.
#[test]
fn resolve_semantic_doc_ids_for_hits_infers_remote_source_from_origin_host_when_source_id_blank()
-> Result<()> {
    // In-memory database containing only the four tables created below.
    let conn = Connection::open(":memory:")?;
    conn.execute_batch(
        "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
role TEXT,
content TEXT NOT NULL,
created_at INTEGER
);",
    )?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // The conversation row has a whitespace-only source id but a populated origin host.
    conn.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, ' ', 'dev@laptop', 'Legacy Remote', '/tmp/legacy-remote.jsonl')",
    )?;
    let content = "legacy remote semantic message".to_string();
    // Single message (id 11, idx 0, created_at 100) belonging to conversation 1.
    conn.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &[
            fsqlite_types::value::SqliteValue::Integer(11),
            fsqlite_types::value::SqliteValue::Text(content.clone().into()),
        ],
    )?;

    // Client built by hand: no Tantivy reader, no semantic state, only the SQLite connection.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The hit carries no content or snippet text; its hash is computed from the stored message
    // content, and the `Some(1)` / `Some(100)` arguments mirror the hit's `line_number` and
    // `created_at` fields.
    let hit = SearchHit {
        title: "Legacy Remote".into(),
        snippet: String::new(),
        content: String::new(),
        content_hash: stable_hit_hash(&content, "/tmp/legacy-remote.jsonl", Some(1), Some(100)),
        score: 0.0,
        source_path: "/tmp/legacy-remote.jsonl".into(),
        agent: "codex".into(),
        workspace: String::new(),
        workspace_original: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "dev@laptop".into(),
        origin_kind: "remote".into(),
        origin_host: Some("dev@laptop".into()),
        conversation_id: None,
    };

    // Exactly one result slot is expected, and it must point at message id 11.
    let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
    assert_eq!(resolved.len(), 1);
    assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));

    Ok(())
}
12364
/// Checks that `browse_by_date` gives two messages distinct `content_hash` values even when
/// the returned hits carry a snippet but no content.
///
/// Both messages start with the same long prefix (`"shared-prefix "` repeated 48 times) and
/// differ only in their tails.
/// NOTE(review): per the test name, the hash is expected to come from the full stored
/// content rather than the snippet — confirm against `browse_by_date`.
#[test]
fn browse_by_date_snippet_only_uses_full_content_for_hit_identity() -> Result<()> {
    // In-memory database containing only the tables created below.
    let conn = Connection::open(":memory:")?;
    conn.execute_batch(
        "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
CREATE TABLE conversations (
id INTEGER PRIMARY KEY,
agent_id INTEGER NOT NULL,
workspace_id INTEGER,
source_id TEXT,
origin_host TEXT,
title TEXT,
source_path TEXT NOT NULL
);
CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
CREATE TABLE messages (
id INTEGER PRIMARY KEY,
conversation_id INTEGER NOT NULL,
idx INTEGER,
content TEXT NOT NULL,
created_at INTEGER
);
CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
    )?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
VALUES(1, 1, NULL, 'local', NULL, 'browse title', '/tmp/browse-shared.jsonl')",
    )?;
    // Two message bodies that share a long common prefix and differ only at the end.
    let shared_prefix = "shared-prefix ".repeat(48);
    let first = format!("{shared_prefix}first browse-only tail");
    let second = format!("{shared_prefix}second browse-only tail");
    // Message 1: idx 0, created_at 101.
    conn.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(?1, 1, ?2, ?3, ?4)",
        &[
            fsqlite_types::value::SqliteValue::Integer(1),
            fsqlite_types::value::SqliteValue::Integer(0),
            fsqlite_types::value::SqliteValue::Text(first.clone().into()),
            fsqlite_types::value::SqliteValue::Integer(101),
        ],
    )?;
    // Message 2: idx 1, created_at 102.
    conn.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
VALUES(?1, 1, ?2, ?3, ?4)",
        &[
            fsqlite_types::value::SqliteValue::Integer(2),
            fsqlite_types::value::SqliteValue::Integer(1),
            fsqlite_types::value::SqliteValue::Text(second.clone().into()),
            fsqlite_types::value::SqliteValue::Integer(102),
        ],
    )?;

    // Client built by hand: no Tantivy reader, no semantic state, only the SQLite connection.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // NOTE(review): the meaning of the positional arguments (10, 0, true) and of the four
    // `FieldMask::new` flags is not visible here; the assertions below imply that content is
    // masked out while snippets are kept — confirm against the signatures.
    let hits = client.browse_by_date(
        SearchFilters::default(),
        10,
        0,
        true,
        FieldMask::new(false, true, true, true),
    )?;
    assert_eq!(hits.len(), 2);
    assert!(hits.iter().all(|hit| hit.content.is_empty()));
    assert!(hits.iter().all(|hit| !hit.snippet.is_empty()));
    // Despite the shared prefix and the empty content, the two hits must stay distinguishable.
    assert_ne!(hits[0].content_hash, hits[1].content_hash);

    Ok(())
}
12449
/// End-to-end check that a client opened on a Tantivy index returns documents committed
/// after an earlier search has already been cached.
///
/// Flow: index "apple banana", search "app" (one hit, and the key is present in the "global"
/// cache shard), index "apricot", wait, search again and expect "apr" to find the new
/// document.
#[test]
fn cache_invalidates_on_new_data() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // First conversation: a single user message containing "apple banana".
    let conv1 = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("first".into()),
        workspace: None,
        source_path: dir.path().join("1.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1),
            content: "apple banana".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv1)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // The query "app" must find the only indexed message.
    let hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].content, "apple banana");

    // The search above must have stored its result under the "global" shard. The lock guard
    // is scoped so it is released before the next search.
    {
        let cache = client.prefix_cache.lock().unwrap();
        let shard = cache.shard_opt("global").unwrap();
        assert!(shard.contains(&client.cache_key("app", &SearchFilters::default())));
    }

    // Second conversation, committed while the client is already open.
    let conv2 = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("second".into()),
        workspace: None,
        source_path: dir.path().join("2.jsonl"),
        started_at: Some(2),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(2),
            content: "apricot".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv2)?;
    index.commit()?;

    // NOTE(review): presumably this pause outlasts the client's reload debounce so the next
    // search observes the new commit — confirm the interval against the reload logic.
    std::thread::sleep(std::time::Duration::from_millis(350));

    // The repeated "app" search only needs to succeed; its result is discarded.
    // NOTE(review): presumably it is what triggers the reload — confirm.
    let _hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    // The newly committed document must now be visible.
    let hits = client.search("apr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].content, "apricot");

    Ok(())
}
12540
/// Checks that `track_generation` keeps cached entries for the first generation it is told
/// about and leaves the cache without shards once a different generation is reported.
#[test]
fn track_generation_clears_cache_on_change() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let cached_hit = SearchHit {
        title: "hello world".into(),
        snippet: "hello".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        match_type: MatchType::Exact,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };

    // Holds the cache lock only long enough to report whether any shard exists.
    let no_shards = || client.prefix_cache.lock().unwrap().shards.is_empty();

    client.put_cache(
        "hello",
        &SearchFilters::default(),
        std::slice::from_ref(&cached_hit),
    );
    assert!(!no_shards());

    // First generation seen: the entry stored above must survive.
    client.track_generation(1);
    assert!(!no_shards());

    // A different generation must wipe the cache.
    client.track_generation(2);
    assert!(no_shards());
}
12598
/// Stores three entries, each under a different agent filter, in a cache whose entry cap is 2
/// and checks that the reported total cost never exceeds that cap.
/// NOTE(review): per the test name, distinct agent filters are expected to land in distinct
/// shards — confirm against the shard-naming logic.
#[test]
fn cache_total_cap_evicts_across_shards() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        // Entry cap of 2; the byte cap argument is 0.
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let hit = SearchHit {
        title: "a".into(),
        snippet: "a".into(),
        content: "a".into(),
        content_hash: stable_content_hash("a"),
        score: 1.0,
        source_path: "p".into(),
        agent: "agent1".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // `put_cache` only borrows the hits, so view the single hit as a one-element slice
    // instead of cloning it into a temporary `Vec`.
    let hits = std::slice::from_ref(&hit);

    // Same filter object reused with a different single agent for each insertion.
    let mut filters = SearchFilters::default();
    filters.agents.insert("agent1".into());
    client.put_cache("a", &filters, hits);
    filters.agents.clear();
    filters.agents.insert("agent2".into());
    client.put_cache("b", &filters, hits);
    filters.agents.clear();
    filters.agents.insert("agent3".into());
    client.put_cache("c", &filters, hits);

    // Three insertions against a cap of two: the total must have been trimmed.
    let stats = client.cache_stats();
    assert!(stats.total_cost <= stats.total_cap);
    assert_eq!(stats.total_cap, 2);
}
12652
/// Bumps each metric once and checks that `cache_stats` reports those counts together with
/// the configured entry cap, the "lru" policy label and zeroed prewarm counters.
#[test]
fn cache_stats_reflect_metrics() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // One hit, one miss, one shortfall and a single 10 ms reload.
    let recorder = &client.metrics;
    recorder.inc_cache_hits();
    recorder.inc_cache_miss();
    recorder.inc_cache_shortfall();
    recorder.record_reload(Duration::from_millis(10));

    let snapshot = client.cache_stats();
    assert_eq!(snapshot.cache_hits, 1);
    assert_eq!(snapshot.cache_miss, 1);
    assert_eq!(snapshot.cache_shortfall, 1);
    assert_eq!(snapshot.reloads, 1);
    assert_eq!(snapshot.reload_ms_total, 10);
    assert_eq!(snapshot.total_cap, *CACHE_TOTAL_CAP);
    assert_eq!(snapshot.eviction_policy, "lru");
    assert_eq!(snapshot.prewarm_scheduled, 0);
    assert_eq!(snapshot.prewarm_skipped_pressure, 0);

    // A default-constructed stats value carries the placeholder policy label.
    assert_eq!(CacheStats::default().eviction_policy, "unknown");
}
12689
/// Walks `maybe_schedule_adaptive_query_prewarm` through three situations: nothing cached
/// (no job), the exact query already cached (no job), and a longer query whose prefix is
/// cached (one job). It also checks the job's fields and that scheduling leaves the cache's
/// total cost unchanged.
#[test]
fn adaptive_query_prewarm_schedules_only_after_hot_prefix_cache_entry() {
    let (warm_sender, warm_receiver) = mpsc::unbounded();
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: Some(warm_sender),
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(10, 0)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let workspace_filters = {
        let mut scoped = SearchFilters::default();
        scoped.workspaces.insert("/tmp/cass-workspace".into());
        scoped
    };

    // Case 1: the cache is empty, so nothing may be queued.
    client.maybe_schedule_adaptive_query_prewarm("hel", &workspace_filters);
    assert!(
        warm_receiver.try_recv().is_err(),
        "cold prefixes should not schedule adaptive prewarm"
    );

    // Seed the cache with a result for "hel".
    let mut seeded = projected_minimal_fields_search_hit("hello title", "p");
    seeded.snippet = "hello".into();
    seeded.content = "hello world".into();
    seeded.content_hash = stable_content_hash(&seeded.content);
    client.put_cache("hel", &workspace_filters, std::slice::from_ref(&seeded));

    let cost_before_scheduling = client.cache_stats().total_cost;

    // Case 2: the very query that is already cached must not be queued again.
    client.maybe_schedule_adaptive_query_prewarm("hel", &workspace_filters);
    assert!(
        warm_receiver.try_recv().is_err(),
        "an exact cached query should not schedule redundant prewarm"
    );

    // Case 3: a longer query extending the cached prefix must produce exactly one job.
    client.maybe_schedule_adaptive_query_prewarm("hello", &workspace_filters);
    let queued = warm_receiver
        .try_recv()
        .expect("hot prefix should schedule adaptive prewarm");
    assert_eq!(queued.query, "hello");
    assert_eq!(queued.shard_name, "workspace:/tmp/cass-workspace");
    assert_eq!(
        queued.filters_fingerprint,
        filters_fingerprint(&workspace_filters)
    );

    let snapshot = client.cache_stats();
    assert_eq!(snapshot.prewarm_scheduled, 1);
    assert_eq!(snapshot.prewarm_skipped_pressure, 0);
    assert_eq!(
        snapshot.total_cost, cost_before_scheduling,
        "prewarm scheduling should not mutate result-cache contents"
    );
}
12746
/// Uses a cache whose byte cap equals the approximate size of one cached hit and checks that
/// an unrelated query is not counted as pressure-skipped, while the query "hello" (extending
/// the cached "hel") is neither queued nor scheduled and is counted as skipped exactly once.
#[test]
fn adaptive_query_prewarm_skips_when_cache_byte_cap_is_under_pressure() {
    let mut seeded = projected_minimal_fields_search_hit("hello title", "p");
    seeded.snippet = "hello".into();
    seeded.content = "hello world with enough content to consume the small byte budget".into();
    seeded.content_hash = stable_content_hash(&seeded.content);
    // Byte budget sized to exactly one cached copy of the hit above.
    let one_hit_budget = cached_hit_from(&seeded).approx_bytes();

    let (warm_sender, warm_receiver) = mpsc::unbounded();
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: Some(warm_sender),
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(10, one_hit_budget)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let unfiltered = SearchFilters::default();

    client.put_cache("hel", &unfiltered, std::slice::from_ref(&seeded));

    // A query with no cached prefix must not be attributed to byte pressure.
    client.maybe_schedule_adaptive_query_prewarm("zebra", &unfiltered);
    let skipped_after_cold_query = client.cache_stats().prewarm_skipped_pressure;
    assert_eq!(
        skipped_after_cold_query,
        0,
        "cold queries should not be counted as pressure-skipped prewarm jobs"
    );

    // A query extending the cached prefix is refused and recorded as skipped.
    client.maybe_schedule_adaptive_query_prewarm("hello", &unfiltered);
    assert!(
        warm_receiver.try_recv().is_err(),
        "prewarm should be disabled while cache byte pressure is high"
    );

    let snapshot = client.cache_stats();
    assert_eq!(snapshot.prewarm_scheduled, 0);
    assert_eq!(snapshot.prewarm_skipped_pressure, 1);
    assert!(snapshot.approx_bytes <= snapshot.byte_cap);
}
12793
/// Stores three queries in a cache capped at two entries and checks that at least one
/// eviction is counted, the total cost stays within the cap and byte usage is non-zero.
#[test]
fn cache_eviction_count_tracks_evictions() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let sample = SearchHit {
        title: "test".into(),
        snippet: "snippet".into(),
        content: "content".into(),
        content_hash: stable_content_hash("content"),
        score: 1.0,
        match_type: MatchType::Exact,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };

    // One more insertion than the cap allows.
    for query in ["query1", "query2", "query3"] {
        client.put_cache(
            query,
            &SearchFilters::default(),
            std::slice::from_ref(&sample),
        );
    }

    let snapshot = client.cache_stats();
    assert!(
        snapshot.eviction_count >= 1,
        "should have evicted at least 1 entry"
    );
    assert!(snapshot.total_cost <= 2, "should be at or below cap");
    assert!(snapshot.approx_bytes > 0, "should track bytes used");
}
12858
/// Pins `default_cache_byte_cap_for_available` at four points: unknown memory and 2 GiB both
/// yield the fallback, 64 GiB yields 512 MiB, and 256 GiB yields the ceiling constant.
#[test]
fn default_cache_byte_cap_scales_with_available_memory() {
    const GIB: u64 = 1 << 30;
    // Default cap for a host reporting the given number of GiB as available.
    let cap_for_gib = |available_gib: u64| {
        default_cache_byte_cap_for_available(Some(available_gib * GIB))
    };

    assert_eq!(
        default_cache_byte_cap_for_available(None),
        DEFAULT_CACHE_BYTE_CAP_FALLBACK
    );
    assert_eq!(
        cap_for_gib(2),
        DEFAULT_CACHE_BYTE_CAP_FALLBACK,
        "small hosts keep a conservative cache byte budget"
    );
    assert_eq!(
        cap_for_gib(64),
        512 * 1024 * 1024,
        "larger hosts get a proportionally larger cache byte budget"
    );
    assert_eq!(
        cap_for_gib(256),
        usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX),
        "large swarm hosts still have a bounded default cache budget"
    );
}
12883
/// Checks `cache_byte_cap_from_env_value` for three inputs on a 64 GiB host: an explicit "0"
/// is returned as 0, while an unparsable value and a missing value both yield the default cap.
#[test]
fn malformed_cache_byte_cap_env_uses_default_instead_of_disabling_guard() {
    const GIB: u64 = 1 << 30;
    let available = Some(64 * GIB);
    let default_cap = default_cache_byte_cap_for_available(available);

    // An explicit zero is honoured as given.
    assert_eq!(cache_byte_cap_from_env_value(Some("0"), available), 0);
    assert_eq!(
        cache_byte_cap_from_env_value(Some("not-a-number"), available),
        default_cap,
        "malformed env should keep the default memory guard active"
    );
    assert_eq!(cache_byte_cap_from_env_value(None, available), default_cap);
}
12899
/// Checks `cache_eviction_policy_from_env_value`: a missing or unrecognised value selects
/// LRU, and both the "s3-fifo" and "s3_fifo" spellings select S3-FIFO.
#[test]
fn cache_eviction_policy_env_defaults_to_lru_and_accepts_s3_fifo() {
    let unset = cache_eviction_policy_from_env_value(None);
    assert_eq!(unset, CacheEvictionPolicy::Lru);

    let unrecognised = cache_eviction_policy_from_env_value(Some("not-a-policy"));
    assert_eq!(
        unrecognised,
        CacheEvictionPolicy::Lru,
        "malformed env keeps the current LRU behavior"
    );

    // Hyphen and underscore spellings are both accepted.
    for spelling in ["s3-fifo", "s3_fifo"] {
        assert_eq!(
            cache_eviction_policy_from_env_value(Some(spelling)),
            CacheEvictionPolicy::S3Fifo
        );
    }
}
12920
/// Drives an S3-FIFO cache with one byte-heavy entry: the first `put` is rejected (total cost
/// 0, one ghost entry, one admission reject) and a second `put` with the same key is admitted
/// (total cost 1, no ghosts left, reject count still 1, bytes within the cap).
#[test]
fn s3_fifo_admission_rejects_one_off_byte_heavy_entries_then_admits_ghost_replay() {
    let body = "large".repeat(1_000);
    let body_hash = stable_content_hash(&body);
    let heavy_hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content: body,
        content_hash: body_hash,
        score: 1.0,
        match_type: MatchType::Exact,
        source_path: "large-path".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };
    let cached = cached_hit_from(&heavy_hit);
    let entry_bytes = cached.approx_bytes();
    let byte_cap = entry_bytes + 1_024;
    // Precondition: the entry exceeds the cap divided by the large-entry denominator.
    assert!(entry_bytes > byte_cap.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR));

    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::S3Fifo);
    let key: Arc<str> = Arc::from("large-query");

    // First sighting: rejected and recorded as a ghost.
    cache.put("global", Arc::clone(&key), vec![cached.clone()]);
    assert_eq!(
        cache.total_cost(),
        0,
        "first one-off large entry is not admitted"
    );
    assert_eq!(cache.ghost_entries(), 1);
    assert_eq!(cache.admission_rejects(), 1);

    // Second sighting of the same key: admitted, and the ghost record is consumed.
    cache.put("global", key, vec![cached]);
    assert_eq!(
        cache.total_cost(),
        1,
        "ghost replay admits the repeated query"
    );
    assert_eq!(cache.ghost_entries(), 0);
    assert!(cache.ghost_keys.is_empty());
    assert_eq!(cache.admission_rejects(), 1);
    assert!(cache.total_bytes() <= cache.byte_cap());
}
12971
/// Counterpart to the S3-FIFO admission test: under the LRU policy the same byte-heavy entry
/// is admitted on its first `put`, with no ghost entries and no admission rejects.
#[test]
fn lru_policy_keeps_admitting_large_entries_under_existing_caps() {
    let body = "large".repeat(1_000);
    let body_hash = stable_content_hash(&body);
    let heavy_hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content: body,
        content_hash: body_hash,
        score: 1.0,
        match_type: MatchType::Exact,
        source_path: "large-path".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };
    let cached = cached_hit_from(&heavy_hit);
    // Cap leaves 1 KiB of headroom above the single entry.
    let byte_cap = cached.approx_bytes() + 1_024;
    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::Lru);

    cache.put("global", Arc::<str>::from("large-query"), vec![cached]);

    assert_eq!(cache.total_cost(), 1);
    assert_eq!(cache.ghost_entries(), 0);
    assert_eq!(cache.admission_rejects(), 0);
    assert_eq!(cache.policy_label(), "lru");
}
13004
/// Stores three entries carrying 200 bytes of text each in a cache with a 100-byte cap and
/// a generous entry cap, then checks that evictions were counted and the byte cap is
/// reported back unchanged.
#[test]
fn cache_byte_cap_triggers_eviction() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        // 1000 entries allowed, but only 100 bytes.
        prefix_cache: Mutex::new(CacheShards::new(1000, 100)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // 50 + 50 + 100 bytes of text in title, snippet and content.
    let body = "c".repeat(100);
    let body_hash = stable_content_hash(&body);
    let bulky = SearchHit {
        title: "a".repeat(50),
        snippet: "b".repeat(50),
        content: body,
        content_hash: body_hash,
        score: 1.0,
        match_type: MatchType::Exact,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };

    for query in ["q1", "q2", "q3"] {
        client.put_cache(query, &SearchFilters::default(), std::slice::from_ref(&bulky));
    }

    let snapshot = client.cache_stats();
    assert!(
        snapshot.eviction_count >= 1,
        "byte cap should trigger evictions"
    );
    assert_eq!(snapshot.byte_cap, 100, "byte cap should be reported");
}
13059
/// Fills a 1 KiB cache with two small entries in shard "small" and one ~10 KB entry in shard
/// "large", then checks that both small entries remain, the large shard is absent or empty,
/// and total bytes are within the cap.
#[test]
fn cache_byte_pressure_evicts_byte_heavy_shard_before_small_entries() {
    // Builds a local, exact-match hit whose title and snippet both equal `label`.
    let make_hit = |label: &str, agent: &str, source_path: &str, content: String| SearchHit {
        title: label.into(),
        snippet: label.into(),
        content_hash: stable_content_hash(&content),
        content,
        score: 1.0,
        source_path: source_path.into(),
        agent: agent.into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let small_hit = make_hit("small", "a", "small-path", "small".to_owned());
    let large_hit = make_hit("large", "b", "large-path", "large".repeat(2_000));

    let mut cache = CacheShards::new(100, 1_024);
    for key in ["small-1", "small-2"] {
        cache.put(
            "small",
            Arc::<str>::from(key),
            vec![cached_hit_from(&small_hit)],
        );
    }
    // This single entry is far larger than the whole byte budget.
    cache.put(
        "large",
        Arc::<str>::from("large-1"),
        vec![cached_hit_from(&large_hit)],
    );

    let small_entries = cache.shard_opt("small").map(|shard| shard.len());
    assert_eq!(
        small_entries,
        Some(2),
        "byte pressure should preserve the small shard"
    );
    assert!(
        cache.shard_opt("large").is_none_or(|shard| shard.is_empty()),
        "oversized shard should be evicted first under byte pressure"
    );
    assert!(cache.total_bytes() <= cache.byte_cap());
}
13128
/// Inputs without any `*` must parse to `Exact` holding the lowercased term.
#[test]
fn wildcard_pattern_parse_exact() {
    let cases = [
        ("hello", "hello"),
        ("HELLO", "hello"),
        ("FooBar123", "foobar123"),
    ];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Exact(term.into())
        );
    }
}
13149
/// Inputs with only a trailing `*` must parse to `Prefix` holding the lowercased term.
#[test]
fn wildcard_pattern_parse_prefix() {
    let cases = [("foo*", "foo"), ("CONFIG*", "config"), ("test*", "test")];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Prefix(term.into())
        );
    }
}
13166
/// Inputs with only a leading `*` must parse to `Suffix` holding the lowercased term.
#[test]
fn wildcard_pattern_parse_suffix() {
    let cases = [("*foo", "foo"), ("*Error", "error"), ("*Handler", "handler")];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Suffix(term.into())
        );
    }
}
13183
/// Inputs wrapped in `*` on both sides must parse to `Substring` holding the lowercased term.
#[test]
fn wildcard_pattern_parse_substring() {
    let cases = [("*foo*", "foo"), ("*CONFIG*", "config"), ("*test*", "test")];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Substring(term.into())
        );
    }
}
13200
/// Boundary inputs for `FsCassWildcardPattern::parse`: star-only inputs, single-character
/// terms, and runs of several stars around a term.
#[test]
fn wildcard_pattern_parse_edge_cases() {
    let cases = [
        // Nothing but stars: an exact pattern with an empty term.
        ("*", FsCassWildcardPattern::Exact(String::new())),
        ("**", FsCassWildcardPattern::Exact(String::new())),
        ("***", FsCassWildcardPattern::Exact(String::new())),
        // One-character terms in each wildcard position.
        ("*a*", FsCassWildcardPattern::Substring("a".into())),
        ("a*", FsCassWildcardPattern::Prefix("a".into())),
        ("*a", FsCassWildcardPattern::Suffix("a".into())),
        // Repeated stars on both sides behave like a single star on each side.
        ("***foo***", FsCassWildcardPattern::Substring("foo".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }
}
13237
/// A `Suffix` pattern must render as `.*<term>$`.
#[test]
fn wildcard_pattern_to_regex_suffix() {
    let rendered = FsCassWildcardPattern::Suffix("foo".into()).to_regex();
    assert_eq!(rendered, Some(".*foo$".into()));
}
13244
/// A `Substring` pattern must render as `.*<term>.*`.
#[test]
fn wildcard_pattern_to_regex_substring() {
    let rendered = FsCassWildcardPattern::Substring("bar".into()).to_regex();
    assert_eq!(rendered, Some(".*bar.*".into()));
}
13250
/// `Exact` and `Prefix` patterns must not produce a regex at all.
#[test]
fn wildcard_pattern_to_regex_exact_prefix_none() {
    let regex_free = [
        FsCassWildcardPattern::Exact("foo".into()),
        FsCassWildcardPattern::Prefix("bar".into()),
    ];
    for pattern in regex_free {
        assert_eq!(pattern.to_regex(), None);
    }
}
13260
/// Pins the quality factor of every `MatchType` variant exercised here, from 1.0 for an
/// exact match down to 0.6 for an implicit wildcard.
#[test]
fn match_type_quality_factors() {
    let expected_factors = [
        (MatchType::Exact, 1.0),
        (MatchType::Prefix, 0.9),
        (MatchType::Suffix, 0.8),
        (MatchType::Substring, 0.7),
        (MatchType::ImplicitWildcard, 0.6),
    ];
    for (match_type, factor) in expected_factors {
        assert_eq!(match_type.quality_factor(), factor);
    }
}
13274
/// A single term maps to the match type implied by its asterisk placement.
#[test]
fn dominant_match_type_single_terms() {
    let cases = vec![
        ("hello", MatchType::Exact),
        ("hello*", MatchType::Prefix),
        ("*hello", MatchType::Suffix),
        ("*hello*", MatchType::Substring),
    ];

    for (query, expected) in cases {
        assert_eq!(dominant_match_type(query), expected);
    }
}
13283
/// With several terms, the reported match type follows the expectations below
/// (in every row it is the term with the most wildcarding that decides).
#[test]
fn dominant_match_type_multiple_terms() {
    let cases = vec![
        ("foo bar", MatchType::Exact),
        ("foo bar*", MatchType::Prefix),
        ("foo *bar", MatchType::Suffix),
        ("foo* *bar*", MatchType::Substring),
        ("foo *bar* baz", MatchType::Substring),
    ];

    for (query, expected) in cases {
        assert_eq!(dominant_match_type(query), expected);
    }
}
13294
/// Empty and whitespace-only queries are reported as `MatchType::Exact`.
#[test]
fn dominant_match_type_empty_query() {
    for blank in ["", " "].iter() {
        assert_eq!(dominant_match_type(blank), MatchType::Exact);
    }
}
13300
/// Regex metacharacters inside the pattern text are backslash-escaped.
#[test]
fn wildcard_pattern_to_regex_escapes_special_chars() {
    let cases = vec![
        (
            FsCassWildcardPattern::Suffix("foo.bar".into()),
            ".*foo\\.bar$",
        ),
        (
            FsCassWildcardPattern::Substring("a+b*c?".into()),
            ".*a\\+b\\*c\\?.*",
        ),
    ];

    for (pattern, expected) in cases {
        assert_eq!(pattern.to_regex(), Some(expected.into()));
    }
}
13312
/// Brackets, groups, alternation and anchors in the pattern text are escaped
/// rather than interpreted as regex syntax.
#[test]
fn wildcard_pattern_to_regex_escapes_complex_patterns() {
    let cases = vec![
        // Character class plus quantifier.
        (
            FsCassWildcardPattern::Suffix("test[0-9]+".into()),
            ".*test\\[0-9\\]\\+$",
        ),
        // Group with alternation.
        (
            FsCassWildcardPattern::Substring("(a|b)".into()),
            ".*\\(a\\|b\\).*",
        ),
        // Literal end anchor.
        (
            FsCassWildcardPattern::Substring("end$".into()),
            ".*end\\$.*",
        ),
        // Literal start anchor.
        (
            FsCassWildcardPattern::Substring("^start".into()),
            ".*\\^start.*",
        ),
    ];

    for (pattern, expected) in cases {
        assert_eq!(pattern.to_regex(), Some(expected.into()));
    }
}
13332
/// Separates tool markers that count as noise from those that do not.
#[test]
fn is_tool_invocation_noise_detects_noise() {
    // Markers naming a tool (with or without a description) are kept.
    let kept = [
        "[Tool: Bash]",
        "[Tool: Read]",
        "[Tool: Bash - Check status]",
        " [Tool: Grep - Search files] ",
    ];
    // Markers with no tool name, and the malformed variants, are noise.
    let noisy = ["[Tool:]", "[Tool: ]", "[tool]", "tool: Bash"];

    for text in kept.iter() {
        assert!(!is_tool_invocation_noise(text));
    }
    for text in noisy.iter() {
        assert!(is_tool_invocation_noise(text));
    }
}
13351
/// Tool markers that carry a path or a command are not classified as noise.
#[test]
fn is_tool_invocation_noise_allows_useful_content() {
    let useful = [
        "[Tool: Read - src/main.rs]",
        "[Tool: Bash - cargo test --lib]",
    ];

    for text in useful.iter() {
        assert!(!is_tool_invocation_noise(text));
    }
}
13358
/// Only the marker with an empty tool name is flagged; named markers pass.
#[test]
fn is_tool_invocation_noise_detects_tool_markers() {
    let named_markers = [
        "[Tool: Bash]",
        "[Tool: Read]",
        "[Tool: Bash - Check status]",
        " [Tool: Write - description] ",
    ];
    for text in named_markers.iter() {
        assert!(!is_tool_invocation_noise(text));
    }

    assert!(is_tool_invocation_noise("[Tool:]"));
}
13372
/// Two hits that differ only in snippet and score collapse into one, and the
/// surviving hit is the one scored 1.0.
#[test]
fn deduplicate_hits_removes_exact_dupes() {
    let body = "hello world";
    let stronger = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same document; only the snippet and the (lower) score differ.
    let weaker = SearchHit {
        snippet: "snip2".into(),
        score: 0.5,
        ..stronger.clone()
    };

    let survivors = deduplicate_hits(vec![stronger, weaker]);
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].score, 1.0);
    assert_eq!(survivors[0].title, "title1");
}
13419
/// When the duplicate listed second has the higher score, that score is the
/// one reported on the surviving hit.
#[test]
fn deduplicate_hits_keeps_higher_score() {
    let body = "hello world";
    let low_scored = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 0.3,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Identical apart from snippet and a higher score.
    let high_scored = SearchHit {
        snippet: "snip2".into(),
        score: 0.9,
        ..low_scored.clone()
    };

    let survivors = deduplicate_hits(vec![low_scored, high_scored]);
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].score, 0.9);
    assert_eq!(survivors[0].title, "title1");
}
13466
/// Hits with the same title, path and content but different line numbers (and
/// timestamps) are both retained.
#[test]
fn deduplicate_hits_keeps_repeated_same_content_at_different_lines() {
    let body = "repeat me";
    let line_one = SearchHit {
        title: "Shared Session".into(),
        snippet: String::new(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 10.0,
        source_path: "/shared/session.jsonl".into(),
        agent: "codex".into(),
        workspace: "/ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // The repeat occurs one line later, at a later timestamp, with a lower score.
    let line_two = SearchHit {
        line_number: Some(2),
        created_at: Some(200),
        score: 9.0,
        ..line_one.clone()
    };

    let survivors = deduplicate_hits(vec![line_one, line_two]);
    assert_eq!(survivors.len(), 2);
}
13495
/// Hits that agree on title, path and content but carry different
/// conversation ids are both retained.
#[test]
fn deduplicate_hits_keeps_distinct_conversation_ids_with_same_title_path_and_content() {
    let body = "identical body";
    let conversation_one = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        conversation_id: Some(1),
        ..make_test_hit("same", 1.0)
    };
    let conversation_two = SearchHit {
        conversation_id: Some(2),
        score: 0.9,
        ..conversation_one.clone()
    };

    let survivors = deduplicate_hits(vec![conversation_one, conversation_two]);
    assert_eq!(survivors.len(), 2);
    for expected_id in vec![1, 2] {
        assert!(
            survivors
                .iter()
                .any(|hit| hit.conversation_id == Some(expected_id))
        );
    }
}
13514
/// Hits sharing a conversation id collapse into one even when their titles
/// differ.
#[test]
fn deduplicate_hits_coalesces_same_conversation_id_despite_title_drift() {
    let body = "identical body";
    let morning = SearchHit {
        title: "Morning Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        conversation_id: Some(7),
        ..make_test_hit("same", 1.0)
    };
    // Same conversation, retitled and slightly lower scored.
    let evening = SearchHit {
        title: "Evening Session".into(),
        score: 0.9,
        ..morning.clone()
    };

    let survivors = deduplicate_hits(vec![morning, evening]);
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].conversation_id, Some(7));
}
13532
/// Without a conversation id, hits with the same path and content but
/// different titles are both retained.
#[test]
fn deduplicate_hits_keeps_distinct_titles_with_same_source_path_and_content() {
    let body = "hello world";
    let morning = SearchHit {
        title: "Morning Session".into(),
        snippet: "snip1".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 0.9,
        source_path: "shared.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: None,
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let evening = SearchHit {
        title: "Evening Session".into(),
        snippet: "snip2".into(),
        score: 0.8,
        ..morning.clone()
    };

    let survivors = deduplicate_hits(vec![morning, evening]);
    assert_eq!(survivors.len(), 2);
    for expected_title in ["Morning Session", "Evening Session"].iter() {
        assert!(survivors.iter().any(|hit| hit.title == *expected_title));
    }
}
13579
/// Two hits whose content differs only in the amount of whitespace between
/// words must collapse into a single hit.
///
/// The first hit uses a run of spaces between the words and the second uses a
/// single space, so the test fails if deduplication compares raw content.
// NOTE(review): the hash is computed from each hit's own raw content, matching
// the other fixtures in this module; this presumes whitespace is normalized by
// `stable_content_hash` or by the dedup key itself — confirm.
#[test]
fn deduplicate_hits_normalizes_whitespace() {
    let spaced_out = "hello    world";
    let normalized = "hello world";

    let first = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: spaced_out.into(),
        content_hash: stable_content_hash(spaced_out),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let second = SearchHit {
        snippet: "snip2".into(),
        content: normalized.into(),
        content_hash: stable_content_hash(normalized),
        score: 0.5,
        ..first.clone()
    };

    let deduped = deduplicate_hits(vec![first, second]);
    assert_eq!(deduped.len(), 1);
}
13624
/// A hit whose `source_id` is blank is deduplicated against an otherwise
/// identical hit with `source_id == "local"`, and the survivor reports "local".
#[test]
fn deduplicate_hits_normalizes_blank_local_source_id() {
    let body = "hello world";
    let explicit_local = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same hit, but the source id is whitespace only.
    let blank_source = SearchHit {
        snippet: "snip2".into(),
        score: 0.5,
        source_id: " ".into(),
        ..explicit_local.clone()
    };

    let survivors = deduplicate_hits(vec![explicit_local, blank_source]);
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].source_id, "local");
}
13670
/// A hit whose content is just an empty tool marker is dropped, leaving only
/// the hit with real content.
#[test]
fn deduplicate_hits_filters_tool_noise() {
    let marker = "[Tool:]";
    let real_body = "This is real content about testing";

    let noise_hit = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: marker.into(),
        content_hash: stable_content_hash(marker),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let real_hit = SearchHit {
        title: "title2".into(),
        snippet: "snip2".into(),
        content: real_body.into(),
        content_hash: stable_content_hash(real_body),
        score: 0.5,
        source_path: "b.jsonl".into(),
        created_at: Some(200),
        ..noise_hit.clone()
    };

    let survivors = deduplicate_hits(vec![noise_hit, real_hit]);
    assert_eq!(survivors.len(), 1);
    assert!(survivors[0].content.contains("real content"));
}
13716
/// With the query "authentication", a bare "Acknowledged." hit is removed and
/// only the substantive hit is returned.
#[test]
fn deduplicate_hits_filters_acknowledgement_noise() {
    let ack_body = "Acknowledged.";
    let real_body = "Authentication refresh logic changed";

    let ack_hit = SearchHit {
        title: "ack".into(),
        snippet: "ack".into(),
        content: ack_body.into(),
        content_hash: stable_content_hash(ack_body),
        score: 1.0,
        source_path: "ack.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let real_hit = SearchHit {
        title: "real".into(),
        snippet: "real".into(),
        content: real_body.into(),
        content_hash: stable_content_hash(real_body),
        score: 0.5,
        source_path: "real.jsonl".into(),
        created_at: Some(200),
        ..ack_hit.clone()
    };

    let survivors = deduplicate_hits_with_query(vec![ack_hit, real_hit], "authentication");
    assert_eq!(survivors.len(), 1);
    assert_eq!(survivors[0].title, "real");
}
13762
/// A system-prompt-style hit is filtered out for the query "coding assistant"
/// but kept for the query "AGENTS.md instructions".
#[test]
fn deduplicate_hits_hides_system_prompts_unless_query_requests_them() {
    let prompt_text =
        "# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly.";
    let prompt_hit = SearchHit {
        title: "prompt".into(),
        snippet: "prompt".into(),
        content: prompt_text.into(),
        content_hash: stable_content_hash(prompt_text),
        score: 1.0,
        source_path: "prompt.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // A query that merely matches the prompt text does not surface it.
    let hidden = deduplicate_hits_with_query(vec![prompt_hit.clone()], "coding assistant");
    assert!(hidden.is_empty());

    // This query keeps the prompt hit in the results.
    let shown = deduplicate_hits_with_query(vec![prompt_hit], "AGENTS.md instructions");
    assert_eq!(shown.len(), 1);
    assert_eq!(shown[0].title, "prompt");
}
13796
/// Three hits with distinct titles, contents and paths all survive
/// deduplication.
#[test]
fn deduplicate_hits_preserves_unique_content() {
    // (title, snippet, content, score, source path, created_at)
    let rows = vec![
        ("title1", "snip1", "first message", 1.0, "a.jsonl", 100),
        ("title2", "snip2", "second message", 0.8, "b.jsonl", 200),
        ("title3", "snip3", "third message", 0.6, "c.jsonl", 300),
    ];

    let hits: Vec<SearchHit> = rows
        .into_iter()
        .map(|(title, snippet, body, score, path, created)| SearchHit {
            title: title.into(),
            snippet: snippet.into(),
            content: body.into(),
            content_hash: stable_content_hash(body),
            score,
            source_path: path.into(),
            agent: "agent".into(),
            workspace: "ws".into(),
            workspace_original: None,
            created_at: Some(created),
            line_number: None,
            match_type: MatchType::Exact,
            source_id: "local".into(),
            origin_kind: "local".into(),
            origin_host: None,
            conversation_id: None,
        })
        .collect();

    let survivors = deduplicate_hits(hits);
    assert_eq!(survivors.len(), 3);
}
13859
/// Identical content coming from a local source and from an SSH source is
/// not merged: both hits must be returned.
#[test]
fn deduplicate_hits_respects_source_boundaries() {
    let body = "hello world";
    let local_hit = SearchHit {
        title: "local title".into(),
        snippet: "snip".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same content, but attributed to a different source.
    let remote_hit = SearchHit {
        title: "remote title".into(),
        score: 0.9,
        source_path: "b.jsonl".into(),
        created_at: Some(200),
        source_id: "work-laptop".into(),
        origin_kind: "ssh".into(),
        origin_host: Some("work-laptop.local".into()),
        ..local_hit.clone()
    };

    let survivors = deduplicate_hits(vec![local_hit, remote_hit]);
    assert_eq!(
        survivors.len(),
        2,
        "same content from different sources should not dedupe"
    );
    for expected_source in ["local", "work-laptop"].iter() {
        assert!(survivors.iter().any(|h| h.source_id == *expected_source));
    }
}
13912
/// Table-driven check of `should_try_wildcard_fallback` across page sizes,
/// offsets and hit counts.
#[test]
fn wildcard_fallback_sparse_check_uses_effective_limit() {
    // NOTE(review): the argument names below are presumed from the assertion
    // messages (hits, limit, offset, sparse threshold) — confirm against the
    // function signature.
    let scenarios = vec![
        (
            (1, 1, 0, 3),
            false,
            "a filled one-result page is not sparse for fallback purposes",
        ),
        (
            (2, 2, 0, 3),
            false,
            "a filled two-result page is not sparse for fallback purposes",
        ),
        (
            (0, 1, 0, 3),
            true,
            "zero hits should still trigger fallback even for tiny pages",
        ),
        (
            (1, 2, 0, 3),
            true,
            "a partially filled page should still trigger fallback",
        ),
        (
            (0, 5, 10, 3),
            false,
            "pagination should not trigger wildcard fallback",
        ),
        (
            (1, 0, 0, 3),
            true,
            "limit zero preserves the legacy sparse-threshold semantics",
        ),
    ];

    for ((hit_count, limit, offset, threshold), expected, reason) in scenarios {
        let decision = should_try_wildcard_fallback(hit_count, limit, offset, threshold);
        assert!(decision == expected, "{reason}");
    }
}
13940
/// The preview-based snippet shortcut applies only when the request is
/// snippet-only and the preview contains the query.
#[test]
fn snippet_preview_fast_path_requires_snippet_only_match() {
    let snippet_only = FieldMask::new(false, true, false, false);
    let query = "database";
    let matching_preview = "migration checks the database constraint before writing";
    let unrelated_preview = "migration checks constraints before writing";

    // Snippet-only request + matching preview: a highlighted snippet is produced.
    let rendered = snippet_from_preview_without_full_content(snippet_only, matching_preview, query)
        .expect("preview should satisfy a snippet-only request when it contains the query");
    assert!(rendered.contains("**database**"));

    // Full-content request: no shortcut, even though the preview matches.
    let full_request =
        snippet_from_preview_without_full_content(FieldMask::FULL, matching_preview, query);
    assert!(
        full_request.is_none(),
        "full-content requests must keep the sqlite hydration path"
    );

    // Snippet-only request, but the preview lacks the query term: no shortcut.
    let missing_term =
        snippet_from_preview_without_full_content(snippet_only, unrelated_preview, query);
    assert!(
        missing_term.is_none(),
        "snippet-only requests hydrate when the preview cannot show the match"
    );
}
13971
/// Indexes five documents containing "apple" and checks that an exact search
/// returns at least three hits without resorting to wildcard fallback.
#[test]
fn search_with_fallback_returns_exact_when_sufficient() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    for n in 0..5 {
        let stamp = 100 + n;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(stamp),
            content: format!("apple fruit number {n} is delicious and healthy"),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: tmp.path().join(format!("{n}.jsonl")),
            started_at: Some(stamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        writer.add_conversation(&conversation)?;
    }
    writer.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "apple",
        SearchFilters::default(),
        10,
        0,
        3,
        FieldMask::FULL,
    )?;

    assert!(!outcome.wildcard_fallback);
    assert!(outcome.hits.len() >= 3);
    Ok(())
}
14021
/// Indexes one document mentioning "configuration" and checks that searching
/// for "config" returns at least one hit.
#[test]
fn search_with_fallback_triggers_on_sparse_results() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "configuration management system".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("substring test".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: tmp.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "config",
        SearchFilters::default(),
        10,
        0,
        5,
        FieldMask::FULL,
    )?;

    assert!(!outcome.hits.is_empty());

    Ok(())
}
14069
/// A query that already contains wildcards (`*test*`) must not be reported as
/// having used the automatic wildcard fallback.
#[test]
fn search_with_fallback_skips_when_query_has_wildcards() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "testing data".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: tmp.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "*test*",
        SearchFilters::default(),
        10,
        0,
        10,
        FieldMask::FULL,
    )?;

    assert!(!outcome.wildcard_fallback);
    Ok(())
}
14113
/// Searching for "bet" over two documents that contain "alphabet" must switch
/// to wildcard fallback and return both documents as implicit-wildcard hits.
#[test]
fn search_with_fallback_prefers_wildcards_when_they_add_hits() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    let bodies = [
        "alphabet soup for coders",
        "mapping the alphabet city blocks",
    ];
    for (position, body) in bodies.iter().enumerate() {
        let stamp = 100 + position as i64;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(stamp),
            content: body.to_string(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("alpha-{position}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: tmp.path().join(format!("alpha-{position}.jsonl")),
            started_at: Some(stamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        writer.add_conversation(&conversation)?;
    }
    writer.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "bet",
        SearchFilters::default(),
        10,
        0,
        2,
        FieldMask::FULL,
    )?;

    assert!(
        outcome.wildcard_fallback,
        "should switch to wildcard fallback when it yields more hits"
    );
    assert_eq!(
        outcome.hits.len(),
        2,
        "fallback should surface all alphabet docs"
    );
    // Every hit is an implicit-wildcard match on a document containing "alphabet".
    let all_implicit = outcome
        .hits
        .iter()
        .all(|hit| hit.match_type == MatchType::ImplicitWildcard);
    assert!(all_implicit);
    let all_alphabet = outcome
        .hits
        .iter()
        .all(|hit| hit.content.contains("alphabet"));
    assert!(all_alphabet);

    Ok(())
}
14182
/// A long token with no matches gets no automatic wildcard fallback (only a
/// manual wildcard suggestion), while the short token "pple" does fall back
/// and finds the "apple" document.
#[test]
fn automatic_wildcard_fallback_skips_long_zero_hit_token() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: tmp.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    // Long token with zero hits: no fallback, but the suggestion is offered.
    let long_outcome = client.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;
    assert!(long_outcome.hits.is_empty());
    assert!(!long_outcome.wildcard_fallback);
    let offers_wildcard = long_outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_wildcard,
        "manual wildcard suggestion should remain available"
    );

    // Short token: the automatic fallback runs and matches the one document.
    let short_outcome = client.search_with_fallback(
        "pple",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;
    assert!(short_outcome.wildcard_fallback);
    assert_eq!(short_outcome.hits.len(), 1);
    assert_eq!(
        short_outcome.hits[0].match_type,
        MatchType::ImplicitWildcard
    );

    Ok(())
}
14245
/// With a Tantivy index present, a zero-hit search must still offer a
/// wildcard suggestion, must not offer alternate-agent suggestions, and must
/// leave the client's SQLite handle unopened.
#[test]
fn nohit_suggestions_do_not_lazy_open_sqlite_when_tantivy_is_present() -> Result<()> {
    let tmp = TempDir::new()?;
    let index_path = tmp.path().join("index");
    let db_path = tmp.path().join("cass.db");

    // Open and immediately close the storage at `db_path` before searching.
    let storage = FrankenStorage::open(&db_path)?;
    storage.close()?;

    let mut writer = TantivyIndex::open_or_create(&index_path)?;
    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: tmp.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("index present");

    // True only when the lock is acquired and no SQLite handle is stored.
    let sqlite_is_closed = || {
        client
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };
    assert!(sqlite_is_closed(), "sqlite should start closed");

    let outcome = client.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;

    assert!(outcome.hits.is_empty());
    let offers_wildcard = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_wildcard,
        "manual wildcard suggestion should remain available"
    );
    let no_alternate_agent = outcome
        .suggestions
        .iter()
        .all(|s| !matches!(s.kind, SuggestionKind::AlternateAgent));
    assert!(
        no_alternate_agent,
        "alternate-agent suggestions should not force a SQLite open"
    );
    assert!(
        sqlite_is_closed(),
        "sqlite should stay closed after Tantivy no-hit suggestions"
    );

    Ok(())
}
14324
/// A client with neither a reader nor SQLite returns no hits, reports no
/// wildcard fallback, and suggests the wildcard form `*ghost*` of the query.
#[test]
fn search_with_fallback_emits_wildcard_suggestion_on_zero_hits() -> Result<()> {
    // Hand-built client with no index reader and no database attached.
    let backendless = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let outcome = backendless.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        0,
        3,
        FieldMask::FULL,
    )?;

    assert!(
        outcome.hits.is_empty(),
        "no index/db means no hits should be returned"
    );
    assert!(
        !outcome.wildcard_fallback,
        "with zero baseline and fallback hits, we should keep baseline and mark fallback=false"
    );

    let wildcard_suggestion = outcome
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
        .expect("should suggest adding wildcards");
    let suggested = wildcard_suggestion.suggested_query.as_deref();
    assert_eq!(suggested, Some("*ghost*"));

    Ok(())
}
14371
14372 #[test]
14373 fn search_with_fallback_skips_empty_query() -> Result<()> {
14374 let dir = TempDir::new()?;
14375 let mut index = TantivyIndex::open_or_create(dir.path())?;
14376
14377 let conv = NormalizedConversation {
14378 agent_slug: "codex".into(),
14379 external_id: None,
14380 title: Some("test".into()),
14381 workspace: None,
14382 source_path: dir.path().join("test.jsonl"),
14383 started_at: Some(100),
14384 ended_at: None,
14385 metadata: serde_json::json!({}),
14386 messages: vec![NormalizedMessage {
14387 idx: 0,
14388 role: "user".into(),
14389 author: None,
14390 created_at: Some(100),
14391 content: "testing data".into(),
14392 extra: serde_json::json!({}),
14393 snippets: vec![],
14394 invocations: Vec::new(),
14395 }],
14396 };
14397 index.add_conversation(&conv)?;
14398 index.commit()?;
14399
14400 let client = SearchClient::open(dir.path(), None)?.expect("index present");
14401
14402 let result = client.search_with_fallback(
14404 " ",
14405 SearchFilters::default(),
14406 10,
14407 0,
14408 10,
14409 FieldMask::FULL,
14410 )?;
14411
14412 assert!(!result.wildcard_fallback);
14413 Ok(())
14414 }
14415
14416 #[test]
14417 fn search_with_fallback_skips_for_nonzero_offset() -> Result<()> {
14418 let client = SearchClient {
14420 reader: None,
14421 sqlite: Mutex::new(None),
14422 sqlite_path: None,
14423 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
14424 reload_on_search: true,
14425 last_reload: Mutex::new(None),
14426 last_generation: Mutex::new(None),
14427 reload_epoch: Arc::new(AtomicU64::new(0)),
14428 warm_tx: None,
14429 _warm_handle: None,
14430 metrics: Metrics::default(),
14431 cache_namespace: "vtest|schema:none".into(),
14432 semantic: Mutex::new(None),
14433 last_tantivy_total_count: Mutex::new(None),
14434 };
14435
14436 let result = client.search_with_fallback(
14437 "ghost",
14438 SearchFilters::default(),
14439 5,
14440 10,
14441 3,
14442 FieldMask::FULL,
14443 )?;
14444
14445 assert!(
14446 !result.wildcard_fallback,
14447 "fallback should not run on paginated searches"
14448 );
14449 let wildcard = result
14451 .suggestions
14452 .iter()
14453 .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
14454 .expect("wildcard suggestion present");
14455 assert_eq!(wildcard.suggested_query.as_deref(), Some("*ghost*"));
14456
14457 Ok(())
14458 }
14459
14460 #[test]
14461 fn generate_suggestions_limits_and_sets_shortcuts() -> Result<()> {
14462 let client = SearchClient {
14464 reader: None,
14465 sqlite: Mutex::new(None),
14466 sqlite_path: None,
14467 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
14468 reload_on_search: true,
14469 last_reload: Mutex::new(None),
14470 last_generation: Mutex::new(None),
14471 reload_epoch: Arc::new(AtomicU64::new(0)),
14472 warm_tx: None,
14473 _warm_handle: None,
14474 metrics: Metrics::default(),
14475 cache_namespace: "vtest|schema:none".into(),
14476 semantic: Mutex::new(None),
14477 last_tantivy_total_count: Mutex::new(None),
14478 };
14479
14480 let mut filters = SearchFilters::default();
14481 filters.agents.insert("codex".into()); let result = client.search_with_fallback("claud", filters, 5, 0, 3, FieldMask::FULL)?;
14484
14485 assert_eq!(
14487 result.suggestions.len(),
14488 3,
14489 "should truncate to 3 suggestions"
14490 );
14491 for (idx, sugg) in result.suggestions.iter().enumerate() {
14492 assert_eq!(
14493 sugg.shortcut,
14494 Some((idx + 1) as u8),
14495 "shortcut should match position (1-based)"
14496 );
14497 }
14498
14499 assert!(
14501 result
14502 .suggestions
14503 .iter()
14504 .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery)),
14505 "should suggest wildcard search"
14506 );
14507 assert!(
14508 result
14509 .suggestions
14510 .iter()
14511 .any(|s| matches!(s.kind, SuggestionKind::RemoveFilter)),
14512 "should suggest removing agent filter"
14513 );
14514 assert!(
14515 result
14516 .suggestions
14517 .iter()
14518 .any(|s| matches!(s.kind, SuggestionKind::SpellingFix)),
14519 "should suggest spelling fix for nearby agent name"
14520 );
14521
14522 Ok(())
14523 }
14524
14525 #[test]
14526 fn generate_suggestions_includes_recent_alternate_agents() -> Result<()> {
14527 let dir = TempDir::new()?;
14528 let db_path = dir.path().join("cass.db");
14529 let storage = FrankenStorage::open(&db_path)?;
14530 let workspace_id = storage.ensure_workspace(dir.path(), None)?;
14531 let base_ts = 1_700_000_010_000_i64;
14532
14533 for (idx, slug) in ["claude_code", "codex"].iter().enumerate() {
14534 let agent = Agent {
14535 id: None,
14536 slug: (*slug).to_string(),
14537 name: (*slug).to_string(),
14538 version: None,
14539 kind: AgentKind::Cli,
14540 };
14541 let agent_id = storage.ensure_agent(&agent)?;
14542 let conversation = Conversation {
14543 id: None,
14544 agent_slug: (*slug).to_string(),
14545 workspace: Some(dir.path().to_path_buf()),
14546 external_id: Some(format!("alt-agent-{idx}")),
14547 title: Some(format!("alternate agent {idx}")),
14548 source_path: dir.path().join(format!("{slug}.jsonl")),
14549 started_at: Some(base_ts + idx as i64),
14550 ended_at: Some(base_ts + idx as i64),
14551 approx_tokens: Some(8),
14552 metadata_json: json!({}),
14553 messages: vec![Message {
14554 id: None,
14555 idx: 0,
14556 role: MessageRole::User,
14557 author: Some("user".into()),
14558 created_at: Some(base_ts + idx as i64),
14559 content: format!("content from {slug}"),
14560 extra_json: json!({}),
14561 snippets: Vec::new(),
14562 }],
14563 source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
14564 origin_host: None,
14565 };
14566 storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
14567 }
14568 drop(storage);
14569
14570 let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
14571 let result = client.search_with_fallback(
14572 "ghost",
14573 SearchFilters::default(),
14574 5,
14575 0,
14576 3,
14577 FieldMask::FULL,
14578 )?;
14579
14580 let alternate_agents: HashSet<String> = result
14581 .suggestions
14582 .iter()
14583 .filter(|suggestion| matches!(suggestion.kind, SuggestionKind::AlternateAgent))
14584 .filter_map(|suggestion| suggestion.suggested_filters.as_ref())
14585 .flat_map(|filters| filters.agents.iter().cloned())
14586 .collect();
14587
14588 assert!(
14589 alternate_agents.contains("claude_code"),
14590 "should suggest claude_code from normalized conversations schema"
14591 );
14592 assert!(
14593 alternate_agents.contains("codex"),
14594 "should suggest codex from normalized conversations schema"
14595 );
14596
14597 Ok(())
14598 }
14599
14600 #[test]
14601 fn sanitize_query_preserves_wildcards() {
14602 assert_eq!(fs_cass_sanitize_query("*foo*"), "*foo*");
14604 assert_eq!(fs_cass_sanitize_query("foo*"), "foo*");
14605 assert_eq!(fs_cass_sanitize_query("*bar"), "*bar");
14606 assert_eq!(fs_cass_sanitize_query("*config*"), "*config*");
14607 }
14608
14609 #[test]
14610 fn sanitize_query_strips_other_special_chars() {
14611 assert_eq!(fs_cass_sanitize_query("foo.bar"), "foo bar");
14613 assert_eq!(fs_cass_sanitize_query("c++"), "c ");
14614 assert_eq!(fs_cass_sanitize_query("foo-bar"), "foo-bar");
14615 assert_eq!(fs_cass_sanitize_query("test_case"), "test case");
14616 }
14617
14618 #[test]
14619 fn sanitize_query_combined() {
14620 assert_eq!(fs_cass_sanitize_query("*foo.bar*"), "*foo bar*");
14622 assert_eq!(fs_cass_sanitize_query("test-*"), "test-*");
14623 assert_eq!(fs_cass_sanitize_query("*c++*"), "*c *");
14624 }
14625
14626 #[test]
14628 fn parse_boolean_query_simple_terms() {
14629 let tokens = fs_cass_parse_boolean_query("foo bar baz");
14630 assert_eq!(tokens.len(), 3);
14631 assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
14632 assert_eq!(tokens[1], FsCassQueryToken::Term("bar".to_string()));
14633 assert_eq!(tokens[2], FsCassQueryToken::Term("baz".to_string()));
14634 }
14635
14636 #[test]
14637 fn parse_boolean_query_and_operator() {
14638 let tokens = fs_cass_parse_boolean_query("foo AND bar");
14639 assert_eq!(tokens.len(), 3);
14640 assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
14641 assert_eq!(tokens[1], FsCassQueryToken::And);
14642 assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
14643
14644 let tokens2 = fs_cass_parse_boolean_query("foo && bar");
14646 assert_eq!(tokens2.len(), 3);
14647 assert_eq!(tokens2[1], FsCassQueryToken::And);
14648 }
14649
14650 #[test]
14651 fn parse_boolean_query_or_operator() {
14652 let tokens = fs_cass_parse_boolean_query("foo OR bar");
14653 assert_eq!(tokens.len(), 3);
14654 assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
14655 assert_eq!(tokens[1], FsCassQueryToken::Or);
14656 assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
14657
14658 let tokens2 = fs_cass_parse_boolean_query("foo || bar");
14660 assert_eq!(tokens2.len(), 3);
14661 assert_eq!(tokens2[1], FsCassQueryToken::Or);
14662 }
14663
14664 #[test]
14665 fn parse_boolean_query_not_operator() {
14666 let tokens = fs_cass_parse_boolean_query("foo NOT bar");
14667 assert_eq!(tokens.len(), 3);
14668 assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
14669 assert_eq!(tokens[1], FsCassQueryToken::Not);
14670 assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
14671 }
14672
14673 #[test]
14674 fn parse_boolean_query_quoted_phrase() {
14675 let tokens = fs_cass_parse_boolean_query(r#"foo "exact phrase" bar"#);
14676 assert_eq!(tokens.len(), 3);
14677 assert_eq!(tokens[0], FsCassQueryToken::Term("foo".to_string()));
14678 assert_eq!(
14679 tokens[1],
14680 FsCassQueryToken::Phrase("exact phrase".to_string())
14681 );
14682 assert_eq!(tokens[2], FsCassQueryToken::Term("bar".to_string()));
14683 }
14684
14685 #[test]
14686 fn parse_boolean_query_complex() {
14687 let tokens = fs_cass_parse_boolean_query(r#"error OR warning NOT "false positive""#);
14688 assert_eq!(tokens.len(), 5);
14689 assert_eq!(tokens[0], FsCassQueryToken::Term("error".to_string()));
14690 assert_eq!(tokens[1], FsCassQueryToken::Or);
14691 assert_eq!(tokens[2], FsCassQueryToken::Term("warning".to_string()));
14692 assert_eq!(tokens[3], FsCassQueryToken::Not);
14693 assert_eq!(
14694 tokens[4],
14695 FsCassQueryToken::Phrase("false positive".to_string())
14696 );
14697 }
14698
14699 #[test]
14700 fn has_boolean_operators_detection() {
14701 assert!(!fs_cass_has_boolean_operators("foo bar"));
14702 assert!(fs_cass_has_boolean_operators("foo AND bar"));
14703 assert!(fs_cass_has_boolean_operators("foo OR bar"));
14704 assert!(fs_cass_has_boolean_operators("foo NOT bar"));
14705 assert!(fs_cass_has_boolean_operators(r#""exact phrase""#));
14706 assert!(fs_cass_has_boolean_operators("foo && bar"));
14707 assert!(fs_cass_has_boolean_operators("foo || bar"));
14708 }
14709
14710 #[test]
14711 fn parse_boolean_query_case_insensitive_operators() {
14712 let tokens = fs_cass_parse_boolean_query("foo and bar or baz not qux");
14714 assert_eq!(tokens.len(), 7);
14715 assert_eq!(tokens[1], FsCassQueryToken::And);
14716 assert_eq!(tokens[3], FsCassQueryToken::Or);
14717 assert_eq!(tokens[5], FsCassQueryToken::Not);
14718 }
14719
14720 #[test]
14721 fn parse_boolean_query_with_wildcards() {
14722 let tokens = fs_cass_parse_boolean_query("*config* OR env*");
14723 assert_eq!(tokens.len(), 3);
14724 assert_eq!(tokens[0], FsCassQueryToken::Term("*config*".to_string()));
14725 assert_eq!(tokens[1], FsCassQueryToken::Or);
14726 assert_eq!(tokens[2], FsCassQueryToken::Term("env*".to_string()));
14727 }
14728
14729 #[test]
14735 fn tantivy_search_hydrates_long_content_when_content_field_is_not_stored() -> Result<()> {
14736 let dir = TempDir::new()?;
14737 let db_path = dir.path().join("cass.db");
14738 let storage = FrankenStorage::open(&db_path)?;
14739 let workspace_id = storage.ensure_workspace(dir.path(), None)?;
14740 let agent = Agent {
14741 id: None,
14742 slug: "codex".into(),
14743 name: "Codex".into(),
14744 version: None,
14745 kind: AgentKind::Cli,
14746 };
14747 let agent_id = storage.ensure_agent(&agent)?;
14748 let long_content = format!(
14749 "{}needle appears past the preview boundary for hydration proof",
14750 "padding ".repeat(70)
14751 );
14752 let short_content = "shortneedle fits entirely inside the stored preview".to_string();
14753 let conversation = Conversation {
14754 id: None,
14755 agent_slug: "codex".into(),
14756 workspace: Some(dir.path().to_path_buf()),
14757 external_id: Some("hydrate-long-content".into()),
14758 title: Some("hydrated lexical doc".into()),
14759 source_path: dir.path().join("hydrate.jsonl"),
14760 started_at: Some(1_700_000_123_000),
14761 ended_at: Some(1_700_000_123_000),
14762 approx_tokens: Some(32),
14763 metadata_json: json!({}),
14764 messages: vec![
14765 Message {
14766 id: None,
14767 idx: 0,
14768 role: MessageRole::User,
14769 author: Some("user".into()),
14770 created_at: Some(1_700_000_123_000),
14771 content: long_content.clone(),
14772 extra_json: json!({}),
14773 snippets: Vec::new(),
14774 },
14775 Message {
14776 id: None,
14777 idx: 1,
14778 role: MessageRole::Agent,
14779 author: Some("assistant".into()),
14780 created_at: Some(1_700_000_124_000),
14781 content: short_content.clone(),
14782 extra_json: json!({}),
14783 snippets: Vec::new(),
14784 },
14785 ],
14786 source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
14787 origin_host: None,
14788 };
14789 storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
14790 storage.close()?;
14791
14792 let index_path = dir.path().join("search-index");
14793 let mut index = TantivyIndex::open_or_create(&index_path)?;
14794 let normalized = NormalizedConversation {
14795 agent_slug: "codex".into(),
14796 external_id: Some("hydrate-long-content".into()),
14797 title: Some("hydrated lexical doc".into()),
14798 workspace: Some(dir.path().to_path_buf()),
14799 source_path: dir.path().join("hydrate.jsonl"),
14800 started_at: Some(1_700_000_123_000),
14801 ended_at: Some(1_700_000_123_000),
14802 metadata: json!({}),
14803 messages: vec![
14804 NormalizedMessage {
14805 idx: 0,
14806 role: "user".into(),
14807 author: Some("user".into()),
14808 created_at: Some(1_700_000_123_000),
14809 content: long_content.clone(),
14810 extra: json!({}),
14811 snippets: vec![],
14812 invocations: Vec::new(),
14813 },
14814 NormalizedMessage {
14815 idx: 1,
14816 role: "assistant".into(),
14817 author: Some("assistant".into()),
14818 created_at: Some(1_700_000_124_000),
14819 content: short_content.clone(),
14820 extra: json!({}),
14821 snippets: vec![],
14822 invocations: Vec::new(),
14823 },
14824 ],
14825 };
14826 index.add_conversation(&normalized)?;
14827 index.commit()?;
14828
14829 let client = SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
14830 let hits = client.search("needle", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
14831
14832 assert_eq!(hits.len(), 1, "expected one lexical hit");
14833 assert_eq!(hits[0].title, "hydrated lexical doc");
14834 assert!(
14835 hits[0]
14836 .content
14837 .contains("needle appears past the preview boundary"),
14838 "lexical hit should hydrate full content from sqlite when Tantivy content is not stored"
14839 );
14840 assert!(
14841 hits[0].snippet.to_lowercase().contains("needle"),
14842 "snippet should still be rendered from hydrated content"
14843 );
14844
14845 let bounded_hits = client.search(
14846 "needle",
14847 SearchFilters::default(),
14848 5,
14849 0,
14850 FieldMask::FULL.with_preview_content_limit(Some(200)),
14851 )?;
14852
14853 assert_eq!(bounded_hits.len(), 1, "expected one lexical hit");
14854 assert!(
14855 bounded_hits[0].content.starts_with("padding padding"),
14856 "bounded content may be served from the stored preview prefix"
14857 );
14858 assert!(
14859 !bounded_hits[0]
14860 .content
14861 .contains("needle appears past the preview boundary"),
14862 "bounded preview content should not hydrate the full sqlite row"
14863 );
14864
14865 let short_client =
14866 SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
14867 assert!(
14868 short_client
14869 .sqlite
14870 .lock()
14871 .map(|guard| guard.is_none())
14872 .unwrap_or(false),
14873 "sqlite should start closed for short preview hit"
14874 );
14875
14876 let short_hits = short_client.search(
14877 "shortneedle",
14878 SearchFilters::default(),
14879 5,
14880 0,
14881 FieldMask::FULL,
14882 )?;
14883
14884 assert_eq!(short_hits.len(), 1, "expected one short lexical hit");
14885 assert_eq!(
14886 short_hits[0].content, short_content,
14887 "untruncated stored preview is exact full content"
14888 );
14889 assert!(
14890 short_client
14891 .sqlite
14892 .lock()
14893 .map(|guard| guard.is_none())
14894 .unwrap_or(false),
14895 "short full-content hit should not lazy-open sqlite"
14896 );
14897
14898 Ok(())
14899 }
14900
14901 #[test]
14902 fn filter_fidelity_agent_filter_respected() -> Result<()> {
14903 let dir = TempDir::new()?;
14905 let mut index = TantivyIndex::open_or_create(dir.path())?;
14906
14907 let conv_a = NormalizedConversation {
14909 agent_slug: "codex".into(),
14910 external_id: None,
14911 title: Some("alpha doc".into()),
14912 workspace: None,
14913 source_path: dir.path().join("a.jsonl"),
14914 started_at: Some(100),
14915 ended_at: None,
14916 metadata: serde_json::json!({}),
14917 messages: vec![NormalizedMessage {
14918 idx: 0,
14919 role: "user".into(),
14920 author: None,
14921 created_at: Some(100),
14922 content: "hello world findme alpha".into(),
14923 extra: serde_json::json!({}),
14924 snippets: vec![],
14925 invocations: Vec::new(),
14926 }],
14927 };
14928 let conv_b = NormalizedConversation {
14930 agent_slug: "claude".into(),
14931 external_id: None,
14932 title: Some("beta doc".into()),
14933 workspace: None,
14934 source_path: dir.path().join("b.jsonl"),
14935 started_at: Some(200),
14936 ended_at: None,
14937 metadata: serde_json::json!({}),
14938 messages: vec![NormalizedMessage {
14939 idx: 0,
14940 role: "user".into(),
14941 author: None,
14942 created_at: Some(200),
14943 content: "hello world findme beta".into(),
14944 extra: serde_json::json!({}),
14945 snippets: vec![],
14946 invocations: Vec::new(),
14947 }],
14948 };
14949 index.add_conversation(&conv_a)?;
14950 index.add_conversation(&conv_b)?;
14951 index.commit()?;
14952
14953 let client = SearchClient::open(dir.path(), None)?.expect("index present");
14954
14955 let mut filters = SearchFilters::default();
14957 filters.agents.insert("codex".into());
14958
14959 let hits = client.search("findme", filters.clone(), 10, 0, FieldMask::FULL)?;
14960
14961 for hit in &hits {
14963 assert_eq!(
14964 hit.agent, "codex",
14965 "Agent filter violated: got agent '{}' instead of 'codex'",
14966 hit.agent
14967 );
14968 }
14969 assert!(!hits.is_empty(), "Should have found results");
14970
14971 let cached_hits = client.search("findme", filters, 10, 0, FieldMask::FULL)?;
14973 for hit in &cached_hits {
14974 assert_eq!(hit.agent, "codex", "Cached search violated agent filter");
14975 }
14976
14977 Ok(())
14978 }
14979
14980 #[test]
14981 fn filter_fidelity_workspace_filter_respected() -> Result<()> {
14982 let dir = TempDir::new()?;
14984 let mut index = TantivyIndex::open_or_create(dir.path())?;
14985
14986 let conv_a = NormalizedConversation {
14988 agent_slug: "codex".into(),
14989 external_id: None,
14990 title: Some("ws_a doc".into()),
14991 workspace: Some(std::path::PathBuf::from("/workspace/alpha")),
14992 source_path: dir.path().join("a.jsonl"),
14993 started_at: Some(100),
14994 ended_at: None,
14995 metadata: serde_json::json!({}),
14996 messages: vec![NormalizedMessage {
14997 idx: 0,
14998 role: "user".into(),
14999 author: None,
15000 created_at: Some(100),
15001 content: "workspace test needle".into(),
15002 extra: serde_json::json!({}),
15003 snippets: vec![],
15004 invocations: Vec::new(),
15005 }],
15006 };
15007 let conv_b = NormalizedConversation {
15009 agent_slug: "codex".into(),
15010 external_id: None,
15011 title: Some("ws_b doc".into()),
15012 workspace: Some(std::path::PathBuf::from("/workspace/beta")),
15013 source_path: dir.path().join("b.jsonl"),
15014 started_at: Some(200),
15015 ended_at: None,
15016 metadata: serde_json::json!({}),
15017 messages: vec![NormalizedMessage {
15018 idx: 0,
15019 role: "user".into(),
15020 author: None,
15021 created_at: Some(200),
15022 content: "workspace test needle".into(),
15023 extra: serde_json::json!({}),
15024 snippets: vec![],
15025 invocations: Vec::new(),
15026 }],
15027 };
15028 index.add_conversation(&conv_a)?;
15029 index.add_conversation(&conv_b)?;
15030 index.commit()?;
15031
15032 let client = SearchClient::open(dir.path(), None)?.expect("index present");
15033
15034 let mut filters = SearchFilters::default();
15036 filters.workspaces.insert("/workspace/beta".into());
15037
15038 let hits = client.search("needle", filters.clone(), 10, 0, FieldMask::FULL)?;
15039
15040 for hit in &hits {
15042 assert_eq!(
15043 hit.workspace, "/workspace/beta",
15044 "Workspace filter violated: got '{}' instead of '/workspace/beta'",
15045 hit.workspace
15046 );
15047 }
15048 assert!(!hits.is_empty(), "Should have found results");
15049
15050 let cached_hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
15052 for hit in &cached_hits {
15053 assert_eq!(
15054 hit.workspace, "/workspace/beta",
15055 "Cached search violated workspace filter"
15056 );
15057 }
15058
15059 Ok(())
15060 }
15061
15062 #[test]
15063 fn filter_fidelity_date_range_respected() -> Result<()> {
15064 let dir = TempDir::new()?;
15066 let mut index = TantivyIndex::open_or_create(dir.path())?;
15067
15068 let conv_early = NormalizedConversation {
15070 agent_slug: "codex".into(),
15071 external_id: None,
15072 title: Some("early".into()),
15073 workspace: None,
15074 source_path: dir.path().join("early.jsonl"),
15075 started_at: Some(100),
15076 ended_at: None,
15077 metadata: serde_json::json!({}),
15078 messages: vec![NormalizedMessage {
15079 idx: 0,
15080 role: "user".into(),
15081 author: None,
15082 created_at: Some(100),
15083 content: "date range test".into(),
15084 extra: serde_json::json!({}),
15085 snippets: vec![],
15086 invocations: Vec::new(),
15087 }],
15088 };
15089 let conv_middle = NormalizedConversation {
15091 agent_slug: "codex".into(),
15092 external_id: None,
15093 title: Some("middle".into()),
15094 workspace: None,
15095 source_path: dir.path().join("middle.jsonl"),
15096 started_at: Some(500),
15097 ended_at: None,
15098 metadata: serde_json::json!({}),
15099 messages: vec![NormalizedMessage {
15100 idx: 0,
15101 role: "user".into(),
15102 author: None,
15103 created_at: Some(500),
15104 content: "date range test".into(),
15105 extra: serde_json::json!({}),
15106 snippets: vec![],
15107 invocations: Vec::new(),
15108 }],
15109 };
15110 let conv_late = NormalizedConversation {
15112 agent_slug: "codex".into(),
15113 external_id: None,
15114 title: Some("late".into()),
15115 workspace: None,
15116 source_path: dir.path().join("late.jsonl"),
15117 started_at: Some(900),
15118 ended_at: None,
15119 metadata: serde_json::json!({}),
15120 messages: vec![NormalizedMessage {
15121 idx: 0,
15122 role: "user".into(),
15123 author: None,
15124 created_at: Some(900),
15125 content: "date range test".into(),
15126 extra: serde_json::json!({}),
15127 snippets: vec![],
15128 invocations: Vec::new(),
15129 }],
15130 };
15131 index.add_conversation(&conv_early)?;
15132 index.add_conversation(&conv_middle)?;
15133 index.add_conversation(&conv_late)?;
15134 index.commit()?;
15135
15136 let client = SearchClient::open(dir.path(), None)?.expect("index present");
15137
15138 let filters = SearchFilters {
15140 created_from: Some(400),
15141 created_to: Some(600),
15142 ..Default::default()
15143 };
15144
15145 let hits = client.search("range", filters.clone(), 10, 0, FieldMask::FULL)?;
15146
15147 for hit in &hits {
15149 if let Some(ts) = hit.created_at {
15150 assert!(
15151 (400..=600).contains(&ts),
15152 "Date range filter violated: got ts={ts} outside [400, 600]"
15153 );
15154 }
15155 }
15156 assert_eq!(hits.len(), 1, "Should find exactly 1 doc in range");
15158
15159 let cached_hits = client.search("range", filters, 10, 0, FieldMask::FULL)?;
15161 for hit in &cached_hits {
15162 if let Some(ts) = hit.created_at {
15163 assert!(
15164 (400..=600).contains(&ts),
15165 "Cached search violated date range filter"
15166 );
15167 }
15168 }
15169
15170 Ok(())
15171 }
15172
15173 #[test]
15174 fn filter_fidelity_combined_filters_respected() -> Result<()> {
15175 let dir = TempDir::new()?;
15177 let mut index = TantivyIndex::open_or_create(dir.path())?;
15178
15179 let combinations = [
15181 ("codex", "/ws/prod", 100), ("claude", "/ws/prod", 500), ("claude", "/ws/dev", 500), ("claude", "/ws/prod", 900), ];
15186
15187 for (i, (agent, ws, ts)) in combinations.iter().enumerate() {
15188 let conv = NormalizedConversation {
15189 agent_slug: (*agent).into(),
15190 external_id: None,
15191 title: Some(format!("combo-{i}")),
15192 workspace: Some(std::path::PathBuf::from(*ws)),
15193 source_path: dir.path().join(format!("{i}.jsonl")),
15194 started_at: Some(*ts),
15195 ended_at: None,
15196 metadata: serde_json::json!({}),
15197 messages: vec![NormalizedMessage {
15198 idx: 0,
15199 role: "user".into(),
15200 author: None,
15201 created_at: Some(*ts),
15202 content: "hello world combotest query".into(),
15203 extra: serde_json::json!({}),
15204 snippets: vec![],
15205 invocations: Vec::new(),
15206 }],
15207 };
15208 index.add_conversation(&conv)?;
15209 }
15210 index.commit()?;
15211
15212 let client = SearchClient::open(dir.path(), None)?.expect("index present");
15213
15214 let mut filters = SearchFilters::default();
15216 filters.agents.insert("claude".into());
15217 filters.workspaces.insert("/ws/prod".into());
15218 filters.created_from = Some(400);
15219 filters.created_to = Some(600);
15220
15221 let hits = client.search("combotest", filters.clone(), 10, 0, FieldMask::FULL)?;
15222
15223 assert_eq!(hits.len(), 1, "Combined filter should match exactly 1 doc");
15225
15226 for hit in &hits {
15227 assert_eq!(hit.agent, "claude", "Agent filter violated");
15228 assert_eq!(hit.workspace, "/ws/prod", "Workspace filter violated");
15229 if let Some(ts) = hit.created_at {
15230 assert!((400..=600).contains(&ts), "Date filter violated: ts={ts}");
15231 }
15232 }
15233
15234 let cached = client.search("combotest", filters, 10, 0, FieldMask::FULL)?;
15236 assert_eq!(cached.len(), 1, "Cached result count mismatch");
15237
15238 Ok(())
15239 }
15240
15241 #[test]
15242 fn lexical_hits_normalize_trimmed_local_source_metadata() -> Result<()> {
15243 let dir = TempDir::new()?;
15244 let mut index = TantivyIndex::open_or_create(dir.path())?;
15245
15246 let conv = NormalizedConversation {
15247 agent_slug: "codex".into(),
15248 external_id: None,
15249 title: Some("trimmed local doc".into()),
15250 workspace: None,
15251 source_path: dir.path().join("trimmed-local.jsonl"),
15252 started_at: Some(100),
15253 ended_at: None,
15254 metadata: serde_json::json!({
15255 "cass": {
15256 "origin": {
15257 "source_id": " LOCAL ",
15258 "kind": "local"
15259 }
15260 }
15261 }),
15262 messages: vec![NormalizedMessage {
15263 idx: 0,
15264 role: "user".into(),
15265 author: None,
15266 created_at: Some(100),
15267 content: "trimmed local lexical".into(),
15268 extra: serde_json::json!({}),
15269 snippets: vec![],
15270 invocations: Vec::new(),
15271 }],
15272 };
15273 index.add_conversation(&conv)?;
15274 index.commit()?;
15275
15276 let client = SearchClient::open(dir.path(), None)?.expect("index present");
15277 let hits = client.search("trimmed", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
15278
15279 assert_eq!(hits.len(), 1);
15280 assert_eq!(hits[0].source_id, "local");
15281 assert_eq!(hits[0].origin_kind, "local");
15282
15283 Ok(())
15284 }
15285
15286 #[test]
15287 fn lexical_hits_normalize_remote_origin_kind_without_source_id() -> Result<()> {
15288 let dir = TempDir::new()?;
15289 let mut index = TantivyIndex::open_or_create(dir.path())?;
15290
15291 let conv = NormalizedConversation {
15292 agent_slug: "codex".into(),
15293 external_id: None,
15294 title: Some("remote lexical doc".into()),
15295 workspace: None,
15296 source_path: dir.path().join("remote-lexical.jsonl"),
15297 started_at: Some(100),
15298 ended_at: None,
15299 metadata: serde_json::json!({
15300 "cass": {
15301 "origin": {
15302 "source_id": " ",
15303 "kind": "ssh",
15304 "host": "dev@laptop"
15305 }
15306 }
15307 }),
15308 messages: vec![NormalizedMessage {
15309 idx: 0,
15310 role: "user".into(),
15311 author: None,
15312 created_at: Some(100),
15313 content: "remote lexical".into(),
15314 extra: serde_json::json!({}),
15315 snippets: vec![],
15316 invocations: Vec::new(),
15317 }],
15318 };
15319 index.add_conversation(&conv)?;
15320 index.commit()?;
15321
15322 let client = SearchClient::open(dir.path(), None)?.expect("index present");
15323 let hits = client.search("remote", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
15324
15325 assert_eq!(hits.len(), 1);
15326 assert_eq!(hits[0].source_id, "dev@laptop");
15327 assert_eq!(hits[0].origin_kind, "remote");
15328 assert_eq!(hits[0].origin_host.as_deref(), Some("dev@laptop"));
15329
15330 Ok(())
15331 }
15332
15333 #[test]
15334 fn lexical_hits_infer_remote_origin_from_host_without_kind() -> Result<()> {
15335 let dir = TempDir::new()?;
15336 let mut index = TantivyIndex::open_or_create(dir.path())?;
15337
15338 let conv = NormalizedConversation {
15339 agent_slug: "codex".into(),
15340 external_id: None,
15341 title: Some("legacy host-only lexical doc".into()),
15342 workspace: None,
15343 source_path: dir.path().join("legacy-host-only-lexical.jsonl"),
15344 started_at: Some(100),
15345 ended_at: None,
15346 metadata: serde_json::json!({
15347 "cass": {
15348 "origin": {
15349 "source_id": " ",
15350 "host": "dev@laptop"
15351 }
15352 }
15353 }),
15354 messages: vec![NormalizedMessage {
15355 idx: 0,
15356 role: "user".into(),
15357 author: None,
15358 created_at: Some(100),
15359 content: "legacy remote lexical".into(),
15360 extra: serde_json::json!({}),
15361 snippets: vec![],
15362 invocations: Vec::new(),
15363 }],
15364 };
15365 index.add_conversation(&conv)?;
15366 index.commit()?;
15367
15368 let client = SearchClient::open(dir.path(), None)?.expect("index present");
15369 let hits = client.search("legacy", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
15370
15371 assert_eq!(hits.len(), 1);
15372 assert_eq!(hits[0].source_id, "dev@laptop");
15373 assert_eq!(hits[0].origin_kind, "remote");
15374 assert_eq!(hits[0].origin_host.as_deref(), Some("dev@laptop"));
15375
15376 Ok(())
15377 }
15378
#[test]
/// Verifies that source filters restrict lexical hits to the expected source.
///
/// Indexes one conversation with empty metadata, then searches with
/// `SourceFilter::Local` and with `SourceFilter::SourceId(" LOCAL ")`. Each
/// search must return at least one hit, and every hit must report
/// `source_id == "local"`.
fn filter_fidelity_source_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let conv_local = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("local doc".into()),
        workspace: None,
        source_path: dir.path().join("local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: "source filter test local".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv_local)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let filters = SearchFilters {
        source_filter: SourceFilter::Local,
        ..Default::default()
    };

    // `filters` is not used again, so it is moved into the call instead of cloned.
    let hits = client.search("source", filters, 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        assert_eq!(
            hit.source_id, "local",
            "Source filter violated: got source_id '{}' instead of 'local'",
            hit.source_id
        );
    }
    assert!(!hits.is_empty(), "Should have found local results");

    // The explicit source id is given with surrounding whitespace and upper case;
    // the test expects it to still select the "local" source.
    let filters_id = SearchFilters {
        source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
        ..Default::default()
    };

    let hits_id = client.search("source", filters_id, 10, 0, FieldMask::FULL)?;
    for hit in &hits_id {
        assert_eq!(
            hit.source_id, "local",
            "SourceId filter violated: got '{}' instead of 'local'",
            hit.source_id
        );
    }
    assert!(
        !hits_id.is_empty(),
        "Should have found results for source_id=local"
    );

    Ok(())
}
15452
#[test]
/// Cache keys for the same query must differ when the filters differ and
/// must match when the filters are equal.
fn filter_fidelity_cache_key_isolation() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Every key in this test is derived from the same query text.
    let key_for = |filters: &SearchFilters| client.cache_key("test", filters);

    let no_filters = SearchFilters::default();
    let agent_only = {
        let mut filters = SearchFilters::default();
        filters.agents.insert("codex".into());
        filters
    };
    let workspace_only = {
        let mut filters = SearchFilters::default();
        filters.workspaces.insert("/ws".into());
        filters
    };

    let empty_key = key_for(&no_filters);
    let agent_key = key_for(&agent_only);
    let workspace_key = key_for(&workspace_only);

    assert_ne!(
        empty_key, agent_key,
        "Empty vs agent filter keys should differ"
    );
    assert_ne!(
        empty_key, workspace_key,
        "Empty vs workspace filter keys should differ"
    );
    assert_ne!(
        agent_key, workspace_key,
        "Agent vs workspace filter keys should differ"
    );

    // An independently built but equal filter set must yield the same key.
    let agent_again = {
        let mut filters = SearchFilters::default();
        filters.agents.insert("codex".into());
        filters
    };
    let agent_key_again = key_for(&agent_again);
    assert_eq!(agent_key, agent_key_again, "Same filter should produce same key");
}
15504
#[test]
/// Unicode letters and digits must pass through query sanitization unchanged.
fn sanitize_query_preserves_unicode_alphanumeric() {
    for text in ["こんにちは", "café", "日本語123"] {
        assert_eq!(fs_cass_sanitize_query(text), text);
    }
}
15519
#[test]
/// Pins the sanitizer output for runs of hyphens and runs of other punctuation.
fn sanitize_query_handles_multiple_consecutive_special_chars() {
    // (raw query, expected sanitized query)
    let cases = [("foo---bar", "foo---bar"), ("a!@#$%^&()b", "a b")];
    for (raw, sanitized) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), sanitized);
    }
}
15526
#[test]
/// A pattern made only of `*` characters must parse to an empty exact pattern.
fn wildcard_pattern_empty_after_trim_returns_exact_empty() {
    for stars in ["*", "**", "***"] {
        assert_eq!(
            FsCassWildcardPattern::parse(stars),
            FsCassWildcardPattern::Exact(String::new())
        );
    }
}
15544
#[test]
/// Exact and prefix patterns yield no regex; suffix and substring patterns do.
fn wildcard_pattern_to_regex_generation() {
    let exact = FsCassWildcardPattern::Exact("foo".into());
    assert_eq!(exact.to_regex(), None);

    let prefix = FsCassWildcardPattern::Prefix("foo".into());
    assert_eq!(prefix.to_regex(), None);

    let suffix = FsCassWildcardPattern::Suffix("foo".into());
    assert_eq!(suffix.to_regex(), Some(".*foo$".into()));

    let substring = FsCassWildcardPattern::Substring("foo".into());
    assert_eq!(substring.to_regex(), Some(".*foo.*".into()));
}
15561
#[test]
/// A leading `-` on a term must parse as a `Not` token followed by the term.
fn parse_boolean_query_prefix_minus_not() {
    let lone_negation = vec![
        FsCassQueryToken::Not,
        FsCassQueryToken::Term("world".into()),
    ];
    assert_eq!(fs_cass_parse_boolean_query("-world"), lone_negation);

    let term_then_negation = vec![
        FsCassQueryToken::Term("hello".into()),
        FsCassQueryToken::Not,
        FsCassQueryToken::Term("world".into()),
    ];
    assert_eq!(
        fs_cass_parse_boolean_query("hello -world"),
        term_then_negation
    );
}
15583
#[test]
/// An empty quoted phrase must produce no token, alone or between terms.
fn parse_boolean_query_empty_quoted_phrase_ignored() {
    assert!(parse_boolean_query("\"\"").is_empty());

    let parsed = parse_boolean_query("foo \"\" bar");
    let wanted: QueryTokenList = vec![
        QueryToken::Term("foo".into()),
        QueryToken::Term("bar".into()),
    ];
    assert_eq!(parsed, wanted);
}
15596
#[test]
/// An unterminated quote must still yield a phrase token with the remaining text.
fn parse_boolean_query_unclosed_quote() {
    let wanted: QueryTokenList = vec![QueryToken::Phrase("hello world".into())];
    let parsed = parse_boolean_query("\"hello world");
    assert_eq!(parsed, wanted);
}
15604
#[test]
/// Queries that start with a unary negation must not be transpiled.
fn transpile_to_fts5_rejects_leading_unary_not_queries() {
    for query in ["NOT foo", "-foo"] {
        assert_eq!(transpile_to_fts5(query), None);
    }
}
15610
#[test]
/// Queries that mix `OR` with `NOT` in these forms must not be transpiled.
fn transpile_to_fts5_rejects_or_not_forms_it_cannot_represent() {
    for query in ["foo OR NOT bar", "foo NOT bar OR baz"] {
        assert_eq!(transpile_to_fts5(query), None);
    }
}
15616
#[test]
/// A dangling leading `OR` must be dropped from the transpiled query.
fn transpile_to_fts5_ignores_leading_or() {
    let plain = transpile_to_fts5("OR test");
    assert_eq!(plain, Some("test".to_string()));

    let hyphenated = transpile_to_fts5("OR foo-bar");
    assert_eq!(hyphenated, Some("(foo AND bar)".to_string()));
}
15625
#[test]
/// Hyphen- and dot-separated terms must become an `AND` group of subterms.
fn transpile_to_fts5_splits_hyphenated_subterms_for_sqlite_fts() {
    // (input query, expected FTS5 expression)
    let cases = [
        ("br-123.jsonl", "(br AND 123 AND jsonl)"),
        ("br-123.json*", "(br AND 123 AND json*)"),
    ];
    for (query, fts_expression) in cases {
        assert_eq!(transpile_to_fts5(query), Some(fts_expression.to_string()));
    }
}
15637
#[test]
/// Binary `NOT` between a term and a following operand must be kept.
fn transpile_to_fts5_preserves_supported_binary_not() {
    let simple = transpile_to_fts5("foo NOT bar");
    assert_eq!(simple.as_deref(), Some("foo NOT bar"));

    let with_hyphen = transpile_to_fts5("foo NOT bar-baz");
    assert_eq!(with_hyphen.as_deref(), Some("foo NOT (bar AND baz)"));
}
15649
#[test]
/// With no SQLite connection or path, the FTS5 search must return `Ok` with no hits.
fn search_sqlite_fts5_returns_empty_when_sqlite_is_unavailable() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: false,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "fts5-disabled".to_string(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let missing_db = Path::new("/nonexistent");
    let outcome = client.search_sqlite_fts5(
        missing_db,
        "test query",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    );

    assert!(outcome.is_ok(), "disabled FTS5 path should stay non-fatal");
    let hits = outcome.unwrap();
    assert!(
        hits.is_empty(),
        "unavailable SQLite fallback should keep returning an empty result set"
    );
}
15684
#[test]
/// Checks that `search_sqlite_fts5` returns a stable ranked prefix across limits.
///
/// Builds an in-memory SQLite database holding six messages that all match the
/// query `rankprobe`, then verifies that:
/// - `limit=6` returns all six hits,
/// - `limit=3` returns exactly three hits whose keys, content, and title equal
///   the first three hits of the `limit=6` run, and
/// - `limit=0` returns no hits.
fn search_sqlite_fts5_rank_and_hydrate_split_preserves_limit_prefix_invariant() -> Result<()> {
    let conn = Connection::open(":memory:")?;
    // Schema used by this test: lookup tables, conversations, messages, and the
    // `fts_messages` FTS5 virtual table.
    conn.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             message_id UNINDEXED,
             tokenize='porter'
         );",
    )?;
    // One source, one agent, and one workspace shared by every conversation below.
    conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/tmp/k0e5p')")?;

    // Six conversations with one message each. Message `i` (0-based) repeats the
    // word "rankprobe" `i + 1` times, so each row has a different term count.
    // NOTE(review): presumably this gives each row a distinct FTS rank — confirm.
    for (i, repeats) in (1..=6_i64).enumerate() {
        let conv_id = i as i64 + 1;
        let msg_id = (i as i64 + 1) * 10;
        conn.execute_compat(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, \
             origin_host, title, source_path) \
             VALUES(?1, 1, 1, 'local', NULL, ?2, ?3)",
            params![
                conv_id,
                format!("k0e5p-{}", i),
                format!("/tmp/k0e5p/{}.jsonl", i),
            ],
        )?;
        let content = "rankprobe ".repeat(repeats as usize);
        conn.execute_compat(
            "INSERT INTO messages(id, conversation_id, idx, content, created_at) \
             VALUES(?1, ?2, ?3, ?4, ?5)",
            params![
                msg_id,
                conv_id,
                i as i64,
                content.as_str(),
                1_700_000_000_i64 + i as i64
            ],
        )?;
        // The FTS row reuses the message id as its rowid and as `message_id`.
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, \
             source_path, created_at, message_id) \
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
            params![
                msg_id,
                content.as_str(),
                format!("k0e5p-{}", i),
                "codex",
                "/tmp/k0e5p",
                format!("/tmp/k0e5p/{}.jsonl", i),
                1_700_000_000_i64 + i as i64,
                msg_id,
            ],
        )?;
    }

    // Client with no Tantivy reader; only the SQLite connection is populated.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: false,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:k0e5p"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Projects each hit to `(source_path, line_number)` so that two result
    // lists can be compared by hit identity and order.
    fn hit_keys(hits: &[SearchHit]) -> Vec<(String, Option<usize>)> {
        hits.iter()
            .map(|h| (h.source_path.clone(), h.line_number))
            .collect()
    }

    // Reference run: the limit equals the number of matching rows.
    let large_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "rankprobe",
        SearchFilters::default(),
        6,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(
        large_hits.len(),
        6,
        "limit=N must return all N candidates when the corpus has exactly N matches"
    );

    // Smaller limit: must be a strict prefix of the reference run.
    let small_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "rankprobe",
        SearchFilters::default(),
        3,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(small_hits.len(), 3, "limit=3 must return exactly 3 hits");

    let large_keys = hit_keys(&large_hits);
    let small_keys = hit_keys(&small_hits);
    assert_eq!(
        small_keys,
        large_keys[..3],
        "limit=3 hit keys MUST be the first 3 of limit=6 hit keys (rank+hydrate \
         split must not re-order or re-filter); small={small_keys:?} \
         large_prefix={:?}",
        &large_keys[..3]
    );

    // Beyond identity, the hydrated fields of the shared prefix must agree too.
    // `zip` stops at the shorter list, so only the first three pairs are compared.
    for (idx, (small, large)) in small_hits.iter().zip(large_hits.iter()).enumerate() {
        assert_eq!(
            small.content, large.content,
            "hit[{idx}] content must agree across limit=3 and limit=6: \
             small={:?} large={:?}",
            small.content, large.content
        );
        assert_eq!(
            small.title, large.title,
            "hit[{idx}] title must agree across limit=3 and limit=6"
        );
    }

    // Boundary case: a zero limit must produce an empty result, not an error.
    let zero_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "rankprobe",
        SearchFilters::default(),
        0,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        zero_hits.is_empty(),
        "limit=0 must return zero hits even though the rank phase has candidates; \
         got {} hits",
        zero_hits.len()
    );

    Ok(())
}
15894
#[test]
/// Identical inputs, including two empty strings, have distance zero.
fn levenshtein_distance_identical_strings() {
    for (left, right) in [("hello", "hello"), ("", "")] {
        assert_eq!(levenshtein_distance(left, right), 0);
    }
}
15902
#[test]
/// Distance equals the number of characters that must be inserted.
fn levenshtein_distance_insertions() {
    let from_empty = levenshtein_distance("", "abc");
    assert_eq!(from_empty, 3);

    let one_appended = levenshtein_distance("cat", "cats");
    assert_eq!(one_appended, 1);
}
15908
#[test]
/// Distance equals the number of characters that must be deleted.
fn levenshtein_distance_deletions() {
    let to_empty = levenshtein_distance("abc", "");
    assert_eq!(to_empty, 3);

    let one_removed = levenshtein_distance("cats", "cat");
    assert_eq!(one_removed, 1);
}
15914
#[test]
/// A single differing character costs exactly one edit.
fn levenshtein_distance_substitutions() {
    for (left, right) in [("cat", "bat"), ("kitten", "sitten")] {
        assert_eq!(levenshtein_distance(left, right), 1);
    }
}
15920
#[test]
/// Classic mixed-edit pairs that each require three operations.
fn levenshtein_distance_mixed_operations() {
    for (left, right) in [("kitten", "sitting"), ("saturday", "sunday")] {
        assert_eq!(levenshtein_distance(left, right), 3);
    }
}
15926
#[test]
/// Ordinary prose, even when it mentions tools or starts with a tool marker
/// followed by substantial text, must not be classified as noise.
fn is_tool_invocation_noise_allows_real_content() {
    let long_content = "[Tool: Read] Now here is a lot of useful content that explains the implementation details and provides context for the changes being made to the codebase.";
    let real_messages = [
        "This is a normal message",
        "Let me use the Tool feature to accomplish this task. Here is the implementation...",
        long_content,
    ];
    for message in real_messages {
        assert!(!is_tool_invocation_noise(message));
    }
}
15939
#[test]
/// Short bare tool markers must be classified as noise.
fn is_tool_invocation_noise_handles_short_tool_markers() {
    for marker in ["[tool: x]", "tool: bash"] {
        assert!(is_tool_invocation_noise(marker));
    }
}
15945
#[test]
/// Explicit `AND` must keep only the document that contains both operands.
fn search_boolean_and_filters_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message content) for each indexed conversation.
    let fixtures = [
        ("doc1", "1.jsonl", 1, "alpha beta gamma"),
        ("doc2", "2.jsonl", 2, "alpha delta"),
    ];
    for (title, file_name, timestamp, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(timestamp),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, word that the single surviving hit must contain)
    let expectations = [("alpha AND beta", "gamma"), ("alpha AND delta", "delta")];
    for (query, marker) in expectations {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(hits.len(), 1);
        assert!(hits[0].content.contains(marker));
    }

    Ok(())
}
16024
#[test]
/// Explicit `OR` must return documents matching either operand.
fn search_boolean_or_expands_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message content) for each indexed conversation.
    let fixtures = [
        ("doc1", "1.jsonl", 1, "unique xyzzy term"),
        ("doc2", "2.jsonl", 2, "unique plugh term"),
    ];
    for (title, file_name, timestamp, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(timestamp),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let either = client.search(
        "xyzzy OR plugh",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(either.len(), 2);

    Ok(())
}
16088
#[test]
/// Both `NOT term` and the `-term` prefix must exclude documents containing the term.
fn search_boolean_not_excludes_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message content) for each indexed conversation.
    let fixtures = [
        ("doc1", "1.jsonl", 1, "nottest keep this"),
        ("doc2", "2.jsonl", 2, "nottest exclude this"),
    ];
    for (title, file_name, timestamp, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(timestamp),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, failure message if the excluded document leaks through)
    let negations = [
        (
            "nottest NOT exclude",
            "NOT exclude should filter out doc with 'exclude'",
        ),
        (
            "nottest -exclude",
            "Prefix -exclude should filter out doc with 'exclude'",
        ),
    ];
    for (query, failure) in negations {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(hits.len(), 1);
        assert!(!hits[0].content.contains("exclude"), "{failure}");
    }

    Ok(())
}
16171
#[test]
/// Unquoted terms match in any order; a quoted phrase matches only the exact sequence.
fn search_phrase_query_matches_exact_sequence() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message content) for each indexed conversation.
    let fixtures = [
        ("doc1", "1.jsonl", 1, "the quick brown fox"),
        ("doc2", "2.jsonl", 2, "the brown quick fox"),
    ];
    for (title, file_name, timestamp, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(timestamp),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let run = |query: &str| client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL);

    // Without quotes both documents contain both words.
    let unquoted = run("quick brown")?;
    assert_eq!(unquoted.len(), 2);

    // With quotes only the document with the words adjacent and in order matches.
    let quoted = run("\"quick brown\"")?;
    assert_eq!(quoted.len(), 1);
    assert!(quoted[0].content.contains("quick brown"));

    Ok(())
}
16246
#[test]
/// `foo.bar` must match a document containing "foo bar", while `foo-bar` must not.
fn search_dot_punctuation_splits_terms_but_hyphens_preserve_compound_semantics() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "foo bar baz".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("doc".into()),
        workspace: None,
        source_path: dir.path().join("3.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, expected number of hits)
    let cases = [("foo.bar", 1_usize), ("foo-bar", 0)];
    for (query, expected_hits) in cases {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(hits.len(), expected_hits);
    }

    Ok(())
}
16285
#[test]
/// A single bare word must be explained as a simple, low-cost query.
///
/// Checks the query type, index strategy, and estimated cost, and that the
/// query parses to exactly one term whose first subterm has the "exact" pattern.
fn explanation_classifies_simple_query() {
    let exp = QueryExplanation::analyze("hello", &SearchFilters::default());
    assert_eq!(exp.query_type, QueryType::Simple);
    assert_eq!(exp.index_strategy, IndexStrategy::EdgeNgram);
    assert_eq!(exp.estimated_cost, QueryCost::Low);
    // `assert_eq!` reports the actual term count on failure, unlike `assert!(len == 1)`.
    assert_eq!(exp.parsed.terms.len(), 1);
    assert_eq!(exp.parsed.terms[0].text, "hello");
    assert!(!exp.parsed.terms[0].subterms.is_empty());
    assert_eq!(exp.parsed.terms[0].subterms[0].pattern, "exact");
}
16301
#[test]
/// A `*term*` query must be explained as a high-cost wildcard regex scan with a warning.
fn explanation_classifies_wildcard_query() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("*handler*", &no_filters);

    assert_eq!(explanation.query_type, QueryType::Wildcard);
    assert_eq!(explanation.index_strategy, IndexStrategy::RegexScan);
    assert_eq!(explanation.estimated_cost, QueryCost::High);

    let subterms = &explanation.parsed.terms[0].subterms;
    assert!(!subterms.is_empty());
    assert!(subterms[0].pattern.contains("substring"));

    let warns_about_scan = explanation
        .warnings
        .iter()
        .any(|warning| warning.contains("regex scan"));
    assert!(warns_about_scan);
}
16316
#[test]
/// An explicit `AND` must be classified as a boolean query and recorded as an operator.
fn explanation_classifies_boolean_query() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("foo AND bar", &no_filters);

    assert_eq!(explanation.query_type, QueryType::Boolean);
    assert_eq!(
        explanation.index_strategy,
        IndexStrategy::BooleanCombination
    );
    let and_operator = "AND".to_string();
    assert!(explanation.parsed.operators.contains(&and_operator));
}
16324
#[test]
/// A quoted query must be classified as a phrase and its text recorded.
fn explanation_classifies_phrase_query() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("\"exact phrase\"", &no_filters);

    assert_eq!(explanation.query_type, QueryType::Phrase);
    let phrase = "exact phrase".to_string();
    assert!(explanation.parsed.phrases.contains(&phrase));
}
16331
#[test]
/// An agent filter must change the query type to filtered and be summarized.
fn explanation_handles_filtered_query() {
    let mut agent_filter = SearchFilters::default();
    agent_filter.agents.insert("codex".to_string());

    let explanation = QueryExplanation::analyze("test", &agent_filter);
    assert_eq!(explanation.query_type, QueryType::Filtered);

    let summary = &explanation.filters_summary;
    assert_eq!(summary.agent_count, 1);
    let description = summary.description.as_ref().unwrap();
    assert!(description.contains("1 agent"));

    let mentions_agent = explanation
        .warnings
        .iter()
        .any(|warning| warning.contains("codex"));
    assert!(mentions_agent);
}
16349
#[test]
/// An empty query must be explained as a high-cost full scan with a warning.
fn explanation_handles_empty_query() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("", &no_filters);

    assert_eq!(explanation.query_type, QueryType::Empty);
    assert_eq!(explanation.index_strategy, IndexStrategy::FullScan);
    assert_eq!(explanation.estimated_cost, QueryCost::High);

    let warns_about_empty = explanation
        .warnings
        .iter()
        .any(|warning| warning.contains("Empty query"));
    assert!(warns_about_empty);
}
16358
#[test]
/// A one-character query must produce a "Very short term" warning.
fn explanation_warns_short_terms() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("a", &no_filters);

    let warns_about_length = explanation
        .warnings
        .iter()
        .any(|warning| warning.contains("Very short term"));
    assert!(warns_about_length);
}
16364
#[test]
/// Enabling wildcard fallback must set the flag and add a matching warning.
fn explanation_with_wildcard_fallback() {
    let no_filters = SearchFilters::default();
    let base = QueryExplanation::analyze("test", &no_filters);
    let explanation = base.with_wildcard_fallback(true);

    assert!(explanation.wildcard_applied);
    let warns_about_fallback = explanation
        .warnings
        .iter()
        .any(|warning| warning.contains("Wildcard fallback"));
    assert!(warns_about_fallback);
}
16373
#[test]
/// A query mixing several operators and a phrase must cost at least medium.
fn explanation_complex_query_has_higher_cost() {
    let no_filters = SearchFilters::default();
    let query = "foo AND bar OR baz NOT qux AND \"phrase here\"";
    let explanation = QueryExplanation::analyze(query, &no_filters);

    assert_eq!(explanation.query_type, QueryType::Boolean);
    let cost_is_elevated = matches!(
        explanation.estimated_cost,
        QueryCost::Medium | QueryCost::High
    );
    assert!(cost_is_elevated);
}
16387
#[test]
/// The original query is stored verbatim; the sanitized form drops the `!`.
fn explanation_preserves_original_query() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("Hello World!", &no_filters);

    assert_eq!(explanation.original_query, "Hello World!");

    let sanitized = &explanation.sanitized_query;
    assert!(sanitized.contains("Hello"));
    assert!(!sanitized.contains("!"));
}
16397
#[test]
/// `NOT` must be recorded as an operator and mark the following term as negated.
fn explanation_detects_not_operator() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("foo NOT bar", &no_filters);

    let not_operator = "NOT".to_string();
    assert!(explanation.parsed.operators.contains(&not_operator));

    let bar_is_negated = explanation
        .parsed
        .terms
        .iter()
        .any(|term| term.text == "bar" && term.negated);
    assert!(bar_is_negated);
}
16410
#[test]
/// Two bare words must be reported as an implicit AND of two terms.
fn explanation_implicit_and() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("foo bar", &no_filters);

    let parsed = &explanation.parsed;
    assert!(parsed.implicit_and);
    assert_eq!(parsed.terms.len(), 2);
}
16417
#[test]
/// The explanation must serialize with string-valued summary fields and a terms array.
fn explanation_serializes_to_json() {
    let no_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("test query", &no_filters);
    let json = serde_json::to_value(&explanation).expect("should serialize");

    for key in [
        "original_query",
        "query_type",
        "index_strategy",
        "estimated_cost",
    ] {
        assert!(json[key].is_string());
    }
    assert!(json["parsed"]["terms"].is_array());
}
16428
#[test]
/// Agent, workspace, and time-range filters combined must leave exactly one hit.
fn search_multi_filter_agent_workspace_time() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, workspace, timestamp, message content); only the first row
    // satisfies all three filters applied below.
    let fixtures = [
        ("codex", "/ws/alpha", 100, "needle alpha codex"),
        ("claude", "/ws/alpha", 200, "needle alpha claude"),
        ("codex", "/ws/beta", 150, "needle beta codex"),
        ("codex", "/ws/alpha", 300, "needle alpha codex late"),
    ];

    for (i, &(agent, workspace, timestamp, body)) in fixtures.iter().enumerate() {
        let conversation = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("conv-{i}")),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: dir.path().join(format!("{i}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(timestamp),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut combined = SearchFilters {
        created_from: Some(50),
        created_to: Some(250),
        ..Default::default()
    };
    combined.agents.insert("codex".into());
    combined.workspaces.insert("/ws/alpha".into());

    let hits = client.search("needle", combined, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        hits.len(),
        1,
        "Should match only one conv (codex + alpha + ts=100)"
    );

    let only_hit = &hits[0];
    assert_eq!(only_hit.agent, "codex");
    assert_eq!(only_hit.workspace, "/ws/alpha");
    assert!(only_hit.content.contains("alpha codex"));
    assert!(!only_hit.content.contains("late"));

    Ok(())
}
16494
#[test]
/// A filter naming two agents must return hits from exactly those two agents.
fn search_multi_agent_filter() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // One conversation per agent, all in the same workspace with the same timestamp.
    let all_agents = ["codex", "claude", "cline", "gemini"];
    for agent in all_agents {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: format!("needle from {agent}"),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("{agent}-conv")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: dir.path().join(format!("{agent}.jsonl")),
            started_at: Some(100),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut two_agents = SearchFilters::default();
    for wanted in ["codex", "claude"] {
        two_agents.agents.insert(wanted.into());
    }

    let hits = client.search("needle", two_agents, 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 2);

    let returned: Vec<_> = hits.iter().map(|hit| hit.agent.as_str()).collect();
    for included in ["codex", "claude"] {
        assert!(returned.contains(&included));
    }
    for excluded in ["cline", "gemini"] {
        assert!(!returned.contains(&excluded));
    }

    Ok(())
}
16543
16544 #[test]
16549 fn cache_metrics_incremented_on_operations() {
16550 let client = SearchClient {
16551 reader: None,
16552 sqlite: Mutex::new(None),
16553 sqlite_path: None,
16554 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
16555 reload_on_search: true,
16556 last_reload: Mutex::new(None),
16557 last_generation: Mutex::new(None),
16558 reload_epoch: Arc::new(AtomicU64::new(0)),
16559 warm_tx: None,
16560 _warm_handle: None,
16561 metrics: Metrics::default(),
16562 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
16563 semantic: Mutex::new(None),
16564 last_tantivy_total_count: Mutex::new(None),
16565 };
16566
16567 let (hits, miss, shortfall, reloads, _) = client.metrics.snapshot_all();
16569 assert_eq!((hits, miss, shortfall, reloads), (0, 0, 0, 0));
16570
16571 client.metrics.inc_cache_hits();
16573 client.metrics.inc_cache_hits();
16574 client.metrics.inc_cache_miss();
16575 client.metrics.inc_cache_shortfall();
16576 client.metrics.inc_reload();
16577
16578 let (hits, miss, shortfall, reloads, _) = client.metrics.snapshot_all();
16579 assert_eq!(hits, 2);
16580 assert_eq!(miss, 1);
16581 assert_eq!(shortfall, 1);
16582 assert_eq!(reloads, 1);
16583 }
16584
16585 #[test]
16586 fn cache_shard_name_deterministic() {
16587 let client = SearchClient {
16589 reader: None,
16590 sqlite: Mutex::new(None),
16591 sqlite_path: None,
16592 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
16593 reload_on_search: true,
16594 last_reload: Mutex::new(None),
16595 last_generation: Mutex::new(None),
16596 reload_epoch: Arc::new(AtomicU64::new(0)),
16597 warm_tx: None,
16598 _warm_handle: None,
16599 metrics: Metrics::default(),
16600 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
16601 semantic: Mutex::new(None),
16602 last_tantivy_total_count: Mutex::new(None),
16603 };
16604
16605 let filters1 = SearchFilters::default();
16606 let mut filters2 = SearchFilters::default();
16607 filters2.agents.insert("codex".into());
16608 let mut filters3 = SearchFilters::default();
16609 filters3.workspaces.insert("/tmp/cass-workspace".into());
16610
16611 let shard1_first = client.shard_name(&filters1);
16613 let shard1_second = client.shard_name(&filters1);
16614 assert_eq!(
16615 shard1_first, shard1_second,
16616 "Same filters should produce same shard name"
16617 );
16618
16619 let shard2 = client.shard_name(&filters2);
16621 assert_ne!(
16622 shard1_first, shard2,
16623 "Different filters should produce different shard names"
16624 );
16625
16626 assert_eq!(shard2, client.shard_name(&filters2));
16628 assert_eq!(
16629 client.shard_name(&filters3),
16630 "workspace:/tmp/cass-workspace"
16631 );
16632 }
16633
16634 #[test]
16639 fn wildcard_fallback_respects_filter_constraints() -> Result<()> {
16640 let dir = TempDir::new()?;
16641 let mut index = TantivyIndex::open_or_create(dir.path())?;
16642
16643 let conv_match = NormalizedConversation {
16645 agent_slug: "codex".into(),
16646 external_id: None,
16647 title: Some("match".into()),
16648 workspace: Some(std::path::PathBuf::from("/target")),
16649 source_path: dir.path().join("match.jsonl"),
16650 started_at: Some(100),
16651 ended_at: None,
16652 metadata: serde_json::json!({}),
16653 messages: vec![NormalizedMessage {
16654 idx: 0,
16655 role: "user".into(),
16656 author: None,
16657 created_at: Some(100),
16658 content: "unique specific term here".into(),
16659 extra: serde_json::json!({}),
16660 snippets: vec![],
16661 invocations: Vec::new(),
16662 }],
16663 };
16664
16665 let conv_other = NormalizedConversation {
16666 agent_slug: "claude".into(),
16667 external_id: None,
16668 title: Some("other".into()),
16669 workspace: Some(std::path::PathBuf::from("/other")),
16670 source_path: dir.path().join("other.jsonl"),
16671 started_at: Some(100),
16672 ended_at: None,
16673 metadata: serde_json::json!({}),
16674 messages: vec![NormalizedMessage {
16675 idx: 0,
16676 role: "user".into(),
16677 author: None,
16678 created_at: Some(100),
16679 content: "unique specific also here".into(),
16680 extra: serde_json::json!({}),
16681 snippets: vec![],
16682 invocations: Vec::new(),
16683 }],
16684 };
16685
16686 index.add_conversation(&conv_match)?;
16687 index.add_conversation(&conv_other)?;
16688 index.commit()?;
16689
16690 let client = SearchClient::open(dir.path(), None)?.expect("index present");
16691
16692 let mut filters = SearchFilters::default();
16694 filters.agents.insert("codex".into());
16695
16696 let result =
16697 client.search_with_fallback("unique", filters.clone(), 10, 0, 100, FieldMask::FULL)?;
16698 assert!(result.hits.iter().all(|h| h.agent == "codex"));
16700
16701 Ok(())
16702 }
16703
16704 #[test]
16705 fn wildcard_fallback_short_query_triggers_prefix() -> Result<()> {
16706 let dir = TempDir::new()?;
16707 let mut index = TantivyIndex::open_or_create(dir.path())?;
16708
16709 let conv = NormalizedConversation {
16710 agent_slug: "codex".into(),
16711 external_id: None,
16712 title: Some("test".into()),
16713 workspace: None,
16714 source_path: dir.path().join("test.jsonl"),
16715 started_at: Some(100),
16716 ended_at: None,
16717 metadata: serde_json::json!({}),
16718 messages: vec![NormalizedMessage {
16719 idx: 0,
16720 role: "user".into(),
16721 author: None,
16722 created_at: Some(100),
16723 content: "authentication authorization oauth".into(),
16724 extra: serde_json::json!({}),
16725 snippets: vec![],
16726 invocations: Vec::new(),
16727 }],
16728 };
16729 index.add_conversation(&conv)?;
16730 index.commit()?;
16731
16732 let client = SearchClient::open(dir.path(), None)?.expect("index present");
16733
16734 let result = client.search_with_fallback(
16736 "auth",
16737 SearchFilters::default(),
16738 10,
16739 0,
16740 100,
16741 FieldMask::FULL,
16742 )?;
16743 assert!(
16744 !result.hits.is_empty(),
16745 "Short prefix should match via prefix search"
16746 );
16747 assert!(result.hits[0].content.contains("auth"));
16748
16749 Ok(())
16750 }
16751
16752 #[test]
16757 fn search_real_fixture_multiple_messages() -> Result<()> {
16758 let dir = TempDir::new()?;
16759 let mut index = TantivyIndex::open_or_create(dir.path())?;
16760
16761 let conv = NormalizedConversation {
16763 agent_slug: "claude_code".into(),
16764 external_id: Some("conv-123".into()),
16765 title: Some("Implementing authentication".into()),
16766 workspace: Some(std::path::PathBuf::from("/home/user/project")),
16767 source_path: dir.path().join("session-1.jsonl"),
16768 started_at: Some(1700000000000),
16769 ended_at: Some(1700000060000),
16770 metadata: serde_json::json!({
16771 "model": "claude-3-sonnet",
16772 "tokens": 1500
16773 }),
16774 messages: vec![
16775 NormalizedMessage {
16776 idx: 0,
16777 role: "user".into(),
16778 author: Some("developer".into()),
16779 created_at: Some(1700000000000),
16780 content: "Help me implement JWT authentication for my Express API".into(),
16781 extra: serde_json::json!({}),
16782 snippets: vec![],
16783 invocations: Vec::new(),
16784 },
16785 NormalizedMessage {
16786 idx: 1,
16787 role: "assistant".into(),
16788 author: Some("claude".into()),
16789 created_at: Some(1700000010000),
16790 content: "I'll help you implement JWT authentication. First, let's install the required packages.".into(),
16791 extra: serde_json::json!({}),
16792 snippets: vec![NormalizedSnippet {
16793 file_path: Some("package.json".into()),
16794 start_line: Some(1),
16795 end_line: Some(5),
16796 language: Some("json".into()),
16797 snippet_text: Some(r#"{"dependencies":{"jsonwebtoken":"^9.0.0"}}"#.into()),
16798 }],
16799 invocations: Vec::new(),
16800 },
16801 NormalizedMessage {
16802 idx: 2,
16803 role: "user".into(),
16804 author: Some("developer".into()),
16805 created_at: Some(1700000030000),
16806 content: "Can you also add refresh token support?".into(),
16807 extra: serde_json::json!({}),
16808 snippets: vec![],
16809 invocations: Vec::new(),
16810 },
16811 ],
16812 };
16813 index.add_conversation(&conv)?;
16814 index.commit()?;
16815
16816 let client = SearchClient::open(dir.path(), None)?.expect("index present");
16817
16818 let hits = client.search(
16820 "JWT authentication",
16821 SearchFilters::default(),
16822 10,
16823 0,
16824 FieldMask::FULL,
16825 )?;
16826 assert!(!hits.is_empty(), "Should find JWT authentication");
16827 assert!(hits.iter().any(|h| h.agent == "claude_code"));
16828 assert!(
16829 hits.iter()
16830 .any(|h| h.snippet.contains("JWT") || h.snippet.contains("authentication"))
16831 );
16832
16833 let hits = client.search(
16835 "required packages",
16836 SearchFilters::default(),
16837 10,
16838 0,
16839 FieldMask::FULL,
16840 )?;
16841 assert!(
16842 !hits.is_empty(),
16843 "Should find 'required packages' in assistant response"
16844 );
16845
16846 let hits = client.search(
16848 "refresh token",
16849 SearchFilters::default(),
16850 10,
16851 0,
16852 FieldMask::FULL,
16853 )?;
16854 assert!(!hits.is_empty(), "Should find refresh token");
16855 assert!(hits.iter().any(|h| h.content.contains("refresh")));
16856
16857 Ok(())
16858 }
16859
16860 #[test]
16861 fn search_deduplication_with_similar_content() -> Result<()> {
16862 let dir = TempDir::new()?;
16863 let mut index = TantivyIndex::open_or_create(dir.path())?;
16864
16865 for i in 0..2 {
16867 let conv = NormalizedConversation {
16868 agent_slug: "codex".into(),
16869 external_id: None,
16870 title: Some(format!("similar-{i}")),
16871 workspace: Some(std::path::PathBuf::from("/ws")),
16872 source_path: dir.path().join(format!("similar-{i}.jsonl")),
16873 started_at: Some(100 + i),
16874 ended_at: None,
16875 metadata: serde_json::json!({}),
16876 messages: vec![NormalizedMessage {
16877 idx: 0,
16878 role: "user".into(),
16879 author: None,
16880 created_at: Some(100 + i),
16881 content: "implement the sorting algorithm".into(),
16883 extra: serde_json::json!({}),
16884 snippets: vec![],
16885 invocations: Vec::new(),
16886 }],
16887 };
16888 index.add_conversation(&conv)?;
16889 }
16890 index.commit()?;
16891
16892 let client = SearchClient::open(dir.path(), None)?.expect("index present");
16893 let result = client.search_with_fallback(
16894 "sorting algorithm",
16895 SearchFilters::default(),
16896 10,
16897 0,
16898 100,
16899 FieldMask::FULL,
16900 )?;
16901
16902 assert!(!result.hits.is_empty());
16905
16906 Ok(())
16907 }
16908
16909 #[test]
16914 fn search_session_paths_filter() -> Result<()> {
16915 let dir = TempDir::new()?;
16917 let mut index = TantivyIndex::open_or_create(dir.path())?;
16918
16919 let paths = [
16921 dir.path().join("session-a.jsonl"),
16922 dir.path().join("session-b.jsonl"),
16923 dir.path().join("session-c.jsonl"),
16924 ];
16925
16926 for (i, path) in paths.iter().enumerate() {
16927 let conv = NormalizedConversation {
16928 agent_slug: "claude".into(),
16929 external_id: None,
16930 title: Some(format!("session-{}", i)),
16931 workspace: Some(std::path::PathBuf::from("/ws")),
16932 source_path: path.clone(),
16933 started_at: Some(100 + i as i64),
16934 ended_at: None,
16935 metadata: serde_json::json!({}),
16936 messages: vec![NormalizedMessage {
16937 idx: 0,
16938 role: "user".into(),
16939 author: None,
16940 created_at: Some(100 + i as i64),
16941 content: format!("needle content for session {}", i),
16942 extra: serde_json::json!({}),
16943 snippets: vec![],
16944 invocations: Vec::new(),
16945 }],
16946 };
16947 index.add_conversation(&conv)?;
16948 }
16949 index.commit()?;
16950
16951 let client = SearchClient::open(dir.path(), None)?.expect("index present");
16952
16953 let hits_all = client.search("needle", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
16955 assert_eq!(hits_all.len(), 3, "Should find all 3 sessions");
16956
16957 let mut filters = SearchFilters::default();
16959 filters
16960 .session_paths
16961 .insert(paths[0].to_string_lossy().to_string());
16962 filters
16963 .session_paths
16964 .insert(paths[2].to_string_lossy().to_string());
16965
16966 let hits_filtered = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
16967 assert_eq!(
16968 hits_filtered.len(),
16969 2,
16970 "Should find only 2 sessions (A and C)"
16971 );
16972
16973 let filtered_paths: HashSet<&str> = hits_filtered
16975 .iter()
16976 .map(|h| h.source_path.as_str())
16977 .collect();
16978 assert!(filtered_paths.contains(paths[0].to_string_lossy().as_ref()));
16979 assert!(filtered_paths.contains(paths[2].to_string_lossy().as_ref()));
16980 assert!(!filtered_paths.contains(paths[1].to_string_lossy().as_ref()));
16981
16982 Ok(())
16983 }
16984
16985 #[test]
16986 fn lexical_session_paths_filter_retries_past_initial_page() -> Result<()> {
16987 let dir = TempDir::new()?;
16988 let mut index = TantivyIndex::open_or_create(dir.path())?;
16989 let requested_path = dir.path().join("requested-session.jsonl");
16990
16991 for i in 0..4 {
16992 let conv = NormalizedConversation {
16993 agent_slug: "claude".into(),
16994 external_id: None,
16995 title: Some(format!("distractor-{i}")),
16996 workspace: Some(std::path::PathBuf::from("/ws")),
16997 source_path: dir.path().join(format!("distractor-{i}.jsonl")),
16998 started_at: Some(100 + i as i64),
16999 ended_at: None,
17000 metadata: serde_json::json!({}),
17001 messages: vec![NormalizedMessage {
17002 idx: 0,
17003 role: "user".into(),
17004 author: None,
17005 created_at: Some(100 + i as i64),
17006 content: "needle needle needle high ranking distractor".into(),
17007 extra: serde_json::json!({}),
17008 snippets: vec![],
17009 invocations: Vec::new(),
17010 }],
17011 };
17012 index.add_conversation(&conv)?;
17013 }
17014
17015 let requested = NormalizedConversation {
17016 agent_slug: "claude".into(),
17017 external_id: None,
17018 title: Some("requested".into()),
17019 workspace: Some(std::path::PathBuf::from("/ws")),
17020 source_path: requested_path.clone(),
17021 started_at: Some(200),
17022 ended_at: None,
17023 metadata: serde_json::json!({}),
17024 messages: vec![NormalizedMessage {
17025 idx: 0,
17026 role: "user".into(),
17027 author: None,
17028 created_at: Some(200),
17029 content: "needle requested session should survive post-filter paging".into(),
17030 extra: serde_json::json!({}),
17031 snippets: vec![],
17032 invocations: Vec::new(),
17033 }],
17034 };
17035 index.add_conversation(&requested)?;
17036 index.commit()?;
17037
17038 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17039 let mut filters = SearchFilters::default();
17040 filters
17041 .session_paths
17042 .insert(requested_path.to_string_lossy().to_string());
17043
17044 let hits = client.search("needle", filters, 1, 0, FieldMask::FULL)?;
17045
17046 assert_eq!(hits.len(), 1);
17047 assert_eq!(hits[0].source_path, requested_path.to_string_lossy());
17048
17049 Ok(())
17050 }
17051
17052 #[test]
17053 fn search_session_paths_empty_filter_returns_all() -> Result<()> {
17054 let dir = TempDir::new()?;
17056 let mut index = TantivyIndex::open_or_create(dir.path())?;
17057
17058 let conv = NormalizedConversation {
17059 agent_slug: "claude".into(),
17060 external_id: None,
17061 title: Some("test".into()),
17062 workspace: Some(std::path::PathBuf::from("/ws")),
17063 source_path: dir.path().join("test.jsonl"),
17064 started_at: Some(100),
17065 ended_at: None,
17066 metadata: serde_json::json!({}),
17067 messages: vec![NormalizedMessage {
17068 idx: 0,
17069 role: "user".into(),
17070 author: None,
17071 created_at: Some(100),
17072 content: "needle content".into(),
17073 extra: serde_json::json!({}),
17074 snippets: vec![],
17075 invocations: Vec::new(),
17076 }],
17077 };
17078 index.add_conversation(&conv)?;
17079 index.commit()?;
17080
17081 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17082
17083 let filters = SearchFilters::default();
17085 assert!(filters.session_paths.is_empty());
17086
17087 let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
17088 assert_eq!(hits.len(), 1);
17089
17090 Ok(())
17091 }
17092
17093 #[test]
17094 fn search_client_reads_federated_lexical_bundle_as_one_corpus() -> Result<()> {
17095 let root = TempDir::new()?;
17096 let shard_a = root.path().join("shard-a");
17097 let shard_b = root.path().join("shard-b");
17098 let published = root.path().join("published");
17099
17100 let mut shard_a_index = TantivyIndex::open_or_create(&shard_a)?;
17101 let mut shard_b_index = TantivyIndex::open_or_create(&shard_b)?;
17102
17103 let make_conv =
17104 |external_id: &str, title: &str, source_path: &str, tag: &str| NormalizedConversation {
17105 agent_slug: "codex".into(),
17106 external_id: Some(external_id.into()),
17107 title: Some(title.into()),
17108 workspace: Some(std::path::PathBuf::from("/ws")),
17109 source_path: std::path::PathBuf::from(source_path),
17110 started_at: Some(1_700_000_100_000),
17111 ended_at: Some(1_700_000_100_100),
17112 metadata: json!({}),
17113 messages: vec![
17114 NormalizedMessage {
17115 idx: 0,
17116 role: "user".into(),
17117 author: None,
17118 created_at: Some(1_700_000_100_010),
17119 content: format!("shared federated needle {tag} user"),
17120 extra: json!({}),
17121 snippets: vec![],
17122 invocations: Vec::new(),
17123 },
17124 NormalizedMessage {
17125 idx: 1,
17126 role: "assistant".into(),
17127 author: None,
17128 created_at: Some(1_700_000_100_020),
17129 content: format!("shared federated needle {tag} assistant"),
17130 extra: json!({}),
17131 snippets: vec![],
17132 invocations: Vec::new(),
17133 },
17134 ],
17135 };
17136
17137 let conv_a = make_conv(
17138 "fed-query-a",
17139 "Fed Query A",
17140 "/tmp/fed-query-a.jsonl",
17141 "alpha",
17142 );
17143 let conv_b = make_conv(
17144 "fed-query-b",
17145 "Fed Query B",
17146 "/tmp/fed-query-b.jsonl",
17147 "beta",
17148 );
17149
17150 shard_a_index.add_conversation(&conv_a)?;
17151 shard_b_index.add_conversation(&conv_b)?;
17152 shard_a_index.commit()?;
17153 shard_b_index.commit()?;
17154 drop(shard_a_index);
17155 drop(shard_b_index);
17156
17157 crate::search::tantivy::publish_federated_searchable_index_directories(
17158 &published,
17159 &[&shard_a, &shard_b],
17160 )?;
17161
17162 let client = SearchClient::open(&published, None)?.expect("federated index present");
17163 assert!(client.has_tantivy());
17164 assert_eq!(client.total_docs(), 4);
17165
17166 let hits = client.search(
17167 "shared federated needle",
17168 SearchFilters::default(),
17169 10,
17170 0,
17171 FieldMask::FULL,
17172 )?;
17173 assert_eq!(hits.len(), 4);
17174 let observed_order = hits
17175 .iter()
17176 .map(|hit| {
17177 (
17178 hit.source_path.clone(),
17179 hit.line_number,
17180 hit.content.clone(),
17181 hit.score.to_bits(),
17182 )
17183 })
17184 .collect::<Vec<_>>();
17185 let hit_paths = hits
17186 .iter()
17187 .map(|hit| hit.source_path.as_str())
17188 .collect::<std::collections::HashSet<_>>();
17189 assert!(hit_paths.contains("/tmp/fed-query-a.jsonl"));
17190 assert!(hit_paths.contains("/tmp/fed-query-b.jsonl"));
17191
17192 for attempt in 0..3 {
17193 let repeated = client.search(
17194 "shared federated needle",
17195 SearchFilters::default(),
17196 10,
17197 0,
17198 FieldMask::FULL,
17199 )?;
17200 let repeated_order = repeated
17201 .iter()
17202 .map(|hit| {
17203 (
17204 hit.source_path.clone(),
17205 hit.line_number,
17206 hit.content.clone(),
17207 hit.score.to_bits(),
17208 )
17209 })
17210 .collect::<Vec<_>>();
17211 assert_eq!(
17212 repeated_order, observed_order,
17213 "federated lexical query order drifted on repeated attempt {attempt}"
17214 );
17215 }
17216
17217 Ok(())
17218 }
17219
17220 #[test]
17221 fn semantic_search_session_paths_filter_retries_past_initial_candidates() -> Result<()> {
17222 let fixture = build_semantic_test_fixture()?;
17223 let mut filters = SearchFilters::default();
17224 filters
17225 .session_paths
17226 .insert(fixture.source_paths[2].clone());
17227
17228 let (hits, ann_stats) = fixture.client.search_semantic(
17229 "semantic fixture query",
17230 filters,
17231 1,
17232 0,
17233 FieldMask::FULL,
17234 false,
17235 )?;
17236
17237 assert!(
17238 ann_stats.is_none(),
17239 "exact search should not emit ANN stats"
17240 );
17241 assert_eq!(
17242 hits.len(),
17243 1,
17244 "filtered semantic search should still return a hit"
17245 );
17246 assert_eq!(
17247 hits[0].source_path, fixture.source_paths[2],
17248 "semantic search should keep searching until it finds the requested session path"
17249 );
17250
17251 Ok(())
17252 }
17253
17254 #[test]
17255 fn semantic_search_offsets_after_session_paths_filtering() -> Result<()> {
17256 let fixture = build_semantic_test_fixture()?;
17257 let mut filters = SearchFilters::default();
17258 filters
17259 .session_paths
17260 .insert(fixture.source_paths[1].clone());
17261 filters
17262 .session_paths
17263 .insert(fixture.source_paths[2].clone());
17264
17265 let (hits, _) = fixture.client.search_semantic(
17266 "semantic fixture query",
17267 filters,
17268 1,
17269 1,
17270 FieldMask::FULL,
17271 false,
17272 )?;
17273
17274 assert_eq!(
17275 hits.len(),
17276 1,
17277 "second filtered page should still return one hit"
17278 );
17279 assert_eq!(
17280 hits[0].source_path, fixture.source_paths[2],
17281 "offset must apply after semantic deduplication and session path filtering"
17282 );
17283
17284 Ok(())
17285 }
17286
17287 #[test]
17288 fn semantic_search_merges_sharded_vector_indexes() -> Result<()> {
17289 let fixture = build_sharded_semantic_test_fixture()?;
17290 let (hits, ann_stats) = fixture.client.search_semantic(
17291 "semantic fixture query",
17292 SearchFilters::default(),
17293 3,
17294 0,
17295 FieldMask::FULL,
17296 false,
17297 )?;
17298
17299 assert!(
17300 ann_stats.is_none(),
17301 "sharded exact search should not emit ANN stats"
17302 );
17303 assert_eq!(hits.len(), 3);
17304 assert_eq!(hits[0].source_path, fixture.source_paths[0]);
17305 assert_eq!(hits[1].source_path, fixture.source_paths[1]);
17306 assert_eq!(hits[2].source_path, fixture.source_paths[2]);
17307
17308 Ok(())
17309 }
17310
17311 #[test]
17312 fn progressive_phase_overfetches_before_session_paths_filtering() -> Result<()> {
17313 let fixture = build_semantic_test_fixture()?;
17314 let mut filters = SearchFilters::default();
17315 filters
17316 .session_paths
17317 .insert(fixture.source_paths[2].clone());
17318
17319 let results = vec![
17320 FsScoredResult {
17321 doc_id: fixture.doc_ids[0].clone(),
17322 score: 1.0,
17323 source: FsScoreSource::SemanticFast,
17324 index: None,
17325 fast_score: Some(1.0),
17326 quality_score: None,
17327 lexical_score: None,
17328 rerank_score: None,
17329 explanation: None,
17330 metadata: None,
17331 },
17332 FsScoredResult {
17333 doc_id: fixture.doc_ids[1].clone(),
17334 score: 0.9,
17335 source: FsScoreSource::SemanticFast,
17336 index: None,
17337 fast_score: Some(0.9),
17338 quality_score: None,
17339 lexical_score: None,
17340 rerank_score: None,
17341 explanation: None,
17342 metadata: None,
17343 },
17344 FsScoredResult {
17345 doc_id: fixture.doc_ids[2].clone(),
17346 score: 0.8,
17347 source: FsScoreSource::SemanticFast,
17348 index: None,
17349 fast_score: Some(0.8),
17350 quality_score: None,
17351 lexical_score: None,
17352 rerank_score: None,
17353 explanation: None,
17354 metadata: None,
17355 },
17356 ];
17357
17358 let result = fixture.client.progressive_phase_to_result(
17359 &results,
17360 ProgressivePhaseContext {
17361 query: "session path filter",
17362 filters: &filters,
17363 field_mask: FieldMask::FULL,
17364 lexical_cache: None,
17365 limit: 1,
17366 fetch_limit: 3,
17367 },
17368 )?;
17369
17370 assert_eq!(
17371 result.hits.len(),
17372 1,
17373 "progressive phase should retain enough overfetched hits to satisfy post-search session path filtering"
17374 );
17375 assert_eq!(
17376 result.hits[0].source_path, fixture.source_paths[2],
17377 "progressive phase should page after session path filtering"
17378 );
17379
17380 Ok(())
17381 }
17382
17383 #[test]
17388 fn sql_placeholders_empty() {
17389 assert_eq!(sql_placeholders(0), "");
17390 }
17391
17392 #[test]
17393 fn sql_placeholders_single() {
17394 assert_eq!(sql_placeholders(1), "?");
17395 }
17396
17397 #[test]
17398 fn sql_placeholders_multiple() {
17399 assert_eq!(sql_placeholders(3), "?,?,?");
17400 assert_eq!(sql_placeholders(5), "?,?,?,?,?");
17401 }
17402
17403 #[test]
17404 fn sql_placeholders_capacity_efficient() {
17405 let result = sql_placeholders(3);
17407 assert_eq!(result.len(), 5);
17408 assert!(result.capacity() >= 5); let result = sql_placeholders(10);
17412 assert_eq!(result.len(), 19);
17413 assert!(result.capacity() >= 19);
17414 }
17415
17416 #[test]
17417 fn sql_placeholders_large_count() {
17418 let result = sql_placeholders(100);
17420 assert_eq!(result.len(), 199); assert_eq!(result.chars().filter(|c| *c == '?').count(), 100);
17422 assert_eq!(result.chars().filter(|c| *c == ',').count(), 99);
17423 }
17424
17425 #[test]
17426 fn hybrid_budget_identifier_biases_lexical() {
17427 let budget = hybrid_candidate_budget("src/main.rs", 20, 20, 5, 10_000);
17428 assert!(
17429 budget.lexical_candidates > budget.semantic_candidates,
17430 "identifier queries should allocate more lexical than semantic fanout"
17431 );
17432 assert!(budget.lexical_candidates >= 25);
17433 }
17434
17435 #[test]
17436 fn hybrid_budget_natural_language_biases_semantic() {
17437 let budget = hybrid_candidate_budget(
17438 "how do we fix authentication middleware latency",
17439 20,
17440 20,
17441 5,
17442 10_000,
17443 );
17444 assert!(
17445 budget.semantic_candidates > budget.lexical_candidates,
17446 "natural language queries should allocate more semantic than lexical fanout"
17447 );
17448 }
17449
17450 #[test]
17451 fn hybrid_budget_no_limit_caps_both_lexical_and_semantic() {
17452 let total_docs = 2_000_000;
17460 let budget =
17461 hybrid_candidate_budget("authentication middleware", 0, total_docs, 0, total_docs);
17462 let cap = no_limit_result_cap();
17463 assert!(
17464 budget.lexical_candidates <= cap,
17465 "lexical fanout must respect no_limit_result_cap() = {cap}; got {}",
17466 budget.lexical_candidates
17467 );
17468 assert!(
17469 budget.lexical_candidates <= NO_LIMIT_RESULT_MAX,
17470 "lexical fanout must respect the absolute NO_LIMIT_RESULT_MAX; got {}",
17471 budget.lexical_candidates
17472 );
17473 assert!(budget.semantic_candidates <= HYBRID_NO_LIMIT_SEMANTIC_CAP);
17474 assert!(
17481 budget.semantic_candidates <= budget.lexical_candidates,
17482 "semantic ({}) must not exceed lexical ({}) fanout",
17483 budget.semantic_candidates,
17484 budget.lexical_candidates
17485 );
17486 }
17487
17488 #[test]
17489 fn compute_no_limit_result_cap_clamps_explicit_over_ceiling_env_override() {
17490 let cap = compute_no_limit_result_cap_from(Some("999999999999".to_string()), None, None);
17496 assert!(
17497 cap <= NO_LIMIT_RESULT_MAX,
17498 "explicit override must still clamp to ceiling; got {cap} > {NO_LIMIT_RESULT_MAX}"
17499 );
17500 assert!(cap >= NO_LIMIT_RESULT_MIN);
17501 }
17502
17503 #[test]
17504 fn compute_no_limit_result_cap_clamps_tiny_explicit_override_up_to_floor() {
17505 let cap = compute_no_limit_result_cap_from(Some("1".to_string()), None, None);
17507 assert_eq!(cap, NO_LIMIT_RESULT_MIN);
17508 }
17509
17510 #[test]
17511 fn compute_no_limit_result_cap_uses_meminfo_when_no_env_override() {
17512 let cap = compute_no_limit_result_cap_from(None, None, Some(128u64 * 1024 * 1024 * 1024));
17516 assert!(cap >= NO_LIMIT_RESULT_MIN, "cap {cap} below floor");
17517 assert!(cap <= NO_LIMIT_RESULT_MAX, "cap {cap} above ceiling");
17518 assert!(cap > NO_LIMIT_RESULT_MIN * 10);
17520 }
17521
17522 #[test]
17523 fn compute_no_limit_result_cap_falls_back_to_floor_when_meminfo_unavailable() {
17524 let cap = compute_no_limit_result_cap_from(None, None, None);
17528 assert!(cap >= NO_LIMIT_RESULT_MIN);
17529 assert!(cap <= NO_LIMIT_RESULT_MAX);
17530 }
17531
17532 #[test]
17533 fn compute_no_limit_result_cap_bytes_env_takes_priority_over_meminfo() {
17534 let four_gib = (4u64 * 1024 * 1024 * 1024).to_string();
17539 let cap = compute_no_limit_result_cap_from(
17540 None,
17541 Some(four_gib),
17542 Some(1024u64 * 1024 * 1024 * 1024), );
17544 let expected_hits = ((4u64 * 1024 * 1024 * 1024) / AVG_HIT_BYTES) as usize;
17545 let expected = expected_hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
17546 assert_eq!(cap, expected, "bytes env must win over meminfo");
17547 }
17548
17549 #[test]
17550 fn no_limit_budget_bytes_preserves_fallback_priority() {
17551 let huge_meminfo = Some(1024u64 * 1024 * 1024 * 1024);
17552 let four_gib = 4u64 * 1024 * 1024 * 1024;
17553
17554 assert_eq!(
17555 no_limit_budget_bytes(Some(four_gib.to_string()), huge_meminfo),
17556 four_gib
17557 );
17558 assert_eq!(
17559 no_limit_budget_bytes(Some("0".to_string()), huge_meminfo),
17560 NO_LIMIT_BYTES_CEILING
17561 );
17562 assert_eq!(no_limit_budget_bytes(None, None), NO_LIMIT_BYTES_FLOOR);
17563 }
17564
17565 #[test]
17566 fn compute_no_limit_result_cap_ignores_malformed_env() {
17567 for bad in ["", "abc", "0", "-1"] {
17569 let cap = compute_no_limit_result_cap_from(
17570 Some(bad.to_string()),
17571 Some(bad.to_string()),
17572 None,
17573 );
17574 assert!(cap >= NO_LIMIT_RESULT_MIN, "bad={bad:?} cap={cap}");
17575 assert!(cap <= NO_LIMIT_RESULT_MAX, "bad={bad:?} cap={cap}");
17576 }
17577 }
17578
17579 fn make_test_hit(id: &str, score: f32) -> SearchHit {
17584 SearchHit {
17585 title: id.to_string(),
17586 snippet: String::new(),
17587 content: id.to_string(),
17588 content_hash: stable_content_hash(id),
17589 score,
17590 source_path: format!("/path/{}.jsonl", id),
17591 agent: "test".to_string(),
17592 workspace: "/workspace".to_string(),
17593 workspace_original: None,
17594 created_at: Some(1_700_000_000_000),
17595 line_number: Some(1),
17596 match_type: MatchType::Exact,
17597 source_id: "local".to_string(),
17598 origin_kind: "local".to_string(),
17599 origin_host: None,
17600 conversation_id: None,
17601 }
17602 }
17603
17604 #[test]
17605 fn test_rrf_fusion_ordering() {
17606 let lexical = vec![
17609 make_test_hit("A", 10.0),
17610 make_test_hit("B", 8.0),
17611 make_test_hit("C", 6.0),
17612 ];
17613 let semantic = vec![
17614 make_test_hit("A", 0.9),
17615 make_test_hit("B", 0.7),
17616 make_test_hit("D", 0.5),
17617 ];
17618
17619 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
17620
17621 assert_eq!(fused.len(), 4);
17623 assert_eq!(fused[0].title, "A"); assert_eq!(fused[1].title, "B"); }
17627
17628 #[test]
17629 fn test_rrf_handles_disjoint_sets() {
17630 let lexical = vec![make_test_hit("A", 10.0), make_test_hit("B", 8.0)];
17632 let semantic = vec![make_test_hit("C", 0.9), make_test_hit("D", 0.7)];
17633
17634 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
17635
17636 assert_eq!(fused.len(), 4);
17638 let titles: Vec<&str> = fused.iter().map(|h| h.title.as_str()).collect();
17639 assert!(titles.contains(&"A"));
17640 assert!(titles.contains(&"B"));
17641 assert!(titles.contains(&"C"));
17642 assert!(titles.contains(&"D"));
17643 }
17644
17645 #[test]
17646 fn test_rrf_tie_breaking_deterministic() {
17647 let lexical = vec![
17649 make_test_hit("X", 5.0),
17650 make_test_hit("Y", 5.0),
17651 make_test_hit("Z", 5.0),
17652 ];
17653 let semantic = vec![]; let fused1 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
17657 let fused2 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
17658 let fused3 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
17659
17660 assert_eq!(fused1.len(), fused2.len());
17662 assert_eq!(fused2.len(), fused3.len());
17663
17664 for i in 0..fused1.len() {
17665 assert_eq!(fused1[i].title, fused2[i].title, "Mismatch at index {}", i);
17666 assert_eq!(fused2[i].title, fused3[i].title, "Mismatch at index {}", i);
17667 }
17668 }
17669
/// A document ranked second in both lists ("both") must outrank documents that are ranked
/// first but appear in only one list.
#[test]
fn test_rrf_both_lists_bonus() {
    let lexical = [make_test_hit("solo_lex", 10.0), make_test_hit("both", 5.0)];
    let semantic = [make_test_hit("solo_sem", 0.9), make_test_hit("both", 0.5)];

    let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
    let top = &fused[0];

    assert_eq!(top.title, "both", "Doc in both lists should rank first");
}
17693
/// Exercises the trailing `limit` / `offset` arguments of `rrf_fuse_hits` over three lexical
/// hits: limit 2 returns two hits, offset 1 skips one of three, and limit 0 returns nothing.
#[test]
fn test_rrf_respects_limit_and_offset() {
    let lexical = vec![
        make_test_hit("A", 10.0),
        make_test_hit("B", 8.0),
        make_test_hit("C", 6.0),
    ];
    let semantic: Vec<SearchHit> = Vec::new();
    let fuse = |limit, offset| rrf_fuse_hits(&lexical, &semantic, "", limit, offset);

    let fused = fuse(2, 0);
    assert_eq!(fused.len(), 2);

    let fused_offset = fuse(10, 1);
    assert_eq!(fused_offset.len(), 2);

    let fused_empty = fuse(0, 0);
    assert!(fused_empty.is_empty());
}
17715
/// Fusion with empty inputs: two empty lists give an empty result, and a single hit on
/// either side alone is passed through unchanged.
#[test]
fn test_rrf_empty_inputs() {
    let none: Vec<SearchHit> = Vec::new();
    let single = vec![make_test_hit("A", 10.0)];

    let both_empty = rrf_fuse_hits(&none, &none, "", 10, 0);
    assert!(both_empty.is_empty());

    let semantic_only = rrf_fuse_hits(&none, &single, "", 10, 0);
    assert_eq!(semantic_only.len(), 1);
    assert_eq!(semantic_only[0].title, "A");

    let lexical_only = rrf_fuse_hits(&single, &none, "", 10, 0);
    assert_eq!(lexical_only.len(), 1);
    assert_eq!(lexical_only[0].title, "A");
}
17734
/// The same untitled message reported by both search modes (differing only in score) must
/// collapse into a single fused hit whose title is still empty.
#[test]
fn test_rrf_coalesces_empty_title_hits_across_search_modes() {
    let lexical = {
        let mut hit = make_test_hit("shared", 10.0);
        hit.title.clear();
        hit.source_path = "/shared/untitled.jsonl".into();
        hit.content = "same untitled body".into();
        hit.content_hash = stable_content_hash("same untitled body");
        hit
    };
    let semantic = {
        let mut hit = lexical.clone();
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);

    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].title, "");
}
17750
/// A local hit whose `source_id` is blank on the semantic side must be merged with the
/// lexical hit carrying `source_id == "local"`, and the fused hit must report "local".
#[test]
fn test_rrf_coalesces_blank_local_source_id_hits_across_search_modes() {
    let lexical = {
        let mut hit = make_test_hit("shared-local", 10.0);
        hit.source_path = "/shared/local.jsonl".into();
        hit.content = "same local body".into();
        hit.content_hash = stable_content_hash("same local body");
        hit.source_id = "local".into();
        hit.origin_kind = "local".into();
        hit
    };
    let semantic = {
        let mut hit = lexical.clone();
        hit.source_id = " ".into();
        hit.origin_kind = "local".into();
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);

    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].source_id, "local");
}
17769
/// Two hits with identical path, title and content but different line numbers and
/// timestamps are distinct messages: both must be kept, in line-number order 1 then 2.
#[test]
fn test_rrf_keeps_repeated_same_content_at_different_lines() {
    let first = {
        let mut hit = make_test_hit("same", 10.0);
        hit.title = "Shared Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "repeat me".into();
        hit.content_hash = stable_content_hash("repeat me");
        hit.line_number = Some(1);
        hit.created_at = Some(100);
        hit
    };
    let second = {
        let mut hit = first.clone();
        hit.line_number = Some(2);
        hit.created_at = Some(200);
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);

    assert_eq!(fused.len(), 2);
    assert_eq!(fused[0].line_number, Some(1));
    assert_eq!(fused[1].line_number, Some(2));
}
17790
/// The same message seen once without a conversation id and once with `Some(42)` must fuse
/// into a single hit that carries the known id.
#[test]
fn test_rrf_coalesces_present_and_missing_conversation_id_for_same_message() {
    let lexical = {
        let mut hit = make_test_hit("same", 10.0);
        hit.title = "Shared Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "identical body".into();
        hit.content_hash = stable_content_hash("identical body");
        hit.created_at = Some(100);
        hit.line_number = Some(1);
        hit.conversation_id = None;
        hit
    };
    let semantic = {
        let mut hit = lexical.clone();
        hit.conversation_id = Some(42);
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);

    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].conversation_id, Some(42));
}
17810
/// Combines two sources of drift between search modes: the lexical hit has no conversation
/// id and `source_id == "local"`, the semantic hit has `Some(42)` and a blank `source_id`.
/// They must still fuse into one hit that keeps the conversation id.
#[test]
fn test_rrf_coalesces_present_and_missing_conversation_id_despite_blank_local_source_id() {
    let lexical = {
        let mut hit = make_test_hit("same", 10.0);
        hit.title = "Shared Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "identical body".into();
        hit.content_hash = stable_content_hash("identical body");
        hit.created_at = Some(100);
        hit.line_number = Some(1);
        hit.conversation_id = None;
        hit.source_id = "local".into();
        hit.origin_kind = "local".into();
        hit
    };
    let semantic = {
        let mut hit = lexical.clone();
        hit.conversation_id = Some(42);
        hit.source_id = " ".into();
        hit.origin_kind = "local".into();
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);

    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].conversation_id, Some(42));
}
17834
/// Hits that share path, title and content but carry different known conversation ids
/// (1 and 2) must not be merged; both ids must appear in the fused output.
#[test]
fn test_rrf_keeps_distinct_conversation_ids_for_shared_path_and_content() {
    let first = {
        let mut hit = make_test_hit("same", 10.0);
        hit.title = "Shared Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "identical body".into();
        hit.content_hash = stable_content_hash("identical body");
        hit.conversation_id = Some(1);
        hit
    };
    let second = {
        let mut hit = first.clone();
        hit.conversation_id = Some(2);
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);

    assert_eq!(fused.len(), 2);
    for wanted in [Some(1), Some(2)] {
        assert!(fused.iter().any(|hit| hit.conversation_id == wanted));
    }
}
17853
/// When both hits carry the same conversation id (9), a differing title must not prevent
/// them from being merged into a single fused hit.
#[test]
fn test_rrf_coalesces_same_conversation_id_despite_title_drift() {
    let lexical = {
        let mut hit = make_test_hit("same", 10.0);
        hit.title = "Morning Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "identical body".into();
        hit.content_hash = stable_content_hash("identical body");
        hit.conversation_id = Some(9);
        hit
    };
    let semantic = {
        let mut hit = lexical.clone();
        hit.title = "Evening Session".into();
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);

    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].conversation_id, Some(9));
}
17871
/// Without a timestamp to tie them together, hits that share path and content but have
/// different titles must stay separate; both titles must appear in the fused output.
#[test]
fn test_rrf_keeps_distinct_titles_for_shared_path_and_content() {
    let morning = {
        let mut hit = make_test_hit("same", 10.0);
        hit.title = "Morning Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "identical body".into();
        hit.content_hash = stable_content_hash("identical body");
        hit.created_at = None;
        hit
    };
    let evening = {
        let mut hit = morning.clone();
        hit.title = "Evening Session".into();
        hit.score = 0.9;
        hit
    };

    let fused = rrf_fuse_hits(&[morning], &[evening], "", 10, 0);

    assert_eq!(fused.len(), 2);
    for wanted in ["Morning Session", "Evening Session"] {
        assert!(fused.iter().any(|hit| hit.title == wanted));
    }
}
17890
/// Fuses 50 lexical and 50 semantic hits with a limit of 20 and checks that exactly 20
/// hits are returned and that no title is returned twice.
#[test]
fn test_rrf_candidate_depth() {
    let mut lexical = Vec::new();
    let mut semantic = Vec::new();
    for i in 0..50 {
        lexical.push(make_test_hit(&format!("L{}", i), 100.0 - i as f32));
        semantic.push(make_test_hit(&format!("S{}", i), 1.0 - 0.01 * i as f32));
    }

    let fused = rrf_fuse_hits(&lexical, &semantic, "", 20, 0);
    assert_eq!(fused.len(), 20);

    let mut seen_titles = std::collections::HashSet::new();
    for hit in fused.iter() {
        let is_new = seen_titles.insert(&hit.title);
        assert!(is_new, "Duplicate hit: {}", hit.title);
    }
}
17912
/// Table-driven check of the token count produced by `parse_boolean_query` for short
/// queries; an explicit `AND` counts as a token of its own.
#[test]
fn query_token_list_parses_small_queries() {
    let expectations = [
        ("hello", 1),
        ("hello world", 2),
        ("hello AND world", 3),
        ("hello world foo bar", 4),
    ];

    for &(query, expected_len) in expectations.iter() {
        let token_count = parse_boolean_query(query).len();
        assert_eq!(token_count, expected_len, "{query}");
    }
}
17931
/// A nine-word query must produce nine tokens.
#[test]
fn query_token_list_parses_large_queries() {
    let query = "a b c d e f g h i";
    let token_count = parse_boolean_query(query).len();
    assert_eq!(token_count, 9);
}
17937
/// A quoted phrase followed by a bare word yields two tokens, the first being a `Phrase`
/// holding the text between the quotes.
#[test]
fn query_token_list_handles_quoted_phrases() {
    let tokens = parse_boolean_query("\"hello world\" test");
    assert_eq!(tokens.len(), 2);

    let leading_is_phrase = match &tokens[0] {
        QueryToken::Phrase(phrase) => phrase == "hello world",
        _ => false,
    };
    assert!(leading_is_phrase, "Expected Phrase token");
}
17949
/// `AND` and `OR` between terms are emitted as operator tokens at positions 1 and 3 of a
/// five-token list.
#[test]
fn query_token_list_handles_operators() {
    let query = "foo AND bar OR baz";
    let tokens = parse_boolean_query(query);

    assert_eq!(tokens.len(), 5);
    assert_eq!(tokens[1], QueryToken::And);
    assert_eq!(tokens[3], QueryToken::Or);
}
17957
/// An empty query string parses to an empty token list.
#[test]
fn query_token_list_empty_query() {
    let parsed = parse_boolean_query("");
    assert!(parsed.is_empty());
}
17963
/// Iterating the parsed token list yields the `Term` tokens in query order.
#[test]
fn query_token_list_iteration_works() {
    let tokens = parse_boolean_query("a b c");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["a", "b", "c"]);
}
17976
/// A leading emoji is replaced by whitespace rather than kept in the sanitized query.
#[test]
fn unicode_emoji_treated_as_separator() {
    let input = "🚀 launch";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, " launch", "Emoji should become space");
}
17992
/// An emoji wedged between two words is turned into a space, splitting them apart.
#[test]
fn unicode_emoji_splits_terms() {
    let input = "hot🔥code";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, "hot code", "Emoji between words splits them");
}
17999
/// A query made only of emoji leaves nothing but whitespace after sanitizing.
#[test]
fn unicode_multiple_emoji_become_spaces() {
    let sanitized = sanitize_query("🚀🔥💻");
    let remainder = sanitized.trim();
    assert_eq!(remainder, "", "All-emoji query sanitizes to whitespace");
}
18009
/// Parsing a query wrapped in emoji must not panic and must still surface at least one
/// `Term` containing "launch" or "code".
#[test]
fn unicode_emoji_query_parses_without_panic() {
    let tokens = parse_boolean_query("🚀 launch code 🔥");

    let found_word = tokens.iter().any(|token| match token {
        QueryToken::Term(text) => text.contains("launch") || text.contains("code"),
        _ => false,
    });

    assert!(found_word);
}
18027
/// `QueryTermsLower` drops the emoji and exposes the remaining word lower-cased.
#[test]
fn unicode_emoji_query_terms_lower() {
    let terms = QueryTermsLower::from_query("🚀 LAUNCH");
    let lowered: Vec<&str> = terms.tokens().collect();
    let has_launch = lowered.iter().any(|token| *token == "launch");
    assert!(has_launch, "Should extract 'launch' from emoji query");
}
18038
/// Chinese text passes through `sanitize_query` unchanged, with or without an inner space.
#[test]
fn unicode_cjk_chinese_preserved() {
    for input in ["测试代码", "测试 代码"] {
        assert_eq!(sanitize_query(input), input);
    }
}
18046
/// Japanese text (katakana, hiragana and kanji) passes through `sanitize_query` unchanged.
#[test]
fn unicode_cjk_japanese_preserved() {
    for input in ["テスト", "こんにちは世界"] {
        assert_eq!(sanitize_query(input), input);
    }
}
18053
/// Korean (Hangul) text passes through `sanitize_query` unchanged.
#[test]
fn unicode_cjk_korean_preserved() {
    for input in ["테스트", "안녕하세요"] {
        assert_eq!(sanitize_query(input), input);
    }
}
18059
/// Space-separated CJK words and a Latin word each become their own `Term`, in order.
#[test]
fn unicode_cjk_parsed_as_terms() {
    let tokens = parse_boolean_query("测试 代码 search");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["测试", "代码", "search"]);
}
18072
/// `QueryTermsLower` splits a CJK query on whitespace and leaves the words untouched.
#[test]
fn unicode_cjk_query_terms_lower() {
    let lowered = QueryTermsLower::from_query("测试 代码");
    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["测试", "代码"]);
}
18079
/// Hebrew text, including the space between words, is left unchanged by `sanitize_query`.
#[test]
fn unicode_hebrew_preserved() {
    let input = "שלום עולם";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, input);
}
18086
/// Arabic text is left unchanged by `sanitize_query`.
#[test]
fn unicode_arabic_preserved() {
    let input = "مرحبا";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, input);
}
18091
/// Two Hebrew words parse into two `Term` tokens in the order they were written.
#[test]
fn unicode_hebrew_parsed_as_terms() {
    let tokens = parse_boolean_query("שלום עולם");

    let terms: Vec<&str> = tokens
        .iter()
        .filter_map(|token| {
            if let QueryToken::Term(text) = token {
                Some(text.as_str())
            } else {
                None
            }
        })
        .collect();

    assert_eq!(terms, vec!["שלום", "עולם"]);
}
18104
/// `QueryTermsLower` splits an Arabic query on whitespace and leaves the words untouched.
#[test]
fn unicode_arabic_query_terms_lower() {
    let lowered = QueryTermsLower::from_query("مرحبا بالعالم");
    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["مرحبا", "بالعالم"]);
}
18112
/// A query mixing Latin, CJK and Cyrillic words is left unchanged by `sanitize_query`.
#[test]
fn unicode_mixed_scripts_preserved() {
    let input = "Hello 世界 мир";
    assert_eq!(sanitize_query(input), "Hello 世界 мир");
}
18120
/// Latin, CJK and Cyrillic words in one query each parse to a `Term`, keeping their case.
#[test]
fn unicode_mixed_scripts_parsed() {
    let tokens = parse_boolean_query("Hello 世界 мир");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["Hello", "世界", "мир"]);
}
18133
/// An emoji between a Latin and a CJK word is removed while both words are kept.
#[test]
fn unicode_mixed_scripts_with_emoji() {
    let input = "Hello 🌍 世界";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, "Hello 世界");
}
18140
/// `QueryTermsLower` lower-cases the Latin and Cyrillic words and leaves the Arabic word
/// as written.
#[test]
fn unicode_latin_cyrillic_arabic_query() {
    let lowered = QueryTermsLower::from_query("Hello Мир مرحبا");
    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["hello", "мир", "مرحبا"]);
}
18147
/// U+200D (zero-width joiner) between two words is replaced by a space.
#[test]
fn unicode_zero_width_joiner_removed() {
    let input = "test\u{200D}query";
    assert_eq!(sanitize_query(input), "test query");
}
18156
/// U+200C (zero-width non-joiner) between two words is replaced by a space.
#[test]
fn unicode_zero_width_non_joiner_removed() {
    let input = "test\u{200C}query";
    assert_eq!(sanitize_query(input), "test query");
}
18163
/// U+200B (zero-width space) between two words is replaced by an ordinary space.
#[test]
fn unicode_zero_width_space_removed() {
    let input = "test\u{200B}query";
    assert_eq!(sanitize_query(input), "test query");
}
18170
/// A leading U+FEFF byte-order mark is replaced by a single space.
#[test]
fn unicode_bom_removed() {
    let input = "\u{FEFF}test";
    assert_eq!(sanitize_query(input), " test");
}
18177
/// A word containing an accented letter is returned unchanged by `sanitize_query`.
#[test]
fn unicode_precomposed_accent_preserved() {
    let input = "café";
    assert_eq!(sanitize_query(input), "café");
}
18186
/// Feeds `e` followed by a combining acute accent (U+0301) through `sanitize_query`.
///
/// NOTE(review): despite the test's name, the assertion pins the output to the single
/// composed character U+00E9 ("caf\u{00e9}"), not to a separator in place of the accent.
#[test]
fn unicode_combining_accent_becomes_separator() {
    let decomposed = "cafe\u{0301}";
    assert_eq!(sanitize_query(decomposed), "caf\u{00e9}");
}
18196
/// The composed (U+00E9) and decomposed (`e` + U+0301) spellings of the same word must
/// sanitize to the same string.
#[test]
fn unicode_nfc_and_nfd_produce_same_sanitized_query() {
    let san_nfc = sanitize_query("caf\u{00E9}");
    let san_nfd = sanitize_query("cafe\u{0301}");

    assert_eq!(san_nfc, "café");
    assert_eq!(san_nfd, "café");
    assert_eq!(san_nfc, san_nfd);
}
18214
/// Base letters carrying stacked combining marks must sanitize without panicking and
/// the letters `t` and `s` must remain in the output.
#[test]
fn unicode_combining_marks_do_not_panic() {
    let stacked = "t\u{0301}\u{0302}\u{0303}e\u{0304}\u{0305}st";
    let sanitized = sanitize_query(stacked);
    for letter in ['t', 's'] {
        assert!(sanitized.contains(letter));
    }
}
18224
/// Supplementary-plane code points U+1D400..U+1D402 (mathematical bold capitals) are kept
/// as-is by `sanitize_query`.
#[test]
fn unicode_mathematical_bold_letters_preserved() {
    let bold_abc = "\u{1D400}\u{1D401}\u{1D402}";
    let sanitized = sanitize_query(bold_abc);
    assert_eq!(sanitized, bold_abc, "Mathematical bold letters are alphanumeric");
}
18237
/// The supplementary-plane ideograph U+20000 is kept as-is by `sanitize_query`.
#[test]
fn unicode_supplementary_ideograph_preserved() {
    let ideograph = "\u{20000}";
    let sanitized = sanitize_query(ideograph);
    assert_eq!(sanitized, ideograph, "Supplementary CJK ideographs are alphanumeric");
}
18248
/// The supplementary-plane emoji U+1F600 between two words is replaced by a space.
#[test]
fn unicode_supplementary_emoji_removed() {
    let with_emoji = "test\u{1F600}query";
    assert_eq!(sanitize_query(with_emoji), "test query");
}
18256
/// Alternating left-to-right and right-to-left words parse into exactly four terms, each
/// of the original words being present.
#[test]
fn unicode_bidi_mixed_ltr_rtl_no_panic() {
    let tokens = parse_boolean_query("hello שלום world עולם");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms.len(), 4);
    for word in ["hello", "שלום", "world", "עולם"] {
        assert!(terms.contains(&word));
    }
}
18276
/// The bidi control characters U+202D and U+202C are each replaced by a space.
#[test]
fn unicode_bidi_override_chars_removed() {
    let with_overrides = "test\u{202D}content\u{202C}end";
    assert_eq!(sanitize_query(with_overrides), "test content end");
}
18285
/// The right-to-left mark U+200F between two words is replaced by a space.
#[test]
fn unicode_bidi_rtl_mark_removed() {
    let with_mark = "test\u{200F}content";
    assert_eq!(sanitize_query(with_mark), "test content");
}
18293
/// `QueryExplanation::analyze` on a two-word CJK query reports two parsed terms, both
/// with non-empty text.
#[test]
fn unicode_full_pipeline_cjk_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("测试 代码", &filters);
    let terms = &explanation.parsed.terms;

    assert_eq!(terms.len(), 2);
    assert!(!terms[0].text.is_empty());
    assert!(!terms[1].text.is_empty());
}
18303
/// In a boolean query whose operands are Latin, CJK and Arabic words, the explanation
/// must still list `AND` among the parsed operators.
#[test]
fn unicode_full_pipeline_mixed_script_boolean() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("Hello AND 世界 OR مرحبا", &filters);

    let saw_and = explanation.parsed.operators.iter().any(|op| op == "AND");
    assert!(
        saw_and,
        "AND operator should be recognized in mixed-script query"
    );
}
18314
/// An all-emoji query must explain to either no terms at all or only terms without
/// subterms.
#[test]
fn unicode_full_pipeline_emoji_query_type() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("🚀🔥💻", &filters);
    let terms = &explanation.parsed.terms;

    let nothing_meaningful =
        terms.is_empty() || terms.iter().all(|term| term.subterms.is_empty());
    assert!(
        nothing_meaningful,
        "All-emoji query should produce no meaningful terms"
    );
}
18330
/// A quoted CJK string must show up among the explanation's parsed phrases.
#[test]
fn unicode_full_pipeline_phrase_with_cjk() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("\"测试代码\"", &filters);

    let has_phrase = !explanation.parsed.phrases.is_empty();
    assert!(has_phrase, "CJK phrase should be recognized");
}
18339
/// A CJK word wrapped in `*` must explain to at least one term, and the first term must
/// have a subterm whose pattern either contains `*` or is exactly "exact".
#[test]
fn unicode_full_pipeline_wildcard_with_unicode() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("*测试*", &filters);
    let terms = &explanation.parsed.terms;

    assert!(!terms.is_empty(), "Wildcard with CJK should produce terms");

    // Only the first term is inspected.
    for term in terms.iter().take(1) {
        let pattern_ok = term
            .subterms
            .iter()
            .any(|sub| sub.pattern.contains("*") || sub.pattern == "exact");
        assert!(
            pattern_ok,
            "CJK wildcard should produce wildcard or exact pattern"
        );
    }
}
18357
/// `QueryTermsLower::query_lower` holds the lower-cased query; `ß` stays `ß` and ASCII
/// capitals become lower case.
#[test]
fn unicode_query_terms_lower_case_folding() {
    let german = QueryTermsLower::from_query("STRAßE");
    assert_eq!(german.query_lower, "straße");

    let ascii = QueryTermsLower::from_query("HELLO");
    assert_eq!(ascii.query_lower, "hello");
}
18369
/// `normalize_term_parts` splits a CJK query on the space and keeps both words intact.
#[test]
fn unicode_normalize_term_parts_cjk() {
    let input = "测试 代码";
    assert_eq!(normalize_term_parts(input), vec!["测试", "代码"]);
}
18375
/// Emoji glued to words are stripped by `normalize_term_parts`, leaving the plain words
/// among the returned parts.
#[test]
fn unicode_normalize_term_parts_strips_emoji() {
    let parts = normalize_term_parts("🚀launch🔥code");
    for word in ["launch", "code"] {
        assert!(parts.contains(&word.to_string()));
    }
}
18383
/// An opening quote that is never closed must still produce a `Phrase` token containing
/// the quoted text.
#[test]
fn special_char_unbalanced_quote_no_panic() {
    let tokens = parse_boolean_query("\"hello world");

    let has_phrase = tokens.iter().any(|token| match token {
        QueryToken::Phrase(text) => text.contains("hello"),
        _ => false,
    });

    assert!(
        has_phrase,
        "Unbalanced quote should still produce a phrase: {tokens:?}"
    );
}
18398
/// A stray quote at the end of a word must not swallow it: the word still parses as a
/// `Term`.
#[test]
fn special_char_unbalanced_trailing_quote() {
    let tokens = parse_boolean_query("test\"");

    let has_term = tokens.iter().any(|token| match token {
        QueryToken::Term(word) => word == "test",
        _ => false,
    });

    assert!(
        has_term,
        "Text before trailing quote should parse as term: {tokens:?}"
    );
}
18409
/// A query with two stray quotes must still parse to at least one token.
#[test]
fn special_char_multiple_unbalanced_quotes() {
    let tokens = parse_boolean_query("\"foo \"bar");
    let produced_tokens = !tokens.is_empty();
    assert!(produced_tokens, "Should parse despite odd quotes: {tokens:?}");
}
18418
/// An empty pair of quotes in front of a word must not prevent that word from parsing as
/// a `Term`.
#[test]
fn special_char_empty_quotes() {
    let tokens = parse_boolean_query("\"\" test");

    let has_term = tokens.iter().any(|token| match token {
        QueryToken::Term(word) => word == "test",
        _ => false,
    });

    assert!(has_term, "Empty quotes should be skipped: {tokens:?}");
}
18429
/// `sanitize_query` keeps double-quote characters, even when they are unbalanced.
#[test]
fn special_char_unbalanced_via_sanitize() {
    let sanitized = sanitize_query("\"hello world");
    let kept_quote = sanitized.contains('"');
    assert!(kept_quote, "Quotes preserved by sanitize_query");
}
18438
/// For backslash-escaped quotes, `sanitize_query` keeps the quotes and drops the
/// backslashes.
#[test]
fn special_char_backslash_quote_sanitize() {
    let escaped = "\\\"test\\\"";
    let sanitized = sanitize_query(escaped);

    assert!(sanitized.contains('"'));
    assert!(!sanitized.contains('\\'), "Backslash should be stripped");
}
18447
/// Parsing a word wrapped in backslash-escaped quotes must not panic and must produce at
/// least one token.
#[test]
fn special_char_backslash_quote_parse() {
    let escaped = "\\\"test\\\"";
    let tokens = parse_boolean_query(escaped);
    assert!(!tokens.is_empty(), "Should parse without panic: {tokens:?}");
}
18453
/// A quoted phrase that itself contains backslash-escaped quotes must parse to at least
/// one token.
#[test]
fn special_char_inner_escaped_quotes() {
    let nested = "\"test \\\"inner\\\" test\"";
    let tokens = parse_boolean_query(nested);
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Nested escaped quotes should not panic: {tokens:?}"
    );
}
18462
/// The colon and backslashes of a Windows path are turned into separators, leaving the
/// path components as plain words.
#[test]
fn special_char_windows_path_sanitize() {
    let path = "C:\\Users\\test";
    assert_eq!(sanitize_query(path), "C Users test");
}
18470
/// The backslashes of a UNC path are dropped, leaving the server and share names as
/// whitespace-separated words.
#[test]
fn special_char_unc_path_sanitize() {
    let sanitized = sanitize_query("\\\\server\\share");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    for expected in ["server", "share"] {
        assert!(words.contains(&expected));
    }
}
18478
/// `normalize_term_parts` breaks a Windows file path into its drive letter, directory
/// names, file stem and extension.
#[test]
fn special_char_windows_path_terms() {
    let parts = normalize_term_parts("C:\\Users\\test\\file.rs");
    for expected in ["C", "Users", "test", "file", "rs"] {
        assert!(parts.contains(&expected.to_string()));
    }
}
18488
/// In the regex-like input `foo.*bar` the dot becomes a space while the `*` is kept.
#[test]
fn special_char_regex_dot_star() {
    let pattern = "foo.*bar";
    assert_eq!(sanitize_query(pattern), "foo *bar");
}
18496
/// For the character class `[a-z]+`, `sanitize_query` keeps the hyphenated `a-z` as one
/// word, whereas `normalize_term_parts` also splits on the hyphen.
#[test]
fn special_char_regex_char_class() {
    let pattern = "[a-z]+";

    let sanitized = sanitize_query(pattern);
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["a-z"]);

    assert_eq!(normalize_term_parts(pattern), vec!["a", "z"]);
}
18504
/// The regex anchors `^` and `$` are dropped, leaving only the word after trimming.
#[test]
fn special_char_regex_anchors() {
    let anchored = "^start$";
    let sanitized = sanitize_query(anchored);
    assert_eq!(sanitized.trim(), "start");
}
18510
/// Parentheses and the pipe of a regex alternation act as separators, leaving the two
/// alternatives as words.
#[test]
fn special_char_regex_pipe_groups() {
    let sanitized = sanitize_query("(foo|bar)");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["foo", "bar"]);
}
18517
/// A classic `'OR 1=1--` payload keeps its words (`OR`, `1`) while the quote and equals
/// sign are removed.
#[test]
fn special_char_sql_injection_or() {
    let sanitized = sanitize_query("'OR 1=1--");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for expected in ["OR", "1"] {
        assert!(words.contains(&expected));
    }
    for forbidden in ['\'', '='] {
        assert!(!sanitized.contains(forbidden));
    }
}
18529
/// A `; DROP TABLE users;--` payload keeps its words while every semicolon is removed.
#[test]
fn special_char_sql_injection_drop() {
    let sanitized = sanitize_query("; DROP TABLE users;--");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for expected in ["DROP", "TABLE", "users"] {
        assert!(words.contains(&expected));
    }
    assert!(!sanitized.contains(';'));
}
18539
/// A `UNION SELECT` payload keeps all of its words, and the standalone `*` survives as a
/// word of its own.
#[test]
fn special_char_sql_injection_union() {
    let sanitized = sanitize_query("' UNION SELECT * FROM passwords --");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for expected in ["UNION", "SELECT", "*", "FROM", "passwords"] {
        assert!(words.contains(&expected));
    }
}
18550
/// Parses the SQL-looking fragment `OR 1=1`.
///
/// NOTE(review): despite "literal" in the test's name, the assertion requires the leading
/// `OR` to be tokenized as the `QueryToken::Or` operator.
#[test]
fn special_char_sql_parse_as_literal() {
    let tokens = parse_boolean_query("OR 1=1");
    let has_or = tokens.iter().any(|token| matches!(token, QueryToken::Or));
    assert!(has_or, "OR should be parsed as Or operator: {tokens:?}");
}
18559
/// Shell command-substitution syntax `$(...)` is stripped, leaving only the inner word.
#[test]
fn special_char_shell_subshell() {
    let sanitized = sanitize_query("$(cmd)");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["cmd"]);
}
18568
/// Backticks around a word are stripped, leaving only the word.
#[test]
fn special_char_shell_backticks() {
    let sanitized = sanitize_query("`cmd`");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["cmd"]);
}
18575
/// For `| rm -rf /`, `sanitize_query` removes the pipe and slash but keeps `-rf` with its
/// hyphen, while `normalize_term_parts` reduces the input to the bare words `rm` and `rf`.
#[test]
fn special_char_shell_pipe_rm() {
    let payload = "| rm -rf /";
    let sanitized = sanitize_query(payload);
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for expected in ["rm", "-rf"] {
        assert!(words.contains(&expected));
    }
    assert_eq!(normalize_term_parts(payload), vec!["rm", "rf"]);
    for forbidden in ['|', '/'] {
        assert!(!sanitized.contains(forbidden));
    }
}
18586
/// A semicolon-chained shell command keeps its words while every semicolon is removed.
#[test]
fn special_char_shell_semicolon_chain() {
    let sanitized = sanitize_query("test; echo pwned; cat /etc/passwd");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for expected in ["test", "echo", "pwned"] {
        assert!(words.contains(&expected));
    }
    assert!(!sanitized.contains(';'));
}
18596
/// A NUL byte between two words acts as a separator.
#[test]
fn special_char_null_byte_mid_string() {
    let sanitized = sanitize_query("test\x00hidden");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["test", "hidden"]);
}
18605
/// Leading NUL bytes are gone once the sanitized query is trimmed.
#[test]
fn special_char_null_byte_leading() {
    let input = "\x00\x00attack";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized.trim(), "attack");
}
18611
/// Trailing NUL bytes are gone once the sanitized query is trimmed.
#[test]
fn special_char_null_byte_trailing() {
    let input = "query\x00\x00\x00";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized.trim(), "query");
}
18617
/// A NUL byte embedded in the query must not stop `parse_boolean_query` from producing
/// tokens.
#[test]
fn special_char_null_byte_parse() {
    let tokens = parse_boolean_query("test\x00hidden");
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Null bytes should not prevent parsing: {tokens:?}"
    );
}
18626
/// A newline between two words leaves them as separate whitespace-delimited words.
#[test]
fn special_char_control_newline() {
    let sanitized = sanitize_query("line1\nline2");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["line1", "line2"]);
}
18635
/// Tab and CRLF between words leave them as separate whitespace-delimited words.
#[test]
fn special_char_control_tab_cr() {
    let sanitized = sanitize_query("tab\there\r\nend");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["tab", "here", "end"]);
}
18642
/// `parse_boolean_query` treats tab and newline as term separators, yielding the three
/// words in order.
#[test]
fn special_char_control_parse_whitespace() {
    let tokens = parse_boolean_query("hello\tworld\ntest");

    let mut terms: Vec<&str> = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["hello", "world", "test"]);
}
18655
/// BEL (0x07), ESC (0x1b) and `[` from an ANSI colour sequence are dropped, leaving the
/// words `test` and `31mred`.
#[test]
fn special_char_control_bell_escape() {
    let sanitized = sanitize_query("test\x07\x1b[31mred");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    for expected in ["test", "31mred"] {
        assert!(words.contains(&expected));
    }
}
18663
/// Named HTML entities are not decoded by `sanitize_query`: the ampersand and semicolon
/// delimiters act as separators, so the entity names `lt` and `gt` come out as plain words
/// around `script`.
#[test]
fn special_char_html_entity_lt() {
    // The input is the entity-encoded form of a script tag (ampersand, `lt`, semicolon, ...).
    // The ampersand is spelled `\u{26}` so that the literal survives any tooling that decodes
    // HTML entities in source text; a raw `<script>` input could never yield `lt` / `gt`.
    let sanitized = sanitize_query("\u{26}lt;script\u{26}gt;");
    let parts: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(parts, vec!["lt", "script", "gt"]);
}
18672
/// Numeric (hex) HTML entities are not decoded by `sanitize_query`: the ampersand, `#` and
/// semicolon act as separators, so the hex payloads `x3C` and `x3E` come out as plain words
/// around `script`.
#[test]
fn special_char_html_numeric_entity() {
    // The input is the numeric-entity form of a script tag (ampersand, `#x3C`, semicolon, ...).
    // The ampersand is spelled `\u{26}` so that the literal survives any tooling that decodes
    // HTML entities in source text; a raw `<script>` input could never yield `x3C` / `x3E`.
    let sanitized = sanitize_query("\u{26}#x3C;script\u{26}#x3E;");
    let parts: Vec<&str> = sanitized.split_whitespace().collect();
    assert!(parts.contains(&"x3C"));
    assert!(parts.contains(&"script"));
    assert!(parts.contains(&"x3E"));
}
18681
/// For an inline script tag, the tag name, the function name and the string argument all
/// survive sanitizing as separate words.
#[test]
fn special_char_html_tags_stripped() {
    let sanitized = sanitize_query("<script>alert('xss')</script>");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    for expected in ["script", "alert", "xss"] {
        assert!(words.contains(&expected));
    }
}
18690
/// For an `img` tag with attributes, the tag name and attribute names survive sanitizing
/// as separate words.
#[test]
fn special_char_html_attribute() {
    let sanitized = sanitize_query("<img src=\"evil.js\" onerror=\"alert(1)\">");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    for expected in ["img", "src", "onerror"] {
        assert!(words.contains(&expected));
    }
}
18699
/// Percent-escapes are not decoded: `%` acts as a separator and the hex digits stay
/// attached to the following text.
#[test]
fn special_char_url_percent_encoding() {
    let sanitized = sanitize_query("%20space%2Fslash");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["20space", "2Fslash"]);
}
18708
/// A percent-encoded NUL (`%00`) is not decoded: `%` acts as a separator and `00` stays
/// attached to the following word.
#[test]
fn special_char_url_null_byte_encoded() {
    let sanitized = sanitize_query("test%00hidden");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["test", "00hidden"]);
}
18715
/// The `?`, `=` and `&` of a URL query string act as separators, leaving the path, keys
/// and values as words in order.
#[test]
fn special_char_url_full_query_string() {
    let sanitized = sanitize_query("search?q=hello&lang=en");
    let words = sanitized.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["search", "q", "hello", "lang", "en"]);
}
18722
/// Explaining a SQL-injection payload must yield at least one parsed term or phrase.
#[test]
fn special_char_explain_sql_injection() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("'OR 1=1--", &filters);
    let parsed = &explanation.parsed;

    let has_content = !parsed.terms.is_empty() || !parsed.phrases.is_empty();
    assert!(has_content, "SQL injection should produce parseable terms");
}
18734
/// Explaining a shell-injection payload must yield at least one parsed term.
#[test]
fn special_char_explain_shell_injection() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("$(rm -rf /)", &filters);

    let has_terms = !explanation.parsed.terms.is_empty();
    assert!(has_terms, "Shell injection should produce parseable terms");
}
18744
/// Explaining an inline-script XSS payload must yield at least one parsed term.
#[test]
fn special_char_explain_html_xss() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("<script>alert('xss')</script>", &filters);

    let has_terms = !explanation.parsed.terms.is_empty();
    assert!(has_terms, "XSS payload should produce parseable terms");
}
18754
/// Every token `QueryTermsLower` extracts from an injection payload must consist solely
/// of alphanumeric characters.
#[test]
fn special_char_terms_lower_injection() {
    let qt = QueryTermsLower::from_query("'; DROP TABLE--");
    let tokens: Vec<&str> = qt.tokens().collect();

    for token in tokens.iter() {
        let only_alphanumeric = token.chars().all(char::is_alphanumeric);
        assert!(
            only_alphanumeric,
            "Token should only contain alphanumeric characters: {token}"
        );
    }
}
18766
/// A NUL byte between two words separates them into two `QueryTermsLower` tokens.
#[test]
fn special_char_terms_lower_null_bytes() {
    let qt = QueryTermsLower::from_query("test\x00hidden");
    let tokens: Vec<&str> = qt.tokens().collect();
    for expected in ["test", "hidden"] {
        assert!(tokens.contains(&expected));
    }
}
18774
/// `AND` and `NOT` must still be recognised as operators when mixed with injection-like text.
#[test]
fn special_char_boolean_with_injection() {
    let tokens = parse_boolean_query("search AND 'OR 1=1-- NOT drop");

    let saw_and = tokens.iter().any(|tok| matches!(tok, QueryToken::And));
    assert!(saw_and, "Boolean AND should still be recognized: {tokens:?}");

    let saw_not = tokens.iter().any(|tok| matches!(tok, QueryToken::Not));
    assert!(saw_not, "Boolean NOT should still be recognized: {tokens:?}");
}
18787
/// Sanitizing and parsing a 100,000-byte query must each finish in under one second
/// and the parse must yield at least one token.
#[test]
fn stress_query_100k_chars_completes_quickly() {
    use std::time::{Duration, Instant};

    let budget = Duration::from_secs(1);
    let long_query = "a ".repeat(50000);
    assert_eq!(long_query.len(), 100000);

    // Time the two stages separately so a regression can be attributed.
    let sanitize_timer = Instant::now();
    let sanitized = sanitize_query(&long_query);
    let sanitize_cost = sanitize_timer.elapsed();

    let parse_timer = Instant::now();
    let tokens = parse_boolean_query(&sanitized);
    let parse_cost = parse_timer.elapsed();

    assert!(
        sanitize_cost < budget,
        "sanitize_query with 100k chars took {:?} (>1s)",
        sanitize_cost
    );
    assert!(
        parse_cost < budget,
        "parse_boolean_query with 100k chars took {:?} (>1s)",
        parse_cost
    );
    assert!(!tokens.is_empty(), "100k char query should produce tokens");
}
18819
/// 1000 distinct words must sanitize and parse within one second and yield at least
/// 900 `Term` tokens.
#[test]
fn stress_query_1000_terms() {
    use std::time::{Duration, Instant};

    let query = (0..1000)
        .map(|i| format!("word{}", i))
        .collect::<Vec<String>>()
        .join(" ");

    let timer = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "1000 terms query took {:?} (>1s)",
        took
    );

    let parsed_terms = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert!(
        parsed_terms >= 900,
        "Expected ~1000 terms, got {} terms",
        parsed_terms
    );
}
18847
/// 1000 repetitions of the same word must all survive: the parser yields 1000 `Term`
/// tokens and `QueryTermsLower` keeps 1000 `"test"` tokens.
#[test]
fn stress_query_1000_identical_terms() {
    use std::time::{Duration, Instant};

    let query = "test ".repeat(1000);

    let timer = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "1000 identical terms query took {:?} (>1s)",
        took
    );

    let mut parsed_terms = 0usize;
    for tok in tokens.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            parsed_terms += 1;
        }
    }
    assert_eq!(parsed_terms, 1000, "Parser should produce 1000 terms");

    let lowered = QueryTermsLower::from_query(&query);
    let lowered_tokens = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(
        lowered_tokens.len(),
        1000,
        "All 1000 identical terms should be preserved"
    );
    let all_test = lowered_tokens.iter().all(|word| *word == "test");
    assert!(all_test, "All tokens should be 'test'");
}
18884
/// A single 10,000-character word must sanitize and parse within one second into exactly
/// one `Term` token of the same length.
#[test]
fn stress_query_10k_char_single_term() {
    use std::time::{Duration, Instant};

    let long_term = "a".repeat(10000);

    let timer = Instant::now();
    let sanitized = sanitize_query(&long_term);
    let tokens = parse_boolean_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "10k char single term took {:?} (>1s)",
        took
    );
    assert_eq!(tokens.len(), 1, "Should produce exactly one token");

    let is_full_length_term = match &tokens[0] {
        QueryToken::Term(text) => text.len() == 10000,
        _ => false,
    };
    assert!(is_full_length_term, "Expected Term token");
}
18906
/// A word wrapped in 100 levels of parentheses must sanitize and parse within 100ms and
/// leave exactly one `Term` token.
#[test]
fn stress_deeply_nested_parentheses() {
    use std::time::{Duration, Instant};

    let query = format!("{}test{}", "(".repeat(100), ")".repeat(100));

    let timer = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_millis(100),
        "Deeply nested parens took {:?} (>100ms)",
        took
    );

    let parsed_terms = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert_eq!(parsed_terms, 1, "Should have 1 term after sanitizing parens");
}
18932
/// 101 terms joined by ` AND ` must parse within one second into 100 `And` tokens and
/// 101 `Term` tokens.
#[test]
fn stress_many_boolean_operators() {
    use std::time::{Duration, Instant};

    let query = (0..101)
        .map(|i| format!("term{}", i))
        .collect::<Vec<String>>()
        .join(" AND ");

    let timer = Instant::now();
    let tokens = parse_boolean_query(&query);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100+ boolean ops took {:?} (>1s)",
        took
    );

    // Tally both token kinds in a single pass.
    let (mut and_count, mut term_count) = (0usize, 0usize);
    for tok in tokens.iter() {
        match tok {
            QueryToken::And => and_count += 1,
            QueryToken::Term(_) => term_count += 1,
            _ => {}
        }
    }

    assert_eq!(and_count, 100, "Should have 100 AND operators");
    assert_eq!(term_count, 101, "Should have 101 terms");
}
18961
/// 101 terms joined by ` OR ` must parse within one second into exactly 100 `Or` tokens.
#[test]
fn stress_many_or_operators() {
    use std::time::{Duration, Instant};

    let query = (0..101)
        .map(|i| format!("opt{}", i))
        .collect::<Vec<String>>()
        .join(" OR ");

    let timer = Instant::now();
    let tokens = parse_boolean_query(&query);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100+ OR ops took {:?} (>1s)",
        took
    );

    let or_seen = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Or))
        .count();
    assert_eq!(or_seen, 100, "Should have 100 OR operators");
}
18984
/// A 50-fold repetition of a mixed AND/OR/NOT expression must parse within one second
/// and produce a non-empty token list.
#[test]
fn stress_mixed_boolean_operators() {
    use std::time::{Duration, Instant};

    let query = "a AND b OR c NOT d AND e OR f NOT g ".repeat(50);

    let timer = Instant::now();
    let tokens = parse_boolean_query(&query);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Mixed boolean ops took {:?} (>1s)",
        took
    );
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Complex boolean query should produce tokens"
    );
}
19004
/// For a single 100,000-character word: sanitizing must not grow the text, parsing must
/// yield one token, and `QueryTermsLower` must yield one token.
#[test]
fn stress_memory_bounds_large_query() {
    let large_query = "x".repeat(100000);

    let sanitized = sanitize_query(&large_query);
    let tokens = parse_boolean_query(&sanitized);

    let did_not_grow = sanitized.len() <= large_query.len();
    assert!(
        did_not_grow,
        "Sanitized output should not exceed input size"
    );

    assert_eq!(tokens.len(), 1);

    let lowered = QueryTermsLower::from_query(&large_query);
    let lowered_count = lowered.tokens().count();
    assert_eq!(lowered_count, 1, "Should be 1 token of 100k chars");
}
19029
/// Runs sanitize/parse/lowering for 100 distinct queries on 100 spawned threads and
/// checks that every thread finished without panicking and produced tokens.
#[test]
fn stress_concurrent_queries() {
    use std::thread;

    let mut workers = Vec::with_capacity(100);
    for i in 0..100 {
        let query = format!("concurrent_query_{} test search", i);
        workers.push(thread::spawn(move || {
            let sanitized = sanitize_query(&query);
            let parsed_len = parse_boolean_query(&sanitized).len();
            let lowered = QueryTermsLower::from_query(&query);
            let lowered_len = lowered.tokens().count();
            (parsed_len, lowered_len)
        }));
    }

    // Join in spawn order so the index in the failure message identifies the query.
    for (i, worker) in workers.into_iter().enumerate() {
        let (token_len, qt_len) = worker.join().expect("Thread panicked");
        assert!(token_len > 0, "Query {} should produce tokens", i);
        assert!(qt_len > 0, "Query {} QueryTermsLower should have tokens", i);
    }
}
19056
/// 50 quoted phrases joined by ` AND ` must parse within one second into 50 `Phrase` tokens.
#[test]
fn stress_many_quoted_phrases() {
    use std::time::{Duration, Instant};

    let query = (0..50)
        .map(|i| format!("\"phrase number {}\"", i))
        .collect::<Vec<String>>()
        .join(" AND ");

    let timer = Instant::now();
    let tokens = parse_boolean_query(&query);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "50 quoted phrases took {:?} (>1s)",
        took
    );

    let phrases_seen = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Phrase(_)))
        .count();
    assert_eq!(phrases_seen, 50, "Should have 50 phrases");
}
19081
/// 100 words, alternating quoted and bare, must parse within one second into 50 `Phrase`
/// tokens and 50 `Term` tokens.
#[test]
fn stress_alternating_quotes() {
    use std::time::{Duration, Instant};

    // Even indices are quoted, odd indices are bare words.
    let query = (0..100)
        .map(|i| match i % 2 {
            0 => format!("\"word{}\"", i),
            _ => format!("word{}", i),
        })
        .collect::<Vec<String>>()
        .join(" ");

    let timer = Instant::now();
    let tokens = parse_boolean_query(&query);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100 alternating quotes took {:?} (>1s)",
        took
    );

    let (mut phrase_count, mut term_count) = (0usize, 0usize);
    for tok in tokens.iter() {
        match tok {
            QueryToken::Phrase(_) => phrase_count += 1,
            QueryToken::Term(_) => term_count += 1,
            _ => {}
        }
    }

    assert_eq!(phrase_count, 50, "Should have 50 phrases");
    assert_eq!(term_count, 50, "Should have 50 terms");
}
19118
/// 100 wildcard patterns (cycled from seven shapes) must sanitize and parse within one
/// second and produce a non-empty token list.
#[test]
fn stress_many_wildcards() {
    use std::time::{Duration, Instant};

    let patterns = ["pre*", "*suf", "*sub*", "a*b", "test*", "*ing", "*tion*"];
    let mut pieces: Vec<&str> = Vec::with_capacity(100);
    pieces.extend(patterns.iter().copied().cycle().take(100));
    let query = pieces.join(" ");

    let timer = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100 wildcards took {:?} (>1s)",
        took
    );
    assert!(!tokens.is_empty());
}
19143
/// `QueryExplanation::analyze` on a 100-term query must finish within two seconds and
/// report a non-empty term list.
#[test]
fn stress_query_explanation_large_query() {
    use std::time::{Duration, Instant};

    let query = (0..100)
        .map(|i| format!("term{}", i))
        .collect::<Vec<String>>()
        .join(" ");
    let filters = SearchFilters::default();

    let timer = Instant::now();
    let explanation = QueryExplanation::analyze(&query, &filters);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(2),
        "QueryExplanation for 100 terms took {:?} (>2s)",
        took
    );
    let has_terms = !explanation.parsed.terms.is_empty();
    assert!(has_terms, "Should parse terms successfully");
}
19165
/// One quoted phrase containing 500 words must parse within one second into exactly one
/// `Phrase` token.
#[test]
fn stress_very_long_single_quoted_phrase() {
    use std::time::{Duration, Instant};

    let inner = (0..500)
        .map(|i| format!("word{}", i))
        .collect::<Vec<String>>()
        .join(" ");
    let phrase = format!("\"{}\"", inner);

    let timer = Instant::now();
    let tokens = parse_boolean_query(&phrase);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "500-word phrase took {:?} (>1s)",
        took
    );

    let phrases_seen = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Phrase(_)))
        .count();
    assert_eq!(phrases_seen, 1, "Should have exactly 1 phrase");
}
19188
/// 100 words each prefixed with `-` must parse within one second into 100 `Not` tokens.
#[test]
fn stress_not_prefix_many() {
    use std::time::{Duration, Instant};

    let query = (0..100)
        .map(|i| format!("-term{}", i))
        .collect::<Vec<String>>()
        .join(" ");

    let timer = Instant::now();
    let tokens = parse_boolean_query(&query);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100 NOT prefixes took {:?} (>1s)",
        took
    );

    let negations = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Not))
        .count();
    assert_eq!(negations, 100, "Should have 100 NOT operators");
}
19211
/// A long run of CJK text must sanitize and lower within one second and produce tokens.
#[test]
fn stress_unicode_large_cjk_query() {
    use std::time::{Duration, Instant};

    let cjk_chars = "中文日本語한국어".repeat(1000);

    let timer = Instant::now();
    let sanitized = sanitize_query(&cjk_chars);
    let lowered = QueryTermsLower::from_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Large CJK query took {:?} (>1s)",
        took
    );
    assert!(!lowered.is_empty(), "CJK query should produce tokens");
}
19229
/// A query made only of emoji and spaces must sanitize and parse within one second and
/// yield no tokens at all.
#[test]
fn stress_unicode_many_emoji() {
    use std::time::{Duration, Instant};

    let emoji_query = "🚀 🔍 📝 💻 🎯 ".repeat(500);

    let timer = Instant::now();
    let sanitized = sanitize_query(&emoji_query);
    let tokens = parse_boolean_query(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Emoji query took {:?} (>1s)",
        took
    );
    let no_tokens = tokens.is_empty();
    assert!(no_tokens, "Emoji-only query should produce no tokens");
}
19251
/// A 100-fold repetition of mixed code, SQL, prose, an error message and a URL must pass
/// through sanitize, parse and lowering within two seconds and produce tokens in both forms.
#[test]
fn stress_mixed_content_large() {
    use std::time::{Duration, Instant};

    let mixed = r#"
        function test() { return x + y; }
        SELECT * FROM users WHERE id = 1;
        The quick brown fox 狐狸 jumps over lazy dog
        Error: "undefined is not a function" at line 42
        https://example.com/path?query=value&other=123
    "#
    .repeat(100);

    let timer = Instant::now();
    let sanitized = sanitize_query(&mixed);
    let tokens = parse_boolean_query(&sanitized);
    let qt = QueryTermsLower::from_query(&mixed);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(2),
        "Mixed content query took {:?} (>2s)",
        took
    );
    assert!(!tokens.is_empty());
    assert!(!qt.is_empty());
}
19278
/// The raw parser keeps an emoji-joined word as one token, while the sanitizer is expected
/// to replace a run of emoji between letters with a single space.
#[test]
fn unicode_emoji_mixed_with_alphanumeric() {
    let glued = "rocket🚀launch";

    let tokens = parse_boolean_query(glued);
    assert_eq!(tokens.len(), 1);

    assert_eq!(sanitize_query(glued), "rocket launch");
    assert_eq!(sanitize_query("test🔥🎯code"), "test code");
}
19298
/// Emoji attached to a word must not stop the parser from producing terms or from
/// recognising an `OR` operator.
#[test]
fn unicode_emoji_with_boolean_operators() {
    let and_tokens = parse_boolean_query("🚀code AND test");
    let terms_seen = and_tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert!(terms_seen >= 1, "Should have at least one term");

    let or_tokens = parse_boolean_query("deploy OR 🎯target");
    let saw_or = or_tokens.iter().any(|tok| matches!(tok, QueryToken::Or));
    assert!(saw_or, "Should detect OR operator");
}
19315
/// A leading or trailing emoji is expected to become a single space, and an emoji-only
/// query must be blank once trimmed.
#[test]
fn unicode_emoji_at_word_boundaries() {
    assert_eq!(sanitize_query("🔍search"), " search");
    assert_eq!(sanitize_query("complete✅"), "complete ");

    let emoji_only = sanitize_query("🎉🎊🎁");
    let blank_after_trim = emoji_only.trim().is_empty();
    assert!(
        blank_after_trim,
        "Emoji-only should be empty after trimming"
    );
}
19333
/// Arabic text must pass through the sanitizer unchanged and must parse into tokens.
#[test]
fn unicode_arabic_text_preserved() {
    let arabic = "مرحبا بالعالم";

    let cleaned = sanitize_query(arabic);
    assert_eq!(
        cleaned, arabic,
        "Arabic alphanumeric chars should be preserved"
    );

    let parsed = parse_boolean_query(arabic);
    assert!(!parsed.is_empty(), "Arabic query should produce tokens");
}
19349
/// Hebrew text must pass through the sanitizer unchanged and must parse into tokens.
#[test]
fn unicode_hebrew_text_preserved() {
    let hebrew = "שלום עולם";

    let cleaned = sanitize_query(hebrew);
    assert_eq!(
        cleaned, hebrew,
        "Hebrew alphanumeric chars should be preserved"
    );

    let parsed = parse_boolean_query(hebrew);
    assert!(!parsed.is_empty(), "Hebrew query should produce tokens");
}
19363
/// A query mixing Latin and Arabic words must sanitize unchanged and parse into three terms.
#[test]
fn unicode_mixed_rtl_and_ltr() {
    let mixed = "hello مرحبا world";

    let cleaned = sanitize_query(mixed);
    assert_eq!(cleaned, mixed, "Mixed RTL/LTR should be preserved");

    let parsed = parse_boolean_query(mixed);
    let mut terms_seen = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms_seen += 1;
        }
    }
    assert_eq!(terms_seen, 3, "Should have 3 terms");
}
19378
/// `AND` between Hebrew words and `NOT` between Arabic words must both be recognised as
/// operators.
#[test]
fn unicode_rtl_with_boolean_operators() {
    let hebrew_tokens = parse_boolean_query("שלום AND עולם");
    let saw_and = hebrew_tokens
        .iter()
        .any(|tok| matches!(tok, QueryToken::And));
    assert!(saw_and, "Should detect AND operator in Hebrew query");

    let arabic_tokens = parse_boolean_query("مرحبا NOT بالعالم");
    let saw_not = arabic_tokens
        .iter()
        .any(|tok| matches!(tok, QueryToken::Not));
    assert!(saw_not, "Should detect NOT operator in Arabic query");
}
19393
/// Backslashes between words are expected to sanitize into single spaces.
#[test]
fn special_chars_backslash_stripped() {
    let cleaned = sanitize_query(r"path\to\file");
    assert_eq!(cleaned, "path to file");
}
19403
/// Double quotes preceded by backslashes must still be present after sanitizing.
#[test]
fn special_chars_escaped_quotes_handling() {
    let cleaned = sanitize_query(r#"say \"hello\""#);
    let kept_quotes = cleaned.contains('"');
    assert!(kept_quotes, "Quotes should be preserved");
}
19412
/// A Windows path is expected to sanitize into its space-separated components.
#[test]
fn special_chars_windows_paths() {
    let cleaned = sanitize_query(r"C:\Users\test\Documents");
    assert_eq!(cleaned, "C Users test Documents");
}
19420
/// A chain of mixed operators must parse into two `And`, one `Or` and one `Not` token.
#[test]
fn boolean_deeply_nested_operators() {
    let tokens = parse_boolean_query("a AND b OR c NOT d AND e");

    // Fold the token stream into (and, or, not) tallies in one pass.
    let (and_count, or_count, not_count) =
        tokens
            .iter()
            .fold((0, 0, 0), |(ands, ors, nots), tok| match tok {
                QueryToken::And => (ands + 1, ors, nots),
                QueryToken::Or => (ands, ors + 1, nots),
                QueryToken::Not => (ands, ors, nots + 1),
                _ => (ands, ors, nots),
            });

    assert_eq!(and_count, 2, "Should have 2 AND operators");
    assert_eq!(or_count, 1, "Should have 1 OR operator");
    assert_eq!(not_count, 1, "Should have 1 NOT operator");
}
19445
/// A doubled `AND` must not swallow the surrounding words: at least two terms remain.
#[test]
fn boolean_consecutive_operators_degenerate() {
    let tokens = parse_boolean_query("foo AND AND bar");
    let terms_seen = tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert!(
        terms_seen >= 2,
        "Should have at least 2 terms (foo and bar)"
    );
}
19460
/// A leading `AND` or `OR` must still be emitted as an operator token.
#[test]
fn boolean_operator_at_start() {
    let and_first = parse_boolean_query("AND foo");
    let saw_and = and_first.iter().any(|tok| matches!(tok, QueryToken::And));
    assert!(saw_and, "Leading AND should be detected");

    let or_first = parse_boolean_query("OR test");
    let saw_or = or_first.iter().any(|tok| matches!(tok, QueryToken::Or));
    assert!(saw_or, "Leading OR should be detected");
}
19472
/// A trailing `AND` must still be emitted as an operator token.
#[test]
fn boolean_operator_at_end() {
    let parsed = parse_boolean_query("foo AND");
    let saw_and = parsed.iter().any(|tok| matches!(tok, QueryToken::And));
    assert!(saw_and, "Trailing AND should be detected");
}
19480
/// A digits-only query parses into a single matching `Term` and sanitizes unchanged.
#[test]
fn numeric_query_digits_only() {
    let digits = "12345";

    let tokens = parse_boolean_query(digits);
    assert_eq!(tokens.len(), 1);
    assert_eq!(tokens[0], QueryToken::Term(digits.to_string()));

    assert_eq!(sanitize_query(digits), "12345");
}
19493
/// A query mixing words and a number must yield at least three `Term` tokens.
#[test]
fn numeric_query_with_text() {
    let parsed = parse_boolean_query("error 404 not found");
    let terms_seen = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert!(terms_seen >= 3, "Should have at least 3 terms");
}
19505
/// Dots in a version number are expected to sanitize into spaces.
#[test]
fn numeric_versions_with_dots() {
    let cleaned = sanitize_query("version 1.2.3");
    assert_eq!(cleaned, "version 1 2 3");
}
19512
/// Tab characters must separate a query into individual terms.
#[test]
fn whitespace_tabs_treated_as_separators() {
    let parsed = parse_boolean_query("foo\tbar\tbaz");
    let terms_seen = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert_eq!(terms_seen, 3, "Tabs should separate terms");
}
19524
/// Newline characters must separate a query into individual terms.
#[test]
fn whitespace_newlines_treated_as_separators() {
    let parsed = parse_boolean_query("foo\nbar\nbaz");
    let terms_seen = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert_eq!(terms_seen, 3, "Newlines should separate terms");
}
19534
/// Runs of mixed spaces, tabs and newlines must still separate four terms.
#[test]
fn whitespace_mixed_types() {
    let parsed = parse_boolean_query("a \t b \n c d");
    let terms_seen = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Term(_)))
        .count();
    assert_eq!(terms_seen, 4, "Mixed whitespace should separate properly");
}
19544
/// Parsing a single 10,000-character word directly (no sanitizing) must finish within one
/// second and yield exactly one `Term` of that length.
#[test]
fn stress_very_long_single_term() {
    use std::time::{Duration, Instant};

    let long_term = "a".repeat(10_000);

    let timer = Instant::now();
    let tokens = parse_boolean_query(&long_term);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "10K char term took {:?} (>1s)",
        took
    );
    assert_eq!(tokens.len(), 1);

    let is_full_length_term = match tokens.first() {
        Some(QueryToken::Term(text)) => text.len() == 10_000,
        _ => false,
    };
    assert!(
        is_full_length_term,
        "Expected 10K Term token, got {tokens:?}"
    );
}
19567
/// A 6000-character word followed by `*` must sanitize and parse as a wildcard within one
/// second and be classified as a `Prefix` pattern.
#[test]
fn stress_very_long_term_with_wildcard() {
    use std::time::{Duration, Instant};

    let long_pattern = format!("{}*", "prefix".repeat(1000));

    let timer = Instant::now();
    let sanitized = sanitize_query(&long_pattern);
    let pattern = WildcardPattern::parse(&sanitized);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Long wildcard pattern took {:?} (>1s)",
        took
    );
    let is_prefix = matches!(pattern, WildcardPattern::Prefix(_));
    assert!(is_prefix, "Should parse as prefix pattern");
}
19588
/// An empty query string must be classified as `QueryType::Empty`.
#[test]
fn query_explanation_empty_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("", &filters);
    assert_eq!(explanation.query_type, QueryType::Empty);
}
19596
/// The default `SearchMode` must be `Hybrid`.
#[test]
fn search_mode_default_is_hybrid_preferred() {
    let default_mode = SearchMode::default();
    assert_eq!(default_mode, SearchMode::Hybrid);
}
19601
/// A query consisting only of whitespace must be classified as `QueryType::Empty`.
#[test]
fn query_explanation_whitespace_only_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze(" \t\n ", &filters);
    assert_eq!(explanation.query_type, QueryType::Empty);
}
19607
/// A query mixing Japanese and Latin words must produce a non-empty term list.
#[test]
fn query_explanation_unicode_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("日本語 search", &filters);
    assert!(!explanation.parsed.terms.is_empty());
}
19614
/// Upper-case accented letters must be lowered along with the ASCII ones.
#[test]
fn query_terms_lower_unicode_normalization() {
    let lowered = QueryTermsLower::from_query("CAFÉ RÉSUMÉ");
    assert_eq!(lowered.query_lower, "café résumé");
}
19623
/// The Latin parts of a mixed Latin/Japanese word must appear lowered in `query_lower`.
#[test]
fn query_terms_lower_mixed_case_unicode() {
    let terms = QueryTermsLower::from_query("Hello日本語World");
    let lowered = &terms.query_lower;
    assert!(lowered.contains("hello"));
    assert!(lowered.contains("world"));
}
19632
/// Digits inside a word must survive lowering untouched.
#[test]
fn query_terms_lower_preserves_numbers() {
    let lowered = QueryTermsLower::from_query("ABC123XYZ");
    assert_eq!(lowered.query_lower, "abc123xyz");
}
19638
/// A pattern with one asterisk in the middle must be classified as `Complex`.
#[test]
fn wildcard_pattern_internal_asterisk() {
    let parsed = WildcardPattern::parse("f*o");
    let is_complex = matches!(parsed, WildcardPattern::Complex(_));
    assert!(is_complex, "Internal asterisk should be Complex");
}
19650
/// A pattern with several asterisks in the middle must be classified as `Complex`.
#[test]
fn wildcard_pattern_multiple_internal_asterisks() {
    let parsed = WildcardPattern::parse("a*b*c");
    let is_complex = matches!(parsed, WildcardPattern::Complex(_));
    assert!(is_complex, "Multiple internal asterisks should be Complex");
}
19660
/// When `*foo.bar*` produces a regex, the literal dot in it must be escaped.
/// If `to_regex` returns `None` nothing is checked.
#[test]
fn wildcard_pattern_regex_escapes_special_chars() {
    let pattern = WildcardPattern::parse("*foo.bar*");
    match pattern.to_regex() {
        Some(regex) => assert!(
            regex.contains("\\."),
            "Dot should be escaped in regex: {}",
            regex
        ),
        None => {}
    }
}
19673
/// When `f*o*o` produces a regex, it must contain `.*` for the internal wildcards.
/// If `to_regex` returns `None` nothing is checked.
#[test]
fn wildcard_pattern_complex_regex_generation() {
    let pattern = WildcardPattern::parse("f*o*o");
    match pattern.to_regex() {
        Some(regex) => assert!(
            regex.contains(".*"),
            "Should have .* for internal wildcards: {}",
            regex
        ),
        None => {}
    }
}
19686
/// Table-driven check of `transpile_to_fts5`: each row pairs an input query with the
/// expected FTS5 expression, or `None` where the test expects no transpilation.
#[test]
fn test_transpile_to_fts5() {
    let cases: &[(&str, Option<&str>)] = &[
        // Implicit and explicit conjunction.
        ("foo bar", Some("foo AND bar")),
        ("foo AND bar", Some("foo AND bar")),
        // Disjunction, including a dangling leading operator.
        ("foo OR bar", Some("(foo OR bar)")),
        ("OR foo", Some("foo")),
        ("NOT foo", None),
        // Grouping of OR relative to AND.
        ("A AND B OR C", Some("A AND (B OR C)")),
        ("A OR B AND C", Some("(A OR B) AND C")),
        ("A OR B OR C", Some("(A OR B OR C)")),
        // Phrases.
        ("\"foo bar\"", Some("\"foo bar\"")),
        // Wildcards: only the trailing form is expected to transpile.
        ("foo*", Some("foo*")),
        ("*foo", None),
        ("f*o", None),
        // Punctuated terms are expected to become parenthesised conjunctions.
        ("foo-bar", Some("(foo AND bar)")),
        ("foo-bar*", Some("(foo AND bar*)")),
        ("br-123.jsonl", Some("(br AND 123 AND jsonl)")),
        ("br-123.json*", Some("(br AND 123 AND json*)")),
        ("NOT A OR B", None),
    ];

    for &(input, expected) in cases {
        assert_eq!(transpile_to_fts5(input), expected.map(str::to_string));
    }
}
19761
/// A pipe-delimited semantic doc id with a 64-hex-char hash suffix must parse back into
/// the numeric fields it was built from.
#[test]
fn semantic_doc_id_roundtrip_from_query() {
    let hash_hex = "00".repeat(32);
    let encoded = format!("m|42|2|3|7|11|1|1700000000000|{hash_hex}");

    let fields = parse_semantic_doc_id(&encoded).expect("roundtrip parse");

    assert_eq!(fields.message_id, 42);
    assert_eq!(fields.chunk_idx, 2);
    assert_eq!(fields.agent_id, 3);
    assert_eq!(fields.workspace_id, 7);
    assert_eq!(fields.source_id, 11);
    assert_eq!(fields.role, 1);
    assert_eq!(fields.created_at_ms, 1_700_000_000_000);
}
19775
/// A fully populated `SemanticFilter` must accept a doc id that satisfies every field and
/// reject ids with a wrong agent, an out-of-range timestamp, or an unparseable shape.
#[test]
fn semantic_filter_applies_all_constraints() {
    use frankensearch::core::filter::SearchFilter;

    let agents = HashSet::from([3]);
    let workspaces = HashSet::from([7]);
    let sources = HashSet::from([11]);
    let roles = HashSet::from([1]);

    let filter = SemanticFilter {
        agents: Some(agents),
        workspaces: Some(workspaces),
        sources: Some(sources),
        roles: Some(roles),
        created_from: Some(1_700_000_000_000),
        created_to: Some(1_700_000_000_100),
    };

    // Matches every constraint.
    assert!(filter.matches("m|42|2|3|7|11|1|1700000000001", None));
    // Agent 99 is not in the allowed set.
    assert!(!filter.matches("m|42|2|99|7|11|1|1700000000001", None));
    // Timestamp is below `created_from`.
    assert!(!filter.matches("m|42|2|3|7|11|1|1699999999999", None));
    // Not a pipe-delimited doc id at all.
    assert!(!filter.matches("not-a-doc-id", None));
}
19794
/// Writes a two-record vector index into a temp directory, reopens it, and checks that a
/// top-k search restricted to agent 1 returns only the record whose doc id carries agent 1.
#[test]
fn fs_semantic_index_runs_filtered_search() -> Result<()> {
    let temp = TempDir::new()?;
    let index_path = crate::search::vector_index::vector_index_path(temp.path(), "embed-fast");
    if let Some(index_dir) = index_path.parent() {
        std::fs::create_dir_all(index_dir)?;
    }

    // Two documents that differ in message id, agent, workspace, source and hash.
    let hash_a = "00".repeat(32);
    let doc_a = format!("m|101|0|1|10|100|1|1700000000001|{hash_a}");
    let hash_b = "11".repeat(32);
    let doc_b = format!("m|202|0|2|20|200|1|1700000000002|{hash_b}");

    let created = VectorIndex::create_with_revision(
        &index_path,
        "embed-fast",
        "rev-1",
        2,
        frankensearch::index::Quantization::F16,
    );
    let mut writer = created.map_err(|err| anyhow!("create fsvi index failed: {err}"))?;

    let wrote_a = writer.write_record(&doc_a, &[1.0, 0.0]);
    wrote_a.map_err(|err| anyhow!("write_record failed: {err}"))?;
    let wrote_b = writer.write_record(&doc_b, &[0.0, 1.0]);
    wrote_b.map_err(|err| anyhow!("write_record failed: {err}"))?;

    let finished = writer.finish();
    finished.map_err(|err| anyhow!("finish fsvi index failed: {err}"))?;

    let opened = VectorIndex::open(&index_path);
    let fs_index = opened.map_err(|err| anyhow!("open fsvi failed: {err}"))?;

    // Only the agent constraint is active; every other field is left unconstrained.
    let agent_one_only = SemanticFilter {
        agents: Some(HashSet::from([1])),
        workspaces: None,
        sources: None,
        roles: None,
        created_from: None,
        created_to: None,
    };
    let fs_filter =
        semantic_filter_as_search_filter(&agent_one_only).expect("expected active filter");

    let searched = fs_index.search_top_k(&[1.0, 0.0], 5, Some(fs_filter));
    let hits = searched.map_err(|err| anyhow!("frankensearch search failed: {err}"))?;
    assert_eq!(hits.len(), 1);

    let parsed = parse_semantic_doc_id(&hits[0].doc_id).expect("parse bridged doc_id");
    assert_eq!(parsed.message_id, 101);
    assert_eq!(parsed.agent_id, 1);
    Ok(())
}
19846
/// A hit whose title, snippet and content are all empty must not be classified as noise,
/// whether the query is non-empty or empty.
#[test]
fn hit_is_noise_returns_false_when_content_and_snippet_both_empty() {
    let projection_only = SearchHit {
        // Text fields: all empty for this scenario.
        title: String::new(),
        snippet: String::new(),
        content: String::new(),
        content_hash: 0,
        // Identity and ranking.
        conversation_id: Some(1),
        score: 1.0,
        match_type: MatchType::Exact,
        line_number: Some(1),
        created_at: Some(1_700_000_000_000),
        // Provenance.
        source_path: "/tmp/session.jsonl".to_string(),
        agent: "codex".to_string(),
        workspace: String::new(),
        workspace_original: None,
        source_id: "local".to_string(),
        origin_kind: "local".to_string(),
        origin_host: None,
    };

    let noisy_for_query = hit_is_noise(&projection_only, "anything");
    assert!(
        !noisy_for_query,
        "hit with empty content AND snippet (projection-only) must NOT be classified as noise"
    );

    let noisy_for_empty_query = hit_is_noise(&projection_only, "");
    assert!(
        !noisy_for_empty_query,
        "noise classifier must not treat an empty-query projection-only hit as noise"
    );
}
19891
/// A hit whose content is the bare acknowledgement `"ok"` must still be classified as
/// noise for an empty query.
#[test]
fn hit_is_noise_still_drops_tool_acknowledgement_when_content_present() {
    let bare_ack = SearchHit {
        // Text fields: only `content` is populated.
        title: String::new(),
        snippet: String::new(),
        content: "ok".to_string(),
        content_hash: 0,
        // Identity and ranking.
        conversation_id: Some(1),
        score: 1.0,
        match_type: MatchType::Exact,
        line_number: Some(1),
        created_at: Some(1_700_000_000_000),
        // Provenance.
        source_path: "/tmp/session.jsonl".to_string(),
        agent: "codex".to_string(),
        workspace: String::new(),
        workspace_original: None,
        source_id: "local".to_string(),
        origin_kind: "local".to_string(),
        origin_host: None,
    };

    let classified_as_noise = hit_is_noise(&bare_ack, "");
    assert!(
        classified_as_noise,
        "bare tool-ack 'ok' with content present should still be dropped as noise"
    );
}
19922}