1use anyhow::{Context, Result, anyhow, bail};
2use crossbeam_channel as mpsc;
3use frankensearch::lexical::{
4 BooleanQuery, CASS_SCHEMA_HASH as FS_CASS_SCHEMA_HASH, CassFields as FsCassFields,
5 CassQueryFilters as FsCassQueryFilters, CassQueryToken as FsCassQueryToken,
6 CassSourceFilter as FsCassSourceFilter, CassWildcardPattern as FsCassWildcardPattern, Count,
7 IndexReader, IndexRecordOption, LexicalDocHit as FsLexicalDocHit,
8 LexicalSearchResult as FsLexicalSearchResult, Occur, Query, ReloadPolicy, Searcher,
9 SnippetConfig as FsSnippetConfig, TantivyDocument, Term, TermQuery, TopDocs, Value,
10 cass_build_tantivy_query as fs_cass_build_tantivy_query,
11 cass_has_boolean_operators as fs_cass_has_boolean_operators,
12 cass_open_search_reader as fs_cass_open_search_reader,
13 cass_parse_boolean_query as fs_cass_parse_boolean_query,
14 cass_sanitize_query as fs_cass_sanitize_query, load_doc as fs_load_doc,
15 render_snippet_html as fs_render_snippet_html,
16 try_build_snippet_generator as fs_try_build_snippet_generator,
17};
18use frankensearch::{
19 Cx as FsCx, InMemoryTwoTierIndex as FsInMemoryTwoTierIndex,
20 InMemoryVectorIndex as FsInMemoryVectorIndex, LexicalSearch as FsLexicalSearch,
21 QueryClass as FsQueryClass, RrfConfig as FsRrfConfig, ScoreSource as FsScoreSource,
22 ScoredResult as FsScoredResult, SearchError as FsSearchError, SearchFuture as FsSearchFuture,
23 SearchPhase as FsSearchPhase, SyncEmbedderAdapter as FsSyncEmbedderAdapter,
24 SyncTwoTierSearcher as FsSyncTwoTierSearcher, TwoTierConfig as FsTwoTierConfig,
25 TwoTierIndex as FsTwoTierIndex, TwoTierSearcher as FsTwoTierSearcher, VectorHit as FsVectorHit,
26 candidate_count as fs_candidate_count,
27 core::filter::SearchFilter as FsSearchFilter,
28 index::{
29 HNSW_DEFAULT_EF_SEARCH as FS_HNSW_DEFAULT_EF_SEARCH, HnswIndex as FsHnswIndex,
30 VectorIndex as FsVectorIndex,
31 },
32 rrf_fuse as fs_rrf_fuse,
33};
34use lru::LruCache;
35use once_cell::sync::Lazy;
36use parking_lot::RwLock;
37use std::cell::RefCell;
38use std::cmp::Ordering as CmpOrdering;
39use std::collections::{HashMap, HashSet, VecDeque};
40use std::hash::{Hash, Hasher};
41use std::num::NonZeroUsize;
42use std::path::{Path, PathBuf};
43use std::sync::atomic::{AtomicU64, Ordering};
44use std::sync::{Arc, Mutex};
45use std::time::{Duration, Instant};
46
47use frankensqlite::Connection;
48#[cfg(test)]
49use frankensqlite::compat::OptionalExtension;
50use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
51#[cfg(test)]
52use frankensqlite::params;
53
/// Newtype wrapper around a [`Connection`].
///
/// Elsewhere in this file the wrapper is given an `unsafe impl Send` and a
/// `Deref` to the inner connection, so it can be used wherever a
/// `&Connection` is expected.
struct SendConnection(Connection);
62
/// Exact lookup key for message content: `(conversation_id, message idx)`,
/// as destructured by `hydrate_message_content_by_conversation`.
type TantivyContentExactKey = (i64, i64);
/// Fallback lookup key used when no exact key is available.
// NOTE(review): the meaning of the two strings and the integer is not visible
// in this part of the file — confirm against the code that builds these keys.
type TantivyContentFallbackKey = (String, String, i64);
/// Pair of hydrated-content maps: one keyed by the exact key, one keyed by
/// the fallback key.
type TantivyHydratedContentMaps = (
    HashMap<TantivyContentExactKey, String>,
    HashMap<TantivyContentFallbackKey, String>,
);
/// Raw row tuple produced by an FTS hydration query.
// NOTE(review): column meanings are defined by the SQL that yields these rows
// (not shown here); only the first column is non-nullable.
type SqliteFtsHydratedRow = (
    i64,
    Option<i64>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<i64>,
);
/// Raw row tuple produced by an FTS message query.
// NOTE(review): column meanings are defined by the SQL that yields these rows
// (not shown here).
type SqliteFtsMessageRow = (
    i64,
    String,
    String,
    String,
    String,
    String,
    Option<i64>,
    Option<i64>,
    Option<i64>,
    Option<String>,
    Option<String>,
    Option<String>,
);
/// One alternative inside a message-scan group: a list of strings.
type SqliteMessageScanAlternative = Vec<String>;
/// A group of alternatives for the message-scan fallback.
// NOTE(review): presumably alternatives within a group are OR-ed and the
// strings inside one alternative are AND-ed — confirm against the matcher.
type SqliteMessageScanGroup = Vec<SqliteMessageScanAlternative>;
/// Parsed form of a query for the SQLite message-scan path.
// NOTE(review): how groups and exclusions are combined is decided by the
// scanning code, which is not visible here.
struct SqliteMessageScanQuery {
    /// Groups of alternatives that select messages for inclusion.
    include_groups: Vec<SqliteMessageScanGroup>,
    /// Terms that exclude a message.
    exclude_terms: Vec<String>,
}
99
/// Borrowed, cheaply copyable bundle of parameters for one message-scan
/// request.
#[derive(Clone, Copy)]
struct SqliteMessageScanRequest<'a> {
    /// Query text as supplied by the caller.
    raw_query: &'a str,
    /// Filters to apply to the scan.
    filters: &'a SearchFilters,
    /// Maximum number of hits requested.
    limit: usize,
    /// Number of leading hits to skip.
    offset: usize,
    /// Which hit fields the caller wants populated.
    field_mask: FieldMask,
    /// Match type associated with the query.
    query_match_type: MatchType,
}
109
/// Selects how an FTS `MATCH` is issued.
// NOTE(review): the exact SQL generated for each mode lives outside this
// section; the variant names suggest "whole table" vs. "specific indexed
// columns" — confirm at the use site.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SqliteFtsMatchMode {
    /// Match against the FTS table.
    Table,
    /// Match against the indexed columns.
    IndexedColumns,
}
115
// NOTE(review): the consumers of most of these constants are outside this
// section; the descriptions below are derived from the names and values only.

/// Chunk size (in bound parameters) for FTS5 hydration queries.
const SQLITE_FTS5_HYDRATE_PARAM_CHUNK: usize = 30_000;
/// Maximum number of bound parameters assumed for a single statement.
// NOTE(review): 32_766 is upstream SQLite's default limit since 3.32 —
// confirm it also holds for the frankensqlite backend.
const SQLITE_MAX_VARIABLE_NUMBER: usize = 32_766;
/// Chunk size for the FTS5 post-filter scan.
const SQLITE_FTS5_POST_FILTER_SCAN_CHUNK: usize = 1_024;
/// Upper bound for the FTS5 post-filter scan.
const SQLITE_FTS5_POST_FILTER_SCAN_LIMIT: usize = 30_000;
/// Upper bound for the message-scan fallback.
const SQLITE_MESSAGE_SCAN_FALLBACK_LIMIT: usize = 30_000;
/// Page-cache size in KiB for search hydration connections; applied as
/// `PRAGMA cache_size = -<value>` by `open_search_hydration_sqlite`.
const SEARCH_SQLITE_HYDRATION_CACHE_KIB: i64 = 4_096;
/// Over-fetch multiplier for exact semantic chunk search.
const SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER: usize = 4;
126
// SAFETY: NOTE(review) — the invariant that makes this sound is not stated in
// the visible source. Presumably each wrapped connection is only ever used by
// one thread at a time; confirm and record the actual justification here.
unsafe impl Send for SendConnection {}
130
131impl std::ops::Deref for SendConnection {
132 type Target = Connection;
133 fn deref(&self) -> &Connection {
134 &self.0
135 }
136}
137
138fn open_search_hydration_sqlite(path: &Path, timeout: Duration) -> Result<Connection> {
139 let conn =
140 crate::storage::sqlite::open_franken_raw_readonly_connection_with_timeout(path, timeout)?;
141 conn.execute("PRAGMA query_only = 1;")
142 .with_context(|| "setting search hydration query_only")?;
143 conn.execute("PRAGMA busy_timeout = 5000;")
144 .with_context(|| "setting search hydration busy_timeout")?;
145 conn.execute(&format!(
146 "PRAGMA cache_size = -{SEARCH_SQLITE_HYDRATION_CACHE_KIB};"
147 ))
148 .with_context(|| "setting search hydration cache_size")?;
149 Ok(conn)
150}
151
152fn nfc_sanitize_query(raw: &str) -> String {
156 use unicode_normalization::UnicodeNormalization;
157 let nfc: String = raw.nfc().collect();
158 fs_cass_sanitize_query(&nfc)
159}
160
161fn franken_query_map_collect_retry<T, F>(
162 conn: &Connection,
163 sql: &str,
164 params: &[ParamValue],
165 map: F,
166) -> Result<Vec<T>, frankensqlite::FrankenError>
167where
168 F: Copy + Fn(&frankensqlite::Row) -> Result<T, frankensqlite::FrankenError>,
169{
170 let deadline = Instant::now() + Duration::from_secs(2);
171 let mut backoff = Duration::from_millis(4);
172 loop {
173 match conn.query_map_collect(sql, params, |row| map(row)) {
174 Ok(values) => return Ok(values),
175 Err(err) if crate::storage::sqlite::retryable_franken_error(&err) => {
176 let now = Instant::now();
177 if now >= deadline {
178 return Err(err);
179 }
180 let remaining = deadline.saturating_duration_since(now);
181 crate::storage::sqlite::sleep_with_franken_retry_backoff(
182 &mut backoff,
183 remaining,
184 Duration::from_millis(64),
185 );
186 }
187 Err(err) => return Err(err),
188 }
189 }
190}
191
192fn hydrate_message_content_by_conversation(
193 conn: &Connection,
194 requests: &[TantivyContentExactKey],
195) -> Result<HashMap<TantivyContentExactKey, String>> {
196 if requests.is_empty() {
197 return Ok(HashMap::new());
198 }
199
200 let mut wanted_by_conversation: HashMap<i64, HashSet<i64>> = HashMap::new();
201 for &(conversation_id, line_idx) in requests {
202 wanted_by_conversation
203 .entry(conversation_id)
204 .or_default()
205 .insert(line_idx);
206 }
207
208 let mut conversation_ids = wanted_by_conversation.keys().copied().collect::<Vec<_>>();
209 conversation_ids.sort_unstable();
210 let mut hydrated = HashMap::with_capacity(requests.len());
211
212 for conversation_id in conversation_ids {
213 let Some(wanted_indices) = wanted_by_conversation.get(&conversation_id) else {
214 continue;
215 };
216 let mut wanted_indices = wanted_indices.iter().copied().collect::<Vec<_>>();
217 wanted_indices.sort_unstable();
218 let placeholders = sql_placeholders(wanted_indices.len());
219 let sql = format!(
220 "SELECT m.conversation_id, m.idx, m.content
221 FROM messages m INDEXED BY sqlite_autoindex_messages_1
222 WHERE m.conversation_id = ? AND m.idx IN ({placeholders})
223 ORDER BY m.idx"
224 );
225 let mut params = Vec::with_capacity(wanted_indices.len() + 1);
226 params.push(ParamValue::from(conversation_id));
227 params.extend(wanted_indices.iter().copied().map(ParamValue::from));
228 let rows: Vec<(i64, i64, String)> =
229 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
230 Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?))
231 })?;
232 for (conversation_id, line_idx, content) in rows {
233 hydrated.insert((conversation_id, line_idx), content);
234 }
235 }
236
237 Ok(hydrated)
238}
239
/// Converts a database message id to `u64`.
///
/// # Errors
/// Returns an `io::Error` with the message `"negative message_id"` when
/// `message_id` is below zero.
fn semantic_message_id_from_db(message_id: i64) -> std::io::Result<u64> {
    match u64::try_from(message_id) {
        Ok(id) => Ok(id),
        Err(_) => Err(std::io::Error::other("negative message_id")),
    }
}
243
/// Converts an optional database integer into a `u32` component id.
///
/// `None` and non-positive values map to `0`; values above `u32::MAX`
/// saturate to `u32::MAX`.
fn semantic_doc_component_id_from_db(raw: Option<i64>) -> u32 {
    match raw {
        None => 0,
        Some(value) if value <= 0 => 0,
        Some(value) => u32::try_from(value).unwrap_or(u32::MAX),
    }
}
248
249use crate::search::canonicalize::{canonicalize_for_embedding, content_hash, is_search_noise_text};
250use crate::search::embedder::Embedder;
251use crate::search::vector_index::{
252 ROLE_USER, SemanticDocId, SemanticFilter, SemanticFilterMaps, VectorIndex, VectorSearchResult,
253 parse_semantic_doc_id, role_code_from_str,
254};
255use crate::sources::provenance::SourceFilter;
256
/// Bounded string interner backed by an LRU cache behind a read/write lock.
///
/// Keys and values are the same `Arc<str>`, so interning a string that is
/// already cached hands back a shared pointer instead of a new allocation.
pub struct StringInterner {
    /// LRU map from string to its shared `Arc<str>`.
    cache: RwLock<LruCache<Arc<str>, Arc<str>>>,
}
270
271impl StringInterner {
272 pub fn new(capacity: usize) -> Self {
274 Self {
275 cache: RwLock::new(LruCache::new(
276 NonZeroUsize::new(capacity).expect("capacity must be > 0"),
277 )),
278 }
279 }
280
281 pub fn intern(&self, s: &str) -> Arc<str> {
287 {
289 let cache = self.cache.read();
290 if let Some(arc) = cache.peek(s) {
293 return Arc::clone(arc);
294 }
295 }
296
297 let mut cache = self.cache.write();
299
300 if let Some(arc) = cache.get(s) {
303 return Arc::clone(arc);
304 }
305
306 let arc: Arc<str> = Arc::from(s);
308 cache.put(Arc::clone(&arc), Arc::clone(&arc));
309 arc
310 }
311
312 #[allow(dead_code)]
314 pub fn len(&self) -> usize {
315 self.cache.read().len()
316 }
317
318 #[allow(dead_code)]
320 pub fn is_empty(&self) -> bool {
321 self.cache.read().is_empty()
322 }
323}
324
/// Process-wide interner for cache keys, created lazily with room for
/// 10,000 strings.
static CACHE_KEY_INTERNER: Lazy<StringInterner> = Lazy::new(|| StringInterner::new(10_000));
328
329#[inline]
331fn intern_cache_key(s: &str) -> Arc<str> {
332 CACHE_KEY_INTERNER.intern(s)
333}
334
/// Builds a comma-separated list of `count` SQL `?` placeholders.
///
/// Returns an empty string for `count == 0`, `"?"` for one, `"?,?"` for two,
/// and so on — with no trailing comma.
#[inline]
pub fn sql_placeholders(count: usize) -> String {
    // Emit "?," per placeholder, then drop the single trailing comma.
    let mut placeholders = "?,".repeat(count);
    placeholders.pop();
    placeholders
}
366
/// Filters that narrow a search. The `Default` value has every set empty and
/// every bound unset.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct SearchFilters {
    /// Agents to restrict the search to.
    pub agents: HashSet<String>,
    /// Workspaces to restrict the search to.
    pub workspaces: HashSet<String>,
    /// Lower creation-time bound.
    // NOTE(review): units and inclusivity are not visible here — confirm.
    pub created_from: Option<i64>,
    /// Upper creation-time bound.
    pub created_to: Option<i64>,
    /// Source/provenance filter; omitted from serialized output when
    /// `SourceFilter::is_all` returns true.
    #[serde(skip_serializing_if = "SourceFilter::is_all")]
    pub source_filter: SourceFilter,
    /// Session paths to restrict the search to; omitted from serialized
    /// output when empty.
    #[serde(skip_serializing_if = "HashSet::is_empty")]
    pub session_paths: HashSet<String>,
}
380
/// Which retrieval strategy a search uses. Serialized in `snake_case`;
/// `Hybrid` is the default.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, clap::ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum SearchMode {
    /// Lexical search only.
    Lexical,
    /// Semantic search only.
    Semantic,
    /// Lexical and semantic combined (default).
    #[default]
    Hybrid,
}
392
393impl SearchMode {
394 pub fn next(self) -> Self {
395 match self {
396 SearchMode::Lexical => SearchMode::Semantic,
397 SearchMode::Semantic => SearchMode::Hybrid,
398 SearchMode::Hybrid => SearchMode::Lexical,
399 }
400 }
401}
402
/// Selects how semantic search uses its tiers. Serialized in `snake_case`;
/// `Single` is the default and is the only mode that does not ask for
/// two-tier search (see `wants_two_tier`).
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SemanticTierMode {
    /// Single-tier search (default).
    #[default]
    Single,
    /// Two-tier search with the base two-tier configuration unchanged.
    Progressive,
    /// Two-tier search with `fast_only` enabled.
    FastOnly,
    /// Two-tier search with `fast_only` disabled and `quality_weight` at 1.0.
    QualityOnly,
}
418
419impl SemanticTierMode {
420 const fn wants_two_tier(self) -> bool {
421 !matches!(self, Self::Single)
422 }
423
424 fn to_frankensearch_config(self) -> FsTwoTierConfig {
425 let mut config = frankensearch_two_tier_config();
426 match self {
427 Self::Single | Self::Progressive => {}
428 Self::FastOnly => {
429 config.fast_only = true;
430 }
431 Self::QualityOnly => {
432 config.fast_only = false;
433 config.quality_weight = 1.0;
434 }
435 }
436 config
437 }
438}
439
/// Capacity of the progressive-search embedding cache.
// NOTE(review): the consumer is outside this section.
const PROGRESSIVE_EMBEDDING_CACHE_CAPACITY: usize = 64;
/// Candidate multiplier for ANN search.
// NOTE(review): the consumer is outside this section.
const ANN_CANDIDATE_MULTIPLIER: usize = 4;
/// Minimum planning window used by `hybrid_candidate_budget` when the caller
/// requested no limit.
const HYBRID_NO_LIMIT_PLANNING_WINDOW: usize = 64;
/// Baseline ceiling on semantic candidates for no-limit hybrid searches; the
/// ceiling grows to `offset + planning_window` when that is larger.
const HYBRID_NO_LIMIT_SEMANTIC_CAP: usize = 2048;
/// Maximum token length (in chars) for the automatic wildcard fallback.
// NOTE(review): the consumer is outside this section.
const AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS: usize = 16;

/// Lower clamp bound for the no-limit result cap.
pub const NO_LIMIT_RESULT_MIN: usize = 1_000;
/// Upper clamp bound for the no-limit result cap.
pub const NO_LIMIT_RESULT_MAX: usize = 1_000_000;

/// Assumed size of one hit (80 KiB); the byte budget is divided by this to
/// obtain a hit count.
const AVG_HIT_BYTES: u64 = 80 * 1024;

/// Upper clamp bound (16 GiB) for the memory-derived byte budget.
const NO_LIMIT_BYTES_CEILING: u64 = 16 * 1024 * 1024 * 1024;

/// Lower clamp bound (256 MiB) for the memory-derived byte budget; also the
/// budget used when neither an override nor a memory reading is available.
const NO_LIMIT_BYTES_FLOOR: u64 = 256 * 1024 * 1024;

/// Available memory is divided by this to obtain the byte budget.
const NO_LIMIT_RAM_DIVISOR: u64 = 16;
491
/// Reads `MemAvailable` from `/proc/meminfo` and returns it in bytes.
///
/// Returns `None` when the file cannot be read, has no `MemAvailable:` line,
/// or the first such line does not carry a parseable integer. The value in
/// the file is treated as kibibytes and multiplied by 1024 (saturating).
fn available_memory_bytes() -> Option<u64> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    let value_field = meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemAvailable:"))?;
    let kib: u64 = value_field.split_whitespace().next()?.parse().ok()?;
    Some(kib.saturating_mul(1024))
}
502
503fn no_limit_result_cap() -> usize {
504 static CAP: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
505 *CAP.get_or_init(|| {
506 compute_no_limit_result_cap_from(
507 std::env::var("CASS_SEARCH_NO_LIMIT_CAP").ok(),
508 std::env::var("CASS_SEARCH_NO_LIMIT_BYTES").ok(),
509 available_memory_bytes(),
510 )
511 })
512}
513
514fn compute_no_limit_result_cap_from(
521 cap_env: Option<String>,
522 bytes_env: Option<String>,
523 available_bytes: Option<u64>,
524) -> usize {
525 if let Some(hits) = cap_env
529 .and_then(|v| v.parse::<usize>().ok())
530 .filter(|v| *v > 0)
531 {
532 return hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
533 }
534
535 let budget_bytes = no_limit_budget_bytes(bytes_env, available_bytes);
536 let hits = (budget_bytes / AVG_HIT_BYTES) as usize;
537 hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX)
538}
539
540fn no_limit_budget_bytes(bytes_env: Option<String>, available_bytes: Option<u64>) -> u64 {
541 bytes_env
542 .and_then(|v| v.parse::<u64>().ok())
543 .filter(|v| *v > 0)
544 .or_else(|| no_limit_available_memory_budget(available_bytes))
545 .unwrap_or(NO_LIMIT_BYTES_FLOOR)
546}
547
548fn no_limit_available_memory_budget(available_bytes: Option<u64>) -> Option<u64> {
549 available_bytes.map(|avail| {
550 (avail / NO_LIMIT_RAM_DIVISOR).clamp(NO_LIMIT_BYTES_FLOOR, NO_LIMIT_BYTES_CEILING)
551 })
552}
553
/// Base two-tier configuration, built lazily on first use from
/// `FsTwoTierConfig::optimized()` with `with_env_overrides()` applied.
static FRANKENSEARCH_TWO_TIER_CONFIG: Lazy<FsTwoTierConfig> =
    Lazy::new(|| FsTwoTierConfig::optimized().with_env_overrides());
556
557fn frankensearch_two_tier_config() -> FsTwoTierConfig {
558 FRANKENSEARCH_TWO_TIER_CONFIG.clone()
559}
560
/// Number of candidates to fetch per progressive phase: three times `limit`
/// (saturating), with a `limit` of zero treated as one.
#[inline]
const fn progressive_phase_fetch_limit(limit: usize) -> usize {
    let at_least_one = match limit {
        0 => 1,
        nonzero => nonzero,
    };
    at_least_one.saturating_mul(3)
}
566
/// How many candidates each stage of a hybrid search should fetch, as
/// computed by `hybrid_candidate_budget`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct HybridCandidateBudget {
    /// Candidates to fetch from the lexical stage.
    lexical_candidates: usize,
    /// Candidates to fetch from the semantic stage.
    semantic_candidates: usize,
}
572
573#[inline]
574const fn hybrid_stage_multipliers(query_class: FsQueryClass) -> (usize, usize) {
575 match query_class {
576 FsQueryClass::Identifier => (6, 2),
578 FsQueryClass::ShortKeyword => (4, 4),
580 FsQueryClass::NaturalLanguage => (2, 8),
582 FsQueryClass::Empty => (0, 0),
584 }
585}
586
587#[inline]
588fn hybrid_candidate_budget(
589 query: &str,
590 requested_limit: usize,
591 effective_limit: usize,
592 offset: usize,
593 total_docs: usize,
594) -> HybridCandidateBudget {
595 let query_class = FsQueryClass::classify(query);
596 let (lex_mult, sem_mult) = hybrid_stage_multipliers(query_class);
597 let total_docs = total_docs.max(1);
598
599 if requested_limit == 0 {
602 let planning_window = HYBRID_NO_LIMIT_PLANNING_WINDOW.max(offset.saturating_add(1));
603 let lexical = effective_limit.min(total_docs).min(no_limit_result_cap());
608 let semantic = fs_candidate_count(planning_window, 0, sem_mult)
616 .max(planning_window)
617 .min(HYBRID_NO_LIMIT_SEMANTIC_CAP.max(offset.saturating_add(planning_window)))
618 .min(total_docs)
619 .min(lexical);
620 return HybridCandidateBudget {
621 lexical_candidates: lexical,
622 semantic_candidates: semantic,
623 };
624 }
625
626 let lexical = fs_candidate_count(requested_limit, offset, lex_mult.max(1))
627 .max(requested_limit.saturating_add(offset))
628 .min(total_docs);
629 let semantic = fs_candidate_count(requested_limit, offset, sem_mult.max(1))
630 .max(requested_limit.saturating_add(offset))
631 .min(total_docs);
632
633 HybridCandidateBudget {
634 lexical_candidates: lexical,
635 semantic_candidates: semantic,
636 }
637}
638
/// High-level classification of a query, as assigned by
/// `QueryExplanation::classify_query`. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryType {
    /// Plain terms without operators, phrases, wildcards or filters.
    Simple,
    /// Contains at least one quoted phrase.
    Phrase,
    /// Contains explicit boolean operators.
    Boolean,
    /// Contains at least one wildcard term.
    Wildcard,
    /// Has agent, workspace, time or source filters.
    Filtered,
    /// The sanitized query is empty.
    Empty,
}
660
/// Index access strategy reported in a query explanation. Serialized in
/// `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum IndexStrategy {
    /// Reported for a single plain or prefix term.
    EdgeNgram,
    /// Reported when a suffix or substring wildcard is present.
    RegexScan,
    /// Reported for operators, phrases, multiple terms or compound terms.
    BooleanCombination,
    /// Not assigned by the code visible in this section.
    RangeScan,
    /// Reported when the sanitized query is empty.
    FullScan,
}
676
/// Coarse cost estimate for a query, as assigned by
/// `QueryExplanation::estimate_cost`. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryCost {
    /// Complexity score of 2 or less.
    Low,
    /// Complexity score between 3 and 6.
    Medium,
    /// Regex/full scans, time filters, or a complexity score above 6.
    High,
}
688
/// One whitespace-separated piece of a sanitized term.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedSubTerm {
    /// The piece's text after sanitization.
    pub text: String,
    /// Human-readable wildcard pattern label: `"exact"`, `"prefix (*)"`,
    /// `"suffix (*)"`, `"substring (*)"` or `"complex (*)"`.
    pub pattern: String,
}
695
/// One term token from the parsed query.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedTerm {
    /// The term text as produced by the boolean-query parser.
    pub text: String,
    /// Whether a `NOT` operator preceded this term.
    pub negated: bool,
    /// The term's sanitized, whitespace-separated pieces.
    pub subterms: Vec<ParsedSubTerm>,
}
706
/// Structured view of a query built by `QueryExplanation::analyze`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct ParsedQuery {
    /// Term tokens, in query order.
    pub terms: Vec<ParsedTerm>,
    /// Sanitized, lowercased phrases, in query order.
    pub phrases: Vec<String>,
    /// Explicit operators (`"AND"`, `"OR"`, `"NOT"`), in query order.
    pub operators: Vec<String>,
    /// True when there is more than one term and no explicit operator.
    pub implicit_and: bool,
}
719
/// Explanation of how a query is interpreted, produced by
/// `QueryExplanation::analyze`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QueryExplanation {
    /// The query exactly as supplied.
    pub original_query: String,
    /// The query after NFC normalization and sanitization.
    pub sanitized_query: String,
    /// Structured breakdown of terms, phrases and operators.
    pub parsed: ParsedQuery,
    /// High-level classification of the query.
    pub query_type: QueryType,
    /// Index strategy reported for the query.
    pub index_strategy: IndexStrategy,
    /// Whether wildcard fallback was applied; `analyze` sets this to `false`
    /// and `with_wildcard_fallback` updates it.
    pub wildcard_applied: bool,
    /// Coarse cost estimate.
    pub estimated_cost: QueryCost,
    /// Summary of the active filters.
    pub filters_summary: FiltersSummary,
    /// Human-readable warnings about the query.
    pub warnings: Vec<String>,
}
742
/// Summary of the filters attached to a query.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct FiltersSummary {
    /// Number of agents in the filter.
    pub agent_count: usize,
    /// Number of workspaces in the filter.
    pub workspace_count: usize,
    /// Whether a lower or upper creation-time bound is set.
    pub has_time_filter: bool,
    /// `"Filtering by: ..."` text, or `None` when no summarized filter is set.
    pub description: Option<String>,
}
755
756impl QueryExplanation {
757 pub fn analyze(query: &str, filters: &SearchFilters) -> Self {
759 let sanitized = nfc_sanitize_query(query);
760 let tokens = fs_cass_parse_boolean_query(query);
762
763 let mut parsed = ParsedQuery::default();
765 let mut has_explicit_operator = false;
766 let mut next_negated = false;
767
768 for token in &tokens {
769 match token {
770 FsCassQueryToken::Term(t) => {
771 let parts: Vec<String> = nfc_sanitize_query(t)
772 .split_whitespace()
773 .map(|s| s.to_string())
774 .collect();
775 if parts.is_empty() {
776 next_negated = false;
777 continue;
778 }
779 let mut subterms = Vec::new();
780 for part in parts {
781 let pattern = FsCassWildcardPattern::parse(&part);
782 let pattern_str = match &pattern {
783 FsCassWildcardPattern::Exact(_) => "exact",
784 FsCassWildcardPattern::Prefix(_) => "prefix (*)",
785 FsCassWildcardPattern::Suffix(_) => "suffix (*)",
786 FsCassWildcardPattern::Substring(_) => "substring (*)",
787 FsCassWildcardPattern::Complex(_) => "complex (*)",
788 };
789 subterms.push(ParsedSubTerm {
790 text: part,
791 pattern: pattern_str.to_string(),
792 });
793 }
794 parsed.terms.push(ParsedTerm {
795 text: t.clone(),
796 negated: next_negated,
797 subterms,
798 });
799 next_negated = false;
800 }
801 FsCassQueryToken::Phrase(p) => {
802 let parts: Vec<String> = nfc_sanitize_query(p)
803 .split_whitespace()
804 .map(|s| s.trim_matches('*').to_lowercase())
805 .filter(|s| !s.is_empty())
806 .collect();
807 if !parts.is_empty() {
808 parsed.phrases.push(parts.join(" "));
809 }
810 next_negated = false;
811 }
812 FsCassQueryToken::And => {
813 parsed.operators.push("AND".to_string());
814 has_explicit_operator = true;
815 }
816 FsCassQueryToken::Or => {
817 parsed.operators.push("OR".to_string());
818 has_explicit_operator = true;
819 }
820 FsCassQueryToken::Not => {
821 parsed.operators.push("NOT".to_string());
822 has_explicit_operator = true;
823 next_negated = true;
824 }
825 }
826 }
827
828 parsed.implicit_and = !has_explicit_operator && parsed.terms.len() > 1;
830
831 let query_type = Self::classify_query(&parsed, filters, &sanitized);
833
834 let index_strategy = Self::determine_strategy(&parsed, &sanitized);
836
837 let estimated_cost = Self::estimate_cost(&parsed, &index_strategy, filters);
839
840 let filters_summary = Self::summarize_filters(filters);
842
843 let warnings = Self::generate_warnings(&parsed, &sanitized, filters);
845
846 Self {
847 original_query: query.to_string(),
848 sanitized_query: sanitized,
849 parsed,
850 query_type,
851 index_strategy,
852 wildcard_applied: false, estimated_cost,
854 filters_summary,
855 warnings,
856 }
857 }
858
859 fn classify_query(parsed: &ParsedQuery, filters: &SearchFilters, sanitized: &str) -> QueryType {
860 if sanitized.trim().is_empty() {
861 return QueryType::Empty;
862 }
863
864 let has_filters = !filters.agents.is_empty()
866 || !filters.workspaces.is_empty()
867 || filters.created_from.is_some()
868 || filters.created_to.is_some()
869 || !filters.source_filter.is_all();
870
871 if has_filters {
872 return QueryType::Filtered;
873 }
874
875 if !parsed.operators.is_empty() {
877 return QueryType::Boolean;
878 }
879
880 if !parsed.phrases.is_empty() {
882 return QueryType::Phrase;
883 }
884
885 let has_wildcards = parsed
887 .terms
888 .iter()
889 .flat_map(|t| &t.subterms)
890 .any(|t| t.pattern != "exact");
891 if has_wildcards {
892 return QueryType::Wildcard;
893 }
894
895 QueryType::Simple
896 }
897
898 fn determine_strategy(parsed: &ParsedQuery, sanitized: &str) -> IndexStrategy {
899 if sanitized.trim().is_empty() {
900 return IndexStrategy::FullScan;
901 }
902
903 let has_leading_wildcard = parsed
905 .terms
906 .iter()
907 .flat_map(|t| &t.subterms)
908 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
909
910 if has_leading_wildcard {
911 return IndexStrategy::RegexScan;
912 }
913
914 let has_compound_terms = parsed.terms.iter().any(|t| t.subterms.len() > 1);
917
918 if !parsed.operators.is_empty()
919 || parsed.terms.len() > 1
920 || !parsed.phrases.is_empty()
921 || has_compound_terms
922 {
923 return IndexStrategy::BooleanCombination;
924 }
925
926 IndexStrategy::EdgeNgram
928 }
929
930 fn estimate_cost(
931 parsed: &ParsedQuery,
932 strategy: &IndexStrategy,
933 filters: &SearchFilters,
934 ) -> QueryCost {
935 if matches!(strategy, IndexStrategy::RegexScan) {
937 return QueryCost::High;
938 }
939
940 if matches!(strategy, IndexStrategy::FullScan) {
942 return QueryCost::High;
943 }
944
945 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
947
948 let term_count: usize = parsed.terms.iter().map(|t| t.subterms.len()).sum();
950 let operator_count = parsed.operators.len();
951 let phrase_count = parsed.phrases.len();
952
953 let complexity = term_count + operator_count * 2 + phrase_count * 2;
954
955 if complexity > 6 || has_time_filter {
956 QueryCost::High
957 } else if complexity > 2 {
958 QueryCost::Medium
959 } else {
960 QueryCost::Low
961 }
962 }
963
964 fn summarize_filters(filters: &SearchFilters) -> FiltersSummary {
965 let agent_count = filters.agents.len();
966 let workspace_count = filters.workspaces.len();
967 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
968
969 let mut parts = Vec::new();
970 if agent_count > 0 {
971 parts.push(format!(
972 "{} agent{}",
973 agent_count,
974 if agent_count > 1 { "s" } else { "" }
975 ));
976 }
977 if workspace_count > 0 {
978 parts.push(format!(
979 "{} workspace{}",
980 workspace_count,
981 if workspace_count > 1 { "s" } else { "" }
982 ));
983 }
984 if has_time_filter {
985 parts.push("time range".to_string());
986 }
987
988 let description = if parts.is_empty() {
989 None
990 } else {
991 Some(format!("Filtering by: {}", parts.join(", ")))
992 };
993
994 FiltersSummary {
995 agent_count,
996 workspace_count,
997 has_time_filter,
998 description,
999 }
1000 }
1001
1002 fn generate_warnings(
1003 parsed: &ParsedQuery,
1004 sanitized: &str,
1005 filters: &SearchFilters,
1006 ) -> Vec<String> {
1007 let mut warnings = Vec::new();
1008
1009 let has_leading_wildcard = parsed
1011 .terms
1012 .iter()
1013 .flat_map(|t| &t.subterms)
1014 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
1015 if has_leading_wildcard {
1016 warnings.push(
1017 "Leading wildcards (*foo) require regex scan and may be slow on large indexes"
1018 .to_string(),
1019 );
1020 }
1021
1022 for term in &parsed.terms {
1024 for sub in &term.subterms {
1025 if sub.text.trim_matches('*').len() < 2 {
1026 warnings.push(format!(
1027 "Very short term '{}' may match many documents",
1028 sub.text
1029 ));
1030 }
1031 }
1032 }
1033
1034 if sanitized.trim().is_empty() {
1036 warnings.push("Empty query will return all documents (expensive)".to_string());
1037 }
1038
1039 if parsed.operators.len() > 3 {
1041 warnings.push("Complex boolean query may have unexpected precedence".to_string());
1042 }
1043
1044 if let Some(agent) = filters.agents.iter().next()
1046 && filters.agents.len() == 1
1047 && filters.workspaces.is_empty()
1048 {
1049 warnings.push(format!(
1050 "Searching only in agent '{}' - results from other agents will be excluded",
1051 agent
1052 ));
1053 }
1054
1055 warnings
1056 }
1057
1058 pub fn with_wildcard_fallback(mut self, applied: bool) -> Self {
1060 self.wildcard_applied = applied;
1061 if applied
1062 && !self
1063 .warnings
1064 .iter()
1065 .any(|w| w.contains("wildcard fallback"))
1066 {
1067 self.warnings.push(
1068 "Wildcard fallback was applied automatically due to sparse exact matches"
1069 .to_string(),
1070 );
1071 }
1072 self
1073 }
1074}
1075
/// How a hit matched the query. Serialized in `snake_case`; `Exact` is the
/// default. Each variant carries a ranking factor, see `quality_factor`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum MatchType {
    /// Exact term match (default).
    #[default]
    Exact,
    /// Prefix wildcard match.
    Prefix,
    /// Suffix wildcard match.
    Suffix,
    /// Substring wildcard match.
    Substring,
    /// Other explicit wildcard match.
    Wildcard,
    /// Wildcard match that was not written explicitly in the query.
    ImplicitWildcard,
}
1095
1096impl MatchType {
1097 pub fn quality_factor(self) -> f32 {
1099 match self {
1100 MatchType::Exact => 1.0,
1101 MatchType::Prefix => 0.9,
1102 MatchType::Suffix => 0.8,
1103 MatchType::Substring => 0.7,
1104 MatchType::Wildcard => 0.65,
1105 MatchType::ImplicitWildcard => 0.6,
1106 }
1107 }
1108}
1109
/// Category of a query suggestion. Serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SuggestionKind {
    /// A corrected spelling of the query.
    SpellingFix,
    /// The query wrapped in wildcards.
    WildcardQuery,
    /// Dropping an active filter.
    RemoveFilter,
    /// Searching a different agent.
    AlternateAgent,
    /// Widening the date range; not constructed by the code visible in this
    /// section.
    BroaderDateRange,
}
1125
/// A suggestion offered to the user for changing the query or its filters.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QuerySuggestion {
    /// Category of the suggestion.
    pub kind: SuggestionKind,
    /// Human-readable text describing the suggestion.
    pub message: String,
    /// Replacement query text, when the suggestion changes the query.
    pub suggested_query: Option<String>,
    /// Replacement filters, when the suggestion changes the filters.
    pub suggested_filters: Option<SearchFilters>,
    /// Optional shortcut key assigned via `with_shortcut`.
    pub shortcut: Option<u8>,
}
1140
1141impl QuerySuggestion {
1142 fn spelling(_query: &str, corrected: &str) -> Self {
1143 Self {
1144 kind: SuggestionKind::SpellingFix,
1145 message: format!("Did you mean: \"{corrected}\"?"),
1146 suggested_query: Some(corrected.to_string()),
1147 suggested_filters: None,
1148 shortcut: None,
1149 }
1150 }
1151
1152 fn wildcard(query: &str) -> Self {
1153 let wildcard_query = format!("*{}*", query.trim_matches('*'));
1154 Self {
1155 kind: SuggestionKind::WildcardQuery,
1156 message: format!("Try broader search: \"{wildcard_query}\""),
1157 suggested_query: Some(wildcard_query),
1158 suggested_filters: None,
1159 shortcut: None,
1160 }
1161 }
1162
1163 fn remove_agent_filter(current_agent: &str, current_filters: &SearchFilters) -> Self {
1164 let mut filters = current_filters.clone();
1167 filters.agents.clear();
1168 Self {
1169 kind: SuggestionKind::RemoveFilter,
1170 message: format!("Remove agent filter (currently: {current_agent})"),
1171 suggested_query: None,
1172 suggested_filters: Some(filters),
1173 shortcut: None,
1174 }
1175 }
1176
1177 fn try_agent(agent_slug: &str) -> Self {
1178 let mut filters = SearchFilters::default();
1179 filters.agents.insert(agent_slug.to_string());
1180 Self {
1181 kind: SuggestionKind::AlternateAgent,
1182 message: format!("Try searching in: {agent_slug}"),
1183 suggested_query: None,
1184 suggested_filters: Some(filters),
1185 shortcut: None,
1186 }
1187 }
1188
1189 fn with_shortcut(mut self, key: u8) -> Self {
1190 self.shortcut = Some(key);
1191 self
1192 }
1193}
1194
/// Compact description of which hit fields a caller wants populated, plus an
/// optional preview length for content.
#[derive(Debug, Clone, Copy)]
pub struct FieldMask {
    /// Bit set of the `CONTENT`, `SNIPPET`, `TITLE` and `CACHE` flags.
    flags: u8,
    /// Optional character limit for content previews.
    preview_content_chars: Option<usize>,
}

impl FieldMask {
    const CONTENT: u8 = 0b0001;
    const SNIPPET: u8 = 0b0010;
    const TITLE: u8 = 0b0100;
    const CACHE: u8 = 0b1000;

    /// Mask with every flag set and no preview limit.
    pub const FULL: Self = Self {
        flags: Self::CONTENT | Self::SNIPPET | Self::TITLE | Self::CACHE,
        preview_content_chars: None,
    };

    /// Builds a mask from individual switches, with no preview limit.
    pub fn new(
        wants_content: bool,
        wants_snippet: bool,
        wants_title: bool,
        allows_cache: bool,
    ) -> Self {
        let switches = [
            (wants_content, Self::CONTENT),
            (wants_snippet, Self::SNIPPET),
            (wants_title, Self::TITLE),
            (allows_cache, Self::CACHE),
        ];
        let flags = switches
            .into_iter()
            .filter(|(enabled, _)| *enabled)
            .fold(0u8, |acc, (_, bit)| acc | bit);
        Self {
            flags,
            preview_content_chars: None,
        }
    }

    /// Sets the preview limit. Supplying `Some(_)` also clears the cache
    /// flag; supplying `None` leaves the flags untouched (it does not turn
    /// caching back on).
    pub fn with_preview_content_limit(self, max_chars: Option<usize>) -> Self {
        let flags = if max_chars.is_some() {
            self.flags & !Self::CACHE
        } else {
            self.flags
        };
        Self {
            flags,
            preview_content_chars: max_chars,
        }
    }

    /// Whether the content flag is set.
    pub fn needs_content(self) -> bool {
        self.has(Self::CONTENT)
    }

    /// Whether the snippet flag is set.
    pub fn wants_snippet(self) -> bool {
        self.has(Self::SNIPPET)
    }

    /// Whether the title flag is set.
    pub fn wants_title(self) -> bool {
        self.has(Self::TITLE)
    }

    /// Whether the cache flag is set.
    pub fn allows_cache(self) -> bool {
        self.has(Self::CACHE)
    }

    /// The preview character limit, if any.
    pub fn preview_content_limit(self) -> Option<usize> {
        self.preview_content_chars
    }

    /// True when `bit` is set in the flag byte.
    fn has(self, bit: u8) -> bool {
        (self.flags & bit) != 0
    }
}
1265
/// One search result as returned to callers and serialized for output.
// NOTE(review): the `#[serde(default ...)]` attributes only affect
// deserialization, which this type does not derive here.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchHit {
    /// Title associated with the hit.
    pub title: String,
    /// Snippet text for the hit.
    pub snippet: String,
    /// Content text for the hit.
    pub content: String,
    /// Hash of the content; never serialized.
    #[serde(skip_serializing)]
    pub content_hash: u64,
    /// Database conversation id, when known; never serialized.
    #[serde(skip_serializing)]
    pub conversation_id: Option<i64>,
    /// Relevance score.
    pub score: f32,
    /// Path of the source the hit came from.
    pub source_path: String,
    /// Agent the hit belongs to.
    pub agent: String,
    /// Workspace the hit belongs to.
    pub workspace: String,
    /// Original workspace value; omitted from output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workspace_original: Option<String>,
    /// Creation timestamp, when known.
    pub created_at: Option<i64>,
    /// Line number within the source, when known.
    pub line_number: Option<usize>,
    /// How the hit matched the query.
    #[serde(default)]
    pub match_type: MatchType,
    /// Identifier of the source; the serde default is `default_source_id()`.
    #[serde(default = "default_source_id")]
    pub source_id: String,
    /// Kind of origin; shares the `default_source_id()` serde default.
    #[serde(default = "default_source_id")]
    pub origin_kind: String,
    /// Origin host; omitted from output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub origin_host: Option<String>,
}
1299
1300static LAZY_FIELDS_ENABLED: Lazy<bool> = Lazy::new(|| {
1301 dotenvy::var("CASS_LAZY_FIELDS")
1302 .ok()
1303 .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
1304 .unwrap_or(true)
1305});
1306
/// Serde default for source fields: the string `"local"`.
fn default_source_id() -> String {
    String::from("local")
}
1310
1311fn effective_field_mask(field_mask: FieldMask) -> FieldMask {
1312 if *LAZY_FIELDS_ENABLED {
1313 field_mask
1314 } else {
1315 FieldMask::FULL
1316 }
1317}
1318
1319fn execute_query_with_lazy_exact_count(
1320 searcher: &Searcher,
1321 query: &dyn Query,
1322 limit: usize,
1323 offset: usize,
1324) -> Result<FsLexicalSearchResult> {
1325 let top_docs = searcher.search(
1326 query,
1327 &TopDocs::with_limit(limit)
1328 .and_offset(offset)
1329 .order_by_score(),
1330 )?;
1331 let page_saturated = top_docs.len() == limit;
1332 let total_count = if page_saturated {
1333 searcher.search(query, &Count)?
1334 } else {
1335 offset.saturating_add(top_docs.len())
1336 };
1337 let hits = top_docs
1338 .into_iter()
1339 .enumerate()
1340 .map(|(rank, (bm25_score, doc_address))| FsLexicalDocHit {
1341 bm25_score,
1342 rank,
1343 doc_address,
1344 })
1345 .collect();
1346
1347 Ok(FsLexicalSearchResult { hits, total_count })
1348}
1349
/// Outcome of a search: the hits plus metadata about how they were produced.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The hits for this page.
    pub hits: Vec<SearchHit>,
    /// Whether wildcard fallback was used.
    pub wildcard_fallback: bool,
    /// Cache statistics for this search.
    pub cache_stats: CacheStats,
    /// Suggestions for refining the query.
    pub suggestions: Vec<QuerySuggestion>,
    /// ANN search statistics, when available.
    pub ann_stats: Option<crate::search::ann_index::AnnSearchStats>,
    /// Total number of matches, when known.
    pub total_count: Option<usize>,
}
1370
/// Labels the phase a `ProgressiveSearchEvent::Phase` result belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProgressivePhaseKind {
    /// NOTE(review): presumably the first, fast result set — confirm at the emitter.
    Initial,
    /// NOTE(review): presumably the later, refined result set — confirm at the emitter.
    Refined,
}
1376
/// Event emitted by a progressive search: either a phase result or a refinement failure.
// `Phase` stores a whole `SearchResult` inline; presumably that is what the lint allowance
// below is for.
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone)]
pub enum ProgressiveSearchEvent {
    /// A result set for one phase.
    Phase {
        /// Which phase produced `result`.
        kind: ProgressivePhaseKind,
        /// The search result for this phase.
        result: SearchResult,
        /// Elapsed time in milliseconds (reference point not visible here — confirm at the emitter).
        elapsed_ms: u128,
    },
    /// Refinement did not complete.
    RefinementFailed {
        /// Latency in milliseconds reported with the failure.
        latency_ms: u128,
        /// Error description as text.
        error: String,
    },
}
1392
/// Parameters for one progressive search call; borrows the context and query text for `'a`.
#[derive(Debug, Clone)]
pub(crate) struct ProgressiveSearchRequest<'a> {
    /// Search context handle (`FsCx`) passed through to the search.
    pub(crate) cx: &'a FsCx,
    /// Query text.
    pub(crate) query: &'a str,
    /// Filters applied to the search.
    pub(crate) filters: SearchFilters,
    /// Maximum number of hits requested.
    pub(crate) limit: usize,
    /// NOTE(review): presumably the hit count below which a fallback is attempted — confirm at the consumer.
    pub(crate) sparse_threshold: usize,
    /// Which hit fields the caller wants populated.
    pub(crate) field_mask: FieldMask,
    /// Requested search mode.
    pub(crate) mode: SearchMode,
}
1403
/// Identity key for a `SearchHit`, built by `SearchHitKey::from_hit` and used for ordering
/// tie-breaks and for deriving fusion document ids.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SearchHitKey {
    /// Normalized source id (see `normalized_search_hit_source_id`).
    source_id: String,
    /// Source path copied from the hit.
    source_path: String,
    /// Conversation id copied from the hit.
    conversation_id: Option<i64>,
    /// Trimmed title; left empty by `from_hit` whenever `conversation_id` is present.
    title: String,
    /// Line number copied from the hit.
    line_number: Option<usize>,
    /// Creation timestamp copied from the hit.
    created_at: Option<i64>,
    /// Content hash copied from the hit.
    content_hash: u64,
}
1414
1415fn normalized_search_source_id_sql_expr(
1416 source_id_column: &str,
1417 origin_kind_column: &str,
1418 origin_host_column: &str,
1419) -> String {
1420 format!(
1421 "CASE \
1422 WHEN TRIM(COALESCE({source_id_column}, '')) != '' THEN \
1423 CASE \
1424 WHEN LOWER(TRIM(COALESCE({source_id_column}, ''))) = '{local}' THEN '{local}' \
1425 ELSE TRIM(COALESCE({source_id_column}, '')) \
1426 END \
1427 WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) IN ('ssh', 'remote') THEN \
1428 CASE \
1429 WHEN TRIM(COALESCE({origin_host_column}, '')) = '' THEN 'remote' \
1430 ELSE TRIM(COALESCE({origin_host_column}, '')) \
1431 END \
1432 WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) = '{local}' THEN '{local}' \
1433 WHEN TRIM(COALESCE({origin_host_column}, '')) != '' THEN TRIM(COALESCE({origin_host_column}, '')) \
1434 ELSE '{local}' \
1435 END",
1436 local = crate::sources::provenance::LOCAL_SOURCE_ID,
1437 )
1438}
1439
1440fn normalize_search_source_filter_value(source_id: &str) -> String {
1441 let trimmed = source_id.trim();
1442 if trimmed.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1443 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1444 } else {
1445 trimmed.to_string()
1446 }
1447}
1448
1449fn normalized_search_hit_source_id_parts(
1450 source_id: &str,
1451 origin_kind: &str,
1452 origin_host: Option<&str>,
1453) -> String {
1454 let trimmed_source_id = source_id.trim();
1455 if !trimmed_source_id.is_empty() {
1456 if trimmed_source_id.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1457 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1458 }
1459 return trimmed_source_id.to_string();
1460 }
1461
1462 let trimmed_origin_host = origin_host.map(str::trim).filter(|value| !value.is_empty());
1463 let trimmed_origin_kind = origin_kind.trim();
1464 if trimmed_origin_kind.eq_ignore_ascii_case("ssh")
1465 || trimmed_origin_kind.eq_ignore_ascii_case("remote")
1466 {
1467 return trimmed_origin_host.unwrap_or("remote").to_string();
1468 }
1469 if let Some(origin_host) = trimmed_origin_host {
1470 return origin_host.to_string();
1471 }
1472
1473 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1474}
1475
1476fn normalized_search_hit_origin_kind(source_id: &str, origin_kind: Option<&str>) -> String {
1477 if let Some(kind) = origin_kind.map(str::trim).filter(|value| !value.is_empty()) {
1478 if kind.eq_ignore_ascii_case("local") {
1479 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1480 }
1481 if kind.eq_ignore_ascii_case("ssh") || kind.eq_ignore_ascii_case("remote") {
1482 return "remote".to_string();
1483 }
1484 return kind.to_ascii_lowercase();
1485 }
1486
1487 if source_id == crate::sources::provenance::LOCAL_SOURCE_ID {
1488 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1489 } else {
1490 "remote".to_string()
1491 }
1492}
1493
1494fn normalized_search_hit_source_id(hit: &SearchHit) -> String {
1495 normalized_search_hit_source_id_parts(
1496 hit.source_id.as_str(),
1497 hit.origin_kind.as_str(),
1498 hit.origin_host.as_deref(),
1499 )
1500}
1501
1502impl SearchHitKey {
1503 fn from_hit(hit: &SearchHit) -> Self {
1504 Self {
1505 source_id: normalized_search_hit_source_id(hit),
1506 source_path: hit.source_path.clone(),
1507 conversation_id: hit.conversation_id,
1508 title: if hit.conversation_id.is_some() {
1509 String::new()
1510 } else {
1511 hit.title.trim().to_string()
1512 },
1513 line_number: hit.line_number,
1514 created_at: hit.created_at,
1515 content_hash: hit.content_hash,
1516 }
1517 }
1518}
1519
1520impl Ord for SearchHitKey {
1521 fn cmp(&self, other: &Self) -> CmpOrdering {
1522 self.source_id
1523 .cmp(&other.source_id)
1524 .then_with(|| self.source_path.cmp(&other.source_path))
1525 .then_with(|| self.conversation_id.cmp(&other.conversation_id))
1526 .then_with(|| self.title.cmp(&other.title))
1527 .then_with(|| self.line_number.cmp(&other.line_number))
1528 .then_with(|| self.created_at.cmp(&other.created_at))
1529 .then_with(|| self.content_hash.cmp(&other.content_hash))
1530 }
1531}
1532
1533impl PartialOrd for SearchHitKey {
1534 fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
1535 Some(self.cmp(other))
1536 }
1537}
1538
/// Constant added to the shard rank in the denominator of `federated_rrf_score`.
const FEDERATED_RRF_K: f32 = 60.0;
1540
/// A hit tagged with its shard position and fused score, as consumed by
/// `merge_federated_ranked_hits`.
#[derive(Debug)]
struct FederatedRankedHit {
    /// The hit itself.
    hit: SearchHit,
    /// Index of the shard the hit came from; last tie-breaker when merging.
    shard_index: usize,
    /// Rank of the hit within its shard; second tie-breaker when merging.
    shard_rank: usize,
    /// Fused score; primary (descending) sort key, and copied into `hit.score` on merge.
    fused_score: f32,
}
1548
1549fn federated_rrf_score(shard_rank: usize) -> f32 {
1550 1.0 / (FEDERATED_RRF_K + shard_rank as f32 + 1.0)
1551}
1552
1553fn merge_federated_ranked_hits(mut ranked_hits: Vec<FederatedRankedHit>) -> Vec<SearchHit> {
1554 ranked_hits.sort_by(|a, b| {
1555 b.fused_score
1556 .total_cmp(&a.fused_score)
1557 .then_with(|| a.shard_rank.cmp(&b.shard_rank))
1558 .then_with(|| SearchHitKey::from_hit(&a.hit).cmp(&SearchHitKey::from_hit(&b.hit)))
1559 .then_with(|| a.shard_index.cmp(&b.shard_index))
1560 });
1561 ranked_hits
1562 .into_iter()
1563 .map(|mut ranked| {
1564 ranked.hit.score = ranked.fused_score;
1565 ranked.hit
1566 })
1567 .collect()
1568}
1569
/// Test-only score record for a fused hit; compared by `cmp_fused_hit_desc`.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Default, Clone)]
struct HybridScore {
    /// Fused score; primary (descending) sort key.
    rrf: f32,
    /// Rank in the lexical list, if the hit appeared there.
    lexical_rank: Option<usize>,
    /// Rank in the semantic list, if the hit appeared there.
    semantic_rank: Option<usize>,
    /// Lexical score, if any.
    lexical_score: Option<f32>,
    /// Semantic score, if any.
    semantic_score: Option<f32>,
}
1580
/// Test-only pairing of a hit with its identity key and hybrid score.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct FusedHit {
    /// Identity key; final tie-breaker in `cmp_fused_hit_desc`.
    key: SearchHitKey,
    /// Scores used for ordering.
    score: HybridScore,
    /// The underlying hit.
    hit: SearchHit,
}
1589
1590pub(crate) fn stable_content_hash(content: &str) -> u64 {
1600 use xxhash_rust::xxh3::Xxh3;
1601 let mut hasher = Xxh3::new();
1602 let mut first = true;
1603 for token in content.split_whitespace() {
1604 if !first {
1605 hasher.update(b" ");
1606 }
1607 hasher.update(token.as_bytes());
1608 first = false;
1609 }
1610 hasher.digest()
1611}
1612
1613fn stable_hit_hash(
1614 content: &str,
1615 source_path: &str,
1616 line_number: Option<usize>,
1617 created_at: Option<i64>,
1618) -> u64 {
1619 use xxhash_rust::xxh3::Xxh3;
1620 let mut hasher = Xxh3::new();
1621 if !content.is_empty() {
1624 hasher.update(&stable_content_hash(content).to_le_bytes());
1625 }
1626 hasher.update(b"|");
1627 hasher.update(source_path.as_bytes());
1628 hasher.update(b"|");
1629 if let Some(line) = line_number {
1630 let mut buf = itoa::Buffer::new();
1631 hasher.update(buf.format(line).as_bytes());
1632 }
1633 hasher.update(b"|");
1634 if let Some(ts) = created_at {
1635 let mut buf = itoa::Buffer::new();
1636 hasher.update(buf.format(ts).as_bytes());
1637 }
1638 hasher.digest()
1639}
1640
1641fn search_hit_key_doc_id(key: &SearchHitKey) -> String {
1642 use std::fmt::Write as _;
1650 const SEP: char = '\u{1f}';
1651 let capacity = key.source_id.len()
1653 + key.source_path.len()
1654 + key.title.len()
1655 + 6 + 3 * 20 + 20; let mut out = String::with_capacity(capacity);
1659 out.push_str(&key.source_id);
1660 out.push(SEP);
1661 out.push_str(&key.source_path);
1662 out.push(SEP);
1663 if let Some(v) = key.conversation_id {
1664 let _ = write!(out, "{v}");
1665 }
1666 out.push(SEP);
1667 out.push_str(&key.title);
1668 out.push(SEP);
1669 if let Some(v) = key.line_number {
1670 let _ = write!(out, "{v}");
1671 }
1672 out.push(SEP);
1673 if let Some(v) = key.created_at {
1674 let _ = write!(out, "{v}");
1675 }
1676 out.push(SEP);
1677 let _ = write!(out, "{}", key.content_hash);
1678 out
1679}
1680
1681fn search_hit_doc_id(hit: &SearchHit) -> String {
1682 search_hit_key_doc_id(&SearchHitKey::from_hit(hit))
1683}
1684
/// Test-only comparator that orders fused hits best-first.
///
/// Keys, in order: RRF score descending; hits ranked by both the lexical and the semantic
/// list before hits ranked by only one; identity key ascending.
#[cfg(test)]
fn cmp_fused_hit_desc(a: &FusedHit, b: &FusedHit) -> CmpOrdering {
    fn ranked_by_both(score: &HybridScore) -> bool {
        score.lexical_rank.is_some() && score.semantic_rank.is_some()
    }

    let by_rrf = b.score.rrf.total_cmp(&a.score.rrf);
    by_rrf
        // `true > false`, so comparing b against a puts dual-ranked hits first.
        .then_with(|| ranked_by_both(&b.score).cmp(&ranked_by_both(&a.score)))
        .then_with(|| a.key.cmp(&b.key))
}
1702
/// Test-only: minimum input length at which `top_k_fused` partitions with
/// `select_nth_unstable_by` before sorting; shorter inputs are fully sorted instead.
#[cfg(test)]
#[allow(dead_code)]
const QUICKSELECT_THRESHOLD: usize = 64;
1707
/// Test-only: returns the best `k` fused hits, ordered by `cmp_fused_hit_desc`.
///
/// Returns an empty vector when there are no hits or `k == 0`. When fewer than all hits are
/// requested and the input has at least `QUICKSELECT_THRESHOLD` entries, the input is first
/// partitioned around the k-th best element and truncated, so only `k` elements are sorted.
#[cfg(test)]
#[allow(dead_code)]
fn top_k_fused(mut hits: Vec<FusedHit>, k: usize) -> Vec<FusedHit> {
    let total = hits.len();
    if total == 0 || k == 0 {
        return Vec::new();
    }

    let keep = k.min(total);
    let needs_selection = keep < total && total >= QUICKSELECT_THRESHOLD;
    if needs_selection {
        // After this call, indices `..keep` hold the best `keep` hits in arbitrary order.
        hits.select_nth_unstable_by(keep - 1, cmp_fused_hit_desc);
        hits.truncate(keep);
    }

    hits.sort_by(cmp_fused_hit_desc);
    hits.truncate(keep);
    hits
}
1748
/// Fuses lexical and semantic hits with reciprocal-rank fusion and returns one page.
///
/// Steps:
/// 1. Every hit gets a document id from `search_hit_doc_id`; both lists are handed to
///    `fs_rrf_fuse` with the default `FsRrfConfig`.
/// 2. Fused results are mapped back to hits; hits for which `hit_is_noise(hit, query)` is
///    true are dropped.
/// 3. Remaining hits are de-duplicated (exact key first, then a title-based fallback key
///    that tolerates a missing conversation id); merged duplicates have their fused scores
///    added together.
/// 4. The unique hits are sorted by score descending with `SearchHitKey` as tie-breaker,
///    then `offset` hits are skipped and at most `limit` are returned.
///
/// Returns an empty vector when `limit == 0` or both input lists are empty. Each returned
/// hit's `score` is its (possibly accumulated) fused RRF score.
pub fn rrf_fuse_hits(
    lexical: &[SearchHit],
    semantic: &[SearchHit],
    query: &str,
    limit: usize,
    offset: usize,
) -> Vec<SearchHit> {
    if limit == 0 {
        return Vec::new();
    }
    let total_candidates = lexical.len().saturating_add(semantic.len());
    if total_candidates == 0 {
        return Vec::new();
    }

    let mut lexical_scored = Vec::with_capacity(lexical.len());
    let mut semantic_scored = Vec::with_capacity(semantic.len());
    // Maps each document id back to the hit that should represent it after fusion.
    let mut hit_by_doc_id: HashMap<String, SearchHit> = HashMap::with_capacity(total_candidates);

    for hit in lexical {
        let doc_id = search_hit_doc_id(hit);
        // A later lexical hit with the same doc id replaces the earlier one.
        hit_by_doc_id.insert(doc_id.clone(), hit.clone());
        lexical_scored.push(FsScoredResult {
            doc_id,
            score: hit.score,
            source: FsScoreSource::Lexical,
            index: None,
            fast_score: None,
            quality_score: None,
            lexical_score: Some(hit.score),
            rerank_score: None,
            explanation: None,
            metadata: None,
        });
    }

    for (idx, hit) in semantic.iter().enumerate() {
        let doc_id = search_hit_doc_id(hit);
        // Keep the lexical representation when one already exists for this doc id.
        hit_by_doc_id
            .entry(doc_id.clone())
            .or_insert_with(|| hit.clone());
        semantic_scored.push(FsVectorHit {
            // Positions that do not fit in `u32` saturate to `u32::MAX`.
            index: u32::try_from(idx).unwrap_or(u32::MAX),
            score: hit.score,
            doc_id,
        });
    }

    // NOTE(review): the third and fourth arguments are presumably limit and offset, i.e. the
    // full fused list is requested here and paging is applied at the end of this function —
    // confirm against `rrf_fuse`'s signature.
    let fused = fs_rrf_fuse(
        &lexical_scored,
        &semantic_scored,
        total_candidates,
        0,
        &FsRrfConfig::default(),
    );

    // State kept per fallback key while de-duplicating.
    #[derive(Clone, Copy)]
    struct CompatSlot {
        // Index into `unique_hits` of the hit registered under the fallback key.
        index: usize,
        // Conversation id seen for this fallback key so far, if any.
        conversation_id: Option<i64>,
        // Set once two different conversation ids have been seen; disables fallback merging.
        ambiguous: bool,
    }

    // Intern source ids, paths and titles as small integers so the dedupe keys below are
    // plain tuples of integers.
    let mut source_ids: HashMap<String, u32> = HashMap::new();
    let mut path_ids: HashMap<String, u32> = HashMap::new();
    let mut title_ids: HashMap<String, u32> = HashMap::new();
    let mut next_source_id: u32 = 0;
    let mut next_path_id: u32 = 0;
    let mut next_title_id: u32 = 0;
    // (source, path, conversation id, title if no conversation id, line, created_at, content hash)
    type CompatExactKey = (
        u32,
        u32,
        Option<i64>,
        Option<u32>,
        Option<usize>,
        Option<i64>,
        u64,
    );
    // (source, path, title, line, created_at, content hash) — conversation id is ignored.
    type CompatFallbackKey = (u32, u32, u32, Option<usize>, Option<i64>, u64);

    let mut exact_seen: HashMap<CompatExactKey, usize> = HashMap::with_capacity(fused.len());
    let mut fallback_seen: HashMap<CompatFallbackKey, CompatSlot> =
        HashMap::with_capacity(fused.len());
    let mut unique_hits: Vec<SearchHit> = Vec::with_capacity(fused.len());

    // Records a newly observed conversation id on a slot, flagging the slot as ambiguous
    // when it conflicts with the id already recorded.
    let update_slot = |slot: &mut CompatSlot, conversation_id: Option<i64>| {
        if slot.ambiguous {
            return;
        }
        match (slot.conversation_id, conversation_id) {
            (Some(existing), Some(current)) if existing != current => slot.ambiguous = true,
            (None, Some(current)) => slot.conversation_id = Some(current),
            _ => {}
        }
    };

    for fused_hit in fused {
        // `remove` hands out each stored hit at most once.
        let mut hit = match hit_by_doc_id.remove(&fused_hit.doc_id) {
            Some(hit) => hit,
            None => continue,
        };
        if hit_is_noise(&hit, query) {
            continue;
        }

        let normalized_source_id = normalized_search_hit_source_id(&hit);
        let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
            *id
        } else {
            let id = next_source_id;
            next_source_id = next_source_id.saturating_add(1);
            source_ids.insert(normalized_source_id, id);
            id
        };
        let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
            *id
        } else {
            let id = next_path_id;
            next_path_id = next_path_id.saturating_add(1);
            path_ids.insert(hit.source_path.clone(), id);
            id
        };
        let normalized_title = hit.title.trim();
        let fallback_title_key = if let Some(id) = title_ids.get(normalized_title) {
            *id
        } else {
            let id = next_title_id;
            next_title_id = next_title_id.saturating_add(1);
            title_ids.insert(normalized_title.to_string(), id);
            id
        };
        // Mirrors `SearchHitKey::from_hit`: the title only takes part in the exact key when
        // the hit has no conversation id.
        let exact_title_key = if hit.conversation_id.is_some() {
            None
        } else {
            Some(fallback_title_key)
        };
        let exact_key = (
            source_key,
            path_key,
            hit.conversation_id,
            exact_title_key,
            hit.line_number,
            hit.created_at,
            hit.content_hash,
        );
        let fallback_key = (
            source_key,
            path_key,
            fallback_title_key,
            hit.line_number,
            hit.created_at,
            hit.content_hash,
        );

        // Prefer an exact match; otherwise fall back to the title-based slot, unless that
        // slot is ambiguous or carries a different conversation id.
        let merged_idx = exact_seen.get(&exact_key).copied().or_else(|| {
            fallback_seen.get(&fallback_key).and_then(|slot| {
                if slot.ambiguous {
                    return None;
                }
                match (slot.conversation_id, hit.conversation_id) {
                    (Some(existing), Some(current)) if existing != current => None,
                    _ => Some(slot.index),
                }
            })
        });

        if let Some(existing_idx) = merged_idx {
            // Duplicate: fold this hit into the one already kept.
            exact_seen.insert(exact_key, existing_idx);
            let slot = fallback_seen.entry(fallback_key).or_insert(CompatSlot {
                index: existing_idx,
                conversation_id: hit.conversation_id,
                ambiguous: false,
            });
            update_slot(slot, hit.conversation_id);
            // Adopt the conversation id if the kept hit did not have one.
            if unique_hits[existing_idx].conversation_id.is_none() && hit.conversation_id.is_some()
            {
                unique_hits[existing_idx].conversation_id = hit.conversation_id;
            }
            unique_hits[existing_idx].score += fused_hit.rrf_score as f32;
            continue;
        }

        // First occurrence: keep the hit with its fused score and register both keys.
        hit.score = fused_hit.rrf_score as f32;
        let index = unique_hits.len();
        unique_hits.push(hit);
        exact_seen.insert(exact_key, index);
        match fallback_seen.get_mut(&fallback_key) {
            Some(slot) => update_slot(slot, unique_hits[index].conversation_id),
            None => {
                fallback_seen.insert(
                    fallback_key,
                    CompatSlot {
                        index,
                        conversation_id: unique_hits[index].conversation_id,
                        ambiguous: false,
                    },
                );
            }
        }
    }

    // Merging may have changed scores, so re-sort before paging.
    unique_hits.sort_by(|a, b| {
        b.score
            .total_cmp(&a.score)
            .then_with(|| SearchHitKey::from_hit(a).cmp(&SearchHitKey::from_hit(b)))
    });

    let start = offset.min(unique_hits.len());
    unique_hits.into_iter().skip(start).take(limit).collect()
}
1968
/// LRU cache of query embeddings that is valid for a single embedder at a time.
struct QueryCache {
    /// Id of the embedder whose embeddings are currently cached.
    embedder_id: String,
    /// Cached embeddings keyed by canonical query text.
    embeddings: LruCache<String, Vec<f32>>,
}
1973
1974impl QueryCache {
1975 fn new(embedder_id: &str, capacity: NonZeroUsize) -> Self {
1976 Self {
1977 embedder_id: embedder_id.to_string(),
1978 embeddings: LruCache::new(capacity),
1979 }
1980 }
1981
1982 fn align_embedder(&mut self, embedder: &dyn Embedder) {
1983 if self.embedder_id != embedder.id() {
1984 self.embedder_id = embedder.id().to_string();
1985 self.embeddings.clear();
1986 }
1987 }
1988
1989 fn get_cached(&mut self, embedder: &dyn Embedder, canonical: &str) -> Option<Vec<f32>> {
1990 self.align_embedder(embedder);
1991 self.embeddings.get(canonical).cloned()
1992 }
1993
1994 fn store(&mut self, embedder: &dyn Embedder, canonical: &str, embedding: Vec<f32>) {
1995 self.align_embedder(embedder);
1996 self.embeddings.put(canonical.to_string(), embedding);
1997 }
1998}
1999
2000fn semantic_filter_as_search_filter(filter: &SemanticFilter) -> Option<&dyn FsSearchFilter> {
2003 let unrestricted = filter.agents.is_none()
2004 && filter.workspaces.is_none()
2005 && filter.sources.is_none()
2006 && filter.roles.is_none()
2007 && filter.created_from.is_none()
2008 && filter.created_to.is_none();
2009 if unrestricted { None } else { Some(filter) }
2010}
2011
2012fn open_fs_semantic_ann_index(fs_index: &FsVectorIndex, ann_path: &Path) -> Result<FsHnswIndex> {
2013 if !ann_path.is_file() {
2014 bail!(
2015 "approximate search unavailable: HNSW index not found at {}",
2016 ann_path.display()
2017 );
2018 }
2019
2020 let ann = FsHnswIndex::load(ann_path, fs_index)
2021 .map_err(|err| anyhow!("open HNSW index failed: {err}"))?;
2022 let matches = ann
2023 .matches_vector_index(fs_index)
2024 .map_err(|err| anyhow!("validate HNSW index failed: {err}"))?;
2025 if !matches {
2026 bail!(
2027 "approximate search unavailable: HNSW index at {} is stale for current semantic index (run 'cass index --semantic --build-hnsw')",
2028 ann_path.display()
2029 );
2030 }
2031
2032 Ok(ann)
2033}
2034
/// Semantic-search state held by `SearchClient` (stored in its `semantic` mutex).
struct SemanticSearchState {
    /// Shared token for this state; NOTE(review): presumably compared to detect a swapped
    /// semantic context — confirm at the use sites.
    context_token: Arc<()>,
    /// Embedder used for queries.
    embedder: Arc<dyn Embedder>,
    /// A single vector index; NOTE(review): relation to `fs_semantic_indexes` (primary vs.
    /// shards?) is not visible here — confirm.
    fs_semantic_index: Arc<FsVectorIndex>,
    /// Shared list of vector indexes.
    fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
    /// HNSW index, when one is loaded.
    fs_ann_index: Option<Arc<FsHnswIndex>>,
    /// Path of the HNSW index file, when known.
    ann_path: Option<PathBuf>,
    /// In-memory two-tier index, when one is loaded.
    fs_in_memory_two_tier_index: Option<Arc<FsInMemoryTwoTierIndex>>,
    /// Per-tier-mode flags recording that the in-memory two-tier index is known to be unavailable.
    in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable,
    /// Progressive two-tier context, when one is available.
    progressive_context: Option<Arc<ProgressiveTwoTierContext>>,
    /// NOTE(review): presumably set after a failed attempt to build `progressive_context` — confirm.
    progressive_context_unavailable: bool,
    /// Lookup maps used when applying semantic filters.
    filter_maps: SemanticFilterMaps,
    /// Optional set of role codes; NOTE(review): presumably a role restriction — confirm.
    roles: Option<HashSet<u8>>,
    /// Cache of query embeddings.
    query_cache: QueryCache,
}
2050
/// Remembers for which tier modes the in-memory two-tier index is known to be unavailable.
#[derive(Debug, Clone, Copy, Default)]
struct InMemoryTwoTierUnavailable {
    /// Set for `SemanticTierMode::FastOnly`.
    fast_only: bool,
    /// Set for `SemanticTierMode::Progressive` and `SemanticTierMode::QualityOnly`.
    quality: bool,
}
2056
2057impl InMemoryTwoTierUnavailable {
2058 fn is_known_unavailable(self, tier_mode: SemanticTierMode) -> bool {
2059 match tier_mode {
2060 SemanticTierMode::Single => false,
2061 SemanticTierMode::FastOnly => self.fast_only,
2062 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => self.quality,
2063 }
2064 }
2065
2066 fn mark_unavailable(&mut self, tier_mode: SemanticTierMode) {
2067 match tier_mode {
2068 SemanticTierMode::Single => {}
2069 SemanticTierMode::FastOnly => {
2070 self.fast_only = true;
2071 }
2072 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => {
2073 self.quality = true;
2074 }
2075 }
2076 }
2077}
2078
/// Resources bundled for progressive two-tier search.
struct ProgressiveTwoTierContext {
    /// Shared token; NOTE(review): presumably ties this context to a `SemanticSearchState` — confirm.
    context_token: Arc<()>,
    /// Two-tier index.
    index: Arc<FsTwoTierIndex>,
    /// Embedder for the fast tier.
    fast_embedder: Arc<dyn frankensearch::Embedder>,
    /// Embedder for the quality tier, when one is present.
    quality_embedder: Option<Arc<dyn frankensearch::Embedder>>,
}
2085
/// Cloneable bundle of index handles and filter data; its fields mirror the same-named
/// fields of `SemanticSearchState`.
#[derive(Clone)]
struct SemanticCandidateContext {
    /// A single vector index.
    fs_semantic_index: Arc<FsVectorIndex>,
    /// Shared list of vector indexes.
    fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
    /// Lookup maps used when applying semantic filters.
    filter_maps: SemanticFilterMaps,
    /// Optional set of role codes.
    roles: Option<HashSet<u8>>,
}
2093
/// Parameters for one semantic candidate search; borrows the optional indexes for `'a`.
struct SemanticCandidateSearchRequest<'a> {
    /// Number of candidates to fetch.
    fetch_limit: usize,
    /// Whether approximate search was requested.
    approximate: bool,
    /// Tier mode to search with.
    tier_mode: SemanticTierMode,
    /// In-memory two-tier index to use, if any.
    in_memory_two_tier_index: Option<&'a Arc<FsInMemoryTwoTierIndex>>,
    /// HNSW index to use, if any.
    ann_index: Option<&'a Arc<FsHnswIndex>>,
}
2101
/// Flags describing whether a semantic candidate search may need to be retried.
/// NOTE(review): the exact retry policy lives at the use sites — the field notes below are
/// inferred from the names only.
#[derive(Debug, Clone, Copy, Default)]
struct SemanticCandidateRetryState {
    /// Presumably: more candidates exist beyond those fetched.
    has_more_candidates: bool,
    /// Presumably: the exact-search window might have left out a competing candidate.
    exact_window_may_omit_competitor: bool,
}
2107
/// A query embedding together with a context token.
struct SemanticQueryEmbedding {
    /// Shared token; NOTE(review): presumably identifies the semantic context the vector was
    /// produced under — confirm.
    context_token: Arc<()>,
    /// The embedding vector.
    vector: Vec<f32>,
}
2112
/// `Embedder` wrapper that memoizes single-text embeddings in an LRU cache.
struct SharedCassSyncEmbedder {
    /// Wrapped embedder that performs the actual work.
    inner: Arc<dyn Embedder>,
    /// Embeddings keyed by the exact input text.
    cache: Mutex<LruCache<String, Vec<f32>>>,
}
2117
2118impl SharedCassSyncEmbedder {
2119 fn new(inner: Arc<dyn Embedder>) -> Self {
2120 let cache_capacity =
2121 NonZeroUsize::new(PROGRESSIVE_EMBEDDING_CACHE_CAPACITY).expect("cache capacity > 0");
2122 Self {
2123 inner,
2124 cache: Mutex::new(LruCache::new(cache_capacity)),
2125 }
2126 }
2127}
2128
2129impl Embedder for SharedCassSyncEmbedder {
2130 fn embed_sync(&self, text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
2131 if let Ok(mut cache) = self.cache.lock()
2132 && let Some(embedding) = cache.get(text).cloned()
2133 {
2134 return Ok(embedding);
2135 }
2136
2137 let embedding = self.inner.embed_sync(text)?;
2138 if let Ok(mut cache) = self.cache.lock() {
2139 cache.put(text.to_owned(), embedding.clone());
2140 }
2141 Ok(embedding)
2142 }
2143
2144 fn embed_batch_sync(
2145 &self,
2146 texts: &[&str],
2147 ) -> crate::search::embedder::EmbedderResult<Vec<Vec<f32>>> {
2148 self.inner.embed_batch_sync(texts)
2149 }
2150
2151 fn dimension(&self) -> usize {
2152 self.inner.dimension()
2153 }
2154
2155 fn id(&self) -> &str {
2156 self.inner.id()
2157 }
2158
2159 fn model_name(&self) -> &str {
2160 self.inner.model_name()
2161 }
2162
2163 fn is_ready(&self) -> bool {
2164 self.inner.is_ready()
2165 }
2166
2167 fn is_semantic(&self) -> bool {
2168 self.inner.is_semantic()
2169 }
2170
2171 fn category(&self) -> frankensearch::ModelCategory {
2172 self.inner.category()
2173 }
2174
2175 fn tier(&self) -> frankensearch::ModelTier {
2176 self.inner.tier()
2177 }
2178
2179 fn supports_mrl(&self) -> bool {
2180 self.inner.supports_mrl()
2181 }
2182}
2183
2184fn build_in_memory_two_tier_index(
2185 ann_path: Option<PathBuf>,
2186 embedder_id: &str,
2187 tier_mode: SemanticTierMode,
2188) -> Option<Arc<FsInMemoryTwoTierIndex>> {
2189 let index_dir = ann_path
2190 .as_ref()
2191 .and_then(|path| path.parent().map(Path::to_path_buf));
2192 let Some(index_dir) = index_dir else {
2193 tracing::debug!("two-tier semantic unavailable: ann/index directory path missing");
2194 return None;
2195 };
2196
2197 match FsInMemoryTwoTierIndex::from_dir(&index_dir) {
2198 Ok(index) => return Some(Arc::new(index)),
2199 Err(err) => {
2200 tracing::debug!(
2201 dir = %index_dir.display(),
2202 error = %err,
2203 "two-tier semantic index load failed; considering fallback"
2204 );
2205 }
2206 }
2207
2208 if !matches!(tier_mode, SemanticTierMode::FastOnly) {
2209 return None;
2210 }
2211
2212 let fallback_fast = index_dir.join(format!("index-{embedder_id}.fsvi"));
2213 if !fallback_fast.is_file() {
2214 return None;
2215 }
2216
2217 match FsInMemoryVectorIndex::from_fsvi(&fallback_fast) {
2218 Ok(fast) => Some(Arc::new(FsInMemoryTwoTierIndex::new(fast, None))),
2219 Err(err) => {
2220 tracing::debug!(
2221 path = %fallback_fast.display(),
2222 error = %err,
2223 "fast-only semantic fallback index load failed"
2224 );
2225 None
2226 }
2227 }
2228}
2229
2230fn two_tier_index_supports_mode(
2231 index: &FsInMemoryTwoTierIndex,
2232 tier_mode: SemanticTierMode,
2233) -> bool {
2234 !matches!(
2235 tier_mode,
2236 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly
2237 ) || index.has_quality_index()
2238}
2239
/// A lexical hit resolved to its semantic identity.
#[derive(Debug, Clone)]
struct ResolvedSemanticDocId {
    /// Message id; used as the key of `ProgressiveLexicalCache::hits_by_message`.
    message_id: u64,
    /// Document id reported to the fusion layer for this hit.
    doc_id: String,
}
2245
// Tuple aliases used when resolving semantic doc ids for progressive search.
// NOTE(review): the tuple field meanings below are inferred from the shapes only — confirm
// at the use sites.

/// Lookup key; its shape resembles `SearchHitKey` (source id, source path, conversation id,
/// title, line number, created-at, content hash), with the fifth field a plain `i64`.
type ProgressiveLookupKey = (String, String, Option<i64>, String, i64, Option<i64>, u64);
/// Key for the exact lookup query (two integer ids).
type ProgressiveExactQueryKey = (i64, i64);
/// Key for the fallback lookup query (two strings and an integer).
type ProgressiveFallbackQueryKey = (String, String, i64);
/// One resolved row: the lookup key with its resolved doc id, or `None`.
type ResolvedSemanticLookupRow = Option<(ProgressiveLookupKey, ResolvedSemanticDocId)>;
2250
/// Copy of a lexical `SearchHit` without its score, kept so the hit can be rebuilt later
/// with a different score (see `from_search_hit` / `to_search_hit`).
#[derive(Debug, Clone)]
struct ProgressiveLexicalHit {
    /// Title; empty when the field mask did not ask for titles.
    title: String,
    /// Snippet; empty when the field mask did not ask for snippets.
    snippet: String,
    /// Content; empty when the field mask does not need content.
    content: String,
    /// Content hash copied from the hit.
    content_hash: u64,
    /// Conversation id copied from the hit.
    conversation_id: Option<i64>,
    /// Source path copied from the hit.
    source_path: String,
    /// Agent copied from the hit.
    agent: String,
    /// Workspace copied from the hit.
    workspace: String,
    /// Original workspace copied from the hit.
    workspace_original: Option<String>,
    /// Creation timestamp copied from the hit.
    created_at: Option<i64>,
    /// Match type copied from the hit.
    match_type: MatchType,
    /// Line number copied from the hit.
    line_number: Option<usize>,
    /// Source id copied from the hit.
    source_id: String,
    /// Origin kind copied from the hit.
    origin_kind: String,
    /// Origin host copied from the hit.
    origin_host: Option<String>,
}
2269
2270impl ProgressiveLexicalHit {
2271 fn from_search_hit(hit: &SearchHit, field_mask: FieldMask) -> Self {
2272 Self {
2273 title: if field_mask.wants_title() {
2274 hit.title.clone()
2275 } else {
2276 String::new()
2277 },
2278 snippet: if field_mask.wants_snippet() {
2279 hit.snippet.clone()
2280 } else {
2281 String::new()
2282 },
2283 content: if field_mask.needs_content() {
2284 hit.content.clone()
2285 } else {
2286 String::new()
2287 },
2288 content_hash: hit.content_hash,
2289 conversation_id: hit.conversation_id,
2290 source_path: hit.source_path.clone(),
2291 agent: hit.agent.clone(),
2292 workspace: hit.workspace.clone(),
2293 workspace_original: hit.workspace_original.clone(),
2294 created_at: hit.created_at,
2295 match_type: hit.match_type,
2296 line_number: hit.line_number,
2297 source_id: hit.source_id.clone(),
2298 origin_kind: hit.origin_kind.clone(),
2299 origin_host: hit.origin_host.clone(),
2300 }
2301 }
2302
2303 fn to_search_hit(&self, score: f32) -> SearchHit {
2304 SearchHit {
2305 title: self.title.clone(),
2306 snippet: self.snippet.clone(),
2307 content: self.content.clone(),
2308 content_hash: self.content_hash,
2309 conversation_id: self.conversation_id,
2310 score,
2311 source_path: self.source_path.clone(),
2312 agent: self.agent.clone(),
2313 workspace: self.workspace.clone(),
2314 workspace_original: self.workspace_original.clone(),
2315 created_at: self.created_at,
2316 line_number: self.line_number,
2317 match_type: self.match_type,
2318 source_id: self.source_id.clone(),
2319 origin_kind: self.origin_kind.clone(),
2320 origin_host: self.origin_host.clone(),
2321 }
2322 }
2323}
2324
/// Snapshot of the most recent lexical search run by `CassProgressiveLexicalAdapter`.
#[derive(Debug, Default)]
struct ProgressiveLexicalCache {
    /// Captured lexical hits keyed by resolved message id (first hit per message wins).
    hits_by_message: HashMap<u64, ProgressiveLexicalHit>,
    /// `wildcard_fallback` flag copied from the lexical search result.
    wildcard_fallback: bool,
    /// Suggestions copied from the lexical search result.
    suggestions: Vec<QuerySuggestion>,
}
2331
/// Borrowed, copyable bundle of inputs for handling one progressive search phase.
#[derive(Clone, Copy)]
struct ProgressivePhaseContext<'a> {
    /// Query text.
    query: &'a str,
    /// Filters applied to the search.
    filters: &'a SearchFilters,
    /// Which hit fields the caller wants populated.
    field_mask: FieldMask,
    /// Lexical snapshot, when one is available.
    lexical_cache: Option<&'a ProgressiveLexicalCache>,
    /// Number of hits to return.
    limit: usize,
    /// Number of candidates to fetch; NOTE(review): presumably at least `limit` — confirm.
    fetch_limit: usize,
}
2341
/// Shared, immutable handle to a `ProgressiveLexicalCache`.
type ProgressiveLexicalSnapshot = Arc<ProgressiveLexicalCache>;
2343
/// Adapter that lets a `SearchClient` serve as a frankensearch `LexicalSearch` backend.
struct CassProgressiveLexicalAdapter {
    /// Client that executes the lexical search.
    client: Arc<SearchClient>,
    /// Filters applied to every search issued through this adapter.
    filters: SearchFilters,
    /// Field mask applied to every search issued through this adapter.
    field_mask: FieldMask,
    /// Sparse threshold forwarded to `search_with_fallback`.
    sparse_threshold: usize,
    /// Slot that receives a fresh lexical snapshot after each successful search.
    shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
}
2351
2352impl CassProgressiveLexicalAdapter {
2353 fn new(
2354 client: Arc<SearchClient>,
2355 filters: SearchFilters,
2356 field_mask: FieldMask,
2357 sparse_threshold: usize,
2358 shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
2359 ) -> Self {
2360 Self {
2361 client,
2362 filters,
2363 field_mask,
2364 sparse_threshold,
2365 shared,
2366 }
2367 }
2368}
2369
2370impl FsLexicalSearch for CassProgressiveLexicalAdapter {
2371 fn search<'a>(
2372 &'a self,
2373 cx: &'a FsCx,
2374 query: &'a str,
2375 limit: usize,
2376 ) -> FsSearchFuture<'a, Vec<FsScoredResult>> {
2377 Box::pin(async move {
2378 if cx.is_cancel_requested() {
2379 return Err(FsSearchError::Cancelled {
2380 phase: "lexical".to_string(),
2381 reason: "cancel requested".to_string(),
2382 });
2383 }
2384
2385 let result = self
2386 .client
2387 .search_with_fallback(
2388 query,
2389 self.filters.clone(),
2390 limit,
2391 0,
2392 self.sparse_threshold,
2393 self.field_mask,
2394 )
2395 .map_err(|err| FsSearchError::SubsystemError {
2396 subsystem: "cass_lexical_adapter",
2397 source: Box::new(std::io::Error::other(err.to_string())),
2398 })?;
2399
2400 let resolved = self
2401 .client
2402 .resolve_semantic_doc_ids_for_hits(&result.hits)
2403 .map_err(|err| FsSearchError::SubsystemError {
2404 subsystem: "cass_lexical_adapter",
2405 source: Box::new(std::io::Error::other(err.to_string())),
2406 })?;
2407
2408 let mut scored = Vec::with_capacity(result.hits.len());
2409 let mut hits_by_message = HashMap::with_capacity(result.hits.len());
2410
2411 for (hit, resolved_doc) in result.hits.iter().zip(resolved) {
2412 let Some(resolved_doc) = resolved_doc else {
2413 continue;
2414 };
2415 hits_by_message
2416 .entry(resolved_doc.message_id)
2417 .or_insert_with(|| {
2418 ProgressiveLexicalHit::from_search_hit(hit, self.field_mask)
2419 });
2420 scored.push(FsScoredResult {
2421 doc_id: resolved_doc.doc_id,
2422 score: hit.score,
2423 source: FsScoreSource::Lexical,
2424 index: None,
2425 fast_score: None,
2426 quality_score: None,
2427 lexical_score: Some(hit.score),
2428 rerank_score: None,
2429 explanation: None,
2430 metadata: None,
2431 });
2432 }
2433
2434 if let Ok(mut guard) = self.shared.lock() {
2435 *guard = Arc::new(ProgressiveLexicalCache {
2436 hits_by_message,
2437 wildcard_fallback: result.wildcard_fallback,
2438 suggestions: result.suggestions,
2439 });
2440 }
2441
2442 Ok(scored)
2443 })
2444 }
2445
2446 fn index_document<'a>(
2447 &'a self,
2448 _cx: &'a FsCx,
2449 _doc: &'a frankensearch::IndexableDocument,
2450 ) -> FsSearchFuture<'a, ()> {
2451 Box::pin(async move {
2452 Err(FsSearchError::SubsystemError {
2453 subsystem: "cass_lexical_adapter",
2454 source: Box::new(std::io::Error::other("cass lexical adapter is read-only")),
2455 })
2456 })
2457 }
2458
2459 fn commit<'a>(&'a self, _cx: &'a FsCx) -> FsSearchFuture<'a, ()> {
2460 Box::pin(async move { Ok(()) })
2461 }
2462
2463 fn doc_count(&self) -> usize {
2464 self.client.total_docs()
2465 }
2466}
2467
/// Search entry point that owns the index reader, SQLite handle, caches and semantic state.
pub struct SearchClient {
    /// Lexical index reader together with its field handles, when an index is open.
    reader: Option<(IndexReader, FsCassFields)>,
    /// SQLite connection slot.
    sqlite: Mutex<Option<SendConnection>>,
    /// Path of the SQLite database, when known.
    sqlite_path: Option<PathBuf>,
    /// Sharded prefix cache.
    prefix_cache: Mutex<CacheShards>,
    /// NOTE(review): presumably makes searches reload the reader first — confirm at the use site.
    reload_on_search: bool,
    /// Time of the last reader reload, if any.
    last_reload: Mutex<Option<Instant>>,
    /// Last observed reader generation, if any.
    last_generation: Mutex<Option<u64>>,
    /// Shared reload counter.
    reload_epoch: Arc<AtomicU64>,
    /// Sender for warm-up jobs, when warming is enabled.
    warm_tx: Option<mpsc::Sender<WarmJob>>,
    /// Join handle of the warm-up thread; held but never read (leading underscore).
    _warm_handle: Option<std::thread::JoinHandle<()>>,
    /// Metrics collected by this client.
    metrics: Metrics,
    /// Cache namespace; also the key removed from `FEDERATED_SEARCH_READERS` on drop.
    cache_namespace: String,
    /// Semantic search state, once initialised.
    semantic: Mutex<Option<SemanticSearchState>>,
    /// Total count recorded from the most recent Tantivy query, if any.
    last_tantivy_total_count: Mutex<Option<usize>>,
}
2487
/// Feature switches for constructing a `SearchClient`; both default to `true`.
#[derive(Debug, Clone, Copy)]
pub struct SearchClientOptions {
    /// Enables reader reloading.
    pub enable_reload: bool,
    /// Enables background warm-up.
    pub enable_warm: bool,
}
2493
2494impl Default for SearchClientOptions {
2495 fn default() -> Self {
2496 Self {
2497 enable_reload: true,
2498 enable_warm: true,
2499 }
2500 }
2501}
2502
2503impl Drop for SearchClient {
2504 fn drop(&mut self) {
2505 FEDERATED_SEARCH_READERS
2506 .write()
2507 .remove(&self.cache_namespace);
2508 }
2509}
2510
/// Snapshot of cache and reader counters reported alongside search results.
/// NOTE(review): the producers of these counters are outside this view — the field notes
/// below restate the names and should be confirmed there.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheStats {
    /// Number of cache hits.
    pub cache_hits: u64,
    /// Number of cache misses.
    pub cache_miss: u64,
    /// Number of cache shortfalls.
    pub cache_shortfall: u64,
    /// Number of reader reloads.
    pub reloads: u64,
    /// Total time spent reloading, in milliseconds.
    pub reload_ms_total: u128,
    /// Total entry capacity.
    pub total_cap: usize,
    /// Total cost currently held.
    pub total_cost: usize,
    /// Number of evictions.
    pub eviction_count: u64,
    /// Approximate bytes held by the cache.
    pub approx_bytes: usize,
    /// Byte capacity.
    pub byte_cap: usize,
    /// Name of the eviction policy; `"unknown"` in the default value.
    pub eviction_policy: &'static str,
    /// Number of ghost entries.
    pub ghost_entries: usize,
    /// Number of admission rejections.
    pub admission_rejects: u64,
    /// Number of prewarm jobs scheduled.
    pub prewarm_scheduled: u64,
    /// Number of prewarm jobs skipped under pressure.
    pub prewarm_skipped_pressure: u64,
    /// Reader generation, when known.
    pub reader_generation: Option<u64>,
}
2539
impl Default for CacheStats {
    /// All counters zero, no reader generation, and the eviction policy
    /// reported as `"unknown"` (which is why this is not derived).
    fn default() -> Self {
        Self {
            cache_hits: 0,
            cache_miss: 0,
            cache_shortfall: 0,
            reloads: 0,
            reload_ms_total: 0,
            total_cap: 0,
            total_cost: 0,
            eviction_count: 0,
            approx_bytes: 0,
            byte_cap: 0,
            eviction_policy: "unknown",
            ghost_entries: 0,
            admission_rejects: 0,
            prewarm_scheduled: 0,
            prewarm_skipped_pressure: 0,
            reader_generation: None,
        }
    }
}
2562
2563static CACHE_SHARD_CAP: Lazy<usize> = Lazy::new(|| {
2566 dotenvy::var("CASS_CACHE_SHARD_CAP")
2567 .ok()
2568 .and_then(|v| v.parse::<usize>().ok())
2569 .filter(|v| *v > 0)
2570 .unwrap_or(256)
2571});
2572
2573static CACHE_TOTAL_CAP: Lazy<usize> = Lazy::new(|| {
2575 dotenvy::var("CASS_CACHE_TOTAL_CAP")
2576 .ok()
2577 .and_then(|v| v.parse::<usize>().ok())
2578 .filter(|v| *v > 0)
2579 .unwrap_or(2048)
2580});
2581
2582static CACHE_DEBUG_ENABLED: Lazy<bool> = Lazy::new(|| {
2583 dotenvy::var("CASS_DEBUG_CACHE_METRICS")
2584 .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
2585 .unwrap_or(false)
2586});
2587
/// Byte budget for the result cache. Taken from `CASS_CACHE_BYTE_CAP` when the
/// variable is readable (an unparsable value falls back to the
/// memory-derived default), otherwise computed by `default_cache_byte_cap`.
static CACHE_BYTE_CAP: Lazy<usize> = Lazy::new(|| match dotenvy::var("CASS_CACHE_BYTE_CAP") {
    Ok(value) => cache_byte_cap_from_env_value(Some(&value), available_memory_bytes()),
    Err(_) => default_cache_byte_cap(),
});
2594
/// Process-wide eviction policy, parsed once from `CASS_CACHE_EVICTION_POLICY`.
static CACHE_EVICTION_POLICY: Lazy<CacheEvictionPolicy> = Lazy::new(|| {
    cache_eviction_policy_from_env_value(dotenvy::var("CASS_CACHE_EVICTION_POLICY").ok().as_deref())
});
2598
/// Byte cap used when available memory is unknown; also the lower clamp
/// bound of the memory-derived default (64 MiB).
const DEFAULT_CACHE_BYTE_CAP_FALLBACK: usize = 64 * 1024 * 1024;
/// The default byte cap is available memory divided by this value.
const DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR: u64 = 128;
/// Upper bound of the memory-derived default byte cap (2 GiB).
const DEFAULT_CACHE_BYTE_CAP_CEILING: u64 = 2 * 1024 * 1024 * 1024;
/// Ghost-key capacity is `total_cap` multiplied by this value.
const S3_FIFO_GHOST_CAP_MULTIPLIER: usize = 2;
/// An entry is "large" for S3-FIFO admission when it exceeds the cap divided
/// (rounding up) by this value.
const S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR: usize = 4;
/// Prewarm entry pressure triggers at NUMERATOR/DENOMINATOR (9/10) of `total_cap`.
const PREWARM_ENTRY_PRESSURE_NUMERATOR: usize = 9;
const PREWARM_ENTRY_PRESSURE_DENOMINATOR: usize = 10;
/// Prewarm byte pressure triggers at NUMERATOR/DENOMINATOR (4/5) of `byte_cap`.
const PREWARM_BYTE_PRESSURE_NUMERATOR: usize = 4;
const PREWARM_BYTE_PRESSURE_DENOMINATOR: usize = 5;

/// Version tag embedded in each client's cache namespace.
const CACHE_KEY_VERSION: &str = "1";
2610
2611static WARM_DEBOUNCE_MS: Lazy<u64> = Lazy::new(|| {
2613 dotenvy::var("CASS_WARM_DEBOUNCE_MS")
2614 .ok()
2615 .and_then(|v| v.parse::<u64>().ok())
2616 .filter(|v| *v > 0)
2617 .unwrap_or(120)
2618});
2619
/// Default cache byte cap derived from the memory currently reported by
/// `available_memory_bytes()`.
fn default_cache_byte_cap() -> usize {
    default_cache_byte_cap_for_available(available_memory_bytes())
}
2623
2624fn cache_byte_cap_from_env_value(value: Option<&str>, available_bytes: Option<u64>) -> usize {
2625 let Some(raw) = value else {
2626 return default_cache_byte_cap_for_available(available_bytes);
2627 };
2628 raw.parse::<usize>()
2629 .unwrap_or_else(|_| default_cache_byte_cap_for_available(available_bytes))
2630}
2631
2632fn default_cache_byte_cap_for_available(available_bytes: Option<u64>) -> usize {
2633 let Some(available_bytes) = available_bytes else {
2634 return DEFAULT_CACHE_BYTE_CAP_FALLBACK;
2635 };
2636 let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
2637 let budget = available_bytes / DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR;
2638 let budget = budget.min(DEFAULT_CACHE_BYTE_CAP_CEILING);
2639 let budget = usize::try_from(budget).unwrap_or(ceiling);
2640 budget.clamp(DEFAULT_CACHE_BYTE_CAP_FALLBACK, ceiling)
2641}
2642
/// Admission/eviction strategy used by `CacheShards`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CacheEvictionPolicy {
    /// Every entry is admitted; no ghost keys are recorded.
    Lru,
    /// Large first-time entries are rejected unless their key is a recorded ghost.
    S3Fifo,
}
2648
2649impl CacheEvictionPolicy {
2650 fn label(self) -> &'static str {
2651 match self {
2652 CacheEvictionPolicy::Lru => "lru",
2653 CacheEvictionPolicy::S3Fifo => "s3-fifo",
2654 }
2655 }
2656}
2657
2658fn cache_eviction_policy_from_env_value(value: Option<&str>) -> CacheEvictionPolicy {
2659 match value.map(str::trim).filter(|value| !value.is_empty()) {
2660 Some(value) if value.eq_ignore_ascii_case("s3-fifo") => CacheEvictionPolicy::S3Fifo,
2661 Some(value) if value.eq_ignore_ascii_case("s3fifo") => CacheEvictionPolicy::S3Fifo,
2662 Some(value) if value.eq_ignore_ascii_case("s3_fifo") => CacheEvictionPolicy::S3Fifo,
2663 _ => CacheEvictionPolicy::Lru,
2664 }
2665}
2666
/// A search hit stored in the result cache together with precomputed
/// matching aids.
#[derive(Clone)]
struct CachedHit {
    /// The hit as returned to callers.
    hit: SearchHit,
    // NOTE(review): presumably the lowercased content used for cached
    // re-matching; confirm where it is populated.
    lc_content: String,
    // NOTE(review): presumably the lowercased title, when there is one; confirm.
    lc_title: Option<String>,
    // NOTE(review): looks like a 64-bit bloom signature of the hit's terms; confirm.
    bloom64: u64,
}
2674
2675impl CachedHit {
2676 fn approx_bytes(&self) -> usize {
2679 let base = std::mem::size_of::<Self>();
2681 let hit_strings = self.hit.title.len()
2683 + self.hit.snippet.len()
2684 + self.hit.content.len()
2685 + self.hit.source_path.len()
2686 + self.hit.agent.len()
2687 + self.hit.workspace.len()
2688 + self
2689 .hit
2690 .workspace_original
2691 .as_ref()
2692 .map_or(0, std::string::String::len)
2693 + self.hit.source_id.len()
2694 + self.hit.origin_kind.len()
2695 + self
2696 .hit
2697 .origin_host
2698 .as_ref()
2699 .map_or(0, std::string::String::len);
2700 let lc_strings =
2702 self.lc_content.len() + self.lc_title.as_ref().map_or(0, std::string::String::len);
2703 base + hit_strings + lc_strings
2704 }
2705}
2706
/// Result cache split into named shards, each an LRU of key -> hits, with
/// global entry and byte budgets.
struct CacheShards {
    /// Shard name -> per-shard LRU (created lazily by `shard_mut`).
    shards: HashMap<Arc<str>, LruCache<Arc<str>, Vec<CachedHit>>>,
    /// Upper bound on `total_cost` (never less than 1).
    total_cap: usize,
    /// Sum of the hit counts (`Vec::len`) of all cached values.
    total_cost: usize,
    /// Number of entries evicted so far.
    eviction_count: u64,
    /// Sum of `CachedHit::approx_bytes` over all cached hits.
    total_bytes: usize,
    /// Upper bound on `total_bytes`; `0` disables the byte budget.
    byte_cap: usize,
    /// Admission/eviction policy in effect.
    policy: CacheEvictionPolicy,
    /// Ghost keys in insertion order (oldest at the front).
    ghost_keys: VecDeque<Arc<str>>,
    /// Set view of `ghost_keys` for O(1) membership checks.
    ghost_set: HashSet<Arc<str>>,
    /// Number of `put` calls rejected by admission control.
    admission_rejects: u64,
}
2725
2726impl CacheShards {
2727 fn new(total_cap: usize, byte_cap: usize) -> Self {
2728 Self::new_with_policy(total_cap, byte_cap, *CACHE_EVICTION_POLICY)
2729 }
2730
2731 fn new_with_policy(total_cap: usize, byte_cap: usize, policy: CacheEvictionPolicy) -> Self {
2732 Self {
2733 shards: HashMap::new(),
2734 total_cap: total_cap.max(1),
2735 total_cost: 0,
2736 eviction_count: 0,
2737 total_bytes: 0,
2738 byte_cap,
2739 policy,
2740 ghost_keys: VecDeque::new(),
2741 ghost_set: HashSet::new(),
2742 admission_rejects: 0,
2743 }
2744 }
2745
2746 fn shard_mut(&mut self, name: &str) -> &mut LruCache<Arc<str>, Vec<CachedHit>> {
2747 let interned_name = intern_cache_key(name);
2749 self.shards
2750 .entry(interned_name)
2751 .or_insert_with(|| LruCache::new(NonZeroUsize::new(*CACHE_SHARD_CAP).unwrap()))
2752 }
2753
2754 fn shard_opt(&self, name: &str) -> Option<&LruCache<Arc<str>, Vec<CachedHit>>> {
2755 self.shards.get(name)
2757 }
2758
2759 fn put(&mut self, shard_name: &str, key: Arc<str>, value: Vec<CachedHit>) {
2760 let new_cost = value.len();
2761 let new_bytes: usize = value.iter().map(CachedHit::approx_bytes).sum();
2762 let replacing = self
2763 .shard_opt(shard_name)
2764 .is_some_and(|shard| shard.contains(&key));
2765
2766 if !replacing && !self.should_admit(&key, new_cost, new_bytes) {
2767 self.admission_rejects += 1;
2768 self.record_ghost(key);
2769 return;
2770 }
2771
2772 self.remove_ghost(&key);
2773
2774 let shard = self.shard_mut(shard_name);
2775 let old_val = shard.put(key, value);
2776 let (old_cost, old_bytes) = old_val.as_ref().map_or((0, 0), |v| {
2777 (v.len(), v.iter().map(CachedHit::approx_bytes).sum())
2778 });
2779
2780 self.total_cost = self
2781 .total_cost
2782 .saturating_add(new_cost)
2783 .saturating_sub(old_cost);
2784 self.total_bytes = self
2785 .total_bytes
2786 .saturating_add(new_bytes)
2787 .saturating_sub(old_bytes);
2788 self.evict_until_within_cap();
2789 }
2790
2791 fn evict_until_within_cap(&mut self) {
2792 while self.total_cost > self.total_cap
2794 || (self.byte_cap > 0 && self.total_bytes > self.byte_cap)
2795 {
2796 let byte_pressure = self.byte_cap > 0 && self.total_bytes > self.byte_cap;
2801 let mut largest_shard_key = None;
2802 let mut max_score = 0usize;
2803 for (k, v) in self.shards.iter() {
2804 let score = if byte_pressure {
2805 shard_cached_bytes(v)
2806 } else {
2807 v.len()
2808 };
2809 if score > max_score {
2810 max_score = score;
2811 largest_shard_key = Some(k.clone());
2812 }
2813 }
2814
2815 if let Some(key) = largest_shard_key {
2816 if let Some(shard) = self.shards.get_mut(&key)
2817 && let Some((evicted_key, v)) = shard.pop_lru()
2818 {
2819 let evicted_bytes: usize = v.iter().map(CachedHit::approx_bytes).sum();
2820 self.total_cost = self.total_cost.saturating_sub(v.len());
2821 self.total_bytes = self.total_bytes.saturating_sub(evicted_bytes);
2822 self.eviction_count += 1;
2823 self.record_ghost(evicted_key);
2824 }
2825 } else {
2826 break; }
2828 }
2829 }
2830
2831 fn should_admit(&self, key: &Arc<str>, cost: usize, bytes: usize) -> bool {
2832 if self.policy == CacheEvictionPolicy::Lru || self.ghost_set.contains(key) {
2833 return true;
2834 }
2835 !self.is_s3_fifo_large_candidate(cost, bytes)
2836 }
2837
2838 fn is_s3_fifo_large_candidate(&self, cost: usize, bytes: usize) -> bool {
2839 let entry_heavy = cost
2840 > self
2841 .total_cap
2842 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2843 let byte_heavy = self.byte_cap > 0
2844 && bytes
2845 > self
2846 .byte_cap
2847 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2848 entry_heavy || byte_heavy
2849 }
2850
2851 fn record_ghost(&mut self, key: Arc<str>) {
2852 if self.policy != CacheEvictionPolicy::S3Fifo {
2853 return;
2854 }
2855 if self.ghost_set.insert(key.clone()) {
2856 self.ghost_keys.push_back(key);
2857 }
2858 let cap = self
2859 .total_cap
2860 .saturating_mul(S3_FIFO_GHOST_CAP_MULTIPLIER)
2861 .max(1);
2862 while self.ghost_set.len() > cap {
2863 if let Some(old) = self.ghost_keys.pop_front() {
2864 self.ghost_set.remove(&old);
2865 } else {
2866 break;
2867 }
2868 }
2869 }
2870
2871 fn remove_ghost(&mut self, key: &Arc<str>) {
2872 self.ghost_set.remove(key);
2873 self.ghost_keys.retain(|candidate| candidate != key);
2874 }
2875
2876 fn clear(&mut self) {
2877 self.shards.clear();
2878 self.total_cost = 0;
2879 self.total_bytes = 0;
2880 self.ghost_keys.clear();
2881 self.ghost_set.clear();
2882 }
2884
2885 fn total_cost(&self) -> usize {
2886 self.total_cost
2887 }
2888
2889 fn total_cap(&self) -> usize {
2890 self.total_cap
2891 }
2892
2893 fn eviction_count(&self) -> u64 {
2894 self.eviction_count
2895 }
2896
2897 fn total_bytes(&self) -> usize {
2898 self.total_bytes
2899 }
2900
2901 fn byte_cap(&self) -> usize {
2902 self.byte_cap
2903 }
2904
2905 fn policy_label(&self) -> &'static str {
2906 self.policy.label()
2907 }
2908
2909 fn ghost_entries(&self) -> usize {
2910 self.ghost_set.len()
2911 }
2912
2913 fn admission_rejects(&self) -> u64 {
2914 self.admission_rejects
2915 }
2916
2917 fn prewarm_pressure(&self) -> bool {
2918 let entry_pressure = self
2919 .total_cost
2920 .saturating_mul(PREWARM_ENTRY_PRESSURE_DENOMINATOR)
2921 >= self
2922 .total_cap
2923 .saturating_mul(PREWARM_ENTRY_PRESSURE_NUMERATOR);
2924 let byte_pressure = self.byte_cap > 0
2925 && self
2926 .total_bytes
2927 .saturating_mul(PREWARM_BYTE_PRESSURE_DENOMINATOR)
2928 >= self
2929 .byte_cap
2930 .saturating_mul(PREWARM_BYTE_PRESSURE_NUMERATOR);
2931 entry_pressure || byte_pressure
2932 }
2933}
2934
2935fn shard_cached_bytes(shard: &LruCache<Arc<str>, Vec<CachedHit>>) -> usize {
2936 shard
2937 .iter()
2938 .map(|(_key, hits)| hits.iter().map(CachedHit::approx_bytes).sum::<usize>())
2939 .sum()
2940}
2941
/// Message sent over the warm channel (`warm_tx`) to the warm worker.
// NOTE(review): fields presumably identify the query, its filter fingerprint
// and the cache shard to warm; confirm against the worker loop.
#[derive(Clone)]
struct WarmJob {
    query: String,
    filters_fingerprint: String,
    shard_name: String,
}
2948
/// Outcome of the adaptive prewarm check.
// NOTE(review): variant meanings are inferred from their names (schedule a
// prewarm, skip a cold query, skip under cache pressure); confirm at the
// decision site.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AdaptivePrewarmDecision {
    Schedule,
    SkipCold,
    SkipPressure,
}
2955
/// Value held in the thread-local searcher cache (`THREAD_SEARCHER`).
// NOTE(review): `epoch` and `reader_key` presumably decide whether the cached
// searcher is still valid for a given reader; confirm in `searcher_for_thread`.
#[derive(Clone)]
struct SearcherCacheEntry {
    epoch: u64,
    reader_key: usize,
    searcher: Searcher,
}
2962
thread_local! {
    /// Per-thread slot holding at most one cached searcher entry; starts empty.
    static THREAD_SEARCHER: RefCell<Option<SearcherCacheEntry>> = const { RefCell::new(None) };
}
2966
/// One reader of a federated index together with its field handles.
#[derive(Clone)]
struct FederatedIndexReader {
    reader: IndexReader,
    fields: FsCassFields,
}
2972
/// Global registry of federated readers keyed by a client's cache namespace.
/// Entries are inserted when a client opens and removed when it is dropped.
static FEDERATED_SEARCH_READERS: Lazy<RwLock<HashMap<String, Arc<Vec<FederatedIndexReader>>>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
/// Monotonic source of per-client ids used in the cache namespace; starts at 1.
static SEARCH_CLIENT_INSTANCE_COUNTER: AtomicU64 = AtomicU64::new(1);
2976
/// Levenshtein edit distance between `a` and `b`, measured in `char`s.
///
/// Uses the two-row dynamic-programming formulation, so memory is
/// proportional to the length of `b`.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    if source.is_empty() {
        return target.len();
    }
    if target.is_empty() {
        return source.len();
    }

    let width = target.len() + 1;
    // `above` is the finished previous row; `row` is the one being filled.
    let mut above: Vec<usize> = (0..width).collect();
    let mut row: Vec<usize> = vec![0; width];

    for (row_idx, &src_ch) in source.iter().enumerate() {
        row[0] = row_idx + 1;
        for (col, &dst_ch) in target.iter().enumerate() {
            let deletion = above[col + 1] + 1;
            let insertion = row[col] + 1;
            let substitution = above[col] + usize::from(src_ch != dst_ch);
            row[col + 1] = deletion.min(insertion).min(substitution);
        }
        std::mem::swap(&mut above, &mut row);
    }

    // After the final swap the completed row lives in `above`.
    above[target.len()]
}
3009
3010fn normalize_term_parts(raw: &str) -> Vec<String> {
3015 let mut parts = Vec::new();
3016 for token in nfc_sanitize_query(raw).split_whitespace() {
3017 let mut current = String::new();
3018 let mut chars = token.chars().peekable();
3019 while let Some(ch) = chars.next() {
3020 let trailing_wildcard = ch == '*' && chars.peek().is_none() && !current.is_empty();
3021 if ch.is_alphanumeric() || ch == '_' || trailing_wildcard {
3022 current.push(ch);
3023 continue;
3024 }
3025
3026 if !current.is_empty() {
3027 parts.push(std::mem::take(&mut current));
3028 }
3029 }
3030
3031 if !current.is_empty() {
3032 parts.push(current);
3033 }
3034 }
3035 parts
3036}
3037
3038fn normalize_phrase_terms(raw: &str) -> Vec<String> {
3040 normalize_term_parts(raw)
3041 .into_iter()
3042 .map(|s| s.trim_matches('*').to_lowercase())
3043 .filter(|s| !s.is_empty())
3044 .collect()
3045}
3046
3047fn render_fts5_term_part(part: &str) -> Option<String> {
3048 let pattern = FsCassWildcardPattern::parse(part);
3049 if matches!(
3050 pattern,
3051 FsCassWildcardPattern::Suffix(_)
3052 | FsCassWildcardPattern::Substring(_)
3053 | FsCassWildcardPattern::Complex(_)
3054 ) {
3055 return None;
3056 }
3057
3058 Some(part.to_string())
3059}
3060
3061fn dominant_match_type(query: &str) -> MatchType {
3064 let mut worst = MatchType::Exact;
3065 for term in query.split_whitespace() {
3066 let pattern = FsCassWildcardPattern::parse(term);
3067 let mt = match pattern {
3068 FsCassWildcardPattern::Exact(_) => MatchType::Exact,
3069 FsCassWildcardPattern::Prefix(_) => MatchType::Prefix,
3070 FsCassWildcardPattern::Suffix(_) => MatchType::Suffix,
3071 FsCassWildcardPattern::Substring(_) => MatchType::Substring,
3072 FsCassWildcardPattern::Complex(_) => MatchType::Wildcard,
3073 };
3074 if mt.quality_factor() < worst.quality_factor() {
3076 worst = mt;
3077 }
3078 }
3079 worst
3080}
3081
/// Heuristic for content that is only a bare tool-invocation marker.
///
/// For trimmed content starting with `[Tool:`:
/// * no closing `]` at all: noise;
/// * anything non-blank after the first `]`: not noise;
/// * otherwise noise only when the text between `[Tool:` and `]` is blank.
///
/// Any other content is noise only when it is shorter than 20 bytes and its
/// lowercase form starts with `[tool` or `tool:`.
pub(crate) fn is_tool_invocation_noise(content: &str) -> bool {
    const MARKER: &str = "[Tool:";
    let text = content.trim();

    if let Some(rest) = text.strip_prefix(MARKER) {
        // The marker contains no `]`, so the first `]` of `rest` is also the
        // first `]` of the whole text.
        return match rest.split_once(']') {
            None => true,
            Some((inner, tail)) => tail.trim().is_empty() && inner.trim().is_empty(),
        };
    }

    text.len() < 20 && {
        let lowered = text.to_lowercase();
        lowered.starts_with("[tool") || lowered.starts_with("tool:")
    }
}
3118
3119fn hit_content_for_noise_check(hit: &SearchHit) -> &str {
3120 if hit.content.is_empty() {
3121 &hit.snippet
3122 } else {
3123 &hit.content
3124 }
3125}
3126
3127fn hit_is_noise(hit: &SearchHit, query: &str) -> bool {
3128 let content_to_check = hit_content_for_noise_check(hit);
3129 if content_to_check.is_empty() {
3139 return false;
3140 }
3141 is_search_noise_text(content_to_check, query) || is_tool_invocation_noise(content_to_check)
3142}
3143
/// Builds a preview of trimmed `content`: the first 200 characters, followed
/// by `...` when more characters remain.
fn snippet_from_content(content: &str) -> String {
    let body = content.trim();
    // The 201st character (index 200), if any, marks the byte offset to cut at.
    match body.char_indices().nth(200) {
        Some((cut, _)) => format!("{}...", &body[..cut]),
        None => body.to_string(),
    }
}
3154
/// Test-only convenience wrapper: deduplicates `hits` with an empty query.
#[cfg(test)]
pub(crate) fn deduplicate_hits(hits: Vec<SearchHit>) -> Vec<SearchHit> {
    deduplicate_hits_with_query(hits, "")
}
3166
3167pub(crate) fn deduplicate_hits_with_query(hits: Vec<SearchHit>, query: &str) -> Vec<SearchHit> {
3168 let mut source_ids: HashMap<String, u32> = HashMap::new();
3175 let mut path_ids: HashMap<String, u32> = HashMap::new();
3176 let mut title_ids: HashMap<String, u32> = HashMap::new();
3177 let mut next_source_id: u32 = 0;
3178 let mut next_path_id: u32 = 0;
3179 let mut next_title_id: u32 = 0;
3180 type DedupKey = (
3181 u32,
3182 u32,
3183 Option<i64>,
3184 Option<u32>,
3185 Option<usize>,
3186 Option<i64>,
3187 u64,
3188 );
3189
3190 let mut seen: HashMap<DedupKey, usize> = HashMap::new();
3191 let mut deduped: Vec<SearchHit> = Vec::new();
3192
3193 for hit in hits {
3194 if hit_is_noise(&hit, query) {
3195 continue;
3196 }
3197
3198 let normalized_source_id = normalized_search_hit_source_id(&hit);
3201 let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
3202 *id
3203 } else {
3204 let id = next_source_id;
3205 next_source_id = next_source_id.saturating_add(1);
3206 source_ids.insert(normalized_source_id, id);
3207 id
3208 };
3209 let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
3210 *id
3211 } else {
3212 let id = next_path_id;
3213 next_path_id = next_path_id.saturating_add(1);
3214 path_ids.insert(hit.source_path.clone(), id);
3215 id
3216 };
3217 let title_key = if hit.conversation_id.is_some() {
3218 None
3219 } else {
3220 let normalized_title = hit.title.trim();
3221 Some(if let Some(id) = title_ids.get(normalized_title) {
3222 *id
3223 } else {
3224 let id = next_title_id;
3225 next_title_id = next_title_id.saturating_add(1);
3226 title_ids.insert(normalized_title.to_string(), id);
3227 id
3228 })
3229 };
3230 let key = (
3231 source_key,
3232 path_key,
3233 hit.conversation_id,
3234 title_key,
3235 hit.line_number,
3236 hit.created_at,
3237 hit.content_hash,
3238 );
3239
3240 if let Some(&existing_idx) = seen.get(&key) {
3241 if deduped[existing_idx].score < hit.score {
3243 deduped[existing_idx] = hit;
3244 }
3245 } else {
3247 seen.insert(key, deduped.len());
3248 deduped.push(hit);
3249 }
3250 }
3251
3252 deduped
3253}
3254
/// Decides whether a sparse first page should trigger a wildcard fallback.
///
/// Only the first page (`offset == 0`) qualifies. The result set is sparse
/// when it has fewer hits than `sparse_threshold`, capped at `limit` unless
/// `limit` is 0.
fn should_try_wildcard_fallback(
    returned_hits: usize,
    limit: usize,
    offset: usize,
    sparse_threshold: usize,
) -> bool {
    let threshold = match limit {
        0 => sparse_threshold,
        capped => sparse_threshold.min(capped),
    };
    offset == 0 && returned_hits < threshold
}
3273
3274fn should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(
3275 query: &str,
3276 returned_hits: usize,
3277) -> bool {
3278 if returned_hits != 0 {
3279 return false;
3280 }
3281
3282 for token in normalize_phrase_terms(query) {
3283 if token.chars().count() > AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS {
3284 return true;
3285 }
3286 }
3287
3288 false
3289}
3290
3291fn snippet_from_preview_without_full_content(
3292 field_mask: FieldMask,
3293 stored_preview: &str,
3294 query: &str,
3295) -> Option<String> {
3296 if field_mask.needs_content() || !field_mask.wants_snippet() || stored_preview.is_empty() {
3297 return None;
3298 }
3299
3300 cached_prefix_snippet(stored_preview, query, 160)
3301}
3302
/// A stored preview counts as complete when it is non-empty and does not end
/// with the ellipsis character `…`.
fn stored_preview_is_complete_content(stored_preview: &str) -> bool {
    !(stored_preview.is_empty() || stored_preview.ends_with('…'))
}
3308
3309impl SearchClient {
    /// Opens a search client with default options.
    ///
    /// See [`SearchClient::open_with_options`] for when `Ok(None)` is returned.
    pub fn open(index_path: &Path, db_path: Option<&Path>) -> Result<Option<Self>> {
        Self::open_with_options(index_path, db_path, SearchClientOptions::default())
    }
3313
    /// Opens a search client over `index_path` and, optionally, a SQLite
    /// database at `db_path`.
    ///
    /// Backends are tried in this order: a single Tantivy reader, then (only
    /// if that failed) federated readers. The database path is kept only if
    /// the file exists. Returns `Ok(None)` when no backend is available.
    /// Failures to open the Tantivy or federated readers are not surfaced as
    /// errors; they just leave that backend absent.
    pub fn open_with_options(
        index_path: &Path,
        db_path: Option<&Path>,
        options: SearchClientOptions,
    ) -> Result<Option<Self>> {
        let tantivy = fs_cass_open_search_reader(index_path, ReloadPolicy::Manual).ok();
        // The per-client id makes the namespace unique for every instance,
        // even when several clients open the same index.
        let client_id = SEARCH_CLIENT_INSTANCE_COUNTER.fetch_add(1, Ordering::Relaxed);
        let cache_namespace = format!(
            "v{}|schema:{}|client:{}|index:{}",
            CACHE_KEY_VERSION,
            FS_CASS_SCHEMA_HASH,
            client_id,
            index_path.display()
        );
        // Federated readers are only attempted when the single reader is unavailable.
        let federated_readers = if tantivy.is_none() {
            crate::search::tantivy::open_federated_search_readers(index_path, ReloadPolicy::Manual)
                .ok()
                .flatten()
                .filter(|readers| !readers.is_empty())
                .map(|readers| {
                    Arc::new(
                        readers
                            .into_iter()
                            .map(|(reader, fields)| FederatedIndexReader { reader, fields })
                            .collect::<Vec<_>>(),
                    )
                })
        } else {
            None
        };

        let sqlite_path = db_path.map(Path::to_path_buf).filter(|path| path.exists());

        // SQLite-only operation is allowed, but the user is warned about it.
        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_some() {
            tracing::warn!(
                index_path = %index_path.display(),
                "Tantivy search index not found or incompatible. \
                 Search results will be degraded. \
                 Run `cass index --full` to rebuild the index."
            );
        }

        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_none() {
            return Ok(None);
        }

        let reload_epoch = Arc::new(AtomicU64::new(0));
        let metrics = Metrics::default();

        // The warm worker is only considered for the single-reader backend.
        let warm_pair = if options.enable_warm
            && let Some((reader, fields)) = &tantivy
        {
            maybe_spawn_warm_worker(
                reader.clone(),
                *fields,
                reload_epoch.clone(),
                metrics.clone(),
            )
        } else {
            None
        };

        // Register federated readers under this client's namespace; `Drop`
        // removes the entry again.
        if let Some(readers) = &federated_readers {
            FEDERATED_SEARCH_READERS
                .write()
                .insert(cache_namespace.clone(), Arc::clone(readers));
        } else {
            FEDERATED_SEARCH_READERS.write().remove(&cache_namespace);
        }

        Ok(Some(Self {
            reader: tantivy,
            // The SQLite connection is opened on first use by `sqlite_guard`.
            sqlite: Mutex::new(None),
            sqlite_path,
            prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
            reload_on_search: options.enable_reload,
            last_reload: Mutex::new(None),
            last_generation: Mutex::new(None),
            reload_epoch,
            warm_tx: warm_pair.as_ref().map(|(tx, _)| tx.clone()),
            _warm_handle: warm_pair.map(|(_, h)| h),
            metrics,
            cache_namespace,
            semantic: Mutex::new(None),
            last_tantivy_total_count: Mutex::new(None),
        }))
    }
3401
3402 fn sqlite_guard(&self) -> Result<std::sync::MutexGuard<'_, Option<SendConnection>>> {
3403 let mut guard = self
3404 .sqlite
3405 .lock()
3406 .map_err(|_| anyhow!("sqlite lock poisoned"))?;
3407
3408 if guard.is_none()
3409 && let Some(path) = &self.sqlite_path
3410 {
3411 match open_search_hydration_sqlite(path, std::time::Duration::from_secs(1)) {
3412 Ok(conn) => {
3413 *guard = Some(SendConnection(conn));
3414 }
3415 Err(err) => {
3416 tracing::debug!(
3417 error = %err,
3418 path = %path.display(),
3419 "readonly sqlite open failed for search client"
3420 );
3421 }
3422 }
3423 }
3424
3425 Ok(guard)
3426 }
3427
    /// Runs a lexical search and returns one page of hits.
    ///
    /// Backend priority is: single Tantivy reader, then federated Tantivy
    /// readers, then SQLite FTS5. When a Tantivy backend exists its answer is
    /// final, even if empty; SQLite is consulted only when there is no
    /// Tantivy backend at all.
    ///
    /// * `query` - raw user query; NFC-normalized and sanitized internally.
    /// * `filters` - search filters, forwarded to the backend.
    /// * `limit` - page size; `0` means "no limit", resolved to
    ///   `total_docs()` capped by `no_limit_result_cap()` (at least 1).
    /// * `offset` - number of post-processed hits to skip.
    /// * `field_mask` - requested fields; also decides cache eligibility.
    ///
    /// # Errors
    /// Propagates reader-reload and backend search errors, and fails if the
    /// SQLite mutex is poisoned.
    pub fn search(
        &self,
        query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<Vec<SearchHit>> {
        use unicode_normalization::UnicodeNormalization;
        // Normalize to NFC first so every later step sees the same form.
        let query: String = query.nfc().collect();
        let query: &str = &query;
        let sanitized = nfc_sanitize_query(query);
        let field_mask = effective_field_mask(field_mask);
        let limit = if limit == 0 {
            self.total_docs().min(no_limit_result_cap()).max(1)
        } else {
            limit
        };
        let can_use_cache =
            field_mask.allows_cache() && (field_mask.needs_content() || field_mask.wants_snippet());

        // Refresh the reader(s) and record the generation before touching the
        // cache.
        if let Some((reader, _)) = &self.reader {
            self.maybe_reload_reader(reader)?;
            let searcher = self.searcher_for_thread(reader);
            self.track_generation(searcher.generation().generation_id());
        } else if let Some(readers) = self.federated_readers()
            && let Some(signature) = self.maybe_reload_federated_readers(readers.as_ref())?
        {
            self.track_generation(signature);
        }

        // Cache fast path: first page only, and never for wildcard or boolean
        // queries.
        if can_use_cache
            && offset == 0
            && !query.contains('*')
            && !fs_cass_has_boolean_operators(query)
        {
            self.maybe_schedule_adaptive_query_prewarm(&sanitized, &filters);
            if let Some(cached) = self.cached_prefix_hits(&sanitized, &filters) {
                let query_terms = QueryTermsLower::from_query(&sanitized);
                let mut filtered: Vec<SearchHit> = cached
                    .into_iter()
                    .filter(|h| hit_matches_query_cached_precomputed(h, &query_terms))
                    .map(|c| c.hit.clone())
                    .collect();
                if filtered.len() >= limit {
                    filtered.truncate(limit);
                    self.metrics.inc_cache_hits();
                    self.maybe_log_cache_metrics("hit");
                    return Ok(filtered);
                }
                // Cached entry exists but cannot fill the page: fall through
                // to a real search.
                self.metrics.inc_cache_shortfall();
                self.maybe_log_cache_metrics("shortfall");
            } else {
                self.metrics.inc_cache_miss();
                self.maybe_log_cache_metrics("miss");
            }
        }

        // Over-fetch so the page can still be filled after post-processing:
        // 2x for small targets, 1.5x (rounded up) otherwise.
        let target_hits = offset.saturating_add(limit);
        let initial_fetch_limit = if target_hits <= 16 {
            target_hits.saturating_mul(2)
        } else {
            target_hits.saturating_mul(3).div_ceil(2)
        };
        let session_path_filter_active = !filters.session_paths.is_empty();
        // With a session-path filter the retry may fetch up to the capped
        // document count; otherwise it fetches 3x the target.
        let fallback_fetch_limit = if session_path_filter_active {
            self.total_docs()
                .min(no_limit_result_cap())
                .max(target_hits.saturating_mul(3))
                .max(1)
        } else {
            target_hits.saturating_mul(3)
        };

        if let Some((reader, fields)) = &self.reader {
            tracing::info!(
                backend = "tantivy",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                "search_start"
            );
            // The backend is always queried from offset 0; `offset` is passed
            // to `postprocess_hits_page` instead.
            let (hits, tantivy_total_count) = self.search_tantivy(
                reader,
                fields,
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                0,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);

                // Retry only when the first fetch was saturated (so more hits
                // may exist) and the fallback limit is actually larger.
                let needs_retry = deduped_len < target_hits
                    && initial_hit_count == initial_fetch_limit
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        session_path_filter_active,
                        "retrying lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy(
                        reader,
                        fields,
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        0,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    // Keep the first page if the retry unexpectedly returns nothing.
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    "lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                "tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        } else if let Some(readers) = self.federated_readers() {
            tracing::info!(
                backend = "tantivy-federated",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                shards = readers.len(),
                "search_start"
            );
            let (hits, tantivy_total_count) = self.search_tantivy_federated(
                readers.as_ref(),
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);
                // Saturation test for the federated case: without a
                // session-path filter the hit count must equal limit x shards;
                // with one, reaching the per-shard limit is enough.
                let expected_federated_capacity = initial_fetch_limit.saturating_mul(readers.len());
                let federated_initial_capacity_reached = if session_path_filter_active {
                    initial_hit_count >= initial_fetch_limit.min(expected_federated_capacity)
                } else {
                    initial_hit_count == expected_federated_capacity
                };
                let needs_retry = deduped_len < target_hits
                    && federated_initial_capacity_reached
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        shards = readers.len(),
                        session_path_filter_active,
                        "retrying federated lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy_federated(
                        readers.as_ref(),
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    shards = readers.len(),
                    "federated lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                shards = readers.len(),
                "federated tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        }

        // SQLite-only path. A `*` anywhere except as a trailing run of a
        // token is treated as unsupported and yields no results.
        let unsupported_wildcards = sanitized.split_whitespace().any(|t| {
            let core = t.trim_end_matches('*');
            core.contains('*')
        });

        if unsupported_wildcards {
            return Ok(Vec::new());
        }

        let has_sqlite_backend = {
            let sqlite_guard = self
                .sqlite
                .lock()
                .map_err(|_| anyhow!("sqlite lock poisoned"))?;
            sqlite_guard.is_some() || self.sqlite_path.is_some()
        };

        if has_sqlite_backend {
            tracing::info!(
                backend = "sqlite-fts5",
                query = sanitized,
                limit = fallback_fetch_limit,
                offset = 0,
                "search_start"
            );
            let hits = self.search_sqlite_fts5(
                self.sqlite_path
                    .as_deref()
                    .unwrap_or_else(|| Path::new(":memory:")),
                query,
                filters.clone(),
                fallback_fetch_limit,
                0,
                field_mask,
            )?;
            let (_, paged_hits) =
                self.postprocess_hits_page(hits, &sanitized, &filters, limit, offset);

            if can_use_cache && offset == 0 {
                self.put_cache(&sanitized, &filters, &paged_hits);
            }
            return Ok(paged_hits);
        }

        tracing::info!(backend = "none", query = query, "search_start");
        Ok(Vec::new())
    }
3735
    /// Installs a semantic-search context backed by a single vector index.
    ///
    /// Convenience wrapper around [`SearchClient::set_semantic_indexes_context`]
    /// that wraps `fs_semantic_index` in a one-element vector.
    pub fn set_semantic_context(
        &self,
        embedder: Arc<dyn Embedder>,
        fs_semantic_index: VectorIndex,
        filter_maps: SemanticFilterMaps,
        roles: Option<HashSet<u8>>,
        ann_path: Option<PathBuf>,
    ) -> Result<()> {
        self.set_semantic_indexes_context(
            embedder,
            vec![fs_semantic_index],
            filter_maps,
            roles,
            ann_path,
        )
    }
3752
    /// Installs a semantic-search context backed by one or more vector indexes,
    /// replacing any existing context.
    ///
    /// Every index must have been built with the given `embedder` (same id and
    /// dimension). `ann_path` is honoured only when exactly one index is
    /// supplied; with several shards it is discarded.
    ///
    /// # Errors
    /// Fails when `fs_semantic_indexes` is empty, when any index's embedder id
    /// or dimension differs from the embedder's, or when the semantic mutex is
    /// poisoned.
    pub fn set_semantic_indexes_context(
        &self,
        embedder: Arc<dyn Embedder>,
        fs_semantic_indexes: Vec<VectorIndex>,
        filter_maps: SemanticFilterMaps,
        roles: Option<HashSet<u8>>,
        ann_path: Option<PathBuf>,
    ) -> Result<()> {
        if fs_semantic_indexes.is_empty() {
            bail!("semantic context requires at least one vector index");
        }

        // Validate every shard against the embedder before storing anything.
        let fs_semantic_indexes = fs_semantic_indexes
            .into_iter()
            .map(|index| {
                let embedder_id = index.embedder_id().to_string();
                let dimension = index.dimension();
                if embedder_id != embedder.id() {
                    bail!(
                        "embedder mismatch: index uses {}, embedder is {}",
                        embedder_id,
                        embedder.id()
                    );
                }
                if dimension != embedder.dimension() {
                    bail!(
                        "embedder dimension mismatch: index uses {}, embedder is {}",
                        dimension,
                        embedder.dimension()
                    );
                }
                Ok(Arc::new(index))
            })
            .collect::<Result<Vec<_>>>()?;
        // Non-empty is guaranteed by the check above, so `[0]` cannot panic.
        let fs_semantic_index = Arc::clone(&fs_semantic_indexes[0]);
        let shard_count = fs_semantic_indexes.len();
        let ann_path = if shard_count == 1 { ann_path } else { None };
        let embedder_id = fs_semantic_index.embedder_id().to_string();
        let dimension = fs_semantic_index.dimension();
        let fs_semantic_indexes = Arc::new(fs_semantic_indexes);

        let capacity = NonZeroUsize::new(100).ok_or_else(|| anyhow!("invalid cache size"))?;
        // A fresh token is created for every installed context.
        let context_token = Arc::new(());
        let mut state_guard = self
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        *state_guard = Some(SemanticSearchState {
            context_token,
            embedder,
            fs_semantic_index,
            fs_semantic_indexes,
            fs_ann_index: None,
            ann_path,
            fs_in_memory_two_tier_index: None,
            in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable::default(),
            progressive_context: None,
            progressive_context_unavailable: false,
            filter_maps,
            roles,
            query_cache: QueryCache::new(embedder_id.as_str(), capacity),
        });
        if shard_count > 1 {
            tracing::info!(
                shard_count,
                dimension,
                embedder = embedder_id,
                "semantic search context loaded sharded vector generation"
            );
        }
        Ok(())
    }
3825
3826 pub fn clear_semantic_context(&self) -> Result<()> {
3827 let mut guard = self
3828 .semantic
3829 .lock()
3830 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3831 *guard = None;
3832 Ok(())
3833 }
3834
3835 fn semantic_context_matches(&self, context_token: &Arc<()>) -> Result<bool> {
3836 let guard = self
3837 .semantic
3838 .lock()
3839 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3840 Ok(guard
3841 .as_ref()
3842 .is_some_and(|state| Arc::ptr_eq(&state.context_token, context_token)))
3843 }
3844
3845 fn semantic_query_embedding(&self, canonical: &str) -> Result<SemanticQueryEmbedding> {
3846 loop {
3847 let (embedder, context_token) = {
3848 let mut guard = self
3849 .semantic
3850 .lock()
3851 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3852 let state = guard.as_mut().ok_or_else(|| {
3853 anyhow!("semantic search unavailable (no embedder or vector index)")
3854 })?;
3855 if let Some(hit) = state
3856 .query_cache
3857 .get_cached(state.embedder.as_ref(), canonical)
3858 {
3859 return Ok(SemanticQueryEmbedding {
3860 context_token: Arc::clone(&state.context_token),
3861 vector: hit,
3862 });
3863 }
3864 (
3865 Arc::clone(&state.embedder),
3866 Arc::clone(&state.context_token),
3867 )
3868 };
3869
3870 let embedding = embedder
3871 .embed_sync(canonical)
3872 .map_err(|e| anyhow!("embedding failed: {e}"))?;
3873
3874 let mut guard = self
3875 .semantic
3876 .lock()
3877 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3878 let state = guard.as_mut().ok_or_else(|| {
3879 anyhow!("semantic search unavailable (no embedder or vector index)")
3880 })?;
3881 if !Arc::ptr_eq(&state.context_token, &context_token) {
3882 continue;
3883 }
3884 if let Some(hit) = state
3885 .query_cache
3886 .get_cached(state.embedder.as_ref(), canonical)
3887 {
3888 return Ok(SemanticQueryEmbedding {
3889 context_token,
3890 vector: hit,
3891 });
3892 }
3893 state
3894 .query_cache
3895 .store(state.embedder.as_ref(), canonical, embedding.clone());
3896 return Ok(SemanticQueryEmbedding {
3897 context_token,
3898 vector: embedding,
3899 });
3900 }
3901 }
3902
3903 fn in_memory_two_tier_index(
3904 &self,
3905 tier_mode: SemanticTierMode,
3906 ) -> Result<Option<Arc<FsInMemoryTwoTierIndex>>> {
3907 loop {
3908 let (ann_path, embedder_id, context_token) = {
3909 let mut guard = self
3910 .semantic
3911 .lock()
3912 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3913 let state = guard.as_mut().ok_or_else(|| {
3914 anyhow!("semantic search unavailable (no embedder or vector index)")
3915 })?;
3916 if let Some(index) = state.fs_in_memory_two_tier_index.as_ref()
3917 && two_tier_index_supports_mode(index.as_ref(), tier_mode)
3918 {
3919 return Ok(Some(Arc::clone(index)));
3920 }
3921 if state
3922 .in_memory_two_tier_unavailable
3923 .is_known_unavailable(tier_mode)
3924 {
3925 return Ok(None);
3926 }
3927 (
3928 state.ann_path.clone(),
3929 state.embedder.id().to_string(),
3930 Arc::clone(&state.context_token),
3931 )
3932 };
3933
3934 let index = build_in_memory_two_tier_index(ann_path.clone(), &embedder_id, tier_mode);
3935
3936 let mut guard = self
3937 .semantic
3938 .lock()
3939 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3940 let state = guard.as_mut().ok_or_else(|| {
3941 anyhow!("semantic search unavailable (no embedder or vector index)")
3942 })?;
3943 if let Some(existing) = state.fs_in_memory_two_tier_index.as_ref()
3944 && two_tier_index_supports_mode(existing.as_ref(), tier_mode)
3945 {
3946 return Ok(Some(Arc::clone(existing)));
3947 }
3948 if !Arc::ptr_eq(&state.context_token, &context_token) {
3949 continue;
3950 }
3951 let Some(index) = index else {
3952 state
3953 .in_memory_two_tier_unavailable
3954 .mark_unavailable(tier_mode);
3955 return Ok(None);
3956 };
3957 if !two_tier_index_supports_mode(index.as_ref(), tier_mode) {
3958 state
3959 .in_memory_two_tier_unavailable
3960 .mark_unavailable(tier_mode);
3961 return Ok(None);
3962 }
3963 state.fs_in_memory_two_tier_index = Some(Arc::clone(&index));
3964 if index.has_quality_index() {
3965 state.in_memory_two_tier_unavailable = InMemoryTwoTierUnavailable::default();
3966 } else {
3967 state.in_memory_two_tier_unavailable.fast_only = false;
3968 }
3969 return Ok(Some(index));
3970 }
3971 }
3972
3973 fn ann_index(&self) -> Result<Arc<FsHnswIndex>> {
3974 loop {
3975 let (ann_path, fs_semantic_index) = {
3976 let mut guard = self
3977 .semantic
3978 .lock()
3979 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3980 let state = guard.as_mut().ok_or_else(|| {
3981 anyhow!("semantic search unavailable (no embedder or vector index)")
3982 })?;
3983 if let Some(index) = state.fs_ann_index.as_ref() {
3984 return Ok(Arc::clone(index));
3985 }
3986 let ann_path = state.ann_path.clone().ok_or_else(|| {
3987 anyhow!(
3988 "approximate search unavailable: HNSW index missing (run 'cass index --semantic --build-hnsw')"
3989 )
3990 })?;
3991 (ann_path, Arc::clone(&state.fs_semantic_index))
3992 };
3993
3994 let ann = Arc::new(open_fs_semantic_ann_index(
3995 fs_semantic_index.as_ref(),
3996 &ann_path,
3997 )?);
3998
3999 let mut guard = self
4000 .semantic
4001 .lock()
4002 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4003 let state = guard.as_mut().ok_or_else(|| {
4004 anyhow!("semantic search unavailable (no embedder or vector index)")
4005 })?;
4006 if let Some(existing) = state.fs_ann_index.as_ref() {
4007 return Ok(Arc::clone(existing));
4008 }
4009 if state.ann_path.as_ref() != Some(&ann_path)
4010 || !Arc::ptr_eq(&state.fs_semantic_index, &fs_semantic_index)
4011 {
4012 continue;
4013 }
4014 state.fs_ann_index = Some(Arc::clone(&ann));
4015 return Ok(ann);
4016 }
4017 }
4018
4019 fn collapse_semantic_results(
4020 best_by_message: HashMap<u64, VectorSearchResult>,
4021 fetch_limit: usize,
4022 ) -> Vec<VectorSearchResult> {
4023 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
4024 collapsed.sort_by(|a, b| {
4025 b.score
4026 .total_cmp(&a.score)
4027 .then_with(|| a.message_id.cmp(&b.message_id))
4028 });
4029 if collapsed.len() > fetch_limit {
4030 collapsed.truncate(fetch_limit);
4031 }
4032 collapsed
4033 }
4034
4035 fn semantic_exact_candidate_limit(fetch_limit: usize, record_count: usize) -> usize {
4036 fetch_limit
4037 .saturating_mul(SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER)
4038 .max(fetch_limit)
4039 .min(record_count)
4040 }
4041
4042 fn semantic_window_may_omit_competitor(
4043 collapsed: &[VectorSearchResult],
4044 fetch_limit: usize,
4045 max_omitted_score: Option<f32>,
4046 ) -> bool {
4047 if fetch_limit == 0 {
4048 return false;
4049 }
4050 let Some(max_omitted_score) = max_omitted_score else {
4051 return false;
4052 };
4053 if collapsed.len() < fetch_limit {
4054 return true;
4055 }
4056 let Some(last_in_requested_window) = collapsed.get(fetch_limit - 1) else {
4057 return true;
4058 };
4059 !last_in_requested_window
4060 .score
4061 .total_cmp(&max_omitted_score)
4062 .is_gt()
4063 }
4064
4065 fn record_fs_semantic_hit(
4066 best_by_message: &mut HashMap<u64, VectorSearchResult>,
4067 hit: &FsVectorHit,
4068 ) {
4069 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4070 return;
4071 };
4072 best_by_message
4073 .entry(parsed.message_id)
4074 .and_modify(|entry| {
4075 if hit.score > entry.score {
4076 entry.score = hit.score;
4077 entry.chunk_idx = parsed.chunk_idx;
4078 }
4079 })
4080 .or_insert(VectorSearchResult {
4081 message_id: parsed.message_id,
4082 chunk_idx: parsed.chunk_idx,
4083 score: hit.score,
4084 });
4085 }
4086
4087 fn search_exact_semantic_indexes(
4088 context: &SemanticCandidateContext,
4089 embedding: &[f32],
4090 fetch_limit: usize,
4091 fs_filter: Option<&dyn FsSearchFilter>,
4092 ) -> Result<(Vec<VectorSearchResult>, SemanticCandidateRetryState)> {
4093 if context.fs_semantic_indexes.len() == 1 {
4094 let record_count = context.fs_semantic_index.record_count();
4095 let candidate_limit = Self::semantic_exact_candidate_limit(fetch_limit, record_count);
4096 let fs_hits = context
4097 .fs_semantic_index
4098 .search_top_k(embedding, candidate_limit, fs_filter)
4099 .map_err(|err| anyhow!("frankensearch semantic search failed: {err}"))?;
4100 let mut best_by_message = HashMap::with_capacity(fs_hits.len());
4101 for hit in &fs_hits {
4102 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4103 }
4104 let collapsed = Self::collapse_semantic_results(best_by_message, candidate_limit);
4105 let has_more_candidates =
4106 fs_hits.len() >= candidate_limit && candidate_limit < record_count;
4107 let max_omitted_score = if has_more_candidates {
4108 fs_hits.last().map(|hit| hit.score)
4109 } else {
4110 None
4111 };
4112 let exact_window_may_omit_competitor = Self::semantic_window_may_omit_competitor(
4113 &collapsed,
4114 fetch_limit,
4115 max_omitted_score,
4116 );
4117 return Ok((
4118 collapsed,
4119 SemanticCandidateRetryState {
4120 has_more_candidates,
4121 exact_window_may_omit_competitor,
4122 },
4123 ));
4124 }
4125
4126 let mut best_by_message = HashMap::new();
4127 let mut raw_hits = 0usize;
4128 let mut max_omitted_score: Option<f32> = None;
4129 let mut has_more_candidates = false;
4130 for index in context.fs_semantic_indexes.iter() {
4131 let shard_record_count = index.record_count();
4132 let shard_limit = Self::semantic_exact_candidate_limit(fetch_limit, shard_record_count);
4138 if shard_limit == 0 {
4139 continue;
4140 }
4141 let fs_hits = index
4142 .search_top_k(embedding, shard_limit, fs_filter)
4143 .map_err(|err| anyhow!("frankensearch sharded semantic search failed: {err}"))?;
4144 if fs_hits.len() >= shard_limit
4145 && shard_limit < shard_record_count
4146 && let Some(last_hit) = fs_hits.last()
4147 {
4148 has_more_candidates = true;
4149 max_omitted_score = Some(
4150 max_omitted_score
4151 .map(|current| current.max(last_hit.score))
4152 .unwrap_or(last_hit.score),
4153 );
4154 }
4155 raw_hits = raw_hits.saturating_add(fs_hits.len());
4156 best_by_message.reserve(fs_hits.len());
4157 for hit in &fs_hits {
4158 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4159 }
4160 }
4161 let candidate_return_limit = Self::semantic_exact_candidate_limit(fetch_limit, raw_hits);
4162 let collapsed = Self::collapse_semantic_results(best_by_message, candidate_return_limit);
4163 let exact_window_may_omit_competitor =
4164 Self::semantic_window_may_omit_competitor(&collapsed, fetch_limit, max_omitted_score);
4165 tracing::debug!(
4166 shard_count = context.fs_semantic_indexes.len(),
4167 raw_hits,
4168 returned = collapsed.len(),
4169 "semantic sharded exact merge complete"
4170 );
4171 Ok((
4172 collapsed,
4173 SemanticCandidateRetryState {
4174 has_more_candidates,
4175 exact_window_may_omit_competitor,
4176 },
4177 ))
4178 }
4179
4180 fn search_semantic_candidates(
4181 &self,
4182 context: &SemanticCandidateContext,
4183 embedding: &[f32],
4184 filters: &SearchFilters,
4185 request: SemanticCandidateSearchRequest<'_>,
4186 ) -> Result<(
4187 Vec<VectorSearchResult>,
4188 SemanticCandidateRetryState,
4189 Option<crate::search::ann_index::AnnSearchStats>,
4190 )> {
4191 let mut semantic_filter =
4192 SemanticFilter::from_search_filters(filters, &context.filter_maps)?;
4193 if let Some(roles) = context.roles.clone() {
4194 semantic_filter = semantic_filter.with_roles(Some(roles));
4195 }
4196
4197 if request.tier_mode.wants_two_tier() && !request.approximate {
4198 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4199 if let Some(two_tier_index) = request.in_memory_two_tier_index {
4200 let config = request.tier_mode.to_frankensearch_config();
4201 let searcher = FsSyncTwoTierSearcher::new(Arc::clone(two_tier_index), config);
4202 let (tier_hits, metrics) = searcher
4203 .search_collect_with_filter(embedding, request.fetch_limit, fs_filter)
4204 .map_err(|err| {
4205 anyhow!("frankensearch two-tier semantic search failed: {err}")
4206 })?;
4207
4208 tracing::debug!(
4209 tier_mode = ?request.tier_mode,
4210 phase1_ms = metrics.phase1_total_ms,
4211 phase2_ms = metrics.phase2_total_ms,
4212 skip_reason = ?metrics.skip_reason,
4213 returned = tier_hits.len(),
4214 "semantic two-tier search executed"
4215 );
4216
4217 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4218 HashMap::with_capacity(tier_hits.len());
4219 for hit in tier_hits.iter() {
4220 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4221 continue;
4222 };
4223 best_by_message
4224 .entry(parsed.message_id)
4225 .and_modify(|entry| {
4226 if hit.score > entry.score {
4227 entry.score = hit.score;
4228 entry.chunk_idx = parsed.chunk_idx;
4229 }
4230 })
4231 .or_insert(VectorSearchResult {
4232 message_id: parsed.message_id,
4233 chunk_idx: parsed.chunk_idx,
4234 score: hit.score,
4235 });
4236 }
4237
4238 return Ok((
4239 Self::collapse_semantic_results(best_by_message, request.fetch_limit),
4240 SemanticCandidateRetryState {
4241 has_more_candidates: tier_hits.len() >= request.fetch_limit,
4242 exact_window_may_omit_competitor: false,
4243 },
4244 None,
4245 ));
4246 }
4247
4248 tracing::debug!(
4249 tier_mode = ?request.tier_mode,
4250 "two-tier semantic unavailable; falling back to exact single-tier search"
4251 );
4252
4253 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4254 let (results, truncated) = Self::search_exact_semantic_indexes(
4255 context,
4256 embedding,
4257 request.fetch_limit,
4258 fs_filter,
4259 )?;
4260 return Ok((results, truncated, None));
4261 }
4262
4263 if request.approximate {
4264 if request.tier_mode.wants_two_tier() {
4265 tracing::debug!(
4266 tier_mode = ?request.tier_mode,
4267 "approximate search requested; bypassing two-tier mode"
4268 );
4269 }
4270
4271 let ann = request
4272 .ann_index
4273 .ok_or_else(|| anyhow!("HNSW index failed to initialize"))?;
4274 let candidate = request
4275 .fetch_limit
4276 .saturating_mul(ANN_CANDIDATE_MULTIPLIER)
4277 .max(request.fetch_limit);
4278 let ef = FS_HNSW_DEFAULT_EF_SEARCH.max(candidate);
4279 let (ann_results, search_stats) =
4280 ann.knn_search_with_stats(embedding, candidate, ef)
4281 .map_err(|err| anyhow!("frankensearch approximate search failed: {err}"))?;
4282 let ann_stats = Some(crate::search::ann_index::AnnSearchStats {
4283 index_size: search_stats.index_size,
4284 dimension: search_stats.dimension,
4285 ef_search: search_stats.ef_search,
4286 k_requested: search_stats.k_requested,
4287 k_returned: search_stats.k_returned,
4288 search_time_us: search_stats.search_time_us,
4289 estimated_recall: search_stats.estimated_recall as f32,
4290 is_approximate: search_stats.is_approximate,
4291 });
4292
4293 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4294
4295 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4296 HashMap::with_capacity(ann_results.len());
4297 for hit in ann_results.iter() {
4298 if let Some(filter) = fs_filter
4299 && !filter.matches(&hit.doc_id, None)
4300 {
4301 continue;
4302 }
4303 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4304 continue;
4305 };
4306 best_by_message
4307 .entry(parsed.message_id)
4308 .and_modify(|entry| {
4309 if hit.score > entry.score {
4310 entry.score = hit.score;
4311 entry.chunk_idx = parsed.chunk_idx;
4312 }
4313 })
4314 .or_insert(VectorSearchResult {
4315 message_id: parsed.message_id,
4316 chunk_idx: parsed.chunk_idx,
4317 score: hit.score,
4318 });
4319 }
4320
4321 return Ok((
4322 Self::collapse_semantic_results(best_by_message, request.fetch_limit),
4323 SemanticCandidateRetryState {
4324 has_more_candidates: ann_results.len() >= candidate,
4325 exact_window_may_omit_competitor: false,
4326 },
4327 ann_stats,
4328 ));
4329 }
4330
4331 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4332 let (results, truncated) = Self::search_exact_semantic_indexes(
4333 context,
4334 embedding,
4335 request.fetch_limit,
4336 fs_filter,
4337 )?;
4338 Ok((results, truncated, None))
4339 }
4340
4341 pub fn can_progressively_refine(&self) -> bool {
4342 self.progressive_context()
4343 .map(|context| {
4344 context.as_ref().is_some_and(|ctx| {
4345 ctx.quality_embedder.is_some() && ctx.index.has_quality_index()
4346 })
4347 })
4348 .unwrap_or(false)
4349 }
4350
4351 fn progressive_context(&self) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4352 loop {
4353 let (ann_path, embedder, context_token) = {
4354 let mut guard = self
4355 .semantic
4356 .lock()
4357 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4358 let state = guard.as_mut().ok_or_else(|| {
4359 anyhow!("semantic search unavailable (no embedder or vector index)")
4360 })?;
4361 if let Some(context) = state.progressive_context.as_ref() {
4362 return Ok(Some(Arc::clone(context)));
4363 }
4364 if state.progressive_context_unavailable {
4365 return Ok(None);
4366 }
4367 (
4368 state.ann_path.clone(),
4369 Arc::clone(&state.embedder),
4370 Arc::clone(&state.context_token),
4371 )
4372 };
4373
4374 let context = match self.build_progressive_context(
4375 ann_path.clone(),
4376 embedder,
4377 Arc::clone(&context_token),
4378 ) {
4379 Ok(context) => context,
4380 Err(err) => {
4381 let mut guard = self
4382 .semantic
4383 .lock()
4384 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4385 let state = guard.as_mut().ok_or_else(|| {
4386 anyhow!("semantic search unavailable (no embedder or vector index)")
4387 })?;
4388 if let Some(existing) = state.progressive_context.as_ref() {
4389 return Ok(Some(Arc::clone(existing)));
4390 }
4391 if !Arc::ptr_eq(&state.context_token, &context_token) {
4392 continue;
4393 }
4394 return Err(err);
4395 }
4396 };
4397
4398 let Some(context) = context else {
4399 let mut guard = self
4400 .semantic
4401 .lock()
4402 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4403 let state = guard.as_mut().ok_or_else(|| {
4404 anyhow!("semantic search unavailable (no embedder or vector index)")
4405 })?;
4406 if let Some(existing) = state.progressive_context.as_ref() {
4407 return Ok(Some(Arc::clone(existing)));
4408 }
4409 if !Arc::ptr_eq(&state.context_token, &context_token) {
4410 continue;
4411 }
4412 state.progressive_context_unavailable = true;
4413 return Ok(None);
4414 };
4415
4416 let mut guard = self
4417 .semantic
4418 .lock()
4419 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4420 let state = guard.as_mut().ok_or_else(|| {
4421 anyhow!("semantic search unavailable (no embedder or vector index)")
4422 })?;
4423 if let Some(existing) = state.progressive_context.as_ref() {
4424 return Ok(Some(Arc::clone(existing)));
4425 }
4426 if !Arc::ptr_eq(&state.context_token, &context_token) {
4427 continue;
4428 }
4429 state.progressive_context_unavailable = false;
4430 state.progressive_context = Some(Arc::clone(&context));
4431 return Ok(Some(context));
4432 }
4433 }
4434
4435 fn build_progressive_context(
4436 &self,
4437 ann_path: Option<PathBuf>,
4438 embedder: Arc<dyn Embedder>,
4439 context_token: Arc<()>,
4440 ) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4441 let Some(index_dir) = ann_path
4442 .as_ref()
4443 .and_then(|path| path.parent().map(Path::to_path_buf))
4444 else {
4445 return Ok(None);
4446 };
4447
4448 let fast_path = {
4449 let explicit = index_dir.join("vector.fast.idx");
4450 if explicit.is_file() {
4451 explicit
4452 } else {
4453 let fallback = index_dir.join("vector.idx");
4454 if fallback.is_file() {
4455 fallback
4456 } else {
4457 return Ok(None);
4458 }
4459 }
4460 };
4461 let quality_path = index_dir.join("vector.quality.idx");
4462 if !quality_path.is_file() {
4463 return Ok(None);
4464 }
4465
4466 let fast_index = FsVectorIndex::open(&fast_path)
4467 .map_err(|err| anyhow!("open fast-tier index failed: {err}"))?;
4468 let quality_index = FsVectorIndex::open(&quality_path)
4469 .map_err(|err| anyhow!("open quality-tier index failed: {err}"))?;
4470 let index = Arc::new(
4471 FsTwoTierIndex::open(&index_dir, frankensearch_two_tier_config())
4472 .map_err(|err| anyhow!("open progressive two-tier index failed: {err}"))?,
4473 );
4474
4475 let fast_embedder = self.load_embedder_for_progressive_id(
4476 &embedder,
4477 fast_index.embedder_id(),
4478 fast_index.dimension(),
4479 )?;
4480 let fast_embedder: Arc<dyn frankensearch::Embedder> = Arc::new(FsSyncEmbedderAdapter(
4481 SharedCassSyncEmbedder::new(fast_embedder),
4482 ));
4483 let quality_embedder = Some(self.load_embedder_for_progressive_id(
4484 &embedder,
4485 quality_index.embedder_id(),
4486 quality_index.dimension(),
4487 )?);
4488 let quality_embedder = quality_embedder.map(|embedder| {
4489 Arc::new(FsSyncEmbedderAdapter(SharedCassSyncEmbedder::new(embedder)))
4490 as Arc<dyn frankensearch::Embedder>
4491 });
4492
4493 Ok(Some(Arc::new(ProgressiveTwoTierContext {
4494 context_token,
4495 index,
4496 fast_embedder,
4497 quality_embedder,
4498 })))
4499 }
4500
4501 fn load_embedder_for_progressive_id(
4502 &self,
4503 current_embedder: &Arc<dyn Embedder>,
4504 embedder_id: &str,
4505 dimension: usize,
4506 ) -> Result<Arc<dyn Embedder>> {
4507 if current_embedder.id() == embedder_id {
4508 return Ok(Arc::clone(current_embedder));
4509 }
4510
4511 if let Some(dim) = embedder_id.strip_prefix("fnv1a-")
4512 && let Ok(parsed) = dim.parse::<usize>()
4513 {
4514 return Ok(Arc::new(crate::search::hash_embedder::HashEmbedder::new(
4515 parsed.max(dimension),
4516 )));
4517 }
4518
4519 if let Some(embedder_name) =
4520 crate::search::fastembed_embedder::FastEmbedder::canonical_name(embedder_id)
4521 {
4522 let data_dir = self
4523 .sqlite_path
4524 .as_ref()
4525 .and_then(|path| path.parent())
4526 .ok_or_else(|| anyhow!("cannot resolve data dir for progressive embedder load"))?;
4527 let embedder = crate::search::fastembed_embedder::FastEmbedder::load_by_name(
4528 data_dir,
4529 embedder_name,
4530 )
4531 .with_context(|| format!("loading FastEmbed model for {embedder_name}"))?;
4532 if embedder.dimension() != dimension {
4533 bail!(
4534 "progressive embedder dimension mismatch: {} index expects {}, model has {}",
4535 embedder_id,
4536 dimension,
4537 embedder.dimension()
4538 );
4539 }
4540 return Ok(Arc::new(embedder));
4541 }
4542
4543 bail!("unsupported progressive embedder id: {embedder_id}");
4544 }
4545
4546 fn resolve_semantic_doc_ids_for_hits(
4547 &self,
4548 hits: &[SearchHit],
4549 ) -> Result<Vec<Option<ResolvedSemanticDocId>>> {
4550 if hits.is_empty() {
4551 return Ok(Vec::new());
4552 }
4553
4554 let lookup_keys: Vec<Option<ProgressiveLookupKey>> = hits
4555 .iter()
4556 .map(|hit| {
4557 let idx = hit
4558 .line_number
4559 .and_then(|line| line.checked_sub(1))
4560 .map(i64::try_from)
4561 .transpose()
4562 .ok()
4563 .flatten()?;
4564 Some((
4565 normalized_search_hit_source_id(hit),
4566 hit.source_path.clone(),
4567 hit.conversation_id,
4568 hit.title.trim().to_string(),
4569 idx,
4570 hit.created_at,
4571 hit.content_hash,
4572 ))
4573 })
4574 .collect();
4575
4576 let mut seen_exact = HashSet::new();
4577 let mut exact_query_keys = Vec::new();
4578 let mut seen_fallback = HashSet::new();
4579 let mut fallback_query_keys = Vec::new();
4580 for (source_id, source_path, conversation_id, _title, idx, _created_at, _content_hash) in
4581 lookup_keys.iter().flatten()
4582 {
4583 if let Some(conversation_id) = conversation_id {
4584 let query_key: ProgressiveExactQueryKey = (*conversation_id, *idx);
4585 if seen_exact.insert(query_key) {
4586 exact_query_keys.push(query_key);
4587 }
4588 } else {
4589 let query_key: ProgressiveFallbackQueryKey =
4590 (source_id.clone(), source_path.clone(), *idx);
4591 if seen_fallback.insert(query_key.clone()) {
4592 fallback_query_keys.push(query_key);
4593 }
4594 }
4595 }
4596
4597 if exact_query_keys.is_empty() && fallback_query_keys.is_empty() {
4598 return Ok(vec![None; hits.len()]);
4599 }
4600
4601 let sqlite_guard = self.sqlite_guard()?;
4602 let conn = sqlite_guard
4603 .as_ref()
4604 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4605
4606 let mut resolved_by_key = HashMap::new();
4607 let normalized_source_sql =
4608 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4609
4610 const CHUNK_SIZE: usize = 300;
4611 for chunk in exact_query_keys.chunks(CHUNK_SIZE) {
4612 let mut sql = String::from("SELECT c.id, ");
4613 sql.push_str(&normalized_source_sql);
4614 sql.push_str(
4615 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4616 FROM messages m
4617 JOIN conversations c ON m.conversation_id = c.id
4618 LEFT JOIN sources s ON c.source_id = s.id
4619 WHERE ",
4620 );
4621 let mut params = Vec::with_capacity(chunk.len().saturating_mul(2));
4622 for (idx, (conversation_id, line_idx)) in chunk.iter().enumerate() {
4623 if idx > 0 {
4624 sql.push_str(" OR ");
4625 }
4626 sql.push_str("(c.id = ? AND m.idx = ?)");
4627 params.push(ParamValue::from(*conversation_id));
4628 params.push(ParamValue::from(*line_idx));
4629 }
4630
4631 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4632 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4633 let conversation_id: i64 = row.get_typed(0)?;
4634 let source_id: String = row.get_typed(1)?;
4635 let source_path: String = row.get_typed(2)?;
4636 let idx: i64 = row.get_typed(3)?;
4637 let message_id_raw: i64 = row.get_typed(4)?;
4638 let agent_id_raw: Option<i64> = row.get_typed(5)?;
4641 let workspace_id_raw: Option<i64> = row.get_typed(6)?;
4642 let role_raw: String = row.get_typed(7)?;
4643 let created_at_ms: Option<i64> = row.get_typed(8)?;
4644 let content: String = row.get_typed(9)?;
4645 let title: Option<String> = row.get_typed(10)?;
4646
4647 let canonical = canonicalize_for_embedding(&content);
4648 if canonical.is_empty() {
4649 return Ok(None);
4650 }
4651
4652 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4653 std::io::Error::other("message id out of range for progressive doc_id")
4654 })?;
4655 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4656 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4657 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4658 let doc_id = SemanticDocId {
4659 message_id,
4660 chunk_idx: 0,
4661 agent_id,
4662 workspace_id,
4663 source_id: crc32fast::hash(source_id.as_bytes()),
4664 role,
4665 created_at_ms: created_at_ms.unwrap_or(0),
4666 content_hash: Some(content_hash(&canonical)),
4667 }
4668 .to_doc_id_string();
4669 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4670 let lookup_key = (
4671 source_id,
4672 source_path.clone(),
4673 Some(conversation_id),
4674 title.unwrap_or_default().trim().to_string(),
4675 idx,
4676 created_at_ms,
4677 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4678 );
4679
4680 Ok(Some((
4681 lookup_key,
4682 ResolvedSemanticDocId { message_id, doc_id },
4683 )))
4684 })?;
4685
4686 for row in chunk_rows.into_iter().flatten() {
4687 resolved_by_key.insert(row.0, row.1);
4688 }
4689 }
4690
4691 for chunk in fallback_query_keys.chunks(CHUNK_SIZE) {
4692 let mut sql = String::from("SELECT ");
4693 sql.push_str(&normalized_source_sql);
4694 sql.push_str(
4695 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4696 FROM messages m
4697 JOIN conversations c ON m.conversation_id = c.id
4698 LEFT JOIN sources s ON c.source_id = s.id
4699 WHERE ",
4700 );
4701 let mut params = Vec::with_capacity(chunk.len().saturating_mul(3));
4702 for (idx, (source_id, source_path, line_idx)) in chunk.iter().enumerate() {
4703 if idx > 0 {
4704 sql.push_str(" OR ");
4705 }
4706 sql.push_str(&format!(
4707 "({normalized_source_sql} = ? AND c.source_path = ? AND m.idx = ?)"
4708 ));
4709 params.push(ParamValue::from(normalize_search_source_filter_value(
4710 source_id,
4711 )));
4712 params.push(ParamValue::from(source_path.clone()));
4713 params.push(ParamValue::from(*line_idx));
4714 }
4715
4716 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4717 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4718 let source_id: String = row.get_typed(0)?;
4719 let source_path: String = row.get_typed(1)?;
4720 let idx: i64 = row.get_typed(2)?;
4721 let message_id_raw: i64 = row.get_typed(3)?;
4722 let agent_id_raw: Option<i64> = row.get_typed(4)?;
4725 let workspace_id_raw: Option<i64> = row.get_typed(5)?;
4726 let role_raw: String = row.get_typed(6)?;
4727 let created_at_ms: Option<i64> = row.get_typed(7)?;
4728 let content: String = row.get_typed(8)?;
4729 let title: Option<String> = row.get_typed(9)?;
4730
4731 let canonical = canonicalize_for_embedding(&content);
4732 if canonical.is_empty() {
4733 return Ok(None);
4734 }
4735
4736 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4737 std::io::Error::other("message id out of range for progressive doc_id")
4738 })?;
4739 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4740 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4741 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4742 let doc_id = SemanticDocId {
4743 message_id,
4744 chunk_idx: 0,
4745 agent_id,
4746 workspace_id,
4747 source_id: crc32fast::hash(source_id.as_bytes()),
4748 role,
4749 created_at_ms: created_at_ms.unwrap_or(0),
4750 content_hash: Some(content_hash(&canonical)),
4751 }
4752 .to_doc_id_string();
4753 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4754 let lookup_key = (
4755 source_id,
4756 source_path.clone(),
4757 None,
4758 title.unwrap_or_default().trim().to_string(),
4759 idx,
4760 created_at_ms,
4761 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4762 );
4763
4764 Ok(Some((
4765 lookup_key,
4766 ResolvedSemanticDocId { message_id, doc_id },
4767 )))
4768 })?;
4769
4770 for row in chunk_rows.into_iter().flatten() {
4771 resolved_by_key.insert(row.0, row.1);
4772 }
4773 }
4774
4775 Ok(lookup_keys
4776 .into_iter()
4777 .map(|key| key.and_then(|lookup| resolved_by_key.get(&lookup).cloned()))
4778 .collect())
4779 }
4780
4781 fn load_message_text_by_id(&self, message_id: u64) -> Result<Option<String>> {
4782 let sqlite_guard = self.sqlite_guard()?;
4783 let conn = sqlite_guard
4784 .as_ref()
4785 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4786 let rows: Vec<String> = conn.query_map_collect(
4787 "SELECT content FROM messages WHERE id = ?",
4788 &[ParamValue::from(i64::try_from(message_id)?)],
4789 |row: &frankensqlite::Row| row.get_typed(0),
4790 )?;
4791 Ok(rows.into_iter().next())
4792 }
4793
4794 fn collapse_progressive_scored_results(
4795 &self,
4796 results: &[FsScoredResult],
4797 fetch_limit: usize,
4798 ) -> Vec<VectorSearchResult> {
4799 let fetch = fetch_limit.max(1);
4800 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4801 HashMap::with_capacity(results.len());
4802 for hit in results {
4803 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4804 continue;
4805 };
4806 best_by_message
4807 .entry(parsed.message_id)
4808 .and_modify(|entry| {
4809 if hit.score > entry.score {
4810 entry.score = hit.score;
4811 entry.chunk_idx = parsed.chunk_idx;
4812 }
4813 })
4814 .or_insert(VectorSearchResult {
4815 message_id: parsed.message_id,
4816 chunk_idx: parsed.chunk_idx,
4817 score: hit.score,
4818 });
4819 }
4820 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
4821 collapsed.sort_by(|a, b| {
4822 b.score
4823 .total_cmp(&a.score)
4824 .then_with(|| a.message_id.cmp(&b.message_id))
4825 });
4826 if collapsed.len() > fetch {
4827 collapsed.truncate(fetch);
4828 }
4829 collapsed
4830 }
4831
4832 fn hydrate_semantic_hits_with_ids(
4833 &self,
4834 results: &[VectorSearchResult],
4835 field_mask: FieldMask,
4836 ) -> Result<Vec<(u64, SearchHit)>> {
4837 if results.is_empty() {
4838 return Ok(Vec::new());
4839 }
4840 let sqlite_guard = self.sqlite_guard()?;
4841 let conn = sqlite_guard
4842 .as_ref()
4843 .ok_or_else(|| anyhow!("semantic search requires database connection"))?;
4844
4845 #[derive(Debug)]
4846 struct MessageHydrationRow {
4847 message_id: u64,
4848 conversation_id: i64,
4849 full_content: String,
4850 msg_created_at: Option<i64>,
4851 idx: Option<i64>,
4852 }
4853
4854 #[derive(Debug)]
4855 struct ConversationHydrationRow {
4856 title: Option<String>,
4857 source_path: String,
4858 source_id: String,
4859 origin_host: Option<String>,
4860 agent: String,
4861 workspace: Option<String>,
4862 origin_kind: Option<String>,
4863 started_at: Option<i64>,
4864 }
4865
4866 let mut unique_message_ids = Vec::with_capacity(results.len());
4867 let mut seen_message_ids = HashSet::with_capacity(results.len());
4868 for result in results {
4869 if seen_message_ids.insert(result.message_id) {
4870 unique_message_ids.push(result.message_id);
4871 }
4872 }
4873
4874 let message_placeholder_capacity =
4875 unique_message_ids.len().saturating_mul(2).saturating_sub(1);
4876 let mut message_placeholders = String::with_capacity(message_placeholder_capacity);
4877 let mut message_params: Vec<ParamValue> = Vec::with_capacity(unique_message_ids.len());
4878 for (idx, message_id) in unique_message_ids.iter().enumerate() {
4879 if idx > 0 {
4880 message_placeholders.push(',');
4881 }
4882 message_placeholders.push('?');
4883 message_params.push(ParamValue::from(i64::try_from(*message_id)?));
4884 }
4885
4886 let message_sql = format!(
4887 "SELECT id, conversation_id, content, created_at, idx
4888 FROM messages
4889 WHERE id IN ({message_placeholders})"
4890 );
4891
4892 let message_rows: Vec<MessageHydrationRow> =
4893 conn.query_map_collect(&message_sql, &message_params, |row: &frankensqlite::Row| {
4894 let message_id: i64 = row.get_typed(0)?;
4895 Ok(MessageHydrationRow {
4896 message_id: semantic_message_id_from_db(message_id)?,
4897 conversation_id: row.get_typed(1)?,
4898 full_content: row.get_typed(2)?,
4899 msg_created_at: row.get_typed(3)?,
4900 idx: row.get_typed(4)?,
4901 })
4902 })?;
4903 if message_rows.is_empty() {
4904 return Ok(Vec::new());
4905 }
4906
4907 let title_expr = if field_mask.wants_title() {
4908 "c.title"
4909 } else {
4910 "''"
4911 };
4912 let normalized_source_sql =
4913 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4914 let mut conversation_ids = Vec::with_capacity(message_rows.len());
4915 let mut seen_conversation_ids = HashSet::with_capacity(message_rows.len());
4916 for row in &message_rows {
4917 if seen_conversation_ids.insert(row.conversation_id) {
4918 conversation_ids.push(row.conversation_id);
4919 }
4920 }
4921 let conversation_placeholder_capacity =
4922 conversation_ids.len().saturating_mul(2).saturating_sub(1);
4923 let mut conversation_placeholders =
4924 String::with_capacity(conversation_placeholder_capacity);
4925 let mut conversation_params: Vec<ParamValue> = Vec::with_capacity(conversation_ids.len());
4926 for (idx, conversation_id) in conversation_ids.iter().enumerate() {
4927 if idx > 0 {
4928 conversation_placeholders.push(',');
4929 }
4930 conversation_placeholders.push('?');
4931 conversation_params.push(ParamValue::from(*conversation_id));
4932 }
4933 let sql = format!(
4938 "SELECT c.id, {title_expr}, c.source_path, {normalized_source_sql}, c.origin_host, COALESCE(a.slug, 'unknown'), w.path, s.kind, c.started_at
4939 FROM conversations c
4940 LEFT JOIN agents a ON c.agent_id = a.id
4941 LEFT JOIN workspaces w ON c.workspace_id = w.id
4942 LEFT JOIN sources s ON c.source_id = s.id
4943 WHERE c.id IN ({conversation_placeholders})"
4944 );
4945
4946 let conversation_rows: Vec<(i64, ConversationHydrationRow)> =
4947 conn.query_map_collect(&sql, &conversation_params, |row: &frankensqlite::Row| {
4948 let conversation_id: i64 = row.get_typed(0)?;
4949 let title: Option<String> = if field_mask.wants_title() {
4950 row.get_typed(1)?
4951 } else {
4952 None
4953 };
4954 Ok((
4955 conversation_id,
4956 ConversationHydrationRow {
4957 title,
4958 source_path: row.get_typed(2)?,
4959 source_id: row.get_typed(3)?,
4960 origin_host: row.get_typed(4)?,
4961 agent: row.get_typed(5)?,
4962 workspace: row.get_typed(6)?,
4963 origin_kind: row.get_typed(7)?,
4964 started_at: row.get_typed(8)?,
4965 },
4966 ))
4967 })?;
4968
4969 let conversations_by_id: HashMap<i64, ConversationHydrationRow> =
4970 conversation_rows.into_iter().collect();
4971
4972 let rows: Vec<(u64, SearchHit)> = message_rows
4973 .into_iter()
4974 .filter_map(|message| {
4975 let conversation = conversations_by_id.get(&message.conversation_id)?;
4976
4977 let created_at = message.msg_created_at.or(conversation.started_at);
4978 let line_number = message
4979 .idx
4980 .and_then(|i| usize::try_from(i).ok())
4981 .map(|i| i.saturating_add(1));
4982 let snippet = if field_mask.wants_snippet() {
4983 snippet_from_content(&message.full_content)
4984 } else {
4985 String::new()
4986 };
4987 let content = if field_mask.needs_content() {
4988 message.full_content.clone()
4989 } else {
4990 String::new()
4991 };
4992 let content_hash = stable_hit_hash(
4993 &message.full_content,
4994 &conversation.source_path,
4995 line_number,
4996 created_at,
4997 );
4998 let source_id = normalized_search_hit_source_id_parts(
4999 conversation.source_id.as_str(),
5000 conversation.origin_kind.as_deref().unwrap_or_default(),
5001 conversation.origin_host.as_deref(),
5002 );
5003 let origin_kind = normalized_search_hit_origin_kind(
5004 &source_id,
5005 conversation.origin_kind.as_deref(),
5006 );
5007
5008 let hit = SearchHit {
5009 title: if field_mask.wants_title() {
5010 conversation.title.clone().unwrap_or_default()
5011 } else {
5012 String::new()
5013 },
5014 snippet,
5015 content,
5016 content_hash,
5017 conversation_id: Some(message.conversation_id),
5018 score: 0.0,
5019 source_path: conversation.source_path.clone(),
5020 agent: conversation.agent.clone(),
5021 workspace: conversation.workspace.clone().unwrap_or_default(),
5022 workspace_original: None,
5023 created_at,
5024 line_number,
5025 match_type: MatchType::Exact,
5026 source_id,
5027 origin_kind,
5028 origin_host: conversation.origin_host.clone(),
5029 };
5030
5031 Some((message.message_id, hit))
5032 })
5033 .collect();
5034
5035 let mut hits_by_id = HashMap::new();
5036 for (id, hit) in rows {
5037 hits_by_id.insert(id, hit);
5038 }
5039
5040 let mut ordered = Vec::new();
5041 for result in results {
5042 if let Some(mut hit) = hits_by_id.remove(&result.message_id) {
5043 hit.score = result.score;
5044 ordered.push((result.message_id, hit));
5045 }
5046 }
5047
5048 Ok(ordered)
5049 }
5050
5051 fn overlay_progressive_lexical_hit(
5052 &self,
5053 hit: &mut SearchHit,
5054 lexical: &ProgressiveLexicalHit,
5055 field_mask: FieldMask,
5056 ) {
5057 if field_mask.wants_title() && !lexical.title.is_empty() {
5058 hit.title = lexical.title.clone();
5059 }
5060 if field_mask.wants_snippet() && !lexical.snippet.is_empty() {
5061 hit.snippet = lexical.snippet.clone();
5062 }
5063 if field_mask.needs_content() && !lexical.content.is_empty() {
5064 hit.content = lexical.content.clone();
5065 }
5066 hit.match_type = lexical.match_type;
5067 hit.line_number = lexical.line_number.or(hit.line_number);
5068 }
5069
5070 fn progressive_phase_to_result(
5071 &self,
5072 results: &[FsScoredResult],
5073 ctx: ProgressivePhaseContext<'_>,
5074 ) -> Result<SearchResult> {
5075 let collapsed = self.collapse_progressive_scored_results(results, ctx.fetch_limit);
5076 let missing: Vec<VectorSearchResult> = collapsed
5077 .iter()
5078 .filter(|result| {
5079 ctx.lexical_cache
5080 .and_then(|cache| cache.hits_by_message.get(&result.message_id))
5081 .is_none()
5082 })
5083 .map(|result| VectorSearchResult {
5084 message_id: result.message_id,
5085 chunk_idx: result.chunk_idx,
5086 score: result.score,
5087 })
5088 .collect();
5089 let mut hydrated_by_id: HashMap<u64, SearchHit> = self
5090 .hydrate_semantic_hits_with_ids(&missing, ctx.field_mask)?
5091 .into_iter()
5092 .collect();
5093
5094 let mut hydrated: Vec<(u64, SearchHit)> = Vec::with_capacity(collapsed.len());
5095 for result in &collapsed {
5096 if let Some(cache) = ctx.lexical_cache
5097 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
5098 {
5099 hydrated.push((result.message_id, lexical.to_search_hit(result.score)));
5100 continue;
5101 }
5102 if let Some(mut hit) = hydrated_by_id.remove(&result.message_id) {
5103 if let Some(cache) = ctx.lexical_cache
5104 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
5105 {
5106 self.overlay_progressive_lexical_hit(&mut hit, lexical, ctx.field_mask);
5107 }
5108 hydrated.push((result.message_id, hit));
5109 }
5110 }
5111
5112 let mut hits: Vec<SearchHit> = hydrated.into_iter().map(|(_, hit)| hit).collect();
5113 (_, hits) = self.postprocess_hits_page(hits, ctx.query, ctx.filters, ctx.limit, 0);
5114
5115 let (wildcard_fallback, suggestions) = ctx
5116 .lexical_cache
5117 .map(|cache| {
5118 let suggestions = if hits.is_empty() {
5119 cache.suggestions.clone()
5120 } else {
5121 Vec::new()
5122 };
5123 (cache.wildcard_fallback, suggestions)
5124 })
5125 .unwrap_or((false, Vec::new()));
5126
5127 Ok(SearchResult {
5128 hits,
5129 wildcard_fallback,
5130 cache_stats: self.cache_stats(),
5131 suggestions,
5132 ann_stats: None,
5133 total_count: None,
5134 })
5135 }
5136
    /// Runs a progressive search and reports each phase through `on_event`.
    ///
    /// * `SearchMode::Lexical` — performs a single `search_with_fallback` call, emits one
    ///   `Phase` event of kind `Initial` carrying the measured elapsed time, and returns.
    /// * `SearchMode::Semantic` / `SearchMode::Hybrid` — drives an `FsTwoTierSearcher` and
    ///   converts every phase it reports into a `ProgressiveSearchEvent`. In hybrid mode a
    ///   lexical adapter is attached to the searcher and shares a lexical cache with the
    ///   phase conversion.
    ///
    /// `limit` is clamped to at least `1`; the number of candidates requested from the
    /// searcher is derived from it via `progressive_phase_fetch_limit`.
    ///
    /// # Errors
    /// Returns an error when the lexical search fails, when the progressive two-tier context
    /// is unavailable, when the semantic context changes (or cannot be checked) while phases
    /// are being delivered, when converting a phase into a result fails, or when the
    /// underlying searcher reports a failure. An error recorded while handling a phase takes
    /// precedence over the searcher's own result.
    pub(crate) async fn search_progressive_with_callback(
        self: &Arc<Self>,
        request: ProgressiveSearchRequest<'_>,
        mut on_event: impl FnMut(ProgressiveSearchEvent) + Send,
    ) -> Result<()> {
        let ProgressiveSearchRequest {
            cx,
            query,
            filters,
            limit,
            sparse_threshold,
            field_mask,
            mode,
        } = request;
        let field_mask = effective_field_mask(field_mask);
        let limit = limit.max(1);
        let fetch_limit = progressive_phase_fetch_limit(limit);

        // Lexical mode has nothing to refine: one search, one event, done.
        match mode {
            SearchMode::Lexical => {
                let started = Instant::now();
                let result = self.search_with_fallback(
                    query,
                    filters,
                    limit,
                    0,
                    sparse_threshold,
                    field_mask,
                )?;
                on_event(ProgressiveSearchEvent::Phase {
                    kind: ProgressivePhaseKind::Initial,
                    elapsed_ms: started.elapsed().as_millis(),
                    result,
                });
                return Ok(());
            }
            SearchMode::Semantic | SearchMode::Hybrid => {}
        }

        let progressive_context = {
            self.progressive_context()?
                .ok_or_else(|| anyhow!("progressive two-tier context unavailable"))?
        };
        // Token captured up front; every phase is checked against it further down.
        let progressive_context_token = Arc::clone(&progressive_context.context_token);

        // Starts out as a default (empty) cache; shared with the lexical adapter in hybrid
        // mode and read back when phases are converted.
        let lexical_cache: Arc<Mutex<ProgressiveLexicalSnapshot>> =
            Arc::new(Mutex::new(Arc::new(ProgressiveLexicalCache::default())));
        // Per-call cache of message texts keyed by message id.
        let text_cache: Arc<Mutex<HashMap<u64, String>>> = Arc::new(Mutex::new(HashMap::new()));
        let text_client = Arc::clone(self);
        let text_cache_for_lookup = Arc::clone(&text_cache);
        // Resolves a doc id to its message text: cache first, database second. Any failure
        // (unparseable id, load error, missing row) yields `None`; a poisoned cache lock is
        // simply bypassed.
        let text_fn = move |doc_id: &str| -> Option<String> {
            let parsed = parse_semantic_doc_id(doc_id)?;
            if let Ok(cache) = text_cache_for_lookup.lock()
                && let Some(text) = cache.get(&parsed.message_id)
            {
                return Some(text.clone());
            }
            let loaded = text_client
                .load_message_text_by_id(parsed.message_id)
                .ok()
                .flatten()?;
            if let Ok(mut cache) = text_cache_for_lookup.lock() {
                cache.insert(parsed.message_id, loaded.clone());
            }
            Some(loaded)
        };

        let mut searcher = FsTwoTierSearcher::new(
            Arc::clone(&progressive_context.index),
            Arc::clone(&progressive_context.fast_embedder),
            frankensearch_two_tier_config(),
        );

        // The quality embedder is optional; attach it only when the context provides one.
        if let Some(quality_embedder) = progressive_context.quality_embedder.as_ref() {
            searcher = searcher.with_quality_embedder(Arc::clone(quality_embedder));
        }

        // Hybrid mode additionally feeds lexical results into the searcher.
        if matches!(mode, SearchMode::Hybrid) {
            let lexical = Arc::new(CassProgressiveLexicalAdapter::new(
                Arc::clone(self),
                filters.clone(),
                field_mask,
                sparse_threshold,
                Arc::clone(&lexical_cache),
            ));
            searcher = searcher.with_lexical(lexical);
        }

        let phase_client = Arc::clone(self);
        let phase_filters = filters.clone();
        let phase_cache = Arc::clone(&lexical_cache);
        // First error hit while handling a phase; once set, later phases are ignored.
        let mut phase_error: Option<anyhow::Error> = None;

        let search_result = searcher
            .search(cx, query, fetch_limit, text_fn, |phase| {
                if phase_error.is_some() {
                    return;
                }
                // Abort when the semantic context is no longer the one this search started
                // with, or when that cannot be determined.
                match phase_client.semantic_context_matches(&progressive_context_token) {
                    Ok(true) => {}
                    Ok(false) => {
                        phase_error = Some(anyhow!(
                            "progressive search aborted: semantic context changed"
                        ));
                        // NOTE(review): presumably signals the searcher to stop — confirm.
                        cx.set_cancel_requested(true);
                        return;
                    }
                    Err(err) => {
                        phase_error = Some(err);
                        cx.set_cancel_requested(true);
                        return;
                    }
                }
                // Snapshot of the lexical cache at this moment; `None` if the lock is poisoned.
                let lexical_snapshot = phase_cache.lock().ok().map(|guard| Arc::clone(&guard));
                let event_result = match phase {
                    FsSearchPhase::Initial {
                        results, latency, ..
                    } => phase_client
                        .progressive_phase_to_result(
                            &results,
                            ProgressivePhaseContext {
                                query,
                                filters: &phase_filters,
                                field_mask,
                                lexical_cache: lexical_snapshot.as_deref(),
                                limit,
                                fetch_limit,
                            },
                        )
                        .map(|result| ProgressiveSearchEvent::Phase {
                            kind: ProgressivePhaseKind::Initial,
                            elapsed_ms: latency.as_millis(),
                            result,
                        }),
                    FsSearchPhase::Refined {
                        results, latency, ..
                    } => phase_client
                        .progressive_phase_to_result(
                            &results,
                            ProgressivePhaseContext {
                                query,
                                filters: &phase_filters,
                                field_mask,
                                lexical_cache: lexical_snapshot.as_deref(),
                                limit,
                                fetch_limit,
                            },
                        )
                        .map(|result| ProgressiveSearchEvent::Phase {
                            kind: ProgressivePhaseKind::Refined,
                            elapsed_ms: latency.as_millis(),
                            result,
                        }),
                    // A reranked phase is reported to the caller with the `Refined` kind.
                    FsSearchPhase::Reranked {
                        results, latency, ..
                    } => phase_client
                        .progressive_phase_to_result(
                            &results,
                            ProgressivePhaseContext {
                                query,
                                filters: &phase_filters,
                                field_mask,
                                lexical_cache: lexical_snapshot.as_deref(),
                                limit,
                                fetch_limit,
                            },
                        )
                        .map(|result| ProgressiveSearchEvent::Phase {
                            kind: ProgressivePhaseKind::Refined,
                            elapsed_ms: latency.as_millis(),
                            result,
                        }),
                    // A failed refinement is forwarded as an event, not treated as an error.
                    FsSearchPhase::RefinementFailed { error, latency, .. } => {
                        Ok(ProgressiveSearchEvent::RefinementFailed {
                            latency_ms: latency.as_millis(),
                            error: error.to_string(),
                        })
                    }
                };

                match event_result {
                    Ok(event) => on_event(event),
                    Err(err) => {
                        phase_error = Some(err);
                        cx.set_cancel_requested(true);
                    }
                }
            })
            .await;

        // An error recorded inside the phase callback wins over the searcher's outcome.
        if let Some(err) = phase_error {
            return Err(err);
        }

        search_result
            .map(|_| ())
            .map_err(|err| anyhow!("progressive search failed: {err}"))
    }
5340
5341 pub fn search_semantic(
5343 &self,
5344 query: &str,
5345 filters: SearchFilters,
5346 limit: usize,
5347 offset: usize,
5348 field_mask: FieldMask,
5349 approximate: bool,
5350 ) -> Result<(
5351 Vec<SearchHit>,
5352 Option<crate::search::ann_index::AnnSearchStats>,
5353 )> {
5354 self.search_semantic_with_tier(
5355 query,
5356 filters,
5357 limit,
5358 offset,
5359 field_mask,
5360 approximate,
5361 SemanticTierMode::Single,
5362 )
5363 }
5364
    /// Semantic (vector) search with an explicit tier mode.
    ///
    /// The query is canonicalised for embedding; a query that is blank afterwards yields an
    /// empty result. A `limit` of `0` is replaced by the total document count, capped by
    /// `no_limit_result_cap()` and never below `1`. The candidate window is `limit + offset`
    /// (saturating); if that window turns out too small, one retry with three times the
    /// window is made.
    ///
    /// Returns the requested page of hits together with the ANN statistics reported by the
    /// candidate search, if any.
    ///
    /// The whole procedure restarts whenever the semantic context is observed to have
    /// changed while it was running.
    ///
    /// # Errors
    /// Fails when the semantic lock is poisoned, when no semantic state is available, or
    /// when embedding, index loading, candidate search, the context check or hit hydration
    /// fails.
    #[allow(clippy::too_many_arguments)]
    pub fn search_semantic_with_tier(
        &self,
        query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
        approximate: bool,
        tier_mode: SemanticTierMode,
    ) -> Result<(
        Vec<SearchHit>,
        Option<crate::search::ann_index::AnnSearchStats>,
    )> {
        let field_mask = effective_field_mask(field_mask);
        let canonical = canonicalize_for_embedding(query);
        if canonical.trim().is_empty() {
            return Ok((Vec::new(), None));
        }
        // `0` means "no explicit limit": fall back to a capped corpus size, at least 1.
        let limit = if limit == 0 {
            self.total_docs().min(no_limit_result_cap()).max(1)
        } else {
            limit
        };
        let target_hits = limit.saturating_add(offset);
        if target_hits == 0 {
            return Ok((Vec::new(), None));
        }
        let initial_fetch_limit = target_hits;
        let fallback_fetch_limit = target_hits.saturating_mul(3);
        // Outer loop: restarted whenever the semantic context changes mid-search.
        loop {
            // Inner loop: gather the embedding, the candidate context and the optional
            // indexes, retrying until all of them belong to the same context token.
            let (embedding, candidate_context, in_memory_two_tier_index, ann_index, context_token) = loop {
                let embedding = self.semantic_query_embedding(&canonical)?;
                // Copy what is needed out of the semantic state; the lock guard is dropped
                // at the end of this block.
                let (candidate_context, context_token) = {
                    let guard = self
                        .semantic
                        .lock()
                        .map_err(|_| anyhow!("semantic lock poisoned"))?;
                    let state = guard.as_ref().ok_or_else(|| {
                        anyhow!("semantic search unavailable (no embedder or vector index)")
                    })?;
                    (
                        SemanticCandidateContext {
                            fs_semantic_index: Arc::clone(&state.fs_semantic_index),
                            fs_semantic_indexes: Arc::clone(&state.fs_semantic_indexes),
                            filter_maps: state.filter_maps.clone(),
                            roles: state.roles.clone(),
                        },
                        Arc::clone(&state.context_token),
                    )
                };
                // The embedding was produced under a different context token: start over.
                if !Arc::ptr_eq(&embedding.context_token, &context_token) {
                    continue;
                }
                // The in-memory two-tier index is only requested for exact two-tier searches.
                let in_memory_two_tier_index = if tier_mode.wants_two_tier() && !approximate {
                    self.in_memory_two_tier_index(tier_mode)?
                } else {
                    None
                };
                // The ANN index is only requested for approximate searches.
                let ann_index = if approximate {
                    Some(self.ann_index()?)
                } else {
                    None
                };

                // Re-check the token after loading the indexes, in case the state was
                // replaced in the meantime.
                let guard = self
                    .semantic
                    .lock()
                    .map_err(|_| anyhow!("semantic lock poisoned"))?;
                let state = guard.as_ref().ok_or_else(|| {
                    anyhow!("semantic search unavailable (no embedder or vector index)")
                })?;
                if !Arc::ptr_eq(&state.context_token, &context_token) {
                    continue;
                }
                break (
                    embedding.vector,
                    candidate_context,
                    in_memory_two_tier_index,
                    ann_index,
                    context_token,
                );
            };

            // Hydrates candidates and applies post-processing and paging; yields the number
            // of hits available before paging and the requested page itself.
            let finalize_hits =
                |results: &[VectorSearchResult]| -> Result<(usize, Vec<SearchHit>)> {
                    let hits = self.hydrate_semantic_hits(results, field_mask)?;
                    Ok(self.postprocess_hits_page(hits, query, &filters, limit, offset))
                };

            let (results, retry_state, mut ann_stats) = self.search_semantic_candidates(
                &candidate_context,
                &embedding,
                &filters,
                SemanticCandidateSearchRequest {
                    fetch_limit: initial_fetch_limit,
                    approximate,
                    tier_mode,
                    in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
                    ann_index: ann_index.as_ref(),
                },
            )?;
            if !self.semantic_context_matches(&context_token)? {
                tracing::debug!("semantic context changed during candidate search; retrying");
                continue;
            }
            let (mut available_hits, mut paged_hits) = finalize_hits(&results)?;

            // Retry with the larger window when the page could not be filled although more
            // candidates exist, or when the candidate search flagged
            // `exact_window_may_omit_competitor`.
            let needs_retry = initial_fetch_limit < fallback_fetch_limit
                && ((available_hits < target_hits && retry_state.has_more_candidates)
                    || retry_state.exact_window_may_omit_competitor);

            if needs_retry {
                tracing::debug!(
                    query = canonical,
                    target_hits,
                    available_hits,
                    initial_fetch_limit,
                    fallback_fetch_limit,
                    "retrying semantic fetch due to candidate-window shortfall"
                );
                let (retry_results, _, retry_ann_stats) = self.search_semantic_candidates(
                    &candidate_context,
                    &embedding,
                    &filters,
                    SemanticCandidateSearchRequest {
                        fetch_limit: fallback_fetch_limit,
                        approximate,
                        tier_mode,
                        in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
                        ann_index: ann_index.as_ref(),
                    },
                )?;
                if !self.semantic_context_matches(&context_token)? {
                    tracing::debug!("semantic context changed during retry fetch; retrying");
                    continue;
                }
                // The retry's hits and statistics replace those of the first attempt.
                (available_hits, paged_hits) = finalize_hits(&retry_results)?;
                ann_stats = retry_ann_stats;
            }

            tracing::trace!(
                query = canonical,
                target_hits,
                available_hits,
                returned = paged_hits.len(),
                "semantic fetch complete"
            );

            return Ok((paged_hits, ann_stats));
        }
    }
5518
5519 fn hydrate_semantic_hits(
5520 &self,
5521 results: &[VectorSearchResult],
5522 field_mask: FieldMask,
5523 ) -> Result<Vec<SearchHit>> {
5524 self.hydrate_semantic_hits_with_ids(results, field_mask)
5525 .map(|rows| rows.into_iter().map(|(_, hit)| hit).collect())
5526 }
5527
5528 fn postprocess_hits_page(
5529 &self,
5530 hits: Vec<SearchHit>,
5531 query: &str,
5532 filters: &SearchFilters,
5533 limit: usize,
5534 offset: usize,
5535 ) -> (usize, Vec<SearchHit>) {
5536 let mut hits = deduplicate_hits_with_query(hits, query);
5537 if !filters.session_paths.is_empty() {
5538 hits.retain(|hit| filters.session_paths.contains(&hit.source_path));
5539 }
5540 let available_hits = hits.len();
5541 let paged_hits = hits.into_iter().skip(offset).take(limit).collect();
5542 (available_hits, paged_hits)
5543 }
5544
5545 pub fn search_with_fallback(
5549 &self,
5550 query: &str,
5551 filters: SearchFilters,
5552 limit: usize,
5553 offset: usize,
5554 sparse_threshold: usize,
5555 field_mask: FieldMask,
5556 ) -> Result<SearchResult> {
5557 let hits = self.search(query, filters.clone(), limit, offset, field_mask)?;
5559 let baseline_stats = self.cache_stats();
5560 let tantivy_total = self
5562 .last_tantivy_total_count
5563 .lock()
5564 .ok()
5565 .and_then(|guard| *guard);
5566
5567 let query_has_wildcards = query.contains('*');
5569 let has_boolean_or_phrase = fs_cass_has_boolean_operators(query);
5570 let is_sparse = should_try_wildcard_fallback(hits.len(), limit, offset, sparse_threshold);
5571
5572 if !is_sparse || query_has_wildcards || has_boolean_or_phrase || query.trim().is_empty() {
5573 let suggestions = if hits.is_empty() && !query.trim().is_empty() {
5577 self.generate_suggestions(query, &filters)
5578 } else {
5579 Vec::new()
5580 };
5581 return Ok(SearchResult {
5582 hits,
5583 wildcard_fallback: false,
5584 cache_stats: baseline_stats,
5585 suggestions,
5586 ann_stats: None,
5587 total_count: tantivy_total,
5588 });
5589 }
5590
5591 if should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(query, hits.len()) {
5592 let suggestions = if hits.is_empty() {
5593 self.generate_suggestions(query, &filters)
5594 } else {
5595 Vec::new()
5596 };
5597 return Ok(SearchResult {
5598 hits,
5599 wildcard_fallback: false,
5600 cache_stats: baseline_stats,
5601 suggestions,
5602 ann_stats: None,
5603 total_count: tantivy_total,
5604 });
5605 }
5606
5607 let wildcard_query = query
5609 .split_whitespace()
5610 .map(|term| format!("*{}*", term.trim_matches('*')))
5611 .collect::<Vec<_>>()
5612 .join(" ");
5613
5614 tracing::info!(
5615 original_query = query,
5616 wildcard_query = wildcard_query,
5617 original_count = hits.len(),
5618 "wildcard_fallback"
5619 );
5620
5621 let mut fallback_hits =
5622 self.search(&wildcard_query, filters.clone(), limit, offset, field_mask)?;
5623 let fallback_stats = self.cache_stats();
5624 let fallback_tantivy_total = self
5626 .last_tantivy_total_count
5627 .lock()
5628 .ok()
5629 .and_then(|guard| *guard);
5630
5631 if fallback_hits.len() > hits.len() {
5633 for hit in &mut fallback_hits {
5635 hit.match_type = MatchType::ImplicitWildcard;
5636 }
5637 let suggestions = if fallback_hits.is_empty() {
5639 self.generate_suggestions(query, &filters)
5640 } else {
5641 Vec::new()
5642 };
5643 Ok(SearchResult {
5644 hits: fallback_hits,
5645 wildcard_fallback: true,
5646 cache_stats: fallback_stats,
5647 suggestions,
5648 ann_stats: None,
5649 total_count: fallback_tantivy_total,
5650 })
5651 } else {
5652 let suggestions = if hits.is_empty() {
5655 self.generate_suggestions(query, &filters)
5656 } else {
5657 Vec::new()
5658 };
5659 Ok(SearchResult {
5660 hits,
5661 wildcard_fallback: false,
5662 cache_stats: baseline_stats,
5663 suggestions,
5664 ann_stats: None,
5665 total_count: tantivy_total,
5666 })
5667 }
5668 }
5669
5670 #[allow(clippy::too_many_arguments)]
5672 pub fn search_hybrid(
5673 &self,
5674 lexical_query: &str,
5675 semantic_query: &str,
5676 filters: SearchFilters,
5677 limit: usize,
5678 offset: usize,
5679 sparse_threshold: usize,
5680 field_mask: FieldMask,
5681 approximate: bool,
5682 ) -> Result<SearchResult> {
5683 self.search_hybrid_with_tier(
5684 lexical_query,
5685 semantic_query,
5686 filters,
5687 limit,
5688 offset,
5689 sparse_threshold,
5690 field_mask,
5691 approximate,
5692 SemanticTierMode::Single,
5693 )
5694 }
5695
5696 #[allow(clippy::too_many_arguments)]
5699 pub fn search_hybrid_with_tier(
5700 &self,
5701 lexical_query: &str,
5702 semantic_query: &str,
5703 filters: SearchFilters,
5704 limit: usize,
5705 offset: usize,
5706 sparse_threshold: usize,
5707 field_mask: FieldMask,
5708 approximate: bool,
5709 semantic_tier_mode: SemanticTierMode,
5710 ) -> Result<SearchResult> {
5711 let requested_limit = limit;
5712 let total_docs = self.total_docs().max(1);
5713 let limit = if requested_limit == 0 {
5714 total_docs.min(no_limit_result_cap()).max(1)
5715 } else {
5716 requested_limit
5717 };
5718 let fetch = limit.saturating_add(offset);
5719 if fetch == 0 {
5720 return Ok(SearchResult {
5721 hits: Vec::new(),
5722 wildcard_fallback: false,
5723 cache_stats: self.cache_stats(),
5724 suggestions: Vec::new(),
5725 ann_stats: None,
5726 total_count: None,
5727 });
5728 }
5729
5730 if semantic_query.trim().is_empty() {
5731 return self.search_with_fallback(
5732 lexical_query,
5733 filters,
5734 limit,
5735 offset,
5736 sparse_threshold,
5737 field_mask,
5738 );
5739 }
5740
5741 let budget =
5742 hybrid_candidate_budget(semantic_query, requested_limit, limit, offset, total_docs);
5743 let lexical = self.search_with_fallback(
5744 lexical_query,
5745 filters.clone(),
5746 budget.lexical_candidates,
5747 0,
5748 sparse_threshold,
5749 field_mask,
5750 )?;
5751 let (semantic_hits, semantic_ann_stats) = self.search_semantic_with_tier(
5752 semantic_query,
5753 filters,
5754 budget.semantic_candidates,
5755 0,
5756 field_mask,
5757 approximate,
5758 semantic_tier_mode,
5759 )?;
5760 let fused = rrf_fuse_hits(&lexical.hits, &semantic_hits, semantic_query, limit, offset);
5761 let suggestions = if fused.is_empty() {
5762 lexical.suggestions.clone()
5763 } else {
5764 Vec::new()
5765 };
5766 Ok(SearchResult {
5767 hits: fused,
5768 wildcard_fallback: lexical.wildcard_fallback,
5769 cache_stats: lexical.cache_stats,
5770 suggestions,
5771 ann_stats: semantic_ann_stats,
5772 total_count: None,
5773 })
5774 }
5775
5776 fn generate_suggestions(&self, query: &str, filters: &SearchFilters) -> Vec<QuerySuggestion> {
5778 let mut suggestions = Vec::new();
5779 let query_lower = query.to_lowercase();
5780
5781 if !query.contains('*') && query.len() >= 2 {
5783 suggestions.push(QuerySuggestion::wildcard(query).with_shortcut(1));
5784 }
5785
5786 if !filters.agents.is_empty() {
5788 let agents: Vec<&str> = filters
5789 .agents
5790 .iter()
5791 .map(std::string::String::as_str)
5792 .collect();
5793 let agent_str = agents.join(", ");
5794 suggestions
5795 .push(QuerySuggestion::remove_agent_filter(&agent_str, filters).with_shortcut(2));
5796 }
5797
5798 let known_agents = [
5800 "codex",
5801 "claude",
5802 "claude_code",
5803 "cline",
5804 "gemini",
5805 "amp",
5806 "opencode",
5807 ];
5808 for agent in &known_agents {
5809 if levenshtein_distance(&query_lower, agent) <= 2 && query_lower != *agent {
5810 suggestions.push(
5811 QuerySuggestion::spelling(query, agent)
5812 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5813 );
5814 break; }
5816 }
5817
5818 if filters.agents.is_empty()
5822 && let Ok(sqlite_guard) = self.sqlite.lock()
5823 && let Some(conn) = sqlite_guard.as_ref()
5824 && let Ok(rows) = conn.query_map_collect(
5825 "SELECT a.slug
5826 FROM conversations c
5827 JOIN agents a ON c.agent_id = a.id
5828 GROUP BY a.slug
5829 ORDER BY MAX(c.id) DESC
5830 LIMIT 3",
5831 &[],
5832 |row: &frankensqlite::Row| row.get_typed::<String>(0),
5833 )
5834 {
5835 for row in rows {
5836 if suggestions.len() < 3 {
5837 suggestions.push(
5838 QuerySuggestion::try_agent(&row)
5839 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5840 );
5841 }
5842 }
5843 }
5844
5845 suggestions.truncate(3);
5847 for (i, sugg) in suggestions.iter_mut().enumerate() {
5848 sugg.shortcut = Some((i + 1) as u8);
5849 }
5850
5851 suggestions
5852 }
5853
5854 fn searcher_for_thread(&self, reader: &IndexReader) -> Searcher {
5855 let epoch = self.reload_epoch.load(Ordering::Relaxed);
5856 let reader_key = reader as *const IndexReader as usize;
5857 THREAD_SEARCHER.with(|slot| {
5858 let mut slot = slot.borrow_mut();
5859 if let Some(entry) = slot.as_ref()
5860 && entry.epoch == epoch
5861 && entry.reader_key == reader_key
5862 {
5863 return entry.searcher.clone();
5864 }
5865 let searcher = reader.searcher();
5866 *slot = Some(SearcherCacheEntry {
5867 epoch,
5868 reader_key,
5869 searcher: searcher.clone(),
5870 });
5871 searcher
5872 })
5873 }
5874
5875 fn federated_readers(&self) -> Option<Arc<Vec<FederatedIndexReader>>> {
5876 FEDERATED_SEARCH_READERS
5877 .read()
5878 .get(&self.cache_namespace)
5879 .cloned()
5880 }
5881
5882 fn maybe_reload_federated_readers(
5883 &self,
5884 readers: &[FederatedIndexReader],
5885 ) -> Result<Option<u64>> {
5886 if !self.reload_on_search || readers.is_empty() {
5887 return Ok(None);
5888 }
5889 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
5890 let now = Instant::now();
5891 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
5892 if guard
5893 .map(|t| now.duration_since(t) < MIN_RELOAD_INTERVAL)
5894 .unwrap_or(false)
5895 {
5896 let signature = self.federated_generation_signature(readers);
5897 return Ok(Some(signature));
5898 }
5899
5900 let reload_started = Instant::now();
5901 for shard in readers {
5902 shard.reader.reload()?;
5903 }
5904 let elapsed = reload_started.elapsed();
5905 *guard = Some(now);
5906 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
5907 self.metrics.record_reload(elapsed);
5908 tracing::debug!(
5909 duration_ms = elapsed.as_millis() as u64,
5910 reload_epoch = epoch,
5911 shards = readers.len(),
5912 "tantivy_reader_reload_federated"
5913 );
5914 Ok(Some(self.federated_generation_signature(readers)))
5915 }
5916
5917 fn federated_generation_signature(&self, readers: &[FederatedIndexReader]) -> u64 {
5918 let mut hasher = std::collections::hash_map::DefaultHasher::new();
5919 readers.len().hash(&mut hasher);
5920 for shard in readers {
5921 self.searcher_for_thread(&shard.reader)
5922 .generation()
5923 .generation_id()
5924 .hash(&mut hasher);
5925 }
5926 hasher.finish()
5927 }
5928
5929 fn track_generation(&self, generation: u64) {
5930 let mut guard = self
5931 .last_generation
5932 .lock()
5933 .unwrap_or_else(|e| e.into_inner());
5934 if let Some(prev) = *guard
5935 && prev != generation
5936 && let Ok(mut cache) = self.prefix_cache.lock()
5937 {
5938 cache.clear();
5939 }
5940 *guard = Some(generation);
5941 }
5942
5943 fn hydrate_tantivy_hit_contents(
5944 &self,
5945 exact_keys: &[TantivyContentExactKey],
5946 fallback_keys: &[TantivyContentFallbackKey],
5947 ) -> Result<TantivyHydratedContentMaps> {
5948 if exact_keys.is_empty() && fallback_keys.is_empty() {
5949 return Ok((HashMap::new(), HashMap::new()));
5950 }
5951
5952 let sqlite_guard = match self.sqlite_guard() {
5953 Ok(guard) => guard,
5954 Err(_) => return Ok((HashMap::new(), HashMap::new())),
5955 };
5956 let Some(conn) = sqlite_guard.as_ref() else {
5957 return Ok((HashMap::new(), HashMap::new()));
5958 };
5959
5960 let mut hydrated_exact = HashMap::new();
5961 let mut hydrated_fallback = HashMap::new();
5962 const CHUNK_SIZE: usize = 300;
5963
5964 if !exact_keys.is_empty() {
5965 let mut unique_exact_keys = Vec::with_capacity(exact_keys.len());
5966 let mut seen = HashSet::with_capacity(exact_keys.len());
5967 for key in exact_keys {
5968 if seen.insert(*key) {
5969 unique_exact_keys.push(*key);
5970 }
5971 }
5972
5973 hydrated_exact.extend(hydrate_message_content_by_conversation(
5974 conn,
5975 &unique_exact_keys,
5976 )?);
5977 }
5978
5979 if !fallback_keys.is_empty() {
5980 let mut unique_fallback_keys = Vec::with_capacity(fallback_keys.len());
5981 let mut seen = HashSet::with_capacity(fallback_keys.len());
5982 for key in fallback_keys {
5983 if seen.insert(key.clone()) {
5984 unique_fallback_keys.push(key.clone());
5985 }
5986 }
5987
5988 let mut unique_source_paths = Vec::with_capacity(unique_fallback_keys.len());
5989 let mut seen_source_paths = HashSet::with_capacity(unique_fallback_keys.len());
5990 for (_, source_path, _) in &unique_fallback_keys {
5991 if seen_source_paths.insert(source_path.clone()) {
5992 unique_source_paths.push(source_path.clone());
5993 }
5994 }
5995
5996 let mut conversations_by_key: HashMap<(String, String), Vec<i64>> = HashMap::new();
5997 for chunk in unique_source_paths.chunks(CHUNK_SIZE) {
5998 let placeholders = sql_placeholders(chunk.len());
5999 let sql = format!(
6000 "SELECT c.id,
6001 c.source_path,
6002 COALESCE(c.source_id, ''),
6003 COALESCE(c.origin_host, ''),
6004 COALESCE(s.kind, '')
6005 FROM conversations c
6006 LEFT JOIN sources s ON c.source_id = s.id
6007 WHERE c.source_path IN ({placeholders})
6008 ORDER BY c.id"
6009 );
6010 let params = chunk
6011 .iter()
6012 .map(|source_path| ParamValue::from(source_path.clone()))
6013 .collect::<Vec<_>>();
6014 let rows: Vec<(i64, String, String, String, String)> =
6015 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
6016 Ok((
6017 row.get_typed(0)?,
6018 row.get_typed(1)?,
6019 row.get_typed(2)?,
6020 row.get_typed(3)?,
6021 row.get_typed(4)?,
6022 ))
6023 })?;
6024
6025 for (conversation_id, source_path, raw_source_id, origin_host, origin_kind) in rows
6026 {
6027 let normalized_source_id = normalized_search_hit_source_id_parts(
6028 &raw_source_id,
6029 &origin_kind,
6030 (!origin_host.trim().is_empty()).then_some(origin_host.as_str()),
6031 );
6032 conversations_by_key
6033 .entry((normalized_source_id, source_path))
6034 .or_default()
6035 .push(conversation_id);
6036 }
6037 }
6038
6039 let mut message_requests = Vec::new();
6040 let mut fallback_keys_by_exact: HashMap<
6041 TantivyContentExactKey,
6042 Vec<TantivyContentFallbackKey>,
6043 > = HashMap::new();
6044 let mut seen_message_requests = HashSet::new();
6045 for (source_id, source_path, line_idx) in &unique_fallback_keys {
6046 let key = (source_id.clone(), source_path.clone());
6047 let Some(conversation_ids) = conversations_by_key.get(&key) else {
6048 continue;
6049 };
6050 for &conversation_id in conversation_ids {
6051 let exact_key = (conversation_id, *line_idx);
6052 if seen_message_requests.insert(exact_key) {
6053 message_requests.push(exact_key);
6054 }
6055 fallback_keys_by_exact.entry(exact_key).or_default().push((
6056 source_id.clone(),
6057 source_path.clone(),
6058 *line_idx,
6059 ));
6060 }
6061 }
6062
6063 for ((conversation_id, line_idx), content) in
6064 hydrate_message_content_by_conversation(conn, &message_requests)?
6065 {
6066 if let Some(fallback_keys) =
6067 fallback_keys_by_exact.get(&(conversation_id, line_idx))
6068 {
6069 for fallback_key in fallback_keys {
6070 hydrated_fallback.insert(fallback_key.clone(), content.clone());
6071 }
6072 }
6073 }
6074 }
6075
6076 Ok((hydrated_exact, hydrated_fallback))
6077 }
6078
6079 #[allow(clippy::too_many_arguments)]
6080 fn search_tantivy(
6081 &self,
6082 reader: &IndexReader,
6083 fields: &FsCassFields,
6084 raw_query: &str,
6085 sanitized_query: &str,
6086 filters: SearchFilters,
6087 limit: usize,
6088 offset: usize,
6089 field_mask: FieldMask,
6090 ) -> Result<(Vec<SearchHit>, usize)> {
6091 struct PendingTantivyHit {
6092 score: f32,
6093 doc: TantivyDocument,
6094 title: String,
6095 stored_content: String,
6096 stored_preview: String,
6097 agent: String,
6098 source_path: String,
6099 workspace: String,
6100 workspace_original: Option<String>,
6101 created_at: Option<i64>,
6102 line_number: Option<usize>,
6103 stored_preview_snippet: Option<String>,
6104 source_id: String,
6105 conversation_id: Option<i64>,
6106 raw_origin_kind: Option<String>,
6107 origin_host: Option<String>,
6108 }
6109
6110 self.maybe_reload_reader(reader)?;
6111 let searcher = self.searcher_for_thread(reader);
6112 self.track_generation(searcher.generation().generation_id());
6113
6114 let wants_snippet = field_mask.wants_snippet();
6115 let needs_content = field_mask.needs_content() || wants_snippet;
6116
6117 let fs_filters = FsCassQueryFilters {
6120 agents: filters.agents.into_iter().collect(),
6121 workspaces: filters.workspaces.into_iter().collect(),
6122 created_from: filters.created_from,
6123 created_to: filters.created_to,
6124 source_filter: match filters.source_filter {
6125 SourceFilter::All => FsCassSourceFilter::All,
6126 SourceFilter::Local => FsCassSourceFilter::Local,
6127 SourceFilter::Remote => FsCassSourceFilter::Remote,
6128 SourceFilter::SourceId(id) => {
6129 FsCassSourceFilter::SourceId(normalize_search_source_filter_value(&id))
6130 }
6131 },
6132 };
6133
6134 let q: Box<dyn Query> = fs_cass_build_tantivy_query(raw_query, &fs_filters, fields);
6137
6138 let prefix_only = is_prefix_only(sanitized_query);
6139 let top_docs = execute_query_with_lazy_exact_count(&searcher, &*q, limit, offset)?;
6140 let tantivy_total_count = top_docs.total_count;
6141 let query_match_type = dominant_match_type(sanitized_query);
6142 let mut pending_hits = Vec::with_capacity(top_docs.hits.len());
6143 let mut missing_exact_content_keys = Vec::new();
6144 let mut missing_fallback_content_keys = Vec::new();
6145
6146 for ranked_hit in top_docs.hits {
6147 let score = ranked_hit.bm25_score;
6148 let doc: TantivyDocument = fs_load_doc(&searcher, ranked_hit.doc_address)?;
6149 let title = if field_mask.wants_title() {
6150 doc.get_first(fields.title)
6151 .and_then(|v| v.as_str())
6152 .unwrap_or("")
6153 .to_string()
6154 } else {
6155 String::new()
6156 };
6157 let stored_content = doc
6158 .get_first(fields.content)
6159 .and_then(|v| v.as_str())
6160 .unwrap_or("")
6161 .to_string();
6162 let stored_preview = doc
6163 .get_first(fields.preview)
6164 .and_then(|v| v.as_str())
6165 .unwrap_or("")
6166 .to_string();
6167 let stored_preview_snippet = snippet_from_preview_without_full_content(
6168 field_mask,
6169 &stored_preview,
6170 sanitized_query,
6171 );
6172 let agent = doc
6173 .get_first(fields.agent)
6174 .and_then(|v| v.as_str())
6175 .unwrap_or("")
6176 .to_string();
6177 let workspace = doc
6178 .get_first(fields.workspace)
6179 .and_then(|v| v.as_str())
6180 .unwrap_or("")
6181 .to_string();
6182 let workspace_original = doc
6183 .get_first(fields.workspace_original)
6184 .and_then(|v| v.as_str())
6185 .filter(|s| !s.is_empty())
6186 .map(String::from);
6187 let created_at = doc.get_first(fields.created_at).and_then(|v| v.as_i64());
6188 let line_number = doc
6189 .get_first(fields.msg_idx)
6190 .and_then(|v| v.as_u64())
6191 .and_then(|i| usize::try_from(i).ok())
6192 .map(|i| i.saturating_add(1));
6193 let raw_source_id = doc
6194 .get_first(fields.source_id)
6195 .and_then(|v| v.as_str())
6196 .unwrap_or_default()
6197 .to_string();
6198 let conversation_id = fields
6199 .conversation_id
6200 .and_then(|field| doc.get_first(field))
6201 .and_then(|v| v.as_i64());
6202 let source_path = doc
6203 .get_first(fields.source_path)
6204 .and_then(|v| v.as_str())
6205 .unwrap_or("")
6206 .to_string();
6207 let raw_origin_kind = doc
6208 .get_first(fields.origin_kind)
6209 .and_then(|v| v.as_str())
6210 .map(str::to_string);
6211 let origin_host = doc
6212 .get_first(fields.origin_host)
6213 .and_then(|v| v.as_str())
6214 .filter(|s| !s.is_empty())
6215 .map(String::from);
6216 let source_id = normalized_search_hit_source_id_parts(
6217 raw_source_id.as_str(),
6218 raw_origin_kind.as_deref().unwrap_or_default(),
6219 origin_host.as_deref(),
6220 );
6221
6222 let preview_satisfies_bounded_content =
6223 field_mask.preview_content_limit().is_some() && !stored_preview.is_empty();
6224 let preview_satisfies_full_content = field_mask.needs_content()
6225 && field_mask.preview_content_limit().is_none()
6226 && stored_preview_is_complete_content(&stored_preview);
6227 if needs_content
6228 && let Some(line_idx) = line_number
6229 .and_then(|line| line.checked_sub(1))
6230 .and_then(|line| i64::try_from(line).ok())
6231 && stored_content.is_empty()
6232 && !preview_satisfies_bounded_content
6233 && !preview_satisfies_full_content
6234 && stored_preview_snippet.is_none()
6235 {
6236 if let Some(conversation_id) = conversation_id {
6237 missing_exact_content_keys.push((conversation_id, line_idx));
6238 } else {
6239 missing_fallback_content_keys.push((
6240 source_id.clone(),
6241 source_path.clone(),
6242 line_idx,
6243 ));
6244 }
6245 }
6246
6247 pending_hits.push(PendingTantivyHit {
6248 score,
6249 doc,
6250 title,
6251 stored_content,
6252 stored_preview,
6253 agent,
6254 source_path,
6255 workspace,
6256 workspace_original,
6257 created_at,
6258 line_number,
6259 stored_preview_snippet,
6260 source_id,
6261 conversation_id,
6262 raw_origin_kind,
6263 origin_host,
6264 });
6265 }
6266
6267 let (hydrated_contents, hydrated_fallback_contents) = if needs_content
6268 && (!missing_exact_content_keys.is_empty() || !missing_fallback_content_keys.is_empty())
6269 {
6270 self.hydrate_tantivy_hit_contents(
6271 &missing_exact_content_keys,
6272 &missing_fallback_content_keys,
6273 )?
6274 } else {
6275 (HashMap::new(), HashMap::new())
6276 };
6277 let needs_tantivy_snippet_generator = wants_snippet
6278 && !prefix_only
6279 && pending_hits
6280 .iter()
6281 .any(|pending| pending.stored_preview_snippet.is_none());
6282 let snippet_generator = if needs_tantivy_snippet_generator {
6283 let snippet_cfg = FsSnippetConfig {
6284 max_chars: 160,
6285 highlight_prefix: "<b>".to_string(),
6286 highlight_postfix: "</b>".to_string(),
6287 };
6288 fs_try_build_snippet_generator(&searcher, &*q, fields.content, &snippet_cfg)
6289 } else {
6290 None
6291 };
6292 let mut hits = Vec::with_capacity(pending_hits.len());
6293 for pending in pending_hits {
6294 let hydrated_content = pending
6295 .line_number
6296 .and_then(|line| line.checked_sub(1))
6297 .and_then(|line| i64::try_from(line).ok())
6298 .and_then(|line_idx| {
6299 if let Some(conversation_id) = pending.conversation_id {
6300 hydrated_contents.get(&(conversation_id, line_idx)).cloned()
6301 } else {
6302 hydrated_fallback_contents
6303 .get(&(
6304 pending.source_id.clone(),
6305 pending.source_path.clone(),
6306 line_idx,
6307 ))
6308 .cloned()
6309 }
6310 });
6311 let preview_satisfies_effective_content = !pending.stored_preview.is_empty()
6312 && (field_mask.preview_content_limit().is_some()
6313 || (field_mask.needs_content()
6314 && field_mask.preview_content_limit().is_none()
6315 && stored_preview_is_complete_content(&pending.stored_preview)));
6316 let effective_content = if !pending.stored_content.is_empty() {
6317 pending.stored_content.clone()
6318 } else if preview_satisfies_effective_content {
6319 pending.stored_preview.clone()
6320 } else if let Some(content) = hydrated_content {
6321 content
6322 } else {
6323 pending.stored_preview.clone()
6324 };
6325 let snippet = if wants_snippet {
6326 if let Some(snippet) = pending.stored_preview_snippet.clone() {
6327 snippet
6328 } else if let Some(r#gen) = &snippet_generator {
6329 let rendered = if !pending.stored_content.is_empty() {
6330 fs_render_snippet_html(r#gen, &pending.doc, "<b>", "</b>")
6331 } else if !effective_content.is_empty() {
6332 let mut snippet_doc = TantivyDocument::new();
6333 snippet_doc.add_text(fields.content, &effective_content);
6334 fs_render_snippet_html(r#gen, &snippet_doc, "<b>", "</b>")
6335 } else {
6336 None
6337 };
6338 rendered
6339 .map(|html| html.replace("<b>", "**").replace("</b>", "**"))
6340 .or_else(|| cached_prefix_snippet(&effective_content, sanitized_query, 160))
6341 .unwrap_or_else(|| {
6342 quick_prefix_snippet(&effective_content, sanitized_query, 160)
6343 })
6344 } else if let Some(sn) =
6345 cached_prefix_snippet(&effective_content, sanitized_query, 160)
6346 {
6347 sn
6348 } else {
6349 quick_prefix_snippet(&effective_content, sanitized_query, 160)
6350 }
6351 } else {
6352 String::new()
6353 };
6354 let content = if field_mask.needs_content() {
6355 effective_content.clone()
6356 } else {
6357 String::new()
6358 };
6359 let content_hash = stable_hit_hash(
6360 &effective_content,
6361 &pending.source_path,
6362 pending.line_number,
6363 pending.created_at,
6364 );
6365 let origin_kind = normalized_search_hit_origin_kind(
6366 &pending.source_id,
6367 pending.raw_origin_kind.as_deref(),
6368 )
6369 .to_string();
6370 hits.push(SearchHit {
6371 title: pending.title,
6372 snippet,
6373 content,
6374 content_hash,
6375 conversation_id: pending.conversation_id,
6376 score: pending.score,
6377 source_path: pending.source_path,
6378 agent: pending.agent,
6379 workspace: pending.workspace,
6380 workspace_original: pending.workspace_original,
6381 created_at: pending.created_at,
6382 line_number: pending.line_number,
6383 match_type: query_match_type,
6384 source_id: pending.source_id,
6385 origin_kind,
6386 origin_host: pending.origin_host,
6387 });
6388 }
6389 Ok((hits, tantivy_total_count))
6390 }
6391
6392 #[allow(clippy::too_many_arguments)]
6393 fn search_tantivy_federated(
6394 &self,
6395 readers: &[FederatedIndexReader],
6396 raw_query: &str,
6397 sanitized_query: &str,
6398 filters: SearchFilters,
6399 limit: usize,
6400 field_mask: FieldMask,
6401 ) -> Result<(Vec<SearchHit>, usize)> {
6402 let mut ranked_hits = Vec::new();
6403 let mut total_count = 0usize;
6404
6405 for (shard_index, shard) in readers.iter().enumerate() {
6406 let (shard_hits, shard_total_count) = self.search_tantivy(
6407 &shard.reader,
6408 &shard.fields,
6409 raw_query,
6410 sanitized_query,
6411 filters.clone(),
6412 limit,
6413 0,
6414 field_mask,
6415 )?;
6416 total_count = total_count.saturating_add(shard_total_count);
6417 for (shard_rank, hit) in shard_hits.into_iter().enumerate() {
6418 ranked_hits.push(FederatedRankedHit {
6419 hit,
6420 shard_index,
6421 shard_rank,
6422 fused_score: federated_rrf_score(shard_rank),
6423 });
6424 }
6425 }
6426
6427 let raw_hit_count = ranked_hits.len();
6428 let generation_signature = self.federated_generation_signature(readers);
6429 self.track_generation(generation_signature);
6430 let combined_hits = merge_federated_ranked_hits(ranked_hits);
6431 tracing::debug!(
6432 generation_signature,
6433 shard_count = readers.len(),
6434 total_count,
6435 raw_hit_count,
6436 returned_hit_count = combined_hits.len(),
6437 merge_policy = "rrf_rank_then_stable_hit_key",
6438 "federated lexical search merged shard results"
6439 );
6440
6441 Ok((combined_hits, total_count))
6442 }
6443
6444 fn sqlite_fts_uses_message_id_column(conn: &Connection) -> Result<bool> {
6445 let params: [ParamValue; 0] = [];
6446 let ddl_rows: Vec<String> = franken_query_map_collect_retry(
6447 conn,
6448 "SELECT COALESCE(sql, '')
6449 FROM sqlite_master
6450 WHERE name = 'fts_messages'
6451 ORDER BY rowid DESC
6452 LIMIT 1",
6453 ¶ms,
6454 |row: &frankensqlite::Row| row.get_typed::<String>(0),
6455 )?;
6456 Ok(ddl_rows
6457 .first()
6458 .map(|sql| sql.to_ascii_lowercase().contains("message_id"))
6459 .unwrap_or(false))
6460 }
6461
6462 fn sqlite_fts_match_mode(conn: &Connection) -> Result<SqliteFtsMatchMode> {
6463 let params = [ParamValue::from("__cass_fts_probe_no_match__")];
6464 match franken_query_map_collect_retry(
6465 conn,
6466 "SELECT COUNT(*) FROM fts_messages WHERE fts_messages MATCH ?",
6467 ¶ms,
6468 |row: &frankensqlite::Row| row.get_typed::<i64>(0),
6469 ) {
6470 Ok(_) => Ok(SqliteFtsMatchMode::Table),
6471 Err(err)
6472 if err
6473 .to_string()
6474 .contains("no such column: fts_messages in table fts_messages") =>
6475 {
6476 Ok(SqliteFtsMatchMode::IndexedColumns)
6477 }
6478 Err(err) => Err(anyhow!(err)),
6479 }
6480 }
6481
6482 fn sqlite_fts5_rowid_projection_available(conn: &Connection) -> bool {
6483 let params: [ParamValue; 0] = [];
6484 franken_query_map_collect_retry(
6485 conn,
6486 "SELECT rowid FROM fts_messages LIMIT 1",
6487 ¶ms,
6488 |row: &frankensqlite::Row| row.get_typed::<i64>(0),
6489 )
6490 .is_ok()
6491 }
6492
6493 fn sqlite_fts5_match_clause(match_mode: SqliteFtsMatchMode) -> &'static str {
6494 match match_mode {
6495 SqliteFtsMatchMode::Table => "fts_messages MATCH ?",
6496 SqliteFtsMatchMode::IndexedColumns => {
6497 "(content MATCH ?
6498 OR title MATCH ?
6499 OR agent MATCH ?
6500 OR workspace MATCH ?
6501 OR source_path MATCH ?)"
6502 }
6503 }
6504 }
6505
6506 fn push_sqlite_fts5_match_params(
6507 params: &mut Vec<ParamValue>,
6508 fts_query: &str,
6509 match_mode: SqliteFtsMatchMode,
6510 ) {
6511 let copies = match match_mode {
6512 SqliteFtsMatchMode::Table => 1,
6513 SqliteFtsMatchMode::IndexedColumns => 5,
6514 };
6515 for _ in 0..copies {
6516 params.push(ParamValue::from(fts_query));
6517 }
6518 }
6519
6520 fn sqlite_fts5_rank_query(
6521 fts_query: &str,
6522 _filters: &SearchFilters,
6523 limit: usize,
6524 offset: usize,
6525 _uses_message_id: bool,
6526 match_mode: SqliteFtsMatchMode,
6527 ) -> (String, Vec<ParamValue>) {
6528 let match_clause = Self::sqlite_fts5_match_clause(match_mode);
6529 let mut sql = format!(
6530 "SELECT rowid,
6531 bm25(fts_messages)
6532 FROM fts_messages
6533 WHERE {match_clause}"
6534 );
6535 let mut params = Vec::with_capacity(9);
6536 Self::push_sqlite_fts5_match_params(&mut params, fts_query, match_mode);
6537
6538 sql.push_str(" ORDER BY bm25(fts_messages), rowid LIMIT ? OFFSET ?");
6539 params.push(ParamValue::from(limit as i64));
6540 params.push(ParamValue::from(offset as i64));
6541
6542 (sql, params)
6543 }
6544
6545 fn sqlite_fts5_hydrate_query(
6546 row_count: usize,
6547 field_mask: FieldMask,
6548 uses_message_id: bool,
6549 ) -> String {
6550 let title_expr = if field_mask.wants_title() {
6551 "fts_messages.title"
6552 } else {
6553 "NULL"
6554 };
6555 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6556 "fts_messages.content"
6557 } else {
6558 "NULL"
6559 };
6560 let message_key_expr = if uses_message_id {
6561 "CAST(fts_messages.message_id AS INTEGER)"
6562 } else {
6563 "rowid"
6564 };
6565 let placeholders = sql_placeholders(row_count);
6566
6567 format!(
6568 "SELECT rowid,
6569 {message_key_expr},
6570 {title_expr},
6571 {content_expr},
6572 fts_messages.agent,
6573 fts_messages.workspace,
6574 fts_messages.source_path,
6575 CAST(fts_messages.created_at AS INTEGER)
6576 FROM fts_messages
6577 WHERE rowid IN ({placeholders})"
6578 )
6579 }
6580
6581 fn sqlite_fts5_message_hydrate_query(row_count: usize, field_mask: FieldMask) -> String {
6582 let title_expr = if field_mask.wants_title() {
6583 "COALESCE(c.title, '')"
6584 } else {
6585 "''"
6586 };
6587 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6588 "COALESCE(m.content, '')"
6589 } else {
6590 "''"
6591 };
6592 let normalized_source_sql =
6593 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6594 let placeholders = sql_placeholders(row_count);
6595
6596 format!(
6597 "SELECT m.id,
6598 {title_expr},
6599 {content_expr},
6600 COALESCE(a.slug, ''),
6601 COALESCE(w.path, ''),
6602 COALESCE(c.source_path, ''),
6603 CAST(m.created_at AS INTEGER),
6604 m.idx,
6605 c.id,
6606 {normalized_source_sql},
6607 c.origin_host,
6608 s.kind
6609 FROM messages m
6610 LEFT JOIN conversations c ON m.conversation_id = c.id
6611 LEFT JOIN sources s ON c.source_id = s.id
6612 LEFT JOIN agents a ON c.agent_id = a.id
6613 LEFT JOIN workspaces w ON c.workspace_id = w.id
6614 WHERE m.id IN ({placeholders})"
6615 )
6616 }
6617
6618 fn sqlite_fts5_hydrate_row_chunks(
6619 ranked_rows: &[(i64, f64)],
6620 ) -> impl Iterator<Item = &[(i64, f64)]> {
6621 const _: () = assert!(SQLITE_FTS5_HYDRATE_PARAM_CHUNK <= SQLITE_MAX_VARIABLE_NUMBER);
6622 ranked_rows.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK)
6623 }
6624
6625 fn sqlite_fts5_filters_need_post_hydration(filters: &SearchFilters) -> bool {
6626 !filters.agents.is_empty()
6627 || !filters.workspaces.is_empty()
6628 || filters.created_from.is_some()
6629 || filters.created_to.is_some()
6630 || !filters.source_filter.is_all()
6631 || !filters.session_paths.is_empty()
6632 }
6633
6634 fn sqlite_fts5_hit_matches_filters(hit: &SearchHit, filters: &SearchFilters) -> bool {
6635 if !filters.agents.is_empty() && !filters.agents.contains(&hit.agent) {
6636 return false;
6637 }
6638 if !filters.workspaces.is_empty() && !filters.workspaces.contains(&hit.workspace) {
6639 return false;
6640 }
6641 if filters.created_from.is_some() || filters.created_to.is_some() {
6642 let Some(created_at) = hit.created_at else {
6643 return false;
6644 };
6645 if let Some(created_from) = filters.created_from
6646 && created_at < created_from
6647 {
6648 return false;
6649 }
6650 if let Some(created_to) = filters.created_to
6651 && created_at > created_to
6652 {
6653 return false;
6654 }
6655 }
6656 if !filters.session_paths.is_empty() && !filters.session_paths.contains(&hit.source_path) {
6657 return false;
6658 }
6659
6660 match &filters.source_filter {
6661 SourceFilter::All => true,
6662 SourceFilter::Local => matches!(
6663 hit.source_id
6664 .as_str()
6665 .cmp(crate::sources::provenance::LOCAL_SOURCE_ID),
6666 CmpOrdering::Equal
6667 ),
6668 SourceFilter::Remote => !matches!(
6669 hit.source_id
6670 .as_str()
6671 .cmp(crate::sources::provenance::LOCAL_SOURCE_ID),
6672 CmpOrdering::Equal
6673 ),
6674 SourceFilter::SourceId(id) => {
6675 let normalized = normalize_search_source_filter_value(id);
6676 matches!(
6677 hit.source_id.as_str().cmp(normalized.as_str()),
6678 CmpOrdering::Equal
6679 )
6680 }
6681 }
6682 }
6683
6684 fn sqlite_message_scan_query(raw_query: &str) -> Option<SqliteMessageScanQuery> {
6685 fn scan_parts(parts: Vec<String>) -> Vec<String> {
6686 parts
6687 .into_iter()
6688 .map(|part| part.trim_end_matches('*').to_lowercase())
6689 .filter(|part| !part.is_empty())
6690 .collect()
6691 }
6692
6693 let tokens = fs_cass_parse_boolean_query(raw_query);
6694 if tokens.is_empty() {
6695 return None;
6696 }
6697
6698 let mut include_groups = Vec::new();
6699 let mut pending_or_group: SqliteMessageScanGroup = Vec::new();
6700 let mut exclude_terms = Vec::new();
6701 let mut negated = false;
6702 let mut in_or_sequence = false;
6703 for token in tokens {
6704 match token {
6705 FsCassQueryToken::And => {
6706 if !pending_or_group.is_empty() {
6707 include_groups.push(std::mem::take(&mut pending_or_group));
6708 }
6709 in_or_sequence = false;
6710 negated = false;
6711 }
6712 FsCassQueryToken::Or => {
6713 if include_groups.is_empty() && pending_or_group.is_empty() {
6714 continue;
6715 }
6716 if negated {
6717 return None;
6718 }
6719 in_or_sequence = true;
6720 }
6721 FsCassQueryToken::Not => {
6722 if in_or_sequence {
6723 return None;
6724 }
6725 if !pending_or_group.is_empty() {
6726 include_groups.push(std::mem::take(&mut pending_or_group));
6727 }
6728 negated = true;
6729 in_or_sequence = false;
6730 }
6731 FsCassQueryToken::Term(term) => {
6732 let parts = scan_parts(normalize_term_parts(&term));
6733 if parts.is_empty() {
6734 continue;
6735 }
6736 if negated {
6737 exclude_terms.extend(parts);
6738 } else if in_or_sequence {
6739 if pending_or_group.is_empty() {
6740 let previous = include_groups.pop()?;
6741 pending_or_group.extend(previous);
6742 }
6743 pending_or_group.push(parts);
6744 } else {
6745 include_groups.push(vec![parts]);
6746 }
6747 negated = false;
6748 }
6749 FsCassQueryToken::Phrase(phrase) => {
6750 let parts = normalize_phrase_terms(&phrase);
6751 if parts.is_empty() {
6752 continue;
6753 }
6754 if negated {
6755 exclude_terms.extend(parts);
6756 } else if in_or_sequence {
6757 if pending_or_group.is_empty() {
6758 let previous = include_groups.pop()?;
6759 pending_or_group.extend(previous);
6760 }
6761 pending_or_group.push(parts);
6762 } else {
6763 include_groups.push(vec![parts]);
6764 }
6765 negated = false;
6766 }
6767 }
6768 }
6769
6770 if !pending_or_group.is_empty() {
6771 include_groups.push(pending_or_group);
6772 }
6773
6774 for group in &mut include_groups {
6775 for alternative in group.iter_mut() {
6776 alternative.sort();
6777 alternative.dedup();
6778 }
6779 group.retain(|alternative| !alternative.is_empty());
6780 group.sort();
6781 group.dedup();
6782 }
6783 include_groups.retain(|group| !group.is_empty());
6784 exclude_terms.sort();
6785 exclude_terms.dedup();
6786 if include_groups.is_empty() {
6787 return None;
6788 }
6789
6790 Some(SqliteMessageScanQuery {
6791 include_groups,
6792 exclude_terms,
6793 })
6794 }
6795
6796 fn sqlite_message_scan_score(haystack: &str, scan_query: &SqliteMessageScanQuery) -> f32 {
6797 for term in &scan_query.exclude_terms {
6798 if haystack.contains(term) {
6799 return 0.0;
6800 }
6801 }
6802
6803 let mut score = 0.0f32;
6804 for group in &scan_query.include_groups {
6805 let mut group_score = 0.0f32;
6806 for alternative in group {
6807 let mut alternative_score = 0.0f32;
6808 for term in alternative {
6809 let matches = haystack.matches(term).count();
6810 if matches < 1 {
6811 alternative_score = 0.0;
6812 break;
6813 }
6814 alternative_score += matches as f32;
6815 }
6816 group_score = group_score.max(alternative_score);
6817 }
6818 if group_score <= 0.0 {
6819 return 0.0;
6820 }
6821 score += group_score;
6822 }
6823 score
6824 }
6825
6826 fn sqlite_message_scan_query_sql(field_mask: FieldMask) -> String {
6827 let title_expr = if field_mask.wants_title() {
6828 "COALESCE(c.title, '')"
6829 } else {
6830 "''"
6831 };
6832 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6833 "COALESCE(m.content, '')"
6834 } else {
6835 "''"
6836 };
6837 let normalized_source_sql =
6838 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6839
6840 format!(
6841 "SELECT m.id,
6842 {title_expr},
6843 {content_expr},
6844 COALESCE(a.slug, ''),
6845 COALESCE(w.path, ''),
6846 COALESCE(c.source_path, ''),
6847 CAST(m.created_at AS INTEGER),
6848 m.idx,
6849 c.id,
6850 {normalized_source_sql},
6851 c.origin_host,
6852 s.kind,
6853 COALESCE(m.content, ''),
6854 COALESCE(c.title, '')
6855 FROM messages m
6856 LEFT JOIN conversations c ON m.conversation_id = c.id
6857 LEFT JOIN sources s ON c.source_id = s.id
6858 LEFT JOIN agents a ON c.agent_id = a.id
6859 LEFT JOIN workspaces w ON c.workspace_id = w.id
6860 ORDER BY m.id
6861 LIMIT ?"
6862 )
6863 }
6864
6865 fn search_sqlite_message_scan(
6866 &self,
6867 conn: &Connection,
6868 request: SqliteMessageScanRequest<'_>,
6869 ) -> Result<Vec<SearchHit>> {
6870 let Some(scan_query) = Self::sqlite_message_scan_query(request.raw_query) else {
6871 return Ok(Vec::new());
6872 };
6873
6874 let sql = Self::sqlite_message_scan_query_sql(request.field_mask);
6875 let params = [ParamValue::from(SQLITE_MESSAGE_SCAN_FALLBACK_LIMIT as i64)];
6876 let rows: Vec<(SqliteFtsMessageRow, String, String)> =
6877 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
6878 Ok((
6879 (
6880 row.get_typed(0)?,
6881 row.get_typed(1)?,
6882 row.get_typed(2)?,
6883 row.get_typed(3)?,
6884 row.get_typed(4)?,
6885 row.get_typed(5)?,
6886 row.get_typed(6)?,
6887 row.get_typed(7)?,
6888 row.get_typed(8)?,
6889 row.get_typed::<Option<String>>(9)?,
6890 row.get_typed(10)?,
6891 row.get_typed(11)?,
6892 ),
6893 row.get_typed(12)?,
6894 row.get_typed(13)?,
6895 ))
6896 })?;
6897
6898 let mut scored_hits = Vec::new();
6899 for (
6900 (
6901 _message_id,
6902 title,
6903 raw_content,
6904 agent,
6905 workspace,
6906 source_path,
6907 created_at,
6908 idx,
6909 conversation_id,
6910 raw_source_id,
6911 origin_host,
6912 raw_origin_kind,
6913 ),
6914 scan_content,
6915 scan_title,
6916 ) in rows
6917 {
6918 let mut haystack = String::with_capacity(
6919 scan_content.len()
6920 + scan_title.len()
6921 + agent.len()
6922 + workspace.len()
6923 + source_path.len()
6924 + 4,
6925 );
6926 haystack.push_str(&scan_content);
6927 haystack.push(' ');
6928 haystack.push_str(&scan_title);
6929 haystack.push(' ');
6930 haystack.push_str(&agent);
6931 haystack.push(' ');
6932 haystack.push_str(&workspace);
6933 haystack.push(' ');
6934 haystack.push_str(&source_path);
6935 let haystack = haystack.to_lowercase();
6936 let score = Self::sqlite_message_scan_score(&haystack, &scan_query);
6937 if score <= 0.0 {
6938 continue;
6939 }
6940
6941 let raw_source_id = raw_source_id.unwrap_or_else(default_source_id);
6942 let source_id = normalized_search_hit_source_id_parts(
6943 raw_source_id.as_str(),
6944 raw_origin_kind.as_deref().unwrap_or_default(),
6945 origin_host.as_deref(),
6946 );
6947 let origin_kind =
6948 normalized_search_hit_origin_kind(source_id.as_str(), raw_origin_kind.as_deref());
6949 let line_number = idx
6950 .and_then(|i| usize::try_from(i).ok())
6951 .map(|i| i.saturating_add(1));
6952 let snippet = if request.field_mask.wants_snippet() {
6953 snippet_from_content(&scan_content)
6954 } else {
6955 String::new()
6956 };
6957 let content = if request.field_mask.needs_content() {
6958 raw_content
6959 } else {
6960 String::new()
6961 };
6962 let content_hash = if content.is_empty() {
6963 stable_hit_hash(&snippet, &source_path, line_number, created_at)
6964 } else {
6965 stable_hit_hash(&content, &source_path, line_number, created_at)
6966 };
6967
6968 let hit = SearchHit {
6969 title,
6970 snippet,
6971 content,
6972 content_hash,
6973 conversation_id,
6974 score,
6975 source_path,
6976 agent,
6977 workspace,
6978 workspace_original: None,
6979 created_at,
6980 line_number,
6981 match_type: request.query_match_type,
6982 source_id,
6983 origin_kind,
6984 origin_host,
6985 };
6986
6987 if Self::sqlite_fts5_hit_matches_filters(&hit, request.filters) {
6988 scored_hits.push(hit);
6989 }
6990 }
6991
6992 scored_hits.sort_by(|left, right| {
6993 right
6994 .score
6995 .partial_cmp(&left.score)
6996 .unwrap_or(CmpOrdering::Equal)
6997 });
6998
6999 Ok(scored_hits
7000 .into_iter()
7001 .skip(request.offset)
7002 .take(request.limit)
7003 .collect())
7004 }
7005
    /// Searches the SQLite `fts_messages` FTS5 table for `raw_query` and returns one
    /// page of hits, degrading to `search_sqlite_message_scan` whenever the FTS path
    /// cannot be used or yields nothing.
    ///
    /// Flow, as implemented below:
    /// 1. Return an empty page for `limit == 0`, for a query that `transpile_to_fts5`
    ///    rejects or renders blank, when no SQLite connection is available, or when
    ///    `fts_messages` is not listed in `sqlite_master`.
    /// 2. Run the FTS integrity check and capability probes; any failure hands the
    ///    request to the source-table scan.
    /// 3. Fetch ranked `(rowid, bm25)` pairs, hydrate them from the FTS table and then
    ///    from message metadata, build `SearchHit`s and keep those that pass
    ///    `sqlite_fts5_hit_matches_filters`.
    /// 4. When the filters need post-hydration checks, pagination is done in memory:
    ///    ranked rows are read in batches from rank offset 0 until `offset + limit`
    ///    hits are collected, a short batch is returned, or
    ///    `SQLITE_FTS5_POST_FILTER_SCAN_LIMIT` rows were scanned. Otherwise a single
    ///    batch is read with `limit`/`offset` applied by the rank query.
    /// 5. An empty final page triggers the source-table scan as a last resort.
    ///
    /// `_db_path` is unused by this implementation.
    ///
    /// # Errors
    /// Propagates failures from `sqlite_guard` and from `search_sqlite_message_scan`;
    /// FTS query failures themselves are logged and converted into the scan fallback.
    fn search_sqlite_fts5(
        &self,
        _db_path: &Path,
        raw_query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<Vec<SearchHit>> {
        if limit < 1 {
            return Ok(Vec::new());
        }

        // `None` means the query cannot be expressed as an FTS5 MATCH expression;
        // a blank rendering means there is nothing to search for.
        let fts_query = match transpile_to_fts5(raw_query) {
            Some(q) if !q.trim().is_empty() => q,
            _ => return Ok(Vec::new()),
        };

        let sqlite_guard = self.sqlite_guard()?;
        let Some(conn) = sqlite_guard.as_ref() else {
            return Ok(Vec::new());
        };

        // Probe for the FTS table. A failing probe query is treated the same as a
        // missing table (`unwrap_or(false)`).
        let empty_params: [ParamValue; 0] = [];
        let has_fts = franken_query_map_collect_retry(
            conn,
            "SELECT 1 FROM sqlite_master WHERE name = 'fts_messages'",
            &empty_params,
            |row| row.get_typed::<i64>(0),
        )
        .map(|rows| !rows.is_empty())
        .unwrap_or(false);
        if !has_fts {
            return Ok(Vec::new());
        }

        // Request handed to the source-table scan on every fallback path below.
        let query_match_type = dominant_match_type(raw_query);
        let scan_request = SqliteMessageScanRequest {
            raw_query,
            filters: &filters,
            limit,
            offset,
            field_mask,
            query_match_type,
        };
        if let Err(err) =
            crate::storage::sqlite::validate_fts_messages_integrity_for_connection(conn)
        {
            tracing::warn!(
                error = %err,
                "sqlite FTS fallback integrity check failed; using source-table scan fallback"
            );
            return self.search_sqlite_message_scan(conn, scan_request);
        }
        // NOTE(review): the next two warnings say "skipping fallback search", but the
        // code actually continues with the source-table scan.
        let uses_message_id =
            if let Ok(uses_message_id) = Self::sqlite_fts_uses_message_id_column(conn) {
                uses_message_id
            } else {
                tracing::warn!(
                    "sqlite FTS fallback is present but not queryable; skipping fallback search"
                );
                return self.search_sqlite_message_scan(conn, scan_request);
            };
        let match_mode = match Self::sqlite_fts_match_mode(conn) {
            Ok(match_mode) => match_mode,
            Err(err) => {
                tracing::warn!(
                    error = %err,
                    "sqlite FTS fallback is present but not queryable; skipping fallback search"
                );
                return self.search_sqlite_message_scan(conn, scan_request);
            }
        };
        if !Self::sqlite_fts5_rowid_projection_available(conn) {
            tracing::warn!(
                "sqlite FTS fallback cannot project rowid through frankensqlite; using source-table scan fallback"
            );
            return self.search_sqlite_message_scan(conn, scan_request);
        }

        // With post-filtering, rows may be discarded after hydration, so the rank
        // query always starts at offset 0 and `offset`/`limit` are applied in memory
        // at the end. Without it, the rank query paginates directly.
        let post_filter = Self::sqlite_fts5_filters_need_post_hydration(&filters);
        let target_hits = if post_filter {
            offset.saturating_add(limit)
        } else {
            limit
        };
        let rank_batch_limit = if post_filter {
            target_hits.clamp(1, SQLITE_FTS5_POST_FILTER_SCAN_CHUNK)
        } else {
            limit
        };
        let mut rank_offset = if post_filter { 0 } else { offset };
        let mut scanned_rows = 0usize;
        let mut hits = Vec::with_capacity(target_hits.min(rank_batch_limit));

        loop {
            // Step 1: ranked (fts rowid, bm25) pairs for the current batch.
            let (rank_sql, rank_params) = Self::sqlite_fts5_rank_query(
                fts_query.as_str(),
                &filters,
                rank_batch_limit,
                rank_offset,
                uses_message_id,
                match_mode,
            );
            let ranked_rows: Vec<(i64, f64)> =
                match franken_query_map_collect_retry(conn, &rank_sql, &rank_params, |row| {
                    Ok((row.get_typed(0)?, row.get_typed(1)?))
                }) {
                    Ok(rows) => rows,
                    Err(err) => {
                        // NOTE(review): the message says "returning no fallback hits",
                        // but the source-table scan is run instead.
                        tracing::warn!(
                            error = %err,
                            "sqlite FTS fallback rank query failed; returning no fallback hits"
                        );
                        return self.search_sqlite_message_scan(conn, scan_request);
                    }
                };
            if ranked_rows.is_empty() {
                break;
            }

            scanned_rows = scanned_rows.saturating_add(ranked_rows.len());
            let bm25_by_rowid: HashMap<i64, f64> = ranked_rows.iter().copied().collect();
            let mut fts_rows_by_rowid = HashMap::with_capacity(ranked_rows.len());
            let mut message_ids = Vec::with_capacity(ranked_rows.len());
            let mut seen_message_ids = HashSet::with_capacity(ranked_rows.len());

            // Step 2: hydrate the FTS rows in parameter-sized chunks, collecting the
            // distinct message ids they refer to (first-seen order).
            for rank_chunk in Self::sqlite_fts5_hydrate_row_chunks(&ranked_rows) {
                let hydrate_sql =
                    Self::sqlite_fts5_hydrate_query(rank_chunk.len(), field_mask, uses_message_id);
                let hydrate_params = rank_chunk
                    .iter()
                    .map(|(fts_rowid, _)| ParamValue::from(*fts_rowid))
                    .collect::<Vec<_>>();
                let rows: Vec<SqliteFtsHydratedRow> = match franken_query_map_collect_retry(
                    conn,
                    &hydrate_sql,
                    &hydrate_params,
                    |row| {
                        Ok((
                            row.get_typed(0)?,
                            row.get_typed(1)?,
                            row.get_typed(2)?,
                            row.get_typed(3)?,
                            row.get_typed(4)?,
                            row.get_typed(5)?,
                            row.get_typed(6)?,
                            row.get_typed(7)?,
                        ))
                    },
                ) {
                    Ok(rows) => rows,
                    Err(err) => {
                        tracing::warn!(
                            error = %err,
                            "sqlite FTS fallback rowid hydration query failed; returning no fallback hits"
                        );
                        return self.search_sqlite_message_scan(conn, scan_request);
                    }
                };

                for row in rows {
                    let fts_rowid = row.0;
                    // Column 1 is the optional message id; when absent the FTS rowid
                    // doubles as the message id.
                    let message_id = row.1.unwrap_or(fts_rowid);
                    if seen_message_ids.insert(message_id) {
                        message_ids.push(message_id);
                    }
                    fts_rows_by_rowid.insert(fts_rowid, row);
                }
            }

            // Step 3: load message metadata for those ids, again in chunks.
            let mut metadata_by_message_id = HashMap::with_capacity(message_ids.len());
            for message_chunk in message_ids.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK) {
                let metadata_sql =
                    Self::sqlite_fts5_message_hydrate_query(message_chunk.len(), field_mask);
                let metadata_params = message_chunk
                    .iter()
                    .map(|message_id| ParamValue::from(*message_id))
                    .collect::<Vec<_>>();
                let metadata_rows: Vec<SqliteFtsMessageRow> = match franken_query_map_collect_retry(
                    conn,
                    &metadata_sql,
                    &metadata_params,
                    |row| {
                        Ok((
                            row.get_typed(0)?,
                            row.get_typed(1)?,
                            row.get_typed(2)?,
                            row.get_typed(3)?,
                            row.get_typed(4)?,
                            row.get_typed(5)?,
                            row.get_typed(6)?,
                            row.get_typed(7)?,
                            row.get_typed(8)?,
                            row.get_typed::<Option<String>>(9)?,
                            row.get_typed(10)?,
                            row.get_typed(11)?,
                        ))
                    },
                ) {
                    Ok(rows) => rows,
                    Err(err) => {
                        tracing::warn!(
                            error = %err,
                            "sqlite FTS fallback message hydration query failed; returning no fallback hits"
                        );
                        return self.search_sqlite_message_scan(conn, scan_request);
                    }
                };
                metadata_by_message_id.extend(metadata_rows.into_iter().map(|row| (row.0, row)));
            }

            // Step 4: merge FTS columns with metadata into hits keyed by FTS rowid.
            let mut hits_by_rowid = HashMap::with_capacity(ranked_rows.len());
            for (
                fts_rowid,
                fts_message_id,
                fts_title,
                fts_content,
                fts_agent,
                fts_workspace,
                fts_source_path,
                fts_created_at,
            ) in fts_rows_by_rowid.into_values()
            {
                let Some(&bm25_score) = bm25_by_rowid.get(&fts_rowid) else {
                    continue;
                };
                let message_id = fts_message_id.unwrap_or(fts_rowid);
                // Metadata wins where it is non-empty; FTS columns fill the gaps.
                // `remove` means a metadata row is consumed by the first FTS row that
                // maps to its message id; later rows for the same id take the `None`
                // arm and use FTS columns only.
                let (
                    title,
                    raw_content,
                    agent,
                    workspace,
                    source_path,
                    created_at,
                    idx,
                    conversation_id,
                    raw_source_id,
                    origin_host,
                    raw_origin_kind,
                ) = match metadata_by_message_id.remove(&message_id) {
                    Some((
                        _,
                        metadata_title,
                        metadata_content,
                        metadata_agent,
                        metadata_workspace,
                        metadata_source_path,
                        metadata_created_at,
                        metadata_idx,
                        metadata_conversation_id,
                        metadata_raw_source_id,
                        metadata_origin_host,
                        metadata_raw_origin_kind,
                    )) => (
                        if metadata_title.is_empty() {
                            fts_title.unwrap_or_default()
                        } else {
                            metadata_title
                        },
                        if metadata_content.is_empty() {
                            fts_content.unwrap_or_default()
                        } else {
                            metadata_content
                        },
                        if metadata_agent.is_empty() {
                            fts_agent.unwrap_or_default()
                        } else {
                            metadata_agent
                        },
                        if metadata_workspace.is_empty() {
                            fts_workspace.unwrap_or_default()
                        } else {
                            metadata_workspace
                        },
                        if metadata_source_path.is_empty() {
                            fts_source_path.unwrap_or_default()
                        } else {
                            metadata_source_path
                        },
                        metadata_created_at.or(fts_created_at),
                        metadata_idx,
                        metadata_conversation_id,
                        metadata_raw_source_id.unwrap_or_else(default_source_id),
                        metadata_origin_host,
                        metadata_raw_origin_kind,
                    ),
                    None => (
                        fts_title.unwrap_or_default(),
                        fts_content.unwrap_or_default(),
                        fts_agent.unwrap_or_default(),
                        fts_workspace.unwrap_or_default(),
                        fts_source_path.unwrap_or_default(),
                        fts_created_at,
                        None,
                        None,
                        default_source_id(),
                        None,
                        None,
                    ),
                };

                let source_id = normalized_search_hit_source_id_parts(
                    raw_source_id.as_str(),
                    raw_origin_kind.as_deref().unwrap_or_default(),
                    origin_host.as_deref(),
                );
                let origin_kind = normalized_search_hit_origin_kind(
                    source_id.as_str(),
                    raw_origin_kind.as_deref(),
                );
                // The stored message index is zero-based; hits expose it one-based.
                let line_number = idx
                    .and_then(|i| usize::try_from(i).ok())
                    .map(|i| i.saturating_add(1));
                let snippet = if field_mask.wants_snippet() {
                    snippet_from_content(&raw_content)
                } else {
                    String::new()
                };
                let content = if field_mask.needs_content() {
                    raw_content
                } else {
                    String::new()
                };
                // Hash the content when it is kept, otherwise whatever snippet exists.
                let content_hash = if content.is_empty() {
                    stable_hit_hash(&snippet, &source_path, line_number, created_at)
                } else {
                    stable_hit_hash(&content, &source_path, line_number, created_at)
                };

                let hit = SearchHit {
                    title,
                    snippet,
                    content,
                    content_hash,
                    conversation_id,
                    // The bm25 value is negated before being stored as the score —
                    // presumably because FTS5 bm25() is smaller-is-better; confirm.
                    score: (-bm25_score) as f32,
                    source_path,
                    agent,
                    workspace,
                    workspace_original: None,
                    created_at,
                    line_number,
                    match_type: query_match_type,
                    source_id,
                    origin_kind,
                    origin_host,
                };
                hits_by_rowid.insert(fts_rowid, hit);
            }

            // Step 5: emit hits in rank order, dropping those that fail the filters.
            for (fts_rowid, _) in &ranked_rows {
                if let Some(hit) = hits_by_rowid.remove(fts_rowid)
                    && Self::sqlite_fts5_hit_matches_filters(&hit, &filters)
                {
                    hits.push(hit);
                    if hits.len() >= target_hits {
                        break;
                    }
                }
            }

            // Stop when the page is full, when SQL already paginated (single pass),
            // when the batch was short (no more ranked rows), or when the scan budget
            // is spent.
            if hits.len() >= target_hits
                || !post_filter
                || ranked_rows.len() < rank_batch_limit
                || scanned_rows >= SQLITE_FTS5_POST_FILTER_SCAN_LIMIT
            {
                break;
            }
            rank_offset = rank_offset.saturating_add(ranked_rows.len());
        }

        // In post-filter mode pagination happens here; in both modes an empty page
        // falls through to the source-table scan.
        if post_filter {
            let hits = hits
                .into_iter()
                .skip(offset)
                .take(limit)
                .collect::<Vec<_>>();
            if hits.is_empty() {
                self.search_sqlite_message_scan(conn, scan_request)
            } else {
                Ok(hits)
            }
        } else if hits.is_empty() {
            self.search_sqlite_message_scan(conn, scan_request)
        } else {
            Ok(hits)
        }
    }
7395
7396 pub fn browse_by_date(
7403 &self,
7404 filters: SearchFilters,
7405 limit: usize,
7406 offset: usize,
7407 newest_first: bool,
7408 field_mask: FieldMask,
7409 ) -> Result<Vec<SearchHit>> {
7410 let sqlite_guard = self.sqlite_guard()?;
7411 if let Some(conn) = sqlite_guard.as_ref() {
7412 self.browse_by_date_sqlite(conn, filters, limit, offset, newest_first, field_mask)
7413 } else {
7414 Ok(Vec::new())
7415 }
7416 }
7417
7418 fn browse_by_date_sqlite(
7419 &self,
7420 conn: &Connection,
7421 filters: SearchFilters,
7422 limit: usize,
7423 offset: usize,
7424 newest_first: bool,
7425 field_mask: FieldMask,
7426 ) -> Result<Vec<SearchHit>> {
7427 let order = if newest_first { "DESC" } else { "ASC" };
7428 let title_expr = if field_mask.wants_title() {
7429 "c.title"
7430 } else {
7431 "''"
7432 };
7433 let normalized_source_sql =
7441 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
7442 let mut sql = format!(
7443 "SELECT c.id, {title_expr}, m.content, \
7444 COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'), \
7445 w.path, c.source_path, m.created_at, m.idx, \
7446 {normalized_source_sql}, c.origin_host, s.kind
7447 FROM messages m
7448 JOIN conversations c ON m.conversation_id = c.id
7449 LEFT JOIN workspaces w ON c.workspace_id = w.id
7450 LEFT JOIN sources s ON c.source_id = s.id
7451 WHERE 1=1"
7452 );
7453 let mut params: Vec<ParamValue> = Vec::new();
7454
7455 if !filters.agents.is_empty() {
7456 let placeholders = sql_placeholders(filters.agents.len());
7457 sql.push_str(&format!(
7458 " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug IN ({placeholders}))"
7459 ));
7460 for a in &filters.agents {
7461 params.push(ParamValue::from(a.as_str()));
7462 }
7463 }
7464
7465 if !filters.workspaces.is_empty() {
7466 let placeholders = sql_placeholders(filters.workspaces.len());
7467 sql.push_str(&format!(" AND COALESCE(w.path, '') IN ({placeholders})"));
7468 for w in &filters.workspaces {
7469 params.push(ParamValue::from(w.as_str()));
7470 }
7471 }
7472
7473 if let Some(created_from) = filters.created_from {
7474 sql.push_str(" AND m.created_at >= ?");
7475 params.push(ParamValue::from(created_from));
7476 }
7477 if let Some(created_to) = filters.created_to {
7478 sql.push_str(" AND m.created_at <= ?");
7479 params.push(ParamValue::from(created_to));
7480 }
7481
7482 match &filters.source_filter {
7484 SourceFilter::All => {}
7485 SourceFilter::Local => sql.push_str(&format!(
7486 " AND {normalized_source_sql} = '{local}'",
7487 local = crate::sources::provenance::LOCAL_SOURCE_ID,
7488 )),
7489 SourceFilter::Remote => sql.push_str(&format!(
7490 " AND {normalized_source_sql} != '{local}'",
7491 local = crate::sources::provenance::LOCAL_SOURCE_ID,
7492 )),
7493 SourceFilter::SourceId(id) => {
7494 sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
7495 params.push(ParamValue::from(normalize_search_source_filter_value(id)));
7496 }
7497 }
7498
7499 sql.push_str(&format!(
7500 " ORDER BY CASE WHEN m.created_at IS NULL THEN 1 ELSE 0 END, m.created_at {order}, m.id {order} LIMIT ? OFFSET ?"
7501 ));
7502 params.push(ParamValue::from(limit as i64));
7503 params.push(ParamValue::from(offset as i64));
7504
7505 let rows: Vec<SearchHit> =
7506 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
7507 let conversation_id: i64 = row.get_typed(0)?;
7508 let title: String = if field_mask.wants_title() {
7509 row.get_typed::<Option<String>>(1)?.unwrap_or_default()
7510 } else {
7511 String::new()
7512 };
7513 let raw_content: String = row.get_typed(2)?;
7514 let agent: String = row.get_typed(3)?;
7515 let workspace: Option<String> = row.get_typed(4)?;
7516 let source_path: String = row.get_typed(5)?;
7517 let created_at: Option<i64> = row.get_typed(6)?;
7518 let idx: Option<i64> = row.get_typed(7)?;
7519 let raw_source_id: String = row
7520 .get_typed::<Option<String>>(8)?
7521 .unwrap_or_else(default_source_id);
7522 let origin_host: Option<String> = row.get_typed(9)?;
7523 let raw_origin_kind: Option<String> = row.get_typed(10)?;
7524 let source_id = normalized_search_hit_source_id_parts(
7525 raw_source_id.as_str(),
7526 raw_origin_kind.as_deref().unwrap_or_default(),
7527 origin_host.as_deref(),
7528 );
7529 let origin_kind = normalized_search_hit_origin_kind(
7530 source_id.as_str(),
7531 raw_origin_kind.as_deref(),
7532 );
7533 let line_number = idx
7534 .and_then(|i| usize::try_from(i).ok())
7535 .map(|i| i.saturating_add(1));
7536 let snippet = if field_mask.wants_snippet() {
7537 snippet_from_content(&raw_content)
7538 } else {
7539 String::new()
7540 };
7541 let content = if field_mask.needs_content() {
7542 raw_content.clone()
7543 } else {
7544 String::new()
7545 };
7546 let content_hash =
7547 stable_hit_hash(&raw_content, &source_path, line_number, created_at);
7548 Ok(SearchHit {
7549 title,
7550 snippet,
7551 content,
7552 content_hash,
7553 conversation_id: Some(conversation_id),
7554 score: 0.0,
7555 source_path,
7556 agent,
7557 workspace: workspace.unwrap_or_default(),
7558 workspace_original: None,
7559 created_at,
7560 line_number,
7561 match_type: MatchType::Exact,
7562 source_id,
7563 origin_kind,
7564 origin_host,
7565 })
7566 })?;
7567 Ok(rows)
7568 }
7569}
7570
/// Public, doc-hidden entry point that forwards `raw_query` to the private
/// `transpile_to_fts5` and returns its result unchanged.
///
/// The name suggests it exists so fuzz targets can reach the transpiler.
#[doc(hidden)]
pub fn fuzz_transpile_to_fts5(raw_query: &str) -> Option<String> {
    transpile_to_fts5(raw_query)
}
7581
7582fn transpile_to_fts5(raw_query: &str) -> Option<String> {
7586 let tokens = fs_cass_parse_boolean_query(raw_query);
7587 if tokens.is_empty() {
7588 return Some("".to_string());
7589 }
7590
7591 let mut fts_clauses: Vec<(&str, String)> = Vec::new();
7592 let mut pending_or_group: Vec<String> = Vec::new();
7593 let mut next_op = "AND";
7594 let mut in_or_sequence = false;
7595 for token in tokens {
7596 match token {
7597 FsCassQueryToken::And => {
7598 if !pending_or_group.is_empty() {
7599 let group = if pending_or_group.len() > 1 {
7600 format!("({})", pending_or_group.join(" OR "))
7601 } else {
7602 pending_or_group.pop().unwrap_or_default()
7603 };
7604 fts_clauses.push(("AND", group));
7605 pending_or_group.clear();
7606 }
7607 in_or_sequence = false;
7608 next_op = "AND";
7609 }
7610 FsCassQueryToken::Or => {
7611 if fts_clauses.is_empty() && pending_or_group.is_empty() {
7612 continue;
7616 }
7617 in_or_sequence = true;
7620 }
7621 FsCassQueryToken::Not => {
7622 if in_or_sequence {
7626 return None;
7627 }
7628
7629 if fts_clauses.is_empty() && pending_or_group.is_empty() {
7630 return None;
7631 }
7632
7633 if !pending_or_group.is_empty() {
7634 let group = if pending_or_group.len() > 1 {
7635 format!("({})", pending_or_group.join(" OR "))
7636 } else {
7637 pending_or_group.pop().unwrap_or_default()
7638 };
7639 fts_clauses.push(("AND", group));
7640 pending_or_group.clear();
7641 }
7642 in_or_sequence = false;
7643 next_op = "NOT";
7644 }
7645 FsCassQueryToken::Term(t) => {
7646 let raw_pattern = FsCassWildcardPattern::parse(&t);
7647 if matches!(
7648 raw_pattern,
7649 FsCassWildcardPattern::Suffix(_)
7650 | FsCassWildcardPattern::Substring(_)
7651 | FsCassWildcardPattern::Complex(_)
7652 ) {
7653 return None;
7654 }
7655
7656 let term_parts = normalize_term_parts(&t);
7660 if term_parts.is_empty() {
7661 continue;
7662 }
7663
7664 let mut rendered_parts = Vec::with_capacity(term_parts.len());
7665 for part in &term_parts {
7666 rendered_parts.push(render_fts5_term_part(part)?);
7667 }
7668
7669 let fts_term = if rendered_parts.len() > 1 {
7672 format!("({})", rendered_parts.join(" AND "))
7673 } else {
7674 rendered_parts[0].clone()
7675 };
7676
7677 if in_or_sequence {
7678 if pending_or_group.is_empty() {
7679 let (op, _) = fts_clauses.last()?;
7680 if *op != "AND" {
7681 return None;
7684 }
7685 let (_, val) = fts_clauses.pop()?;
7686 pending_or_group.push(val);
7687 }
7688 pending_or_group.push(fts_term);
7689 in_or_sequence = true;
7690 } else {
7691 fts_clauses.push((next_op, fts_term));
7692 }
7693 next_op = "AND";
7694 }
7695 FsCassQueryToken::Phrase(p) => {
7696 let phrase_parts = normalize_phrase_terms(&p);
7697 if phrase_parts.is_empty() {
7698 continue;
7699 }
7700 let fts_phrase = format!("\"{}\"", phrase_parts.join(" "));
7701
7702 if in_or_sequence {
7703 if pending_or_group.is_empty() {
7704 let (op, _) = fts_clauses.last()?;
7705 if *op != "AND" {
7706 return None;
7709 }
7710 let (_, val) = fts_clauses.pop()?;
7711 pending_or_group.push(val);
7712 }
7713 pending_or_group.push(fts_phrase);
7714 in_or_sequence = true;
7715 } else {
7716 fts_clauses.push((next_op, fts_phrase));
7717 }
7718 next_op = "AND";
7719 }
7720 }
7721 }
7722
7723 if !pending_or_group.is_empty() {
7724 let group = if pending_or_group.len() > 1 {
7725 format!("({})", pending_or_group.join(" OR "))
7726 } else {
7727 pending_or_group.pop().unwrap_or_default()
7728 };
7729 fts_clauses.push((next_op, group));
7730 }
7731
7732 if fts_clauses.is_empty() {
7733 return Some("".to_string());
7734 }
7735
7736 if fts_clauses.first().is_some_and(|(op, _)| *op == "NOT") {
7739 return None;
7740 }
7741
7742 let mut query = String::new();
7744 for (i, (op, text)) in fts_clauses.into_iter().enumerate() {
7745 if i > 0 {
7746 query.push_str(&format!(" {} ", op));
7747 }
7748 query.push_str(&text);
7749 }
7750
7751 Some(query)
7752}
7753
/// Counters describing cache and reader-reload activity.
///
/// Every counter lives behind an `Arc`, so clones of a `Metrics` value observe and
/// update the same numbers. All accesses use relaxed atomic ordering.
#[derive(Default, Clone)]
struct Metrics {
    /// Incremented by `inc_cache_hits`.
    cache_hits: Arc<AtomicU64>,
    /// Incremented by `inc_cache_miss`.
    cache_miss: Arc<AtomicU64>,
    /// Incremented by `inc_cache_shortfall`.
    cache_shortfall: Arc<AtomicU64>,
    /// Number of recorded reloads.
    reloads: Arc<AtomicU64>,
    /// Sum of recorded reload durations, in milliseconds.
    reload_ms_total: Arc<AtomicU64>,
    /// Incremented by `inc_prewarm_scheduled`.
    prewarm_scheduled: Arc<AtomicU64>,
    /// Incremented by `inc_prewarm_skipped_pressure`.
    prewarm_skipped_pressure: Arc<AtomicU64>,
}

impl Metrics {
    /// Adds one to the cache-hit counter.
    fn inc_cache_hits(&self) {
        self.cache_hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the cache-miss counter.
    fn inc_cache_miss(&self) {
        self.cache_miss.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the cache-shortfall counter.
    fn inc_cache_shortfall(&self) {
        self.cache_shortfall.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the prewarm-scheduled counter.
    fn inc_prewarm_scheduled(&self) {
        self.prewarm_scheduled.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the prewarm-skipped-under-pressure counter.
    fn inc_prewarm_skipped_pressure(&self) {
        self.prewarm_skipped_pressure
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the reload counter without recording a duration.
    fn inc_reload(&self) {
        self.reloads.fetch_add(1, Ordering::Relaxed);
    }

    /// Counts one reload and adds its duration, in whole milliseconds, to the
    /// running total.
    fn record_reload(&self, duration: Duration) {
        self.inc_reload();
        // `as_millis` yields a u128; saturate instead of silently truncating the
        // high bits for absurdly long durations.
        let millis = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
        self.reload_ms_total.fetch_add(millis, Ordering::Relaxed);
    }

    /// Returns `(cache_hits, cache_miss, cache_shortfall, reloads, reload_ms_total)`.
    ///
    /// The five loads are independent, so the tuple is not an atomic snapshot.
    fn snapshot_all(&self) -> (u64, u64, u64, u64, u128) {
        (
            self.cache_hits.load(Ordering::Relaxed),
            self.cache_miss.load(Ordering::Relaxed),
            self.cache_shortfall.load(Ordering::Relaxed),
            self.reloads.load(Ordering::Relaxed),
            u128::from(self.reload_ms_total.load(Ordering::Relaxed)),
        )
    }

    /// Returns `(prewarm_scheduled, prewarm_skipped_pressure)`.
    fn snapshot_prewarm(&self) -> (u64, u64) {
        (
            self.prewarm_scheduled.load(Ordering::Relaxed),
            self.prewarm_skipped_pressure.load(Ordering::Relaxed),
        )
    }

    /// Zeroes every counter (test builds only).
    #[cfg(test)]
    #[allow(dead_code)]
    fn reset(&self) {
        self.cache_hits.store(0, Ordering::Relaxed);
        self.cache_miss.store(0, Ordering::Relaxed);
        self.cache_shortfall.store(0, Ordering::Relaxed);
        self.reloads.store(0, Ordering::Relaxed);
        self.reload_ms_total.store(0, Ordering::Relaxed);
        self.prewarm_scheduled.store(0, Ordering::Relaxed);
        self.prewarm_skipped_pressure.store(0, Ordering::Relaxed);
    }
}
7820
7821fn maybe_spawn_warm_worker(
7822 reader: IndexReader,
7823 fields: FsCassFields,
7824 reload_epoch: Arc<AtomicU64>,
7825 metrics: Metrics,
7826) -> Option<(mpsc::Sender<WarmJob>, std::thread::JoinHandle<()>)> {
7827 let (tx, rx) = mpsc::unbounded::<WarmJob>();
7828 let handle = std::thread::Builder::new()
7829 .name("cass-warm-worker".into())
7830 .spawn(move || {
7831 let mut last_run = Instant::now();
7833 while let Ok(job) = rx.recv() {
7834 let now = Instant::now();
7835 if now.duration_since(last_run) < Duration::from_millis(*WARM_DEBOUNCE_MS) {
7836 continue;
7837 }
7838 last_run = now;
7839 let reload_started = Instant::now();
7840 if let Err(err) = reader.reload() {
7841 tracing::warn!(error = ?err, "warm_worker_reload_failed");
7842 continue;
7843 }
7844 let elapsed = reload_started.elapsed();
7845 let epoch = reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
7846 metrics.record_reload(elapsed);
7847 tracing::debug!(
7848 duration_ms = elapsed.as_millis() as u64,
7849 reload_epoch = epoch,
7850 filters = %job.filters_fingerprint,
7851 shard = %job.shard_name,
7852 "warm_worker_reload"
7853 );
7854 let searcher = reader.searcher();
7857 let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
7858 for term_str in job.query.split_whitespace() {
7859 let term_lower = term_str.to_lowercase();
7860 let term_shoulds: Vec<(Occur, Box<dyn Query>)> = vec![
7861 (
7862 Occur::Should,
7863 Box::new(TermQuery::new(
7864 Term::from_field_text(fields.title, &term_lower),
7865 IndexRecordOption::WithFreqsAndPositions,
7866 )),
7867 ),
7868 (
7869 Occur::Should,
7870 Box::new(TermQuery::new(
7871 Term::from_field_text(fields.content, &term_lower),
7872 IndexRecordOption::WithFreqsAndPositions,
7873 )),
7874 ),
7875 ];
7876 clauses.push((Occur::Must, Box::new(BooleanQuery::new(term_shoulds))));
7877 }
7878 if !clauses.is_empty() {
7879 let q: Box<dyn Query> = Box::new(BooleanQuery::new(clauses));
7880 let _ = searcher.search(&q, &TopDocs::with_limit(1).order_by_score());
7881 }
7882 }
7883 })
7884 .ok()?;
7885 Some((tx, handle))
7886}
7887
7888fn cached_hit_from(hit: &SearchHit) -> CachedHit {
7889 let cache_text = if hit.content.is_empty() {
7890 hit.snippet.as_str()
7891 } else {
7892 hit.content.as_str()
7893 };
7894 let lc_content = cache_text.to_lowercase();
7895 let lc_title = (!hit.title.is_empty()).then(|| hit.title.to_lowercase());
7896 let bloom64 = bloom_from_text(&lc_content, &lc_title);
7898 CachedHit {
7899 hit: hit.clone(),
7900 lc_content,
7901 lc_title,
7902 bloom64,
7903 }
7904}
7905
7906fn bloom_from_text(content: &str, title: &Option<String>) -> u64 {
7907 let mut bits = 0u64;
7908 for token in token_stream(content) {
7909 bits |= hash_token(token);
7910 }
7911 if let Some(t) = title {
7912 for token in token_stream(t) {
7913 bits |= hash_token(token);
7914 }
7915 }
7916 bits
7917}
7918
/// Iterates over the maximal runs of alphanumeric characters in `text`.
///
/// Every non-alphanumeric character acts as a separator; empty pieces produced by
/// adjacent separators are skipped.
fn token_stream(text: &str) -> impl Iterator<Item = &str> {
    fn is_separator(ch: char) -> bool {
        !ch.is_alphanumeric()
    }

    text.split(is_separator).filter(|piece| !piece.is_empty())
}
7923
/// Maps a token to a single bit of a 64-bit mask.
///
/// The bytes of `tok` are folded into a hash that starts at 5381 and is updated as
/// `hash * 33 + byte` with wrapping arithmetic; the bit index is that hash
/// modulo 64.
fn hash_token(tok: &str) -> u64 {
    let hash = tok.bytes().fold(5381u64, |acc, byte| {
        acc.wrapping_mul(33).wrapping_add(u64::from(byte))
    });
    1u64 << (hash % 64)
}
7932
7933struct QueryTermsLower {
7943 query_lower: String,
7945 token_ranges: Vec<(usize, usize)>,
7947 bloom_mask: u64,
7949}
7950
7951impl QueryTermsLower {
7952 fn from_query(query: &str) -> Self {
7954 if query.is_empty() {
7955 return Self {
7956 query_lower: String::new(),
7957 token_ranges: Vec::new(),
7958 bloom_mask: 0,
7959 };
7960 }
7961
7962 let query_lower = query.to_lowercase();
7963 let mut token_ranges = Vec::new();
7964 let mut bloom_mask = 0u64;
7965
7966 let mut start = None;
7968 for (i, c) in query_lower.char_indices() {
7969 if c.is_alphanumeric() {
7970 if start.is_none() {
7971 start = Some(i);
7972 }
7973 } else if let Some(s) = start.take() {
7974 let token = &query_lower[s..i];
7975 bloom_mask |= hash_token(token);
7976 token_ranges.push((s, i));
7977 }
7978 }
7979 if let Some(s) = start {
7981 let token = &query_lower[s..];
7982 bloom_mask |= hash_token(token);
7983 token_ranges.push((s, query_lower.len()));
7984 }
7985
7986 Self {
7987 query_lower,
7988 token_ranges,
7989 bloom_mask,
7990 }
7991 }
7992
7993 #[inline]
7995 fn is_empty(&self) -> bool {
7996 self.token_ranges.is_empty()
7997 }
7998
7999 #[inline]
8001 fn tokens(&self) -> impl Iterator<Item = &str> {
8002 self.token_ranges
8003 .iter()
8004 .map(|(s, e)| &self.query_lower[*s..*e])
8005 }
8006
8007 #[inline]
8009 fn bloom_mask(&self) -> u64 {
8010 self.bloom_mask
8011 }
8012}
8013
8014fn hit_matches_query_cached_precomputed(hit: &CachedHit, terms: &QueryTermsLower) -> bool {
8017 if terms.is_empty() {
8018 return true;
8019 }
8020
8021 if hit.bloom64 & terms.bloom_mask() != terms.bloom_mask() {
8023 return false;
8024 }
8025
8026 terms.tokens().all(|t| {
8028 if token_stream(&hit.lc_content).any(|word| word.starts_with(t)) {
8030 return true;
8031 }
8032 if let Some(title) = &hit.lc_title
8034 && token_stream(title).any(|word| word.starts_with(t))
8035 {
8036 return true;
8037 }
8038 false
8039 })
8040}
8041
/// Test-only convenience wrapper: tokenizes `query` with
/// `QueryTermsLower::from_query` and forwards to
/// `hit_matches_query_cached_precomputed`.
#[cfg(test)]
fn hit_matches_query_cached(hit: &CachedHit, query: &str) -> bool {
    let terms = QueryTermsLower::from_query(query);
    hit_matches_query_cached_precomputed(hit, &terms)
}
8049
/// True when `query` is exactly one whitespace-delimited word made up solely of
/// alphanumeric characters.
fn is_prefix_only(query: &str) -> bool {
    let mut words = query.split_whitespace();
    match (words.next(), words.next()) {
        (Some(only_word), None) => only_word.chars().all(char::is_alphanumeric),
        _ => false,
    }
}
8059
/// Returns the first `max_chars` characters of `content`, followed by `…` when
/// more text remains.
fn prefix_snippet_leading_chars(content: &str, max_chars: usize) -> String {
    let mut chars = content.chars();
    let mut snippet: String = chars.by_ref().take(max_chars).collect();
    if chars.next().is_some() {
        snippet.push('…');
    }
    snippet
}

/// Maps the byte range `lc_start..lc_end` of `content.to_lowercase()` back to a
/// `(first_char_index, char_count)` span in `content` itself.
///
/// Lowercasing can change how many characters a string has (for example `İ`
/// becomes two characters), so positions measured in the lowercased text cannot be
/// reused directly as positions in the original. This walks the original
/// characters while tracking how many lowercased bytes each one accounts for.
fn prefix_snippet_match_char_span(
    content: &str,
    lc_start: usize,
    lc_end: usize,
) -> Option<(usize, usize)> {
    let mut lc_offset = 0usize;
    let mut first_char: Option<usize> = None;
    for (char_idx, ch) in content.chars().enumerate() {
        lc_offset += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        if lc_offset <= lc_start {
            // This character ends before the match begins.
            continue;
        }
        let start = *first_char.get_or_insert(char_idx);
        if lc_offset >= lc_end {
            return Some((start, char_idx + 1 - start));
        }
    }
    None
}

/// Finds the first case-insensitive occurrence of `query` in `content` and renders
/// a snippet around it: up to 15 characters of leading context, the match wrapped
/// in `**`, then trailing text until `max_chars` characters have been emitted. A
/// trailing `…` marks content that was cut off.
///
/// The match is always emitted whole once it is reached, even if that exceeds
/// `max_chars`; if `max_chars` is used up by the leading context the match is not
/// reached and no markers are emitted. Returns `None` when there is no match.
fn prefix_snippet_highlight_match(content: &str, query: &str, max_chars: usize) -> Option<String> {
    const LEAD_CONTEXT_CHARS: usize = 15;

    let lc_content = content.to_lowercase();
    let lc_query = query.to_lowercase();
    let lc_start = lc_content.find(&lc_query)?;
    let (match_start, match_len) =
        prefix_snippet_match_char_span(content, lc_start, lc_start + lc_query.len())?;

    let window_start = match_start.saturating_sub(LEAD_CONTEXT_CHARS);
    let mut chars = content.chars().skip(window_start);
    let mut snippet = String::new();

    let lead = (match_start - window_start).min(max_chars);
    snippet.extend(chars.by_ref().take(lead));
    if lead < max_chars {
        snippet.push_str("**");
        snippet.extend(chars.by_ref().take(match_len));
        snippet.push_str("**");
        let remaining = max_chars.saturating_sub(lead + match_len);
        snippet.extend(chars.by_ref().take(remaining));
    }
    if chars.next().is_some() {
        snippet.push('…');
    }
    Some(snippet)
}

/// Builds a short snippet of `content` for `query`.
///
/// With an empty query, or when `query` does not occur in `content`
/// (case-insensitively), the snippet is simply the first `max_chars` characters.
/// Otherwise it is centred on the first match, which is wrapped in `**`. A trailing
/// `…` is appended whenever content was cut off.
fn quick_prefix_snippet(content: &str, query: &str, max_chars: usize) -> String {
    if query.is_empty() {
        return prefix_snippet_leading_chars(content, max_chars);
    }
    prefix_snippet_highlight_match(content, query, max_chars)
        .unwrap_or_else(|| prefix_snippet_leading_chars(content, max_chars))
}

/// Like `quick_prefix_snippet`, but returns `None` instead of a plain leading
/// snippet when `query` is blank or does not occur in `content`.
fn cached_prefix_snippet(content: &str, query: &str, max_chars: usize) -> Option<String> {
    if query.trim().is_empty() {
        return None;
    }
    prefix_snippet_highlight_match(content, query, max_chars)
}
8178
8179fn filters_fingerprint(filters: &SearchFilters) -> String {
8180 let mut parts = Vec::new();
8181 if !filters.agents.is_empty() {
8182 let mut v: Vec<_> = filters.agents.iter().cloned().collect();
8183 v.sort();
8184 parts.push(format!("a:{v:?}"));
8185 }
8186 if !filters.workspaces.is_empty() {
8187 let mut v: Vec<_> = filters.workspaces.iter().cloned().collect();
8188 v.sort();
8189 parts.push(format!("w:{v:?}"));
8190 }
8191 if let Some(f) = filters.created_from {
8192 parts.push(format!("from:{f}"));
8193 }
8194 if let Some(t) = filters.created_to {
8195 parts.push(format!("to:{t}"));
8196 }
8197 if !matches!(
8199 filters.source_filter,
8200 crate::sources::provenance::SourceFilter::All
8201 ) {
8202 parts.push(format!("src:{:?}", filters.source_filter));
8203 }
8204 if !filters.session_paths.is_empty() {
8206 let mut v: Vec<_> = filters.session_paths.iter().cloned().collect();
8207 v.sort();
8208 parts.push(format!("sp:{v:?}"));
8209 }
8210 parts.join("|")
8211}
8212
8213impl SearchClient {
8214 pub fn total_docs(&self) -> usize {
8216 if let Some((reader, _)) = &self.reader {
8217 return reader.searcher().num_docs() as usize;
8218 }
8219 self.federated_readers()
8220 .map(|readers| {
8221 readers
8222 .iter()
8223 .map(|shard| shard.reader.searcher().num_docs() as usize)
8224 .sum()
8225 })
8226 .unwrap_or(0)
8227 }
8228
    /// True when either a single index reader or a set of federated readers is
    /// available.
    pub fn has_tantivy(&self) -> bool {
        self.reader.is_some() || self.federated_readers().is_some()
    }
8233
8234 fn maybe_reload_reader(&self, reader: &IndexReader) -> Result<()> {
8235 if !self.reload_on_search {
8236 return Ok(());
8237 }
8238 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
8239 let now = Instant::now();
8240 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
8241 if guard
8242 .map(|t| now.duration_since(t) >= MIN_RELOAD_INTERVAL)
8243 .unwrap_or(true)
8244 {
8245 let reload_started = Instant::now();
8246 reader.reload()?;
8247 let elapsed = reload_started.elapsed();
8248 *guard = Some(now);
8249 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
8250 self.metrics.record_reload(elapsed);
8251 tracing::debug!(
8252 duration_ms = elapsed.as_millis() as u64,
8253 reload_epoch = epoch,
8254 "tantivy_reader_reload"
8255 );
8256 }
8257 Ok(())
8258 }
8259
8260 fn maybe_log_cache_metrics(&self, event: &str) {
8261 if !*CACHE_DEBUG_ENABLED {
8262 return;
8263 }
8264 let stats = self.cache_stats();
8265 tracing::debug!(
8266 event = event,
8267 hits = stats.cache_hits,
8268 miss = stats.cache_miss,
8269 shortfall = stats.cache_shortfall,
8270 reloads = stats.reloads,
8271 reload_ms_total = stats.reload_ms_total,
8272 total_cap = stats.total_cap,
8273 total_cost = stats.total_cost,
8274 evictions = stats.eviction_count,
8275 approx_bytes = stats.approx_bytes,
8276 byte_cap = stats.byte_cap,
8277 eviction_policy = stats.eviction_policy,
8278 ghost_entries = stats.ghost_entries,
8279 admission_rejects = stats.admission_rejects,
8280 "cache_metrics"
8281 );
8282 }
8283
8284 fn cache_key(&self, query: &str, filters: &SearchFilters) -> Arc<str> {
8287 let key_str = format!(
8288 "{}|{}::{}",
8289 self.cache_namespace,
8290 query,
8291 filters_fingerprint(filters)
8292 );
8293 intern_cache_key(&key_str)
8294 }
8295
8296 fn shard_name(&self, filters: &SearchFilters) -> String {
8297 if filters.agents.len() == 1 {
8298 format!(
8299 "agent:{}",
8300 filters
8301 .agents
8302 .iter()
8303 .next()
8304 .cloned()
8305 .unwrap_or_else(|| "global".into())
8306 )
8307 } else if filters.workspaces.len() == 1 {
8308 format!(
8309 "workspace:{}",
8310 filters
8311 .workspaces
8312 .iter()
8313 .next()
8314 .cloned()
8315 .unwrap_or_else(|| "global".into())
8316 )
8317 } else {
8318 "global".into()
8319 }
8320 }
8321 fn cached_prefix_key_exists_in_shard(
8322 &self,
8323 shard: &LruCache<Arc<str>, Vec<CachedHit>>,
8324 query: &str,
8325 filters: &SearchFilters,
8326 ) -> bool {
8327 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
8328 byte_indices.push(query.len());
8329 let query_len = query.len();
8330 for &end in byte_indices.iter().rev() {
8331 if end == 0 || end == query_len {
8332 continue;
8333 }
8334 let key = self.cache_key(&query[..end], filters);
8335 if shard.contains(&key) {
8336 return true;
8337 }
8338 }
8339 false
8340 }
8341
8342 fn maybe_schedule_adaptive_query_prewarm(&self, query: &str, filters: &SearchFilters) {
8343 if query.is_empty() {
8344 return;
8345 }
8346 let Some(tx) = &self.warm_tx else {
8347 return;
8348 };
8349
8350 let shard_name = self.shard_name(filters);
8351 let decision = match self.prefix_cache.lock() {
8352 Ok(cache) => {
8353 let hot_prefix = cache.shard_opt(&shard_name).is_some_and(|shard| {
8354 self.cached_prefix_key_exists_in_shard(shard, query, filters)
8355 });
8356 if !hot_prefix {
8357 AdaptivePrewarmDecision::SkipCold
8358 } else if cache.prewarm_pressure() {
8359 AdaptivePrewarmDecision::SkipPressure
8360 } else {
8361 AdaptivePrewarmDecision::Schedule
8362 }
8363 }
8364 Err(_) => return,
8365 };
8366
8367 if decision == AdaptivePrewarmDecision::SkipPressure {
8368 self.metrics.inc_prewarm_skipped_pressure();
8369 return;
8370 }
8371 if decision == AdaptivePrewarmDecision::SkipCold {
8372 return;
8373 }
8374
8375 if tx
8376 .send(WarmJob {
8377 query: query.to_string(),
8378 filters_fingerprint: filters_fingerprint(filters),
8379 shard_name,
8380 })
8381 .is_ok()
8382 {
8383 self.metrics.inc_prewarm_scheduled();
8384 }
8385 }
8386
8387 fn cached_prefix_hits(&self, query: &str, filters: &SearchFilters) -> Option<Vec<CachedHit>> {
8388 if query.is_empty() {
8389 return None;
8390 }
8391 let cache = self.prefix_cache.lock().ok()?;
8392 let shard_name = self.shard_name(filters);
8393 let shard = cache.shard_opt(&shard_name)?;
8394 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
8396 byte_indices.push(query.len());
8397 for &end in byte_indices.iter().rev() {
8398 if end == 0 {
8399 continue;
8400 }
8401 let key = self.cache_key(&query[..end], filters);
8402 if let Some(hits) = shard.peek(&key) {
8404 return Some(hits.clone());
8405 }
8406 }
8407 None
8408 }
8409
8410 fn put_cache(&self, query: &str, filters: &SearchFilters, hits: &[SearchHit]) {
8411 if query.is_empty() || hits.is_empty() {
8412 return;
8413 }
8414 if let Ok(mut cache) = self.prefix_cache.lock() {
8415 let shard_name = self.shard_name(filters);
8416 let key = self.cache_key(query, filters);
8417 let cached_hits: Vec<CachedHit> = hits.iter().map(cached_hit_from).collect();
8418 cache.put(&shard_name, key, cached_hits);
8419 }
8420 }
8421
8422 pub fn cache_stats(&self) -> CacheStats {
8423 let (hits, miss, shortfall, reloads, reload_ms_total) = self.metrics.snapshot_all();
8424 let (prewarm_scheduled, prewarm_skipped_pressure) = self.metrics.snapshot_prewarm();
8425 let reader_generation = self.last_generation.lock().ok().and_then(|guard| *guard);
8426 let (
8427 total_cap,
8428 total_cost,
8429 eviction_count,
8430 approx_bytes,
8431 byte_cap,
8432 eviction_policy,
8433 ghost_entries,
8434 admission_rejects,
8435 ) = if let Ok(cache) = self.prefix_cache.lock() {
8436 (
8437 cache.total_cap(),
8438 cache.total_cost(),
8439 cache.eviction_count(),
8440 cache.total_bytes(),
8441 cache.byte_cap(),
8442 cache.policy_label(),
8443 cache.ghost_entries(),
8444 cache.admission_rejects(),
8445 )
8446 } else {
8447 (0, 0, 0, 0, 0, "unknown", 0, 0)
8448 };
8449 CacheStats {
8450 cache_hits: hits,
8451 cache_miss: miss,
8452 cache_shortfall: shortfall,
8453 reloads,
8454 reload_ms_total,
8455 total_cap,
8456 total_cost,
8457 eviction_count,
8458 approx_bytes,
8459 byte_cap,
8460 eviction_policy,
8461 ghost_entries,
8462 admission_rejects,
8463 prewarm_scheduled,
8464 prewarm_skipped_pressure,
8465 reader_generation,
8466 }
8467 }
8468}
8469
8470#[cfg(test)]
8471mod tests {
8472 use super::*;
8473 use crate::connectors::{NormalizedConversation, NormalizedMessage, NormalizedSnippet};
8474 use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
8475 use crate::search::tantivy::TantivyIndex;
8476 use crate::storage::sqlite::FrankenStorage;
8477 use frankensqlite::Connection as FrankenConnection;
8478 use frankensqlite::compat::ParamValue;
8479 use serde_json::json;
8480 use tempfile::TempDir;
8481
    /// Reference (v0) doc-id encoding of a [`SearchHitKey`], kept as a plain `format!`.
    ///
    /// The seven key fields are written in declaration order below, separated by the
    /// ASCII unit separator (`U+001F`). Optional fields that are `None` render as the
    /// empty string. Tests compare `search_hit_key_doc_id` against this output, so this
    /// helper should not be "optimized".
    fn search_hit_key_doc_id_reference_v0(key: &SearchHitKey) -> String {
        // Field delimiter: ASCII unit separator.
        let sep = '\u{1f}';
        format!(
            "{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}",
            key.source_id,
            key.source_path,
            key.conversation_id
                .map(|v| v.to_string())
                .unwrap_or_default(),
            key.title,
            key.line_number.map(|v| v.to_string()).unwrap_or_default(),
            key.created_at.map(|v| v.to_string()).unwrap_or_default(),
            key.content_hash,
        )
    }
8500
    /// Reference (v0) implementation of the stable hit hash, used as a test oracle for
    /// `stable_hit_hash`.
    ///
    /// Feeds an XXH3 hasher with, in order: the little-endian bytes of
    /// `stable_content_hash(content)` (skipped when `content` is empty), `|`, the source
    /// path bytes, `|`, the decimal line number (if any), `|`, and the decimal timestamp
    /// (if any). Returns the resulting 64-bit digest.
    fn stable_hit_hash_reference_v0(
        content: &str,
        source_path: &str,
        line_number: Option<usize>,
        created_at: Option<i64>,
    ) -> u64 {
        use xxhash_rust::xxh3::Xxh3;

        let mut hasher = Xxh3::new();
        // Empty content contributes no bytes at all; only the delimiters remain.
        if !content.is_empty() {
            hasher.update(&stable_content_hash(content).to_le_bytes());
        }
        hasher.update(b"|");
        hasher.update(source_path.as_bytes());
        hasher.update(b"|");
        // Optional numbers are hashed as their decimal text, not as raw integer bytes.
        if let Some(line) = line_number {
            hasher.update(line.to_string().as_bytes());
        }
        hasher.update(b"|");
        if let Some(ts) = created_at {
            hasher.update(ts.to_string().as_bytes());
        }
        hasher.digest()
    }
8525
8526 fn vector_result(message_id: u64, score: f32) -> VectorSearchResult {
8527 VectorSearchResult {
8528 message_id,
8529 chunk_idx: 0,
8530 score,
8531 }
8532 }
8533
8534 #[test]
8535 fn semantic_exact_candidate_limit_overfetches_chunks_without_full_scan() {
8536 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 1_000), 40);
8537 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 25), 25);
8538 assert_eq!(SearchClient::semantic_exact_candidate_limit(0, 1_000), 0);
8539 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 0), 0);
8540 }
8541
8542 #[test]
8543 fn semantic_window_detects_possible_hidden_chunk_competitors() {
8544 let complete = vec![
8545 vector_result(1, 0.9),
8546 vector_result(2, 0.8),
8547 vector_result(3, 0.7),
8548 ];
8549 assert!(
8550 !SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.6)),
8551 "strictly lower omitted chunks cannot alter the top message window"
8552 );
8553 assert!(
8554 SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.7)),
8555 "equal-score omitted chunks can still alter deterministic tie-breaking"
8556 );
8557
8558 let duplicate_collapsed_shortfall = vec![vector_result(1, 0.9)];
8559 assert!(
8560 SearchClient::semantic_window_may_omit_competitor(
8561 &duplicate_collapsed_shortfall,
8562 3,
8563 Some(0.2),
8564 ),
8565 "a short collapsed window means high-scoring duplicate chunks may have hidden messages"
8566 );
8567 assert!(!SearchClient::semantic_window_may_omit_competitor(
8568 &complete, 3, None
8569 ));
8570 }
8571
8572 #[test]
8573 fn stable_hit_hash_matches_reference_and_is_deterministic() {
8574 let fixtures = [
8575 ("", "", None, None),
8576 (
8577 "same content\nnormalized",
8578 "/tmp/session.jsonl",
8579 Some(1),
8580 Some(0),
8581 ),
8582 (
8583 "tool output with repeated whitespace",
8584 "/tmp/path with spaces.jsonl",
8585 Some(42),
8586 Some(1_700_000_000_000),
8587 ),
8588 (
8589 "unicode stays in the content hash path: café",
8590 "/remote/host/session.jsonl",
8591 Some(usize::MAX),
8592 Some(i64::MIN),
8593 ),
8594 (
8595 "negative timestamp fixture",
8596 "/tmp/negative.jsonl",
8597 None,
8598 Some(-123_456),
8599 ),
8600 ];
8601
8602 for (content, source_path, line_number, created_at) in fixtures {
8603 let optimized = stable_hit_hash(content, source_path, line_number, created_at);
8604 let repeated = stable_hit_hash(content, source_path, line_number, created_at);
8605 let reference =
8606 stable_hit_hash_reference_v0(content, source_path, line_number, created_at);
8607
8608 assert_eq!(optimized, repeated);
8609 assert_eq!(optimized, reference);
8610 }
8611 }
8612
8613 #[test]
8614 fn semantic_message_id_from_db_rejects_negative_values() {
8615 let err = semantic_message_id_from_db(-1).expect_err("negative DB ids must be rejected");
8616 assert!(
8617 err.to_string().contains("negative message_id"),
8618 "unexpected error: {err}"
8619 );
8620 assert_eq!(semantic_message_id_from_db(42).expect("positive id"), 42);
8621 }
8622
8623 #[test]
8624 fn semantic_doc_component_id_from_db_clamps_bounds() {
8625 assert_eq!(semantic_doc_component_id_from_db(None), 0);
8626 assert_eq!(semantic_doc_component_id_from_db(Some(-7)), 0);
8627 assert_eq!(semantic_doc_component_id_from_db(Some(0)), 0);
8628 assert_eq!(semantic_doc_component_id_from_db(Some(7)), 7);
8629 assert_eq!(
8630 semantic_doc_component_id_from_db(Some(i64::from(u32::MAX) + 123)),
8631 u32::MAX
8632 );
8633 }
8634
    /// Verifies that `search_hit_key_doc_id` produces exactly the same string as the v0
    /// reference encoder for a range of keys, and that a key without embedded
    /// separators encodes with exactly six `U+001F` delimiters.
    #[test]
    fn search_hit_key_doc_id_matches_reference_byte_for_byte() {
        let fixtures = [
            // Typical fully-populated key.
            SearchHitKey {
                source_id: "local".into(),
                source_path: "/tmp/path.jsonl".into(),
                conversation_id: Some(42),
                title: "Demo chat".into(),
                line_number: Some(7),
                created_at: Some(1_700_000_000_000),
                content_hash: 0xdead_beef_u64,
            },
            // All optional fields absent, empty title, zero hash.
            SearchHitKey {
                source_id: "ssh:host".into(),
                source_path: "/remote/path with spaces.jsonl".into(),
                conversation_id: None,
                title: String::new(),
                line_number: None,
                created_at: None,
                content_hash: 0,
            },
            // Empty strings, extreme integer values, non-ASCII title.
            SearchHitKey {
                source_id: String::new(),
                source_path: String::new(),
                conversation_id: Some(i64::MIN),
                title: "unicode title — héllo".into(),
                line_number: Some(usize::MAX),
                created_at: Some(i64::MAX),
                content_hash: u64::MAX,
            },
            // Single-character strings and zero-valued numbers.
            SearchHitKey {
                source_id: "a".into(),
                source_path: "b".into(),
                conversation_id: Some(0),
                title: "c".into(),
                line_number: Some(0),
                created_at: Some(0),
                content_hash: 0,
            },
            // String fields that themselves contain the separator character.
            SearchHitKey {
                source_id: "with\u{1f}separator".into(),
                source_path: "with\u{1f}separator".into(),
                conversation_id: Some(-1),
                title: "with\u{1f}separator".into(),
                line_number: None,
                created_at: Some(-1),
                content_hash: 1,
            },
        ];
        for (idx, key) in fixtures.iter().enumerate() {
            let optimized = search_hit_key_doc_id(key);
            let reference = search_hit_key_doc_id_reference_v0(key);
            assert_eq!(
                optimized, reference,
                "fixture {idx} produced divergent doc_id; byte-exact dedup key is a contract"
            );
        }

        // Structural check: with no separators inside any field, seven fields must be
        // joined by exactly six delimiters.
        let structural_key = SearchHitKey {
            source_id: "clean".into(),
            source_path: "/no/separators/here.jsonl".into(),
            conversation_id: Some(1),
            title: "plain title".into(),
            line_number: Some(2),
            created_at: Some(3),
            content_hash: 4,
        };
        let encoded = search_hit_key_doc_id(&structural_key);
        assert_eq!(
            encoded.matches('\u{1f}').count(),
            6,
            "structural fixture must contain exactly six 0x1F separators; got {encoded:?}"
        );
    }
8713
    /// Test embedder that ignores its input and always returns the same vector.
    #[derive(Debug)]
    struct FixedTestEmbedder {
        /// Identifier reported through `Embedder::id`.
        id: String,
        /// Vector returned for every text; its length is the reported dimension.
        vector: Vec<f32>,
    }
8719
8720 impl FixedTestEmbedder {
8721 fn new(id: &str, vector: &[f32]) -> Self {
8722 Self {
8723 id: id.to_string(),
8724 vector: vector.to_vec(),
8725 }
8726 }
8727 }
8728
    /// Test embedder whose `embed_sync` announces that it started and then blocks until
    /// the test sends an unblock signal, letting tests observe state while embedding is
    /// in flight.
    #[derive(Debug)]
    struct BlockingTestEmbedder {
        /// Identifier reported through `Embedder::id`.
        id: String,
        /// Vector returned once the embedder has been unblocked.
        vector: Vec<f32>,
        /// Sender used to signal "embedding started"; taken (set to `None`) on first use.
        started_tx: Mutex<Option<std::sync::mpsc::Sender<()>>>,
        /// Receiver the embedder blocks on until the test sends `()`.
        unblock_rx: Mutex<std::sync::mpsc::Receiver<()>>,
    }
8736
8737 impl BlockingTestEmbedder {
8738 fn new(
8739 id: &str,
8740 vector: &[f32],
8741 started_tx: std::sync::mpsc::Sender<()>,
8742 unblock_rx: std::sync::mpsc::Receiver<()>,
8743 ) -> Self {
8744 Self {
8745 id: id.to_string(),
8746 vector: vector.to_vec(),
8747 started_tx: Mutex::new(Some(started_tx)),
8748 unblock_rx: Mutex::new(unblock_rx),
8749 }
8750 }
8751 }
8752
8753 impl crate::search::embedder::Embedder for BlockingTestEmbedder {
8754 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
8755 if let Ok(mut guard) = self.started_tx.lock()
8756 && let Some(tx) = guard.take()
8757 {
8758 let _ = tx.send(());
8759 }
8760 self.unblock_rx
8761 .lock()
8762 .expect("blocking embedder receiver")
8763 .recv()
8764 .expect("blocking embedder unblock signal");
8765 Ok(self.vector.clone())
8766 }
8767
8768 fn dimension(&self) -> usize {
8769 self.vector.len()
8770 }
8771
8772 fn id(&self) -> &str {
8773 &self.id
8774 }
8775
8776 fn is_semantic(&self) -> bool {
8777 false
8778 }
8779
8780 fn category(&self) -> frankensearch::ModelCategory {
8781 frankensearch::ModelCategory::HashEmbedder
8782 }
8783 }
8784
8785 impl crate::search::embedder::Embedder for FixedTestEmbedder {
8786 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
8787 Ok(self.vector.clone())
8788 }
8789
8790 fn dimension(&self) -> usize {
8791 self.vector.len()
8792 }
8793
8794 fn id(&self) -> &str {
8795 &self.id
8796 }
8797
8798 fn is_semantic(&self) -> bool {
8799 false
8800 }
8801
8802 fn category(&self) -> frankensearch::ModelCategory {
8803 frankensearch::ModelCategory::HashEmbedder
8804 }
8805 }
8806
    /// Bundle returned by the semantic-search fixture builders.
    struct SemanticTestFixture {
        /// Owns the temporary directory so the fixture files live as long as the fixture.
        _dir: TempDir,
        /// Search client opened on the fixture directory with semantic context installed.
        client: SearchClient,
        /// Semantic doc-id strings of the fixture messages, in insertion order.
        doc_ids: Vec<String>,
        /// Source paths of the fixture conversations, in insertion order.
        source_paths: Vec<String>,
    }
8813
    /// Bundle returned by the progressive-hybrid fixture builder.
    struct ProgressiveHybridFixture {
        /// Owns the temporary directory so the fixture files live as long as the fixture.
        _dir: TempDir,
        /// Shared search client for the fixture.
        client: Arc<SearchClient>,
        /// Query text associated with the fixture.
        query: String,
    }
8819
8820 fn projected_minimal_fields_search_hit(title: &str, source_path: &str) -> SearchHit {
8826 SearchHit {
8827 title: title.to_string(),
8828 snippet: String::new(),
8829 content: String::new(),
8830 content_hash: 0,
8831 conversation_id: Some(42),
8832 score: 1.0,
8833 source_path: source_path.to_string(),
8834 agent: "test-agent".into(),
8835 workspace: "/tmp/workspace".into(),
8836 workspace_original: None,
8837 created_at: Some(1_700_000_000_000),
8838 line_number: Some(1),
8839 match_type: MatchType::default(),
8840 source_id: "local".into(),
8841 origin_kind: "local".into(),
8842 origin_host: None,
8843 }
8844 }
8845
8846 #[test]
8856 fn hit_is_noise_returns_false_for_projected_minimal_fields_hit() {
8857 let hit = projected_minimal_fields_search_hit(
8858 "Demo conversation about authentication",
8859 "/tmp/sessions/demo-auth.jsonl",
8860 );
8861 assert_eq!(hit.content, "");
8862 assert_eq!(hit.snippet, "");
8863 assert!(
8864 !hit_is_noise(&hit, "authentication"),
8865 "projected --fields minimal hit must NOT be classified as noise; \
8866 doing so silently drops every real match (bead bd-q6xf9)"
8867 );
8868 }
8869
    /// Runs `hit_is_noise` on a hit whose content is a tool-call payload.
    ///
    /// NOTE(review): the classification result is computed and then discarded; the only
    /// assertion is the precondition that `content` is non-empty. As written, this test
    /// verifies that the calls do not panic, not that the hit is suppressed as the name
    /// suggests — confirm whether an assertion on the result is intended.
    #[test]
    fn hit_is_noise_still_suppresses_real_tool_invocation_noise_when_content_present() {
        let mut hit =
            projected_minimal_fields_search_hit("Tool ping", "/tmp/sessions/tool-ping.jsonl");
        // Populate content with a tool-call style payload.
        hit.content =
            "[tool_call]: {\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}".into();
        let classified_as_noise_on_real_content =
            hit_is_noise(&hit, "ls") || hit_is_noise(&hit, "bash");
        // Result intentionally unused (see note above).
        let _ = classified_as_noise_on_real_content;
        assert!(!hit.content.is_empty(), "precondition: content populated");
    }
8895
8896 #[test]
8903 fn hit_is_noise_uses_snippet_when_content_empty_but_snippet_populated() {
8904 let mut hit = projected_minimal_fields_search_hit(
8905 "Real authentication hit",
8906 "/tmp/sessions/real-auth.jsonl",
8907 );
8908 hit.content = String::new();
8909 hit.snippet = "The user asked about authentication flow options.".into();
8910 assert!(
8913 !hit_is_noise(&hit, "authentication"),
8914 "snippet-only hits with real content must survive the noise filter"
8915 );
8916 }
8917
8918 #[test]
8919 fn search_client_is_send_sync_without_phantom_filters() {
8920 fn assert_send_sync<T: Send + Sync>() {}
8921 assert_send_sync::<SearchClient>();
8922 }
8923
    /// Regression test: the `semantic` lock must not be held while the query is being
    /// embedded.
    ///
    /// A blocking embedder is installed, a semantic search is started on a worker
    /// thread, and while that embedder is still blocked a second thread calls
    /// `clear_semantic_context`. The clear must finish within the timeout, and the
    /// search must report an error once it resumes.
    #[test]
    fn semantic_embedding_releases_semantic_lock_while_embedding() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Swap the fixture's embedder for one that blocks inside `embed_sync`, and give
        // it a fresh query cache.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        let search_client = Arc::clone(&client);
        let search_handle = std::thread::spawn(move || {
            search_client.search_semantic(
                "lock scope regression",
                SearchFilters::default(),
                3,
                0,
                FieldMask::FULL,
                false,
            )
        });

        // Wait until the search thread is inside the blocking embedder.
        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        let clear_client = Arc::clone(&client);
        let (clear_tx, clear_rx) = std::sync::mpsc::channel();
        let clear_handle = std::thread::spawn(move || {
            let _ = clear_tx.send(clear_client.clear_semantic_context());
        });

        // The embedder is still blocked here; the clear must complete anyway.
        clear_rx
            .recv_timeout(Duration::from_millis(500))
            .expect("semantic lock should not stay held during embed")?;

        unblock_tx.send(()).expect("unblock embedder");
        clear_handle.join().expect("clear thread join");
        let search_result = search_handle.join().expect("search thread join");
        assert!(
            search_result.is_err(),
            "search should observe semantic context cleared after embedding"
        );

        Ok(())
    }
8987
    /// Regression test: an embedding computed by a previous semantic context must not be
    /// returned after the context was swapped, even when the new embedder has the same id.
    ///
    /// The first (blocking) embedder would return `[1.0, 0.0]`. While it is blocked the
    /// context token, embedder and query cache are replaced with a fixed embedder
    /// returning `[0.0, 1.0]`. The final embedding must be the new one.
    #[test]
    fn semantic_embedding_ignores_stale_same_id_context_after_swap() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Install the blocking embedder and a fresh query cache.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        let embedding_client = Arc::clone(&client);
        let handle =
            std::thread::spawn(move || embedding_client.semantic_query_embedding("context-swap"));

        // Wait until the worker is inside the blocking embedder.
        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        // Swap the context while the old embedder is still in flight: new token, new
        // embedder (same id, different vector), new cache.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.context_token = Arc::new(());
            state.embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[0.0, 1.0]));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        unblock_tx.send(()).expect("unblock embedder");

        let embedding = handle.join().expect("embedding thread join")?.vector;
        assert_eq!(
            embedding,
            vec![0.0, 1.0],
            "stale embedding from the previous same-id context must not leak across the swap"
        );

        Ok(())
    }
9050
    /// After a fast-only two-tier index has been loaded, a quality-only request must
    /// return `None` rather than the previously loaded fast-only index.
    #[test]
    fn quality_mode_does_not_reuse_fast_only_two_tier_cache() -> Result<()> {
        // Empty lexical index so that `SearchClient::open` succeeds.
        let dir = TempDir::new()?;
        let mut index = TantivyIndex::open_or_create(dir.path())?;
        index.commit()?;

        let client = SearchClient::open(dir.path(), None)?.expect("index present");
        let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
        // Write an empty vector index file for the fast tier only.
        let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
        let writer = VectorIndex::create_with_revision(
            &fast_path,
            embedder.id(),
            "rev-fast-only",
            embedder.dimension(),
            frankensearch::index::Quantization::F16,
        )?;
        writer.finish()?;

        client.set_semantic_context(
            embedder,
            VectorIndex::open(&fast_path)?,
            SemanticFilterMaps::for_tests(
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
                HashSet::new(),
            ),
            None,
            Some(fast_path),
        )?;

        // Load the fast-only index first.
        let fast_only_index = client
            .in_memory_two_tier_index(SemanticTierMode::FastOnly)?
            .expect("fast-only index should load");
        assert!(
            !fast_only_index.has_quality_index(),
            "fixture should only provide the fast tier"
        );

        // A subsequent quality-only request must not be satisfied by it.
        let quality_index = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
        assert!(
            quality_index.is_none(),
            "quality mode must not reuse a cached fast-only two-tier index"
        );

        Ok(())
    }
9098
    /// Mirror of the previous ordering: a quality-only request that yields `None` must
    /// not prevent a later fast-only request from loading the fast tier.
    #[test]
    fn failed_quality_probe_does_not_block_fast_only_two_tier_load() -> Result<()> {
        // Empty lexical index so that `SearchClient::open` succeeds.
        let dir = TempDir::new()?;
        let mut index = TantivyIndex::open_or_create(dir.path())?;
        index.commit()?;

        let client = SearchClient::open(dir.path(), None)?.expect("index present");
        let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
        // Write an empty vector index file for the fast tier only.
        let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
        let writer = VectorIndex::create_with_revision(
            &fast_path,
            embedder.id(),
            "rev-fast-only",
            embedder.dimension(),
            frankensearch::index::Quantization::F16,
        )?;
        writer.finish()?;

        client.set_semantic_context(
            embedder,
            VectorIndex::open(&fast_path)?,
            SemanticFilterMaps::for_tests(
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
                HashSet::new(),
            ),
            None,
            Some(fast_path),
        )?;

        // Probe the quality tier first; the fixture has none.
        assert!(
            client
                .in_memory_two_tier_index(SemanticTierMode::QualityOnly)?
                .is_none(),
            "quality-only lookup should fail for a fast-only fixture"
        );

        // The fast tier must still load afterwards.
        let fast_only_index = client
            .in_memory_two_tier_index(SemanticTierMode::FastOnly)?
            .expect("a failed quality-only probe must not poison fast-only loads");
        assert!(
            !fast_only_index.has_quality_index(),
            "fixture should still resolve to the fast-only tier"
        );

        Ok(())
    }
9147
    /// A failing `progressive_context()` load must fail again, with the same error, on
    /// the next call instead of being remembered as "no context".
    #[test]
    fn progressive_context_error_does_not_poison_future_attempts() -> Result<()> {
        // Empty lexical index so that `SearchClient::open` succeeds.
        let dir = TempDir::new()?;
        let mut index = TantivyIndex::open_or_create(dir.path())?;
        index.commit()?;

        let client = SearchClient::open(dir.path(), None)?.expect("index present");
        let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
        let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
        let writer = VectorIndex::create_with_revision(
            &fast_path,
            embedder.id(),
            "rev-progressive-error",
            embedder.dimension(),
            frankensearch::index::Quantization::F16,
        )?;
        writer.finish()?;
        // Plant files with garbage contents under the tier file names so that loading
        // the progressive context fails.
        std::fs::write(dir.path().join("vector.fast.idx"), b"not-a-valid-index")?;
        std::fs::write(dir.path().join("vector.quality.idx"), b"not-a-valid-index")?;

        client.set_semantic_context(
            embedder,
            VectorIndex::open(&fast_path)?,
            SemanticFilterMaps::for_tests(
                HashMap::new(),
                HashMap::new(),
                HashMap::new(),
                HashSet::new(),
            ),
            None,
            Some(fast_path),
        )?;

        let first_err = client
            .progressive_context()
            .err()
            .expect("invalid progressive index files should fail to load");
        assert!(
            first_err
                .to_string()
                .contains("open fast-tier index failed"),
            "unexpected first progressive-context error: {first_err}"
        );

        // Second attempt must surface the error again.
        let second_err = client
            .progressive_context()
            .err()
            .expect("a failed progressive load must not be memoized as None");
        assert!(
            second_err
                .to_string()
                .contains("open fast-tier index failed"),
            "unexpected second progressive-context error: {second_err}"
        );

        Ok(())
    }
9205
9206 fn build_semantic_test_fixture() -> Result<SemanticTestFixture> {
9207 build_semantic_test_fixture_with_shards(false)
9208 }
9209
9210 fn build_sharded_semantic_test_fixture() -> Result<SemanticTestFixture> {
9211 build_semantic_test_fixture_with_shards(true)
9212 }
9213
    /// Builds a database-backed semantic search fixture with three single-message
    /// conversations and a two-dimensional vector for each message.
    ///
    /// When `sharded` is `false` all vectors go into one index file; when `true` they are
    /// written in groups of two into `shard-<n>.fsvi` files. The returned fixture owns
    /// the temp directory and a `SearchClient` with the semantic context installed.
    ///
    /// # Errors
    /// Propagates any storage, filesystem or vector-index error encountered while
    /// building the fixture.
    fn build_semantic_test_fixture_with_shards(sharded: bool) -> Result<SemanticTestFixture> {
        let dir = TempDir::new()?;
        let db_path = dir.path().join("cass.db");
        let storage = FrankenStorage::open(&db_path)?;

        let agent = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent)?;
        let workspace_path = dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_path)?;
        let workspace_id = storage.ensure_workspace(&workspace_path, None)?;

        // (session file name, message content, vector stored for that message)
        let documents = [
            ("session-a.jsonl", "top semantic match", [1.0_f32, 0.0_f32]),
            (
                "session-b.jsonl",
                "middle semantic match",
                [0.9_f32, 0.1_f32],
            ),
            ("session-c.jsonl", "late semantic match", [0.8_f32, 0.2_f32]),
        ];
        let base_ts = 1_700_000_000_000_i64;
        let mut doc_ids = Vec::with_capacity(documents.len());
        let mut source_paths = Vec::with_capacity(documents.len());

        // Insert one conversation with one user message per document; timestamps are
        // `base_ts + idx`.
        for (idx, (name, content, _vector)) in documents.iter().enumerate() {
            let source_path = dir.path().join(name);
            source_paths.push(source_path.to_string_lossy().to_string());

            let conversation = Conversation {
                id: None,
                agent_slug: agent.slug.clone(),
                workspace: Some(workspace_path.clone()),
                external_id: Some(format!("semantic-{idx}")),
                title: Some(format!("semantic session {idx}")),
                source_path,
                started_at: Some(base_ts + idx as i64),
                ended_at: Some(base_ts + idx as i64),
                approx_tokens: Some(16),
                metadata_json: json!({"fixture": "semantic_search"}),
                messages: vec![Message {
                    id: None,
                    idx: 0,
                    role: MessageRole::User,
                    author: Some("user".into()),
                    created_at: Some(base_ts + idx as i64),
                    content: (*content).to_string(),
                    extra_json: json!({}),
                    snippets: Vec::new(),
                }],
                source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
                origin_host: None,
            };

            storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
        }

        // Read back the stored message ids and timestamps, ordered by conversation id so
        // they line up with `documents`.
        let message_rows: Vec<(u64, i64)> = storage.raw().query_map_collect(
            "SELECT m.id, COALESCE(m.created_at, c.started_at, 0)
             FROM messages m
             JOIN conversations c ON m.conversation_id = c.id
             ORDER BY c.id",
            &[],
            |row: &frankensqlite::Row| {
                let message_id: i64 = row.get_typed(0)?;
                let created_at: i64 = row.get_typed(1)?;
                Ok((u64::try_from(message_id).unwrap_or(u64::MAX), created_at))
            },
        )?;
        assert_eq!(
            message_rows.len(),
            documents.len(),
            "fixture should create 3 messages"
        );

        let filter_maps = SemanticFilterMaps::from_storage(&storage)?;
        let embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[1.0, 0.0]));
        let source_hash = crc32fast::hash(crate::sources::provenance::LOCAL_SOURCE_ID.as_bytes());
        let vector_dir = dir.path().join("vector_index");
        std::fs::create_dir_all(&vector_dir)?;
        let mut vector_records = Vec::with_capacity(documents.len());

        // Pair each stored message with its document vector and derive the doc-id string.
        for ((message_id, created_at_ms), (_, _, vector)) in message_rows.iter().zip(documents) {
            let doc_id = SemanticDocId {
                message_id: *message_id,
                chunk_idx: 0,
                agent_id: u32::try_from(agent_id)?,
                workspace_id: u32::try_from(workspace_id)?,
                source_id: source_hash,
                role: ROLE_USER,
                created_at_ms: *created_at_ms,
                content_hash: None,
            }
            .to_doc_id_string();
            doc_ids.push(doc_id.clone());
            vector_records.push((doc_id, vector));
        }

        let mut vector_indexes = Vec::new();
        if sharded {
            // Two records per shard file.
            for (shard_index, chunk) in vector_records.chunks(2).enumerate() {
                let vector_path = vector_dir.join(format!("shard-{shard_index}.fsvi"));
                let mut writer = VectorIndex::create_with_revision(
                    &vector_path,
                    embedder.id(),
                    "rev-1",
                    embedder.dimension(),
                    frankensearch::index::Quantization::F16,
                )?;
                for (doc_id, vector) in chunk {
                    writer.write_record(doc_id, vector)?;
                }
                writer.finish()?;
                vector_indexes.push(VectorIndex::open(&vector_path)?);
            }
        } else {
            let vector_path = vector_dir.join("index-test-fixed-2d.fsvi");
            let mut writer = VectorIndex::create_with_revision(
                &vector_path,
                embedder.id(),
                "rev-1",
                embedder.dimension(),
                frankensearch::index::Quantization::F16,
            )?;
            for (doc_id, vector) in &vector_records {
                writer.write_record(doc_id, vector)?;
            }
            writer.finish()?;
            vector_indexes.push(VectorIndex::open(&vector_path)?);
        }
        // Release the storage handle before the client opens the same database file.
        drop(storage);

        let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
        client.set_semantic_indexes_context(embedder, vector_indexes, filter_maps, None, None)?;

        Ok(SemanticTestFixture {
            _dir: dir,
            client,
            doc_ids,
            source_paths,
        })
    }
9361
9362 fn build_progressive_hybrid_fixture() -> Result<ProgressiveHybridFixture> {
9363 let dir = TempDir::new()?;
9364 let mut index = TantivyIndex::open_or_create(dir.path())?;
9365 let workspace_path = dir.path().join("workspace");
9366 std::fs::create_dir_all(&workspace_path)?;
9367 let agent_id = 1_i64;
9368 let workspace_id = 1_i64;
9369 let source_id = crate::sources::provenance::LOCAL_SOURCE_ID;
9370 let source_hash = crc32fast::hash(source_id.as_bytes());
9371 let conn = Connection::open(":memory:")?;
9372 conn.execute_batch(
9373 r#"
9374 CREATE TABLE agents (
9375 id INTEGER PRIMARY KEY,
9376 slug TEXT NOT NULL
9377 );
9378 CREATE TABLE workspaces (
9379 id INTEGER PRIMARY KEY,
9380 path TEXT NOT NULL
9381 );
9382 CREATE TABLE sources (
9383 id TEXT PRIMARY KEY,
9384 kind TEXT NOT NULL
9385 );
9386 CREATE TABLE conversations (
9387 id INTEGER PRIMARY KEY,
9388 agent_id INTEGER NOT NULL,
9389 workspace_id INTEGER,
9390 title TEXT,
9391 source_path TEXT NOT NULL,
9392 source_id TEXT NOT NULL,
9393 origin_host TEXT,
9394 started_at INTEGER
9395 );
9396 CREATE TABLE messages (
9397 id INTEGER PRIMARY KEY,
9398 conversation_id INTEGER NOT NULL,
9399 idx INTEGER NOT NULL,
9400 role TEXT NOT NULL,
9401 created_at INTEGER,
9402 content TEXT NOT NULL
9403 );
9404 "#,
9405 )?;
9406 conn.execute_compat(
9407 "INSERT INTO agents (id, slug) VALUES (?1, ?2)",
9408 params![agent_id, "codex"],
9409 )?;
9410 conn.execute_compat(
9411 "INSERT INTO workspaces (id, path) VALUES (?1, ?2)",
9412 params![workspace_id, workspace_path.to_string_lossy().to_string()],
9413 )?;
9414 conn.execute_compat(
9415 "INSERT INTO sources (id, kind) VALUES (?1, ?2)",
9416 params![source_id, "local"],
9417 )?;
9418
9419 let query = "oauth refresh token middleware session cache".to_string();
9420 let filler = " context window ranking provenance semantic upgrade lexical overlay";
9421 let base_ts = 1_700_000_100_000_i64;
9422 let doc_count = 64usize;
9423 let mut message_rows = Vec::with_capacity(doc_count);
9424
9425 for idx in 0..doc_count {
9426 let conversation_id = i64::try_from(idx + 1)?;
9427 let message_id = u64::try_from(idx + 1)?;
9428 let source_path = dir.path().join(format!("progressive-{idx:03}.jsonl"));
9429 let repeated = filler.repeat(48);
9430 let content = if idx % 4 == 0 {
9431 format!(
9432 "{query} hot path candidate {idx} with detailed search diagnostics.{repeated}"
9433 )
9434 } else if idx % 4 == 1 {
9435 format!(
9436 "search pipeline benchmark {idx} with lexical overlay and semantic ranking.{repeated}"
9437 )
9438 } else if idx % 4 == 2 {
9439 format!(
9440 "interactive typing debounce benchmark {idx} for hybrid two tier search.{repeated}"
9441 )
9442 } else {
9443 format!(
9444 "unrelated background chatter {idx} about build systems and formatting checks.{repeated}"
9445 )
9446 };
9447 let created_at = base_ts + idx as i64;
9448 let source_path_str = source_path.to_string_lossy().to_string();
9449 let title = format!("progressive fixture {idx}");
9450
9451 conn.execute_compat(
9452 "INSERT INTO conversations (
9453 id, agent_id, workspace_id, title, source_path, source_id, origin_host, started_at
9454 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, NULL, ?7)",
9455 params![
9456 conversation_id,
9457 agent_id,
9458 workspace_id,
9459 title,
9460 source_path_str.clone(),
9461 source_id,
9462 created_at
9463 ],
9464 )?;
9465 conn.execute_compat(
9466 "INSERT INTO messages (
9467 id, conversation_id, idx, role, created_at, content
9468 ) VALUES (?1, ?2, 0, 'user', ?3, ?4)",
9469 params![
9470 i64::try_from(message_id)?,
9471 conversation_id,
9472 created_at,
9473 content.clone()
9474 ],
9475 )?;
9476 message_rows.push((message_id, created_at, content.clone()));
9477
9478 let normalized = NormalizedConversation {
9479 agent_slug: "codex".into(),
9480 external_id: Some(format!("progressive-{idx}")),
9481 title: Some(format!("progressive fixture {idx}")),
9482 workspace: Some(workspace_path.clone()),
9483 source_path,
9484 started_at: Some(created_at),
9485 ended_at: Some(created_at),
9486 metadata: json!({}),
9487 messages: vec![NormalizedMessage {
9488 idx: 0,
9489 role: "user".into(),
9490 author: Some("user".into()),
9491 created_at: Some(created_at),
9492 content,
9493 extra: json!({}),
9494 snippets: Vec::new(),
9495 invocations: Vec::new(),
9496 }],
9497 };
9498 index.add_conversation(&normalized)?;
9499 }
9500 index.commit()?;
9501
9502 assert_eq!(
9503 message_rows.len(),
9504 doc_count,
9505 "fixture should create the requested number of messages"
9506 );
9507
9508 let fast_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
9509 let quality_embedder = crate::search::hash_embedder::HashEmbedder::new(384);
9510 let filter_maps = SemanticFilterMaps::for_tests(
9511 HashMap::from([("codex".to_string(), u32::try_from(agent_id)?)]),
9512 HashMap::from([(
9513 workspace_path.to_string_lossy().to_string(),
9514 u32::try_from(workspace_id)?,
9515 )]),
9516 HashMap::from([(source_id.to_string(), source_hash)]),
9517 HashSet::new(),
9518 );
9519 let fast_path = dir.path().join("vector.fast.idx");
9520 let quality_path = dir.path().join("vector.quality.idx");
9521
9522 let mut fast_writer = VectorIndex::create_with_revision(
9523 &fast_path,
9524 fast_embedder.id(),
9525 "rev-progressive-fast",
9526 fast_embedder.dimension(),
9527 frankensearch::index::Quantization::F16,
9528 )?;
9529 let mut quality_writer = VectorIndex::create_with_revision(
9530 &quality_path,
9531 quality_embedder.id(),
9532 "rev-progressive-quality",
9533 quality_embedder.dimension(),
9534 frankensearch::index::Quantization::F16,
9535 )?;
9536
9537 for (message_id, created_at_ms, content) in &message_rows {
9538 let canonical = canonicalize_for_embedding(content);
9539 let doc_id = SemanticDocId {
9540 message_id: *message_id,
9541 chunk_idx: 0,
9542 agent_id: u32::try_from(agent_id)?,
9543 workspace_id: u32::try_from(workspace_id)?,
9544 source_id: source_hash,
9545 role: ROLE_USER,
9546 created_at_ms: *created_at_ms,
9547 content_hash: Some(content_hash(&canonical)),
9548 }
9549 .to_doc_id_string();
9550
9551 let fast_vec = fast_embedder.embed_sync(content)?;
9552 fast_writer.write_record(&doc_id, &fast_vec)?;
9553 let quality_vec = quality_embedder.embed_sync(content)?;
9554 quality_writer.write_record(&doc_id, &quality_vec)?;
9555 }
9556 fast_writer.finish()?;
9557 quality_writer.finish()?;
9558
9559 let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
9560 let client = SearchClient {
9561 reader,
9562 sqlite: Mutex::new(Some(SendConnection(conn))),
9563 sqlite_path: None,
9564 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
9565 reload_on_search: true,
9566 last_reload: Mutex::new(None),
9567 last_generation: Mutex::new(None),
9568 reload_epoch: Arc::new(AtomicU64::new(0)),
9569 warm_tx: None,
9570 _warm_handle: None,
9571 metrics: Metrics::default(),
9572 cache_namespace: format!("v{}|schema:{}", CACHE_KEY_VERSION, FS_CASS_SCHEMA_HASH),
9573 semantic: Mutex::new(None),
9574 last_tantivy_total_count: Mutex::new(None),
9575 };
9576 let semantic_embedder: Arc<dyn Embedder> = fast_embedder;
9577 client.set_semantic_context(
9578 semantic_embedder,
9579 VectorIndex::open(&fast_path)?,
9580 filter_maps,
9581 None,
9582 Some(fast_path),
9583 )?;
9584
9585 Ok(ProgressiveHybridFixture {
9586 _dir: dir,
9587 client: Arc::new(client),
9588 query,
9589 })
9590 }
9591
9592 fn sanitize_query(raw: &str) -> String {
9593 nfc_sanitize_query(raw)
9594 }
9595
9596 fn parse_boolean_query(query: &str) -> Vec<FsCassQueryToken> {
9597 fs_cass_parse_boolean_query(query)
9598 }
9599
9600 fn sqlite_master_name_count(db_path: &Path, name: &str) -> Result<i64> {
9601 let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref())?;
9602 Ok(conn.query_row_map(
9603 "SELECT COUNT(*) FROM sqlite_master WHERE name = ?1",
9604 &[ParamValue::from(name)],
9605 |row| row.get_typed(0),
9606 )?)
9607 }
9608
9609 type QueryToken = FsCassQueryToken;
9610 type WildcardPattern = FsCassWildcardPattern;
9611 type QueryTokenList = Vec<QueryToken>;
9612
/// Ignored-by-default harness that runs the progressive hybrid search once
/// with a no-op callback, then 24 more times while counting phase events
/// and hits. Any reported refinement failure aborts the harness.
#[test]
#[ignore = "profiling harness for live hybrid progressive search"]
fn progressive_hybrid_profile_harness() -> Result<()> {
    let fixture = build_progressive_hybrid_fixture()?;
    let runtime = asupersync::runtime::RuntimeBuilder::current_thread()
        .build()
        .map_err(|err| anyhow!("build test runtime failed: {err}"))?;
    let iterations = 24usize;

    // First pass: every event is discarded.
    runtime.block_on(async {
        let cx = FsCx::for_request();
        let request = ProgressiveSearchRequest {
            cx: &cx,
            query: &fixture.query,
            filters: SearchFilters::default(),
            limit: 16,
            sparse_threshold: 0,
            field_mask: FieldMask::new(false, true, true, true),
            mode: SearchMode::Hybrid,
        };
        fixture
            .client
            .search_progressive_with_callback(request, |_| {})
            .await
    })?;

    let mut initial_phase_count = 0usize;
    let mut refined_phase_count = 0usize;
    let mut hits_seen = 0usize;

    for _ in 0..iterations {
        let mut refinement_failure = None;
        runtime.block_on(async {
            let cx = FsCx::for_request();
            let request = ProgressiveSearchRequest {
                cx: &cx,
                query: &fixture.query,
                filters: SearchFilters::default(),
                limit: 16,
                sparse_threshold: 0,
                field_mask: FieldMask::new(false, true, true, true),
                mode: SearchMode::Hybrid,
            };
            fixture
                .client
                .search_progressive_with_callback(request, |event| match event {
                    ProgressiveSearchEvent::RefinementFailed { error, .. } => {
                        // Remember the failure; it is reported after the search returns.
                        refinement_failure = Some(error);
                    }
                    ProgressiveSearchEvent::Phase { kind, result, .. } => {
                        assert!(
                            !result.hits.is_empty(),
                            "progressive harness expects non-empty hits for each phase"
                        );
                        hits_seen += result.hits.len();
                        match kind {
                            ProgressivePhaseKind::Initial => initial_phase_count += 1,
                            ProgressivePhaseKind::Refined => refined_phase_count += 1,
                        }
                    }
                })
                .await
        })?;
        if let Some(error) = refinement_failure {
            bail!("progressive harness refinement failed: {error}");
        }
    }

    assert_eq!(initial_phase_count, iterations);
    assert_eq!(refined_phase_count, iterations);
    assert!(
        hits_seen >= iterations.saturating_mul(16),
        "harness should observe a full page for each phase"
    );

    Ok(())
}
9693
/// Interning the same text twice must hand back the same allocation.
#[test]
fn interner_returns_same_arc_for_same_string() {
    let interner = StringInterner::new(100);

    let first = interner.intern("test_query");
    let second = interner.intern("test_query");

    assert_eq!(&*first, "test_query");
    assert!(Arc::ptr_eq(&first, &second));
}
9709
/// Distinct texts must not share an allocation, and each handle must keep
/// its own content.
#[test]
fn interner_different_strings_return_different_arcs() {
    let interner = StringInterner::new(100);

    let left = interner.intern("query1");
    let right = interner.intern("query2");

    assert_eq!(&*left, "query1");
    assert_eq!(&*right, "query2");
    assert!(!Arc::ptr_eq(&left, &right));
}
9721
/// The empty string is interned like any other text.
#[test]
fn interner_handles_empty_string() {
    let interner = StringInterner::new(100);

    let first_empty = interner.intern("");
    let second_empty = interner.intern("");

    assert_eq!(&*first_empty, "");
    assert!(Arc::ptr_eq(&first_empty, &second_empty));
}
9732
/// Non-ASCII text (CJK and emoji) is interned and returned intact.
#[test]
fn interner_handles_unicode() {
    let interner = StringInterner::new(100);

    let cjk_first = interner.intern("测试查询");
    let cjk_second = interner.intern("测试查询");
    let emoji = interner.intern("emoji 🔍 search");

    assert!(Arc::ptr_eq(&cjk_first, &cjk_second));
    assert_eq!(&*emoji, "emoji 🔍 search");
}
9744
/// With a capacity of three, a fourth distinct entry must not grow the
/// interner, and a previously inserted text can still be interned again.
#[test]
fn interner_respects_lru_eviction() {
    let interner = StringInterner::new(3);

    // Keep the handles alive until the end of the test.
    let _first_three = ["query1", "query2", "query3"].map(|text| interner.intern(text));
    assert_eq!(interner.len(), 3);

    let _fourth = interner.intern("query4");
    assert_eq!(interner.len(), 3);

    let reinterned = interner.intern("query1");
    assert_eq!(&*reinterned, "query1");
}
9764
/// Four threads intern the same 100 queries ten times each; afterwards,
/// repeated interning on the main thread must still return a shared handle.
#[test]
fn interner_concurrent_access() {
    let interner = Arc::new(StringInterner::new(1000));
    let queries: Vec<String> = (0..100).map(|i| format!("query_{}", i)).collect();

    let mut workers = Vec::new();
    for _ in 0..4 {
        let shared = Arc::clone(&interner);
        let batch = queries.clone();
        workers.push(std::thread::spawn(move || {
            for _ in 0..10 {
                batch.iter().for_each(|text| {
                    let _ = shared.intern(text);
                });
            }
        }));
    }

    for worker in workers {
        worker.join().unwrap();
    }

    for text in &queries {
        let first = interner.intern(text);
        let second = interner.intern(text);
        assert!(Arc::ptr_eq(&first, &second));
    }
}
9798
/// A mixed-case two-word query is lowercased and split into two tokens.
#[test]
fn query_terms_lower_basic() {
    let terms = QueryTermsLower::from_query("Hello World");

    let tokens = terms.tokens().collect::<Vec<&str>>();
    assert_eq!(terms.query_lower, "hello world");
    assert_eq!(tokens, ["hello", "world"]);
}
9811
/// An empty query reports itself as empty and yields no tokens.
#[test]
fn query_terms_lower_empty() {
    let terms = QueryTermsLower::from_query("");

    assert_eq!(terms.tokens().count(), 0);
    assert!(terms.is_empty());
}
9819
/// A single upper-case word produces exactly one lowercased token.
#[test]
fn query_terms_lower_single_term() {
    let terms = QueryTermsLower::from_query("TEST");

    let tokens = terms.tokens().collect::<Vec<&str>>();
    assert_eq!(tokens, ["test"]);
}
9827
/// Punctuation (including an apostrophe) acts as a token separator.
#[test]
fn query_terms_lower_with_punctuation() {
    let terms = QueryTermsLower::from_query("hello, world! how's it?");

    let tokens = terms.tokens().collect::<Vec<&str>>();
    assert_eq!(tokens, ["hello", "world", "how", "s", "it"]);
}
9835
/// Accented letters are lowercased and kept inside their tokens.
#[test]
fn query_terms_lower_unicode() {
    let terms = QueryTermsLower::from_query("Héllo Wörld");

    let tokens = terms.tokens().collect::<Vec<&str>>();
    assert_eq!(terms.query_lower, "héllo wörld");
    assert_eq!(tokens, ["héllo", "wörld"]);
}
9844
/// The bloom mask of a non-empty query is non-zero and is the same for two
/// instances built from the same text.
#[test]
fn query_terms_lower_bloom_mask() {
    let first = QueryTermsLower::from_query("test");
    assert_ne!(first.bloom_mask(), 0);

    let second = QueryTermsLower::from_query("test");
    assert_eq!(first.bloom_mask(), second.bloom_mask());
}
9856
/// A cached hit matches precomputed terms found in its content and rejects
/// terms that are absent.
#[test]
fn hit_matches_with_precomputed_terms() {
    let body = "hello world content";
    let cached = cached_hit_from(&SearchHit {
        title: "Test Title".into(),
        snippet: "".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    });

    let present = QueryTermsLower::from_query("hello");
    assert!(hit_matches_query_cached_precomputed(&cached, &present));

    let absent = QueryTermsLower::from_query("missing");
    assert!(!hit_matches_query_cached_precomputed(&cached, &absent));
}
9886
9887 fn make_fused_hit(
9892 id: &str,
9893 rrf: f32,
9894 lexical: Option<usize>,
9895 semantic: Option<usize>,
9896 ) -> FusedHit {
9897 FusedHit {
9898 key: SearchHitKey {
9899 source_id: "local".to_string(),
9900 source_path: id.to_string(),
9901 conversation_id: None,
9902 title: String::new(),
9903 line_number: None,
9904 created_at: None,
9905 content_hash: 0,
9906 },
9907 score: HybridScore {
9908 rrf,
9909 lexical_rank: lexical,
9910 semantic_rank: semantic,
9911 lexical_score: None,
9912 semantic_score: None,
9913 },
9914 hit: SearchHit {
9915 title: id.into(),
9916 snippet: "".into(),
9917 content: "".into(),
9918 content_hash: 0,
9919 score: rrf,
9920 source_path: id.into(),
9921 agent: "test".into(),
9922 workspace: "test".into(),
9923 workspace_original: None,
9924 created_at: None,
9925 line_number: None,
9926 match_type: MatchType::Exact,
9927 source_id: "local".into(),
9928 origin_kind: "local".into(),
9929 origin_host: None,
9930 conversation_id: None,
9931 },
9932 }
9933 }
9934
9935 fn make_federated_merge_hit(id: &str, agent: &str) -> SearchHit {
9936 SearchHit {
9937 title: id.into(),
9938 snippet: String::new(),
9939 content: id.into(),
9940 content_hash: stable_content_hash(id),
9941 score: 0.0,
9942 source_path: format!("{id}.jsonl"),
9943 agent: agent.into(),
9944 workspace: "workspace".into(),
9945 workspace_original: None,
9946 created_at: Some(1_700_000_000_000),
9947 line_number: Some(1),
9948 match_type: MatchType::Exact,
9949 source_id: "local".into(),
9950 origin_kind: "local".into(),
9951 origin_host: None,
9952 conversation_id: None,
9953 }
9954 }
9955
9956 fn make_federated_ranked_hit(
9957 shard_index: usize,
9958 shard_rank: usize,
9959 id: &str,
9960 ) -> FederatedRankedHit {
9961 FederatedRankedHit {
9962 hit: make_federated_merge_hit(id, &format!("shard-{shard_index}")),
9963 shard_index,
9964 shard_rank,
9965 fused_score: federated_rrf_score(shard_rank),
9966 }
9967 }
9968
/// Hits that share the same per-shard rank come out ordered by source path
/// and all carry the rank-0 RRF score.
#[test]
fn federated_merge_orders_equal_rank_hits_by_stable_hit_key() {
    let inputs = vec![
        make_federated_ranked_hit(2, 0, "zeta"),
        make_federated_ranked_hit(0, 0, "bravo"),
        make_federated_ranked_hit(1, 0, "alpha"),
    ];
    let merged = merge_federated_ranked_hits(inputs);

    let paths: Vec<&str> = merged.iter().map(|hit| hit.source_path.as_str()).collect();
    assert_eq!(paths, vec!["alpha.jsonl", "bravo.jsonl", "zeta.jsonl"]);

    let scores_all_equal = merged
        .iter()
        .all(|hit| (hit.score - federated_rrf_score(0)).abs() < f32::EPSILON);
    assert!(
        scores_all_equal,
        "equal per-shard rank should produce equal RRF scores"
    );
}
9989
/// A better per-shard rank wins over an alphabetically earlier source path.
#[test]
fn federated_merge_keeps_rrf_rank_ahead_of_stable_key() {
    let inputs = vec![
        make_federated_ranked_hit(0, 1, "alpha"),
        make_federated_ranked_hit(1, 0, "zeta"),
    ];
    let merged = merge_federated_ranked_hits(inputs);

    let paths: Vec<&str> = merged.iter().map(|hit| hit.source_path.as_str()).collect();
    assert_eq!(paths, vec!["zeta.jsonl", "alpha.jsonl"]);
    assert!(merged[0].score > merged[1].score);
}
10004
/// Two otherwise identical hits with the same rank are ordered by ascending
/// shard index.
#[test]
fn federated_merge_uses_shard_index_as_duplicate_final_tiebreak() {
    let from_shard_two = FederatedRankedHit {
        hit: make_federated_merge_hit("same", "shard-2"),
        shard_index: 2,
        shard_rank: 0,
        fused_score: federated_rrf_score(0),
    };
    let from_shard_zero = FederatedRankedHit {
        hit: make_federated_merge_hit("same", "shard-0"),
        shard_index: 0,
        shard_rank: 0,
        fused_score: federated_rrf_score(0),
    };

    let merged = merge_federated_ranked_hits(vec![from_shard_two, from_shard_zero]);

    assert_eq!(merged[0].agent, "shard-0");
    assert_eq!(merged[1].agent, "shard-2");
}
10025
/// Selecting the top three of five hits returns them in descending score
/// order.
#[test]
fn top_k_fused_basic() {
    let scored = [
        ("a", 1.0_f32),
        ("b", 3.0),
        ("c", 2.0),
        ("d", 5.0),
        ("e", 4.0),
    ];
    // The lexical rank is simply the position in the table above.
    let hits: Vec<FusedHit> = scored
        .iter()
        .enumerate()
        .map(|(rank, &(id, rrf))| make_fused_hit(id, rrf, Some(rank), None))
        .collect();

    let top = top_k_fused(hits, 3);

    let order: Vec<&str> = top
        .iter()
        .map(|hit| hit.key.source_path.as_str())
        .collect();
    assert_eq!(order, ["d", "e", "b"]);
}
10043
/// An empty input yields an empty result.
#[test]
fn top_k_fused_empty() {
    let no_hits = Vec::<FusedHit>::new();
    assert!(top_k_fused(no_hits, 10).is_empty());
}
10050
/// Asking for zero results yields an empty result even when hits exist.
#[test]
fn top_k_fused_k_zero() {
    let first = make_fused_hit("a", 1.0, Some(0), None);
    let second = make_fused_hit("b", 2.0, Some(1), None);

    let top = top_k_fused(vec![first, second], 0);

    assert!(top.is_empty());
}
10060
/// When `k` exceeds the input size, every hit is returned, best first.
#[test]
fn top_k_fused_k_larger_than_n() {
    let first = make_fused_hit("a", 1.0, Some(0), None);
    let second = make_fused_hit("b", 2.0, Some(1), None);

    let top = top_k_fused(vec![first, second], 10);

    let order: Vec<&str> = top
        .iter()
        .map(|hit| hit.key.source_path.as_str())
        .collect();
    assert_eq!(order, ["b", "a"]);
}
10074
/// When `k` equals the input size, the result is the full input sorted by
/// descending score.
#[test]
fn top_k_fused_k_equals_n() {
    let hits = vec![
        make_fused_hit("a", 3.0, Some(0), None),
        make_fused_hit("b", 1.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
    ];

    let top = top_k_fused(hits, 3);

    let order: Vec<&str> = top
        .iter()
        .map(|hit| hit.key.source_path.as_str())
        .collect();
    assert_eq!(order, ["a", "c", "b"]);
}
10090
/// With `k == 1` only the single best-scoring hit is returned.
#[test]
fn top_k_fused_k_one() {
    let hits = vec![
        make_fused_hit("a", 1.0, Some(0), None),
        make_fused_hit("b", 3.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
    ];

    let top = top_k_fused(hits, 1);
    assert_eq!(top.len(), 1);

    let best = &top[0];
    assert_eq!(best.key.source_path, "b");
    assert_eq!(best.score.rrf, 3.0);
}
10105
/// When three hits tie at the top score, the top two returned both carry
/// that score.
#[test]
fn top_k_fused_duplicate_scores() {
    let hits = vec![
        make_fused_hit("a", 2.0, Some(0), None),
        make_fused_hit("b", 2.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
        make_fused_hit("d", 1.0, Some(3), None),
    ];

    let top = top_k_fused(hits, 2);

    assert_eq!(top.len(), 2);
    assert!(top.iter().all(|hit| hit.score.rrf == 2.0));
}
10122
/// Among hits with equal scores, the one that has both a lexical and a
/// semantic rank is returned first.
#[test]
fn top_k_fused_dual_source_tiebreaker() {
    let lexical_only = make_fused_hit("a", 2.0, Some(0), None);
    let both_sources = make_fused_hit("b", 2.0, Some(1), Some(0));
    let semantic_only = make_fused_hit("c", 2.0, None, Some(1));

    let top = top_k_fused(vec![lexical_only, both_sources, semantic_only], 3);

    assert_eq!(top.len(), 3);
    assert_eq!(top[0].key.source_path, "b");
}
10138
/// On 100 hits with scores 0..=99, the top ten are hits 99 down to 90 in
/// that order.
#[test]
fn top_k_fused_large_input_uses_quickselect() {
    let mut hits: Vec<FusedHit> = Vec::with_capacity(100);
    for i in 0..100 {
        hits.push(make_fused_hit(&format!("hit_{}", i), i as f32, Some(i), None));
    }

    let top = top_k_fused(hits, 10);
    assert_eq!(top.len(), 10);

    for (position, hit) in top.iter().enumerate() {
        let expected = 99 - position;
        assert_eq!(hit.key.source_path, format!("hit_{}", expected));
        assert_eq!(hit.score.rrf, expected as f32);
    }
}
10155
/// For several input sizes and values of `k`, `top_k_fused` must agree
/// element-for-element with sorting everything by `cmp_fused_hit_desc` and
/// truncating to `k`.
#[test]
fn top_k_fused_equivalence_with_full_sort() {
    for n in [10, 50, 100, 200] {
        // Combinations where k exceeds n are skipped.
        for k in [1, 5, 10, 25].into_iter().filter(|&k| k <= n) {
            let mut hits: Vec<FusedHit> = Vec::with_capacity(n);
            for i in 0..n {
                let score = ((i * 17 + 7) % 1000) as f32;
                hits.push(make_fused_hit(&format!("hit_{}", i), score, Some(i), None));
            }

            let mut expected = hits.clone();
            expected.sort_by(cmp_fused_hit_desc);
            expected.truncate(k);

            let actual = top_k_fused(hits, k);

            assert_eq!(actual.len(), expected.len(), "n={}, k={}", n, k);

            for (got, want) in actual.iter().zip(expected.iter()) {
                assert_eq!(
                    got.key.source_path, want.key.source_path,
                    "n={}, k={}: mismatch",
                    n, k
                );
                assert_eq!(
                    got.score.rrf, want.score.rrf,
                    "n={}, k={}: score mismatch",
                    n, k
                );
            }
        }
    }
}
10196
/// The comparator orders the higher-scoring hit first (`Less`), the lower
/// one last (`Greater`), and a hit equal to itself.
#[test]
fn cmp_fused_hit_desc_basic_ordering() {
    let lower = make_fused_hit("a", 2.0, Some(0), None);
    let higher = make_fused_hit("b", 3.0, Some(1), None);

    assert_eq!(cmp_fused_hit_desc(&higher, &lower), CmpOrdering::Less);
    assert_eq!(cmp_fused_hit_desc(&lower, &higher), CmpOrdering::Greater);
    assert_eq!(cmp_fused_hit_desc(&lower, &lower), CmpOrdering::Equal);
}
10207
/// A query that is only an interior substring of the content ("row" inside
/// "arrow") must not match a cached hit. The bloom field is set to all ones
/// in this fixture.
#[test]
fn cache_enforces_prefix_matching() {
    let body = "arrow";
    let cached = CachedHit {
        hit: SearchHit {
            title: "test".into(),
            snippet: "".into(),
            content: body.into(),
            content_hash: stable_content_hash(body),
            score: 1.0,
            source_path: "p".into(),
            agent: "a".into(),
            workspace: "w".into(),
            workspace_original: None,
            created_at: None,
            line_number: None,
            match_type: MatchType::Exact,
            source_id: "local".into(),
            origin_kind: "local".into(),
            origin_host: None,
            conversation_id: None,
        },
        lc_content: "arrow".into(),
        lc_title: Some("test".into()),
        bloom64: u64::MAX,
    };

    let matched = hit_matches_query_cached(&cached, "row");

    assert!(
        !matched,
        "Query 'row' should NOT match content 'arrow' (prefix match required)"
    );
}
10250
/// Two conversations with identical message content but different source
/// paths and timestamps must show up on separate one-item pages.
#[test]
fn search_deduplication_across_pages_repro() {
    let dir = TempDir::new().unwrap();
    let index_path = dir.path();
    let mut index = TantivyIndex::open_or_create(index_path).unwrap();

    // The two conversations differ only in message timestamp and source path.
    let conversation = |created_at: i64, source_path: &str| NormalizedConversation {
        agent_slug: "agent1".into(),
        external_id: None,
        title: None,
        workspace: None,
        source_path: source_path.into(),
        started_at: None,
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(created_at),
            content: "duplicate content".into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        }],
    };
    let first_conversation = conversation(1000, "path/1");
    let second_conversation = conversation(2000, "path/2");

    index.add_conversation(&first_conversation).unwrap();
    index.add_conversation(&second_conversation).unwrap();
    index.commit().unwrap();

    let client = SearchClient::open(index_path, None).unwrap().unwrap();

    let first_page = client
        .search("duplicate", SearchFilters::default(), 1, 0, FieldMask::FULL)
        .unwrap();
    assert_eq!(first_page.len(), 1);

    let second_page = client
        .search("duplicate", SearchFilters::default(), 1, 1, FieldMask::FULL)
        .unwrap();
    assert_eq!(second_page.len(), 1);

    assert_ne!(first_page[0].source_path, second_page[0].source_path);
}
10328
/// Wildcard and boolean queries must leave the cache-miss counter untouched,
/// while a plain query records exactly one miss. The client has no reader
/// and no database, and search results themselves are ignored.
#[test]
fn cache_skips_complex_queries() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Runs one search, discards its outcome, and reports the miss counter.
    let misses_after = |query: &str| {
        let _ = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL);
        client.cache_stats().cache_miss
    };

    assert_eq!(
        misses_after("foo*"),
        0,
        "Wildcard query should not trigger cache miss"
    );
    assert_eq!(
        misses_after("foo OR bar"),
        0,
        "Boolean query should not trigger cache miss"
    );
    assert_eq!(
        misses_after("simple"),
        1,
        "Simple query should trigger cache miss"
    );
}
10378
/// Hits cached under a multi-byte prefix ("こん") must be found when looking
/// up a longer query that starts with it ("こんにちは").
#[test]
fn cache_prefix_lookup_handles_utf8_boundaries() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let body = "こんにちは 世界";
    let japanese_hit = SearchHit {
        title: "こんにちは".into(),
        snippet: String::new(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let hits = vec![japanese_hit];
    let filters = SearchFilters::default();

    client.put_cache("こん", &filters, &hits);

    let cached = client.cached_prefix_hits("こんにちは", &filters).unwrap();
    assert_eq!(cached.len(), 1);
    assert_eq!(cached[0].hit.title, "こんにちは");
}
10425
/// A cached hit matches a term it contains and rejects one it lacks; the
/// test also checks that each metrics counter is incremented exactly once.
#[test]
fn bloom_gate_rejects_missing_terms() {
    let text = "hello world";
    let cached = cached_hit_from(&SearchHit {
        title: text.into(),
        snippet: text.into(),
        content: text.into(),
        content_hash: stable_content_hash(text),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    });
    assert!(hit_matches_query_cached(&cached, "hello"));
    assert!(!hit_matches_query_cached(&cached, "missing"));

    let metrics = Metrics::default();
    metrics.inc_cache_hits();
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.inc_reload();

    // The fifth snapshot field is not checked here.
    let (hit_count, miss_count, shortfall_count, reload_count, _) = metrics.snapshot_all();
    assert_eq!(
        (hit_count, miss_count, shortfall_count, reload_count),
        (1, 1, 1, 1)
    );
}
10458
/// A progressive lexical hit built with the first `FieldMask` flag off must
/// drop the full content while keeping the other compared fields; with the
/// flag on, the content is retained.
#[test]
fn progressive_lexical_hit_omits_unused_content() {
    let body = "hello world from a much larger conversation body";
    let hit = SearchHit {
        title: "hello world".into(),
        snippet: "hello **world**".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: Some(3),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    let without_content_mask = FieldMask::new(false, true, true, true);
    let snippet_only = ProgressiveLexicalHit::from_search_hit(&hit, without_content_mask);
    assert_eq!(snippet_only.title, hit.title);
    assert_eq!(snippet_only.snippet, hit.snippet);
    assert!(
        snippet_only.content.is_empty(),
        "snippet-only progressive cache should not retain full content"
    );
    assert_eq!(snippet_only.match_type, hit.match_type);
    assert_eq!(snippet_only.line_number, hit.line_number);
    assert_eq!(snippet_only.source_path, hit.source_path);
    assert_eq!(snippet_only.agent, hit.agent);
    assert_eq!(snippet_only.workspace, hit.workspace);

    let with_content_mask = FieldMask::new(true, true, true, true);
    let full = ProgressiveLexicalHit::from_search_hit(&hit, with_content_mask);
    assert_eq!(full.content, hit.content);
}
10498
/// `progressive_phase_to_result` should serve a hit straight from the
/// supplied lexical cache.
///
/// The client is built with no Tantivy reader and no SQLite handle, so the
/// only place the hit's title/snippet/path can come from is the
/// `ProgressiveLexicalCache` entry stored under message id 42.
#[test]
fn progressive_phase_reuses_lexical_cache_without_db_hydration() -> Result<()> {
    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let mask = FieldMask::new(false, true, true, true);
    let cached_source = SearchHit {
        title: "lexical title".into(),
        snippet: "lexical snippet".into(),
        content: "full lexical body".into(),
        content_hash: stable_content_hash("full lexical body"),
        score: 0.0,
        source_path: "/tmp/session.jsonl".into(),
        agent: "codex".into(),
        workspace: "/tmp".into(),
        workspace_original: Some("/original".into()),
        created_at: Some(1_700_000_000_000),
        line_number: Some(7),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // Seed the cache under the same message id that the doc_id below carries.
    let mut cache = ProgressiveLexicalCache::default();
    let cached_entry = ProgressiveLexicalHit::from_search_hit(&cached_source, mask);
    cache.hits_by_message.insert(42, cached_entry);

    let hash_hex = "00".repeat(32);
    let scored = vec![FsScoredResult {
        doc_id: format!("m|42|0|1|1|1|1|1700000000000|{hash_hex}"),
        score: 0.91,
        source: FsScoreSource::Lexical,
        index: None,
        fast_score: None,
        quality_score: None,
        lexical_score: Some(0.91),
        rerank_score: None,
        explanation: None,
        metadata: None,
    }];

    let no_filters = SearchFilters::default();
    let phase_ctx = ProgressivePhaseContext {
        query: "merged title",
        filters: &no_filters,
        field_mask: mask,
        lexical_cache: Some(&cache),
        limit: 1,
        fetch_limit: 1,
    };
    let outcome = search_client.progressive_phase_to_result(&scored, phase_ctx)?;

    assert_eq!(outcome.hits.len(), 1);
    let top = &outcome.hits[0];
    assert_eq!(top.title, cached_source.title);
    assert_eq!(top.snippet, cached_source.snippet);
    assert!(
        top.content.is_empty(),
        "masked lexical cache should still avoid carrying full content"
    );
    assert_eq!(top.source_path, cached_source.source_path);
    // The score comes from the scored result, not from the cached hit (0.0).
    assert_eq!(top.score, 0.91);

    Ok(())
}
10580
/// Indexes one conversation into a temporary Tantivy index and checks that
/// an agent-filtered search for `hello` (limit 10, offset 0) returns exactly
/// that message, attributed to `codex`, with `hello` in the snippet.
#[test]
fn search_returns_results_with_filters_and_pagination() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    let empty_snippet = NormalizedSnippet {
        file_path: None,
        start_line: None,
        end_line: None,
        language: None,
        snippet_text: None,
    };
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "hello rust world".into(),
        extra: serde_json::json!({}),
        snippets: vec![empty_snippet],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("hello world convo".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: tmp.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    tantivy.add_conversation(&conversation)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");
    let mut agent_filter = SearchFilters::default();
    agent_filter.agents.insert("codex".into());

    let found = searcher.search("hello", agent_filter, 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let top = &found[0];
    assert_eq!(top.agent, "codex");
    assert!(top.snippet.contains("hello"));
    Ok(())
}
10624
/// Two conversations match `needle`; only the one in `/ws/b` with timestamp
/// 20 satisfies both the workspace filter and the `created_from = 15`,
/// `created_to = 25` bounds, so exactly that hit must be returned.
#[test]
fn search_honors_created_range_and_workspace() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    // Both fixtures differ only in title, workspace, file name, timestamp and
    // message text, so they are built through one local closure.
    let fixture = |title: &str, workspace: &str, file: &str, stamp, body: &str| {
        NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: tmp.path().join(file),
            started_at: Some(stamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(stamp),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![NormalizedSnippet {
                    file_path: None,
                    start_line: None,
                    end_line: None,
                    language: None,
                    snippet_text: None,
                }],
                invocations: Vec::new(),
            }],
        }
    };

    let outside_range = fixture("needle one", "/ws/a", "a.jsonl", 10, "alpha needle");
    let inside_range = fixture(
        "needle two",
        "/ws/b",
        "b.jsonl",
        20,
        "\nneedle second line",
    );
    tantivy.add_conversation(&outside_range)?;
    tantivy.add_conversation(&inside_range)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");
    let mut narrowed = SearchFilters::default();
    narrowed.workspaces.insert("/ws/b".into());
    narrowed.created_from = Some(15);
    narrowed.created_to = Some(25);

    let found = searcher.search("needle", narrowed, 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let top = &found[0];
    assert_eq!(top.workspace, "/ws/b");
    assert!(top.snippet.contains("second line"));
    Ok(())
}
10698
/// Indexes three conversations that all match `pagination`, each with
/// distinct message text, then requests a page of size 1 at offset 1 and
/// expects exactly one hit back.
#[test]
fn pagination_skips_results() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    for doc_no in 0..3 {
        let stamp = 100 + doc_no;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(stamp),
            content: format!("pagination needle document number {doc_no}"),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{doc_no}")),
            workspace: Some(std::path::PathBuf::from("/ws/p")),
            source_path: tmp.path().join(format!("{doc_no}.jsonl")),
            started_at: Some(stamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        tantivy.add_conversation(&conversation)?;
    }
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");
    let page_size = 1;
    let page_offset = 1;
    let second_page = searcher.search(
        "pagination",
        SearchFilters::default(),
        page_size,
        page_offset,
        FieldMask::FULL,
    )?;
    assert_eq!(second_page.len(), 1);
    Ok(())
}
10746
/// A message mentioning both `CMA-ES` and `CMA ES` must be found by the
/// hyphenated query `cma-es`, and the returned snippet must mention `cma`
/// (compared case-insensitively).
#[test]
fn search_matches_hyphenated_term() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    let empty_snippet = NormalizedSnippet {
        file_path: None,
        start_line: None,
        end_line: None,
        language: None,
        snippet_text: None,
    };
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "Need CMA-ES strategy and CMA ES variants".into(),
        extra: serde_json::json!({}),
        snippets: vec![empty_snippet],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("cma-es notes".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: tmp.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    tantivy.add_conversation(&conversation)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");
    let found = searcher.search("cma-es", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let lowered_snippet = found[0].snippet.to_lowercase();
    assert!(lowered_snippet.contains("cma"));
    Ok(())
}
10786
/// The partial terms `cal` and `entr` must each find the single message
/// `please calculate the entropy`; for `cal` the hit's full content is
/// additionally checked to contain `calculate`.
#[test]
fn search_matches_prefix_edge_ngram() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1000),
        content: "please calculate the entropy".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("math logic".into()),
        workspace: Some(std::path::PathBuf::from("/ws/m")),
        source_path: tmp.path().join("math.jsonl"),
        started_at: Some(1000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    tantivy.add_conversation(&conversation)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    // Prefix of "calculate".
    let by_cal = searcher.search("cal", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_cal.len(), 1);
    assert!(by_cal[0].content.contains("calculate"));

    // Prefix of "entropy".
    let by_entr = searcher.search("entr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_entr.len(), 1);

    Ok(())
}
10827
/// A message containing the identifier `my_variable_name` must be found both
/// by the inner fragment `vari` and by the underscore-joined prefix
/// `my_variable`.
#[test]
fn search_matches_snake_case() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "check the my_variable_name please".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("code".into()),
        workspace: None,
        source_path: tmp.path().join("c.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    tantivy.add_conversation(&conversation)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    // Fragment that starts one of the underscore-separated words.
    let by_fragment = searcher.search("vari", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_fragment.len(), 1);

    // Query that itself contains an underscore.
    let by_identifier = searcher.search(
        "my_variable",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(by_identifier.len(), 1);

    Ok(())
}
10873
/// Queries carrying punctuation (`c++`, `foo.bar`) must still locate the
/// message `working with c++ and foo.bar today`.
#[test]
fn search_matches_symbols_stripped() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "working with c++ and foo.bar today".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("symbols".into()),
        workspace: None,
        source_path: tmp.path().join("s.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    tantivy.add_conversation(&conversation)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    let by_plus = searcher.search("c++", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_plus.len(), 1);

    let by_dotted = searcher.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_dotted.len(), 1);

    Ok(())
}
10913
/// Each wildcard shape must be reported with the matching `MatchType` on the
/// top hit:
///
/// | query      | expected `MatchType` |
/// |------------|----------------------|
/// | `handler`  | `Exact`              |
/// | `hand*`    | `Prefix`             |
/// | `*handler` | `Suffix`             |
/// | `*andle*`  | `Substring`          |
///
/// The top hit is fetched with `first()` plus an error context instead of
/// `[0]`, so a query that unexpectedly returns nothing fails with a message
/// naming that query rather than an index-out-of-bounds panic.
#[test]
fn search_sets_match_type_for_wildcards() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: dir.path().join("h.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1),
            content: "the request handler delegates".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let cases = [
        ("handler", MatchType::Exact),
        ("hand*", MatchType::Prefix),
        ("*handler", MatchType::Suffix),
        ("*andle*", MatchType::Substring),
    ];
    for (query, expected) in cases {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        let top = hits
            .first()
            .with_context(|| format!("query {query:?} returned no hits"))?;
        assert_eq!(
            top.match_type, expected,
            "unexpected match type for query {query:?}"
        );
    }

    Ok(())
}
10959
/// `andle` is not a whole word of the indexed message, so
/// `search_with_fallback` is expected to report `wildcard_fallback` and to
/// tag the single returned hit as `MatchType::ImplicitWildcard`.
#[test]
fn search_with_fallback_marks_implicit_wildcard() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "the request handler delegates".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: tmp.path().join("h2.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    tantivy.add_conversation(&conversation)?;
    tantivy.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    let outcome = searcher.search_with_fallback(
        "andle",
        SearchFilters::default(),
        10,
        0,
        2,
        FieldMask::FULL,
    )?;
    assert!(outcome.wildcard_fallback);
    assert_eq!(outcome.hits.len(), 1);
    let top = &outcome.hits[0];
    assert_eq!(top.match_type, MatchType::ImplicitWildcard);

    Ok(())
}
11005
/// With no Tantivy reader and only an empty in-memory SQLite connection, a
/// wildcard query (`*handler`) must return an empty result instead of an
/// error.
#[test]
fn sqlite_backend_skips_wildcard_queries() -> Result<()> {
    let empty_db = Connection::open(":memory:")?;
    let sqlite_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(empty_db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let found =
        sqlite_only_client.search("*handler", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert!(
        found.is_empty(),
        "wildcard should skip sqlite fallback, not error"
    );

    Ok(())
}
11035
/// SQLite-only search over a conversation whose `workspace_id` is NULL.
///
/// The fixture creates the relational tables plus an `fts_messages` FTS5
/// table, seeds one conversation without a workspace, and expects the
/// `auth` query to return one hit with an empty workspace string, line
/// number 1 (the stored message `idx` is 0), and `local` as both source id
/// and origin kind.
#[test]
fn sqlite_backend_handles_null_workspace() -> Result<()> {
    let db = Connection::open(":memory:")?;

    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
         );";
    db.execute_batch(schema_sql)?;

    // Relational rows, inserted in the same order as before.
    let seed_statements = [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 't', '/tmp/session.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)",
    ];
    for statement in seed_statements {
        db.execute(statement)?;
    }

    // FTS row mirrors the message; its workspace column is NULL as well.
    let fts_insert_sql = "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)";
    db.execute_compat(
        fts_insert_sql,
        params![
            1_i64,
            "auth token failure",
            "t",
            "codex",
            "/tmp/session.jsonl",
            42_i64
        ],
    )?;

    let sqlite_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let found = sqlite_only_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let top = &found[0];
    assert_eq!(top.workspace, "");
    assert_eq!(top.line_number, Some(1));
    assert_eq!(top.source_id, "local");
    assert_eq!(top.origin_kind, "local");
    Ok(())
}
11114
/// SQLite-only search against an `fts_messages` table that carries a
/// `message_id UNINDEXED` column.
///
/// The FTS row uses rowid 1 but points at message id 42 (`idx` 4). The hit
/// is expected to expose the conversation title, source path and workspace,
/// the message content, and line number 5.
#[test]
fn sqlite_backend_supports_legacy_fts_message_id_schema() -> Result<()> {
    let db = Connection::open(":memory:")?;

    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            message_id UNINDEXED,
            tokenize='porter'
         );";
    db.execute_batch(schema_sql)?;

    // Relational rows, inserted in the same order as before.
    let seed_statements = [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/legacy')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'legacy title', '/tmp/legacy.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
             VALUES(42, 1, 4, 'legacy auth token failure', 99)",
    ];
    for statement in seed_statements {
        db.execute(statement)?;
    }

    // The FTS rowid (1) deliberately differs from the message id (42).
    let fts_insert_sql = "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at, message_id)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)";
    db.execute_compat(
        fts_insert_sql,
        params![
            1_i64,
            "legacy auth token failure",
            "legacy title",
            "codex",
            "/legacy",
            "/tmp/legacy.jsonl",
            99_i64,
            42_i64
        ],
    )?;

    let sqlite_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let found = sqlite_only_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let top = &found[0];
    assert_eq!(top.title, "legacy title");
    assert_eq!(top.source_path, "/tmp/legacy.jsonl");
    assert_eq!(top.workspace, "/legacy");
    assert_eq!(top.line_number, Some(5));
    assert_eq!(top.content, "legacy auth token failure");
    Ok(())
}
11201
/// When a Tantivy reader is present, an empty lexical result must stay
/// empty even though SQLite could answer the query.
///
/// The Tantivy index is committed empty, while the in-memory SQLite database
/// holds one message containing `sqliteonlytoken`. Calling
/// `search_sqlite_fts5` directly proves SQLite can find it; the regular
/// `search` entry point must nevertheless return nothing.
#[test]
fn tantivy_reader_skips_sqlite_fallback_on_empty_lexical_results() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut tantivy = TantivyIndex::open_or_create(tmp.path())?;
    tantivy.commit()?;
    let empty_reader = fs_cass_open_search_reader(tmp.path(), ReloadPolicy::Manual).ok();
    assert!(
        empty_reader.is_some(),
        "test fixture should open a Tantivy reader even with an empty index"
    );

    let db = Connection::open(":memory:")?;
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
         );";
    db.execute_batch(schema_sql)?;

    // Relational rows, inserted in the same order as before.
    let seed_statements = [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/sqlite-only')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'sqlite fallback only', '/tmp/sqlite-only.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
             VALUES(1, 1, 0, 'sqliteonlytoken overflow candidate', 42)",
    ];
    for statement in seed_statements {
        db.execute(statement)?;
    }

    let fts_insert_sql = "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)";
    db.execute_compat(
        fts_insert_sql,
        params![
            1_i64,
            "sqliteonlytoken overflow candidate",
            "sqlite fallback only",
            "codex",
            "/sqlite-only",
            "/tmp/sqlite-only.jsonl",
            42_i64
        ],
    )?;

    let dual_backend_client = SearchClient {
        reader: empty_reader,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Sanity check: asked directly, SQLite does find the token.
    let direct_sqlite_hits = dual_backend_client.search_sqlite_fts5(
        Path::new(":memory:"),
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(
        direct_sqlite_hits.len(),
        1,
        "fixture should prove sqlite fallback would have produced a hit"
    );

    // The regular search path must not surface that SQLite-only row.
    let lexical_hits = dual_backend_client.search(
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        lexical_hits.is_empty(),
        "a live Tantivy reader should prevent sqlite fallback from populating empty lexical results"
    );
    Ok(())
}
11314
/// Opening the search client's SQLite guard must not write to the database.
///
/// Steps:
/// 1. Create a file-backed database through `FrankenStorage` and insert one
///    conversation.
/// 2. Record `sqlite_master_name_count(.., "fts_messages")`, then delete the
///    `fts_frankensqlite_rebuild_generation` row from `meta`.
/// 3. Open the guard via a client that only knows `sqlite_path`, and check
///    that `PRAGMA cache_size` equals `-SEARCH_SQLITE_HYDRATION_CACHE_KIB`.
/// 4. Reopen the database independently and verify the deleted meta row is
///    still absent and the recorded count is unchanged.
#[test]
fn sqlite_guard_does_not_repair_fts_when_generation_key_stale() -> Result<()> {
    let tmp = TempDir::new()?;
    let db_file = tmp.path().join("stale-gen-fts.db");

    // Scope the storage handle so it is dropped before the file is reopened.
    {
        let storage = FrankenStorage::open(&db_file)?;
        let codex_agent = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&codex_agent)?;

        let seeded_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_050),
            content: "message that should remain queryable".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let seeded_conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("stale-gen-fts".into()),
            title: Some("Stale FTS generation".into()),
            source_path: PathBuf::from("/tmp/stale-gen-fts.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![seeded_message],
            source_id: "local".into(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, None, &seeded_conversation)?;
    }

    let schema_rows_before = sqlite_master_name_count(&db_file, "fts_messages")
        .context("count schema rows before generation key deletion")?;

    // Remove the generation marker through a short-lived raw connection.
    {
        let raw = FrankenConnection::open(db_file.to_string_lossy().into_owned())?;
        raw.execute_compat(
            "DELETE FROM meta WHERE key = ?1",
            &[ParamValue::from("fts_frankensqlite_rebuild_generation")],
        )?;
    }

    let path_only_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: Some(db_file.clone()),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let guard = path_only_client
        .sqlite_guard()
        .context("open sqlite guard for stale generation fixture")?;
    assert!(guard.is_some(), "sqlite guard should open the db");
    let guarded_conn = guard
        .as_ref()
        .expect("sqlite guard should hold a connection");
    let no_params: [ParamValue; 0] = [];
    let cache_size: i64 =
        guarded_conn.query_row_map("PRAGMA cache_size;", &no_params, |row| row.get_typed(0))?;
    assert_eq!(
        cache_size, -SEARCH_SQLITE_HYDRATION_CACHE_KIB,
        "search hydration should not inherit the general storage cache profile"
    );
    // Release the guard before inspecting the file through a fresh connection.
    drop(guard);

    let verify_conn = FrankenConnection::open(db_file.to_string_lossy().into_owned())?;
    let generation_after: Option<String> = verify_conn
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            &[ParamValue::from("fts_frankensqlite_rebuild_generation")],
            |row| row.get_typed(0),
        )
        .optional()?;
    assert!(
        generation_after.is_none(),
        "search sqlite guard must not mutate FTS rebuild metadata"
    );

    let schema_rows_after = sqlite_master_name_count(&db_file, "fts_messages")
        .context("count schema rows after sqlite guard reopen")?;
    assert_eq!(
        schema_rows_after, schema_rows_before,
        "read-only reopen must leave FTS schema state unchanged"
    );

    Ok(())
}
11431
11432 #[test]
11433 fn sqlite_path_rusqlite_fallback_matches_hyphenated_ids_with_workspace_filter() -> Result<()> {
11434 fn fts_match_count(conn: &FrankenConnection, fts_query: &str) -> Result<Option<usize>> {
11435 let match_mode = SearchClient::sqlite_fts_match_mode(conn)?;
11436 let sql = format!(
11437 "SELECT COUNT(*) FROM fts_messages WHERE {}",
11438 SearchClient::sqlite_fts5_match_clause(match_mode)
11439 );
11440 let mut params = Vec::new();
11441 SearchClient::push_sqlite_fts5_match_params(&mut params, fts_query, match_mode);
11442 match franken_query_map_collect_retry(conn, &sql, ¶ms, |row| row.get_typed(0)) {
11443 Ok(rows) => {
11444 let count: i64 = rows.into_iter().next().unwrap_or(0);
11445 Ok(Some(usize::try_from(count.max(0)).unwrap_or(usize::MAX)))
11446 }
11447 Err(err) if err.to_string().contains("no such function: MATCH/2") => Ok(None),
11448 Err(err) => Err(err.into()),
11449 }
11450 }
11451
11452 let temp_dir = TempDir::new()?;
11453 let db_path = temp_dir.path().join("hyphenated-rusqlite-fallback.db");
11454
11455 {
11456 let storage = FrankenStorage::open(&db_path)?;
11457 storage.ensure_search_fallback_fts_consistency()?;
11460 let conn = storage.raw();
11461 conn.execute(
11462 "INSERT INTO agents(id, slug, name, kind, created_at, updated_at)
11463 VALUES(1, 'codex', 'Codex', 'codex', 1, 1)",
11464 )?;
11465 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws/alpha')")?;
11466 conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/ws/beta')")?;
11467 conn.execute(
11468 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11469 VALUES(1, 1, 1, 'local', NULL, 'alpha bead', '/tmp/alpha.jsonl')",
11470 )?;
11471 conn.execute(
11472 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11473 VALUES(2, 1, 2, 'local', NULL, 'beta bead', '/tmp/beta.jsonl')",
11474 )?;
11475 conn.execute(
11476 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11477 VALUES(11, 1, 0, 'user', 'Need follow-up on br-123 root cause', 100)",
11478 )?;
11479 conn.execute(
11480 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11481 VALUES(12, 2, 0, 'user', 'Need follow-up on br-123 user report', 101)",
11482 )?;
11483 conn.execute_compat(
11484 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11485 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11486 &[
11487 ParamValue::from(11_i64),
11488 ParamValue::from("Need follow-up on br-123 root cause"),
11489 ParamValue::from("alpha bead"),
11490 ParamValue::from("codex"),
11491 ParamValue::from("/ws/alpha"),
11492 ParamValue::from("/tmp/alpha.jsonl"),
11493 ParamValue::from(100_i64),
11494 ],
11495 )?;
11496 conn.execute_compat(
11497 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11498 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11499 &[
11500 ParamValue::from(12_i64),
11501 ParamValue::from("Need follow-up on br-123 user report"),
11502 ParamValue::from("beta bead"),
11503 ParamValue::from("codex"),
11504 ParamValue::from("/ws/beta"),
11505 ParamValue::from("/tmp/beta.jsonl"),
11506 ParamValue::from(101_i64),
11507 ],
11508 )?;
11509 let preclose_total_rows: i64 =
11510 conn.query_row_map("SELECT COUNT(*) FROM fts_messages", params![], |row| {
11511 row.get_typed(0)
11512 })?;
11513 assert_eq!(
11514 preclose_total_rows, 2,
11515 "freshly seeded file-backed FTS should retain the inserted rows"
11516 );
11517 let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
11518 if let Some(match_count) = fts_match_count(conn, transpiled.as_str())? {
11519 assert_eq!(
11520 match_count, 2,
11521 "freshly seeded file-backed FTS should match the transpiled hyphenated query before reopen"
11522 );
11523 }
11524 }
11525
11526 let client = SearchClient {
11527 reader: None,
11528 sqlite: Mutex::new(None),
11529 sqlite_path: Some(db_path),
11530 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11531 reload_on_search: true,
11532 last_reload: Mutex::new(None),
11533 last_generation: Mutex::new(None),
11534 reload_epoch: Arc::new(AtomicU64::new(0)),
11535 warm_tx: None,
11536 _warm_handle: None,
11537 metrics: Metrics::default(),
11538 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11539 semantic: Mutex::new(None),
11540 last_tantivy_total_count: Mutex::new(None),
11541 };
11542
11543 let guard = client.sqlite_guard()?;
11544 let conn = guard.as_ref().expect("sqlite guard should reopen file db");
11545 let reopened_total_rows: i64 =
11546 conn.query_row_map("SELECT COUNT(*) FROM fts_messages", params![], |row| {
11547 row.get_typed(0)
11548 })?;
11549 assert_eq!(
11550 reopened_total_rows, 2,
11551 "reopened file-backed FTS should still contain the seeded rows"
11552 );
11553 let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
11554 if let Some(match_count) = fts_match_count(conn, transpiled.as_str())? {
11555 assert_eq!(
11556 match_count, 2,
11557 "reopened file-backed FTS should still match the transpiled hyphenated query"
11558 );
11559 }
11560 drop(guard);
11561
11562 let all_hits = client.search("br-123", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
11563 assert_eq!(all_hits.len(), 2);
11564 assert!(
11565 all_hits.iter().all(|hit| hit.content.contains("br-123")),
11566 "hyphenated bead IDs should survive the file-backed sqlite fallback path"
11567 );
11568
11569 let leading_or_hits = client.search(
11570 "OR br-123",
11571 SearchFilters::default(),
11572 10,
11573 0,
11574 FieldMask::FULL,
11575 )?;
11576 assert_eq!(leading_or_hits.len(), 2);
11577
11578 let dotted_hits = client.search(
11579 "br-123.jsonl",
11580 SearchFilters::default(),
11581 10,
11582 0,
11583 FieldMask::FULL,
11584 )?;
11585 assert_eq!(dotted_hits.len(), 2);
11586
11587 let dotted_prefix_hits = client.search(
11588 "br-123.json*",
11589 SearchFilters::default(),
11590 10,
11591 0,
11592 FieldMask::FULL,
11593 )?;
11594 assert_eq!(dotted_prefix_hits.len(), 2);
11595
11596 let prefix_hits =
11597 client.search("br-12*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
11598 assert_eq!(prefix_hits.len(), 2);
11599
11600 let filtered_hits = client.search(
11601 "br-123",
11602 SearchFilters {
11603 workspaces: HashSet::from_iter(["/ws/beta".to_string()]),
11604 ..SearchFilters::default()
11605 },
11606 10,
11607 0,
11608 FieldMask::FULL,
11609 )?;
11610 assert_eq!(filtered_hits.len(), 1);
11611 assert_eq!(filtered_hits[0].workspace, "/ws/beta");
11612 assert_eq!(filtered_hits[0].source_path, "/tmp/beta.jsonl");
11613 assert!(filtered_hits[0].content.contains("br-123"));
11614
11615 Ok(())
11616 }
11617
11618 #[test]
11619 fn sqlite_backend_orders_hits_by_bm25_score() -> Result<()> {
11620 let conn = Connection::open(":memory:")?;
11621 conn.execute_batch(
11622 "CREATE TABLE conversations (
11623 id INTEGER PRIMARY KEY,
11624 agent_id INTEGER,
11625 workspace_id INTEGER,
11626 source_id TEXT,
11627 origin_host TEXT,
11628 title TEXT,
11629 source_path TEXT
11630 );
11631 CREATE TABLE messages (
11632 id INTEGER PRIMARY KEY,
11633 conversation_id INTEGER,
11634 idx INTEGER,
11635 content TEXT,
11636 created_at INTEGER
11637 );
11638 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11639 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
11640 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
11641 CREATE VIRTUAL TABLE fts_messages USING fts5(
11642 content,
11643 title,
11644 agent,
11645 workspace,
11646 source_path,
11647 created_at UNINDEXED,
11648 content='',
11649 tokenize='porter'
11650 );",
11651 )?;
11652 conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
11653 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11654 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws')")?;
11655 conn.execute(
11656 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'best', '/tmp/best.jsonl')",
11657 )?;
11658 conn.execute(
11659 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'worse', '/tmp/worse.jsonl')",
11660 )?;
11661 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(7, 1, 0, 'auth auth auth failure', 42)")?;
11662 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(8, 2, 0, 'auth failure', 43)")?;
11663 conn.execute_compat(
11664 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11665 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11666 params![
11667 7_i64,
11668 "auth auth auth failure",
11669 "best",
11670 "codex",
11671 "/ws",
11672 "/tmp/best.jsonl",
11673 42_i64
11674 ],
11675 )?;
11676 conn.execute_compat(
11677 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11678 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11679 params![
11680 8_i64,
11681 "auth failure",
11682 "worse",
11683 "codex",
11684 "/ws",
11685 "/tmp/worse.jsonl",
11686 43_i64
11687 ],
11688 )?;
11689 let client = SearchClient {
11690 reader: None,
11691 sqlite: Mutex::new(Some(SendConnection(conn))),
11692 sqlite_path: None,
11693 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11694 reload_on_search: true,
11695 last_reload: Mutex::new(None),
11696 last_generation: Mutex::new(None),
11697 reload_epoch: Arc::new(AtomicU64::new(0)),
11698 warm_tx: None,
11699 _warm_handle: None,
11700 metrics: Metrics::default(),
11701 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11702 semantic: Mutex::new(None),
11703 last_tantivy_total_count: Mutex::new(None),
11704 };
11705 let direct_hits = client.search_sqlite_fts5(
11706 Path::new(":memory:"),
11707 "auth",
11708 SearchFilters::default(),
11709 5,
11710 0,
11711 FieldMask::FULL,
11712 )?;
11713 assert_eq!(direct_hits.len(), 2);
11714
11715 let hits = client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
11716 assert_eq!(hits.len(), 2);
11717 assert_eq!(hits[0].title, "best");
11718 assert_eq!(hits[1].title, "worse");
11719 assert!(hits[0].score > hits[1].score);
11720
11721 Ok(())
11722 }
11723
11724 #[test]
11725 fn sqlite_fts5_ranked_phase_defers_content_decode_until_after_limit() {
11726 let (rank_sql, params) = SearchClient::sqlite_fts5_rank_query(
11727 "auth",
11728 &SearchFilters::default(),
11729 50,
11730 0,
11731 false,
11732 SqliteFtsMatchMode::Table,
11733 );
11734 let hydrate_sql = SearchClient::sqlite_fts5_hydrate_query(
11735 2,
11736 FieldMask::new(true, true, true, true),
11737 false,
11738 );
11739
11740 assert!(
11741 !rank_sql.contains("fts_messages.content"),
11742 "rank query must not decode large content rows before LIMIT"
11743 );
11744 assert!(
11745 hydrate_sql.contains("fts_messages.content"),
11746 "hydration query should still provide requested content"
11747 );
11748 assert!(
11749 rank_sql.contains("LIMIT ? OFFSET ?"),
11750 "rank query must apply page bounds before hydration"
11751 );
11752 assert_eq!(params.len(), 3, "fts query plus limit and offset params");
11753 }
11754
11755 #[test]
11756 fn sqlite_fts5_hydration_chunks_stay_below_bind_variable_limit() {
11757 let oversized_row_count = SQLITE_MAX_VARIABLE_NUMBER + 1;
11758 let unchunked_sql = SearchClient::sqlite_fts5_hydrate_query(
11759 oversized_row_count,
11760 FieldMask::new(true, true, true, true),
11761 false,
11762 );
11763 assert!(
11764 unchunked_sql.matches('?').count() > SQLITE_MAX_VARIABLE_NUMBER,
11765 "the pre-fix one-shot hydration query would exceed frankensqlite's bind limit"
11766 );
11767
11768 let ranked_rows: Vec<(i64, f64)> = (0..(SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17))
11769 .map(|idx| (idx as i64, idx as f64))
11770 .collect();
11771 let chunk_sizes: Vec<usize> = SearchClient::sqlite_fts5_hydrate_row_chunks(&ranked_rows)
11772 .map(<[(i64, f64)]>::len)
11773 .collect();
11774
11775 assert_eq!(
11776 chunk_sizes,
11777 vec![SQLITE_FTS5_HYDRATE_PARAM_CHUNK, 17],
11778 "large fallback pages must hydrate in bounded chunks while preserving rank windows"
11779 );
11780 assert!(
11781 chunk_sizes
11782 .iter()
11783 .all(|chunk_size| *chunk_size <= SQLITE_MAX_VARIABLE_NUMBER),
11784 "every hydration chunk must fit under frankensqlite's bind-variable ceiling"
11785 );
11786 }
11787
11788 #[test]
11789 fn tantivy_fallback_hydration_narrows_by_normalized_source_before_message_lookup() -> Result<()>
11790 {
11791 let conn = Connection::open(":memory:")?;
11792 conn.execute_batch(
11793 "CREATE TABLE conversations (
11794 id INTEGER PRIMARY KEY,
11795 source_id TEXT,
11796 origin_host TEXT,
11797 source_path TEXT NOT NULL
11798 );
11799 CREATE TABLE messages (
11800 id INTEGER PRIMARY KEY,
11801 conversation_id INTEGER NOT NULL,
11802 idx INTEGER NOT NULL,
11803 content TEXT NOT NULL,
11804 UNIQUE(conversation_id, idx)
11805 );
11806 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11807 )?;
11808 conn.execute(
11809 "INSERT INTO conversations(id, source_id, origin_host, source_path)
11810 VALUES(1, '', 'devbox', '/tmp/shared-fallback.jsonl')",
11811 )?;
11812 conn.execute(
11813 "INSERT INTO conversations(id, source_id, origin_host, source_path)
11814 VALUES(2, 'local', NULL, '/tmp/shared-fallback.jsonl')",
11815 )?;
11816 conn.execute(
11817 "INSERT INTO messages(id, conversation_id, idx, content)
11818 VALUES(10, 1, 2, 'remote fallback content')",
11819 )?;
11820 conn.execute(
11821 "INSERT INTO messages(id, conversation_id, idx, content)
11822 VALUES(20, 2, 2, 'local content must not win')",
11823 )?;
11824
11825 let client = SearchClient {
11826 reader: None,
11827 sqlite: Mutex::new(Some(SendConnection(conn))),
11828 sqlite_path: None,
11829 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11830 reload_on_search: true,
11831 last_reload: Mutex::new(None),
11832 last_generation: Mutex::new(None),
11833 reload_epoch: Arc::new(AtomicU64::new(0)),
11834 warm_tx: None,
11835 _warm_handle: None,
11836 metrics: Metrics::default(),
11837 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11838 semantic: Mutex::new(None),
11839 last_tantivy_total_count: Mutex::new(None),
11840 };
11841
11842 let fallback_key = (
11843 "devbox".to_string(),
11844 "/tmp/shared-fallback.jsonl".to_string(),
11845 2,
11846 );
11847 let (_, hydrated_fallback) =
11848 client.hydrate_tantivy_hit_contents(&[], std::slice::from_ref(&fallback_key))?;
11849
11850 assert_eq!(
11851 hydrated_fallback.get(&fallback_key).map(String::as_str),
11852 Some("remote fallback content")
11853 );
11854
11855 Ok(())
11856 }
11857
11858 #[test]
11859 fn exact_content_hydration_returns_only_requested_message_indices() -> Result<()> {
11860 let conn = Connection::open(":memory:")?;
11861 conn.execute_batch(
11862 "CREATE TABLE messages (
11863 id INTEGER PRIMARY KEY,
11864 conversation_id INTEGER NOT NULL,
11865 idx INTEGER NOT NULL,
11866 content TEXT NOT NULL,
11867 UNIQUE(conversation_id, idx)
11868 );",
11869 )?;
11870
11871 for idx in 0..8 {
11872 conn.execute(&format!(
11873 "INSERT INTO messages(conversation_id, idx, content)
11874 VALUES(1, {idx}, 'conversation one row {idx}')"
11875 ))?;
11876 }
11877 conn.execute(
11878 "INSERT INTO messages(conversation_id, idx, content)
11879 VALUES(2, 0, 'conversation two row 0')",
11880 )?;
11881
11882 let hydrated =
11883 hydrate_message_content_by_conversation(&conn, &[(1, 6), (1, 2), (2, 0), (1, 99)])?;
11884
11885 assert_eq!(hydrated.len(), 3);
11886 assert_eq!(
11887 hydrated.get(&(1, 2)).map(String::as_str),
11888 Some("conversation one row 2")
11889 );
11890 assert_eq!(
11891 hydrated.get(&(1, 6)).map(String::as_str),
11892 Some("conversation one row 6")
11893 );
11894 assert_eq!(
11895 hydrated.get(&(2, 0)).map(String::as_str),
11896 Some("conversation two row 0")
11897 );
11898 assert!(!hydrated.contains_key(&(1, 99)));
11899
11900 Ok(())
11901 }
11902
11903 #[test]
11904 fn sqlite_backend_generates_snippet_from_content() -> Result<()> {
11905 let conn = Connection::open(":memory:")?;
11906 conn.execute_batch(
11907 "CREATE TABLE conversations (
11908 id INTEGER PRIMARY KEY,
11909 agent_id INTEGER,
11910 workspace_id INTEGER,
11911 source_id TEXT,
11912 origin_host TEXT,
11913 title TEXT,
11914 source_path TEXT
11915 );
11916 CREATE TABLE messages (
11917 id INTEGER PRIMARY KEY,
11918 conversation_id INTEGER,
11919 idx INTEGER,
11920 content TEXT,
11921 created_at INTEGER
11922 );
11923 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11924 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
11925 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
11926 CREATE VIRTUAL TABLE fts_messages USING fts5(
11927 content,
11928 title,
11929 agent,
11930 workspace,
11931 source_path,
11932 created_at UNINDEXED,
11933 content='',
11934 tokenize='porter'
11935 );",
11936 )?;
11937 conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
11938 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11939 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws')")?;
11940 conn.execute(
11941 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'snippet title', '/tmp/snippet.jsonl')",
11942 )?;
11943 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'alpha beta gamma delta epsilon zeta eta theta', 42)")?;
11944 conn.execute_compat(
11945 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11946 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11947 params![
11948 1_i64,
11949 "alpha beta gamma delta epsilon zeta eta theta",
11950 "snippet title",
11951 "codex",
11952 "/ws",
11953 "/tmp/snippet.jsonl",
11954 42_i64
11955 ],
11956 )?;
11957
11958 let client = SearchClient {
11959 reader: None,
11960 sqlite: Mutex::new(Some(SendConnection(conn))),
11961 sqlite_path: None,
11962 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11963 reload_on_search: true,
11964 last_reload: Mutex::new(None),
11965 last_generation: Mutex::new(None),
11966 reload_epoch: Arc::new(AtomicU64::new(0)),
11967 warm_tx: None,
11968 _warm_handle: None,
11969 metrics: Metrics::default(),
11970 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11971 semantic: Mutex::new(None),
11972 last_tantivy_total_count: Mutex::new(None),
11973 };
11974
11975 let hits = client.search("delta", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
11976 assert_eq!(hits.len(), 1);
11977 assert_eq!(hits[0].snippet, snippet_from_content(&hits[0].content));
11979 assert!(hits[0].snippet.contains("delta"));
11980
11981 Ok(())
11982 }
11983
11984 #[test]
11985 fn sqlite_backend_respects_source_filter() -> Result<()> {
11986 let conn = Connection::open(":memory:")?;
11987 conn.execute_batch(
11988 "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11989 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
11990 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
11991 CREATE TABLE conversations (
11992 id INTEGER PRIMARY KEY,
11993 agent_id INTEGER,
11994 workspace_id INTEGER,
11995 source_id TEXT,
11996 origin_host TEXT,
11997 title TEXT,
11998 source_path TEXT
11999 );
12000 CREATE TABLE messages (
12001 id INTEGER PRIMARY KEY,
12002 conversation_id INTEGER,
12003 idx INTEGER,
12004 content TEXT,
12005 created_at INTEGER
12006 );
12007 CREATE VIRTUAL TABLE fts_messages USING fts5(
12008 content,
12009 title,
12010 agent,
12011 workspace,
12012 source_path,
12013 created_at UNINDEXED,
12014 content='',
12015 tokenize='porter'
12016 );",
12017 )?;
12018 conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
12019 conn.execute("INSERT INTO sources(id, kind) VALUES('laptop', 'ssh')")?;
12020 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12021 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/local')")?;
12022 conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/remote')")?;
12023 conn.execute(
12024 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, ' local ', NULL, 'local title', '/tmp/local.jsonl')",
12025 )?;
12026 conn.execute("INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 2, 'laptop', 'dev@laptop', 'remote title', '/tmp/remote.jsonl')")?;
12027 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
12028 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;
12029 conn.execute_compat(
12030 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
12031 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
12032 params![
12033 1_i64,
12034 "auth token failure",
12035 "local title",
12036 "codex",
12037 "/local",
12038 "/tmp/local.jsonl",
12039 42_i64
12040 ],
12041 )?;
12042 conn.execute_compat(
12043 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
12044 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
12045 params![
12046 2_i64,
12047 "auth token failure",
12048 "remote title",
12049 "codex",
12050 "/remote",
12051 "/tmp/remote.jsonl",
12052 43_i64
12053 ],
12054 )?;
12055
12056 let client = SearchClient {
12057 reader: None,
12058 sqlite: Mutex::new(Some(SendConnection(conn))),
12059 sqlite_path: None,
12060 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12061 reload_on_search: true,
12062 last_reload: Mutex::new(None),
12063 last_generation: Mutex::new(None),
12064 reload_epoch: Arc::new(AtomicU64::new(0)),
12065 warm_tx: None,
12066 _warm_handle: None,
12067 metrics: Metrics::default(),
12068 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12069 semantic: Mutex::new(None),
12070 last_tantivy_total_count: Mutex::new(None),
12071 };
12072
12073 let local_hits = client.browse_by_date(
12074 SearchFilters {
12075 source_filter: SourceFilter::Local,
12076 ..SearchFilters::default()
12077 },
12078 5,
12079 0,
12080 true,
12081 FieldMask::FULL,
12082 )?;
12083 assert_eq!(local_hits.len(), 1);
12084 assert_eq!(local_hits[0].source_id, "local");
12085
12086 let remote_hits = client.browse_by_date(
12087 SearchFilters {
12088 source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
12089 ..SearchFilters::default()
12090 },
12091 5,
12092 0,
12093 true,
12094 FieldMask::FULL,
12095 )?;
12096 assert_eq!(remote_hits.len(), 1);
12097 assert_eq!(remote_hits[0].source_id, "local");
12098 assert_eq!(remote_hits[0].origin_kind, "local");
12099
12100 Ok(())
12101 }
12102
12103 #[test]
12104 fn sqlite_backend_remote_source_filter_matches_blank_source_id_with_origin_host() -> Result<()>
12105 {
12106 let conn = Connection::open(":memory:")?;
12107 conn.execute_batch(
12108 "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12109 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
12110 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
12111 CREATE TABLE conversations (
12112 id INTEGER PRIMARY KEY,
12113 agent_id INTEGER,
12114 workspace_id INTEGER,
12115 source_id TEXT,
12116 origin_host TEXT,
12117 title TEXT,
12118 source_path TEXT
12119 );
12120 CREATE TABLE messages (
12121 id INTEGER PRIMARY KEY,
12122 conversation_id INTEGER,
12123 idx INTEGER,
12124 content TEXT,
12125 created_at INTEGER
12126 );
12127 CREATE VIRTUAL TABLE fts_messages USING fts5(
12128 content,
12129 title,
12130 agent,
12131 workspace,
12132 source_path,
12133 created_at UNINDEXED,
12134 content='',
12135 tokenize='porter'
12136 );",
12137 )?;
12138 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12139 conn.execute(
12140 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12141 VALUES(1, 1, NULL, ' ', 'dev@laptop', 'remote title', '/tmp/remote-filter.jsonl')",
12142 )?;
12143 conn.execute(
12144 "INSERT INTO messages(id, conversation_id, idx, content, created_at)
12145 VALUES(1, 1, 0, 'remote filter proof', 42)",
12146 )?;
12147 conn.execute_compat(
12148 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
12149 VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
12150 params![
12151 1_i64,
12152 "remote filter proof",
12153 "remote title",
12154 "codex",
12155 "/tmp/remote-filter.jsonl",
12156 42_i64
12157 ],
12158 )?;
12159
12160 let client = SearchClient {
12161 reader: None,
12162 sqlite: Mutex::new(Some(SendConnection(conn))),
12163 sqlite_path: None,
12164 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12165 reload_on_search: true,
12166 last_reload: Mutex::new(None),
12167 last_generation: Mutex::new(None),
12168 reload_epoch: Arc::new(AtomicU64::new(0)),
12169 warm_tx: None,
12170 _warm_handle: None,
12171 metrics: Metrics::default(),
12172 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12173 semantic: Mutex::new(None),
12174 last_tantivy_total_count: Mutex::new(None),
12175 };
12176
12177 let remote_hits = client.search(
12178 "remote",
12179 SearchFilters {
12180 source_filter: SourceFilter::Remote,
12181 ..Default::default()
12182 },
12183 5,
12184 0,
12185 FieldMask::FULL,
12186 )?;
12187 assert_eq!(remote_hits.len(), 1);
12188 assert_eq!(remote_hits[0].source_id, "dev@laptop");
12189 assert_eq!(remote_hits[0].origin_kind, "remote");
12190 assert_eq!(remote_hits[0].origin_host.as_deref(), Some("dev@laptop"));
12191
12192 let source_hits = client.search(
12193 "remote",
12194 SearchFilters {
12195 source_filter: SourceFilter::SourceId("dev@laptop".into()),
12196 ..Default::default()
12197 },
12198 5,
12199 0,
12200 FieldMask::FULL,
12201 )?;
12202 assert_eq!(source_hits.len(), 1);
12203 assert_eq!(source_hits[0].source_id, "dev@laptop");
12204 assert_eq!(source_hits[0].origin_kind, "remote");
12205
12206 Ok(())
12207 }
12208
12209 #[test]
12210 fn sqlite_backend_workspace_filter_matches_null_workspace_as_empty_string() -> Result<()> {
12211 let conn = Connection::open(":memory:")?;
12212 conn.execute_batch(
12213 "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12214 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
12215 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
12216 CREATE TABLE conversations (
12217 id INTEGER PRIMARY KEY,
12218 agent_id INTEGER,
12219 workspace_id INTEGER,
12220 source_id TEXT,
12221 origin_host TEXT,
12222 title TEXT,
12223 source_path TEXT
12224 );
12225 CREATE TABLE messages (
12226 id INTEGER PRIMARY KEY,
12227 conversation_id INTEGER,
12228 idx INTEGER,
12229 content TEXT,
12230 created_at INTEGER
12231 );
12232 CREATE VIRTUAL TABLE fts_messages USING fts5(
12233 content,
12234 title,
12235 agent,
12236 workspace,
12237 source_path,
12238 created_at UNINDEXED,
12239 content='',
12240 tokenize='porter'
12241 );",
12242 )?;
12243 conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
12244 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12245 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/named')")?;
12246 conn.execute(
12248 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 'null workspace', '/tmp/null-workspace.jsonl')",
12249 )?;
12250 conn.execute(
12252 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'named workspace', '/tmp/named-workspace.jsonl')",
12253 )?;
12254 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
12255 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;
12256 conn.execute_compat(
12257 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
12258 VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
12259 params![
12260 1_i64,
12261 "auth token failure",
12262 "null workspace",
12263 "codex",
12264 "/tmp/null-workspace.jsonl",
12265 42_i64
12266 ],
12267 )?;
12268 conn.execute_compat(
12269 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
12270 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
12271 params![
12272 2_i64,
12273 "auth token failure",
12274 "named workspace",
12275 "codex",
12276 "/named",
12277 "/tmp/named-workspace.jsonl",
12278 43_i64
12279 ],
12280 )?;
12281
12282 let client = SearchClient {
12283 reader: None,
12284 sqlite: Mutex::new(Some(SendConnection(conn))),
12285 sqlite_path: None,
12286 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12287 reload_on_search: true,
12288 last_reload: Mutex::new(None),
12289 last_generation: Mutex::new(None),
12290 reload_epoch: Arc::new(AtomicU64::new(0)),
12291 warm_tx: None,
12292 _warm_handle: None,
12293 metrics: Metrics::default(),
12294 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12295 semantic: Mutex::new(None),
12296 last_tantivy_total_count: Mutex::new(None),
12297 };
12298
12299 let hits = client.search(
12300 "auth",
12301 SearchFilters {
12302 workspaces: HashSet::from_iter([String::new()]),
12303 ..SearchFilters::default()
12304 },
12305 5,
12306 0,
12307 FieldMask::FULL,
12308 )?;
12309 assert_eq!(hits.len(), 1);
12310 assert_eq!(hits[0].workspace, "");
12311 assert_eq!(hits[0].source_path, "/tmp/null-workspace.jsonl");
12312
12313 Ok(())
12314 }
12315
12316 #[test]
12317 fn sqlite_message_scan_preserves_boolean_or_precedence() {
12318 let simple_or =
12319 SearchClient::sqlite_message_scan_query("alpha OR beta").expect("simple OR scan query");
12320 assert!(SearchClient::sqlite_message_scan_score("alpha", &simple_or) > 0.0);
12321 assert!(SearchClient::sqlite_message_scan_score("beta", &simple_or) > 0.0);
12322 assert_eq!(
12323 SearchClient::sqlite_message_scan_score("gamma", &simple_or),
12324 0.0
12325 );
12326
12327 let and_then_or = SearchClient::sqlite_message_scan_query("alpha AND beta OR gamma")
12328 .expect("AND followed by OR scan query");
12329 assert!(
12330 SearchClient::sqlite_message_scan_score("alpha gamma", &and_then_or) > 0.0,
12331 "alpha AND (beta OR gamma) should accept the gamma branch"
12332 );
12333 assert_eq!(
12334 SearchClient::sqlite_message_scan_score("alpha", &and_then_or),
12335 0.0
12336 );
12337 assert_eq!(
12338 SearchClient::sqlite_message_scan_score("beta gamma", &and_then_or),
12339 0.0
12340 );
12341
12342 let or_then_and = SearchClient::sqlite_message_scan_query("alpha OR beta AND gamma")
12343 .expect("OR followed by AND scan query");
12344 assert!(
12345 SearchClient::sqlite_message_scan_score("alpha gamma", &or_then_and) > 0.0,
12346 "(alpha OR beta) AND gamma should accept the alpha branch"
12347 );
12348 assert!(
12349 SearchClient::sqlite_message_scan_score("beta gamma", &or_then_and) > 0.0,
12350 "(alpha OR beta) AND gamma should accept the beta branch"
12351 );
12352 assert_eq!(
12353 SearchClient::sqlite_message_scan_score("alpha", &or_then_and),
12354 0.0
12355 );
12356
12357 let binary_not =
12358 SearchClient::sqlite_message_scan_query("alpha NOT beta").expect("NOT scan query");
12359 assert!(SearchClient::sqlite_message_scan_score("alpha", &binary_not) > 0.0);
12360 assert_eq!(
12361 SearchClient::sqlite_message_scan_score("alpha beta", &binary_not),
12362 0.0
12363 );
12364 }
12365
12366 #[test]
12367 fn browse_by_date_treats_null_workspace_and_source_as_local() -> Result<()> {
12368 let conn = Connection::open(":memory:")?;
12369 conn.execute_batch(
12370 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12371 CREATE TABLE conversations (
12372 id INTEGER PRIMARY KEY,
12373 agent_id INTEGER NOT NULL,
12374 workspace_id INTEGER,
12375 source_id TEXT,
12376 origin_host TEXT,
12377 title TEXT,
12378 source_path TEXT NOT NULL
12379 );
12380 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
12381 CREATE TABLE messages (
12382 id INTEGER PRIMARY KEY,
12383 conversation_id INTEGER NOT NULL,
12384 idx INTEGER,
12385 content TEXT NOT NULL,
12386 created_at INTEGER
12387 );
12388 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
12389 )?;
12390 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12391 conn.execute(
12392 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12393 VALUES(1, 1, NULL, NULL, NULL, 'browse title', '/tmp/browse.jsonl')",
12394 )?;
12395 conn.execute(
12396 "INSERT INTO messages(id, conversation_id, idx, content, created_at)
12397 VALUES(1, 1, 0, 'browse auth token failure', 123)",
12398 )?;
12399
12400 let client = SearchClient {
12401 reader: None,
12402 sqlite: Mutex::new(Some(SendConnection(conn))),
12403 sqlite_path: None,
12404 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12405 reload_on_search: true,
12406 last_reload: Mutex::new(None),
12407 last_generation: Mutex::new(None),
12408 reload_epoch: Arc::new(AtomicU64::new(0)),
12409 warm_tx: None,
12410 _warm_handle: None,
12411 metrics: Metrics::default(),
12412 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12413 semantic: Mutex::new(None),
12414 last_tantivy_total_count: Mutex::new(None),
12415 };
12416
12417 let hits = client.browse_by_date(
12418 SearchFilters {
12419 workspaces: HashSet::from_iter([String::new()]),
12420 source_filter: SourceFilter::Local,
12421 ..SearchFilters::default()
12422 },
12423 5,
12424 0,
12425 true,
12426 FieldMask::FULL,
12427 )?;
12428 assert_eq!(hits.len(), 1);
12429 assert_eq!(hits[0].workspace, "");
12430 assert_eq!(hits[0].source_id, "local");
12431 assert_eq!(hits[0].origin_kind, "local");
12432
12433 Ok(())
12434 }
12435
12436 #[test]
12437 fn hydrate_semantic_hits_with_ids_snippet_only_uses_full_content_for_snippets_and_identity()
12438 -> Result<()> {
12439 let conn = Connection::open(":memory:")?;
12440 conn.execute_batch(
12441 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12442 CREATE TABLE conversations (
12443 id INTEGER PRIMARY KEY,
12444 agent_id INTEGER NOT NULL,
12445 workspace_id INTEGER,
12446 source_id TEXT,
12447 origin_host TEXT,
12448 title TEXT,
12449 source_path TEXT NOT NULL,
12450 started_at INTEGER
12451 );
12452 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
12453 CREATE TABLE messages (
12454 id INTEGER PRIMARY KEY,
12455 conversation_id INTEGER NOT NULL,
12456 idx INTEGER,
12457 role TEXT,
12458 content TEXT NOT NULL,
12459 created_at INTEGER
12460 );
12461 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
12462 )?;
12463 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12464 conn.execute(
12465 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
12466 VALUES(1, 1, NULL, 'local', NULL, 'semantic title', '/tmp/semantic.jsonl', 100)",
12467 )?;
12468 let shared_prefix = "shared-prefix ".repeat(32);
12469 let first = format!("{shared_prefix}first unique semantic tail");
12470 let second = format!("{shared_prefix}second unique semantic tail");
12471 conn.execute_with_params(
12472 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12473 VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
12474 &[
12475 fsqlite_types::value::SqliteValue::Integer(1),
12476 fsqlite_types::value::SqliteValue::Integer(0),
12477 fsqlite_types::value::SqliteValue::Text(first.clone().into()),
12478 fsqlite_types::value::SqliteValue::Integer(101),
12479 ],
12480 )?;
12481 conn.execute_with_params(
12482 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12483 VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
12484 &[
12485 fsqlite_types::value::SqliteValue::Integer(2),
12486 fsqlite_types::value::SqliteValue::Integer(1),
12487 fsqlite_types::value::SqliteValue::Text(second.clone().into()),
12488 fsqlite_types::value::SqliteValue::Integer(102),
12489 ],
12490 )?;
12491
12492 let client = SearchClient {
12493 reader: None,
12494 sqlite: Mutex::new(Some(SendConnection(conn))),
12495 sqlite_path: None,
12496 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12497 reload_on_search: true,
12498 last_reload: Mutex::new(None),
12499 last_generation: Mutex::new(None),
12500 reload_epoch: Arc::new(AtomicU64::new(0)),
12501 warm_tx: None,
12502 _warm_handle: None,
12503 metrics: Metrics::default(),
12504 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12505 semantic: Mutex::new(None),
12506 last_tantivy_total_count: Mutex::new(None),
12507 };
12508
12509 let hits = client.hydrate_semantic_hits_with_ids(
12510 &[
12511 VectorSearchResult {
12512 message_id: 1,
12513 chunk_idx: 0,
12514 score: 0.9,
12515 },
12516 VectorSearchResult {
12517 message_id: 2,
12518 chunk_idx: 0,
12519 score: 0.8,
12520 },
12521 ],
12522 FieldMask::new(false, true, true, true),
12523 )?;
12524 assert_eq!(hits.len(), 2);
12525 assert!(hits.iter().all(|(_, hit)| hit.content.is_empty()));
12526 assert!(hits.iter().all(|(_, hit)| !hit.snippet.is_empty()));
12527 assert_ne!(hits[0].1.content_hash, hits[1].1.content_hash);
12528
12529 Ok(())
12530 }
12531
12532 #[test]
12533 fn hydrate_semantic_hits_with_ids_normalizes_trimmed_local_source_metadata() -> Result<()> {
12534 let conn = Connection::open(":memory:")?;
12535 conn.execute_batch(
12536 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12537 CREATE TABLE conversations (
12538 id INTEGER PRIMARY KEY,
12539 agent_id INTEGER NOT NULL,
12540 workspace_id INTEGER,
12541 source_id TEXT,
12542 origin_host TEXT,
12543 title TEXT,
12544 source_path TEXT NOT NULL,
12545 started_at INTEGER
12546 );
12547 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
12548 CREATE TABLE messages (
12549 id INTEGER PRIMARY KEY,
12550 conversation_id INTEGER NOT NULL,
12551 idx INTEGER,
12552 role TEXT,
12553 content TEXT NOT NULL,
12554 created_at INTEGER
12555 );
12556 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
12557 )?;
12558 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12559 conn.execute(
12560 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
12561 VALUES(1, 1, NULL, ' local ', NULL, 'trimmed local semantic', '/tmp/trimmed-local-semantic.jsonl', 100)",
12562 )?;
12563 conn.execute_with_params(
12564 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12565 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
12566 &[
12567 fsqlite_types::value::SqliteValue::Integer(1),
12568 fsqlite_types::value::SqliteValue::Text("trimmed local semantic body".into()),
12569 ],
12570 )?;
12571
12572 let client = SearchClient {
12573 reader: None,
12574 sqlite: Mutex::new(Some(SendConnection(conn))),
12575 sqlite_path: None,
12576 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12577 reload_on_search: true,
12578 last_reload: Mutex::new(None),
12579 last_generation: Mutex::new(None),
12580 reload_epoch: Arc::new(AtomicU64::new(0)),
12581 warm_tx: None,
12582 _warm_handle: None,
12583 metrics: Metrics::default(),
12584 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12585 semantic: Mutex::new(None),
12586 last_tantivy_total_count: Mutex::new(None),
12587 };
12588
12589 let hits = client.hydrate_semantic_hits_with_ids(
12590 &[VectorSearchResult {
12591 message_id: 1,
12592 chunk_idx: 0,
12593 score: 0.9,
12594 }],
12595 FieldMask::new(false, true, true, true),
12596 )?;
12597 assert_eq!(hits.len(), 1);
12598 assert_eq!(hits[0].1.source_id, "local");
12599 assert_eq!(hits[0].1.origin_kind, "local");
12600
12601 Ok(())
12602 }
12603
12604 #[test]
12605 fn hydrate_semantic_hits_with_ids_preserves_remote_origin_without_source_row() -> Result<()> {
12606 let conn = Connection::open(":memory:")?;
12607 conn.execute_batch(
12608 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12609 CREATE TABLE conversations (
12610 id INTEGER PRIMARY KEY,
12611 agent_id INTEGER NOT NULL,
12612 workspace_id INTEGER,
12613 source_id TEXT,
12614 origin_host TEXT,
12615 title TEXT,
12616 source_path TEXT NOT NULL,
12617 started_at INTEGER
12618 );
12619 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
12620 CREATE TABLE messages (
12621 id INTEGER PRIMARY KEY,
12622 conversation_id INTEGER NOT NULL,
12623 idx INTEGER,
12624 role TEXT,
12625 content TEXT NOT NULL,
12626 created_at INTEGER
12627 );
12628 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
12629 )?;
12630 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12631 conn.execute(
12632 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
12633 VALUES(1, 1, NULL, 'laptop', 'dev@laptop', 'remote semantic', '/tmp/remote-semantic.jsonl', 100)",
12634 )?;
12635 conn.execute_with_params(
12636 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12637 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
12638 &[
12639 fsqlite_types::value::SqliteValue::Integer(1),
12640 fsqlite_types::value::SqliteValue::Text("remote semantic body".into()),
12641 ],
12642 )?;
12643
12644 let client = SearchClient {
12645 reader: None,
12646 sqlite: Mutex::new(Some(SendConnection(conn))),
12647 sqlite_path: None,
12648 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12649 reload_on_search: true,
12650 last_reload: Mutex::new(None),
12651 last_generation: Mutex::new(None),
12652 reload_epoch: Arc::new(AtomicU64::new(0)),
12653 warm_tx: None,
12654 _warm_handle: None,
12655 metrics: Metrics::default(),
12656 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12657 semantic: Mutex::new(None),
12658 last_tantivy_total_count: Mutex::new(None),
12659 };
12660
12661 let hits = client.hydrate_semantic_hits_with_ids(
12662 &[VectorSearchResult {
12663 message_id: 1,
12664 chunk_idx: 0,
12665 score: 0.9,
12666 }],
12667 FieldMask::new(false, true, true, true),
12668 )?;
12669 assert_eq!(hits.len(), 1);
12670 assert_eq!(hits[0].1.source_id, "laptop");
12671 assert_eq!(hits[0].1.origin_kind, "remote");
12672 assert_eq!(hits[0].1.origin_host.as_deref(), Some("dev@laptop"));
12673
12674 Ok(())
12675 }
12676
12677 #[test]
12678 fn resolve_semantic_doc_ids_for_hits_distinguishes_same_source_path_line_by_content_hash()
12679 -> Result<()> {
12680 let conn = Connection::open(":memory:")?;
12681 conn.execute_batch(
12682 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12683 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12684 CREATE TABLE conversations (
12685 id INTEGER PRIMARY KEY,
12686 agent_id INTEGER NOT NULL,
12687 workspace_id INTEGER,
12688 source_id TEXT,
12689 origin_host TEXT,
12690 title TEXT,
12691 source_path TEXT NOT NULL
12692 );
12693 CREATE TABLE messages (
12694 id INTEGER PRIMARY KEY,
12695 conversation_id INTEGER NOT NULL,
12696 idx INTEGER,
12697 role TEXT,
12698 content TEXT NOT NULL,
12699 created_at INTEGER
12700 );",
12701 )?;
12702 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12703 conn.execute(
12704 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12705 VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
12706 )?;
12707 conn.execute(
12708 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12709 VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
12710 )?;
12711 let first = "same prefix first tail".to_string();
12712 let second = "same prefix second tail".to_string();
12713 conn.execute_with_params(
12714 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12715 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
12716 &[
12717 fsqlite_types::value::SqliteValue::Integer(11),
12718 fsqlite_types::value::SqliteValue::Integer(1),
12719 fsqlite_types::value::SqliteValue::Text(first.clone().into()),
12720 ],
12721 )?;
12722 conn.execute_with_params(
12723 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12724 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
12725 &[
12726 fsqlite_types::value::SqliteValue::Integer(22),
12727 fsqlite_types::value::SqliteValue::Integer(2),
12728 fsqlite_types::value::SqliteValue::Text(second.clone().into()),
12729 ],
12730 )?;
12731
12732 let client = SearchClient {
12733 reader: None,
12734 sqlite: Mutex::new(Some(SendConnection(conn))),
12735 sqlite_path: None,
12736 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12737 reload_on_search: true,
12738 last_reload: Mutex::new(None),
12739 last_generation: Mutex::new(None),
12740 reload_epoch: Arc::new(AtomicU64::new(0)),
12741 warm_tx: None,
12742 _warm_handle: None,
12743 metrics: Metrics::default(),
12744 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12745 semantic: Mutex::new(None),
12746 last_tantivy_total_count: Mutex::new(None),
12747 };
12748
12749 let first_hit = SearchHit {
12750 title: "Shared Session".into(),
12751 snippet: String::new(),
12752 content: String::new(),
12753 content_hash: stable_hit_hash(
12754 &first,
12755 "/tmp/progressive-shared.jsonl",
12756 Some(1),
12757 Some(100),
12758 ),
12759 score: 0.0,
12760 source_path: "/tmp/progressive-shared.jsonl".into(),
12761 agent: "codex".into(),
12762 workspace: String::new(),
12763 workspace_original: None,
12764 created_at: Some(100),
12765 line_number: Some(1),
12766 match_type: MatchType::Exact,
12767 source_id: "local".into(),
12768 origin_kind: "local".into(),
12769 origin_host: None,
12770 conversation_id: None,
12771 };
12772 let second_hit = SearchHit {
12773 title: "Shared Session".into(),
12774 snippet: String::new(),
12775 content: String::new(),
12776 content_hash: stable_hit_hash(
12777 &second,
12778 "/tmp/progressive-shared.jsonl",
12779 Some(1),
12780 Some(100),
12781 ),
12782 score: 0.0,
12783 source_path: "/tmp/progressive-shared.jsonl".into(),
12784 agent: "codex".into(),
12785 workspace: String::new(),
12786 workspace_original: None,
12787 created_at: Some(100),
12788 line_number: Some(1),
12789 match_type: MatchType::Exact,
12790 source_id: "local".into(),
12791 origin_kind: "local".into(),
12792 origin_host: None,
12793 conversation_id: None,
12794 };
12795
12796 let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
12797 assert_eq!(resolved.len(), 2);
12798 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
12799 assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
12800 assert_ne!(
12801 resolved[0].as_ref().map(|hit| hit.doc_id.as_str()),
12802 resolved[1].as_ref().map(|hit| hit.doc_id.as_str())
12803 );
12804
12805 Ok(())
12806 }
12807
12808 #[test]
12809 fn hydrate_semantic_hits_with_ids_keeps_missing_title_empty() -> Result<()> {
12810 let conn = Connection::open(":memory:")?;
12811 conn.execute_batch(
12812 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12813 CREATE TABLE conversations (
12814 id INTEGER PRIMARY KEY,
12815 agent_id INTEGER NOT NULL,
12816 workspace_id INTEGER,
12817 source_id TEXT,
12818 origin_host TEXT,
12819 title TEXT,
12820 source_path TEXT NOT NULL,
12821 started_at INTEGER
12822 );
12823 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
12824 CREATE TABLE messages (
12825 id INTEGER PRIMARY KEY,
12826 conversation_id INTEGER NOT NULL,
12827 idx INTEGER,
12828 role TEXT,
12829 content TEXT NOT NULL,
12830 created_at INTEGER
12831 );
12832 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
12833 )?;
12834 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12835 conn.execute(
12836 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
12837 VALUES(1, 1, NULL, 'local', NULL, NULL, '/tmp/untitled-semantic.jsonl', 100)",
12838 )?;
12839 conn.execute_with_params(
12840 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12841 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
12842 &[
12843 fsqlite_types::value::SqliteValue::Integer(1),
12844 fsqlite_types::value::SqliteValue::Text("untitled semantic body".into()),
12845 ],
12846 )?;
12847
12848 let client = SearchClient {
12849 reader: None,
12850 sqlite: Mutex::new(Some(SendConnection(conn))),
12851 sqlite_path: None,
12852 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12853 reload_on_search: true,
12854 last_reload: Mutex::new(None),
12855 last_generation: Mutex::new(None),
12856 reload_epoch: Arc::new(AtomicU64::new(0)),
12857 warm_tx: None,
12858 _warm_handle: None,
12859 metrics: Metrics::default(),
12860 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12861 semantic: Mutex::new(None),
12862 last_tantivy_total_count: Mutex::new(None),
12863 };
12864
12865 let hits = client.hydrate_semantic_hits_with_ids(
12866 &[VectorSearchResult {
12867 message_id: 1,
12868 chunk_idx: 0,
12869 score: 0.9,
12870 }],
12871 FieldMask::new(false, true, true, true),
12872 )?;
12873 assert_eq!(hits.len(), 1);
12874 assert_eq!(hits[0].1.title, "");
12875
12876 Ok(())
12877 }
12878
12879 #[test]
12880 fn resolve_semantic_doc_ids_for_hits_prefers_conversation_id_over_ambiguous_provenance()
12881 -> Result<()> {
12882 let conn = Connection::open(":memory:")?;
12883 conn.execute_batch(
12884 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12885 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12886 CREATE TABLE conversations (
12887 id INTEGER PRIMARY KEY,
12888 agent_id INTEGER NOT NULL,
12889 workspace_id INTEGER,
12890 source_id TEXT,
12891 origin_host TEXT,
12892 title TEXT,
12893 source_path TEXT NOT NULL
12894 );
12895 CREATE TABLE messages (
12896 id INTEGER PRIMARY KEY,
12897 conversation_id INTEGER NOT NULL,
12898 idx INTEGER,
12899 role TEXT,
12900 content TEXT NOT NULL,
12901 created_at INTEGER
12902 );",
12903 )?;
12904 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12905 conn.execute(
12906 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12907 VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
12908 )?;
12909 conn.execute(
12910 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12911 VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
12912 )?;
12913 let content = "same ambiguous content".to_string();
12914 conn.execute_with_params(
12915 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12916 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
12917 &[
12918 fsqlite_types::value::SqliteValue::Integer(11),
12919 fsqlite_types::value::SqliteValue::Integer(1),
12920 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12921 ],
12922 )?;
12923 conn.execute_with_params(
12924 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12925 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
12926 &[
12927 fsqlite_types::value::SqliteValue::Integer(22),
12928 fsqlite_types::value::SqliteValue::Integer(2),
12929 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12930 ],
12931 )?;
12932
12933 let client = SearchClient {
12934 reader: None,
12935 sqlite: Mutex::new(Some(SendConnection(conn))),
12936 sqlite_path: None,
12937 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12938 reload_on_search: true,
12939 last_reload: Mutex::new(None),
12940 last_generation: Mutex::new(None),
12941 reload_epoch: Arc::new(AtomicU64::new(0)),
12942 warm_tx: None,
12943 _warm_handle: None,
12944 metrics: Metrics::default(),
12945 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12946 semantic: Mutex::new(None),
12947 last_tantivy_total_count: Mutex::new(None),
12948 };
12949
12950 let first_hit = SearchHit {
12951 title: "Shared Session".into(),
12952 snippet: String::new(),
12953 content: String::new(),
12954 content_hash: stable_hit_hash(
12955 &content,
12956 "/tmp/progressive-conversation-id.jsonl",
12957 Some(1),
12958 Some(100),
12959 ),
12960 score: 0.0,
12961 source_path: "/tmp/progressive-conversation-id.jsonl".into(),
12962 agent: "codex".into(),
12963 workspace: String::new(),
12964 workspace_original: None,
12965 created_at: Some(100),
12966 line_number: Some(1),
12967 match_type: MatchType::Exact,
12968 source_id: "local".into(),
12969 origin_kind: "local".into(),
12970 origin_host: None,
12971 conversation_id: Some(1),
12972 };
12973 let second_hit = SearchHit {
12974 conversation_id: Some(2),
12975 ..first_hit.clone()
12976 };
12977
12978 let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
12979 assert_eq!(resolved.len(), 2);
12980 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
12981 assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
12982
12983 Ok(())
12984 }
12985
12986 #[test]
12987 fn resolve_semantic_doc_ids_for_hits_treats_null_source_as_local() -> Result<()> {
12988 let conn = Connection::open(":memory:")?;
12989 conn.execute_batch(
12990 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12991 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12992 CREATE TABLE conversations (
12993 id INTEGER PRIMARY KEY,
12994 agent_id INTEGER NOT NULL,
12995 workspace_id INTEGER,
12996 source_id TEXT,
12997 origin_host TEXT,
12998 title TEXT,
12999 source_path TEXT NOT NULL
13000 );
13001 CREATE TABLE messages (
13002 id INTEGER PRIMARY KEY,
13003 conversation_id INTEGER NOT NULL,
13004 idx INTEGER,
13005 role TEXT,
13006 content TEXT NOT NULL,
13007 created_at INTEGER
13008 );",
13009 )?;
13010 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
13011 conn.execute(
13012 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
13013 VALUES(1, 1, NULL, NULL, NULL, 'Legacy Local', '/tmp/legacy-local.jsonl')",
13014 )?;
13015 let content = "legacy local semantic message".to_string();
13016 conn.execute_with_params(
13017 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
13018 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
13019 &[
13020 fsqlite_types::value::SqliteValue::Integer(11),
13021 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
13022 ],
13023 )?;
13024
13025 let client = SearchClient {
13026 reader: None,
13027 sqlite: Mutex::new(Some(SendConnection(conn))),
13028 sqlite_path: None,
13029 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
13030 reload_on_search: true,
13031 last_reload: Mutex::new(None),
13032 last_generation: Mutex::new(None),
13033 reload_epoch: Arc::new(AtomicU64::new(0)),
13034 warm_tx: None,
13035 _warm_handle: None,
13036 metrics: Metrics::default(),
13037 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
13038 semantic: Mutex::new(None),
13039 last_tantivy_total_count: Mutex::new(None),
13040 };
13041
13042 let hit = SearchHit {
13043 title: "Legacy Local".into(),
13044 snippet: String::new(),
13045 content: String::new(),
13046 content_hash: stable_hit_hash(&content, "/tmp/legacy-local.jsonl", Some(1), Some(100)),
13047 score: 0.0,
13048 source_path: "/tmp/legacy-local.jsonl".into(),
13049 agent: "codex".into(),
13050 workspace: String::new(),
13051 workspace_original: None,
13052 created_at: Some(100),
13053 line_number: Some(1),
13054 match_type: MatchType::Exact,
13055 source_id: "local".into(),
13056 origin_kind: "local".into(),
13057 origin_host: None,
13058 conversation_id: None,
13059 };
13060
13061 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
13062 assert_eq!(resolved.len(), 1);
13063 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
13064
13065 Ok(())
13066 }
13067
13068 #[test]
13069 fn resolve_semantic_doc_ids_for_hits_matches_trimmed_local_source_id() -> Result<()> {
13070 let conn = Connection::open(":memory:")?;
13071 conn.execute_batch(
13072 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
13073 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
13074 CREATE TABLE conversations (
13075 id INTEGER PRIMARY KEY,
13076 agent_id INTEGER NOT NULL,
13077 workspace_id INTEGER,
13078 source_id TEXT,
13079 origin_host TEXT,
13080 title TEXT,
13081 source_path TEXT NOT NULL
13082 );
13083 CREATE TABLE messages (
13084 id INTEGER PRIMARY KEY,
13085 conversation_id INTEGER NOT NULL,
13086 idx INTEGER,
13087 role TEXT,
13088 content TEXT NOT NULL,
13089 created_at INTEGER
13090 );",
13091 )?;
13092 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
13093 conn.execute(
13094 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
13095 VALUES(1, 1, NULL, ' local ', NULL, 'Trimmed Local', '/tmp/trimmed-local.jsonl')",
13096 )?;
13097 let content = "trimmed local semantic message".to_string();
13098 conn.execute_with_params(
13099 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
13100 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
13101 &[
13102 fsqlite_types::value::SqliteValue::Integer(11),
13103 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
13104 ],
13105 )?;
13106
13107 let client = SearchClient {
13108 reader: None,
13109 sqlite: Mutex::new(Some(SendConnection(conn))),
13110 sqlite_path: None,
13111 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
13112 reload_on_search: true,
13113 last_reload: Mutex::new(None),
13114 last_generation: Mutex::new(None),
13115 reload_epoch: Arc::new(AtomicU64::new(0)),
13116 warm_tx: None,
13117 _warm_handle: None,
13118 metrics: Metrics::default(),
13119 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
13120 semantic: Mutex::new(None),
13121 last_tantivy_total_count: Mutex::new(None),
13122 };
13123
13124 let hit = SearchHit {
13125 title: "Trimmed Local".into(),
13126 snippet: String::new(),
13127 content: String::new(),
13128 content_hash: stable_hit_hash(&content, "/tmp/trimmed-local.jsonl", Some(1), Some(100)),
13129 score: 0.0,
13130 source_path: "/tmp/trimmed-local.jsonl".into(),
13131 agent: "codex".into(),
13132 workspace: String::new(),
13133 workspace_original: None,
13134 created_at: Some(100),
13135 line_number: Some(1),
13136 match_type: MatchType::Exact,
13137 source_id: "local".into(),
13138 origin_kind: "local".into(),
13139 origin_host: None,
13140 conversation_id: None,
13141 };
13142
13143 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
13144 assert_eq!(resolved.len(), 1);
13145 assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
13146
13147 Ok(())
13148 }
13149
13150 #[test]
13151 fn resolve_semantic_doc_ids_for_hits_normalizes_blank_local_source_id() -> Result<()> {
13152 let conn = Connection::open(":memory:")?;
13153 conn.execute_batch(
13154 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
13155 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
13156 CREATE TABLE conversations (
13157 id INTEGER PRIMARY KEY,
13158 agent_id INTEGER NOT NULL,
13159 workspace_id INTEGER,
13160 source_id TEXT,
13161 origin_host TEXT,
13162 title TEXT,
13163 source_path TEXT NOT NULL
13164 );
13165 CREATE TABLE messages (
13166 id INTEGER PRIMARY KEY,
13167 conversation_id INTEGER NOT NULL,
13168 idx INTEGER,
13169 role TEXT,
13170 content TEXT NOT NULL,
13171 created_at INTEGER
13172 );",
13173 )?;
13174 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
13175 conn.execute(
13176 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
13177 VALUES(1, 1, NULL, 'local', NULL, 'Blank Local', '/tmp/blank-local.jsonl')",
13178 )?;
13179 let content = "blank local semantic message".to_string();
13180 conn.execute_with_params(
13181 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
13182 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
13183 &[
13184 fsqlite_types::value::SqliteValue::Integer(11),
13185 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
13186 ],
13187 )?;
13188
13189 let client = SearchClient {
13190 reader: None,
13191 sqlite: Mutex::new(Some(SendConnection(conn))),
13192 sqlite_path: None,
13193 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
13194 reload_on_search: true,
13195 last_reload: Mutex::new(None),
13196 last_generation: Mutex::new(None),
13197 reload_epoch: Arc::new(AtomicU64::new(0)),
13198 warm_tx: None,
13199 _warm_handle: None,
13200 metrics: Metrics::default(),
13201 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
13202 semantic: Mutex::new(None),
13203 last_tantivy_total_count: Mutex::new(None),
13204 };
13205
13206 let hit = SearchHit {
13207 title: "Blank Local".into(),
13208 snippet: String::new(),
13209 content: String::new(),
13210 content_hash: stable_hit_hash(&content, "/tmp/blank-local.jsonl", Some(1), Some(100)),
13211 score: 0.0,
13212 source_path: "/tmp/blank-local.jsonl".into(),
13213 agent: "codex".into(),
13214 workspace: String::new(),
13215 workspace_original: None,
13216 created_at: Some(100),
13217 line_number: Some(1),
13218 match_type: MatchType::Exact,
13219 source_id: " ".into(),
13220 origin_kind: "local".into(),
13221 origin_host: None,
13222 conversation_id: None,
13223 };
13224
13225 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
13226 assert_eq!(resolved.len(), 1);
13227 assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
13228
13229 Ok(())
13230 }
13231
13232 #[test]
13233 fn resolve_semantic_doc_ids_for_hits_infers_remote_source_from_origin_host_when_source_id_blank()
13234 -> Result<()> {
13235 let conn = Connection::open(":memory:")?;
13236 conn.execute_batch(
13237 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
13238 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
13239 CREATE TABLE conversations (
13240 id INTEGER PRIMARY KEY,
13241 agent_id INTEGER NOT NULL,
13242 workspace_id INTEGER,
13243 source_id TEXT,
13244 origin_host TEXT,
13245 title TEXT,
13246 source_path TEXT NOT NULL
13247 );
13248 CREATE TABLE messages (
13249 id INTEGER PRIMARY KEY,
13250 conversation_id INTEGER NOT NULL,
13251 idx INTEGER,
13252 role TEXT,
13253 content TEXT NOT NULL,
13254 created_at INTEGER
13255 );",
13256 )?;
13257 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
13258 conn.execute(
13259 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
13260 VALUES(1, 1, NULL, ' ', 'dev@laptop', 'Legacy Remote', '/tmp/legacy-remote.jsonl')",
13261 )?;
13262 let content = "legacy remote semantic message".to_string();
13263 conn.execute_with_params(
13264 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
13265 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
13266 &[
13267 fsqlite_types::value::SqliteValue::Integer(11),
13268 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
13269 ],
13270 )?;
13271
13272 let client = SearchClient {
13273 reader: None,
13274 sqlite: Mutex::new(Some(SendConnection(conn))),
13275 sqlite_path: None,
13276 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
13277 reload_on_search: true,
13278 last_reload: Mutex::new(None),
13279 last_generation: Mutex::new(None),
13280 reload_epoch: Arc::new(AtomicU64::new(0)),
13281 warm_tx: None,
13282 _warm_handle: None,
13283 metrics: Metrics::default(),
13284 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
13285 semantic: Mutex::new(None),
13286 last_tantivy_total_count: Mutex::new(None),
13287 };
13288
13289 let hit = SearchHit {
13290 title: "Legacy Remote".into(),
13291 snippet: String::new(),
13292 content: String::new(),
13293 content_hash: stable_hit_hash(&content, "/tmp/legacy-remote.jsonl", Some(1), Some(100)),
13294 score: 0.0,
13295 source_path: "/tmp/legacy-remote.jsonl".into(),
13296 agent: "codex".into(),
13297 workspace: String::new(),
13298 workspace_original: None,
13299 created_at: Some(100),
13300 line_number: Some(1),
13301 match_type: MatchType::Exact,
13302 source_id: "dev@laptop".into(),
13303 origin_kind: "remote".into(),
13304 origin_host: Some("dev@laptop".into()),
13305 conversation_id: None,
13306 };
13307
13308 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
13309 assert_eq!(resolved.len(), 1);
13310 assert_eq!(resolved[0].as_ref().map(|doc| doc.message_id), Some(11));
13311
13312 Ok(())
13313 }
13314
/// Browsing by date with a snippet-only projection must still give every hit
/// its own `content_hash`.
///
/// Two messages are seeded into an in-memory SQLite database. They share a
/// long common prefix and differ only in their tails. The test asserts that
/// the returned hits carry an empty `content`, a non-empty `snippet`, and
/// different `content_hash` values.
#[test]
fn browse_by_date_snippet_only_uses_full_content_for_hit_identity() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    let conn = Connection::open(":memory:")?;
    conn.execute_batch(
        "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
    )?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'browse title', '/tmp/browse-shared.jsonl')",
    )?;

    // Both messages start with the same long prefix; only the tail differs.
    let shared_prefix = "shared-prefix ".repeat(48);
    let seeded_messages = [
        (1, 0, format!("{shared_prefix}first browse-only tail"), 101),
        (2, 1, format!("{shared_prefix}second browse-only tail"), 102),
    ];
    // Each content string is moved into its SQL parameter, so no clone is needed.
    for (message_id, idx, content, created_at) in seeded_messages {
        conn.execute_with_params(
            "INSERT INTO messages(id, conversation_id, idx, content, created_at)
             VALUES(?1, 1, ?2, ?3, ?4)",
            &[
                SqliteValue::Integer(message_id),
                SqliteValue::Integer(idx),
                SqliteValue::Text(content.into()),
                SqliteValue::Integer(created_at),
            ],
        )?;
    }

    // Client backed only by the SQLite connection seeded above (no reader).
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let hits = client.browse_by_date(
        SearchFilters::default(),
        10,
        0,
        true,
        FieldMask::new(false, true, true, true),
    )?;

    assert_eq!(hits.len(), 2);
    assert!(hits.iter().all(|hit| hit.content.is_empty()));
    assert!(hits.iter().all(|hit| !hit.snippet.is_empty()));
    // The hashes must still tell the two messages apart.
    assert_ne!(hits[0].content_hash, hits[1].content_hash);

    Ok(())
}
13399
/// Indexes one conversation, searches it, commits a second conversation, and
/// then checks that a later search returns the newly committed document.
#[test]
fn cache_invalidates_on_new_data() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // First commit: a single user message containing "apple banana".
    let apple_message = NormalizedMessage {
        content: "apple banana".into(),
        role: "user".into(),
        idx: 0,
        created_at: Some(1),
        author: None,
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let apple_conversation = NormalizedConversation {
        title: Some("first".into()),
        agent_slug: "codex".into(),
        source_path: dir.path().join("1.jsonl"),
        started_at: Some(1),
        ended_at: None,
        external_id: None,
        workspace: None,
        metadata: serde_json::json!({}),
        messages: vec![apple_message],
    };
    index.add_conversation(&apple_conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let initial_hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(initial_hits.len(), 1);
    assert_eq!(initial_hits[0].content, "apple banana");

    // The search above must have left an entry for "app" in the "global" shard.
    let app_key = client.cache_key("app", &SearchFilters::default());
    {
        let cache = client.prefix_cache.lock().unwrap();
        let global_shard = cache.shard_opt("global").unwrap();
        assert!(global_shard.contains(&app_key));
    }

    // Second commit: a new conversation whose only message is "apricot".
    let apricot_message = NormalizedMessage {
        content: "apricot".into(),
        role: "user".into(),
        idx: 0,
        created_at: Some(2),
        author: None,
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let apricot_conversation = NormalizedConversation {
        title: Some("second".into()),
        agent_slug: "codex".into(),
        source_path: dir.path().join("2.jsonl"),
        started_at: Some(2),
        ended_at: None,
        external_id: None,
        workspace: None,
        metadata: serde_json::json!({}),
        messages: vec![apricot_message],
    };
    index.add_conversation(&apricot_conversation)?;
    index.commit()?;

    // Pause before searching again.
    // NOTE(review): presumably this waits out a reload debounce window - confirm.
    let settle_time = std::time::Duration::from_millis(350);
    std::thread::sleep(settle_time);

    // Repeat the earlier query (result unused), then look for the new document.
    let _repeat_hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    let fresh_hits = client.search("apr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(fresh_hits.len(), 1);
    assert_eq!(fresh_hits[0].content, "apricot");

    Ok(())
}
13490
/// A cached entry survives the first `track_generation` call and is dropped
/// once a different generation is reported.
#[test]
fn track_generation_clears_cache_on_change() {
    // Client with no reader, SQLite connection, or semantic state attached.
    let client = SearchClient {
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let cached_hit = SearchHit {
        title: "hello world".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        snippet: "hello".into(),
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_path: "p".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };
    let cached_hits = vec![cached_hit];

    // Takes the cache lock only for the duration of a single emptiness check.
    let cache_is_empty = || client.prefix_cache.lock().unwrap().shards.is_empty();

    client.put_cache("hello", &SearchFilters::default(), &cached_hits);
    assert!(!cache_is_empty());

    // First generation seen: the cached entry is still there.
    client.track_generation(1);
    assert!(!cache_is_empty());

    // Generation changed: the cache is emptied.
    client.track_generation(2);
    assert!(cache_is_empty());
}
13548
/// With a total entry cap of 2, caching three queries under three different
/// agent filters must leave the reported total cost at or below the cap.
#[test]
fn cache_total_cap_evicts_across_shards() {
    // `CacheShards::new(2, 0)`: the first argument is reported back as `total_cap`.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let hit = SearchHit {
        title: "a".into(),
        snippet: "a".into(),
        content: "a".into(),
        content_hash: stable_content_hash("a"),
        score: 1.0,
        source_path: "p".into(),
        agent: "agent1".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Borrow the single hit as a slice; there is no need to clone it into a Vec.
    let hits = std::slice::from_ref(&hit);

    // One cache insertion per (query, agent filter) pair. Each iteration
    // replaces the previous agent so the filter holds exactly one agent.
    let mut filters = SearchFilters::default();
    for (query, agent) in [("a", "agent1"), ("b", "agent2"), ("c", "agent3")] {
        filters.agents.clear();
        filters.agents.insert(agent.into());
        client.put_cache(query, &filters, hits);
    }

    let stats = client.cache_stats();
    assert!(stats.total_cost <= stats.total_cap);
    assert_eq!(stats.total_cap, 2);
}
13602
/// Counters bumped on `client.metrics` must show up in `cache_stats()`,
/// alongside the configured cap and eviction-policy label.
#[test]
fn cache_stats_reflect_metrics() {
    let client = SearchClient {
        metrics: Metrics::default(),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    // Record exactly one event of each kind, plus one 10 ms reload.
    let metrics = &client.metrics;
    metrics.inc_cache_hits();
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.record_reload(Duration::from_millis(10));

    let snapshot = client.cache_stats();

    // Counters recorded above.
    assert_eq!(snapshot.cache_hits, 1);
    assert_eq!(snapshot.cache_miss, 1);
    assert_eq!(snapshot.cache_shortfall, 1);
    assert_eq!(snapshot.reloads, 1);
    assert_eq!(snapshot.reload_ms_total, 10);

    // Configuration and untouched counters.
    assert_eq!(snapshot.total_cap, *CACHE_TOTAL_CAP);
    assert_eq!(snapshot.eviction_policy, "lru");
    assert_eq!(snapshot.prewarm_scheduled, 0);
    assert_eq!(snapshot.prewarm_skipped_pressure, 0);

    // A default-constructed stats value reports an "unknown" policy.
    assert_eq!(CacheStats::default().eviction_policy, "unknown");
}
13639
/// Adaptive prewarm must stay silent for a cold prefix and for an exactly
/// cached query, and must enqueue one job once a longer query extends a
/// cached prefix.
#[test]
fn adaptive_query_prewarm_schedules_only_after_hot_prefix_cache_entry() {
    let (warm_sender, warm_receiver) = mpsc::unbounded();
    let client = SearchClient {
        warm_tx: Some(warm_sender),
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(10, 0)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
    };
    let mut filters = SearchFilters::default();
    filters.workspaces.insert("/tmp/cass-workspace".into());

    // Nothing is cached yet, so no job may be queued.
    client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
    assert!(
        warm_receiver.try_recv().is_err(),
        "cold prefixes should not schedule adaptive prewarm"
    );

    // Cache a result for the prefix "hel".
    let content = String::from("hello world");
    let hit = SearchHit {
        snippet: "hello".into(),
        content_hash: stable_content_hash(&content),
        content,
        ..projected_minimal_fields_search_hit("hello title", "p")
    };
    client.put_cache("hel", &filters, std::slice::from_ref(&hit));

    let total_cost_before = client.cache_stats().total_cost;

    // The exact query that is already cached must not be prewarmed again.
    client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
    assert!(
        warm_receiver.try_recv().is_err(),
        "an exact cached query should not schedule redundant prewarm"
    );

    // A longer query extending the cached prefix queues a job.
    client.maybe_schedule_adaptive_query_prewarm("hello", &filters);
    let job = warm_receiver
        .try_recv()
        .expect("hot prefix should schedule adaptive prewarm");
    assert_eq!(job.query, "hello");
    assert_eq!(job.shard_name, "workspace:/tmp/cass-workspace");
    assert_eq!(job.filters_fingerprint, filters_fingerprint(&filters));

    let stats = client.cache_stats();
    assert_eq!(stats.prewarm_scheduled, 1);
    assert_eq!(stats.prewarm_skipped_pressure, 0);
    assert_eq!(
        stats.total_cost, total_cost_before,
        "prewarm scheduling should not mutate result-cache contents"
    );
}
13696
/// When the byte cap equals the size of the single cached entry, a prewarm
/// for a query extending that cached prefix is skipped and counted as
/// pressure-skipped. An unrelated cold query is not counted.
#[test]
fn adaptive_query_prewarm_skips_when_cache_byte_cap_is_under_pressure() {
    let content =
        String::from("hello world with enough content to consume the small byte budget");
    let hit = SearchHit {
        snippet: "hello".into(),
        content_hash: stable_content_hash(&content),
        content,
        ..projected_minimal_fields_search_hit("hello title", "p")
    };
    // The byte budget is exactly the approximate size of this one entry.
    let byte_cap = cached_hit_from(&hit).approx_bytes();

    let (warm_sender, warm_receiver) = mpsc::unbounded();
    let client = SearchClient {
        warm_tx: Some(warm_sender),
        _warm_handle: None,
        prefix_cache: Mutex::new(CacheShards::new(10, byte_cap)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
    };
    let filters = SearchFilters::default();

    client.put_cache("hel", &filters, std::slice::from_ref(&hit));

    // "zebra" does not extend the cached "hel" prefix.
    client.maybe_schedule_adaptive_query_prewarm("zebra", &filters);
    let skipped_after_cold_query = client.cache_stats().prewarm_skipped_pressure;
    assert_eq!(
        skipped_after_cold_query,
        0,
        "cold queries should not be counted as pressure-skipped prewarm jobs"
    );

    // "hello" extends the cached prefix but must be skipped.
    client.maybe_schedule_adaptive_query_prewarm("hello", &filters);
    assert!(
        warm_receiver.try_recv().is_err(),
        "prewarm should be disabled while cache byte pressure is high"
    );

    let stats = client.cache_stats();
    assert_eq!(stats.prewarm_scheduled, 0);
    assert_eq!(stats.prewarm_skipped_pressure, 1);
    assert!(stats.approx_bytes <= stats.byte_cap);
}
13743
/// Inserting three queries into a cache capped at two entries must record at
/// least one eviction, keep the cost within the cap, and report bytes used.
#[test]
fn cache_eviction_count_tracks_evictions() {
    let client = SearchClient {
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    let hit = SearchHit {
        title: "test".into(),
        snippet: "snippet".into(),
        content: "content".into(),
        content_hash: stable_content_hash("content"),
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_path: "p".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };

    // Three insertions against a cap of two entries.
    let single_hit = std::slice::from_ref(&hit);
    for query in ["query1", "query2", "query3"] {
        client.put_cache(query, &SearchFilters::default(), single_hit);
    }

    let stats = client.cache_stats();
    assert!(
        stats.eviction_count >= 1,
        "should have evicted at least 1 entry"
    );
    assert!(stats.total_cost <= 2, "should be at or below cap");
    assert!(stats.approx_bytes > 0, "should track bytes used");
}
13808
/// Pins the default cache byte cap for four inputs: unknown memory, a 2 GiB
/// host, a 64 GiB host, and a 256 GiB host.
#[test]
fn default_cache_byte_cap_scales_with_available_memory() {
    const GIB: u64 = 1024 * 1024 * 1024;

    // Unknown available memory falls back to the conservative default.
    let unknown_host_cap = default_cache_byte_cap_for_available(None);
    assert_eq!(unknown_host_cap, DEFAULT_CACHE_BYTE_CAP_FALLBACK);

    let small_host_cap = default_cache_byte_cap_for_available(Some(2 * GIB));
    assert_eq!(
        small_host_cap,
        DEFAULT_CACHE_BYTE_CAP_FALLBACK,
        "small hosts keep a conservative cache byte budget"
    );

    let medium_host_cap = default_cache_byte_cap_for_available(Some(64 * GIB));
    assert_eq!(
        medium_host_cap,
        512 * 1024 * 1024,
        "larger hosts get a proportionally larger cache byte budget"
    );

    // The ceiling constant is converted to `usize`, saturating if it does not fit.
    let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
    let large_host_cap = default_cache_byte_cap_for_available(Some(256 * GIB));
    assert_eq!(
        large_host_cap,
        ceiling,
        "large swarm hosts still have a bounded default cache budget"
    );
}
13833
/// An explicit "0" yields a cap of 0, while a malformed or missing value
/// falls back to the memory-derived default.
#[test]
fn malformed_cache_byte_cap_env_uses_default_instead_of_disabling_guard() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let available = Some(64 * GIB);
    let default_cap = default_cache_byte_cap_for_available(available);

    // A well-formed zero is passed through as-is.
    assert_eq!(cache_byte_cap_from_env_value(Some("0"), available), 0);

    // Garbage input must not produce a zero cap.
    assert_eq!(
        cache_byte_cap_from_env_value(Some("not-a-number"), available),
        default_cap,
        "malformed env should keep the default memory guard active"
    );

    // No value supplied at all: same default.
    assert_eq!(cache_byte_cap_from_env_value(None, available), default_cap);
}
13849
/// The eviction-policy parser returns LRU for a missing or unrecognized value
/// and accepts both the hyphen and underscore spellings of S3-FIFO.
#[test]
fn cache_eviction_policy_env_defaults_to_lru_and_accepts_s3_fifo() {
    let unset = cache_eviction_policy_from_env_value(None);
    assert_eq!(unset, CacheEvictionPolicy::Lru);

    let malformed = cache_eviction_policy_from_env_value(Some("not-a-policy"));
    assert_eq!(
        malformed,
        CacheEvictionPolicy::Lru,
        "malformed env keeps the current LRU behavior"
    );

    // Both spellings select the same policy.
    for spelling in ["s3-fifo", "s3_fifo"] {
        assert_eq!(
            cache_eviction_policy_from_env_value(Some(spelling)),
            CacheEvictionPolicy::S3Fifo
        );
    }
}
13870
/// Under the S3-FIFO policy, the first insertion of a byte-heavy entry is
/// rejected and recorded as a ghost. Re-inserting the same key is admitted
/// and removes the ghost.
#[test]
fn s3_fifo_admission_rejects_one_off_byte_heavy_entries_then_admits_ghost_replay() {
    let content = "large".repeat(1_000);
    let content_hash = stable_content_hash(&content);
    let hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content: content.clone(),
        content_hash,
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_path: "large-path".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };
    let cached = cached_hit_from(&hit);

    // Byte cap leaves 1 KiB of headroom over the entry itself.
    let entry_bytes = cached.approx_bytes();
    let byte_cap = entry_bytes + 1_024;
    // Sanity check: the entry exceeds the cap divided by the large-entry denominator.
    assert!(
        entry_bytes > byte_cap.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR)
    );

    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::S3Fifo);
    let key: Arc<str> = Arc::from("large-query");

    // First sighting: rejected and remembered as a ghost.
    cache.put("global", Arc::clone(&key), vec![cached.clone()]);
    assert_eq!(
        cache.total_cost(),
        0,
        "first one-off large entry is not admitted"
    );
    assert_eq!(cache.ghost_entries(), 1);
    assert_eq!(cache.admission_rejects(), 1);

    // Second sighting of the same key: admitted, ghost consumed.
    cache.put("global", key, vec![cached]);
    assert_eq!(
        cache.total_cost(),
        1,
        "ghost replay admits the repeated query"
    );
    assert_eq!(cache.ghost_entries(), 0);
    assert!(cache.ghost_keys.is_empty());
    assert_eq!(cache.admission_rejects(), 1);
    assert!(cache.total_bytes() <= cache.byte_cap());
}
13921
/// Under the LRU policy a single large entry that fits the byte cap is
/// admitted on first insertion, with no ghost entries or admission rejects.
#[test]
fn lru_policy_keeps_admitting_large_entries_under_existing_caps() {
    let content = "large".repeat(1_000);
    let content_hash = stable_content_hash(&content);
    let hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content: content.clone(),
        content_hash,
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_path: "large-path".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };
    let cached = cached_hit_from(&hit);

    // Byte cap leaves 1 KiB of headroom over the entry itself.
    let byte_cap = cached.approx_bytes() + 1_024;
    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::Lru);

    let key: Arc<str> = Arc::from("large-query");
    cache.put("global", key, vec![cached]);

    assert_eq!(cache.total_cost(), 1);
    assert_eq!(cache.ghost_entries(), 0);
    assert_eq!(cache.admission_rejects(), 0);
    assert_eq!(cache.policy_label(), "lru");
}
13954
/// With a 100-byte cap and a generous entry cap, caching three hits that each
/// carry 200 bytes of text must cause at least one eviction.
#[test]
fn cache_byte_cap_triggers_eviction() {
    // Entry cap 1000, byte cap 100.
    let client = SearchClient {
        prefix_cache: Mutex::new(CacheShards::new(1000, 100)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        metrics: Metrics::default(),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
    };

    // 50 + 50 + 100 bytes of title, snippet, and content.
    let content = "c".repeat(100);
    let content_hash = stable_content_hash(&content);
    let hit = SearchHit {
        title: "a".repeat(50),
        snippet: "b".repeat(50),
        content: content.clone(),
        content_hash,
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_path: "p".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };

    let single_hit = std::slice::from_ref(&hit);
    for query in ["q1", "q2", "q3"] {
        client.put_cache(query, &SearchFilters::default(), single_hit);
    }

    let stats = client.cache_stats();
    assert!(
        stats.eviction_count >= 1,
        "byte cap should trigger evictions"
    );
    assert_eq!(stats.byte_cap, 100, "byte cap should be reported");
}
14009
/// When one shard holds a single oversized entry and another holds two small
/// ones, byte pressure must empty the oversized shard and leave the small
/// shard intact.
#[test]
fn cache_byte_pressure_evicts_byte_heavy_shard_before_small_entries() {
    let small_hit = SearchHit {
        title: "small".into(),
        snippet: "small".into(),
        content: "small".into(),
        content_hash: stable_content_hash("small"),
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        source_path: "small-path".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: None,
        line_number: None,
    };

    // Same shape as the small hit, with a 10_000-byte body and its own
    // title, snippet, path, and agent.
    let large_content = "large".repeat(2_000);
    let large_hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content_hash: stable_content_hash(&large_content),
        content: large_content.clone(),
        source_path: "large-path".into(),
        agent: "b".into(),
        ..small_hit.clone()
    };

    // Entry cap 100, byte cap 1 KiB.
    let mut cache = CacheShards::new(100, 1_024);
    for small_key in ["small-1", "small-2"] {
        cache.put(
            "small",
            Arc::<str>::from(small_key),
            vec![cached_hit_from(&small_hit)],
        );
    }
    cache.put(
        "large",
        Arc::<str>::from("large-1"),
        vec![cached_hit_from(&large_hit)],
    );

    let small_shard_len = cache.shard_opt("small").map(|shard| shard.len());
    assert_eq!(
        small_shard_len,
        Some(2),
        "byte pressure should preserve the small shard"
    );
    // The large shard is either gone entirely or present but empty.
    let large_shard_cleared = cache
        .shard_opt("large")
        .is_none_or(|shard| shard.is_empty());
    assert!(
        large_shard_cleared,
        "oversized shard should be evicted first under byte pressure"
    );
    assert!(cache.total_bytes() <= cache.byte_cap());
}
14078
/// Inputs without `*` parse to `Exact`, with the term lowercased.
#[test]
fn wildcard_pattern_parse_exact() {
    let cases = [
        ("hello", "hello"),
        ("HELLO", "hello"),
        ("FooBar123", "foobar123"),
    ];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Exact(term.into())
        );
    }
}
14099
/// A trailing `*` parses to `Prefix`, with the term lowercased.
#[test]
fn wildcard_pattern_parse_prefix() {
    let cases = [("foo*", "foo"), ("CONFIG*", "config"), ("test*", "test")];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Prefix(term.into())
        );
    }
}
14116
/// A leading `*` parses to `Suffix`, with the term lowercased.
#[test]
fn wildcard_pattern_parse_suffix() {
    let cases = [("*foo", "foo"), ("*Error", "error"), ("*Handler", "handler")];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Suffix(term.into())
        );
    }
}
14133
/// A `*` on both ends parses to `Substring`, with the term lowercased.
#[test]
fn wildcard_pattern_parse_substring() {
    let cases = [("*foo*", "foo"), ("*CONFIG*", "config"), ("*test*", "test")];
    for (input, term) in cases {
        assert_eq!(
            FsCassWildcardPattern::parse(input),
            FsCassWildcardPattern::Substring(term.into())
        );
    }
}
14150
/// Edge cases for wildcard parsing: star-only inputs, single-character terms,
/// and runs of repeated stars around a term.
#[test]
fn wildcard_pattern_parse_edge_cases() {
    // Inputs made only of stars parse to an empty exact term.
    for stars_only in ["*", "**", "***"] {
        assert_eq!(
            FsCassWildcardPattern::parse(stars_only),
            FsCassWildcardPattern::Exact(String::new())
        );
    }

    // Single-character terms keep their wildcard kind.
    let single_char_cases = [
        ("*a*", FsCassWildcardPattern::Substring("a".into())),
        ("a*", FsCassWildcardPattern::Prefix("a".into())),
        ("*a", FsCassWildcardPattern::Suffix("a".into())),
    ];
    for (input, expected) in single_char_cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }

    // Repeated stars on both sides behave like a single star on each side.
    let repeated_stars = FsCassWildcardPattern::parse("***foo***");
    assert_eq!(
        repeated_stars,
        FsCassWildcardPattern::Substring("foo".into())
    );
}
14187
/// A `Suffix` pattern renders as `.*<term>$`.
#[test]
fn wildcard_pattern_to_regex_suffix() {
    let regex = FsCassWildcardPattern::Suffix("foo".into()).to_regex();
    assert_eq!(regex, Some(".*foo$".into()));
}
14194
/// A `Substring` pattern renders as `.*<term>.*`.
#[test]
fn wildcard_pattern_to_regex_substring() {
    let regex = FsCassWildcardPattern::Substring("bar".into()).to_regex();
    assert_eq!(regex, Some(".*bar.*".into()));
}
14200
/// `Exact` and `Prefix` patterns produce no regex at all.
#[test]
fn wildcard_pattern_to_regex_exact_prefix_none() {
    let exact_regex = FsCassWildcardPattern::Exact("foo".into()).to_regex();
    assert!(exact_regex.is_none());

    let prefix_regex = FsCassWildcardPattern::Prefix("bar".into()).to_regex();
    assert!(prefix_regex.is_none());
}
14210
/// Pins the quality factor of each match type, from `Exact` (1.0) down to
/// `ImplicitWildcard` (0.6).
#[test]
fn match_type_quality_factors() {
    let expected_factors = [
        (MatchType::Exact, 1.0),
        (MatchType::Prefix, 0.9),
        (MatchType::Suffix, 0.8),
        (MatchType::Substring, 0.7),
        (MatchType::ImplicitWildcard, 0.6),
    ];
    for (match_type, factor) in expected_factors {
        assert_eq!(match_type.quality_factor(), factor);
    }
}
14224
/// A single-term query maps to the match type implied by its wildcard shape.
#[test]
fn dominant_match_type_single_terms() {
    let cases = [
        ("hello", MatchType::Exact),
        ("hello*", MatchType::Prefix),
        ("*hello", MatchType::Suffix),
        ("*hello*", MatchType::Substring),
    ];
    for (query, expected) in cases {
        assert_eq!(dominant_match_type(query), expected);
    }
}
14233
/// Pins the dominant match type for multi-term queries that mix exact terms
/// with prefix, suffix, and substring wildcards.
#[test]
fn dominant_match_type_multiple_terms() {
    let cases = [
        ("foo bar", MatchType::Exact),
        ("foo bar*", MatchType::Prefix),
        ("foo *bar", MatchType::Suffix),
        ("foo* *bar*", MatchType::Substring),
        ("foo *bar* baz", MatchType::Substring),
    ];
    for (query, expected) in cases {
        assert_eq!(dominant_match_type(query), expected);
    }
}
14244
/// Empty and whitespace-only queries report `Exact`.
#[test]
fn dominant_match_type_empty_query() {
    for blank_query in ["", " "] {
        assert_eq!(dominant_match_type(blank_query), MatchType::Exact);
    }
}
14250
/// Regex metacharacters inside the term (`.`, `+`, `*`, `?`) are
/// backslash-escaped in the generated regex.
#[test]
fn wildcard_pattern_to_regex_escapes_special_chars() {
    let cases = [
        (
            FsCassWildcardPattern::Suffix("foo.bar".into()),
            ".*foo\\.bar$",
        ),
        (
            FsCassWildcardPattern::Substring("a+b*c?".into()),
            ".*a\\+b\\*c\\?.*",
        ),
    ];
    for (pattern, expected_regex) in cases {
        assert_eq!(pattern.to_regex(), Some(expected_regex.into()));
    }
}
14262
/// Brackets, groups, alternation, and anchors inside the term are escaped
/// rather than interpreted as regex syntax.
#[test]
fn wildcard_pattern_to_regex_escapes_complex_patterns() {
    let cases = [
        (
            FsCassWildcardPattern::Suffix("test[0-9]+".into()),
            ".*test\\[0-9\\]\\+$",
        ),
        (
            FsCassWildcardPattern::Substring("(a|b)".into()),
            ".*\\(a\\|b\\).*",
        ),
        (
            FsCassWildcardPattern::Substring("end$".into()),
            ".*end\\$.*",
        ),
        (
            FsCassWildcardPattern::Substring("^start".into()),
            ".*\\^start.*",
        ),
    ];
    for (pattern, expected_regex) in cases {
        assert_eq!(pattern.to_regex(), Some(expected_regex.into()));
    }
}
14282
/// Pins which tool-marker strings count as noise. Markers naming a tool are
/// kept; markers with no tool name, and the bare `[tool]` / `tool: Bash`
/// forms, are noise.
#[test]
fn is_tool_invocation_noise_detects_noise() {
    // (input, expected result of `is_tool_invocation_noise`)
    let cases = [
        ("[Tool: Bash]", false),
        ("[Tool: Read]", false),
        ("[Tool:]", true),
        ("[Tool: ]", true),
        ("[Tool: Bash - Check status]", false),
        (" [Tool: Grep - Search files] ", false),
        ("[tool]", true),
        ("tool: Bash", true),
    ];
    for (input, expected_noise) in cases {
        assert!(is_tool_invocation_noise(input) == expected_noise);
    }
}
14301
/// Tool markers that carry a description (a path or a command) are not noise.
#[test]
fn is_tool_invocation_noise_allows_useful_content() {
    let useful_markers = [
        "[Tool: Read - src/main.rs]",
        "[Tool: Bash - cargo test --lib]",
    ];
    for marker in useful_markers {
        assert!(!is_tool_invocation_noise(marker));
    }
}
14308
/// Only the marker with no tool name (`[Tool:]`) is noise. Markers with a
/// name, with or without a description or surrounding whitespace, are kept.
#[test]
fn is_tool_invocation_noise_detects_tool_markers() {
    // (input, expected result of `is_tool_invocation_noise`)
    let cases = [
        ("[Tool: Bash]", false),
        ("[Tool: Read]", false),
        ("[Tool:]", true),
        ("[Tool: Bash - Check status]", false),
        (" [Tool: Write - description] ", false),
    ];
    for (input, expected_noise) in cases {
        assert!(is_tool_invocation_noise(input) == expected_noise);
    }
}
14322
/// Two hits that differ only in snippet and score collapse into one; the
/// surviving hit is the first one, which has the higher score (1.0).
#[test]
fn deduplicate_hits_removes_exact_dupes() {
    let first = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        match_type: MatchType::Exact,
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        source_path: "a.jsonl".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: Some(100),
        line_number: None,
    };
    // Same hit again with a different snippet and a lower score.
    let duplicate = SearchHit {
        snippet: "snip2".into(),
        score: 0.5,
        ..first.clone()
    };

    let deduped = deduplicate_hits(vec![first, duplicate]);

    assert_eq!(deduped.len(), 1);
    let survivor = &deduped[0];
    assert_eq!(survivor.score, 1.0);
    assert_eq!(survivor.title, "title1");
}
14369
/// When the lower-scored duplicate comes first, deduplication still keeps the
/// higher-scored one.
#[test]
fn deduplicate_hits_keeps_higher_score() {
    let shared_body = "hello world";
    let weaker = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: shared_body.into(),
        content_hash: stable_content_hash(shared_body),
        score: 0.3,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let stronger = SearchHit {
        snippet: "snip2".into(),
        score: 0.9,
        ..weaker.clone()
    };

    let deduped = deduplicate_hits(vec![weaker, stronger]);
    assert_eq!(deduped.len(), 1);

    let survivor = &deduped[0];
    assert_eq!(survivor.score, 0.9);
    assert_eq!(survivor.title, "title1");
}
14416
/// Identical content that appears on different lines of the same session is
/// kept as two separate hits.
#[test]
fn deduplicate_hits_keeps_repeated_same_content_at_different_lines() {
    let repeated = "repeat me";
    let earlier = SearchHit {
        title: "Shared Session".into(),
        snippet: String::new(),
        content: repeated.into(),
        content_hash: stable_content_hash(repeated),
        score: 10.0,
        source_path: "/shared/session.jsonl".into(),
        agent: "codex".into(),
        workspace: "/ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same message body, but on the next line, with a later timestamp.
    let later = SearchHit {
        line_number: Some(2),
        created_at: Some(200),
        score: 9.0,
        ..earlier.clone()
    };

    let deduped = deduplicate_hits(vec![earlier, later]);
    assert_eq!(deduped.len(), 2);
}
14445
/// Hits that share title, source path and content but have different
/// conversation ids are both kept.
#[test]
fn deduplicate_hits_keeps_distinct_conversation_ids_with_same_title_path_and_content() {
    let body = "identical body";
    let first = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        conversation_id: Some(1),
        ..make_test_hit("same", 1.0)
    };
    let second = SearchHit {
        conversation_id: Some(2),
        score: 0.9,
        ..first.clone()
    };

    let deduped = deduplicate_hits(vec![first, second]);
    assert_eq!(deduped.len(), 2);
    for expected_id in [1, 2] {
        assert!(
            deduped
                .iter()
                .any(|hit| hit.conversation_id == Some(expected_id))
        );
    }
}
14464
/// Hits with the same conversation id collapse into one even when their
/// titles differ.
#[test]
fn deduplicate_hits_coalesces_same_conversation_id_despite_title_drift() {
    let body = "identical body";
    let morning = SearchHit {
        title: "Morning Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        conversation_id: Some(7),
        ..make_test_hit("same", 1.0)
    };
    let evening = SearchHit {
        title: "Evening Session".into(),
        score: 0.9,
        ..morning.clone()
    };

    let deduped = deduplicate_hits(vec![morning, evening]);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0].conversation_id, Some(7));
}
14482
/// Without a conversation id, hits that share source path and content but
/// have different titles are both kept.
#[test]
fn deduplicate_hits_keeps_distinct_titles_with_same_source_path_and_content() {
    let body = "hello world";
    let morning = SearchHit {
        title: "Morning Session".into(),
        snippet: "snip1".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 0.9,
        source_path: "shared.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: None,
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let evening = SearchHit {
        title: "Evening Session".into(),
        snippet: "snip2".into(),
        score: 0.8,
        ..morning.clone()
    };

    let deduped = deduplicate_hits(vec![morning, evening]);
    assert_eq!(deduped.len(), 2);
    for expected_title in ["Morning Session", "Evening Session"] {
        assert!(deduped.iter().any(|hit| hit.title == expected_title));
    }
}
14529
/// Hits whose content differs only in the amount of whitespace between words
/// are expected to collapse into a single entry.
#[test]
fn deduplicate_hits_normalizes_whitespace() {
    // The first hit deliberately uses a run of spaces so that the two hits are
    // NOT byte-identical; with identical strings this test would only repeat
    // the exact-duplicate case and never exercise whitespace normalization.
    let irregular_body = "hello    world";
    let regular_body = "hello world";

    let irregular = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: irregular_body.into(),
        content_hash: stable_content_hash(irregular_body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let regular = SearchHit {
        snippet: "snip2".into(),
        content: regular_body.into(),
        content_hash: stable_content_hash(regular_body),
        score: 0.5,
        ..irregular.clone()
    };

    let deduped = deduplicate_hits(vec![irregular, regular]);
    assert_eq!(
        deduped.len(),
        1,
        "content differing only in whitespace should dedupe to one hit"
    );
}
14574
/// A whitespace-only `source_id` on a local-origin hit is treated as the same
/// source as `"local"`, so the two hits dedupe and the survivor reports
/// `"local"`.
#[test]
fn deduplicate_hits_normalizes_blank_local_source_id() {
    let body = "hello world";
    let explicit_local = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let blank_source = SearchHit {
        snippet: "snip2".into(),
        score: 0.5,
        source_id: " ".into(),
        ..explicit_local.clone()
    };

    let deduped = deduplicate_hits(vec![explicit_local, blank_source]);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0].source_id, "local");
}
14620
/// A hit whose content is only an empty tool marker is dropped, while a hit
/// with real content is kept.
#[test]
fn deduplicate_hits_filters_tool_noise() {
    let noise_body = "[Tool:]";
    let real_body = "This is real content about testing";

    let noise = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: noise_body.into(),
        content_hash: stable_content_hash(noise_body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let real = SearchHit {
        title: "title2".into(),
        snippet: "snip2".into(),
        content: real_body.into(),
        content_hash: stable_content_hash(real_body),
        score: 0.5,
        source_path: "b.jsonl".into(),
        created_at: Some(200),
        ..noise.clone()
    };

    let deduped = deduplicate_hits(vec![noise, real]);
    assert_eq!(deduped.len(), 1);
    assert!(deduped[0].content.contains("real content"));
}
14666
/// A bare acknowledgement message is dropped by query-aware deduplication,
/// leaving only the substantive hit.
#[test]
fn deduplicate_hits_filters_acknowledgement_noise() {
    let ack_body = "Acknowledged.";
    let real_body = "Authentication refresh logic changed";

    let ack = SearchHit {
        title: "ack".into(),
        snippet: "ack".into(),
        content: ack_body.into(),
        content_hash: stable_content_hash(ack_body),
        score: 1.0,
        source_path: "ack.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let real = SearchHit {
        title: "real".into(),
        snippet: "real".into(),
        content: real_body.into(),
        content_hash: stable_content_hash(real_body),
        score: 0.5,
        source_path: "real.jsonl".into(),
        created_at: Some(200),
        ..ack.clone()
    };

    let deduped = deduplicate_hits_with_query(vec![ack, real], "authentication");
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0].title, "real");
}
14712
/// A system-prompt-style hit is filtered out for an ordinary query, but kept
/// when the query itself asks for the prompt text.
#[test]
fn deduplicate_hits_hides_system_prompts_unless_query_requests_them() {
    let prompt_text = "# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly.";
    let prompt_hit = SearchHit {
        title: "prompt".into(),
        snippet: "prompt".into(),
        content: prompt_text.into(),
        content_hash: stable_content_hash(prompt_text),
        score: 1.0,
        source_path: "prompt.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // An ordinary query that happens to match the prompt body hides it.
    let hidden = deduplicate_hits_with_query(vec![prompt_hit.clone()], "coding assistant");
    assert!(hidden.is_empty());

    // A query that names the prompt explicitly keeps it.
    let kept = deduplicate_hits_with_query(vec![prompt_hit], "AGENTS.md instructions");
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].title, "prompt");
}
14746
/// Three hits with distinct content, paths and timestamps all survive
/// deduplication.
#[test]
fn deduplicate_hits_preserves_unique_content() {
    // (title, snippet, content, score, source_path, created_at)
    let hits: Vec<SearchHit> = [
        ("title1", "snip1", "first message", 1.0, "a.jsonl", 100),
        ("title2", "snip2", "second message", 0.8, "b.jsonl", 200),
        ("title3", "snip3", "third message", 0.6, "c.jsonl", 300),
    ]
    .into_iter()
    .map(|(title, snippet, body, score, path, created)| SearchHit {
        title: title.into(),
        snippet: snippet.into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score,
        source_path: path.into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(created),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    })
    .collect();

    let deduped = deduplicate_hits(hits);
    assert_eq!(deduped.len(), 3);
}
14809
/// Identical content coming from two different sources (local vs. a remote
/// host) is not merged.
#[test]
fn deduplicate_hits_respects_source_boundaries() {
    let body = "hello world";
    let local_hit = SearchHit {
        title: "local title".into(),
        snippet: "snip".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let remote_hit = SearchHit {
        title: "remote title".into(),
        score: 0.9,
        source_path: "b.jsonl".into(),
        created_at: Some(200),
        source_id: "work-laptop".into(),
        origin_kind: "ssh".into(),
        origin_host: Some("work-laptop.local".into()),
        ..local_hit.clone()
    };

    let deduped = deduplicate_hits(vec![local_hit, remote_hit]);
    assert_eq!(
        deduped.len(),
        2,
        "same content from different sources should not dedupe"
    );
    for expected_source in ["local", "work-laptop"] {
        assert!(deduped.iter().any(|h| h.source_id == expected_source));
    }
}
14862
/// Table-driven check of `should_try_wildcard_fallback`.
///
/// Column names below are inferred from the assertion messages and from how
/// `search_with_fallback` is called elsewhere in these tests; the function
/// signature itself is not visible here.
#[test]
fn wildcard_fallback_sparse_check_uses_effective_limit() {
    // (hit_count, limit, offset, threshold, expected, rationale)
    let cases = [
        (
            1,
            1,
            0,
            3,
            false,
            "a filled one-result page is not sparse for fallback purposes",
        ),
        (
            2,
            2,
            0,
            3,
            false,
            "a filled two-result page is not sparse for fallback purposes",
        ),
        (
            0,
            1,
            0,
            3,
            true,
            "zero hits should still trigger fallback even for tiny pages",
        ),
        (
            1,
            2,
            0,
            3,
            true,
            "a partially filled page should still trigger fallback",
        ),
        (
            0,
            5,
            10,
            3,
            false,
            "pagination should not trigger wildcard fallback",
        ),
        (
            1,
            0,
            0,
            3,
            true,
            "limit zero preserves the legacy sparse-threshold semantics",
        ),
    ];

    for (hit_count, limit, offset, threshold, expected, rationale) in cases {
        assert_eq!(
            should_try_wildcard_fallback(hit_count, limit, offset, threshold),
            expected,
            "{rationale}"
        );
    }
}
14890
/// The preview-based snippet shortcut only applies when the request is
/// snippet-only AND the preview text actually contains the query.
#[test]
fn snippet_preview_fast_path_requires_snippet_only_match() {
    let snippet_only = FieldMask::new(false, true, false, false);
    let query = "database";
    let preview_with_match = "migration checks the database constraint before writing";
    let preview_without_match = "migration checks constraints before writing";

    // Snippet-only request + matching preview: a highlighted snippet comes back.
    let rendered =
        snippet_from_preview_without_full_content(snippet_only, preview_with_match, query)
            .expect("preview should satisfy a snippet-only request when it contains the query");
    assert!(rendered.contains("**database**"));

    // Full-content request: the shortcut declines.
    let full_request =
        snippet_from_preview_without_full_content(FieldMask::FULL, preview_with_match, query);
    assert!(
        full_request.is_none(),
        "full-content requests must keep the sqlite hydration path"
    );

    // Snippet-only request, but the preview lacks the query: the shortcut declines.
    let missing_match =
        snippet_from_preview_without_full_content(snippet_only, preview_without_match, query);
    assert!(
        missing_match.is_none(),
        "snippet-only requests hydrate when the preview cannot show the match"
    );
}
14921
/// With five indexed documents that all contain the query term, the search
/// returns at least three hits and does not report a wildcard fallback.
#[test]
fn search_with_fallback_returns_exact_when_sufficient() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    for i in 0..5 {
        let timestamp = 100 + i;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: format!("apple fruit number {i} is delicious and healthy"),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{i}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: dir.path().join(format!("{i}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let filters = SearchFilters::default();
    let result = client.search_with_fallback("apple", filters, 10, 0, 3, FieldMask::FULL)?;

    assert!(!result.wildcard_fallback);
    assert!(result.hits.len() >= 3);
    Ok(())
}
14971
/// Searching for a prefix of an indexed word ("config" vs. "configuration")
/// yields at least one hit. The test only checks that hits are returned; it
/// does not assert which search path produced them.
#[test]
fn search_with_fallback_triggers_on_sparse_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "configuration management system".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("substring test".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let filters = SearchFilters::default();
    let result = client.search_with_fallback("config", filters, 10, 0, 5, FieldMask::FULL)?;

    assert!(!result.hits.is_empty());

    Ok(())
}
15019
/// A query that already contains wildcards never reports an automatic
/// wildcard fallback.
#[test]
fn search_with_fallback_skips_when_query_has_wildcards() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "testing data".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let filters = SearchFilters::default();
    let result = client.search_with_fallback("*test*", filters, 10, 0, 10, FieldMask::FULL)?;

    assert!(!result.wildcard_fallback);
    Ok(())
}
15063
/// When the literal query "bet" only matches as a substring of "alphabet",
/// the search reports a wildcard fallback and returns both documents, each
/// marked as an implicit-wildcard match.
#[test]
fn search_with_fallback_prefers_wildcards_when_they_add_hits() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let bodies = [
        "alphabet soup for coders",
        "mapping the alphabet city blocks",
    ];
    for (i, body) in bodies.into_iter().enumerate() {
        let timestamp = 100 + i as i64;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: body.to_owned(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("alpha-{i}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: dir.path().join(format!("alpha-{i}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let filters = SearchFilters::default();
    let result = client.search_with_fallback("bet", filters, 10, 0, 2, FieldMask::FULL)?;

    assert!(
        result.wildcard_fallback,
        "should switch to wildcard fallback when it yields more hits"
    );
    assert_eq!(
        result.hits.len(),
        2,
        "fallback should surface all alphabet docs"
    );
    for hit in result.hits.iter() {
        assert!(hit.match_type == MatchType::ImplicitWildcard);
        assert!(hit.content.contains("alphabet"));
    }

    Ok(())
}
15132
/// A long token with zero hits does not trigger the automatic wildcard
/// fallback (though a manual wildcard suggestion is still offered), whereas a
/// short substring token does fall back and finds the document.
#[test]
fn automatic_wildcard_fallback_skips_long_zero_hit_token() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: dir.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Long, unmatched token: no hits, no automatic fallback.
    let long_result = client.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;
    assert!(long_result.hits.is_empty());
    assert!(!long_result.wildcard_fallback);
    let offers_wildcard = long_result
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_wildcard,
        "manual wildcard suggestion should remain available"
    );

    // Short substring token: fallback runs and finds the single document.
    let short_result = client.search_with_fallback(
        "pple",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;
    assert!(short_result.wildcard_fallback);
    assert_eq!(short_result.hits.len(), 1);
    assert_eq!(short_result.hits[0].match_type, MatchType::ImplicitWildcard);

    Ok(())
}
15195
/// A no-hit search against a client that has both a Tantivy index and a
/// SQLite path leaves the client's `sqlite` slot empty before and after, and
/// produces no alternate-agent suggestions.
#[test]
fn nohit_suggestions_do_not_lazy_open_sqlite_when_tantivy_is_present() -> Result<()> {
    let dir = TempDir::new()?;
    let index_path = dir.path().join("index");
    let db_path = dir.path().join("cass.db");

    // Create the database file, then close it again before the client opens.
    let storage = FrankenStorage::open(&db_path)?;
    storage.close()?;

    let mut index = TantivyIndex::open_or_create(&index_path)?;
    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: dir.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("index present");

    // True only when the lock is acquired and the slot holds `None`.
    let sqlite_is_closed = || {
        client
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };
    assert!(sqlite_is_closed(), "sqlite should start closed");

    let result = client.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;

    assert!(result.hits.is_empty());

    let offers_wildcard = result
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_wildcard,
        "manual wildcard suggestion should remain available"
    );

    let offers_alternate_agent = result
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::AlternateAgent));
    assert!(
        !offers_alternate_agent,
        "alternate-agent suggestions should not force a SQLite open"
    );

    assert!(
        sqlite_is_closed(),
        "sqlite should stay closed after Tantivy no-hit suggestions"
    );

    Ok(())
}
15274
/// A client with neither index reader nor SQLite connection returns no hits,
/// does not flag a wildcard fallback, and suggests the `*ghost*` wildcard
/// query.
#[test]
fn search_with_fallback_emits_wildcard_suggestion_on_zero_hits() -> Result<()> {
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let reload_epoch = Arc::new(AtomicU64::new(0));
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch,
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let filters = SearchFilters::default();
    let result = client.search_with_fallback("ghost", filters, 5, 0, 3, FieldMask::FULL)?;

    assert!(
        result.hits.is_empty(),
        "no index/db means no hits should be returned"
    );
    assert!(
        !result.wildcard_fallback,
        "with zero baseline and fallback hits, we should keep baseline and mark fallback=false"
    );

    let Some(wildcard) = result
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
    else {
        panic!("should suggest adding wildcards");
    };
    assert_eq!(wildcard.suggested_query.as_deref(), Some("*ghost*"));

    Ok(())
}
15321
/// A whitespace-only query never reports a wildcard fallback.
#[test]
fn search_with_fallback_skips_empty_query() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "testing data".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let filters = SearchFilters::default();
    let result = client.search_with_fallback(" ", filters, 10, 0, 10, FieldMask::FULL)?;

    assert!(!result.wildcard_fallback);
    Ok(())
}
15365
/// With a non-zero offset the search does not flag a wildcard fallback, yet
/// still offers the `*ghost*` wildcard suggestion.
#[test]
fn search_with_fallback_skips_for_nonzero_offset() -> Result<()> {
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let reload_epoch = Arc::new(AtomicU64::new(0));
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch,
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Offset 10 with limit 5: this is a later page, not the first one.
    let filters = SearchFilters::default();
    let result = client.search_with_fallback("ghost", filters, 5, 10, 3, FieldMask::FULL)?;

    assert!(
        !result.wildcard_fallback,
        "fallback should not run on paginated searches"
    );

    let Some(wildcard) = result
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
    else {
        panic!("wildcard suggestion present");
    };
    assert_eq!(wildcard.suggested_query.as_deref(), Some("*ghost*"));

    Ok(())
}
15409
/// A no-hit search with an agent filter and a near-miss query yields exactly
/// three suggestions, numbered 1..=3, covering the wildcard, remove-filter
/// and spelling-fix kinds.
#[test]
fn generate_suggestions_limits_and_sets_shortcuts() -> Result<()> {
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let reload_epoch = Arc::new(AtomicU64::new(0));
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch,
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let mut filters = SearchFilters::default();
    filters.agents.insert("codex".into());
    let result = client.search_with_fallback("claud", filters, 5, 0, 3, FieldMask::FULL)?;

    let suggestions = &result.suggestions;
    assert_eq!(suggestions.len(), 3, "should truncate to 3 suggestions");

    // Shortcuts are 1-based and follow the order of the suggestions.
    for (idx, sugg) in suggestions.iter().enumerate() {
        let expected_shortcut = Some((idx + 1) as u8);
        assert_eq!(
            sugg.shortcut, expected_shortcut,
            "shortcut should match position (1-based)"
        );
    }

    let has_wildcard = suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    let has_remove_filter = suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::RemoveFilter));
    let has_spelling_fix = suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::SpellingFix));

    assert!(has_wildcard, "should suggest wildcard search");
    assert!(has_remove_filter, "should suggest removing agent filter");
    assert!(
        has_spelling_fix,
        "should suggest spelling fix for nearby agent name"
    );

    Ok(())
}
15474
/// Seeds a database with one conversation each for `claude_code` and `codex`, then
/// verifies that a search for "ghost" yields alternate-agent suggestions naming both.
#[test]
fn generate_suggestions_includes_recent_alternate_agents() -> Result<()> {
    let dir = TempDir::new()?;
    let db_path = dir.path().join("cass.db");
    let storage = FrankenStorage::open(&db_path)?;
    let workspace_id = storage.ensure_workspace(dir.path(), None)?;
    let base_ts = 1_700_000_010_000_i64;

    for (idx, slug) in ["claude_code", "codex"].iter().enumerate() {
        let slug = *slug;
        // Each agent gets a distinct timestamp derived from its position.
        let stamp = base_ts + idx as i64;

        let agent_id = storage.ensure_agent(&Agent {
            id: None,
            slug: slug.to_string(),
            name: slug.to_string(),
            version: None,
            kind: AgentKind::Cli,
        })?;

        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(stamp),
            content: format!("content from {slug}"),
            extra_json: json!({}),
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: slug.to_string(),
            workspace: Some(dir.path().to_path_buf()),
            external_id: Some(format!("alt-agent-{idx}")),
            title: Some(format!("alternate agent {idx}")),
            source_path: dir.path().join(format!("{slug}.jsonl")),
            started_at: Some(stamp),
            ended_at: Some(stamp),
            approx_tokens: Some(8),
            metadata_json: json!({}),
            messages: vec![only_message],
            source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    }
    drop(storage);

    let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
    let result =
        client.search_with_fallback("ghost", SearchFilters::default(), 5, 0, 3, FieldMask::FULL)?;

    // Gather every agent named by an alternate-agent suggestion.
    let mut alternate_agents: HashSet<String> = HashSet::new();
    for suggestion in &result.suggestions {
        if !matches!(suggestion.kind, SuggestionKind::AlternateAgent) {
            continue;
        }
        if let Some(filters) = suggestion.suggested_filters.as_ref() {
            alternate_agents.extend(filters.agents.iter().cloned());
        }
    }

    assert!(
        alternate_agents.contains("claude_code"),
        "should suggest claude_code from normalized conversations schema"
    );
    assert!(
        alternate_agents.contains("codex"),
        "should suggest codex from normalized conversations schema"
    );

    Ok(())
}
15549
/// Queries made only of word characters and `*` must pass through unchanged.
#[test]
fn sanitize_query_preserves_wildcards() {
    for query in ["*foo*", "foo*", "*bar", "*config*"] {
        assert_eq!(fs_cass_sanitize_query(query), query);
    }
}
15558
/// Pins the sanitizer output for inputs containing `.`, `+`, `-` and `_`.
#[test]
fn sanitize_query_strips_other_special_chars() {
    let cases = [
        ("foo.bar", "foo bar"),
        ("c++", "c "),
        ("foo-bar", "foo-bar"),
        ("test_case", "test case"),
    ];
    for (raw, sanitized) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), sanitized);
    }
}
15567
/// Pins the sanitizer output when wildcards are mixed with other punctuation.
#[test]
fn sanitize_query_combined() {
    let cases = [
        ("*foo.bar*", "*foo bar*"),
        ("test-*", "test-*"),
        ("*c++*", "*c *"),
    ];
    for (raw, sanitized) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), sanitized);
    }
}
15575
/// Three whitespace-separated words parse into three `Term` tokens, in order.
#[test]
fn parse_boolean_query_simple_terms() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query("foo bar baz");
    let expected = vec![term("foo"), term("bar"), term("baz")];
    assert_eq!(parsed, expected);
}
15585
/// Both the `AND` keyword and the `&&` symbol produce an `And` token between terms.
#[test]
fn parse_boolean_query_and_operator() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let keyword_form = fs_cass_parse_boolean_query("foo AND bar");
    let expected = vec![term("foo"), FsCassQueryToken::And, term("bar")];
    assert_eq!(keyword_form, expected);

    // For the symbolic spelling only the token count and the operator are pinned.
    let symbol_form = fs_cass_parse_boolean_query("foo && bar");
    assert_eq!(symbol_form.len(), 3);
    assert_eq!(symbol_form[1], FsCassQueryToken::And);
}
15599
/// Both the `OR` keyword and the `||` symbol produce an `Or` token between terms.
#[test]
fn parse_boolean_query_or_operator() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let keyword_form = fs_cass_parse_boolean_query("foo OR bar");
    let expected = vec![term("foo"), FsCassQueryToken::Or, term("bar")];
    assert_eq!(keyword_form, expected);

    // For the symbolic spelling only the token count and the operator are pinned.
    let symbol_form = fs_cass_parse_boolean_query("foo || bar");
    assert_eq!(symbol_form.len(), 3);
    assert_eq!(symbol_form[1], FsCassQueryToken::Or);
}
15613
/// The `NOT` keyword between two terms parses to a `Not` token.
#[test]
fn parse_boolean_query_not_operator() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query("foo NOT bar");
    let expected = vec![term("foo"), FsCassQueryToken::Not, term("bar")];
    assert_eq!(parsed, expected);
}
15622
/// A double-quoted span becomes a single `Phrase` token with the quotes removed.
#[test]
fn parse_boolean_query_quoted_phrase() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query(r#"foo "exact phrase" bar"#);
    let expected = vec![
        term("foo"),
        FsCassQueryToken::Phrase("exact phrase".to_string()),
        term("bar"),
    ];
    assert_eq!(parsed, expected);
}
15634
/// A query mixing `OR`, `NOT` and a quoted phrase yields five tokens in source order.
#[test]
fn parse_boolean_query_complex() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query(r#"error OR warning NOT "false positive""#);
    let expected = vec![
        term("error"),
        FsCassQueryToken::Or,
        term("warning"),
        FsCassQueryToken::Not,
        FsCassQueryToken::Phrase("false positive".to_string()),
    ];
    assert_eq!(parsed, expected);
}
15648
/// Plain terms are not flagged; keyword operators, symbolic operators and a quoted
/// phrase all are.
#[test]
fn has_boolean_operators_detection() {
    assert!(!fs_cass_has_boolean_operators("foo bar"));

    let flagged = [
        "foo AND bar",
        "foo OR bar",
        "foo NOT bar",
        r#""exact phrase""#,
        "foo && bar",
        "foo || bar",
    ];
    for query in flagged {
        assert!(fs_cass_has_boolean_operators(query));
    }
}
15659
/// Lower-case `and` / `or` / `not` are recognised as operators.
#[test]
fn parse_boolean_query_case_insensitive_operators() {
    let parsed = fs_cass_parse_boolean_query("foo and bar or baz not qux");
    assert_eq!(parsed.len(), 7);

    // Operators sit at the odd positions between the four terms.
    let operators = [
        (1, FsCassQueryToken::And),
        (3, FsCassQueryToken::Or),
        (5, FsCassQueryToken::Not),
    ];
    for (position, operator) in operators {
        assert_eq!(parsed[position], operator);
    }
}
15669
/// Wildcard markers stay inside the `Term` text when combined with an operator.
#[test]
fn parse_boolean_query_with_wildcards() {
    let parsed = fs_cass_parse_boolean_query("*config* OR env*");
    let expected = vec![
        FsCassQueryToken::Term("*config*".to_string()),
        FsCassQueryToken::Or,
        FsCassQueryToken::Term("env*".to_string()),
    ];
    assert_eq!(parsed, expected);
}
15678
/// End-to-end check of content hydration for lexical hits.
///
/// The same two-message conversation is written to the sqlite store and to the
/// Tantivy index. The test then asserts that:
/// * an unbounded search returns the long message's full text and a snippet
///   containing "needle";
/// * a search with a 200-character preview limit returns only the leading part;
/// * a hit on the short message returns its exact text while the client's
///   sqlite handle stays `None`.
#[test]
fn tantivy_search_hydrates_long_content_when_content_field_is_not_stored() -> Result<()> {
    let doc_title = "hydrated lexical doc";
    let external_id = "hydrate-long-content";

    let dir = TempDir::new()?;
    let db_path = dir.path().join("cass.db");
    let source_path = dir.path().join("hydrate.jsonl");

    // --- sqlite side -------------------------------------------------------
    let storage = FrankenStorage::open(&db_path)?;
    let workspace_id = storage.ensure_workspace(dir.path(), None)?;
    let agent_id = storage.ensure_agent(&Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    })?;

    // 70 repetitions of "padding " followed by the sentence the test searches for.
    let mut long_content = "padding ".repeat(70);
    long_content.push_str("needle appears past the preview boundary for hydration proof");
    let short_content = "shortneedle fits entirely inside the stored preview".to_string();

    let stored_user_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(1_700_000_123_000),
        content: long_content.clone(),
        extra_json: json!({}),
        snippets: Vec::new(),
    };
    let stored_agent_message = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_124_000),
        content: short_content.clone(),
        extra_json: json!({}),
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(dir.path().to_path_buf()),
        external_id: Some(external_id.into()),
        title: Some(doc_title.into()),
        source_path: source_path.clone(),
        started_at: Some(1_700_000_123_000),
        ended_at: Some(1_700_000_123_000),
        approx_tokens: Some(32),
        metadata_json: json!({}),
        messages: vec![stored_user_message, stored_agent_message],
        source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
        origin_host: None,
    };
    storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    storage.close()?;

    // --- Tantivy side ------------------------------------------------------
    let index_path = dir.path().join("search-index");
    let mut index = TantivyIndex::open_or_create(&index_path)?;
    let indexed_user_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("user".into()),
        created_at: Some(1_700_000_123_000),
        content: long_content.clone(),
        extra: json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let indexed_agent_message = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("assistant".into()),
        created_at: Some(1_700_000_124_000),
        content: short_content.clone(),
        extra: json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let normalized = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some(external_id.into()),
        title: Some(doc_title.into()),
        workspace: Some(dir.path().to_path_buf()),
        source_path,
        started_at: Some(1_700_000_123_000),
        ended_at: Some(1_700_000_123_000),
        metadata: json!({}),
        messages: vec![indexed_user_message, indexed_agent_message],
    };
    index.add_conversation(&normalized)?;
    index.commit()?;

    // --- unbounded search: full content expected ---------------------------
    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
    let hits = client.search("needle", SearchFilters::default(), 5, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1, "expected one lexical hit");
    let full_hit = &hits[0];
    assert_eq!(full_hit.title, doc_title);
    assert!(
        full_hit
            .content
            .contains("needle appears past the preview boundary"),
        "lexical hit should hydrate full content from sqlite when Tantivy content is not stored"
    );
    assert!(
        full_hit.snippet.to_lowercase().contains("needle"),
        "snippet should still be rendered from hydrated content"
    );

    // --- bounded search: only the preview prefix expected ------------------
    let preview_mask = FieldMask::FULL.with_preview_content_limit(Some(200));
    let bounded_hits = client.search("needle", SearchFilters::default(), 5, 0, preview_mask)?;

    assert_eq!(bounded_hits.len(), 1, "expected one lexical hit");
    let bounded_hit = &bounded_hits[0];
    assert!(
        bounded_hit.content.starts_with("padding padding"),
        "bounded content may be served from the stored preview prefix"
    );
    assert!(
        !bounded_hit
            .content
            .contains("needle appears past the preview boundary"),
        "bounded preview content should not hydrate the full sqlite row"
    );

    // --- short hit on a fresh client: sqlite handle must stay unset --------
    let short_client =
        SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
    // A poisoned lock is reported as "not closed" so the assertion fails.
    let sqlite_is_closed = || {
        short_client
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };
    assert!(
        sqlite_is_closed(),
        "sqlite should start closed for short preview hit"
    );

    let short_hits =
        short_client.search("shortneedle", SearchFilters::default(), 5, 0, FieldMask::FULL)?;

    assert_eq!(short_hits.len(), 1, "expected one short lexical hit");
    assert_eq!(
        short_hits[0].content, short_content,
        "untruncated stored preview is exact full content"
    );
    assert!(
        sqlite_is_closed(),
        "short full-content hit should not lazy-open sqlite"
    );

    Ok(())
}
15850
/// Indexes one `codex` and one `claude` conversation that both contain "findme",
/// then checks that an agent filter of `codex` holds for a first search and for
/// an identical repeated search.
#[test]
fn filter_fidelity_agent_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, title, file name, timestamp, message text)
    let fixtures = [
        ("codex", "alpha doc", "a.jsonl", 100, "hello world findme alpha"),
        ("claude", "beta doc", "b.jsonl", 200, "hello world findme beta"),
    ];
    for (agent, title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut codex_only = SearchFilters::default();
    codex_only.agents.insert("codex".into());

    let first_pass = client.search("findme", codex_only.clone(), 10, 0, FieldMask::FULL)?;
    for hit in first_pass.iter() {
        assert_eq!(
            hit.agent, "codex",
            "Agent filter violated: got agent '{}' instead of 'codex'",
            hit.agent
        );
    }
    assert!(!first_pass.is_empty(), "Should have found results");

    // Same query and filters a second time.
    let second_pass = client.search("findme", codex_only, 10, 0, FieldMask::FULL)?;
    for hit in second_pass.iter() {
        assert_eq!(hit.agent, "codex", "Cached search violated agent filter");
    }

    Ok(())
}
15929
/// Indexes identical content under two workspaces and checks that a workspace
/// filter of `/workspace/beta` holds for a first and a repeated search.
#[test]
fn filter_fidelity_workspace_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, workspace, file name, timestamp)
    let fixtures = [
        ("ws_a doc", "/workspace/alpha", "a.jsonl", 100),
        ("ws_b doc", "/workspace/beta", "b.jsonl", 200),
    ];
    for (title, workspace, file_name, ts) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: "workspace test needle".into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut beta_only = SearchFilters::default();
    beta_only.workspaces.insert("/workspace/beta".into());

    let first_pass = client.search("needle", beta_only.clone(), 10, 0, FieldMask::FULL)?;
    for hit in first_pass.iter() {
        assert_eq!(
            hit.workspace, "/workspace/beta",
            "Workspace filter violated: got '{}' instead of '/workspace/beta'",
            hit.workspace
        );
    }
    assert!(!first_pass.is_empty(), "Should have found results");

    // Same query and filters a second time.
    let second_pass = client.search("needle", beta_only, 10, 0, FieldMask::FULL)?;
    for hit in second_pass.iter() {
        assert_eq!(
            hit.workspace, "/workspace/beta",
            "Cached search violated workspace filter"
        );
    }

    Ok(())
}
16011
/// Indexes three documents stamped 100, 500 and 900 and checks that a
/// `[400, 600]` creation-time window returns exactly one hit, on both a first
/// and a repeated search.
#[test]
fn filter_fidelity_date_range_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp)
    let fixtures = [
        ("early", "early.jsonl", 100),
        ("middle", "middle.jsonl", 500),
        ("late", "late.jsonl", 900),
    ];
    for (title, file_name, ts) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: "date range test".into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut window = SearchFilters::default();
    window.created_from = Some(400);
    window.created_to = Some(600);

    let first_pass = client.search("range", window.clone(), 10, 0, FieldMask::FULL)?;
    // Hits without a timestamp are not checked against the window.
    for ts in first_pass.iter().filter_map(|hit| hit.created_at) {
        assert!(
            (400..=600).contains(&ts),
            "Date range filter violated: got ts={ts} outside [400, 600]"
        );
    }
    assert_eq!(first_pass.len(), 1, "Should find exactly 1 doc in range");

    // Same query and filters a second time.
    let second_pass = client.search("range", window, 10, 0, FieldMask::FULL)?;
    for ts in second_pass.iter().filter_map(|hit| hit.created_at) {
        assert!(
            (400..=600).contains(&ts),
            "Cached search violated date range filter"
        );
    }

    Ok(())
}
16122
/// Indexes four documents that differ in agent, workspace or timestamp and checks
/// that agent + workspace + date filters together select exactly one of them.
#[test]
fn filter_fidelity_combined_filters_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, workspace, timestamp)
    let combinations = [
        ("codex", "/ws/prod", 100),
        ("claude", "/ws/prod", 500),
        ("claude", "/ws/dev", 500),
        ("claude", "/ws/prod", 900),
    ];

    for (i, &(agent, ws, ts)) in combinations.iter().enumerate() {
        let only_message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(ts),
            content: "hello world combotest query".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conv = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("combo-{i}")),
            workspace: Some(std::path::PathBuf::from(ws)),
            source_path: dir.path().join(format!("{i}.jsonl")),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![only_message],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters {
        created_from: Some(400),
        created_to: Some(600),
        ..Default::default()
    };
    filters.agents.insert("claude".into());
    filters.workspaces.insert("/ws/prod".into());

    let hits = client.search("combotest", filters.clone(), 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 1, "Combined filter should match exactly 1 doc");

    for hit in hits.iter() {
        assert_eq!(hit.agent, "claude", "Agent filter violated");
        assert_eq!(hit.workspace, "/ws/prod", "Workspace filter violated");
        if let Some(ts) = hit.created_at {
            assert!((400..=600).contains(&ts), "Date filter violated: ts={ts}");
        }
    }

    // Same query and filters a second time.
    let cached = client.search("combotest", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(cached.len(), 1, "Cached result count mismatch");

    Ok(())
}
16190
/// A document whose origin metadata carries a padded, upper-case local source id
/// must surface with `source_id` and `origin_kind` both equal to "local".
#[test]
fn lexical_hits_normalize_trimmed_local_source_metadata() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " LOCAL ",
                "kind": "local"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "trimmed local lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("trimmed local doc".into()),
        workspace: None,
        source_path: dir.path().join("trimmed-local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("trimmed", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.source_id, "local");
    assert_eq!(hit.origin_kind, "local");

    Ok(())
}
16235
/// A document with a blank source id, kind "ssh" and a host must surface with
/// the host as `source_id`, "remote" as `origin_kind`, and the host preserved.
#[test]
fn lexical_hits_normalize_remote_origin_kind_without_source_id() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " ",
                "kind": "ssh",
                "host": "dev@laptop"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "remote lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("remote lexical doc".into()),
        workspace: None,
        source_path: dir.path().join("remote-lexical.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("remote", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.source_id, "dev@laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
16282
/// A document whose origin metadata has a blank source id, no kind and only a
/// host must still surface as a remote hit identified by that host.
#[test]
fn lexical_hits_infer_remote_origin_from_host_without_kind() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " ",
                "host": "dev@laptop"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "legacy remote lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("legacy host-only lexical doc".into()),
        workspace: None,
        source_path: dir.path().join("legacy-host-only-lexical.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("legacy", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.source_id, "dev@laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
16328
/// Indexes a single document with empty metadata and checks that it is returned,
/// with `source_id == "local"`, both under `SourceFilter::Local` and under a
/// `SourceFilter::SourceId` given as a padded, upper-case "LOCAL".
#[test]
fn filter_fidelity_source_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "source filter test local".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv_local = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("local doc".into()),
        workspace: None,
        source_path: dir.path().join("local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    index.add_conversation(&conv_local)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Filter by the `Local` variant.
    let mut local_filter = SearchFilters::default();
    local_filter.source_filter = SourceFilter::Local;

    let local_hits = client.search("source", local_filter.clone(), 10, 0, FieldMask::FULL)?;
    for hit in local_hits.iter() {
        assert_eq!(
            hit.source_id, "local",
            "Source filter violated: got source_id '{}' instead of 'local'",
            hit.source_id
        );
    }
    assert!(!local_hits.is_empty(), "Should have found local results");

    // Filter by an explicit source id.
    let mut id_filter = SearchFilters::default();
    id_filter.source_filter = SourceFilter::SourceId(" LOCAL ".to_string());

    let id_hits = client.search("source", id_filter, 10, 0, FieldMask::FULL)?;
    for hit in id_hits.iter() {
        assert_eq!(
            hit.source_id, "local",
            "SourceId filter violated: got '{}' instead of 'local'",
            hit.source_id
        );
    }
    assert!(
        !id_hits.is_empty(),
        "Should have found results for source_id=local"
    );

    Ok(())
}
16402
/// Cache keys for the same query must differ between no filters, an agent filter
/// and a workspace filter, and must match for two equal agent filters.
#[test]
fn filter_fidelity_cache_key_isolation() {
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let unfiltered = SearchFilters::default();
    let mut codex_only = SearchFilters::default();
    codex_only.agents.insert("codex".into());
    let mut ws_only = SearchFilters::default();
    ws_only.workspaces.insert("/ws".into());

    let key_empty = client.cache_key("test", &unfiltered);
    let key_agent = client.cache_key("test", &codex_only);
    let key_ws = client.cache_key("test", &ws_only);

    // Every pair of distinct filter sets must map to distinct keys.
    assert_ne!(
        key_empty, key_agent,
        "Empty vs agent filter keys should differ"
    );
    assert_ne!(
        key_empty, key_ws,
        "Empty vs workspace filter keys should differ"
    );
    assert_ne!(
        key_agent, key_ws,
        "Agent vs workspace filter keys should differ"
    );

    // A separately built but equal filter set must reproduce the key.
    let mut codex_only_again = SearchFilters::default();
    codex_only_again.agents.insert("codex".into());
    let key_agent_again = client.cache_key("test", &codex_only_again);
    assert_eq!(
        key_agent, key_agent_again,
        "Same filter should produce same key"
    );
}
16454
/// Non-ASCII letters and digits must pass through the sanitizer unchanged.
#[test]
fn sanitize_query_preserves_unicode_alphanumeric() {
    for query in ["こんにちは", "café", "日本語123"] {
        assert_eq!(fs_cass_sanitize_query(query), query);
    }
}
16469
/// Pins the sanitizer output for runs of hyphens and runs of mixed punctuation.
#[test]
fn sanitize_query_handles_multiple_consecutive_special_chars() {
    let cases = [("foo---bar", "foo---bar"), ("a!@#$%^&()b", "a b")];
    for (raw, sanitized) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), sanitized);
    }
}
16476
/// Patterns consisting solely of asterisks parse to `Exact` with an empty string.
#[test]
fn wildcard_pattern_empty_after_trim_returns_exact_empty() {
    for stars_only in ["*", "**", "***"] {
        assert_eq!(
            FsCassWildcardPattern::parse(stars_only),
            FsCassWildcardPattern::Exact(String::new())
        );
    }
}
16494
/// `Exact` and `Prefix` patterns produce no regex; `Suffix` and `Substring`
/// produce the expected regex strings.
#[test]
fn wildcard_pattern_to_regex_generation() {
    let exact = FsCassWildcardPattern::Exact("foo".into()).to_regex();
    let prefix = FsCassWildcardPattern::Prefix("foo".into()).to_regex();
    let suffix = FsCassWildcardPattern::Suffix("foo".into()).to_regex();
    let substring = FsCassWildcardPattern::Substring("foo".into()).to_regex();

    assert_eq!(exact, None);
    assert_eq!(prefix, None);
    assert_eq!(suffix, Some(".*foo$".into()));
    assert_eq!(substring, Some(".*foo.*".into()));
}
16511
16512 #[test]
16515 fn parse_boolean_query_prefix_minus_not() {
16516 let tokens = fs_cass_parse_boolean_query("-world");
16518 let expected = vec![
16519 FsCassQueryToken::Not,
16520 FsCassQueryToken::Term("world".into()),
16521 ];
16522 assert_eq!(tokens, expected);
16523
16524 let tokens = fs_cass_parse_boolean_query("hello -world");
16526 let expected = vec![
16527 FsCassQueryToken::Term("hello".into()),
16528 FsCassQueryToken::Not,
16529 FsCassQueryToken::Term("world".into()),
16530 ];
16531 assert_eq!(tokens, expected);
16532 }
16533
16534 #[test]
16535 fn parse_boolean_query_empty_quoted_phrase_ignored() {
16536 let tokens = parse_boolean_query("\"\"");
16537 assert!(tokens.is_empty());
16538
16539 let tokens = parse_boolean_query("foo \"\" bar");
16540 let expected: QueryTokenList = vec![
16541 QueryToken::Term("foo".into()),
16542 QueryToken::Term("bar".into()),
16543 ];
16544 assert_eq!(tokens, expected);
16545 }
16546
16547 #[test]
16548 fn parse_boolean_query_unclosed_quote() {
16549 let tokens = parse_boolean_query("\"hello world");
16551 let expected: QueryTokenList = vec![QueryToken::Phrase("hello world".into())];
16552 assert_eq!(tokens, expected);
16553 }
16554
16555 #[test]
16556 fn transpile_to_fts5_rejects_leading_unary_not_queries() {
16557 assert_eq!(transpile_to_fts5("NOT foo"), None);
16558 assert_eq!(transpile_to_fts5("-foo"), None);
16559 }
16560
16561 #[test]
16562 fn transpile_to_fts5_rejects_or_not_forms_it_cannot_represent() {
16563 assert_eq!(transpile_to_fts5("foo OR NOT bar"), None);
16564 assert_eq!(transpile_to_fts5("foo NOT bar OR baz"), None);
16565 }
16566
16567 #[test]
16568 fn transpile_to_fts5_ignores_leading_or() {
16569 assert_eq!(transpile_to_fts5("OR test"), Some("test".to_string()));
16570 assert_eq!(
16571 transpile_to_fts5("OR foo-bar"),
16572 Some("(foo AND bar)".to_string())
16573 );
16574 }
16575
16576 #[test]
16577 fn transpile_to_fts5_splits_hyphenated_subterms_for_sqlite_fts() {
16578 assert_eq!(
16579 transpile_to_fts5("br-123.jsonl"),
16580 Some("(br AND 123 AND jsonl)".to_string())
16581 );
16582 assert_eq!(
16583 transpile_to_fts5("br-123.json*"),
16584 Some("(br AND 123 AND json*)".to_string())
16585 );
16586 }
16587
16588 #[test]
16589 fn transpile_to_fts5_preserves_supported_binary_not() {
16590 assert_eq!(
16591 transpile_to_fts5("foo NOT bar").as_deref(),
16592 Some("foo NOT bar")
16593 );
16594 assert_eq!(
16595 transpile_to_fts5("foo NOT bar-baz"),
16596 Some("foo NOT (bar AND baz)".to_string())
16597 );
16598 }
16599
16600 #[test]
16601 fn search_sqlite_fts5_returns_empty_when_sqlite_is_unavailable() {
16602 let client = SearchClient {
16603 reader: None,
16604 sqlite: Mutex::new(None),
16605 sqlite_path: None,
16606 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
16607 reload_on_search: false,
16608 last_reload: Mutex::new(None),
16609 last_generation: Mutex::new(None),
16610 reload_epoch: Arc::new(AtomicU64::new(0)),
16611 warm_tx: None,
16612 _warm_handle: None,
16613 metrics: Metrics::default(),
16614 cache_namespace: "fts5-disabled".to_string(),
16615 semantic: Mutex::new(None),
16616 last_tantivy_total_count: Mutex::new(None),
16617 };
16618
16619 let hits = client.search_sqlite_fts5(
16620 Path::new("/nonexistent"),
16621 "test query",
16622 SearchFilters::default(),
16623 10,
16624 0,
16625 FieldMask::FULL,
16626 );
16627
16628 assert!(hits.is_ok(), "disabled FTS5 path should stay non-fatal");
16629 assert!(
16630 hits.unwrap().is_empty(),
16631 "unavailable SQLite fallback should keep returning an empty result set"
16632 );
16633 }
16634
    /// Verifies that `search_sqlite_fts5` results for a smaller `limit` are a prefix of the
    /// results for a larger `limit`.
    ///
    /// The test builds an in-memory SQLite database with six conversations, each holding one
    /// message that matches the query `rankprobe`, and then checks that:
    /// - `limit = 6` returns all six hits,
    /// - `limit = 3` returns exactly three hits whose `(source_path, line_number)` keys equal
    ///   the first three keys of the `limit = 6` result,
    /// - content and title of the overlapping hits agree,
    /// - `limit = 0` returns no hits.
    #[test]
    fn search_sqlite_fts5_rank_and_hydrate_split_preserves_limit_prefix_invariant() -> Result<()> {
        // Schema: lookup tables, conversations, messages, and the FTS5 virtual table that the
        // fixture rows are inserted into below.
        let conn = Connection::open(":memory:")?;
        conn.execute_batch(
            "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
             CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
             CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
             CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 agent_id INTEGER,
                 workspace_id INTEGER,
                 source_id TEXT,
                 origin_host TEXT,
                 title TEXT,
                 source_path TEXT
             );
             CREATE TABLE messages (
                 id INTEGER PRIMARY KEY,
                 conversation_id INTEGER,
                 idx INTEGER,
                 content TEXT,
                 created_at INTEGER
             );
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                 content,
                 title,
                 agent,
                 workspace,
                 source_path,
                 created_at UNINDEXED,
                 message_id UNINDEXED,
                 tokenize='porter'
             );",
        )?;
        // Single source, agent and workspace shared by every fixture conversation.
        conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
        conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/tmp/k0e5p')")?;

        // `i` runs 0..=5 while `repeats` runs 1..=6, so row `i` contains the word
        // "rankprobe" `i + 1` times.
        // NOTE(review): the differing repetition counts presumably give each row a distinct
        // rank score so the ordering is deterministic — confirm against the ranking query.
        for (i, repeats) in (1..=6_i64).enumerate() {
            // Conversation ids are 1..=6; message ids are 10, 20, ..., 60.
            let conv_id = i as i64 + 1;
            let msg_id = (i as i64 + 1) * 10;
            conn.execute_compat(
                "INSERT INTO conversations(id, agent_id, workspace_id, source_id, \
                 origin_host, title, source_path) \
                 VALUES(?1, 1, 1, 'local', NULL, ?2, ?3)",
                params![
                    conv_id,
                    format!("k0e5p-{}", i),
                    format!("/tmp/k0e5p/{}.jsonl", i),
                ],
            )?;
            let content = "rankprobe ".repeat(repeats as usize);
            conn.execute_compat(
                "INSERT INTO messages(id, conversation_id, idx, content, created_at) \
                 VALUES(?1, ?2, ?3, ?4, ?5)",
                params![
                    msg_id,
                    conv_id,
                    i as i64,
                    content.as_str(),
                    1_700_000_000_i64 + i as i64
                ],
            )?;
            // The FTS row reuses the message id as its rowid and mirrors the same content,
            // title, path and timestamp values inserted above.
            conn.execute_compat(
                "INSERT INTO fts_messages(rowid, content, title, agent, workspace, \
                 source_path, created_at, message_id) \
                 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
                params![
                    msg_id,
                    content.as_str(),
                    format!("k0e5p-{}", i),
                    "codex",
                    "/tmp/k0e5p",
                    format!("/tmp/k0e5p/{}.jsonl", i),
                    1_700_000_000_i64 + i as i64,
                    msg_id,
                ],
            )?;
        }

        // Client wired only to the in-memory connection: no reader, no SQLite path, no
        // warm-up channel.
        let client = SearchClient {
            reader: None,
            sqlite: Mutex::new(Some(SendConnection(conn))),
            sqlite_path: None,
            prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
            reload_on_search: false,
            last_reload: Mutex::new(None),
            last_generation: Mutex::new(None),
            reload_epoch: Arc::new(AtomicU64::new(0)),
            warm_tx: None,
            _warm_handle: None,
            metrics: Metrics::default(),
            cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:k0e5p"),
            semantic: Mutex::new(None),
            last_tantivy_total_count: Mutex::new(None),
        };

        // Projects each hit onto the `(source_path, line_number)` pair used to compare result
        // ordering between the two limits.
        fn hit_keys(hits: &[SearchHit]) -> Vec<(String, Option<usize>)> {
            hits.iter()
                .map(|h| (h.source_path.clone(), h.line_number))
                .collect()
        }

        // limit = 6: every fixture row should come back.
        let large_hits = client.search_sqlite_fts5(
            Path::new(":memory:"),
            "rankprobe",
            SearchFilters::default(),
            6,
            0,
            FieldMask::FULL,
        )?;
        assert_eq!(
            large_hits.len(),
            6,
            "limit=N must return all N candidates when the corpus has exactly N matches"
        );

        // limit = 3: same query, smaller window.
        let small_hits = client.search_sqlite_fts5(
            Path::new(":memory:"),
            "rankprobe",
            SearchFilters::default(),
            3,
            0,
            FieldMask::FULL,
        )?;
        assert_eq!(small_hits.len(), 3, "limit=3 must return exactly 3 hits");

        // Prefix invariant: the small result must equal the head of the large result.
        let large_keys = hit_keys(&large_hits);
        let small_keys = hit_keys(&small_hits);
        assert_eq!(
            small_keys,
            large_keys[..3],
            "limit=3 hit keys MUST be the first 3 of limit=6 hit keys (rank+hydrate \
             split must not re-order or re-filter); small={small_keys:?} \
             large_prefix={:?}",
            &large_keys[..3]
        );

        // `zip` stops at the shorter side, so this compares the three overlapping positions.
        for (idx, (small, large)) in small_hits.iter().zip(large_hits.iter()).enumerate() {
            assert_eq!(
                small.content, large.content,
                "hit[{idx}] content must agree across limit=3 and limit=6: \
                 small={:?} large={:?}",
                small.content, large.content
            );
            assert_eq!(
                small.title, large.title,
                "hit[{idx}] title must agree across limit=3 and limit=6"
            );
        }

        // limit = 0: nothing may be returned even though matching rows exist.
        let zero_hits = client.search_sqlite_fts5(
            Path::new(":memory:"),
            "rankprobe",
            SearchFilters::default(),
            0,
            0,
            FieldMask::FULL,
        )?;
        assert!(
            zero_hits.is_empty(),
            "limit=0 must return zero hits even though the rank phase has candidates; \
             got {} hits",
            zero_hits.len()
        );

        Ok(())
    }
16844
16845 #[test]
16848 fn levenshtein_distance_identical_strings() {
16849 assert_eq!(levenshtein_distance("hello", "hello"), 0);
16850 assert_eq!(levenshtein_distance("", ""), 0);
16851 }
16852
16853 #[test]
16854 fn levenshtein_distance_insertions() {
16855 assert_eq!(levenshtein_distance("", "abc"), 3);
16856 assert_eq!(levenshtein_distance("cat", "cats"), 1);
16857 }
16858
16859 #[test]
16860 fn levenshtein_distance_deletions() {
16861 assert_eq!(levenshtein_distance("abc", ""), 3);
16862 assert_eq!(levenshtein_distance("cats", "cat"), 1);
16863 }
16864
16865 #[test]
16866 fn levenshtein_distance_substitutions() {
16867 assert_eq!(levenshtein_distance("cat", "bat"), 1);
16868 assert_eq!(levenshtein_distance("kitten", "sitten"), 1);
16869 }
16870
16871 #[test]
16872 fn levenshtein_distance_mixed_operations() {
16873 assert_eq!(levenshtein_distance("kitten", "sitting"), 3);
16874 assert_eq!(levenshtein_distance("saturday", "sunday"), 3);
16875 }
16876
16877 #[test]
16880 fn is_tool_invocation_noise_allows_real_content() {
16881 assert!(!is_tool_invocation_noise("This is a normal message"));
16882 assert!(!is_tool_invocation_noise(
16883 "Let me use the Tool feature to accomplish this task. Here is the implementation..."
16884 ));
16885 let long_content = "[Tool: Read] Now here is a lot of useful content that explains the implementation details and provides context for the changes being made to the codebase.";
16887 assert!(!is_tool_invocation_noise(long_content));
16888 }
16889
16890 #[test]
16891 fn is_tool_invocation_noise_handles_short_tool_markers() {
16892 assert!(is_tool_invocation_noise("[tool: x]"));
16893 assert!(is_tool_invocation_noise("tool: bash"));
16894 }
16895
16896 #[test]
16899 fn search_boolean_and_filters_results() -> Result<()> {
16900 let dir = TempDir::new()?;
16901 let mut index = TantivyIndex::open_or_create(dir.path())?;
16902
16903 let conv1 = NormalizedConversation {
16905 agent_slug: "codex".into(),
16906 external_id: None,
16907 title: Some("doc1".into()),
16908 workspace: None,
16909 source_path: dir.path().join("1.jsonl"),
16910 started_at: Some(1),
16911 ended_at: None,
16912 metadata: serde_json::json!({}),
16913 messages: vec![NormalizedMessage {
16914 idx: 0,
16915 role: "user".into(),
16916 author: None,
16917 created_at: Some(1),
16918 content: "alpha beta gamma".into(),
16919 extra: serde_json::json!({}),
16920 snippets: vec![],
16921 invocations: Vec::new(),
16922 }],
16923 };
16924 let conv2 = NormalizedConversation {
16925 agent_slug: "codex".into(),
16926 external_id: None,
16927 title: Some("doc2".into()),
16928 workspace: None,
16929 source_path: dir.path().join("2.jsonl"),
16930 started_at: Some(2),
16931 ended_at: None,
16932 metadata: serde_json::json!({}),
16933 messages: vec![NormalizedMessage {
16934 idx: 0,
16935 role: "user".into(),
16936 author: None,
16937 created_at: Some(2),
16938 content: "alpha delta".into(),
16939 extra: serde_json::json!({}),
16940 snippets: vec![],
16941 invocations: Vec::new(),
16942 }],
16943 };
16944 index.add_conversation(&conv1)?;
16945 index.add_conversation(&conv2)?;
16946 index.commit()?;
16947
16948 let client = SearchClient::open(dir.path(), None)?.expect("index present");
16949
16950 let hits = client.search(
16952 "alpha AND beta",
16953 SearchFilters::default(),
16954 10,
16955 0,
16956 FieldMask::FULL,
16957 )?;
16958 assert_eq!(hits.len(), 1);
16959 assert!(hits[0].content.contains("gamma"));
16960
16961 let hits = client.search(
16963 "alpha AND delta",
16964 SearchFilters::default(),
16965 10,
16966 0,
16967 FieldMask::FULL,
16968 )?;
16969 assert_eq!(hits.len(), 1);
16970 assert!(hits[0].content.contains("delta"));
16971
16972 Ok(())
16973 }
16974
16975 #[test]
16976 fn search_boolean_or_expands_results() -> Result<()> {
16977 let dir = TempDir::new()?;
16978 let mut index = TantivyIndex::open_or_create(dir.path())?;
16979
16980 let conv1 = NormalizedConversation {
16981 agent_slug: "codex".into(),
16982 external_id: None,
16983 title: Some("doc1".into()),
16984 workspace: None,
16985 source_path: dir.path().join("1.jsonl"),
16986 started_at: Some(1),
16987 ended_at: None,
16988 metadata: serde_json::json!({}),
16989 messages: vec![NormalizedMessage {
16990 idx: 0,
16991 role: "user".into(),
16992 author: None,
16993 created_at: Some(1),
16994 content: "unique xyzzy term".into(),
16995 extra: serde_json::json!({}),
16996 snippets: vec![],
16997 invocations: Vec::new(),
16998 }],
16999 };
17000 let conv2 = NormalizedConversation {
17001 agent_slug: "codex".into(),
17002 external_id: None,
17003 title: Some("doc2".into()),
17004 workspace: None,
17005 source_path: dir.path().join("2.jsonl"),
17006 started_at: Some(2),
17007 ended_at: None,
17008 metadata: serde_json::json!({}),
17009 messages: vec![NormalizedMessage {
17010 idx: 0,
17011 role: "user".into(),
17012 author: None,
17013 created_at: Some(2),
17014 content: "unique plugh term".into(),
17015 extra: serde_json::json!({}),
17016 snippets: vec![],
17017 invocations: Vec::new(),
17018 }],
17019 };
17020 index.add_conversation(&conv1)?;
17021 index.add_conversation(&conv2)?;
17022 index.commit()?;
17023
17024 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17025
17026 let hits = client.search(
17028 "xyzzy OR plugh",
17029 SearchFilters::default(),
17030 10,
17031 0,
17032 FieldMask::FULL,
17033 )?;
17034 assert_eq!(hits.len(), 2);
17035
17036 Ok(())
17037 }
17038
17039 #[test]
17040 fn search_boolean_not_excludes_results() -> Result<()> {
17041 let dir = TempDir::new()?;
17042 let mut index = TantivyIndex::open_or_create(dir.path())?;
17043
17044 let conv1 = NormalizedConversation {
17045 agent_slug: "codex".into(),
17046 external_id: None,
17047 title: Some("doc1".into()),
17048 workspace: None,
17049 source_path: dir.path().join("1.jsonl"),
17050 started_at: Some(1),
17051 ended_at: None,
17052 metadata: serde_json::json!({}),
17053 messages: vec![NormalizedMessage {
17054 idx: 0,
17055 role: "user".into(),
17056 author: None,
17057 created_at: Some(1),
17058 content: "nottest keep this".into(),
17059 extra: serde_json::json!({}),
17060 snippets: vec![],
17061 invocations: Vec::new(),
17062 }],
17063 };
17064 let conv2 = NormalizedConversation {
17065 agent_slug: "codex".into(),
17066 external_id: None,
17067 title: Some("doc2".into()),
17068 workspace: None,
17069 source_path: dir.path().join("2.jsonl"),
17070 started_at: Some(2),
17071 ended_at: None,
17072 metadata: serde_json::json!({}),
17073 messages: vec![NormalizedMessage {
17074 idx: 0,
17075 role: "user".into(),
17076 author: None,
17077 created_at: Some(2),
17078 content: "nottest exclude this".into(),
17079 extra: serde_json::json!({}),
17080 snippets: vec![],
17081 invocations: Vec::new(),
17082 }],
17083 };
17084 index.add_conversation(&conv1)?;
17085 index.add_conversation(&conv2)?;
17086 index.commit()?;
17087
17088 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17089
17090 let hits = client.search(
17092 "nottest NOT exclude",
17093 SearchFilters::default(),
17094 10,
17095 0,
17096 FieldMask::FULL,
17097 )?;
17098 assert_eq!(hits.len(), 1);
17099 assert!(
17101 !hits[0].content.contains("exclude"),
17102 "NOT exclude should filter out doc with 'exclude'"
17103 );
17104
17105 let hits = client.search(
17107 "nottest -exclude",
17108 SearchFilters::default(),
17109 10,
17110 0,
17111 FieldMask::FULL,
17112 )?;
17113 assert_eq!(hits.len(), 1);
17114 assert!(
17115 !hits[0].content.contains("exclude"),
17116 "Prefix -exclude should filter out doc with 'exclude'"
17117 );
17118
17119 Ok(())
17120 }
17121
17122 #[test]
17123 fn search_phrase_query_matches_exact_sequence() -> Result<()> {
17124 let dir = TempDir::new()?;
17125 let mut index = TantivyIndex::open_or_create(dir.path())?;
17126
17127 let conv1 = NormalizedConversation {
17128 agent_slug: "codex".into(),
17129 external_id: None,
17130 title: Some("doc1".into()),
17131 workspace: None,
17132 source_path: dir.path().join("1.jsonl"),
17133 started_at: Some(1),
17134 ended_at: None,
17135 metadata: serde_json::json!({}),
17136 messages: vec![NormalizedMessage {
17137 idx: 0,
17138 role: "user".into(),
17139 author: None,
17140 created_at: Some(1),
17141 content: "the quick brown fox".into(),
17142 extra: serde_json::json!({}),
17143 snippets: vec![],
17144 invocations: Vec::new(),
17145 }],
17146 };
17147 let conv2 = NormalizedConversation {
17148 agent_slug: "codex".into(),
17149 external_id: None,
17150 title: Some("doc2".into()),
17151 workspace: None,
17152 source_path: dir.path().join("2.jsonl"),
17153 started_at: Some(2),
17154 ended_at: None,
17155 metadata: serde_json::json!({}),
17156 messages: vec![NormalizedMessage {
17157 idx: 0,
17158 role: "user".into(),
17159 author: None,
17160 created_at: Some(2),
17161 content: "the brown quick fox".into(),
17162 extra: serde_json::json!({}),
17163 snippets: vec![],
17164 invocations: Vec::new(),
17165 }],
17166 };
17167 index.add_conversation(&conv1)?;
17168 index.add_conversation(&conv2)?;
17169 index.commit()?;
17170
17171 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17172
17173 let hits = client.search(
17175 "quick brown",
17176 SearchFilters::default(),
17177 10,
17178 0,
17179 FieldMask::FULL,
17180 )?;
17181 assert_eq!(hits.len(), 2);
17182
17183 let hits = client.search(
17185 "\"quick brown\"",
17186 SearchFilters::default(),
17187 10,
17188 0,
17189 FieldMask::FULL,
17190 )?;
17191 assert_eq!(hits.len(), 1);
17192 assert!(hits[0].content.contains("quick brown"));
17193
17194 Ok(())
17195 }
17196
17197 #[test]
17198 fn search_dot_punctuation_splits_terms_but_hyphens_preserve_compound_semantics() -> Result<()> {
17199 let dir = TempDir::new()?;
17200 let mut index = TantivyIndex::open_or_create(dir.path())?;
17201
17202 let conv = NormalizedConversation {
17203 agent_slug: "codex".into(),
17204 external_id: None,
17205 title: Some("doc".into()),
17206 workspace: None,
17207 source_path: dir.path().join("3.jsonl"),
17208 started_at: Some(1),
17209 ended_at: None,
17210 metadata: serde_json::json!({}),
17211 messages: vec![NormalizedMessage {
17212 idx: 0,
17213 role: "user".into(),
17214 author: None,
17215 created_at: Some(1),
17216 content: "foo bar baz".into(),
17217 extra: serde_json::json!({}),
17218 snippets: vec![],
17219 invocations: Vec::new(),
17220 }],
17221 };
17222 index.add_conversation(&conv)?;
17223 index.commit()?;
17224
17225 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17226
17227 let hits = client.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
17228 assert_eq!(hits.len(), 1);
17229
17230 let hits = client.search("foo-bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
17231 assert_eq!(hits.len(), 0);
17232
17233 Ok(())
17234 }
17235
17236 #[test]
17241 fn explanation_classifies_simple_query() {
17242 let exp = QueryExplanation::analyze("hello", &SearchFilters::default());
17243 assert_eq!(exp.query_type, QueryType::Simple);
17244 assert_eq!(exp.index_strategy, IndexStrategy::EdgeNgram);
17245 assert_eq!(exp.estimated_cost, QueryCost::Low);
17246 assert!(exp.parsed.terms.len() == 1);
17247 assert_eq!(exp.parsed.terms[0].text, "hello");
17248 assert!(!exp.parsed.terms[0].subterms.is_empty());
17249 assert_eq!(exp.parsed.terms[0].subterms[0].pattern, "exact");
17250 }
17251
17252 #[test]
17253 fn explanation_classifies_wildcard_query() {
17254 let exp = QueryExplanation::analyze("*handler*", &SearchFilters::default());
17255 assert_eq!(exp.query_type, QueryType::Wildcard);
17256 assert_eq!(exp.index_strategy, IndexStrategy::RegexScan);
17257 assert_eq!(exp.estimated_cost, QueryCost::High);
17258 assert!(!exp.parsed.terms[0].subterms.is_empty());
17259 assert!(
17260 exp.parsed.terms[0].subterms[0]
17261 .pattern
17262 .contains("substring")
17263 );
17264 assert!(exp.warnings.iter().any(|w| w.contains("regex scan")));
17265 }
17266
17267 #[test]
17268 fn explanation_classifies_boolean_query() {
17269 let exp = QueryExplanation::analyze("foo AND bar", &SearchFilters::default());
17270 assert_eq!(exp.query_type, QueryType::Boolean);
17271 assert_eq!(exp.index_strategy, IndexStrategy::BooleanCombination);
17272 assert!(exp.parsed.operators.contains(&"AND".to_string()));
17273 }
17274
17275 #[test]
17276 fn explanation_classifies_phrase_query() {
17277 let exp = QueryExplanation::analyze("\"exact phrase\"", &SearchFilters::default());
17278 assert_eq!(exp.query_type, QueryType::Phrase);
17279 assert!(exp.parsed.phrases.contains(&"exact phrase".to_string()));
17280 }
17281
17282 #[test]
17283 fn explanation_handles_filtered_query() {
17284 let mut filters = SearchFilters::default();
17285 filters.agents.insert("codex".to_string());
17286
17287 let exp = QueryExplanation::analyze("test", &filters);
17288 assert_eq!(exp.query_type, QueryType::Filtered);
17289 assert_eq!(exp.filters_summary.agent_count, 1);
17290 assert!(
17291 exp.filters_summary
17292 .description
17293 .as_ref()
17294 .unwrap()
17295 .contains("1 agent")
17296 );
17297 assert!(exp.warnings.iter().any(|w| w.contains("codex")));
17298 }
17299
17300 #[test]
17301 fn explanation_handles_empty_query() {
17302 let exp = QueryExplanation::analyze("", &SearchFilters::default());
17303 assert_eq!(exp.query_type, QueryType::Empty);
17304 assert_eq!(exp.index_strategy, IndexStrategy::FullScan);
17305 assert_eq!(exp.estimated_cost, QueryCost::High);
17306 assert!(exp.warnings.iter().any(|w| w.contains("Empty query")));
17307 }
17308
17309 #[test]
17310 fn explanation_warns_short_terms() {
17311 let exp = QueryExplanation::analyze("a", &SearchFilters::default());
17312 assert!(exp.warnings.iter().any(|w| w.contains("Very short term")));
17313 }
17314
17315 #[test]
17316 fn explanation_with_wildcard_fallback() {
17317 let exp = QueryExplanation::analyze("test", &SearchFilters::default())
17318 .with_wildcard_fallback(true);
17319 assert!(exp.wildcard_applied);
17320 assert!(exp.warnings.iter().any(|w| w.contains("Wildcard fallback")));
17322 }
17323
17324 #[test]
17325 fn explanation_complex_query_has_higher_cost() {
17326 let exp = QueryExplanation::analyze(
17327 "foo AND bar OR baz NOT qux AND \"phrase here\"",
17328 &SearchFilters::default(),
17329 );
17330 assert_eq!(exp.query_type, QueryType::Boolean);
17331 assert!(matches!(
17333 exp.estimated_cost,
17334 QueryCost::Medium | QueryCost::High
17335 ));
17336 }
17337
17338 #[test]
17339 fn explanation_preserves_original_query() {
17340 let exp = QueryExplanation::analyze("Hello World!", &SearchFilters::default());
17341 assert_eq!(exp.original_query, "Hello World!");
17342 assert!(exp.sanitized_query.contains("Hello"));
17344 assert!(!exp.sanitized_query.contains("!"));
17346 }
17347
17348 #[test]
17349 fn explanation_detects_not_operator() {
17350 let exp = QueryExplanation::analyze("foo NOT bar", &SearchFilters::default());
17351 assert!(exp.parsed.operators.contains(&"NOT".to_string()));
17352 assert!(
17354 exp.parsed
17355 .terms
17356 .iter()
17357 .any(|t| t.negated && t.text == "bar")
17358 );
17359 }
17360
17361 #[test]
17362 fn explanation_implicit_and() {
17363 let exp = QueryExplanation::analyze("foo bar", &SearchFilters::default());
17364 assert!(exp.parsed.implicit_and);
17365 assert_eq!(exp.parsed.terms.len(), 2);
17366 }
17367
17368 #[test]
17369 fn explanation_serializes_to_json() {
17370 let exp = QueryExplanation::analyze("test query", &SearchFilters::default());
17371 let json = serde_json::to_value(&exp).expect("should serialize");
17372 assert!(json["original_query"].is_string());
17373 assert!(json["query_type"].is_string());
17374 assert!(json["index_strategy"].is_string());
17375 assert!(json["estimated_cost"].is_string());
17376 assert!(json["parsed"]["terms"].is_array());
17377 }
17378
17379 #[test]
17384 fn search_multi_filter_agent_workspace_time() -> Result<()> {
17385 let dir = TempDir::new()?;
17387 let mut index = TantivyIndex::open_or_create(dir.path())?;
17388
17389 let convs = [
17391 ("codex", "/ws/alpha", 100, "needle alpha codex"),
17392 ("claude", "/ws/alpha", 200, "needle alpha claude"),
17393 ("codex", "/ws/beta", 150, "needle beta codex"),
17394 ("codex", "/ws/alpha", 300, "needle alpha codex late"),
17395 ];
17396
17397 for (i, (agent, ws, ts, content)) in convs.iter().enumerate() {
17398 let conv = NormalizedConversation {
17399 agent_slug: (*agent).into(),
17400 external_id: None,
17401 title: Some(format!("conv-{i}")),
17402 workspace: Some(std::path::PathBuf::from(*ws)),
17403 source_path: dir.path().join(format!("{i}.jsonl")),
17404 started_at: Some(*ts),
17405 ended_at: None,
17406 metadata: serde_json::json!({}),
17407 messages: vec![NormalizedMessage {
17408 idx: 0,
17409 role: "user".into(),
17410 author: None,
17411 created_at: Some(*ts),
17412 content: (*content).into(),
17413 extra: serde_json::json!({}),
17414 snippets: vec![],
17415 invocations: Vec::new(),
17416 }],
17417 };
17418 index.add_conversation(&conv)?;
17419 }
17420 index.commit()?;
17421
17422 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17423
17424 let mut filters = SearchFilters::default();
17426 filters.agents.insert("codex".into());
17427 filters.workspaces.insert("/ws/alpha".into());
17428 filters.created_from = Some(50);
17429 filters.created_to = Some(250);
17430
17431 let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
17432 assert_eq!(
17433 hits.len(),
17434 1,
17435 "Should match only one conv (codex + alpha + ts=100)"
17436 );
17437 assert_eq!(hits[0].agent, "codex");
17438 assert_eq!(hits[0].workspace, "/ws/alpha");
17439 assert!(hits[0].content.contains("alpha codex"));
17440 assert!(!hits[0].content.contains("late")); Ok(())
17443 }
17444
17445 #[test]
17446 fn search_multi_agent_filter() -> Result<()> {
17447 let dir = TempDir::new()?;
17449 let mut index = TantivyIndex::open_or_create(dir.path())?;
17450
17451 for agent in ["codex", "claude", "cline", "gemini"] {
17452 let conv = NormalizedConversation {
17453 agent_slug: agent.into(),
17454 external_id: None,
17455 title: Some(format!("{agent}-conv")),
17456 workspace: Some(std::path::PathBuf::from("/ws")),
17457 source_path: dir.path().join(format!("{agent}.jsonl")),
17458 started_at: Some(100),
17459 ended_at: None,
17460 metadata: serde_json::json!({}),
17461 messages: vec![NormalizedMessage {
17462 idx: 0,
17463 role: "user".into(),
17464 author: None,
17465 created_at: Some(100),
17466 content: format!("needle from {agent}"),
17467 extra: serde_json::json!({}),
17468 snippets: vec![],
17469 invocations: Vec::new(),
17470 }],
17471 };
17472 index.add_conversation(&conv)?;
17473 }
17474 index.commit()?;
17475
17476 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17477
17478 let mut filters = SearchFilters::default();
17480 filters.agents.insert("codex".into());
17481 filters.agents.insert("claude".into());
17482
17483 let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
17484 assert_eq!(hits.len(), 2);
17485 let agents: Vec<_> = hits.iter().map(|h| h.agent.as_str()).collect();
17486 assert!(agents.contains(&"codex"));
17487 assert!(agents.contains(&"claude"));
17488 assert!(!agents.contains(&"cline"));
17489 assert!(!agents.contains(&"gemini"));
17490
17491 Ok(())
17492 }
17493
17494 #[test]
17499 fn cache_metrics_incremented_on_operations() {
17500 let client = SearchClient {
17501 reader: None,
17502 sqlite: Mutex::new(None),
17503 sqlite_path: None,
17504 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
17505 reload_on_search: true,
17506 last_reload: Mutex::new(None),
17507 last_generation: Mutex::new(None),
17508 reload_epoch: Arc::new(AtomicU64::new(0)),
17509 warm_tx: None,
17510 _warm_handle: None,
17511 metrics: Metrics::default(),
17512 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
17513 semantic: Mutex::new(None),
17514 last_tantivy_total_count: Mutex::new(None),
17515 };
17516
17517 let (hits, miss, shortfall, reloads, _) = client.metrics.snapshot_all();
17519 assert_eq!((hits, miss, shortfall, reloads), (0, 0, 0, 0));
17520
17521 client.metrics.inc_cache_hits();
17523 client.metrics.inc_cache_hits();
17524 client.metrics.inc_cache_miss();
17525 client.metrics.inc_cache_shortfall();
17526 client.metrics.inc_reload();
17527
17528 let (hits, miss, shortfall, reloads, _) = client.metrics.snapshot_all();
17529 assert_eq!(hits, 2);
17530 assert_eq!(miss, 1);
17531 assert_eq!(shortfall, 1);
17532 assert_eq!(reloads, 1);
17533 }
17534
17535 #[test]
17536 fn cache_shard_name_deterministic() {
17537 let client = SearchClient {
17539 reader: None,
17540 sqlite: Mutex::new(None),
17541 sqlite_path: None,
17542 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
17543 reload_on_search: true,
17544 last_reload: Mutex::new(None),
17545 last_generation: Mutex::new(None),
17546 reload_epoch: Arc::new(AtomicU64::new(0)),
17547 warm_tx: None,
17548 _warm_handle: None,
17549 metrics: Metrics::default(),
17550 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
17551 semantic: Mutex::new(None),
17552 last_tantivy_total_count: Mutex::new(None),
17553 };
17554
17555 let filters1 = SearchFilters::default();
17556 let mut filters2 = SearchFilters::default();
17557 filters2.agents.insert("codex".into());
17558 let mut filters3 = SearchFilters::default();
17559 filters3.workspaces.insert("/tmp/cass-workspace".into());
17560
17561 let shard1_first = client.shard_name(&filters1);
17563 let shard1_second = client.shard_name(&filters1);
17564 assert_eq!(
17565 shard1_first, shard1_second,
17566 "Same filters should produce same shard name"
17567 );
17568
17569 let shard2 = client.shard_name(&filters2);
17571 assert_ne!(
17572 shard1_first, shard2,
17573 "Different filters should produce different shard names"
17574 );
17575
17576 assert_eq!(shard2, client.shard_name(&filters2));
17578 assert_eq!(
17579 client.shard_name(&filters3),
17580 "workspace:/tmp/cass-workspace"
17581 );
17582 }
17583
17584 #[test]
17589 fn wildcard_fallback_respects_filter_constraints() -> Result<()> {
17590 let dir = TempDir::new()?;
17591 let mut index = TantivyIndex::open_or_create(dir.path())?;
17592
17593 let conv_match = NormalizedConversation {
17595 agent_slug: "codex".into(),
17596 external_id: None,
17597 title: Some("match".into()),
17598 workspace: Some(std::path::PathBuf::from("/target")),
17599 source_path: dir.path().join("match.jsonl"),
17600 started_at: Some(100),
17601 ended_at: None,
17602 metadata: serde_json::json!({}),
17603 messages: vec![NormalizedMessage {
17604 idx: 0,
17605 role: "user".into(),
17606 author: None,
17607 created_at: Some(100),
17608 content: "unique specific term here".into(),
17609 extra: serde_json::json!({}),
17610 snippets: vec![],
17611 invocations: Vec::new(),
17612 }],
17613 };
17614
17615 let conv_other = NormalizedConversation {
17616 agent_slug: "claude".into(),
17617 external_id: None,
17618 title: Some("other".into()),
17619 workspace: Some(std::path::PathBuf::from("/other")),
17620 source_path: dir.path().join("other.jsonl"),
17621 started_at: Some(100),
17622 ended_at: None,
17623 metadata: serde_json::json!({}),
17624 messages: vec![NormalizedMessage {
17625 idx: 0,
17626 role: "user".into(),
17627 author: None,
17628 created_at: Some(100),
17629 content: "unique specific also here".into(),
17630 extra: serde_json::json!({}),
17631 snippets: vec![],
17632 invocations: Vec::new(),
17633 }],
17634 };
17635
17636 index.add_conversation(&conv_match)?;
17637 index.add_conversation(&conv_other)?;
17638 index.commit()?;
17639
17640 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17641
17642 let mut filters = SearchFilters::default();
17644 filters.agents.insert("codex".into());
17645
17646 let result =
17647 client.search_with_fallback("unique", filters.clone(), 10, 0, 100, FieldMask::FULL)?;
17648 assert!(result.hits.iter().all(|h| h.agent == "codex"));
17650
17651 Ok(())
17652 }
17653
17654 #[test]
17655 fn wildcard_fallback_short_query_triggers_prefix() -> Result<()> {
17656 let dir = TempDir::new()?;
17657 let mut index = TantivyIndex::open_or_create(dir.path())?;
17658
17659 let conv = NormalizedConversation {
17660 agent_slug: "codex".into(),
17661 external_id: None,
17662 title: Some("test".into()),
17663 workspace: None,
17664 source_path: dir.path().join("test.jsonl"),
17665 started_at: Some(100),
17666 ended_at: None,
17667 metadata: serde_json::json!({}),
17668 messages: vec![NormalizedMessage {
17669 idx: 0,
17670 role: "user".into(),
17671 author: None,
17672 created_at: Some(100),
17673 content: "authentication authorization oauth".into(),
17674 extra: serde_json::json!({}),
17675 snippets: vec![],
17676 invocations: Vec::new(),
17677 }],
17678 };
17679 index.add_conversation(&conv)?;
17680 index.commit()?;
17681
17682 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17683
17684 let result = client.search_with_fallback(
17686 "auth",
17687 SearchFilters::default(),
17688 10,
17689 0,
17690 100,
17691 FieldMask::FULL,
17692 )?;
17693 assert!(
17694 !result.hits.is_empty(),
17695 "Short prefix should match via prefix search"
17696 );
17697 assert!(result.hits[0].content.contains("auth"));
17698
17699 Ok(())
17700 }
17701
17702 #[test]
17707 fn search_real_fixture_multiple_messages() -> Result<()> {
17708 let dir = TempDir::new()?;
17709 let mut index = TantivyIndex::open_or_create(dir.path())?;
17710
17711 let conv = NormalizedConversation {
17713 agent_slug: "claude_code".into(),
17714 external_id: Some("conv-123".into()),
17715 title: Some("Implementing authentication".into()),
17716 workspace: Some(std::path::PathBuf::from("/home/user/project")),
17717 source_path: dir.path().join("session-1.jsonl"),
17718 started_at: Some(1700000000000),
17719 ended_at: Some(1700000060000),
17720 metadata: serde_json::json!({
17721 "model": "claude-3-sonnet",
17722 "tokens": 1500
17723 }),
17724 messages: vec![
17725 NormalizedMessage {
17726 idx: 0,
17727 role: "user".into(),
17728 author: Some("developer".into()),
17729 created_at: Some(1700000000000),
17730 content: "Help me implement JWT authentication for my Express API".into(),
17731 extra: serde_json::json!({}),
17732 snippets: vec![],
17733 invocations: Vec::new(),
17734 },
17735 NormalizedMessage {
17736 idx: 1,
17737 role: "assistant".into(),
17738 author: Some("claude".into()),
17739 created_at: Some(1700000010000),
17740 content: "I'll help you implement JWT authentication. First, let's install the required packages.".into(),
17741 extra: serde_json::json!({}),
17742 snippets: vec![NormalizedSnippet {
17743 file_path: Some("package.json".into()),
17744 start_line: Some(1),
17745 end_line: Some(5),
17746 language: Some("json".into()),
17747 snippet_text: Some(r#"{"dependencies":{"jsonwebtoken":"^9.0.0"}}"#.into()),
17748 }],
17749 invocations: Vec::new(),
17750 },
17751 NormalizedMessage {
17752 idx: 2,
17753 role: "user".into(),
17754 author: Some("developer".into()),
17755 created_at: Some(1700000030000),
17756 content: "Can you also add refresh token support?".into(),
17757 extra: serde_json::json!({}),
17758 snippets: vec![],
17759 invocations: Vec::new(),
17760 },
17761 ],
17762 };
17763 index.add_conversation(&conv)?;
17764 index.commit()?;
17765
17766 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17767
17768 let hits = client.search(
17770 "JWT authentication",
17771 SearchFilters::default(),
17772 10,
17773 0,
17774 FieldMask::FULL,
17775 )?;
17776 assert!(!hits.is_empty(), "Should find JWT authentication");
17777 assert!(hits.iter().any(|h| h.agent == "claude_code"));
17778 assert!(
17779 hits.iter()
17780 .any(|h| h.snippet.contains("JWT") || h.snippet.contains("authentication"))
17781 );
17782
17783 let hits = client.search(
17785 "required packages",
17786 SearchFilters::default(),
17787 10,
17788 0,
17789 FieldMask::FULL,
17790 )?;
17791 assert!(
17792 !hits.is_empty(),
17793 "Should find 'required packages' in assistant response"
17794 );
17795
17796 let hits = client.search(
17798 "refresh token",
17799 SearchFilters::default(),
17800 10,
17801 0,
17802 FieldMask::FULL,
17803 )?;
17804 assert!(!hits.is_empty(), "Should find refresh token");
17805 assert!(hits.iter().any(|h| h.content.contains("refresh")));
17806
17807 Ok(())
17808 }
17809
17810 #[test]
17811 fn search_deduplication_with_similar_content() -> Result<()> {
17812 let dir = TempDir::new()?;
17813 let mut index = TantivyIndex::open_or_create(dir.path())?;
17814
17815 for i in 0..2 {
17817 let conv = NormalizedConversation {
17818 agent_slug: "codex".into(),
17819 external_id: None,
17820 title: Some(format!("similar-{i}")),
17821 workspace: Some(std::path::PathBuf::from("/ws")),
17822 source_path: dir.path().join(format!("similar-{i}.jsonl")),
17823 started_at: Some(100 + i),
17824 ended_at: None,
17825 metadata: serde_json::json!({}),
17826 messages: vec![NormalizedMessage {
17827 idx: 0,
17828 role: "user".into(),
17829 author: None,
17830 created_at: Some(100 + i),
17831 content: "implement the sorting algorithm".into(),
17833 extra: serde_json::json!({}),
17834 snippets: vec![],
17835 invocations: Vec::new(),
17836 }],
17837 };
17838 index.add_conversation(&conv)?;
17839 }
17840 index.commit()?;
17841
17842 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17843 let result = client.search_with_fallback(
17844 "sorting algorithm",
17845 SearchFilters::default(),
17846 10,
17847 0,
17848 100,
17849 FieldMask::FULL,
17850 )?;
17851
17852 assert!(!result.hits.is_empty());
17855
17856 Ok(())
17857 }
17858
17859 #[test]
17864 fn search_session_paths_filter() -> Result<()> {
17865 let dir = TempDir::new()?;
17867 let mut index = TantivyIndex::open_or_create(dir.path())?;
17868
17869 let paths = [
17871 dir.path().join("session-a.jsonl"),
17872 dir.path().join("session-b.jsonl"),
17873 dir.path().join("session-c.jsonl"),
17874 ];
17875
17876 for (i, path) in paths.iter().enumerate() {
17877 let conv = NormalizedConversation {
17878 agent_slug: "claude".into(),
17879 external_id: None,
17880 title: Some(format!("session-{}", i)),
17881 workspace: Some(std::path::PathBuf::from("/ws")),
17882 source_path: path.clone(),
17883 started_at: Some(100 + i as i64),
17884 ended_at: None,
17885 metadata: serde_json::json!({}),
17886 messages: vec![NormalizedMessage {
17887 idx: 0,
17888 role: "user".into(),
17889 author: None,
17890 created_at: Some(100 + i as i64),
17891 content: format!("needle content for session {}", i),
17892 extra: serde_json::json!({}),
17893 snippets: vec![],
17894 invocations: Vec::new(),
17895 }],
17896 };
17897 index.add_conversation(&conv)?;
17898 }
17899 index.commit()?;
17900
17901 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17902
17903 let hits_all = client.search("needle", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
17905 assert_eq!(hits_all.len(), 3, "Should find all 3 sessions");
17906
17907 let mut filters = SearchFilters::default();
17909 filters
17910 .session_paths
17911 .insert(paths[0].to_string_lossy().to_string());
17912 filters
17913 .session_paths
17914 .insert(paths[2].to_string_lossy().to_string());
17915
17916 let hits_filtered = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
17917 assert_eq!(
17918 hits_filtered.len(),
17919 2,
17920 "Should find only 2 sessions (A and C)"
17921 );
17922
17923 let filtered_paths: HashSet<&str> = hits_filtered
17925 .iter()
17926 .map(|h| h.source_path.as_str())
17927 .collect();
17928 assert!(filtered_paths.contains(paths[0].to_string_lossy().as_ref()));
17929 assert!(filtered_paths.contains(paths[2].to_string_lossy().as_ref()));
17930 assert!(!filtered_paths.contains(paths[1].to_string_lossy().as_ref()));
17931
17932 Ok(())
17933 }
17934
17935 #[test]
17936 fn lexical_session_paths_filter_retries_past_initial_page() -> Result<()> {
17937 let dir = TempDir::new()?;
17938 let mut index = TantivyIndex::open_or_create(dir.path())?;
17939 let requested_path = dir.path().join("requested-session.jsonl");
17940
17941 for i in 0..4 {
17942 let conv = NormalizedConversation {
17943 agent_slug: "claude".into(),
17944 external_id: None,
17945 title: Some(format!("distractor-{i}")),
17946 workspace: Some(std::path::PathBuf::from("/ws")),
17947 source_path: dir.path().join(format!("distractor-{i}.jsonl")),
17948 started_at: Some(100 + i as i64),
17949 ended_at: None,
17950 metadata: serde_json::json!({}),
17951 messages: vec![NormalizedMessage {
17952 idx: 0,
17953 role: "user".into(),
17954 author: None,
17955 created_at: Some(100 + i as i64),
17956 content: "needle needle needle high ranking distractor".into(),
17957 extra: serde_json::json!({}),
17958 snippets: vec![],
17959 invocations: Vec::new(),
17960 }],
17961 };
17962 index.add_conversation(&conv)?;
17963 }
17964
17965 let requested = NormalizedConversation {
17966 agent_slug: "claude".into(),
17967 external_id: None,
17968 title: Some("requested".into()),
17969 workspace: Some(std::path::PathBuf::from("/ws")),
17970 source_path: requested_path.clone(),
17971 started_at: Some(200),
17972 ended_at: None,
17973 metadata: serde_json::json!({}),
17974 messages: vec![NormalizedMessage {
17975 idx: 0,
17976 role: "user".into(),
17977 author: None,
17978 created_at: Some(200),
17979 content: "needle requested session should survive post-filter paging".into(),
17980 extra: serde_json::json!({}),
17981 snippets: vec![],
17982 invocations: Vec::new(),
17983 }],
17984 };
17985 index.add_conversation(&requested)?;
17986 index.commit()?;
17987
17988 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17989 let mut filters = SearchFilters::default();
17990 filters
17991 .session_paths
17992 .insert(requested_path.to_string_lossy().to_string());
17993
17994 let hits = client.search("needle", filters, 1, 0, FieldMask::FULL)?;
17995
17996 assert_eq!(hits.len(), 1);
17997 assert_eq!(hits[0].source_path, requested_path.to_string_lossy());
17998
17999 Ok(())
18000 }
18001
18002 #[test]
18003 fn search_session_paths_empty_filter_returns_all() -> Result<()> {
18004 let dir = TempDir::new()?;
18006 let mut index = TantivyIndex::open_or_create(dir.path())?;
18007
18008 let conv = NormalizedConversation {
18009 agent_slug: "claude".into(),
18010 external_id: None,
18011 title: Some("test".into()),
18012 workspace: Some(std::path::PathBuf::from("/ws")),
18013 source_path: dir.path().join("test.jsonl"),
18014 started_at: Some(100),
18015 ended_at: None,
18016 metadata: serde_json::json!({}),
18017 messages: vec![NormalizedMessage {
18018 idx: 0,
18019 role: "user".into(),
18020 author: None,
18021 created_at: Some(100),
18022 content: "needle content".into(),
18023 extra: serde_json::json!({}),
18024 snippets: vec![],
18025 invocations: Vec::new(),
18026 }],
18027 };
18028 index.add_conversation(&conv)?;
18029 index.commit()?;
18030
18031 let client = SearchClient::open(dir.path(), None)?.expect("index present");
18032
18033 let filters = SearchFilters::default();
18035 assert!(filters.session_paths.is_empty());
18036
18037 let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
18038 assert_eq!(hits.len(), 1);
18039
18040 Ok(())
18041 }
18042
18043 #[test]
18044 fn search_client_reads_federated_lexical_bundle_as_one_corpus() -> Result<()> {
18045 let root = TempDir::new()?;
18046 let shard_a = root.path().join("shard-a");
18047 let shard_b = root.path().join("shard-b");
18048 let published = root.path().join("published");
18049
18050 let mut shard_a_index = TantivyIndex::open_or_create(&shard_a)?;
18051 let mut shard_b_index = TantivyIndex::open_or_create(&shard_b)?;
18052
18053 let make_conv =
18054 |external_id: &str, title: &str, source_path: &str, tag: &str| NormalizedConversation {
18055 agent_slug: "codex".into(),
18056 external_id: Some(external_id.into()),
18057 title: Some(title.into()),
18058 workspace: Some(std::path::PathBuf::from("/ws")),
18059 source_path: std::path::PathBuf::from(source_path),
18060 started_at: Some(1_700_000_100_000),
18061 ended_at: Some(1_700_000_100_100),
18062 metadata: json!({}),
18063 messages: vec![
18064 NormalizedMessage {
18065 idx: 0,
18066 role: "user".into(),
18067 author: None,
18068 created_at: Some(1_700_000_100_010),
18069 content: format!("shared federated needle {tag} user"),
18070 extra: json!({}),
18071 snippets: vec![],
18072 invocations: Vec::new(),
18073 },
18074 NormalizedMessage {
18075 idx: 1,
18076 role: "assistant".into(),
18077 author: None,
18078 created_at: Some(1_700_000_100_020),
18079 content: format!("shared federated needle {tag} assistant"),
18080 extra: json!({}),
18081 snippets: vec![],
18082 invocations: Vec::new(),
18083 },
18084 ],
18085 };
18086
18087 let conv_a = make_conv(
18088 "fed-query-a",
18089 "Fed Query A",
18090 "/tmp/fed-query-a.jsonl",
18091 "alpha",
18092 );
18093 let conv_b = make_conv(
18094 "fed-query-b",
18095 "Fed Query B",
18096 "/tmp/fed-query-b.jsonl",
18097 "beta",
18098 );
18099
18100 shard_a_index.add_conversation(&conv_a)?;
18101 shard_b_index.add_conversation(&conv_b)?;
18102 shard_a_index.commit()?;
18103 shard_b_index.commit()?;
18104 drop(shard_a_index);
18105 drop(shard_b_index);
18106
18107 crate::search::tantivy::publish_federated_searchable_index_directories(
18108 &published,
18109 &[&shard_a, &shard_b],
18110 )?;
18111
18112 let client = SearchClient::open(&published, None)?.expect("federated index present");
18113 assert!(client.has_tantivy());
18114 assert_eq!(client.total_docs(), 4);
18115
18116 let hits = client.search(
18117 "shared federated needle",
18118 SearchFilters::default(),
18119 10,
18120 0,
18121 FieldMask::FULL,
18122 )?;
18123 assert_eq!(hits.len(), 4);
18124 let observed_order = hits
18125 .iter()
18126 .map(|hit| {
18127 (
18128 hit.source_path.clone(),
18129 hit.line_number,
18130 hit.content.clone(),
18131 hit.score.to_bits(),
18132 )
18133 })
18134 .collect::<Vec<_>>();
18135 let hit_paths = hits
18136 .iter()
18137 .map(|hit| hit.source_path.as_str())
18138 .collect::<std::collections::HashSet<_>>();
18139 assert!(hit_paths.contains("/tmp/fed-query-a.jsonl"));
18140 assert!(hit_paths.contains("/tmp/fed-query-b.jsonl"));
18141
18142 for attempt in 0..3 {
18143 let repeated = client.search(
18144 "shared federated needle",
18145 SearchFilters::default(),
18146 10,
18147 0,
18148 FieldMask::FULL,
18149 )?;
18150 let repeated_order = repeated
18151 .iter()
18152 .map(|hit| {
18153 (
18154 hit.source_path.clone(),
18155 hit.line_number,
18156 hit.content.clone(),
18157 hit.score.to_bits(),
18158 )
18159 })
18160 .collect::<Vec<_>>();
18161 assert_eq!(
18162 repeated_order, observed_order,
18163 "federated lexical query order drifted on repeated attempt {attempt}"
18164 );
18165 }
18166
18167 Ok(())
18168 }
18169
18170 #[test]
18171 fn semantic_search_session_paths_filter_retries_past_initial_candidates() -> Result<()> {
18172 let fixture = build_semantic_test_fixture()?;
18173 let mut filters = SearchFilters::default();
18174 filters
18175 .session_paths
18176 .insert(fixture.source_paths[2].clone());
18177
18178 let (hits, ann_stats) = fixture.client.search_semantic(
18179 "semantic fixture query",
18180 filters,
18181 1,
18182 0,
18183 FieldMask::FULL,
18184 false,
18185 )?;
18186
18187 assert!(
18188 ann_stats.is_none(),
18189 "exact search should not emit ANN stats"
18190 );
18191 assert_eq!(
18192 hits.len(),
18193 1,
18194 "filtered semantic search should still return a hit"
18195 );
18196 assert_eq!(
18197 hits[0].source_path, fixture.source_paths[2],
18198 "semantic search should keep searching until it finds the requested session path"
18199 );
18200
18201 Ok(())
18202 }
18203
18204 #[test]
18205 fn semantic_search_offsets_after_session_paths_filtering() -> Result<()> {
18206 let fixture = build_semantic_test_fixture()?;
18207 let mut filters = SearchFilters::default();
18208 filters
18209 .session_paths
18210 .insert(fixture.source_paths[1].clone());
18211 filters
18212 .session_paths
18213 .insert(fixture.source_paths[2].clone());
18214
18215 let (hits, _) = fixture.client.search_semantic(
18216 "semantic fixture query",
18217 filters,
18218 1,
18219 1,
18220 FieldMask::FULL,
18221 false,
18222 )?;
18223
18224 assert_eq!(
18225 hits.len(),
18226 1,
18227 "second filtered page should still return one hit"
18228 );
18229 assert_eq!(
18230 hits[0].source_path, fixture.source_paths[2],
18231 "offset must apply after semantic deduplication and session path filtering"
18232 );
18233
18234 Ok(())
18235 }
18236
18237 #[test]
18238 fn semantic_search_merges_sharded_vector_indexes() -> Result<()> {
18239 let fixture = build_sharded_semantic_test_fixture()?;
18240 let (hits, ann_stats) = fixture.client.search_semantic(
18241 "semantic fixture query",
18242 SearchFilters::default(),
18243 3,
18244 0,
18245 FieldMask::FULL,
18246 false,
18247 )?;
18248
18249 assert!(
18250 ann_stats.is_none(),
18251 "sharded exact search should not emit ANN stats"
18252 );
18253 assert_eq!(hits.len(), 3);
18254 assert_eq!(hits[0].source_path, fixture.source_paths[0]);
18255 assert_eq!(hits[1].source_path, fixture.source_paths[1]);
18256 assert_eq!(hits[2].source_path, fixture.source_paths[2]);
18257
18258 Ok(())
18259 }
18260
18261 #[test]
18262 fn progressive_phase_overfetches_before_session_paths_filtering() -> Result<()> {
18263 let fixture = build_semantic_test_fixture()?;
18264 let mut filters = SearchFilters::default();
18265 filters
18266 .session_paths
18267 .insert(fixture.source_paths[2].clone());
18268
18269 let results = vec![
18270 FsScoredResult {
18271 doc_id: fixture.doc_ids[0].clone(),
18272 score: 1.0,
18273 source: FsScoreSource::SemanticFast,
18274 index: None,
18275 fast_score: Some(1.0),
18276 quality_score: None,
18277 lexical_score: None,
18278 rerank_score: None,
18279 explanation: None,
18280 metadata: None,
18281 },
18282 FsScoredResult {
18283 doc_id: fixture.doc_ids[1].clone(),
18284 score: 0.9,
18285 source: FsScoreSource::SemanticFast,
18286 index: None,
18287 fast_score: Some(0.9),
18288 quality_score: None,
18289 lexical_score: None,
18290 rerank_score: None,
18291 explanation: None,
18292 metadata: None,
18293 },
18294 FsScoredResult {
18295 doc_id: fixture.doc_ids[2].clone(),
18296 score: 0.8,
18297 source: FsScoreSource::SemanticFast,
18298 index: None,
18299 fast_score: Some(0.8),
18300 quality_score: None,
18301 lexical_score: None,
18302 rerank_score: None,
18303 explanation: None,
18304 metadata: None,
18305 },
18306 ];
18307
18308 let result = fixture.client.progressive_phase_to_result(
18309 &results,
18310 ProgressivePhaseContext {
18311 query: "session path filter",
18312 filters: &filters,
18313 field_mask: FieldMask::FULL,
18314 lexical_cache: None,
18315 limit: 1,
18316 fetch_limit: 3,
18317 },
18318 )?;
18319
18320 assert_eq!(
18321 result.hits.len(),
18322 1,
18323 "progressive phase should retain enough overfetched hits to satisfy post-search session path filtering"
18324 );
18325 assert_eq!(
18326 result.hits[0].source_path, fixture.source_paths[2],
18327 "progressive phase should page after session path filtering"
18328 );
18329
18330 Ok(())
18331 }
18332
18333 #[test]
18338 fn sql_placeholders_empty() {
18339 assert_eq!(sql_placeholders(0), "");
18340 }
18341
18342 #[test]
18343 fn sql_placeholders_single() {
18344 assert_eq!(sql_placeholders(1), "?");
18345 }
18346
18347 #[test]
18348 fn sql_placeholders_multiple() {
18349 assert_eq!(sql_placeholders(3), "?,?,?");
18350 assert_eq!(sql_placeholders(5), "?,?,?,?,?");
18351 }
18352
18353 #[test]
18354 fn sql_placeholders_capacity_efficient() {
18355 let result = sql_placeholders(3);
18357 assert_eq!(result.len(), 5);
18358 assert!(result.capacity() >= 5); let result = sql_placeholders(10);
18362 assert_eq!(result.len(), 19);
18363 assert!(result.capacity() >= 19);
18364 }
18365
18366 #[test]
18367 fn sql_placeholders_large_count() {
18368 let result = sql_placeholders(100);
18370 assert_eq!(result.len(), 199); assert_eq!(result.chars().filter(|c| *c == '?').count(), 100);
18372 assert_eq!(result.chars().filter(|c| *c == ',').count(), 99);
18373 }
18374
18375 #[test]
18376 fn hybrid_budget_identifier_biases_lexical() {
18377 let budget = hybrid_candidate_budget("src/main.rs", 20, 20, 5, 10_000);
18378 assert!(
18379 budget.lexical_candidates > budget.semantic_candidates,
18380 "identifier queries should allocate more lexical than semantic fanout"
18381 );
18382 assert!(budget.lexical_candidates >= 25);
18383 }
18384
18385 #[test]
18386 fn hybrid_budget_natural_language_biases_semantic() {
18387 let budget = hybrid_candidate_budget(
18388 "how do we fix authentication middleware latency",
18389 20,
18390 20,
18391 5,
18392 10_000,
18393 );
18394 assert!(
18395 budget.semantic_candidates > budget.lexical_candidates,
18396 "natural language queries should allocate more semantic than lexical fanout"
18397 );
18398 }
18399
18400 #[test]
18401 fn hybrid_budget_no_limit_caps_both_lexical_and_semantic() {
18402 let total_docs = 2_000_000;
18410 let budget =
18411 hybrid_candidate_budget("authentication middleware", 0, total_docs, 0, total_docs);
18412 let cap = no_limit_result_cap();
18413 assert!(
18414 budget.lexical_candidates <= cap,
18415 "lexical fanout must respect no_limit_result_cap() = {cap}; got {}",
18416 budget.lexical_candidates
18417 );
18418 assert!(
18419 budget.lexical_candidates <= NO_LIMIT_RESULT_MAX,
18420 "lexical fanout must respect the absolute NO_LIMIT_RESULT_MAX; got {}",
18421 budget.lexical_candidates
18422 );
18423 assert!(budget.semantic_candidates <= HYBRID_NO_LIMIT_SEMANTIC_CAP);
18424 assert!(
18431 budget.semantic_candidates <= budget.lexical_candidates,
18432 "semantic ({}) must not exceed lexical ({}) fanout",
18433 budget.semantic_candidates,
18434 budget.lexical_candidates
18435 );
18436 }
18437
18438 #[test]
18439 fn compute_no_limit_result_cap_clamps_explicit_over_ceiling_env_override() {
18440 let cap = compute_no_limit_result_cap_from(Some("999999999999".to_string()), None, None);
18446 assert!(
18447 cap <= NO_LIMIT_RESULT_MAX,
18448 "explicit override must still clamp to ceiling; got {cap} > {NO_LIMIT_RESULT_MAX}"
18449 );
18450 assert!(cap >= NO_LIMIT_RESULT_MIN);
18451 }
18452
18453 #[test]
18454 fn compute_no_limit_result_cap_clamps_tiny_explicit_override_up_to_floor() {
18455 let cap = compute_no_limit_result_cap_from(Some("1".to_string()), None, None);
18457 assert_eq!(cap, NO_LIMIT_RESULT_MIN);
18458 }
18459
18460 #[test]
18461 fn compute_no_limit_result_cap_uses_meminfo_when_no_env_override() {
18462 let cap = compute_no_limit_result_cap_from(None, None, Some(128u64 * 1024 * 1024 * 1024));
18466 assert!(cap >= NO_LIMIT_RESULT_MIN, "cap {cap} below floor");
18467 assert!(cap <= NO_LIMIT_RESULT_MAX, "cap {cap} above ceiling");
18468 assert!(cap > NO_LIMIT_RESULT_MIN * 10);
18470 }
18471
18472 #[test]
18473 fn compute_no_limit_result_cap_falls_back_to_floor_when_meminfo_unavailable() {
18474 let cap = compute_no_limit_result_cap_from(None, None, None);
18478 assert!(cap >= NO_LIMIT_RESULT_MIN);
18479 assert!(cap <= NO_LIMIT_RESULT_MAX);
18480 }
18481
18482 #[test]
18483 fn compute_no_limit_result_cap_bytes_env_takes_priority_over_meminfo() {
18484 let four_gib = (4u64 * 1024 * 1024 * 1024).to_string();
18489 let cap = compute_no_limit_result_cap_from(
18490 None,
18491 Some(four_gib),
18492 Some(1024u64 * 1024 * 1024 * 1024), );
18494 let expected_hits = ((4u64 * 1024 * 1024 * 1024) / AVG_HIT_BYTES) as usize;
18495 let expected = expected_hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
18496 assert_eq!(cap, expected, "bytes env must win over meminfo");
18497 }
18498
18499 #[test]
18500 fn no_limit_budget_bytes_preserves_fallback_priority() {
18501 let huge_meminfo = Some(1024u64 * 1024 * 1024 * 1024);
18502 let four_gib = 4u64 * 1024 * 1024 * 1024;
18503
18504 assert_eq!(
18505 no_limit_budget_bytes(Some(four_gib.to_string()), huge_meminfo),
18506 four_gib
18507 );
18508 assert_eq!(
18509 no_limit_budget_bytes(Some("0".to_string()), huge_meminfo),
18510 NO_LIMIT_BYTES_CEILING
18511 );
18512 assert_eq!(no_limit_budget_bytes(None, None), NO_LIMIT_BYTES_FLOOR);
18513 }
18514
18515 #[test]
18516 fn compute_no_limit_result_cap_ignores_malformed_env() {
18517 for bad in ["", "abc", "0", "-1"] {
18519 let cap = compute_no_limit_result_cap_from(
18520 Some(bad.to_string()),
18521 Some(bad.to_string()),
18522 None,
18523 );
18524 assert!(cap >= NO_LIMIT_RESULT_MIN, "bad={bad:?} cap={cap}");
18525 assert!(cap <= NO_LIMIT_RESULT_MAX, "bad={bad:?} cap={cap}");
18526 }
18527 }
18528
18529 fn make_test_hit(id: &str, score: f32) -> SearchHit {
18534 SearchHit {
18535 title: id.to_string(),
18536 snippet: String::new(),
18537 content: id.to_string(),
18538 content_hash: stable_content_hash(id),
18539 score,
18540 source_path: format!("/path/{}.jsonl", id),
18541 agent: "test".to_string(),
18542 workspace: "/workspace".to_string(),
18543 workspace_original: None,
18544 created_at: Some(1_700_000_000_000),
18545 line_number: Some(1),
18546 match_type: MatchType::Exact,
18547 source_id: "local".to_string(),
18548 origin_kind: "local".to_string(),
18549 origin_host: None,
18550 conversation_id: None,
18551 }
18552 }
18553
18554 #[test]
18555 fn test_rrf_fusion_ordering() {
18556 let lexical = vec![
18559 make_test_hit("A", 10.0),
18560 make_test_hit("B", 8.0),
18561 make_test_hit("C", 6.0),
18562 ];
18563 let semantic = vec![
18564 make_test_hit("A", 0.9),
18565 make_test_hit("B", 0.7),
18566 make_test_hit("D", 0.5),
18567 ];
18568
18569 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18570
18571 assert_eq!(fused.len(), 4);
18573 assert_eq!(fused[0].title, "A"); assert_eq!(fused[1].title, "B"); }
18577
18578 #[test]
18579 fn test_rrf_handles_disjoint_sets() {
18580 let lexical = vec![make_test_hit("A", 10.0), make_test_hit("B", 8.0)];
18582 let semantic = vec![make_test_hit("C", 0.9), make_test_hit("D", 0.7)];
18583
18584 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18585
18586 assert_eq!(fused.len(), 4);
18588 let titles: Vec<&str> = fused.iter().map(|h| h.title.as_str()).collect();
18589 assert!(titles.contains(&"A"));
18590 assert!(titles.contains(&"B"));
18591 assert!(titles.contains(&"C"));
18592 assert!(titles.contains(&"D"));
18593 }
18594
18595 #[test]
18596 fn test_rrf_tie_breaking_deterministic() {
18597 let lexical = vec![
18599 make_test_hit("X", 5.0),
18600 make_test_hit("Y", 5.0),
18601 make_test_hit("Z", 5.0),
18602 ];
18603 let semantic = vec![]; let fused1 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18607 let fused2 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18608 let fused3 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18609
18610 assert_eq!(fused1.len(), fused2.len());
18612 assert_eq!(fused2.len(), fused3.len());
18613
18614 for i in 0..fused1.len() {
18615 assert_eq!(fused1[i].title, fused2[i].title, "Mismatch at index {}", i);
18616 assert_eq!(fused2[i].title, fused3[i].title, "Mismatch at index {}", i);
18617 }
18618 }
18619
18620 #[test]
18621 fn test_rrf_both_lists_bonus() {
18622 let lexical = vec![
18625 make_test_hit("solo_lex", 10.0), make_test_hit("both", 5.0), ];
18628 let semantic = vec![
18629 make_test_hit("solo_sem", 0.9), make_test_hit("both", 0.5), ];
18632
18633 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18634
18635 assert_eq!(
18639 fused[0].title, "both",
18640 "Doc in both lists should rank first"
18641 );
18642 }
18643
18644 #[test]
18645 fn test_rrf_respects_limit_and_offset() {
18646 let lexical = vec![
18647 make_test_hit("A", 10.0),
18648 make_test_hit("B", 8.0),
18649 make_test_hit("C", 6.0),
18650 ];
18651 let semantic = vec![];
18652
18653 let fused = rrf_fuse_hits(&lexical, &semantic, "", 2, 0);
18655 assert_eq!(fused.len(), 2);
18656
18657 let fused_offset = rrf_fuse_hits(&lexical, &semantic, "", 10, 1);
18659 assert_eq!(fused_offset.len(), 2); let fused_empty = rrf_fuse_hits(&lexical, &semantic, "", 0, 0);
18663 assert!(fused_empty.is_empty());
18664 }
18665
18666 #[test]
18667 fn test_rrf_empty_inputs() {
18668 let empty: Vec<SearchHit> = vec![];
18669 let non_empty = vec![make_test_hit("A", 10.0)];
18670
18671 assert!(rrf_fuse_hits(&empty, &empty, "", 10, 0).is_empty());
18673
18674 let fused = rrf_fuse_hits(&empty, &non_empty, "", 10, 0);
18676 assert_eq!(fused.len(), 1);
18677 assert_eq!(fused[0].title, "A");
18678
18679 let fused = rrf_fuse_hits(&non_empty, &empty, "", 10, 0);
18681 assert_eq!(fused.len(), 1);
18682 assert_eq!(fused[0].title, "A");
18683 }
18684
18685 #[test]
18686 fn test_rrf_coalesces_empty_title_hits_across_search_modes() {
18687 let mut lexical = make_test_hit("shared", 10.0);
18688 lexical.title.clear();
18689 lexical.source_path = "/shared/untitled.jsonl".into();
18690 lexical.content = "same untitled body".into();
18691 lexical.content_hash = stable_content_hash("same untitled body");
18692
18693 let mut semantic = lexical.clone();
18694 semantic.score = 0.9;
18695
18696 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18697 assert_eq!(fused.len(), 1);
18698 assert_eq!(fused[0].title, "");
18699 }
18700
18701 #[test]
18702 fn test_rrf_coalesces_blank_local_source_id_hits_across_search_modes() {
18703 let mut lexical = make_test_hit("shared-local", 10.0);
18704 lexical.source_path = "/shared/local.jsonl".into();
18705 lexical.content = "same local body".into();
18706 lexical.content_hash = stable_content_hash("same local body");
18707 lexical.source_id = "local".into();
18708 lexical.origin_kind = "local".into();
18709
18710 let mut semantic = lexical.clone();
18711 semantic.source_id = " ".into();
18712 semantic.origin_kind = "local".into();
18713 semantic.score = 0.9;
18714
18715 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18716 assert_eq!(fused.len(), 1);
18717 assert_eq!(fused[0].source_id, "local");
18718 }
18719
18720 #[test]
18721 fn test_rrf_keeps_repeated_same_content_at_different_lines() {
18722 let mut first = make_test_hit("same", 10.0);
18723 first.title = "Shared Session".into();
18724 first.source_path = "/shared/session.jsonl".into();
18725 first.content = "repeat me".into();
18726 first.content_hash = stable_content_hash("repeat me");
18727 first.line_number = Some(1);
18728 first.created_at = Some(100);
18729
18730 let mut second = first.clone();
18731 second.line_number = Some(2);
18732 second.created_at = Some(200);
18733 second.score = 0.9;
18734
18735 let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
18736 assert_eq!(fused.len(), 2);
18737 assert_eq!(fused[0].line_number, Some(1));
18738 assert_eq!(fused[1].line_number, Some(2));
18739 }
18740
18741 #[test]
18742 fn test_rrf_coalesces_present_and_missing_conversation_id_for_same_message() {
18743 let mut lexical = make_test_hit("same", 10.0);
18744 lexical.title = "Shared Session".into();
18745 lexical.source_path = "/shared/session.jsonl".into();
18746 lexical.content = "identical body".into();
18747 lexical.content_hash = stable_content_hash("identical body");
18748 lexical.created_at = Some(100);
18749 lexical.line_number = Some(1);
18750 lexical.conversation_id = None;
18751
18752 let mut semantic = lexical.clone();
18753 semantic.conversation_id = Some(42);
18754 semantic.score = 0.9;
18755
18756 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18757 assert_eq!(fused.len(), 1);
18758 assert_eq!(fused[0].conversation_id, Some(42));
18759 }
18760
18761 #[test]
18762 fn test_rrf_coalesces_present_and_missing_conversation_id_despite_blank_local_source_id() {
18763 let mut lexical = make_test_hit("same", 10.0);
18764 lexical.title = "Shared Session".into();
18765 lexical.source_path = "/shared/session.jsonl".into();
18766 lexical.content = "identical body".into();
18767 lexical.content_hash = stable_content_hash("identical body");
18768 lexical.created_at = Some(100);
18769 lexical.line_number = Some(1);
18770 lexical.conversation_id = None;
18771 lexical.source_id = "local".into();
18772 lexical.origin_kind = "local".into();
18773
18774 let mut semantic = lexical.clone();
18775 semantic.conversation_id = Some(42);
18776 semantic.source_id = " ".into();
18777 semantic.origin_kind = "local".into();
18778 semantic.score = 0.9;
18779
18780 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18781 assert_eq!(fused.len(), 1);
18782 assert_eq!(fused[0].conversation_id, Some(42));
18783 }
18784
18785 #[test]
18786 fn test_rrf_keeps_distinct_conversation_ids_for_shared_path_and_content() {
18787 let mut first = make_test_hit("same", 10.0);
18788 first.title = "Shared Session".into();
18789 first.source_path = "/shared/session.jsonl".into();
18790 first.content = "identical body".into();
18791 first.content_hash = stable_content_hash("identical body");
18792 first.conversation_id = Some(1);
18793
18794 let mut second = first.clone();
18795 second.conversation_id = Some(2);
18796 second.score = 0.9;
18797
18798 let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
18799 assert_eq!(fused.len(), 2);
18800 assert!(fused.iter().any(|hit| hit.conversation_id == Some(1)));
18801 assert!(fused.iter().any(|hit| hit.conversation_id == Some(2)));
18802 }
18803
18804 #[test]
18805 fn test_rrf_coalesces_same_conversation_id_despite_title_drift() {
18806 let mut lexical = make_test_hit("same", 10.0);
18807 lexical.title = "Morning Session".into();
18808 lexical.source_path = "/shared/session.jsonl".into();
18809 lexical.content = "identical body".into();
18810 lexical.content_hash = stable_content_hash("identical body");
18811 lexical.conversation_id = Some(9);
18812
18813 let mut semantic = lexical.clone();
18814 semantic.title = "Evening Session".into();
18815 semantic.score = 0.9;
18816
18817 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18818 assert_eq!(fused.len(), 1);
18819 assert_eq!(fused[0].conversation_id, Some(9));
18820 }
18821
18822 #[test]
18823 fn test_rrf_keeps_distinct_titles_for_shared_path_and_content() {
18824 let mut morning = make_test_hit("same", 10.0);
18825 morning.title = "Morning Session".into();
18826 morning.source_path = "/shared/session.jsonl".into();
18827 morning.content = "identical body".into();
18828 morning.content_hash = stable_content_hash("identical body");
18829 morning.created_at = None;
18830
18831 let mut evening = morning.clone();
18832 evening.title = "Evening Session".into();
18833 evening.score = 0.9;
18834
18835 let fused = rrf_fuse_hits(&[morning], &[evening], "", 10, 0);
18836 assert_eq!(fused.len(), 2);
18837 assert!(fused.iter().any(|hit| hit.title == "Morning Session"));
18838 assert!(fused.iter().any(|hit| hit.title == "Evening Session"));
18839 }
18840
18841 #[test]
18842 fn test_rrf_candidate_depth() {
18843 let lexical: Vec<_> = (0..50)
18845 .map(|i| make_test_hit(&format!("L{}", i), 100.0 - i as f32))
18846 .collect();
18847 let semantic: Vec<_> = (0..50)
18848 .map(|i| make_test_hit(&format!("S{}", i), 1.0 - 0.01 * i as f32))
18849 .collect();
18850
18851 let fused = rrf_fuse_hits(&lexical, &semantic, "", 20, 0);
18852
18853 assert_eq!(fused.len(), 20);
18855
18856 let mut seen = std::collections::HashSet::new();
18858 for hit in &fused {
18859 assert!(seen.insert(&hit.title), "Duplicate hit: {}", hit.title);
18860 }
18861 }
18862
18863 #[test]
18868 fn query_token_list_parses_small_queries() {
18869 let cases = [
18870 ("hello", 1),
18871 ("hello world", 2),
18872 ("hello AND world", 3),
18873 ("hello world foo bar", 4),
18874 ];
18875
18876 for (query, expected_len) in cases {
18877 let tokens = parse_boolean_query(query);
18878 assert_eq!(tokens.len(), expected_len, "{query}");
18879 }
18880 }
18881
18882 #[test]
18883 fn query_token_list_parses_large_queries() {
18884 let tokens = parse_boolean_query("a b c d e f g h i");
18885 assert_eq!(tokens.len(), 9);
18886 }
18887
18888 #[test]
18889 fn query_token_list_handles_quoted_phrases() {
18890 let tokens = parse_boolean_query("\"hello world\" test");
18891 assert_eq!(tokens.len(), 2);
18892
18893 assert!(
18895 matches!(&tokens[0], QueryToken::Phrase(phrase) if phrase == "hello world"),
18896 "Expected Phrase token"
18897 );
18898 }
18899
18900 #[test]
18901 fn query_token_list_handles_operators() {
18902 let tokens = parse_boolean_query("foo AND bar OR baz");
18903 assert_eq!(tokens.len(), 5);
18904 assert_eq!(tokens[1], QueryToken::And);
18905 assert_eq!(tokens[3], QueryToken::Or);
18906 }
18907
18908 #[test]
18909 fn query_token_list_empty_query() {
18910 let tokens = parse_boolean_query("");
18911 assert!(tokens.is_empty());
18912 }
18913
18914 #[test]
18915 fn query_token_list_iteration_works() {
18916 let tokens = parse_boolean_query("a b c");
18917 let terms: Vec<_> = tokens
18918 .iter()
18919 .filter_map(|t| match t {
18920 QueryToken::Term(s) => Some(s.as_str()),
18921 _ => None,
18922 })
18923 .collect();
18924 assert_eq!(terms, vec!["a", "b", "c"]);
18925 }
18926
18927 #[test]
18937 fn unicode_emoji_treated_as_separator() {
18938 let sanitized = sanitize_query("🚀 launch");
18940 assert_eq!(sanitized, " launch", "Emoji should become space");
18941 }
18942
18943 #[test]
18944 fn unicode_emoji_splits_terms() {
18945 let sanitized = sanitize_query("hot🔥code");
18947 assert_eq!(sanitized, "hot code", "Emoji between words splits them");
18948 }
18949
18950 #[test]
18951 fn unicode_multiple_emoji_become_spaces() {
18952 let sanitized = sanitize_query("🚀🔥💻");
18953 assert_eq!(
18954 sanitized.trim(),
18955 "",
18956 "All-emoji query sanitizes to whitespace"
18957 );
18958 }
18959
18960 #[test]
18961 fn unicode_emoji_query_parses_without_panic() {
18962 let tokens = parse_boolean_query("🚀 launch code 🔥");
18963 let terms: Vec<_> = tokens
18964 .iter()
18965 .filter_map(|t| match t {
18966 QueryToken::Term(s) => Some(s.clone()),
18967 _ => None,
18968 })
18969 .collect();
18970 assert!(
18972 terms
18973 .iter()
18974 .any(|t| t.contains("launch") || t.contains("code"))
18975 );
18976 }
18977
18978 #[test]
18979 fn unicode_emoji_query_terms_lower() {
18980 let terms = QueryTermsLower::from_query("🚀 LAUNCH");
18981 let tokens: Vec<&str> = terms.tokens().collect();
18983 assert!(
18984 tokens.contains(&"launch"),
18985 "Should extract 'launch' from emoji query"
18986 );
18987 }
18988
18989 #[test]
18992 fn unicode_cjk_chinese_preserved() {
18993 assert_eq!(sanitize_query("测试代码"), "测试代码");
18994 assert_eq!(sanitize_query("测试 代码"), "测试 代码");
18995 }
18996
18997 #[test]
18998 fn unicode_cjk_japanese_preserved() {
18999 assert_eq!(sanitize_query("テスト"), "テスト");
19000 assert_eq!(sanitize_query("こんにちは世界"), "こんにちは世界");
19002 }
19003
19004 #[test]
19005 fn unicode_cjk_korean_preserved() {
19006 assert_eq!(sanitize_query("테스트"), "테스트");
19007 assert_eq!(sanitize_query("안녕하세요"), "안녕하세요");
19008 }
19009
19010 #[test]
19011 fn unicode_cjk_parsed_as_terms() {
19012 let tokens = parse_boolean_query("测试 代码 search");
19013 let terms: Vec<_> = tokens
19014 .iter()
19015 .filter_map(|t| match t {
19016 QueryToken::Term(s) => Some(s.as_str()),
19017 _ => None,
19018 })
19019 .collect();
19020 assert_eq!(terms, vec!["测试", "代码", "search"]);
19021 }
19022
19023 #[test]
19024 fn unicode_cjk_query_terms_lower() {
19025 let terms = QueryTermsLower::from_query("测试 代码");
19026 let tokens: Vec<&str> = terms.tokens().collect();
19027 assert_eq!(tokens, vec!["测试", "代码"]);
19028 }
19029
19030 #[test]
19033 fn unicode_hebrew_preserved() {
19034 assert_eq!(sanitize_query("שלום עולם"), "שלום עולם");
19035 }
19036
19037 #[test]
19038 fn unicode_arabic_preserved() {
19039 assert_eq!(sanitize_query("مرحبا"), "مرحبا");
19040 }
19041
19042 #[test]
19043 fn unicode_hebrew_parsed_as_terms() {
19044 let tokens = parse_boolean_query("שלום עולם");
19045 let terms: Vec<_> = tokens
19046 .iter()
19047 .filter_map(|t| match t {
19048 QueryToken::Term(s) => Some(s.as_str()),
19049 _ => None,
19050 })
19051 .collect();
19052 assert_eq!(terms, vec!["שלום", "עולם"]);
19053 }
19054
19055 #[test]
19056 fn unicode_arabic_query_terms_lower() {
19057 let terms = QueryTermsLower::from_query("مرحبا بالعالم");
19059 let tokens: Vec<&str> = terms.tokens().collect();
19060 assert_eq!(tokens, vec!["مرحبا", "بالعالم"]);
19061 }
19062
19063 #[test]
19066 fn unicode_mixed_scripts_preserved() {
19067 let sanitized = sanitize_query("Hello 世界 мир");
19068 assert_eq!(sanitized, "Hello 世界 мир");
19069 }
19070
19071 #[test]
19072 fn unicode_mixed_scripts_parsed() {
19073 let tokens = parse_boolean_query("Hello 世界 мир");
19074 let terms: Vec<_> = tokens
19075 .iter()
19076 .filter_map(|t| match t {
19077 QueryToken::Term(s) => Some(s.as_str()),
19078 _ => None,
19079 })
19080 .collect();
19081 assert_eq!(terms, vec!["Hello", "世界", "мир"]);
19082 }
19083
19084 #[test]
19085 fn unicode_mixed_scripts_with_emoji() {
19086 let sanitized = sanitize_query("Hello 🌍 世界");
19088 assert_eq!(sanitized, "Hello 世界");
19089 }
19090
19091 #[test]
19092 fn unicode_latin_cyrillic_arabic_query() {
19093 let terms = QueryTermsLower::from_query("Hello Мир مرحبا");
19094 let tokens: Vec<&str> = terms.tokens().collect();
19095 assert_eq!(tokens, vec!["hello", "мир", "مرحبا"]);
19096 }
19097
19098 #[test]
19101 fn unicode_zero_width_joiner_removed() {
19102 let sanitized = sanitize_query("test\u{200D}query");
19104 assert_eq!(sanitized, "test query");
19105 }
19106
19107 #[test]
19108 fn unicode_zero_width_non_joiner_removed() {
19109 let sanitized = sanitize_query("test\u{200C}query");
19111 assert_eq!(sanitized, "test query");
19112 }
19113
19114 #[test]
19115 fn unicode_zero_width_space_removed() {
19116 let sanitized = sanitize_query("test\u{200B}query");
19118 assert_eq!(sanitized, "test query");
19119 }
19120
19121 #[test]
19122 fn unicode_bom_removed() {
19123 let sanitized = sanitize_query("\u{FEFF}test");
19125 assert_eq!(sanitized, " test");
19126 }
19127
19128 #[test]
19131 fn unicode_precomposed_accent_preserved() {
19132 let sanitized = sanitize_query("café");
19134 assert_eq!(sanitized, "café");
19135 }
19136
19137 #[test]
19138 fn unicode_combining_accent_becomes_separator() {
19139 let input = "cafe\u{0301}";
19143 let sanitized = sanitize_query(input);
19144 assert_eq!(sanitized, "caf\u{00e9}");
19145 }
19146
19147 #[test]
19148 fn unicode_nfc_and_nfd_produce_same_sanitized_query() {
19149 let nfc = "caf\u{00E9}";
19151 let nfd = "cafe\u{0301}";
19153
19154 let san_nfc = sanitize_query(nfc);
19155 let san_nfd = sanitize_query(nfd);
19156
19157 assert_eq!(san_nfc, "café");
19161 assert_eq!(san_nfd, "café");
19162 assert_eq!(san_nfc, san_nfd);
19163 }
19164
19165 #[test]
19166 fn unicode_combining_marks_do_not_panic() {
19167 let zalgo = "t\u{0301}\u{0302}\u{0303}e\u{0304}\u{0305}st";
19169 let sanitized = sanitize_query(zalgo);
19170 assert!(sanitized.contains('t'));
19172 assert!(sanitized.contains('s'));
19173 }
19174
19175 #[test]
19178 fn unicode_mathematical_bold_letters_preserved() {
19179 let input = "\u{1D400}\u{1D401}\u{1D402}";
19181 let sanitized = sanitize_query(input);
19182 assert_eq!(
19183 sanitized, input,
19184 "Mathematical bold letters are alphanumeric"
19185 );
19186 }
19187
19188 #[test]
19189 fn unicode_supplementary_ideograph_preserved() {
19190 let input = "\u{20000}";
19192 let sanitized = sanitize_query(input);
19193 assert_eq!(
19194 sanitized, input,
19195 "Supplementary CJK ideographs are alphanumeric"
19196 );
19197 }
19198
19199 #[test]
19200 fn unicode_supplementary_emoji_removed() {
19201 let input = "test\u{1F600}query";
19203 let sanitized = sanitize_query(input);
19204 assert_eq!(sanitized, "test query");
19205 }
19206
19207 #[test]
19210 fn unicode_bidi_mixed_ltr_rtl_no_panic() {
19211 let input = "hello שלום world עולם";
19212 let tokens = parse_boolean_query(input);
19213 let terms: Vec<_> = tokens
19214 .iter()
19215 .filter_map(|t| match t {
19216 QueryToken::Term(s) => Some(s.as_str()),
19217 _ => None,
19218 })
19219 .collect();
19220 assert_eq!(terms.len(), 4);
19221 assert!(terms.contains(&"hello"));
19222 assert!(terms.contains(&"שלום"));
19223 assert!(terms.contains(&"world"));
19224 assert!(terms.contains(&"עולם"));
19225 }
19226
19227 #[test]
19228 fn unicode_bidi_override_chars_removed() {
19229 let input = "test\u{202D}content\u{202C}end";
19232 let sanitized = sanitize_query(input);
19233 assert_eq!(sanitized, "test content end");
19234 }
19235
19236 #[test]
19237 fn unicode_bidi_rtl_mark_removed() {
19238 let input = "test\u{200F}content";
19240 let sanitized = sanitize_query(input);
19241 assert_eq!(sanitized, "test content");
19242 }
19243
19244 #[test]
19247 fn unicode_full_pipeline_cjk_query() {
19248 let explanation = QueryExplanation::analyze("测试 代码", &SearchFilters::default());
19249 assert_eq!(explanation.parsed.terms.len(), 2);
19250 assert!(!explanation.parsed.terms[0].text.is_empty());
19251 assert!(!explanation.parsed.terms[1].text.is_empty());
19252 }
19253
19254 #[test]
19255 fn unicode_full_pipeline_mixed_script_boolean() {
19256 let explanation =
19257 QueryExplanation::analyze("Hello AND 世界 OR مرحبا", &SearchFilters::default());
19258 assert!(
19260 explanation.parsed.operators.iter().any(|op| op == "AND"),
19261 "AND operator should be recognized in mixed-script query"
19262 );
19263 }
19264
19265 #[test]
19266 fn unicode_full_pipeline_emoji_query_type() {
19267 let explanation = QueryExplanation::analyze("🚀🔥💻", &SearchFilters::default());
19269 assert!(
19271 explanation.parsed.terms.is_empty()
19272 || explanation
19273 .parsed
19274 .terms
19275 .iter()
19276 .all(|t| t.subterms.is_empty()),
19277 "All-emoji query should produce no meaningful terms"
19278 );
19279 }
19280
19281 #[test]
19282 fn unicode_full_pipeline_phrase_with_cjk() {
19283 let explanation = QueryExplanation::analyze("\"测试代码\"", &SearchFilters::default());
19284 assert!(
19285 !explanation.parsed.phrases.is_empty(),
19286 "CJK phrase should be recognized"
19287 );
19288 }
19289
19290 #[test]
19291 fn unicode_full_pipeline_wildcard_with_unicode() {
19292 let explanation = QueryExplanation::analyze("*测试*", &SearchFilters::default());
19293 assert!(
19294 !explanation.parsed.terms.is_empty(),
19295 "Wildcard with CJK should produce terms"
19296 );
19297 if let Some(term) = explanation.parsed.terms.first() {
19299 assert!(
19300 term.subterms
19301 .iter()
19302 .any(|s| s.pattern.contains("*") || s.pattern == "exact"),
19303 "CJK wildcard should produce wildcard or exact pattern"
19304 );
19305 }
19306 }
19307
19308 #[test]
19309 fn unicode_query_terms_lower_case_folding() {
19310 let terms = QueryTermsLower::from_query("STRAßE");
19312 assert_eq!(terms.query_lower, "straße");
19313
19314 let terms2 = QueryTermsLower::from_query("HELLO");
19317 assert_eq!(terms2.query_lower, "hello");
19318 }
19319
19320 #[test]
19321 fn unicode_normalize_term_parts_cjk() {
19322 let parts = normalize_term_parts("测试 代码");
19323 assert_eq!(parts, vec!["测试", "代码"]);
19324 }
19325
19326 #[test]
19327 fn unicode_normalize_term_parts_strips_emoji() {
19328 let parts = normalize_term_parts("🚀launch🔥code");
19329 assert!(parts.contains(&"launch".to_string()));
19331 assert!(parts.contains(&"code".to_string()));
19332 }
19333
19334 #[test]
19339 fn special_char_unbalanced_quote_no_panic() {
19340 let tokens = parse_boolean_query("\"hello world");
19341 assert!(
19342 tokens
19343 .iter()
19344 .any(|t| matches!(t, QueryToken::Phrase(p) if p.contains("hello"))),
19345 "Unbalanced quote should still produce a phrase: {tokens:?}"
19346 );
19347 }
19348
19349 #[test]
19350 fn special_char_unbalanced_trailing_quote() {
19351 let tokens = parse_boolean_query("test\"");
19352 assert!(
19353 tokens
19354 .iter()
19355 .any(|t| matches!(t, QueryToken::Term(w) if w == "test")),
19356 "Text before trailing quote should parse as term: {tokens:?}"
19357 );
19358 }
19359
19360 #[test]
19361 fn special_char_multiple_unbalanced_quotes() {
19362 let tokens = parse_boolean_query("\"foo \"bar");
19363 assert!(
19364 !tokens.is_empty(),
19365 "Should parse despite odd quotes: {tokens:?}"
19366 );
19367 }
19368
19369 #[test]
19370 fn special_char_empty_quotes() {
19371 let tokens = parse_boolean_query("\"\" test");
19372 assert!(
19373 tokens
19374 .iter()
19375 .any(|t| matches!(t, QueryToken::Term(w) if w == "test")),
19376 "Empty quotes should be skipped: {tokens:?}"
19377 );
19378 }
19379
19380 #[test]
19381 fn special_char_unbalanced_via_sanitize() {
19382 let sanitized = sanitize_query("\"hello world");
19383 assert!(
19384 sanitized.contains('"'),
19385 "Quotes preserved by sanitize_query"
19386 );
19387 }
19388
19389 #[test]
19392 fn special_char_backslash_quote_sanitize() {
19393 let sanitized = sanitize_query("\\\"test\\\"");
19394 assert!(sanitized.contains('"'));
19395 assert!(!sanitized.contains('\\'), "Backslash should be stripped");
19396 }
19397
19398 #[test]
19399 fn special_char_backslash_quote_parse() {
19400 let tokens = parse_boolean_query("\\\"test\\\"");
19401 assert!(!tokens.is_empty(), "Should parse without panic: {tokens:?}");
19402 }
19403
19404 #[test]
19405 fn special_char_inner_escaped_quotes() {
19406 let tokens = parse_boolean_query("\"test \\\"inner\\\" test\"");
19407 assert!(
19408 !tokens.is_empty(),
19409 "Nested escaped quotes should not panic: {tokens:?}"
19410 );
19411 }
19412
19413 #[test]
19416 fn special_char_windows_path_sanitize() {
19417 let sanitized = sanitize_query("C:\\Users\\test");
19418 assert_eq!(sanitized, "C Users test");
19419 }
19420
19421 #[test]
19422 fn special_char_unc_path_sanitize() {
19423 let sanitized = sanitize_query("\\\\server\\share");
19424 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19425 assert!(parts.contains(&"server"));
19426 assert!(parts.contains(&"share"));
19427 }
19428
19429 #[test]
19430 fn special_char_windows_path_terms() {
19431 let parts = normalize_term_parts("C:\\Users\\test\\file.rs");
19432 assert!(parts.contains(&"C".to_string()));
19433 assert!(parts.contains(&"Users".to_string()));
19434 assert!(parts.contains(&"test".to_string()));
19435 assert!(parts.contains(&"file".to_string()));
19436 assert!(parts.contains(&"rs".to_string()));
19437 }
19438
19439 #[test]
19442 fn special_char_regex_dot_star() {
19443 let sanitized = sanitize_query("foo.*bar");
19444 assert_eq!(sanitized, "foo *bar");
19445 }
19446
19447 #[test]
19448 fn special_char_regex_char_class() {
19449 let sanitized = sanitize_query("[a-z]+");
19450 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19451 assert_eq!(parts, vec!["a-z"]);
19452 assert_eq!(normalize_term_parts("[a-z]+"), vec!["a", "z"]);
19453 }
19454
19455 #[test]
19456 fn special_char_regex_anchors() {
19457 let sanitized = sanitize_query("^start$");
19458 assert_eq!(sanitized.trim(), "start");
19459 }
19460
19461 #[test]
19462 fn special_char_regex_pipe_groups() {
19463 let sanitized = sanitize_query("(foo|bar)");
19464 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19465 assert_eq!(parts, vec!["foo", "bar"]);
19466 }
19467
19468 #[test]
19471 fn special_char_sql_injection_or() {
19472 let sanitized = sanitize_query("'OR 1=1--");
19473 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19474 assert!(parts.contains(&"OR"));
19475 assert!(parts.contains(&"1"));
19476 assert!(!sanitized.contains('\''));
19477 assert!(!sanitized.contains('='));
19478 }
19479
19480 #[test]
19481 fn special_char_sql_injection_drop() {
19482 let sanitized = sanitize_query("; DROP TABLE users;--");
19483 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19484 assert!(parts.contains(&"DROP"));
19485 assert!(parts.contains(&"TABLE"));
19486 assert!(parts.contains(&"users"));
19487 assert!(!sanitized.contains(';'));
19488 }
19489
19490 #[test]
19491 fn special_char_sql_injection_union() {
19492 let sanitized = sanitize_query("' UNION SELECT * FROM passwords --");
19493 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19494 assert!(parts.contains(&"UNION"));
19495 assert!(parts.contains(&"SELECT"));
19496 assert!(parts.contains(&"*"));
19497 assert!(parts.contains(&"FROM"));
19498 assert!(parts.contains(&"passwords"));
19499 }
19500
19501 #[test]
19502 fn special_char_sql_parse_as_literal() {
19503 let tokens = parse_boolean_query("OR 1=1");
19504 assert!(
19505 tokens.iter().any(|t| matches!(t, QueryToken::Or)),
19506 "OR should be parsed as Or operator: {tokens:?}"
19507 );
19508 }
19509
19510 #[test]
19513 fn special_char_shell_subshell() {
19514 let sanitized = sanitize_query("$(cmd)");
19515 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19516 assert_eq!(parts, vec!["cmd"]);
19517 }
19518
19519 #[test]
19520 fn special_char_shell_backticks() {
19521 let sanitized = sanitize_query("`cmd`");
19522 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19523 assert_eq!(parts, vec!["cmd"]);
19524 }
19525
19526 #[test]
19527 fn special_char_shell_pipe_rm() {
19528 let sanitized = sanitize_query("| rm -rf /");
19529 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19530 assert!(parts.contains(&"rm"));
19531 assert!(parts.contains(&"-rf"));
19532 assert_eq!(normalize_term_parts("| rm -rf /"), vec!["rm", "rf"]);
19533 assert!(!sanitized.contains('|'));
19534 assert!(!sanitized.contains('/'));
19535 }
19536
19537 #[test]
19538 fn special_char_shell_semicolon_chain() {
19539 let sanitized = sanitize_query("test; echo pwned; cat /etc/passwd");
19540 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19541 assert!(parts.contains(&"test"));
19542 assert!(parts.contains(&"echo"));
19543 assert!(parts.contains(&"pwned"));
19544 assert!(!sanitized.contains(';'));
19545 }
19546
19547 #[test]
19550 fn special_char_null_byte_mid_string() {
19551 let sanitized = sanitize_query("test\x00hidden");
19552 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19553 assert_eq!(parts, vec!["test", "hidden"]);
19554 }
19555
19556 #[test]
19557 fn special_char_null_byte_leading() {
19558 let sanitized = sanitize_query("\x00\x00attack");
19559 assert_eq!(sanitized.trim(), "attack");
19560 }
19561
19562 #[test]
19563 fn special_char_null_byte_trailing() {
19564 let sanitized = sanitize_query("query\x00\x00\x00");
19565 assert_eq!(sanitized.trim(), "query");
19566 }
19567
19568 #[test]
19569 fn special_char_null_byte_parse() {
19570 let tokens = parse_boolean_query("test\x00hidden");
19571 assert!(
19572 !tokens.is_empty(),
19573 "Null bytes should not prevent parsing: {tokens:?}"
19574 );
19575 }
19576
19577 #[test]
19580 fn special_char_control_newline() {
19581 let sanitized = sanitize_query("line1\nline2");
19582 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19583 assert_eq!(parts, vec!["line1", "line2"]);
19584 }
19585
19586 #[test]
19587 fn special_char_control_tab_cr() {
19588 let sanitized = sanitize_query("tab\there\r\nend");
19589 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19590 assert_eq!(parts, vec!["tab", "here", "end"]);
19591 }
19592
19593 #[test]
19594 fn special_char_control_parse_whitespace() {
19595 let tokens = parse_boolean_query("hello\tworld\ntest");
19596 let terms: Vec<&str> = tokens
19597 .iter()
19598 .filter_map(|t| match t {
19599 QueryToken::Term(s) => Some(s.as_str()),
19600 _ => None,
19601 })
19602 .collect();
19603 assert_eq!(terms, vec!["hello", "world", "test"]);
19604 }
19605
19606 #[test]
19607 fn special_char_control_bell_escape() {
19608 let sanitized = sanitize_query("test\x07\x1b[31mred");
19609 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19610 assert!(parts.contains(&"test"));
19611 assert!(parts.contains(&"31mred"));
19612 }
19613
19614 #[test]
19617 fn special_char_html_entity_lt() {
19618 let sanitized = sanitize_query("<script>");
19619 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19620 assert_eq!(parts, vec!["lt", "script", "gt"]);
19621 }
19622
19623 #[test]
19624 fn special_char_html_numeric_entity() {
19625 let sanitized = sanitize_query("<script>");
19626 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19627 assert!(parts.contains(&"x3C"));
19628 assert!(parts.contains(&"script"));
19629 assert!(parts.contains(&"x3E"));
19630 }
19631
19632 #[test]
19633 fn special_char_html_tags_stripped() {
19634 let sanitized = sanitize_query("<script>alert('xss')</script>");
19635 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19636 assert!(parts.contains(&"script"));
19637 assert!(parts.contains(&"alert"));
19638 assert!(parts.contains(&"xss"));
19639 }
19640
19641 #[test]
19642 fn special_char_html_attribute() {
19643 let sanitized = sanitize_query("<img src=\"evil.js\" onerror=\"alert(1)\">");
19644 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19645 assert!(parts.contains(&"img"));
19646 assert!(parts.contains(&"src"));
19647 assert!(parts.contains(&"onerror"));
19648 }
19649
19650 #[test]
19653 fn special_char_url_percent_encoding() {
19654 let sanitized = sanitize_query("%20space%2Fslash");
19655 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19656 assert_eq!(parts, vec!["20space", "2Fslash"]);
19657 }
19658
19659 #[test]
19660 fn special_char_url_null_byte_encoded() {
19661 let sanitized = sanitize_query("test%00hidden");
19662 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19663 assert_eq!(parts, vec!["test", "00hidden"]);
19664 }
19665
19666 #[test]
19667 fn special_char_url_full_query_string() {
19668 let sanitized = sanitize_query("search?q=hello&lang=en");
19669 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19670 assert_eq!(parts, vec!["search", "q", "hello", "lang", "en"]);
19671 }
19672
19673 #[test]
19676 fn special_char_explain_sql_injection() {
19677 let filters = SearchFilters::default();
19678 let explanation = QueryExplanation::analyze("'OR 1=1--", &filters);
19679 assert!(
19680 !explanation.parsed.terms.is_empty() || !explanation.parsed.phrases.is_empty(),
19681 "SQL injection should produce parseable terms"
19682 );
19683 }
19684
19685 #[test]
19686 fn special_char_explain_shell_injection() {
19687 let filters = SearchFilters::default();
19688 let explanation = QueryExplanation::analyze("$(rm -rf /)", &filters);
19689 assert!(
19690 !explanation.parsed.terms.is_empty(),
19691 "Shell injection should produce parseable terms"
19692 );
19693 }
19694
19695 #[test]
19696 fn special_char_explain_html_xss() {
19697 let filters = SearchFilters::default();
19698 let explanation = QueryExplanation::analyze("<script>alert('xss')</script>", &filters);
19699 assert!(
19700 !explanation.parsed.terms.is_empty(),
19701 "XSS payload should produce parseable terms"
19702 );
19703 }
19704
19705 #[test]
19706 fn special_char_terms_lower_injection() {
19707 let qt = QueryTermsLower::from_query("'; DROP TABLE--");
19708 let tokens: Vec<&str> = qt.tokens().collect();
19709 for token in &tokens {
19710 assert!(
19711 token.chars().all(|c| c.is_alphanumeric()),
19712 "Token should only contain alphanumeric characters: {token}"
19713 );
19714 }
19715 }
19716
19717 #[test]
19718 fn special_char_terms_lower_null_bytes() {
19719 let qt = QueryTermsLower::from_query("test\x00hidden");
19720 let tokens: Vec<&str> = qt.tokens().collect();
19721 assert!(tokens.contains(&"test"));
19722 assert!(tokens.contains(&"hidden"));
19723 }
19724
19725 #[test]
19726 fn special_char_boolean_with_injection() {
19727 let tokens = parse_boolean_query("search AND 'OR 1=1-- NOT drop");
19728 assert!(
19729 tokens.iter().any(|t| matches!(t, QueryToken::And)),
19730 "Boolean AND should still be recognized: {tokens:?}"
19731 );
19732 assert!(
19733 tokens.iter().any(|t| matches!(t, QueryToken::Not)),
19734 "Boolean NOT should still be recognized: {tokens:?}"
19735 );
19736 }
19737
19738 #[test]
19744 fn stress_query_100k_chars_completes_quickly() {
19745 let long_query = "a ".repeat(50000);
19747 assert_eq!(long_query.len(), 100000);
19748
19749 let start = std::time::Instant::now();
19750 let sanitized = sanitize_query(&long_query);
19751 let elapsed_sanitize = start.elapsed();
19752
19753 let start = std::time::Instant::now();
19754 let tokens = parse_boolean_query(&sanitized);
19755 let elapsed_parse = start.elapsed();
19756
19757 assert!(
19758 elapsed_sanitize < std::time::Duration::from_secs(1),
19759 "sanitize_query with 100k chars took {:?} (>1s)",
19760 elapsed_sanitize
19761 );
19762 assert!(
19763 elapsed_parse < std::time::Duration::from_secs(1),
19764 "parse_boolean_query with 100k chars took {:?} (>1s)",
19765 elapsed_parse
19766 );
19767 assert!(!tokens.is_empty(), "100k char query should produce tokens");
19768 }
19769
19770 #[test]
19771 fn stress_query_1000_terms() {
19772 let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
19774 let query = words.join(" ");
19775
19776 let start = std::time::Instant::now();
19777 let sanitized = sanitize_query(&query);
19778 let tokens = parse_boolean_query(&sanitized);
19779 let elapsed = start.elapsed();
19780
19781 assert!(
19782 elapsed < std::time::Duration::from_secs(1),
19783 "1000 terms query took {:?} (>1s)",
19784 elapsed
19785 );
19786 let term_count = tokens
19788 .iter()
19789 .filter(|t| matches!(t, QueryToken::Term(_)))
19790 .count();
19791 assert!(
19792 term_count >= 900,
19793 "Expected ~1000 terms, got {} terms",
19794 term_count
19795 );
19796 }
19797
19798 #[test]
19799 fn stress_query_1000_identical_terms() {
19800 let query = "test ".repeat(1000);
19802
19803 let start = std::time::Instant::now();
19804 let sanitized = sanitize_query(&query);
19805 let tokens = parse_boolean_query(&sanitized);
19806 let elapsed = start.elapsed();
19807
19808 assert!(
19809 elapsed < std::time::Duration::from_secs(1),
19810 "1000 identical terms query took {:?} (>1s)",
19811 elapsed
19812 );
19813
19814 let parsed_term_count = tokens
19816 .iter()
19817 .filter(|t| matches!(t, QueryToken::Term(_)))
19818 .count();
19819 assert_eq!(parsed_term_count, 1000, "Parser should produce 1000 terms");
19820
19821 let qt = QueryTermsLower::from_query(&query);
19823 let tokens_lower: Vec<&str> = qt.tokens().collect();
19824 assert_eq!(
19825 tokens_lower.len(),
19826 1000,
19827 "All 1000 identical terms should be preserved"
19828 );
19829 assert!(
19830 tokens_lower.iter().all(|t| *t == "test"),
19831 "All tokens should be 'test'"
19832 );
19833 }
19834
19835 #[test]
19836 fn stress_query_10k_char_single_term() {
19837 let long_term = "a".repeat(10000);
19839
19840 let start = std::time::Instant::now();
19841 let sanitized = sanitize_query(&long_term);
19842 let tokens = parse_boolean_query(&sanitized);
19843 let elapsed = start.elapsed();
19844
19845 assert!(
19846 elapsed < std::time::Duration::from_secs(1),
19847 "10k char single term took {:?} (>1s)",
19848 elapsed
19849 );
19850 assert_eq!(tokens.len(), 1, "Should produce exactly one token");
19851 assert!(
19852 matches!(&tokens[0], QueryToken::Term(t) if t.len() == 10000),
19853 "Expected Term token"
19854 );
19855 }
19856
19857 #[test]
19858 fn stress_deeply_nested_parentheses() {
19859 let open_parens = "(".repeat(100);
19862 let close_parens = ")".repeat(100);
19863 let query = format!("{}test{}", open_parens, close_parens);
19864
19865 let start = std::time::Instant::now();
19866 let sanitized = sanitize_query(&query);
19867 let tokens = parse_boolean_query(&sanitized);
19868 let elapsed = start.elapsed();
19869
19870 assert!(
19871 elapsed < std::time::Duration::from_millis(100),
19872 "Deeply nested parens took {:?} (>100ms)",
19873 elapsed
19874 );
19875 let term_count = tokens
19877 .iter()
19878 .filter(|t| matches!(t, QueryToken::Term(_)))
19879 .count();
19880 assert_eq!(term_count, 1, "Should have 1 term after sanitizing parens");
19881 }
19882
19883 #[test]
19884 fn stress_many_boolean_operators() {
19885 let terms: Vec<String> = (0..101).map(|i| format!("term{}", i)).collect();
19887 let query = terms.join(" AND ");
19888
19889 let start = std::time::Instant::now();
19890 let tokens = parse_boolean_query(&query);
19891 let elapsed = start.elapsed();
19892
19893 assert!(
19894 elapsed < std::time::Duration::from_secs(1),
19895 "100+ boolean ops took {:?} (>1s)",
19896 elapsed
19897 );
19898
19899 let and_count = tokens
19900 .iter()
19901 .filter(|t| matches!(t, QueryToken::And))
19902 .count();
19903 let term_count = tokens
19904 .iter()
19905 .filter(|t| matches!(t, QueryToken::Term(_)))
19906 .count();
19907
19908 assert_eq!(and_count, 100, "Should have 100 AND operators");
19909 assert_eq!(term_count, 101, "Should have 101 terms");
19910 }
19911
19912 #[test]
19913 fn stress_many_or_operators() {
19914 let terms: Vec<String> = (0..101).map(|i| format!("opt{}", i)).collect();
19916 let query = terms.join(" OR ");
19917
19918 let start = std::time::Instant::now();
19919 let tokens = parse_boolean_query(&query);
19920 let elapsed = start.elapsed();
19921
19922 assert!(
19923 elapsed < std::time::Duration::from_secs(1),
19924 "100+ OR ops took {:?} (>1s)",
19925 elapsed
19926 );
19927
19928 let or_count = tokens
19929 .iter()
19930 .filter(|t| matches!(t, QueryToken::Or))
19931 .count();
19932 assert_eq!(or_count, 100, "Should have 100 OR operators");
19933 }
19934
19935 #[test]
19936 fn stress_mixed_boolean_operators() {
19937 let query = "a AND b OR c NOT d AND e OR f NOT g ".repeat(50);
19939
19940 let start = std::time::Instant::now();
19941 let tokens = parse_boolean_query(&query);
19942 let elapsed = start.elapsed();
19943
19944 assert!(
19945 elapsed < std::time::Duration::from_secs(1),
19946 "Mixed boolean ops took {:?} (>1s)",
19947 elapsed
19948 );
19949 assert!(
19950 !tokens.is_empty(),
19951 "Complex boolean query should produce tokens"
19952 );
19953 }
19954
19955 #[test]
19956 fn stress_memory_bounds_large_query() {
19957 let large_query = "x".repeat(100000);
19961
19962 let sanitized = sanitize_query(&large_query);
19963 let tokens = parse_boolean_query(&sanitized);
19964
19965 assert!(
19967 sanitized.len() <= large_query.len(),
19968 "Sanitized output should not exceed input size"
19969 );
19970
19971 assert_eq!(tokens.len(), 1);
19973
19974 let qt = QueryTermsLower::from_query(&large_query);
19976 let token_count = qt.tokens().count();
19977 assert_eq!(token_count, 1, "Should be 1 token of 100k chars");
19978 }
19979
19980 #[test]
19981 fn stress_concurrent_queries() {
19982 use std::thread;
19983
19984 let queries: Vec<String> = (0..100)
19985 .map(|i| format!("concurrent_query_{} test search", i))
19986 .collect();
19987
19988 let handles: Vec<_> = queries
19989 .into_iter()
19990 .map(|query| {
19991 thread::spawn(move || {
19992 let sanitized = sanitize_query(&query);
19993 let tokens = parse_boolean_query(&sanitized);
19994 let qt = QueryTermsLower::from_query(&query);
19995 (tokens.len(), qt.tokens().count())
19996 })
19997 })
19998 .collect();
19999
20000 for (i, handle) in handles.into_iter().enumerate() {
20001 let (token_len, qt_len) = handle.join().expect("Thread panicked");
20002 assert!(token_len > 0, "Query {} should produce tokens", i);
20003 assert!(qt_len > 0, "Query {} QueryTermsLower should have tokens", i);
20004 }
20005 }
20006
20007 #[test]
20008 fn stress_many_quoted_phrases() {
20009 let phrases: Vec<String> = (0..50)
20011 .map(|i| format!("\"phrase number {}\"", i))
20012 .collect();
20013 let query = phrases.join(" AND ");
20014
20015 let start = std::time::Instant::now();
20016 let tokens = parse_boolean_query(&query);
20017 let elapsed = start.elapsed();
20018
20019 assert!(
20020 elapsed < std::time::Duration::from_secs(1),
20021 "50 quoted phrases took {:?} (>1s)",
20022 elapsed
20023 );
20024
20025 let phrase_count = tokens
20026 .iter()
20027 .filter(|t| matches!(t, QueryToken::Phrase(_)))
20028 .count();
20029 assert_eq!(phrase_count, 50, "Should have 50 phrases");
20030 }
20031
20032 #[test]
20033 fn stress_alternating_quotes() {
20034 let parts: Vec<String> = (0..100)
20036 .map(|i| {
20037 if i % 2 == 0 {
20038 format!("\"word{}\"", i)
20039 } else {
20040 format!("word{}", i)
20041 }
20042 })
20043 .collect();
20044 let query = parts.join(" ");
20045
20046 let start = std::time::Instant::now();
20047 let tokens = parse_boolean_query(&query);
20048 let elapsed = start.elapsed();
20049
20050 assert!(
20051 elapsed < std::time::Duration::from_secs(1),
20052 "100 alternating quotes took {:?} (>1s)",
20053 elapsed
20054 );
20055
20056 let phrase_count = tokens
20057 .iter()
20058 .filter(|t| matches!(t, QueryToken::Phrase(_)))
20059 .count();
20060 let term_count = tokens
20061 .iter()
20062 .filter(|t| matches!(t, QueryToken::Term(_)))
20063 .count();
20064
20065 assert_eq!(phrase_count, 50, "Should have 50 phrases");
20066 assert_eq!(term_count, 50, "Should have 50 terms");
20067 }
20068
20069 #[test]
20070 fn stress_many_wildcards() {
20071 let patterns: Vec<&str> = vec!["pre*", "*suf", "*sub*", "a*b", "test*", "*ing", "*tion*"];
20073 let query = patterns
20074 .iter()
20075 .cycle()
20076 .take(100)
20077 .cloned()
20078 .collect::<Vec<_>>()
20079 .join(" ");
20080
20081 let start = std::time::Instant::now();
20082 let sanitized = sanitize_query(&query);
20083 let tokens = parse_boolean_query(&sanitized);
20084 let elapsed = start.elapsed();
20085
20086 assert!(
20087 elapsed < std::time::Duration::from_secs(1),
20088 "100 wildcards took {:?} (>1s)",
20089 elapsed
20090 );
20091 assert!(!tokens.is_empty());
20092 }
20093
20094 #[test]
20095 fn stress_query_explanation_large_query() {
20096 let words: Vec<String> = (0..100).map(|i| format!("term{}", i)).collect();
20098 let query = words.join(" ");
20099 let filters = SearchFilters::default();
20100
20101 let start = std::time::Instant::now();
20102 let explanation = QueryExplanation::analyze(&query, &filters);
20103 let elapsed = start.elapsed();
20104
20105 assert!(
20106 elapsed < std::time::Duration::from_secs(2),
20107 "QueryExplanation for 100 terms took {:?} (>2s)",
20108 elapsed
20109 );
20110 assert!(
20111 !explanation.parsed.terms.is_empty(),
20112 "Should parse terms successfully"
20113 );
20114 }
20115
20116 #[test]
20117 fn stress_very_long_single_quoted_phrase() {
20118 let words: Vec<String> = (0..500).map(|i| format!("word{}", i)).collect();
20120 let phrase = format!("\"{}\"", words.join(" "));
20121
20122 let start = std::time::Instant::now();
20123 let tokens = parse_boolean_query(&phrase);
20124 let elapsed = start.elapsed();
20125
20126 assert!(
20127 elapsed < std::time::Duration::from_secs(1),
20128 "500-word phrase took {:?} (>1s)",
20129 elapsed
20130 );
20131
20132 let phrase_count = tokens
20133 .iter()
20134 .filter(|t| matches!(t, QueryToken::Phrase(_)))
20135 .count();
20136 assert_eq!(phrase_count, 1, "Should have exactly 1 phrase");
20137 }
20138
20139 #[test]
20140 fn stress_not_prefix_many() {
20141 let terms: Vec<String> = (0..100).map(|i| format!("-term{}", i)).collect();
20143 let query = terms.join(" ");
20144
20145 let start = std::time::Instant::now();
20146 let tokens = parse_boolean_query(&query);
20147 let elapsed = start.elapsed();
20148
20149 assert!(
20150 elapsed < std::time::Duration::from_secs(1),
20151 "100 NOT prefixes took {:?} (>1s)",
20152 elapsed
20153 );
20154
20155 let not_count = tokens
20156 .iter()
20157 .filter(|t| matches!(t, QueryToken::Not))
20158 .count();
20159 assert_eq!(not_count, 100, "Should have 100 NOT operators");
20160 }
20161
20162 #[test]
20163 fn stress_unicode_large_cjk_query() {
20164 let cjk_chars = "中文日本語한국어".repeat(1000);
20166
20167 let start = std::time::Instant::now();
20168 let sanitized = sanitize_query(&cjk_chars);
20169 let qt = QueryTermsLower::from_query(&sanitized);
20170 let elapsed = start.elapsed();
20171
20172 assert!(
20173 elapsed < std::time::Duration::from_secs(1),
20174 "Large CJK query took {:?} (>1s)",
20175 elapsed
20176 );
20177 assert!(!qt.is_empty(), "CJK query should produce tokens");
20178 }
20179
20180 #[test]
20181 fn stress_unicode_many_emoji() {
20182 let emoji_query = "🚀 🔍 📝 💻 🎯 ".repeat(500);
20184
20185 let start = std::time::Instant::now();
20186 let sanitized = sanitize_query(&emoji_query);
20187 let tokens = parse_boolean_query(&sanitized);
20188 let elapsed = start.elapsed();
20189
20190 assert!(
20191 elapsed < std::time::Duration::from_secs(1),
20192 "Emoji query took {:?} (>1s)",
20193 elapsed
20194 );
20195 assert!(
20197 tokens.is_empty(),
20198 "Emoji-only query should produce no tokens"
20199 );
20200 }
20201
20202 #[test]
20203 fn stress_mixed_content_large() {
20204 let mixed = r#"
20206 function test() { return x + y; }
20207 SELECT * FROM users WHERE id = 1;
20208 The quick brown fox 狐狸 jumps over lazy dog
20209 Error: "undefined is not a function" at line 42
20210 https://example.com/path?query=value&other=123
20211 "#
20212 .repeat(100);
20213
20214 let start = std::time::Instant::now();
20215 let sanitized = sanitize_query(&mixed);
20216 let tokens = parse_boolean_query(&sanitized);
20217 let qt = QueryTermsLower::from_query(&mixed);
20218 let elapsed = start.elapsed();
20219
20220 assert!(
20221 elapsed < std::time::Duration::from_secs(2),
20222 "Mixed content query took {:?} (>2s)",
20223 elapsed
20224 );
20225 assert!(!tokens.is_empty());
20226 assert!(!qt.is_empty());
20227 }
20228
20229 #[test]
20236 fn unicode_emoji_mixed_with_alphanumeric() {
20237 let tokens = parse_boolean_query("rocket🚀launch");
20239 assert_eq!(tokens.len(), 1);
20240 let sanitized = sanitize_query("rocket🚀launch");
20242 assert_eq!(sanitized, "rocket launch");
20243
20244 let sanitized2 = sanitize_query("test🔥🎯code");
20246 assert_eq!(sanitized2, "test code");
20247 }
20248
20249 #[test]
20250 fn unicode_emoji_with_boolean_operators() {
20251 let tokens = parse_boolean_query("🚀code AND test");
20253 let term_count = tokens
20255 .iter()
20256 .filter(|t| matches!(t, QueryToken::Term(_)))
20257 .count();
20258 assert!(term_count >= 1, "Should have at least one term");
20259
20260 let tokens_or = parse_boolean_query("deploy OR 🎯target");
20262 let has_or = tokens_or.iter().any(|t| matches!(t, QueryToken::Or));
20263 assert!(has_or, "Should detect OR operator");
20264 }
20265
20266 #[test]
20267 fn unicode_emoji_at_word_boundaries() {
20268 let sanitized_start = sanitize_query("🔍search");
20270 assert_eq!(sanitized_start, " search");
20271
20272 let sanitized_end = sanitize_query("complete✅");
20274 assert_eq!(sanitized_end, "complete ");
20275
20276 let sanitized_only = sanitize_query("🎉🎊🎁");
20278 assert!(
20279 sanitized_only.trim().is_empty(),
20280 "Emoji-only should be empty after trimming"
20281 );
20282 }
20283
20284 #[test]
20287 fn unicode_arabic_text_preserved() {
20288 let arabic = "مرحبا بالعالم"; let sanitized = sanitize_query(arabic);
20291 assert_eq!(
20292 sanitized, arabic,
20293 "Arabic alphanumeric chars should be preserved"
20294 );
20295
20296 let tokens = parse_boolean_query(arabic);
20297 assert!(!tokens.is_empty(), "Arabic query should produce tokens");
20298 }
20299
20300 #[test]
20301 fn unicode_hebrew_text_preserved() {
20302 let hebrew = "שלום עולם"; let sanitized = sanitize_query(hebrew);
20305 assert_eq!(
20306 sanitized, hebrew,
20307 "Hebrew alphanumeric chars should be preserved"
20308 );
20309
20310 let tokens = parse_boolean_query(hebrew);
20311 assert!(!tokens.is_empty(), "Hebrew query should produce tokens");
20312 }
20313
20314 #[test]
20315 fn unicode_mixed_rtl_and_ltr() {
20316 let mixed = "hello مرحبا world";
20318 let sanitized = sanitize_query(mixed);
20319 assert_eq!(sanitized, mixed, "Mixed RTL/LTR should be preserved");
20320
20321 let tokens = parse_boolean_query(mixed);
20322 let term_count = tokens
20323 .iter()
20324 .filter(|t| matches!(t, QueryToken::Term(_)))
20325 .count();
20326 assert_eq!(term_count, 3, "Should have 3 terms");
20327 }
20328
20329 #[test]
20330 fn unicode_rtl_with_boolean_operators() {
20331 let hebrew_and = "שלום AND עולם";
20333 let tokens = parse_boolean_query(hebrew_and);
20334 let has_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
20335 assert!(has_and, "Should detect AND operator in Hebrew query");
20336
20337 let arabic_not = "مرحبا NOT بالعالم";
20339 let tokens_not = parse_boolean_query(arabic_not);
20340 let has_not = tokens_not.iter().any(|t| matches!(t, QueryToken::Not));
20341 assert!(has_not, "Should detect NOT operator in Arabic query");
20342 }
20343
20344 #[test]
20347 fn special_chars_backslash_stripped() {
20348 let query = r"path\to\file";
20350 let sanitized = sanitize_query(query);
20351 assert_eq!(sanitized, "path to file");
20352 }
20353
20354 #[test]
20355 fn special_chars_escaped_quotes_handling() {
20356 let query = r#"say \"hello\""#;
20358 let sanitized = sanitize_query(query);
20359 assert!(sanitized.contains('"'), "Quotes should be preserved");
20361 }
20362
20363 #[test]
20364 fn special_chars_windows_paths() {
20365 let path = r"C:\Users\test\Documents";
20367 let sanitized = sanitize_query(path);
20368 assert_eq!(sanitized, "C Users test Documents");
20369 }
20370
20371 #[test]
20374 fn boolean_deeply_nested_operators() {
20375 let query = "a AND b OR c NOT d AND e";
20377 let tokens = parse_boolean_query(query);
20378
20379 let mut and_count = 0;
20380 let mut or_count = 0;
20381 let mut not_count = 0;
20382 for token in &tokens {
20383 match token {
20384 QueryToken::And => and_count += 1,
20385 QueryToken::Or => or_count += 1,
20386 QueryToken::Not => not_count += 1,
20387 _ => {}
20388 }
20389 }
20390
20391 assert_eq!(and_count, 2, "Should have 2 AND operators");
20392 assert_eq!(or_count, 1, "Should have 1 OR operator");
20393 assert_eq!(not_count, 1, "Should have 1 NOT operator");
20394 }
20395
20396 #[test]
20397 fn boolean_consecutive_operators_degenerate() {
20398 let tokens = parse_boolean_query("foo AND AND bar");
20400 let term_count = tokens
20402 .iter()
20403 .filter(|t| matches!(t, QueryToken::Term(_)))
20404 .count();
20405 assert!(
20406 term_count >= 2,
20407 "Should have at least 2 terms (foo and bar)"
20408 );
20409 }
20410
20411 #[test]
20412 fn boolean_operator_at_start() {
20413 let tokens = parse_boolean_query("AND foo");
20415 let has_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
20416 assert!(has_and, "Leading AND should be detected");
20417
20418 let tokens_or = parse_boolean_query("OR test");
20419 let has_or = tokens_or.iter().any(|t| matches!(t, QueryToken::Or));
20420 assert!(has_or, "Leading OR should be detected");
20421 }
20422
20423 #[test]
20424 fn boolean_operator_at_end() {
20425 let tokens = parse_boolean_query("foo AND");
20427 let has_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
20428 assert!(has_and, "Trailing AND should be detected");
20429 }
20430
20431 #[test]
20434 fn numeric_query_digits_only() {
20435 let tokens = parse_boolean_query("12345");
20437 assert_eq!(tokens.len(), 1);
20438 assert_eq!(tokens[0], QueryToken::Term("12345".to_string()));
20439
20440 let sanitized = sanitize_query("12345");
20441 assert_eq!(sanitized, "12345");
20442 }
20443
20444 #[test]
20445 fn numeric_query_with_text() {
20446 let tokens = parse_boolean_query("error 404 not found");
20448 let term_count = tokens
20449 .iter()
20450 .filter(|t| matches!(t, QueryToken::Term(_)))
20451 .count();
20452 assert!(term_count >= 3, "Should have at least 3 terms");
20454 }
20455
20456 #[test]
20457 fn numeric_versions_with_dots() {
20458 let sanitized = sanitize_query("version 1.2.3");
20460 assert_eq!(sanitized, "version 1 2 3"); }
20462
20463 #[test]
20466 fn whitespace_tabs_treated_as_separators() {
20467 let tokens = parse_boolean_query("foo\tbar\tbaz");
20468 let term_count = tokens
20469 .iter()
20470 .filter(|t| matches!(t, QueryToken::Term(_)))
20471 .count();
20472 assert_eq!(term_count, 3, "Tabs should separate terms");
20473 }
20474
20475 #[test]
20476 fn whitespace_newlines_treated_as_separators() {
20477 let tokens = parse_boolean_query("foo\nbar\nbaz");
20478 let term_count = tokens
20479 .iter()
20480 .filter(|t| matches!(t, QueryToken::Term(_)))
20481 .count();
20482 assert_eq!(term_count, 3, "Newlines should separate terms");
20483 }
20484
20485 #[test]
20486 fn whitespace_mixed_types() {
20487 let tokens = parse_boolean_query("a \t b \n c d");
20488 let term_count = tokens
20489 .iter()
20490 .filter(|t| matches!(t, QueryToken::Term(_)))
20491 .count();
20492 assert_eq!(term_count, 4, "Mixed whitespace should separate properly");
20493 }
20494
20495 #[test]
20498 fn stress_very_long_single_term() {
20499 let long_term = "a".repeat(10_000);
20501
20502 let start = std::time::Instant::now();
20503 let tokens = parse_boolean_query(&long_term);
20504 let elapsed = start.elapsed();
20505
20506 assert!(
20507 elapsed < std::time::Duration::from_secs(1),
20508 "10K char term took {:?} (>1s)",
20509 elapsed
20510 );
20511 assert_eq!(tokens.len(), 1);
20512 assert!(
20513 matches!(tokens.first(), Some(QueryToken::Term(t)) if t.len() == 10_000),
20514 "Expected 10K Term token, got {tokens:?}"
20515 );
20516 }
20517
20518 #[test]
20519 fn stress_very_long_term_with_wildcard() {
20520 let long_pattern = format!("{}*", "prefix".repeat(1000));
20522
20523 let start = std::time::Instant::now();
20524 let sanitized = sanitize_query(&long_pattern);
20525 let pattern = WildcardPattern::parse(&sanitized);
20526 let elapsed = start.elapsed();
20527
20528 assert!(
20529 elapsed < std::time::Duration::from_secs(1),
20530 "Long wildcard pattern took {:?} (>1s)",
20531 elapsed
20532 );
20533 assert!(
20534 matches!(pattern, WildcardPattern::Prefix(_)),
20535 "Should parse as prefix pattern"
20536 );
20537 }
20538
20539 #[test]
20542 fn query_explanation_empty_query() {
20543 let explanation = QueryExplanation::analyze("", &SearchFilters::default());
20544 assert_eq!(explanation.query_type, QueryType::Empty);
20545 }
20546
20547 #[test]
20548 fn search_mode_default_is_hybrid_preferred() {
20549 assert_eq!(SearchMode::default(), SearchMode::Hybrid);
20550 }
20551
20552 #[test]
20553 fn query_explanation_whitespace_only_query() {
20554 let explanation = QueryExplanation::analyze(" \t\n ", &SearchFilters::default());
20555 assert_eq!(explanation.query_type, QueryType::Empty);
20556 }
20557
20558 #[test]
20559 fn query_explanation_unicode_query() {
20560 let explanation = QueryExplanation::analyze("日本語 search", &SearchFilters::default());
20561 assert!(!explanation.parsed.terms.is_empty());
20563 }
20564
20565 #[test]
20568 fn query_terms_lower_unicode_normalization() {
20569 let terms = QueryTermsLower::from_query("CAFÉ RÉSUMÉ");
20571 assert_eq!(terms.query_lower, "café résumé");
20572 }
20573
20574 #[test]
20575 fn query_terms_lower_mixed_case_unicode() {
20576 let terms = QueryTermsLower::from_query("Hello日本語World");
20578 assert!(terms.query_lower.contains("hello"));
20580 assert!(terms.query_lower.contains("world"));
20581 }
20582
20583 #[test]
20584 fn query_terms_lower_preserves_numbers() {
20585 let terms = QueryTermsLower::from_query("ABC123XYZ");
20586 assert_eq!(terms.query_lower, "abc123xyz");
20587 }
20588
20589 #[test]
20592 fn wildcard_pattern_internal_asterisk() {
20593 let pattern = WildcardPattern::parse("f*o");
20595 assert!(
20596 matches!(pattern, WildcardPattern::Complex(_)),
20597 "Internal asterisk should be Complex"
20598 );
20599 }
20600
20601 #[test]
20602 fn wildcard_pattern_multiple_internal_asterisks() {
20603 let pattern = WildcardPattern::parse("a*b*c");
20605 assert!(
20606 matches!(pattern, WildcardPattern::Complex(_)),
20607 "Multiple internal asterisks should be Complex"
20608 );
20609 }
20610
20611 #[test]
20612 fn wildcard_pattern_regex_escapes_special_chars() {
20613 let pattern = WildcardPattern::parse("*foo.bar*");
20615 if let Some(regex) = pattern.to_regex() {
20616 assert!(
20617 regex.contains("\\."),
20618 "Dot should be escaped in regex: {}",
20619 regex
20620 );
20621 }
20622 }
20623
20624 #[test]
20625 fn wildcard_pattern_complex_regex_generation() {
20626 let pattern = WildcardPattern::parse("f*o*o");
20627 if let Some(regex) = pattern.to_regex() {
20628 assert!(
20630 regex.contains(".*"),
20631 "Should have .* for internal wildcards: {}",
20632 regex
20633 );
20634 }
20635 }
20636
20637 #[test]
20638 fn test_transpile_to_fts5() {
20639 assert_eq!(
20641 transpile_to_fts5("foo bar"),
20642 Some("foo AND bar".to_string())
20643 );
20644
20645 assert_eq!(
20647 transpile_to_fts5("foo AND bar"),
20648 Some("foo AND bar".to_string())
20649 );
20650 assert_eq!(
20651 transpile_to_fts5("foo OR bar"),
20652 Some("(foo OR bar)".to_string())
20653 );
20654 assert_eq!(transpile_to_fts5("OR foo"), Some("foo".to_string()));
20655 assert_eq!(transpile_to_fts5("NOT foo"), None);
20656
20657 assert_eq!(
20660 transpile_to_fts5("A AND B OR C"),
20661 Some("A AND (B OR C)".to_string())
20662 );
20663
20664 assert_eq!(
20666 transpile_to_fts5("A OR B AND C"),
20667 Some("(A OR B) AND C".to_string())
20668 );
20669
20670 assert_eq!(
20672 transpile_to_fts5("A OR B OR C"),
20673 Some("(A OR B OR C)".to_string())
20674 );
20675
20676 assert_eq!(
20678 transpile_to_fts5("\"foo bar\""),
20679 Some("\"foo bar\"".to_string())
20680 );
20681
20682 assert_eq!(transpile_to_fts5("foo*"), Some("foo*".to_string()));
20684
20685 assert_eq!(transpile_to_fts5("*foo"), None);
20687 assert_eq!(transpile_to_fts5("f*o"), None);
20688
20689 assert_eq!(
20692 transpile_to_fts5("foo-bar"),
20693 Some("(foo AND bar)".to_string())
20694 );
20695 assert_eq!(
20696 transpile_to_fts5("foo-bar*"),
20697 Some("(foo AND bar*)".to_string())
20698 );
20699 assert_eq!(
20700 transpile_to_fts5("br-123.jsonl"),
20701 Some("(br AND 123 AND jsonl)".to_string())
20702 );
20703 assert_eq!(
20704 transpile_to_fts5("br-123.json*"),
20705 Some("(br AND 123 AND json*)".to_string())
20706 );
20707
20708 assert_eq!(transpile_to_fts5("NOT A OR B"), None);
20710 }
20711
20712 #[test]
20713 fn semantic_doc_id_roundtrip_from_query() {
20714 let hash_hex = "00".repeat(32);
20715 let doc_id = format!("m|42|2|3|7|11|1|1700000000000|{hash_hex}");
20716 let parsed = parse_semantic_doc_id(&doc_id).expect("roundtrip parse");
20717 assert_eq!(parsed.message_id, 42);
20718 assert_eq!(parsed.chunk_idx, 2);
20719 assert_eq!(parsed.agent_id, 3);
20720 assert_eq!(parsed.workspace_id, 7);
20721 assert_eq!(parsed.source_id, 11);
20722 assert_eq!(parsed.role, 1);
20723 assert_eq!(parsed.created_at_ms, 1_700_000_000_000);
20724 }
20725
20726 #[test]
20727 fn semantic_filter_applies_all_constraints() {
20728 use frankensearch::core::filter::SearchFilter;
20729
20730 let filter = SemanticFilter {
20731 agents: Some(HashSet::from([3])),
20732 workspaces: Some(HashSet::from([7])),
20733 sources: Some(HashSet::from([11])),
20734 roles: Some(HashSet::from([1])),
20735 created_from: Some(1_700_000_000_000),
20736 created_to: Some(1_700_000_000_100),
20737 };
20738
20739 assert!(filter.matches("m|42|2|3|7|11|1|1700000000001", None));
20740 assert!(!filter.matches("m|42|2|99|7|11|1|1700000000001", None));
20741 assert!(!filter.matches("m|42|2|3|7|11|1|1699999999999", None));
20742 assert!(!filter.matches("not-a-doc-id", None));
20743 }
20744
20745 #[test]
20746 fn fs_semantic_index_runs_filtered_search() -> Result<()> {
20747 let temp = TempDir::new()?;
20748 let index_path = crate::search::vector_index::vector_index_path(temp.path(), "embed-fast");
20749 if let Some(parent) = index_path.parent() {
20750 std::fs::create_dir_all(parent)?;
20751 }
20752
20753 let hash_a = "00".repeat(32);
20754 let hash_b = "11".repeat(32);
20755 let doc_a = format!("m|101|0|1|10|100|1|1700000000001|{hash_a}");
20756 let doc_b = format!("m|202|0|2|20|200|1|1700000000002|{hash_b}");
20757
20758 let mut writer = VectorIndex::create_with_revision(
20759 &index_path,
20760 "embed-fast",
20761 "rev-1",
20762 2,
20763 frankensearch::index::Quantization::F16,
20764 )
20765 .map_err(|err| anyhow!("create fsvi index failed: {err}"))?;
20766 writer
20767 .write_record(&doc_a, &[1.0, 0.0])
20768 .map_err(|err| anyhow!("write_record failed: {err}"))?;
20769 writer
20770 .write_record(&doc_b, &[0.0, 1.0])
20771 .map_err(|err| anyhow!("write_record failed: {err}"))?;
20772 writer
20773 .finish()
20774 .map_err(|err| anyhow!("finish fsvi index failed: {err}"))?;
20775
20776 let fs_index =
20777 VectorIndex::open(&index_path).map_err(|err| anyhow!("open fsvi failed: {err}"))?;
20778 let filter = SemanticFilter {
20779 agents: Some(HashSet::from([1])),
20780 workspaces: None,
20781 sources: None,
20782 roles: None,
20783 created_from: None,
20784 created_to: None,
20785 };
20786 let fs_filter = semantic_filter_as_search_filter(&filter).expect("expected active filter");
20787 let hits = fs_index
20788 .search_top_k(&[1.0, 0.0], 5, Some(fs_filter))
20789 .map_err(|err| anyhow!("frankensearch search failed: {err}"))?;
20790 assert_eq!(hits.len(), 1);
20791 let parsed = parse_semantic_doc_id(&hits[0].doc_id).expect("parse bridged doc_id");
20792 assert_eq!(parsed.message_id, 101);
20793 assert_eq!(parsed.agent_id, 1);
20794 Ok(())
20795 }
20796
20797 #[test]
20809 fn hit_is_noise_returns_false_when_content_and_snippet_both_empty() {
20810 let hit = SearchHit {
20811 title: String::new(),
20812 snippet: String::new(),
20813 content: String::new(),
20814 content_hash: 0,
20815 conversation_id: Some(1),
20816 score: 1.0,
20817 source_path: "/tmp/session.jsonl".to_string(),
20818 agent: "codex".to_string(),
20819 workspace: String::new(),
20820 workspace_original: None,
20821 created_at: Some(1700000000000),
20822 line_number: Some(1),
20823 match_type: MatchType::Exact,
20824 source_id: "local".to_string(),
20825 origin_kind: "local".to_string(),
20826 origin_host: None,
20827 };
20828
20829 assert!(
20833 !hit_is_noise(&hit, "anything"),
20834 "hit with empty content AND snippet (projection-only) must NOT be classified as noise"
20835 );
20836 assert!(
20837 !hit_is_noise(&hit, ""),
20838 "noise classifier must not treat an empty-query projection-only hit as noise"
20839 );
20840 }
20841
20842 #[test]
20847 fn hit_is_noise_still_drops_tool_acknowledgement_when_content_present() {
20848 let hit = SearchHit {
20849 title: String::new(),
20850 snippet: String::new(),
20851 content: "ok".to_string(),
20852 content_hash: 0,
20853 conversation_id: Some(1),
20854 score: 1.0,
20855 source_path: "/tmp/session.jsonl".to_string(),
20856 agent: "codex".to_string(),
20857 workspace: String::new(),
20858 workspace_original: None,
20859 created_at: Some(1700000000000),
20860 line_number: Some(1),
20861 match_type: MatchType::Exact,
20862 source_id: "local".to_string(),
20863 origin_kind: "local".to_string(),
20864 origin_host: None,
20865 };
20866
20867 assert!(
20868 hit_is_noise(&hit, ""),
20869 "bare tool-ack 'ok' with content present should still be dropped as noise"
20870 );
20871 }
20872}