1use anyhow::{Context, Result, anyhow, bail};
2use crossbeam_channel as mpsc;
3use frankensearch::lexical::{
4 BooleanQuery, CASS_SCHEMA_HASH as FS_CASS_SCHEMA_HASH, CassFields as FsCassFields,
5 CassQueryFilters as FsCassQueryFilters, CassQueryToken as FsCassQueryToken,
6 CassSourceFilter as FsCassSourceFilter, CassWildcardPattern as FsCassWildcardPattern, Count,
7 IndexReader, IndexRecordOption, LexicalDocHit as FsLexicalDocHit,
8 LexicalSearchResult as FsLexicalSearchResult, Occur, Query, ReloadPolicy, Searcher,
9 SnippetConfig as FsSnippetConfig, TantivyDocument, Term, TermQuery, TopDocs, Value,
10 cass_build_tantivy_query as fs_cass_build_tantivy_query,
11 cass_has_boolean_operators as fs_cass_has_boolean_operators,
12 cass_open_search_reader as fs_cass_open_search_reader,
13 cass_parse_boolean_query as fs_cass_parse_boolean_query,
14 cass_sanitize_query as fs_cass_sanitize_query, load_doc as fs_load_doc,
15 render_snippet_html as fs_render_snippet_html,
16 try_build_snippet_generator as fs_try_build_snippet_generator,
17};
18use frankensearch::{
19 Cx as FsCx, InMemoryTwoTierIndex as FsInMemoryTwoTierIndex,
20 InMemoryVectorIndex as FsInMemoryVectorIndex, LexicalSearch as FsLexicalSearch,
21 QueryClass as FsQueryClass, RrfConfig as FsRrfConfig, ScoreSource as FsScoreSource,
22 ScoredResult as FsScoredResult, SearchError as FsSearchError, SearchFuture as FsSearchFuture,
23 SearchPhase as FsSearchPhase, SyncEmbedderAdapter as FsSyncEmbedderAdapter,
24 SyncTwoTierSearcher as FsSyncTwoTierSearcher, TwoTierConfig as FsTwoTierConfig,
25 TwoTierIndex as FsTwoTierIndex, TwoTierSearcher as FsTwoTierSearcher, VectorHit as FsVectorHit,
26 candidate_count as fs_candidate_count,
27 core::filter::SearchFilter as FsSearchFilter,
28 index::{
29 HNSW_DEFAULT_EF_SEARCH as FS_HNSW_DEFAULT_EF_SEARCH, HnswIndex as FsHnswIndex,
30 VectorIndex as FsVectorIndex,
31 },
32 rrf_fuse as fs_rrf_fuse,
33};
34use lru::LruCache;
35use once_cell::sync::Lazy;
36use parking_lot::RwLock;
37use std::cell::RefCell;
38use std::cmp::Ordering as CmpOrdering;
39use std::collections::{HashMap, HashSet, VecDeque};
40use std::hash::{Hash, Hasher};
41use std::num::NonZeroUsize;
42use std::path::{Path, PathBuf};
43use std::sync::atomic::{AtomicU64, Ordering};
44use std::sync::{Arc, Mutex};
45use std::time::{Duration, Instant};
46
47use frankensqlite::Connection;
48#[cfg(test)]
49use frankensqlite::compat::OptionalExtension;
50use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
51#[cfg(test)]
52use frankensqlite::params;
53
/// Newtype wrapper around a frankensqlite [`Connection`].
///
/// This file provides an `unsafe impl Send` and a `Deref<Target = Connection>`
/// for the wrapper, so it can be moved to another thread while still being
/// used like a plain connection reference.
struct SendConnection(Connection);
62
/// Exact lookup key for message content: `(conversation_id, line_idx)`, which
/// is how `hydrate_message_content_by_conversation` destructures it.
type TantivyContentExactKey = (i64, i64);
/// Secondary lookup key for message content.
///
/// NOTE(review): the meaning of the `(String, String, i64)` components is not
/// visible in this part of the file — confirm against the code that builds it.
type TantivyContentFallbackKey = (String, String, i64);
/// Pair of content maps: the first keyed by [`TantivyContentExactKey`], the
/// second keyed by [`TantivyContentFallbackKey`].
type TantivyHydratedContentMaps = (
    HashMap<TantivyContentExactKey, String>,
    HashMap<TantivyContentFallbackKey, String>,
);
/// Positional row tuple: one mandatory `i64` followed by seven nullable
/// columns.
///
/// NOTE(review): the column each position maps to is defined by the SELECT
/// that produces these rows, which is not visible here — confirm the order
/// against that query before reordering anything.
type SqliteFtsHydratedRow = (
    i64,
    Option<i64>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<String>,
    Option<i64>,
);
/// Positional row tuple: an `i64`, five mandatory strings, three nullable
/// integers and three nullable strings.
///
/// NOTE(review): the column each position maps to is defined by the SELECT
/// that produces these rows, which is not visible here — confirm the order
/// against that query before reordering anything.
type SqliteFtsMessageRow = (
    i64,
    String,
    String,
    String,
    String,
    String,
    Option<i64>,
    Option<i64>,
    Option<i64>,
    Option<String>,
    Option<String>,
    Option<String>,
);
/// One alternative inside a scan group: a list of strings.
type SqliteMessageScanAlternative = Vec<String>;
/// A group of alternatives.
///
/// NOTE(review): presumably a group matches when any one of its alternatives
/// matches — the evaluation code is not visible here, confirm.
type SqliteMessageScanGroup = Vec<SqliteMessageScanAlternative>;
/// Query shape used by the SQLite message-scan path: groups to include plus
/// terms to exclude.
struct SqliteMessageScanQuery {
    /// Groups of alternatives that describe what a message must contain.
    include_groups: Vec<SqliteMessageScanGroup>,
    /// Terms that rule a message out.
    exclude_terms: Vec<String>,
}
99
/// Borrowed, copyable bundle of the inputs for one SQLite message scan.
#[derive(Clone, Copy)]
struct SqliteMessageScanRequest<'a> {
    /// Query text as supplied by the caller.
    raw_query: &'a str,
    /// Filters that apply to the scan.
    filters: &'a SearchFilters,
    /// Page size.
    limit: usize,
    /// Page offset.
    offset: usize,
    /// Which hit fields the caller wants populated.
    field_mask: FieldMask,
    /// Match type associated with the query.
    query_match_type: MatchType,
}
109
/// Selects how an FTS `MATCH` is targeted.
///
/// NOTE(review): going by the variant names, `Table` matches against the whole
/// FTS table and `IndexedColumns` against specific indexed columns; the code
/// that consumes this enum is not visible here — confirm.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum SqliteFtsMatchMode {
    Table,
    IndexedColumns,
}
115
/// Chunk size for bound parameters in FTS5 hydration queries.
///
/// NOTE(review): not referenced in the visible part of the file; the value
/// sits below `SQLITE_MAX_VARIABLE_NUMBER`, presumably on purpose — confirm.
const SQLITE_FTS5_HYDRATE_PARAM_CHUNK: usize = 30_000;
/// Upper bound on bound parameters per SQL statement assumed by this module.
const SQLITE_MAX_VARIABLE_NUMBER: usize = 32_766;
/// Chunk size for FTS5 post-filter scans (use site not visible here).
const SQLITE_FTS5_POST_FILTER_SCAN_CHUNK: usize = 1_024;
/// Overall limit for FTS5 post-filter scans (use site not visible here).
const SQLITE_FTS5_POST_FILTER_SCAN_LIMIT: usize = 30_000;
/// Limit for the message-scan fallback (use site not visible here).
const SQLITE_MESSAGE_SCAN_FALLBACK_LIMIT: usize = 30_000;
/// Page-cache size for hydration connections, in KiB.
///
/// `open_search_hydration_sqlite` interpolates it, negated, into
/// `PRAGMA cache_size`.
const SEARCH_SQLITE_HYDRATION_CACHE_KIB: i64 = 4_096;
/// Over-fetch multiplier for exact semantic chunk search (use site not visible
/// here).
const SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER: usize = 4;
126
// SAFETY: NOTE(review) — the justification is not visible in this part of the
// file. This impl asserts that moving the wrapped `Connection` to another
// thread is sound; confirm that `Connection` holds no thread-affine state and
// that a `SendConnection` is never used from two threads at the same time.
unsafe impl Send for SendConnection {}
130
131impl std::ops::Deref for SendConnection {
132 type Target = Connection;
133 fn deref(&self) -> &Connection {
134 &self.0
135 }
136}
137
138fn open_search_hydration_sqlite(path: &Path, timeout: Duration) -> Result<Connection> {
139 let conn =
140 crate::storage::sqlite::open_franken_raw_readonly_connection_with_timeout(path, timeout)?;
141 conn.execute("PRAGMA query_only = 1;")
142 .with_context(|| "setting search hydration query_only")?;
143 conn.execute("PRAGMA busy_timeout = 5000;")
144 .with_context(|| "setting search hydration busy_timeout")?;
145 conn.execute(&format!(
146 "PRAGMA cache_size = -{SEARCH_SQLITE_HYDRATION_CACHE_KIB};"
147 ))
148 .with_context(|| "setting search hydration cache_size")?;
149 Ok(conn)
150}
151
152fn nfc_sanitize_query(raw: &str) -> String {
156 use unicode_normalization::UnicodeNormalization;
157 let nfc: String = raw.nfc().collect();
158 fs_cass_sanitize_query(&nfc)
159}
160
161fn franken_query_map_collect_retry<T, F>(
162 conn: &Connection,
163 sql: &str,
164 params: &[ParamValue],
165 map: F,
166) -> Result<Vec<T>, frankensqlite::FrankenError>
167where
168 F: Copy + Fn(&frankensqlite::Row) -> Result<T, frankensqlite::FrankenError>,
169{
170 let deadline = Instant::now() + Duration::from_secs(2);
171 let mut backoff = Duration::from_millis(4);
172 loop {
173 match conn.query_map_collect(sql, params, |row| map(row)) {
174 Ok(values) => return Ok(values),
175 Err(err) if crate::storage::sqlite::retryable_franken_error(&err) => {
176 let now = Instant::now();
177 if now >= deadline {
178 return Err(err);
179 }
180 let remaining = deadline.saturating_duration_since(now);
181 crate::storage::sqlite::sleep_with_franken_retry_backoff(
182 &mut backoff,
183 remaining,
184 Duration::from_millis(64),
185 );
186 }
187 Err(err) => return Err(err),
188 }
189 }
190}
191
192fn hydrate_message_content_by_conversation(
193 conn: &Connection,
194 requests: &[TantivyContentExactKey],
195) -> Result<HashMap<TantivyContentExactKey, String>> {
196 if requests.is_empty() {
197 return Ok(HashMap::new());
198 }
199
200 let mut wanted_by_conversation: HashMap<i64, HashSet<i64>> = HashMap::new();
201 for &(conversation_id, line_idx) in requests {
202 wanted_by_conversation
203 .entry(conversation_id)
204 .or_default()
205 .insert(line_idx);
206 }
207
208 let mut conversation_ids = wanted_by_conversation.keys().copied().collect::<Vec<_>>();
209 conversation_ids.sort_unstable();
210 let mut hydrated = HashMap::with_capacity(requests.len());
211
212 for conversation_id in conversation_ids {
213 let Some(wanted_indices) = wanted_by_conversation.get(&conversation_id) else {
214 continue;
215 };
216 let mut wanted_indices = wanted_indices.iter().copied().collect::<Vec<_>>();
217 wanted_indices.sort_unstable();
218 let placeholders = sql_placeholders(wanted_indices.len());
219 let sql = format!(
220 "SELECT m.conversation_id, m.idx, m.content
221 FROM messages m INDEXED BY sqlite_autoindex_messages_1
222 WHERE m.conversation_id = ? AND m.idx IN ({placeholders})
223 ORDER BY m.idx"
224 );
225 let mut params = Vec::with_capacity(wanted_indices.len() + 1);
226 params.push(ParamValue::from(conversation_id));
227 params.extend(wanted_indices.iter().copied().map(ParamValue::from));
228 let rows: Vec<(i64, i64, String)> =
229 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
230 Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?))
231 })?;
232 for (conversation_id, line_idx, content) in rows {
233 hydrated.insert((conversation_id, line_idx), content);
234 }
235 }
236
237 Ok(hydrated)
238}
239
/// Convert a database `message_id` into the unsigned id used by the semantic
/// index.
///
/// # Errors
/// Returns an `io::Error` with the message `negative message_id` when the
/// value is below zero.
fn semantic_message_id_from_db(message_id: i64) -> std::io::Result<u64> {
    match u64::try_from(message_id) {
        Ok(id) => Ok(id),
        Err(_) => Err(std::io::Error::other("negative message_id")),
    }
}
243
/// Convert an optional database integer into a `u32` component id.
///
/// `None` and negative values map to `0`; values larger than `u32::MAX`
/// saturate to `u32::MAX`.
fn semantic_doc_component_id_from_db(raw: Option<i64>) -> u32 {
    let Some(value) = raw else {
        return 0;
    };
    if value <= 0 {
        return 0;
    }
    u32::try_from(value).unwrap_or(u32::MAX)
}
248
249use crate::search::canonicalize::{canonicalize_for_embedding, content_hash, is_search_noise_text};
250use crate::search::embedder::Embedder;
251use crate::search::vector_index::{
252 ROLE_USER, SemanticDocId, SemanticFilter, SemanticFilterMaps, VectorIndex, VectorSearchResult,
253 parse_semantic_doc_id, role_code_from_str,
254};
255use crate::sources::provenance::SourceFilter;
256
257pub struct StringInterner {
268 cache: RwLock<LruCache<Arc<str>, Arc<str>>>,
269}
270
271impl StringInterner {
272 pub fn new(capacity: usize) -> Self {
274 Self {
275 cache: RwLock::new(LruCache::new(
276 NonZeroUsize::new(capacity).expect("capacity must be > 0"),
277 )),
278 }
279 }
280
281 pub fn intern(&self, s: &str) -> Arc<str> {
287 {
289 let cache = self.cache.read();
290 if let Some(arc) = cache.peek(s) {
293 return Arc::clone(arc);
294 }
295 }
296
297 let mut cache = self.cache.write();
299
300 if let Some(arc) = cache.get(s) {
303 return Arc::clone(arc);
304 }
305
306 let arc: Arc<str> = Arc::from(s);
308 cache.put(Arc::clone(&arc), Arc::clone(&arc));
309 arc
310 }
311
312 #[allow(dead_code)]
314 pub fn len(&self) -> usize {
315 self.cache.read().len()
316 }
317
318 #[allow(dead_code)]
320 pub fn is_empty(&self) -> bool {
321 self.cache.read().is_empty()
322 }
323}
324
/// Process-wide interner behind `intern_cache_key`, created on first use with
/// room for 10,000 strings.
static CACHE_KEY_INTERNER: Lazy<StringInterner> = Lazy::new(|| StringInterner::new(10_000));
328
329#[inline]
331fn intern_cache_key(s: &str) -> Arc<str> {
332 CACHE_KEY_INTERNER.intern(s)
333}
334
/// Build a comma-separated list of `count` SQL `?` placeholders.
///
/// `sql_placeholders(3)` yields `"?,?,?"`; a count of zero yields an empty
/// string.
#[inline]
pub fn sql_placeholders(count: usize) -> String {
    match count {
        0 => String::new(),
        n => {
            // `n` question marks plus `n - 1` commas.
            let mut placeholders = String::with_capacity(n.saturating_mul(2).saturating_sub(1));
            placeholders.push('?');
            for _ in 1..n {
                placeholders.push_str(",?");
            }
            placeholders
        }
    }
}
366
/// Filters that narrow a search. The default value has every set empty and
/// no time bounds.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct SearchFilters {
    /// Agents to restrict the search to; empty means no agent restriction is
    /// reported by `QueryExplanation`.
    pub agents: HashSet<String>,
    /// Workspaces to restrict the search to.
    pub workspaces: HashSet<String>,
    /// Lower bound on creation time.
    ///
    /// NOTE(review): units and inclusivity are not visible here — confirm.
    pub created_from: Option<i64>,
    /// Upper bound on creation time (same caveat as `created_from`).
    pub created_to: Option<i64>,
    /// Source/provenance filter; left out of serialized output when it is
    /// the "all" filter.
    #[serde(skip_serializing_if = "SourceFilter::is_all")]
    pub source_filter: SourceFilter,
    /// Session paths to restrict the search to; left out of serialized output
    /// when empty.
    #[serde(skip_serializing_if = "HashSet::is_empty")]
    pub session_paths: HashSet<String>,
}
380
381#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, clap::ValueEnum)]
382#[serde(rename_all = "snake_case")]
383pub enum SearchMode {
384 Lexical,
386 Semantic,
388 #[default]
390 Hybrid,
391}
392
393impl SearchMode {
394 pub fn next(self) -> Self {
395 match self {
396 SearchMode::Lexical => SearchMode::Semantic,
397 SearchMode::Semantic => SearchMode::Hybrid,
398 SearchMode::Hybrid => SearchMode::Lexical,
399 }
400 }
401}
402
403#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize)]
410#[serde(rename_all = "snake_case")]
411pub enum SemanticTierMode {
412 #[default]
413 Single,
414 Progressive,
415 FastOnly,
416 QualityOnly,
417}
418
419impl SemanticTierMode {
420 const fn wants_two_tier(self) -> bool {
421 !matches!(self, Self::Single)
422 }
423
424 fn to_frankensearch_config(self) -> FsTwoTierConfig {
425 let mut config = frankensearch_two_tier_config();
426 match self {
427 Self::Single | Self::Progressive => {}
428 Self::FastOnly => {
429 config.fast_only = true;
430 }
431 Self::QualityOnly => {
432 config.fast_only = false;
433 config.quality_weight = 1.0;
434 }
435 }
436 config
437 }
438}
439
/// Capacity of the progressive-search embedding cache (use site not visible
/// here).
const PROGRESSIVE_EMBEDDING_CACHE_CAPACITY: usize = 64;
/// Candidate multiplier for ANN search (use site not visible here).
const ANN_CANDIDATE_MULTIPLIER: usize = 4;
/// Minimum planning window `hybrid_candidate_budget` uses to size the semantic
/// stage when the caller asked for no limit.
const HYBRID_NO_LIMIT_PLANNING_WINDOW: usize = 64;
/// Baseline ceiling on semantic candidates for no-limit hybrid searches; the
/// ceiling actually applied grows with the requested offset.
const HYBRID_NO_LIMIT_SEMANTIC_CAP: usize = 2048;
/// Maximum token length, in characters, for the automatic wildcard fallback
/// (use site not visible here).
const AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS: usize = 16;
445
/// Smallest result cap a "no limit" search can resolve to.
pub const NO_LIMIT_RESULT_MIN: usize = 1_000;
/// Largest result cap a "no limit" search can resolve to.
pub const NO_LIMIT_RESULT_MAX: usize = 1_000_000;

/// Per-hit size estimate (80 KiB) used to turn a byte budget into a hit count.
const AVG_HIT_BYTES: u64 = 80 * 1024;

/// Upper clamp (16 GiB) for the byte budget derived from available memory.
const NO_LIMIT_BYTES_CEILING: u64 = 16 * 1024 * 1024 * 1024;

/// Lower clamp (256 MiB) for the memory-derived byte budget; also the budget
/// used when neither an override nor a memory reading is available.
const NO_LIMIT_BYTES_FLOOR: u64 = 256 * 1024 * 1024;

/// Divisor applied to available memory to obtain the byte budget.
const NO_LIMIT_RAM_DIVISOR: u64 = 16;
491
/// Read `MemAvailable` from `/proc/meminfo` and return it in bytes.
///
/// Returns `None` when the file cannot be read, has no `MemAvailable:` line,
/// or the first such line does not start with an unsigned integer. The value
/// is treated as kibibytes and multiplied by 1024 (saturating).
fn available_memory_bytes() -> Option<u64> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    let value_and_unit = meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemAvailable:"))?;
    let kib = value_and_unit
        .split_whitespace()
        .next()?
        .parse::<u64>()
        .ok()?;
    Some(kib.saturating_mul(1024))
}
502
503fn no_limit_result_cap() -> usize {
504 static CAP: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
505 *CAP.get_or_init(|| {
506 compute_no_limit_result_cap_from(
507 std::env::var("CASS_SEARCH_NO_LIMIT_CAP").ok(),
508 std::env::var("CASS_SEARCH_NO_LIMIT_BYTES").ok(),
509 available_memory_bytes(),
510 )
511 })
512}
513
514fn compute_no_limit_result_cap_from(
521 cap_env: Option<String>,
522 bytes_env: Option<String>,
523 available_bytes: Option<u64>,
524) -> usize {
525 if let Some(hits) = cap_env
529 .and_then(|v| v.parse::<usize>().ok())
530 .filter(|v| *v > 0)
531 {
532 return hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
533 }
534
535 let budget_bytes = no_limit_budget_bytes(bytes_env, available_bytes);
536 let hits = (budget_bytes / AVG_HIT_BYTES) as usize;
537 hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX)
538}
539
540fn no_limit_budget_bytes(bytes_env: Option<String>, available_bytes: Option<u64>) -> u64 {
541 bytes_env
542 .and_then(|v| v.parse::<u64>().ok())
543 .filter(|v| *v > 0)
544 .or_else(|| no_limit_available_memory_budget(available_bytes))
545 .unwrap_or(NO_LIMIT_BYTES_FLOOR)
546}
547
548fn no_limit_available_memory_budget(available_bytes: Option<u64>) -> Option<u64> {
549 available_bytes.map(|avail| {
550 (avail / NO_LIMIT_RAM_DIVISOR).clamp(NO_LIMIT_BYTES_FLOOR, NO_LIMIT_BYTES_CEILING)
551 })
552}
553
/// Shared two-tier configuration, built on first use from
/// `FsTwoTierConfig::optimized()` with environment overrides applied.
static FRANKENSEARCH_TWO_TIER_CONFIG: Lazy<FsTwoTierConfig> =
    Lazy::new(|| FsTwoTierConfig::optimized().with_env_overrides());
556
557fn frankensearch_two_tier_config() -> FsTwoTierConfig {
558 FRANKENSEARCH_TWO_TIER_CONFIG.clone()
559}
560
/// Number of results to fetch per progressive phase: three times `limit`
/// (saturating), with a limit of zero treated as one.
#[inline]
const fn progressive_phase_fetch_limit(limit: usize) -> usize {
    let effective = match limit {
        0 => 1,
        requested => requested,
    };
    effective.saturating_mul(3)
}
566
/// Candidate counts for the two stages of a hybrid search, as computed by
/// `hybrid_candidate_budget`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct HybridCandidateBudget {
    /// Candidates to request from the lexical stage.
    lexical_candidates: usize,
    /// Candidates to request from the semantic stage.
    semantic_candidates: usize,
}
572
573#[inline]
574const fn hybrid_stage_multipliers(query_class: FsQueryClass) -> (usize, usize) {
575 match query_class {
576 FsQueryClass::Identifier => (6, 2),
578 FsQueryClass::ShortKeyword => (4, 4),
580 FsQueryClass::NaturalLanguage => (2, 8),
582 FsQueryClass::Empty => (0, 0),
584 }
585}
586
587#[inline]
588fn hybrid_candidate_budget(
589 query: &str,
590 requested_limit: usize,
591 effective_limit: usize,
592 offset: usize,
593 total_docs: usize,
594) -> HybridCandidateBudget {
595 let query_class = FsQueryClass::classify(query);
596 let (lex_mult, sem_mult) = hybrid_stage_multipliers(query_class);
597 let total_docs = total_docs.max(1);
598
599 if requested_limit == 0 {
602 let planning_window = HYBRID_NO_LIMIT_PLANNING_WINDOW.max(offset.saturating_add(1));
603 let lexical = effective_limit.min(total_docs).min(no_limit_result_cap());
608 let semantic = fs_candidate_count(planning_window, 0, sem_mult)
616 .max(planning_window)
617 .min(HYBRID_NO_LIMIT_SEMANTIC_CAP.max(offset.saturating_add(planning_window)))
618 .min(total_docs)
619 .min(lexical);
620 return HybridCandidateBudget {
621 lexical_candidates: lexical,
622 semantic_candidates: semantic,
623 };
624 }
625
626 let lexical = fs_candidate_count(requested_limit, offset, lex_mult.max(1))
627 .max(requested_limit.saturating_add(offset))
628 .min(total_docs);
629 let semantic = fs_candidate_count(requested_limit, offset, sem_mult.max(1))
630 .max(requested_limit.saturating_add(offset))
631 .min(total_docs);
632
633 HybridCandidateBudget {
634 lexical_candidates: lexical,
635 semantic_candidates: semantic,
636 }
637}
638
/// Coarse classification of a query, as assigned by
/// `QueryExplanation::classify_query`. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryType {
    /// No filters, operators, phrases or wildcard patterns.
    Simple,
    /// Contains at least one quoted phrase.
    Phrase,
    /// Contains at least one explicit boolean operator.
    Boolean,
    /// Contains at least one sub-term whose pattern is not "exact".
    Wildcard,
    /// An agent, workspace, time or source filter is active.
    Filtered,
    /// The sanitized query is blank.
    Empty,
}
660
/// Index access strategy reported by `QueryExplanation::determine_strategy`.
/// Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum IndexStrategy {
    /// Reported for a single plain term when no other strategy applies.
    EdgeNgram,
    /// Reported when a sub-term uses a suffix or substring wildcard.
    RegexScan,
    /// Reported for operators, phrases, multiple terms or compound terms.
    BooleanCombination,
    /// NOTE(review): never produced by the code visible in this part of the
    /// file — confirm where it is used.
    RangeScan,
    /// Reported when the sanitized query is blank.
    FullScan,
}
676
/// Rough cost estimate produced by `QueryExplanation::estimate_cost`.
/// Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryCost {
    /// Complexity score of 2 or less and no time filter.
    Low,
    /// Complexity score of 3 to 6 and no time filter.
    Medium,
    /// Regex or full scan, a time filter, or a complexity score above 6.
    High,
}
688
/// One whitespace-separated piece of a sanitized term.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedSubTerm {
    /// The piece as it appears after sanitization, wildcards included.
    pub text: String,
    /// Label for the wildcard pattern: `"exact"`, `"prefix (*)"`,
    /// `"suffix (*)"`, `"substring (*)"` or `"complex (*)"`.
    pub pattern: String,
}
695
/// A term token from the query together with its sanitized pieces.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedTerm {
    /// The term as produced by the tokenizer, before sanitization.
    pub text: String,
    /// `true` when the term directly follows a `NOT` operator.
    pub negated: bool,
    /// Sanitized, whitespace-separated pieces of the term; never empty for a
    /// term recorded by `QueryExplanation::analyze`.
    pub subterms: Vec<ParsedSubTerm>,
}
706
/// Structured view of a query as collected by `QueryExplanation::analyze`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct ParsedQuery {
    /// Term tokens, in query order.
    pub terms: Vec<ParsedTerm>,
    /// Phrases, sanitized, lowercased and stripped of `*`, in query order.
    pub phrases: Vec<String>,
    /// Explicit operators (`"AND"`, `"OR"`, `"NOT"`), in query order.
    pub operators: Vec<String>,
    /// `true` when there is more than one term and no explicit operator.
    pub implicit_and: bool,
}
719
/// Explanation of how a query will be interpreted, built by
/// `QueryExplanation::analyze`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QueryExplanation {
    /// The query exactly as supplied.
    pub original_query: String,
    /// The query after NFC normalization and sanitization.
    pub sanitized_query: String,
    /// Terms, phrases and operators found in the query.
    pub parsed: ParsedQuery,
    /// Coarse classification of the query.
    pub query_type: QueryType,
    /// Index access strategy the query is expected to use.
    pub index_strategy: IndexStrategy,
    /// Whether the wildcard fallback was applied; `analyze` sets it to `false`
    /// and `with_wildcard_fallback` updates it.
    pub wildcard_applied: bool,
    /// Rough cost estimate.
    pub estimated_cost: QueryCost,
    /// Summary of the active filters.
    pub filters_summary: FiltersSummary,
    /// Human-readable warnings about the query.
    pub warnings: Vec<String>,
}
742
/// Summary of the active filters, built by
/// `QueryExplanation::summarize_filters`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct FiltersSummary {
    /// Number of agents in the filter.
    pub agent_count: usize,
    /// Number of workspaces in the filter.
    pub workspace_count: usize,
    /// `true` when a lower or upper time bound is set.
    pub has_time_filter: bool,
    /// Text such as `"Filtering by: 2 agents, time range"`; `None` when no
    /// agent, workspace or time filter is active.
    pub description: Option<String>,
}
755
756impl QueryExplanation {
757 pub fn analyze(query: &str, filters: &SearchFilters) -> Self {
759 let sanitized = nfc_sanitize_query(query);
760 let tokens = fs_cass_parse_boolean_query(query);
762
763 let mut parsed = ParsedQuery::default();
765 let mut has_explicit_operator = false;
766 let mut next_negated = false;
767
768 for token in &tokens {
769 match token {
770 FsCassQueryToken::Term(t) => {
771 let parts: Vec<String> = nfc_sanitize_query(t)
772 .split_whitespace()
773 .map(|s| s.to_string())
774 .collect();
775 if parts.is_empty() {
776 next_negated = false;
777 continue;
778 }
779 let mut subterms = Vec::new();
780 for part in parts {
781 let pattern = FsCassWildcardPattern::parse(&part);
782 let pattern_str = match &pattern {
783 FsCassWildcardPattern::Exact(_) => "exact",
784 FsCassWildcardPattern::Prefix(_) => "prefix (*)",
785 FsCassWildcardPattern::Suffix(_) => "suffix (*)",
786 FsCassWildcardPattern::Substring(_) => "substring (*)",
787 FsCassWildcardPattern::Complex(_) => "complex (*)",
788 };
789 subterms.push(ParsedSubTerm {
790 text: part,
791 pattern: pattern_str.to_string(),
792 });
793 }
794 parsed.terms.push(ParsedTerm {
795 text: t.clone(),
796 negated: next_negated,
797 subterms,
798 });
799 next_negated = false;
800 }
801 FsCassQueryToken::Phrase(p) => {
802 let parts: Vec<String> = nfc_sanitize_query(p)
803 .split_whitespace()
804 .map(|s| s.trim_matches('*').to_lowercase())
805 .filter(|s| !s.is_empty())
806 .collect();
807 if !parts.is_empty() {
808 parsed.phrases.push(parts.join(" "));
809 }
810 next_negated = false;
811 }
812 FsCassQueryToken::And => {
813 parsed.operators.push("AND".to_string());
814 has_explicit_operator = true;
815 }
816 FsCassQueryToken::Or => {
817 parsed.operators.push("OR".to_string());
818 has_explicit_operator = true;
819 }
820 FsCassQueryToken::Not => {
821 parsed.operators.push("NOT".to_string());
822 has_explicit_operator = true;
823 next_negated = true;
824 }
825 }
826 }
827
828 parsed.implicit_and = !has_explicit_operator && parsed.terms.len() > 1;
830
831 let query_type = Self::classify_query(&parsed, filters, &sanitized);
833
834 let index_strategy = Self::determine_strategy(&parsed, &sanitized);
836
837 let estimated_cost = Self::estimate_cost(&parsed, &index_strategy, filters);
839
840 let filters_summary = Self::summarize_filters(filters);
842
843 let warnings = Self::generate_warnings(&parsed, &sanitized, filters);
845
846 Self {
847 original_query: query.to_string(),
848 sanitized_query: sanitized,
849 parsed,
850 query_type,
851 index_strategy,
852 wildcard_applied: false, estimated_cost,
854 filters_summary,
855 warnings,
856 }
857 }
858
859 fn classify_query(parsed: &ParsedQuery, filters: &SearchFilters, sanitized: &str) -> QueryType {
860 if sanitized.trim().is_empty() {
861 return QueryType::Empty;
862 }
863
864 let has_filters = !filters.agents.is_empty()
866 || !filters.workspaces.is_empty()
867 || filters.created_from.is_some()
868 || filters.created_to.is_some()
869 || !filters.source_filter.is_all();
870
871 if has_filters {
872 return QueryType::Filtered;
873 }
874
875 if !parsed.operators.is_empty() {
877 return QueryType::Boolean;
878 }
879
880 if !parsed.phrases.is_empty() {
882 return QueryType::Phrase;
883 }
884
885 let has_wildcards = parsed
887 .terms
888 .iter()
889 .flat_map(|t| &t.subterms)
890 .any(|t| t.pattern != "exact");
891 if has_wildcards {
892 return QueryType::Wildcard;
893 }
894
895 QueryType::Simple
896 }
897
898 fn determine_strategy(parsed: &ParsedQuery, sanitized: &str) -> IndexStrategy {
899 if sanitized.trim().is_empty() {
900 return IndexStrategy::FullScan;
901 }
902
903 let has_leading_wildcard = parsed
905 .terms
906 .iter()
907 .flat_map(|t| &t.subterms)
908 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
909
910 if has_leading_wildcard {
911 return IndexStrategy::RegexScan;
912 }
913
914 let has_compound_terms = parsed.terms.iter().any(|t| t.subterms.len() > 1);
917
918 if !parsed.operators.is_empty()
919 || parsed.terms.len() > 1
920 || !parsed.phrases.is_empty()
921 || has_compound_terms
922 {
923 return IndexStrategy::BooleanCombination;
924 }
925
926 IndexStrategy::EdgeNgram
928 }
929
930 fn estimate_cost(
931 parsed: &ParsedQuery,
932 strategy: &IndexStrategy,
933 filters: &SearchFilters,
934 ) -> QueryCost {
935 if matches!(strategy, IndexStrategy::RegexScan) {
937 return QueryCost::High;
938 }
939
940 if matches!(strategy, IndexStrategy::FullScan) {
942 return QueryCost::High;
943 }
944
945 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
947
948 let term_count: usize = parsed.terms.iter().map(|t| t.subterms.len()).sum();
950 let operator_count = parsed.operators.len();
951 let phrase_count = parsed.phrases.len();
952
953 let complexity = term_count + operator_count * 2 + phrase_count * 2;
954
955 if complexity > 6 || has_time_filter {
956 QueryCost::High
957 } else if complexity > 2 {
958 QueryCost::Medium
959 } else {
960 QueryCost::Low
961 }
962 }
963
964 fn summarize_filters(filters: &SearchFilters) -> FiltersSummary {
965 let agent_count = filters.agents.len();
966 let workspace_count = filters.workspaces.len();
967 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
968
969 let mut parts = Vec::new();
970 if agent_count > 0 {
971 parts.push(format!(
972 "{} agent{}",
973 agent_count,
974 if agent_count > 1 { "s" } else { "" }
975 ));
976 }
977 if workspace_count > 0 {
978 parts.push(format!(
979 "{} workspace{}",
980 workspace_count,
981 if workspace_count > 1 { "s" } else { "" }
982 ));
983 }
984 if has_time_filter {
985 parts.push("time range".to_string());
986 }
987
988 let description = if parts.is_empty() {
989 None
990 } else {
991 Some(format!("Filtering by: {}", parts.join(", ")))
992 };
993
994 FiltersSummary {
995 agent_count,
996 workspace_count,
997 has_time_filter,
998 description,
999 }
1000 }
1001
1002 fn generate_warnings(
1003 parsed: &ParsedQuery,
1004 sanitized: &str,
1005 filters: &SearchFilters,
1006 ) -> Vec<String> {
1007 let mut warnings = Vec::new();
1008
1009 let has_leading_wildcard = parsed
1011 .terms
1012 .iter()
1013 .flat_map(|t| &t.subterms)
1014 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
1015 if has_leading_wildcard {
1016 warnings.push(
1017 "Leading wildcards (*foo) require regex scan and may be slow on large indexes"
1018 .to_string(),
1019 );
1020 }
1021
1022 for term in &parsed.terms {
1024 for sub in &term.subterms {
1025 if sub.text.trim_matches('*').len() < 2 {
1026 warnings.push(format!(
1027 "Very short term '{}' may match many documents",
1028 sub.text
1029 ));
1030 }
1031 }
1032 }
1033
1034 if sanitized.trim().is_empty() {
1036 warnings.push("Empty query will return all documents (expensive)".to_string());
1037 }
1038
1039 if parsed.operators.len() > 3 {
1041 warnings.push("Complex boolean query may have unexpected precedence".to_string());
1042 }
1043
1044 if let Some(agent) = filters.agents.iter().next()
1046 && filters.agents.len() == 1
1047 && filters.workspaces.is_empty()
1048 {
1049 warnings.push(format!(
1050 "Searching only in agent '{}' - results from other agents will be excluded",
1051 agent
1052 ));
1053 }
1054
1055 warnings
1056 }
1057
1058 pub fn with_wildcard_fallback(mut self, applied: bool) -> Self {
1060 self.wildcard_applied = applied;
1061 if applied
1062 && !self
1063 .warnings
1064 .iter()
1065 .any(|w| w.contains("wildcard fallback"))
1066 {
1067 self.warnings.push(
1068 "Wildcard fallback was applied automatically due to sparse exact matches"
1069 .to_string(),
1070 );
1071 }
1072 self
1073 }
1074}
1075
1076#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize)]
1079#[serde(rename_all = "snake_case")]
1080pub enum MatchType {
1081 #[default]
1083 Exact,
1084 Prefix,
1086 Suffix,
1088 Substring,
1090 Wildcard,
1092 ImplicitWildcard,
1094}
1095
1096impl MatchType {
1097 pub fn quality_factor(self) -> f32 {
1099 match self {
1100 MatchType::Exact => 1.0,
1101 MatchType::Prefix => 0.9,
1102 MatchType::Suffix => 0.8,
1103 MatchType::Substring => 0.7,
1104 MatchType::Wildcard => 0.65,
1105 MatchType::ImplicitWildcard => 0.6,
1106 }
1107 }
1108}
1109
/// Category of a [`QuerySuggestion`]. Serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SuggestionKind {
    /// A corrected spelling of the query.
    SpellingFix,
    /// The query wrapped in `*` wildcards.
    WildcardQuery,
    /// The same search with a filter removed.
    RemoveFilter,
    /// The same search restricted to a different agent.
    AlternateAgent,
    /// NOTE(review): no constructor for this kind is visible in this part of
    /// the file — confirm where it is produced.
    BroaderDateRange,
}
1125
/// A suggested follow-up action for a search.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QuerySuggestion {
    /// What kind of suggestion this is.
    pub kind: SuggestionKind,
    /// Human-readable text describing the suggestion.
    pub message: String,
    /// Replacement query text, when the suggestion changes the query.
    pub suggested_query: Option<String>,
    /// Replacement filters, when the suggestion changes the filters.
    pub suggested_filters: Option<SearchFilters>,
    /// Optional shortcut key, set through `with_shortcut`.
    pub shortcut: Option<u8>,
}
1140
1141impl QuerySuggestion {
1142 fn spelling(_query: &str, corrected: &str) -> Self {
1143 Self {
1144 kind: SuggestionKind::SpellingFix,
1145 message: format!("Did you mean: \"{corrected}\"?"),
1146 suggested_query: Some(corrected.to_string()),
1147 suggested_filters: None,
1148 shortcut: None,
1149 }
1150 }
1151
1152 fn wildcard(query: &str) -> Self {
1153 let wildcard_query = format!("*{}*", query.trim_matches('*'));
1154 Self {
1155 kind: SuggestionKind::WildcardQuery,
1156 message: format!("Try broader search: \"{wildcard_query}\""),
1157 suggested_query: Some(wildcard_query),
1158 suggested_filters: None,
1159 shortcut: None,
1160 }
1161 }
1162
1163 fn remove_agent_filter(current_agent: &str, current_filters: &SearchFilters) -> Self {
1164 let mut filters = current_filters.clone();
1167 filters.agents.clear();
1168 Self {
1169 kind: SuggestionKind::RemoveFilter,
1170 message: format!("Remove agent filter (currently: {current_agent})"),
1171 suggested_query: None,
1172 suggested_filters: Some(filters),
1173 shortcut: None,
1174 }
1175 }
1176
1177 fn try_agent(agent_slug: &str) -> Self {
1178 let mut filters = SearchFilters::default();
1179 filters.agents.insert(agent_slug.to_string());
1180 Self {
1181 kind: SuggestionKind::AlternateAgent,
1182 message: format!("Try searching in: {agent_slug}"),
1183 suggested_query: None,
1184 suggested_filters: Some(filters),
1185 shortcut: None,
1186 }
1187 }
1188
1189 fn with_shortcut(mut self, key: u8) -> Self {
1190 self.shortcut = Some(key);
1191 self
1192 }
1193}
1194
/// Selects which hit fields a caller wants populated, plus an optional cap on
/// preview content length.
#[derive(Debug, Clone, Copy)]
pub struct FieldMask {
    /// Bit set built from the `CONTENT`, `SNIPPET`, `TITLE` and `CACHE` bits.
    flags: u8,
    /// Maximum number of content characters for previews, if limited.
    preview_content_chars: Option<usize>,
}

impl FieldMask {
    const CONTENT: u8 = 1 << 0;
    const SNIPPET: u8 = 1 << 1;
    const TITLE: u8 = 1 << 2;
    const CACHE: u8 = 1 << 3;

    /// Every field requested, caching allowed, no preview limit.
    pub const FULL: Self = Self {
        flags: Self::CONTENT | Self::SNIPPET | Self::TITLE | Self::CACHE,
        preview_content_chars: None,
    };

    /// Build a mask from individual switches, with no preview limit.
    pub fn new(
        wants_content: bool,
        wants_snippet: bool,
        wants_title: bool,
        allows_cache: bool,
    ) -> Self {
        let flags = [
            (wants_content, Self::CONTENT),
            (wants_snippet, Self::SNIPPET),
            (wants_title, Self::TITLE),
            (allows_cache, Self::CACHE),
        ]
        .into_iter()
        .filter(|(enabled, _)| *enabled)
        .fold(0u8, |acc, (_, bit)| acc | bit);

        Self {
            flags,
            preview_content_chars: None,
        }
    }

    /// Set (or clear) the preview content limit. Setting a limit also turns
    /// caching off; clearing it leaves the cache bit as it was.
    pub fn with_preview_content_limit(self, max_chars: Option<usize>) -> Self {
        let flags = match max_chars {
            Some(_) => self.flags & !Self::CACHE,
            None => self.flags,
        };
        Self {
            flags,
            preview_content_chars: max_chars,
        }
    }

    /// `true` when `bit` is set in the mask.
    const fn has(self, bit: u8) -> bool {
        self.flags & bit != 0
    }

    /// Whether hit content is requested.
    pub fn needs_content(self) -> bool {
        self.has(Self::CONTENT)
    }

    /// Whether a snippet is requested.
    pub fn wants_snippet(self) -> bool {
        self.has(Self::SNIPPET)
    }

    /// Whether the title is requested.
    pub fn wants_title(self) -> bool {
        self.has(Self::TITLE)
    }

    /// Whether results may be cached.
    pub fn allows_cache(self) -> bool {
        self.has(Self::CACHE)
    }

    /// The preview content limit, if one is set.
    pub fn preview_content_limit(self) -> Option<usize> {
        self.preview_content_chars
    }
}
1265
/// One search result.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchHit {
    /// Title shown for the hit.
    pub title: String,
    /// Snippet shown for the hit.
    pub snippet: String,
    /// Content of the hit.
    pub content: String,
    /// Hash of the content; not serialized.
    #[serde(skip_serializing)]
    pub content_hash: u64,
    /// Database id of the owning conversation, when known; not serialized.
    #[serde(skip_serializing)]
    pub conversation_id: Option<i64>,
    /// Relevance score.
    pub score: f32,
    /// Path of the source the hit came from.
    pub source_path: String,
    /// Agent the hit belongs to.
    pub agent: String,
    /// Workspace the hit belongs to.
    pub workspace: String,
    /// Original workspace value, when present; omitted from serialized output
    /// when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workspace_original: Option<String>,
    /// Creation timestamp, when known.
    ///
    /// NOTE(review): units are not visible here — confirm.
    pub created_at: Option<i64>,
    /// Line number of the hit within its source, when known.
    pub line_number: Option<usize>,
    /// How the hit matched the query.
    #[serde(default)]
    pub match_type: MatchType,
    /// Identifier of the source the hit came from.
    #[serde(default = "default_source_id")]
    pub source_id: String,
    /// Kind of origin for the hit.
    #[serde(default = "default_source_id")]
    pub origin_kind: String,
    /// Host the hit originated from, when present; omitted from serialized
    /// output when `None`.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub origin_host: Option<String>,
}
1299
1300static LAZY_FIELDS_ENABLED: Lazy<bool> = Lazy::new(|| {
1301 dotenvy::var("CASS_LAZY_FIELDS")
1302 .ok()
1303 .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
1304 .unwrap_or(true)
1305});
1306
/// Default value referenced by the serde attributes on `SearchHit`:
/// `"local"`.
fn default_source_id() -> String {
    String::from("local")
}
1310
1311fn effective_field_mask(field_mask: FieldMask) -> FieldMask {
1312 if *LAZY_FIELDS_ENABLED {
1313 field_mask
1314 } else {
1315 FieldMask::FULL
1316 }
1317}
1318
1319fn execute_query_with_lazy_exact_count(
1320 searcher: &Searcher,
1321 query: &dyn Query,
1322 limit: usize,
1323 offset: usize,
1324) -> Result<FsLexicalSearchResult> {
1325 let top_docs = searcher.search(
1326 query,
1327 &TopDocs::with_limit(limit)
1328 .and_offset(offset)
1329 .order_by_score(),
1330 )?;
1331 let page_saturated = top_docs.len() == limit;
1332 let total_count = if page_saturated {
1333 searcher.search(query, &Count)?
1334 } else {
1335 offset.saturating_add(top_docs.len())
1336 };
1337 let hits = top_docs
1338 .into_iter()
1339 .enumerate()
1340 .map(|(rank, (bm25_score, doc_address))| FsLexicalDocHit {
1341 bm25_score,
1342 rank,
1343 doc_address,
1344 })
1345 .collect();
1346
1347 Ok(FsLexicalSearchResult { hits, total_count })
1348}
1349
/// Outcome of one search call.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// Hits for the requested page.
    pub hits: Vec<SearchHit>,
    /// Whether the wildcard fallback was used to produce these hits.
    pub wildcard_fallback: bool,
    /// Cache statistics attached to this result.
    pub cache_stats: CacheStats,
    /// Suggested follow-up queries or filter changes.
    pub suggestions: Vec<QuerySuggestion>,
    /// ANN search statistics, when available.
    pub ann_stats: Option<crate::search::ann_index::AnnSearchStats>,
    /// Total number of matches, when known.
    pub total_count: Option<usize>,
}
1370
/// Identifies which phase of a progressive search an event belongs to.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum ProgressivePhaseKind {
    /// The first phase.
    Initial,
    /// The refined phase.
    Refined,
}
1376
1377#[allow(clippy::large_enum_variant)]
1380#[derive(Debug, Clone)]
1381pub enum ProgressiveSearchEvent {
1382 Phase {
1383 kind: ProgressivePhaseKind,
1384 result: SearchResult,
1385 elapsed_ms: u128,
1386 },
1387 RefinementFailed {
1388 latency_ms: u128,
1389 error: String,
1390 },
1391}
1392
1393#[derive(Debug, Clone)]
1394pub(crate) struct ProgressiveSearchRequest<'a> {
1395 pub(crate) cx: &'a FsCx,
1396 pub(crate) query: &'a str,
1397 pub(crate) filters: SearchFilters,
1398 pub(crate) limit: usize,
1399 pub(crate) sparse_threshold: usize,
1400 pub(crate) field_mask: FieldMask,
1401 pub(crate) mode: SearchMode,
1402}
1403
/// Identity of a search hit, used for deduplication, ordering and doc-id generation.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
struct SearchHitKey {
    /// Normalized source identifier.
    source_id: String,
    /// Path of the source the hit came from.
    source_path: String,
    /// Conversation id, when the hit carries one.
    conversation_id: Option<i64>,
    /// Title component of the key (may be empty).
    title: String,
    /// Line number of the hit, when known.
    line_number: Option<usize>,
    /// Creation timestamp of the hit, when known.
    created_at: Option<i64>,
    /// Hash of the hit content.
    content_hash: u64,
}
1414
1415fn normalized_search_source_id_sql_expr(
1416 source_id_column: &str,
1417 origin_kind_column: &str,
1418 origin_host_column: &str,
1419) -> String {
1420 format!(
1421 "CASE \
1422 WHEN TRIM(COALESCE({source_id_column}, '')) != '' THEN \
1423 CASE \
1424 WHEN LOWER(TRIM(COALESCE({source_id_column}, ''))) = '{local}' THEN '{local}' \
1425 ELSE TRIM(COALESCE({source_id_column}, '')) \
1426 END \
1427 WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) IN ('ssh', 'remote') THEN \
1428 CASE \
1429 WHEN TRIM(COALESCE({origin_host_column}, '')) = '' THEN 'remote' \
1430 ELSE TRIM(COALESCE({origin_host_column}, '')) \
1431 END \
1432 WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) = '{local}' THEN '{local}' \
1433 WHEN TRIM(COALESCE({origin_host_column}, '')) != '' THEN TRIM(COALESCE({origin_host_column}, '')) \
1434 ELSE '{local}' \
1435 END",
1436 local = crate::sources::provenance::LOCAL_SOURCE_ID,
1437 )
1438}
1439
1440fn normalize_search_source_filter_value(source_id: &str) -> String {
1441 let trimmed = source_id.trim();
1442 if trimmed.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1443 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1444 } else {
1445 trimmed.to_string()
1446 }
1447}
1448
1449fn normalized_search_hit_source_id_parts(
1450 source_id: &str,
1451 origin_kind: &str,
1452 origin_host: Option<&str>,
1453) -> String {
1454 let trimmed_source_id = source_id.trim();
1455 if !trimmed_source_id.is_empty() {
1456 if trimmed_source_id.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1457 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1458 }
1459 return trimmed_source_id.to_string();
1460 }
1461
1462 let trimmed_origin_host = origin_host.map(str::trim).filter(|value| !value.is_empty());
1463 let trimmed_origin_kind = origin_kind.trim();
1464 if trimmed_origin_kind.eq_ignore_ascii_case("ssh")
1465 || trimmed_origin_kind.eq_ignore_ascii_case("remote")
1466 {
1467 return trimmed_origin_host.unwrap_or("remote").to_string();
1468 }
1469 if let Some(origin_host) = trimmed_origin_host {
1470 return origin_host.to_string();
1471 }
1472
1473 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1474}
1475
1476fn normalized_search_hit_origin_kind(source_id: &str, origin_kind: Option<&str>) -> String {
1477 if let Some(kind) = origin_kind.map(str::trim).filter(|value| !value.is_empty()) {
1478 if kind.eq_ignore_ascii_case("local") {
1479 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1480 }
1481 if kind.eq_ignore_ascii_case("ssh") || kind.eq_ignore_ascii_case("remote") {
1482 return "remote".to_string();
1483 }
1484 return kind.to_ascii_lowercase();
1485 }
1486
1487 if source_id == crate::sources::provenance::LOCAL_SOURCE_ID {
1488 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1489 } else {
1490 "remote".to_string()
1491 }
1492}
1493
1494fn normalized_search_hit_source_id(hit: &SearchHit) -> String {
1495 normalized_search_hit_source_id_parts(
1496 hit.source_id.as_str(),
1497 hit.origin_kind.as_str(),
1498 hit.origin_host.as_deref(),
1499 )
1500}
1501
1502impl SearchHitKey {
1503 fn from_hit(hit: &SearchHit) -> Self {
1504 Self {
1505 source_id: normalized_search_hit_source_id(hit),
1506 source_path: hit.source_path.clone(),
1507 conversation_id: hit.conversation_id,
1508 title: if hit.conversation_id.is_some() {
1509 String::new()
1510 } else {
1511 hit.title.trim().to_string()
1512 },
1513 line_number: hit.line_number,
1514 created_at: hit.created_at,
1515 content_hash: hit.content_hash,
1516 }
1517 }
1518}
1519
1520impl Ord for SearchHitKey {
1521 fn cmp(&self, other: &Self) -> CmpOrdering {
1522 self.source_id
1523 .cmp(&other.source_id)
1524 .then_with(|| self.source_path.cmp(&other.source_path))
1525 .then_with(|| self.conversation_id.cmp(&other.conversation_id))
1526 .then_with(|| self.title.cmp(&other.title))
1527 .then_with(|| self.line_number.cmp(&other.line_number))
1528 .then_with(|| self.created_at.cmp(&other.created_at))
1529 .then_with(|| self.content_hash.cmp(&other.content_hash))
1530 }
1531}
1532
1533impl PartialOrd for SearchHitKey {
1534 fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
1535 Some(self.cmp(other))
1536 }
1537}
1538
/// Smoothing constant `k` in the federated reciprocal-rank score `1 / (k + rank + 1)`.
const FEDERATED_RRF_K: f32 = 60.0_f32;
1540
1541#[derive(Debug)]
1542struct FederatedRankedHit {
1543 hit: SearchHit,
1544 shard_index: usize,
1545 shard_rank: usize,
1546 fused_score: f32,
1547}
1548
1549fn federated_rrf_score(shard_rank: usize) -> f32 {
1550 1.0 / (FEDERATED_RRF_K + shard_rank as f32 + 1.0)
1551}
1552
1553fn merge_federated_ranked_hits(mut ranked_hits: Vec<FederatedRankedHit>) -> Vec<SearchHit> {
1554 ranked_hits.sort_by(|a, b| {
1555 b.fused_score
1556 .total_cmp(&a.fused_score)
1557 .then_with(|| a.shard_rank.cmp(&b.shard_rank))
1558 .then_with(|| SearchHitKey::from_hit(&a.hit).cmp(&SearchHitKey::from_hit(&b.hit)))
1559 .then_with(|| a.shard_index.cmp(&b.shard_index))
1560 });
1561 ranked_hits
1562 .into_iter()
1563 .map(|mut ranked| {
1564 ranked.hit.score = ranked.fused_score;
1565 ranked.hit
1566 })
1567 .collect()
1568}
1569
/// Test-only record of the score components behind a fused hit.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Clone, Debug, Default)]
struct HybridScore {
    /// Fused RRF score.
    rrf: f32,
    /// Rank in the lexical list, if the hit appeared there.
    lexical_rank: Option<usize>,
    /// Rank in the semantic list, if the hit appeared there.
    semantic_rank: Option<usize>,
    /// Lexical score, if any.
    lexical_score: Option<f32>,
    /// Semantic score, if any.
    semantic_score: Option<f32>,
}
1580
/// Test-only pairing of a hit with its identity key and hybrid score.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Clone, Debug)]
struct FusedHit {
    /// Identity key of the hit.
    key: SearchHitKey,
    /// Score components for the hit.
    score: HybridScore,
    /// The hit itself.
    hit: SearchHit,
}
1589
1590pub(crate) fn stable_content_hash(content: &str) -> u64 {
1600 use xxhash_rust::xxh3::Xxh3;
1601 let mut hasher = Xxh3::new();
1602 let mut first = true;
1603 for token in content.split_whitespace() {
1604 if !first {
1605 hasher.update(b" ");
1606 }
1607 hasher.update(token.as_bytes());
1608 first = false;
1609 }
1610 hasher.digest()
1611}
1612
1613fn stable_hit_hash(
1614 content: &str,
1615 source_path: &str,
1616 line_number: Option<usize>,
1617 created_at: Option<i64>,
1618) -> u64 {
1619 use xxhash_rust::xxh3::Xxh3;
1620 let mut hasher = Xxh3::new();
1621 if !content.is_empty() {
1624 hasher.update(&stable_content_hash(content).to_le_bytes());
1625 }
1626 hasher.update(b"|");
1627 hasher.update(source_path.as_bytes());
1628 hasher.update(b"|");
1629 if let Some(line) = line_number {
1630 let mut buf = itoa::Buffer::new();
1631 hasher.update(buf.format(line).as_bytes());
1632 }
1633 hasher.update(b"|");
1634 if let Some(ts) = created_at {
1635 let mut buf = itoa::Buffer::new();
1636 hasher.update(buf.format(ts).as_bytes());
1637 }
1638 hasher.digest()
1639}
1640
1641fn search_hit_key_doc_id(key: &SearchHitKey) -> String {
1642 use std::fmt::Write as _;
1650 const SEP: char = '\u{1f}';
1651 let capacity = key.source_id.len()
1653 + key.source_path.len()
1654 + key.title.len()
1655 + 6 + 3 * 20 + 20; let mut out = String::with_capacity(capacity);
1659 out.push_str(&key.source_id);
1660 out.push(SEP);
1661 out.push_str(&key.source_path);
1662 out.push(SEP);
1663 if let Some(v) = key.conversation_id {
1664 let _ = write!(out, "{v}");
1665 }
1666 out.push(SEP);
1667 out.push_str(&key.title);
1668 out.push(SEP);
1669 if let Some(v) = key.line_number {
1670 let _ = write!(out, "{v}");
1671 }
1672 out.push(SEP);
1673 if let Some(v) = key.created_at {
1674 let _ = write!(out, "{v}");
1675 }
1676 out.push(SEP);
1677 let _ = write!(out, "{}", key.content_hash);
1678 out
1679}
1680
1681fn search_hit_doc_id(hit: &SearchHit) -> String {
1682 search_hit_key_doc_id(&SearchHitKey::from_hit(hit))
1683}
1684
/// Test-only comparator placing the best fused hit first.
///
/// Order: higher RRF score, then hits ranked by both the lexical and semantic
/// lists ahead of hits ranked by only one, then ascending key.
#[cfg(test)]
fn cmp_fused_hit_desc(a: &FusedHit, b: &FusedHit) -> CmpOrdering {
    let in_both_lists =
        |hit: &FusedHit| hit.score.lexical_rank.is_some() && hit.score.semantic_rank.is_some();

    b.score
        .rrf
        .total_cmp(&a.score.rrf)
        // `true > false`, so comparing b's flag against a's sorts "both" hits first.
        .then_with(|| in_both_lists(b).cmp(&in_both_lists(a)))
        .then_with(|| a.key.cmp(&b.key))
}
1702
/// Test-only input size below which `top_k_fused` fully sorts instead of using quickselect.
#[cfg(test)]
#[allow(dead_code)]
const QUICKSELECT_THRESHOLD: usize = 64;
1707
/// Test-only helper returning the best `k` fused hits in `cmp_fused_hit_desc` order.
///
/// Returns an empty vector when `hits` is empty or `k == 0`. When `k` is smaller
/// than the input and the input has at least `QUICKSELECT_THRESHOLD` elements, the
/// top `k` are first partitioned out with `select_nth_unstable_by`; in every case
/// the surviving elements are then sorted.
#[cfg(test)]
#[allow(dead_code)]
fn top_k_fused(mut hits: Vec<FusedHit>, k: usize) -> Vec<FusedHit> {
    let total = hits.len();
    if total == 0 || k == 0 {
        return Vec::new();
    }

    let use_quickselect = k < total && total >= QUICKSELECT_THRESHOLD;
    if use_quickselect {
        // After selection the first `k` slots hold the top `k` in arbitrary order.
        hits.select_nth_unstable_by(k - 1, cmp_fused_hit_desc);
        hits.truncate(k);
    }

    hits.sort_by(cmp_fused_hit_desc);
    // No-op when `k >= len`; trims the tail on the small-input path.
    hits.truncate(k);
    hits
}
1748
1749pub fn rrf_fuse_hits(
1752 lexical: &[SearchHit],
1753 semantic: &[SearchHit],
1754 query: &str,
1755 limit: usize,
1756 offset: usize,
1757) -> Vec<SearchHit> {
1758 if limit == 0 {
1759 return Vec::new();
1760 }
1761 let total_candidates = lexical.len().saturating_add(semantic.len());
1762 if total_candidates == 0 {
1763 return Vec::new();
1764 }
1765
1766 let mut lexical_scored = Vec::with_capacity(lexical.len());
1767 let mut semantic_scored = Vec::with_capacity(semantic.len());
1768 let mut hit_by_doc_id: HashMap<String, SearchHit> = HashMap::with_capacity(total_candidates);
1769
1770 for hit in lexical {
1771 let doc_id = search_hit_doc_id(hit);
1772 hit_by_doc_id.insert(doc_id.clone(), hit.clone());
1774 lexical_scored.push(FsScoredResult {
1775 doc_id,
1776 score: hit.score,
1777 source: FsScoreSource::Lexical,
1778 index: None,
1779 fast_score: None,
1780 quality_score: None,
1781 lexical_score: Some(hit.score),
1782 rerank_score: None,
1783 explanation: None,
1784 metadata: None,
1785 });
1786 }
1787
1788 for (idx, hit) in semantic.iter().enumerate() {
1789 let doc_id = search_hit_doc_id(hit);
1790 hit_by_doc_id
1791 .entry(doc_id.clone())
1792 .or_insert_with(|| hit.clone());
1793 semantic_scored.push(FsVectorHit {
1794 index: u32::try_from(idx).unwrap_or(u32::MAX),
1795 score: hit.score,
1796 doc_id,
1797 });
1798 }
1799
1800 let fused = fs_rrf_fuse(
1803 &lexical_scored,
1804 &semantic_scored,
1805 total_candidates,
1806 0,
1807 &FsRrfConfig::default(),
1808 );
1809
1810 #[derive(Clone, Copy)]
1815 struct CompatSlot {
1816 index: usize,
1817 conversation_id: Option<i64>,
1818 ambiguous: bool,
1819 }
1820
1821 let mut source_ids: HashMap<String, u32> = HashMap::new();
1822 let mut path_ids: HashMap<String, u32> = HashMap::new();
1823 let mut title_ids: HashMap<String, u32> = HashMap::new();
1824 let mut next_source_id: u32 = 0;
1825 let mut next_path_id: u32 = 0;
1826 let mut next_title_id: u32 = 0;
1827 type CompatExactKey = (
1828 u32,
1829 u32,
1830 Option<i64>,
1831 Option<u32>,
1832 Option<usize>,
1833 Option<i64>,
1834 u64,
1835 );
1836 type CompatFallbackKey = (u32, u32, u32, Option<usize>, Option<i64>, u64);
1837
1838 let mut exact_seen: HashMap<CompatExactKey, usize> = HashMap::with_capacity(fused.len());
1839 let mut fallback_seen: HashMap<CompatFallbackKey, CompatSlot> =
1840 HashMap::with_capacity(fused.len());
1841 let mut unique_hits: Vec<SearchHit> = Vec::with_capacity(fused.len());
1842
1843 let update_slot = |slot: &mut CompatSlot, conversation_id: Option<i64>| {
1844 if slot.ambiguous {
1845 return;
1846 }
1847 match (slot.conversation_id, conversation_id) {
1848 (Some(existing), Some(current)) if existing != current => slot.ambiguous = true,
1849 (None, Some(current)) => slot.conversation_id = Some(current),
1850 _ => {}
1851 }
1852 };
1853
1854 for fused_hit in fused {
1855 let mut hit = match hit_by_doc_id.remove(&fused_hit.doc_id) {
1856 Some(hit) => hit,
1857 None => continue,
1858 };
1859 if hit_is_noise(&hit, query) {
1860 continue;
1861 }
1862
1863 let normalized_source_id = normalized_search_hit_source_id(&hit);
1864 let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
1865 *id
1866 } else {
1867 let id = next_source_id;
1868 next_source_id = next_source_id.saturating_add(1);
1869 source_ids.insert(normalized_source_id, id);
1870 id
1871 };
1872 let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
1873 *id
1874 } else {
1875 let id = next_path_id;
1876 next_path_id = next_path_id.saturating_add(1);
1877 path_ids.insert(hit.source_path.clone(), id);
1878 id
1879 };
1880 let normalized_title = hit.title.trim();
1881 let fallback_title_key = if let Some(id) = title_ids.get(normalized_title) {
1882 *id
1883 } else {
1884 let id = next_title_id;
1885 next_title_id = next_title_id.saturating_add(1);
1886 title_ids.insert(normalized_title.to_string(), id);
1887 id
1888 };
1889 let exact_title_key = if hit.conversation_id.is_some() {
1890 None
1891 } else {
1892 Some(fallback_title_key)
1893 };
1894 let exact_key = (
1895 source_key,
1896 path_key,
1897 hit.conversation_id,
1898 exact_title_key,
1899 hit.line_number,
1900 hit.created_at,
1901 hit.content_hash,
1902 );
1903 let fallback_key = (
1904 source_key,
1905 path_key,
1906 fallback_title_key,
1907 hit.line_number,
1908 hit.created_at,
1909 hit.content_hash,
1910 );
1911
1912 let merged_idx = exact_seen.get(&exact_key).copied().or_else(|| {
1913 fallback_seen.get(&fallback_key).and_then(|slot| {
1914 if slot.ambiguous {
1915 return None;
1916 }
1917 match (slot.conversation_id, hit.conversation_id) {
1918 (Some(existing), Some(current)) if existing != current => None,
1919 _ => Some(slot.index),
1920 }
1921 })
1922 });
1923
1924 if let Some(existing_idx) = merged_idx {
1925 exact_seen.insert(exact_key, existing_idx);
1926 let slot = fallback_seen.entry(fallback_key).or_insert(CompatSlot {
1927 index: existing_idx,
1928 conversation_id: hit.conversation_id,
1929 ambiguous: false,
1930 });
1931 update_slot(slot, hit.conversation_id);
1932 if unique_hits[existing_idx].conversation_id.is_none() && hit.conversation_id.is_some()
1933 {
1934 unique_hits[existing_idx].conversation_id = hit.conversation_id;
1935 }
1936 unique_hits[existing_idx].score += fused_hit.rrf_score as f32;
1937 continue;
1938 }
1939
1940 hit.score = fused_hit.rrf_score as f32;
1941 let index = unique_hits.len();
1942 unique_hits.push(hit);
1943 exact_seen.insert(exact_key, index);
1944 match fallback_seen.get_mut(&fallback_key) {
1945 Some(slot) => update_slot(slot, unique_hits[index].conversation_id),
1946 None => {
1947 fallback_seen.insert(
1948 fallback_key,
1949 CompatSlot {
1950 index,
1951 conversation_id: unique_hits[index].conversation_id,
1952 ambiguous: false,
1953 },
1954 );
1955 }
1956 }
1957 }
1958
1959 unique_hits.sort_by(|a, b| {
1960 b.score
1961 .total_cmp(&a.score)
1962 .then_with(|| SearchHitKey::from_hit(a).cmp(&SearchHitKey::from_hit(b)))
1963 });
1964
1965 let start = offset.min(unique_hits.len());
1966 unique_hits.into_iter().skip(start).take(limit).collect()
1967}
1968
1969struct QueryCache {
1970 embedder_id: String,
1971 embeddings: LruCache<String, Vec<f32>>,
1972}
1973
1974impl QueryCache {
1975 fn new(embedder_id: &str, capacity: NonZeroUsize) -> Self {
1976 Self {
1977 embedder_id: embedder_id.to_string(),
1978 embeddings: LruCache::new(capacity),
1979 }
1980 }
1981
1982 fn align_embedder(&mut self, embedder: &dyn Embedder) {
1983 if self.embedder_id != embedder.id() {
1984 self.embedder_id = embedder.id().to_string();
1985 self.embeddings.clear();
1986 }
1987 }
1988
1989 fn get_cached(&mut self, embedder: &dyn Embedder, canonical: &str) -> Option<Vec<f32>> {
1990 self.align_embedder(embedder);
1991 self.embeddings.get(canonical).cloned()
1992 }
1993
1994 fn store(&mut self, embedder: &dyn Embedder, canonical: &str, embedding: Vec<f32>) {
1995 self.align_embedder(embedder);
1996 self.embeddings.put(canonical.to_string(), embedding);
1997 }
1998}
1999
2000fn semantic_filter_as_search_filter(filter: &SemanticFilter) -> Option<&dyn FsSearchFilter> {
2003 let unrestricted = filter.agents.is_none()
2004 && filter.workspaces.is_none()
2005 && filter.sources.is_none()
2006 && filter.roles.is_none()
2007 && filter.created_from.is_none()
2008 && filter.created_to.is_none();
2009 if unrestricted { None } else { Some(filter) }
2010}
2011
2012fn open_fs_semantic_ann_index(fs_index: &FsVectorIndex, ann_path: &Path) -> Result<FsHnswIndex> {
2013 if !ann_path.is_file() {
2014 bail!(
2015 "approximate search unavailable: HNSW index not found at {}",
2016 ann_path.display()
2017 );
2018 }
2019
2020 let ann = FsHnswIndex::load(ann_path, fs_index)
2021 .map_err(|err| anyhow!("open HNSW index failed: {err}"))?;
2022 let matches = ann
2023 .matches_vector_index(fs_index)
2024 .map_err(|err| anyhow!("validate HNSW index failed: {err}"))?;
2025 if !matches {
2026 bail!(
2027 "approximate search unavailable: HNSW index at {} is stale for current semantic index (run 'cass index --semantic --build-hnsw')",
2028 ann_path.display()
2029 );
2030 }
2031
2032 Ok(ann)
2033}
2034
2035struct SemanticSearchState {
2036 context_token: Arc<()>,
2037 embedder: Arc<dyn Embedder>,
2038 fs_semantic_index: Arc<FsVectorIndex>,
2039 fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
2040 fs_ann_index: Option<Arc<FsHnswIndex>>,
2041 ann_path: Option<PathBuf>,
2042 fs_in_memory_two_tier_index: Option<Arc<FsInMemoryTwoTierIndex>>,
2043 in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable,
2044 progressive_context: Option<Arc<ProgressiveTwoTierContext>>,
2045 progressive_context_unavailable: bool,
2046 filter_maps: SemanticFilterMaps,
2047 roles: Option<HashSet<u8>>,
2048 query_cache: QueryCache,
2049}
2050
/// Remembers for which tier modes the in-memory two-tier index is known to be unavailable.
#[derive(Clone, Copy, Debug, Default)]
struct InMemoryTwoTierUnavailable {
    /// Unavailable for fast-only mode.
    fast_only: bool,
    /// Unavailable for modes that need the quality tier.
    quality: bool,
}
2056
2057impl InMemoryTwoTierUnavailable {
2058 fn is_known_unavailable(self, tier_mode: SemanticTierMode) -> bool {
2059 match tier_mode {
2060 SemanticTierMode::Single => false,
2061 SemanticTierMode::FastOnly => self.fast_only,
2062 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => self.quality,
2063 }
2064 }
2065
2066 fn mark_unavailable(&mut self, tier_mode: SemanticTierMode) {
2067 match tier_mode {
2068 SemanticTierMode::Single => {}
2069 SemanticTierMode::FastOnly => {
2070 self.fast_only = true;
2071 }
2072 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => {
2073 self.quality = true;
2074 }
2075 }
2076 }
2077}
2078
2079struct ProgressiveTwoTierContext {
2080 context_token: Arc<()>,
2081 index: Arc<FsTwoTierIndex>,
2082 fast_embedder: Arc<dyn frankensearch::Embedder>,
2083 quality_embedder: Option<Arc<dyn frankensearch::Embedder>>,
2084}
2085
2086#[derive(Clone)]
2087struct SemanticCandidateContext {
2088 fs_semantic_index: Arc<FsVectorIndex>,
2089 fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
2090 filter_maps: SemanticFilterMaps,
2091 roles: Option<HashSet<u8>>,
2092}
2093
2094struct SemanticCandidateSearchRequest<'a> {
2095 fetch_limit: usize,
2096 approximate: bool,
2097 tier_mode: SemanticTierMode,
2098 in_memory_two_tier_index: Option<&'a Arc<FsInMemoryTwoTierIndex>>,
2099 ann_index: Option<&'a Arc<FsHnswIndex>>,
2100}
2101
/// Flags describing whether a candidate retrieval pass may need to be retried.
#[derive(Clone, Copy, Debug, Default)]
struct SemanticCandidateRetryState {
    /// More candidates exist beyond those fetched.
    has_more_candidates: bool,
    /// The exact window may have left out a competing candidate.
    exact_window_may_omit_competitor: bool,
}
2107
/// A query embedding together with the token of the semantic context that produced it.
struct SemanticQueryEmbedding {
    /// Token of the originating semantic context.
    context_token: Arc<()>,
    /// The embedding vector.
    vector: Vec<f32>,
}
2112
2113struct SharedCassSyncEmbedder {
2114 inner: Arc<dyn Embedder>,
2115 cache: Mutex<LruCache<String, Vec<f32>>>,
2116}
2117
2118impl SharedCassSyncEmbedder {
2119 fn new(inner: Arc<dyn Embedder>) -> Self {
2120 let cache_capacity =
2121 NonZeroUsize::new(PROGRESSIVE_EMBEDDING_CACHE_CAPACITY).expect("cache capacity > 0");
2122 Self {
2123 inner,
2124 cache: Mutex::new(LruCache::new(cache_capacity)),
2125 }
2126 }
2127}
2128
2129impl Embedder for SharedCassSyncEmbedder {
2130 fn embed_sync(&self, text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
2131 if let Ok(mut cache) = self.cache.lock()
2132 && let Some(embedding) = cache.get(text).cloned()
2133 {
2134 return Ok(embedding);
2135 }
2136
2137 let embedding = self.inner.embed_sync(text)?;
2138 if let Ok(mut cache) = self.cache.lock() {
2139 cache.put(text.to_owned(), embedding.clone());
2140 }
2141 Ok(embedding)
2142 }
2143
2144 fn embed_batch_sync(
2145 &self,
2146 texts: &[&str],
2147 ) -> crate::search::embedder::EmbedderResult<Vec<Vec<f32>>> {
2148 self.inner.embed_batch_sync(texts)
2149 }
2150
2151 fn dimension(&self) -> usize {
2152 self.inner.dimension()
2153 }
2154
2155 fn id(&self) -> &str {
2156 self.inner.id()
2157 }
2158
2159 fn model_name(&self) -> &str {
2160 self.inner.model_name()
2161 }
2162
2163 fn is_ready(&self) -> bool {
2164 self.inner.is_ready()
2165 }
2166
2167 fn is_semantic(&self) -> bool {
2168 self.inner.is_semantic()
2169 }
2170
2171 fn category(&self) -> frankensearch::ModelCategory {
2172 self.inner.category()
2173 }
2174
2175 fn tier(&self) -> frankensearch::ModelTier {
2176 self.inner.tier()
2177 }
2178
2179 fn supports_mrl(&self) -> bool {
2180 self.inner.supports_mrl()
2181 }
2182}
2183
2184fn build_in_memory_two_tier_index(
2185 ann_path: Option<PathBuf>,
2186 embedder_id: &str,
2187 tier_mode: SemanticTierMode,
2188) -> Option<Arc<FsInMemoryTwoTierIndex>> {
2189 let index_dir = ann_path
2190 .as_ref()
2191 .and_then(|path| path.parent().map(Path::to_path_buf));
2192 let Some(index_dir) = index_dir else {
2193 tracing::debug!("two-tier semantic unavailable: ann/index directory path missing");
2194 return None;
2195 };
2196
2197 match FsInMemoryTwoTierIndex::from_dir(&index_dir) {
2198 Ok(index) => return Some(Arc::new(index)),
2199 Err(err) => {
2200 tracing::debug!(
2201 dir = %index_dir.display(),
2202 error = %err,
2203 "two-tier semantic index load failed; considering fallback"
2204 );
2205 }
2206 }
2207
2208 if !matches!(tier_mode, SemanticTierMode::FastOnly) {
2209 return None;
2210 }
2211
2212 let fallback_fast = index_dir.join(format!("index-{embedder_id}.fsvi"));
2213 if !fallback_fast.is_file() {
2214 return None;
2215 }
2216
2217 match FsInMemoryVectorIndex::from_fsvi(&fallback_fast) {
2218 Ok(fast) => Some(Arc::new(FsInMemoryTwoTierIndex::new(fast, None))),
2219 Err(err) => {
2220 tracing::debug!(
2221 path = %fallback_fast.display(),
2222 error = %err,
2223 "fast-only semantic fallback index load failed"
2224 );
2225 None
2226 }
2227 }
2228}
2229
2230fn two_tier_index_supports_mode(
2231 index: &FsInMemoryTwoTierIndex,
2232 tier_mode: SemanticTierMode,
2233) -> bool {
2234 !matches!(
2235 tier_mode,
2236 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly
2237 ) || index.has_quality_index()
2238}
2239
/// A semantic doc id resolved for a hit, together with the message it refers to.
#[derive(Clone, Debug)]
struct ResolvedSemanticDocId {
    /// Id of the message.
    message_id: u64,
    /// Doc id string for that message.
    doc_id: String,
}

/// Composite key used to look up a resolved doc id.
/// NOTE(review): the meaning of the individual tuple positions is defined by the
/// code that builds these keys, which is outside this excerpt.
type ProgressiveLookupKey = (String, String, Option<i64>, String, i64, Option<i64>, u64);
/// Two-integer key for the exact lookup query (positions defined by the query builder).
type ProgressiveExactQueryKey = (i64, i64);
/// Key for the fallback lookup query (positions defined by the query builder).
type ProgressiveFallbackQueryKey = (String, String, i64);
/// One decoded lookup row: the lookup key and its resolved doc id, or `None`.
type ResolvedSemanticLookupRow = Option<(ProgressiveLookupKey, ResolvedSemanticDocId)>;
2250
2251#[derive(Debug, Clone)]
2252struct ProgressiveLexicalHit {
2253 title: String,
2254 snippet: String,
2255 content: String,
2256 content_hash: u64,
2257 conversation_id: Option<i64>,
2258 source_path: String,
2259 agent: String,
2260 workspace: String,
2261 workspace_original: Option<String>,
2262 created_at: Option<i64>,
2263 match_type: MatchType,
2264 line_number: Option<usize>,
2265 source_id: String,
2266 origin_kind: String,
2267 origin_host: Option<String>,
2268}
2269
2270impl ProgressiveLexicalHit {
2271 fn from_search_hit(hit: &SearchHit, field_mask: FieldMask) -> Self {
2272 Self {
2273 title: if field_mask.wants_title() {
2274 hit.title.clone()
2275 } else {
2276 String::new()
2277 },
2278 snippet: if field_mask.wants_snippet() {
2279 hit.snippet.clone()
2280 } else {
2281 String::new()
2282 },
2283 content: if field_mask.needs_content() {
2284 hit.content.clone()
2285 } else {
2286 String::new()
2287 },
2288 content_hash: hit.content_hash,
2289 conversation_id: hit.conversation_id,
2290 source_path: hit.source_path.clone(),
2291 agent: hit.agent.clone(),
2292 workspace: hit.workspace.clone(),
2293 workspace_original: hit.workspace_original.clone(),
2294 created_at: hit.created_at,
2295 match_type: hit.match_type,
2296 line_number: hit.line_number,
2297 source_id: hit.source_id.clone(),
2298 origin_kind: hit.origin_kind.clone(),
2299 origin_host: hit.origin_host.clone(),
2300 }
2301 }
2302
2303 fn to_search_hit(&self, score: f32) -> SearchHit {
2304 SearchHit {
2305 title: self.title.clone(),
2306 snippet: self.snippet.clone(),
2307 content: self.content.clone(),
2308 content_hash: self.content_hash,
2309 conversation_id: self.conversation_id,
2310 score,
2311 source_path: self.source_path.clone(),
2312 agent: self.agent.clone(),
2313 workspace: self.workspace.clone(),
2314 workspace_original: self.workspace_original.clone(),
2315 created_at: self.created_at,
2316 line_number: self.line_number,
2317 match_type: self.match_type,
2318 source_id: self.source_id.clone(),
2319 origin_kind: self.origin_kind.clone(),
2320 origin_host: self.origin_host.clone(),
2321 }
2322 }
2323}
2324
2325#[derive(Debug, Default)]
2326struct ProgressiveLexicalCache {
2327 hits_by_message: HashMap<u64, ProgressiveLexicalHit>,
2328 wildcard_fallback: bool,
2329 suggestions: Vec<QuerySuggestion>,
2330}
2331
2332#[derive(Clone, Copy)]
2333struct ProgressivePhaseContext<'a> {
2334 query: &'a str,
2335 filters: &'a SearchFilters,
2336 field_mask: FieldMask,
2337 lexical_cache: Option<&'a ProgressiveLexicalCache>,
2338 limit: usize,
2339 fetch_limit: usize,
2340}
2341
2342type ProgressiveLexicalSnapshot = Arc<ProgressiveLexicalCache>;
2343
2344struct CassProgressiveLexicalAdapter {
2345 client: Arc<SearchClient>,
2346 filters: SearchFilters,
2347 field_mask: FieldMask,
2348 sparse_threshold: usize,
2349 shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
2350}
2351
2352impl CassProgressiveLexicalAdapter {
2353 fn new(
2354 client: Arc<SearchClient>,
2355 filters: SearchFilters,
2356 field_mask: FieldMask,
2357 sparse_threshold: usize,
2358 shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
2359 ) -> Self {
2360 Self {
2361 client,
2362 filters,
2363 field_mask,
2364 sparse_threshold,
2365 shared,
2366 }
2367 }
2368}
2369
2370impl FsLexicalSearch for CassProgressiveLexicalAdapter {
2371 fn search<'a>(
2372 &'a self,
2373 cx: &'a FsCx,
2374 query: &'a str,
2375 limit: usize,
2376 ) -> FsSearchFuture<'a, Vec<FsScoredResult>> {
2377 Box::pin(async move {
2378 if cx.is_cancel_requested() {
2379 return Err(FsSearchError::Cancelled {
2380 phase: "lexical".to_string(),
2381 reason: "cancel requested".to_string(),
2382 });
2383 }
2384
2385 let result = self
2386 .client
2387 .search_with_fallback(
2388 query,
2389 self.filters.clone(),
2390 limit,
2391 0,
2392 self.sparse_threshold,
2393 self.field_mask,
2394 )
2395 .map_err(|err| FsSearchError::SubsystemError {
2396 subsystem: "cass_lexical_adapter",
2397 source: Box::new(std::io::Error::other(err.to_string())),
2398 })?;
2399
2400 let resolved = self
2401 .client
2402 .resolve_semantic_doc_ids_for_hits(&result.hits)
2403 .map_err(|err| FsSearchError::SubsystemError {
2404 subsystem: "cass_lexical_adapter",
2405 source: Box::new(std::io::Error::other(err.to_string())),
2406 })?;
2407
2408 let mut scored = Vec::with_capacity(result.hits.len());
2409 let mut hits_by_message = HashMap::with_capacity(result.hits.len());
2410
2411 for (hit, resolved_doc) in result.hits.iter().zip(resolved) {
2412 let Some(resolved_doc) = resolved_doc else {
2413 continue;
2414 };
2415 hits_by_message
2416 .entry(resolved_doc.message_id)
2417 .or_insert_with(|| {
2418 ProgressiveLexicalHit::from_search_hit(hit, self.field_mask)
2419 });
2420 scored.push(FsScoredResult {
2421 doc_id: resolved_doc.doc_id,
2422 score: hit.score,
2423 source: FsScoreSource::Lexical,
2424 index: None,
2425 fast_score: None,
2426 quality_score: None,
2427 lexical_score: Some(hit.score),
2428 rerank_score: None,
2429 explanation: None,
2430 metadata: None,
2431 });
2432 }
2433
2434 if let Ok(mut guard) = self.shared.lock() {
2435 *guard = Arc::new(ProgressiveLexicalCache {
2436 hits_by_message,
2437 wildcard_fallback: result.wildcard_fallback,
2438 suggestions: result.suggestions,
2439 });
2440 }
2441
2442 Ok(scored)
2443 })
2444 }
2445
2446 fn index_document<'a>(
2447 &'a self,
2448 _cx: &'a FsCx,
2449 _doc: &'a frankensearch::IndexableDocument,
2450 ) -> FsSearchFuture<'a, ()> {
2451 Box::pin(async move {
2452 Err(FsSearchError::SubsystemError {
2453 subsystem: "cass_lexical_adapter",
2454 source: Box::new(std::io::Error::other("cass lexical adapter is read-only")),
2455 })
2456 })
2457 }
2458
2459 fn commit<'a>(&'a self, _cx: &'a FsCx) -> FsSearchFuture<'a, ()> {
2460 Box::pin(async move { Ok(()) })
2461 }
2462
2463 fn doc_count(&self) -> usize {
2464 self.client.total_docs()
2465 }
2466}
2467
2468pub struct SearchClient {
2469 reader: Option<(IndexReader, FsCassFields)>,
2470 sqlite: Mutex<Option<SendConnection>>,
2471 sqlite_path: Option<PathBuf>,
2472 prefix_cache: Mutex<CacheShards>,
2473 reload_on_search: bool,
2474 last_reload: Mutex<Option<Instant>>,
2475 last_generation: Mutex<Option<u64>>,
2476 reload_epoch: Arc<AtomicU64>,
2477 warm_tx: Option<mpsc::Sender<WarmJob>>,
2478 _warm_handle: Option<std::thread::JoinHandle<()>>,
2479 metrics: Metrics,
2480 cache_namespace: String,
2481 semantic: Mutex<Option<SemanticSearchState>>,
2482 last_tantivy_total_count: Mutex<Option<usize>>,
2486}
2487
/// Switches controlling optional `SearchClient` behaviour.
#[derive(Clone, Copy, Debug)]
pub struct SearchClientOptions {
    /// Enables reloading.
    pub enable_reload: bool,
    /// Enables warming.
    pub enable_warm: bool,
}

impl Default for SearchClientOptions {
    /// Both reload and warm are enabled by default.
    fn default() -> Self {
        let enabled = true;
        Self {
            enable_reload: enabled,
            enable_warm: enabled,
        }
    }
}
2502
2503impl Drop for SearchClient {
2504 fn drop(&mut self) {
2505 FEDERATED_SEARCH_READERS
2506 .write()
2507 .remove(&self.cache_namespace);
2508 }
2509}
2510
/// Snapshot of cache and reload counters.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct CacheStats {
    /// Number of cache hits.
    pub cache_hits: u64,
    /// Number of cache misses.
    pub cache_miss: u64,
    /// Number of cache shortfalls.
    pub cache_shortfall: u64,
    /// Number of reloads.
    pub reloads: u64,
    /// Cumulative reload time in milliseconds.
    pub reload_ms_total: u128,
    /// Total capacity.
    pub total_cap: usize,
    /// Total cost currently accounted.
    pub total_cost: usize,
    /// Number of evictions.
    pub eviction_count: u64,
    /// Approximate size in bytes.
    pub approx_bytes: usize,
    /// Byte cap.
    pub byte_cap: usize,
    /// Name of the eviction policy (`"unknown"` by default).
    pub eviction_policy: &'static str,
    /// Number of ghost entries.
    pub ghost_entries: usize,
    /// Number of admission rejects.
    pub admission_rejects: u64,
    /// Number of prewarm jobs scheduled.
    pub prewarm_scheduled: u64,
    /// Number of prewarm jobs skipped under pressure.
    pub prewarm_skipped_pressure: u64,
    /// Reader generation, when known.
    pub reader_generation: Option<u64>,
}

impl Default for CacheStats {
    /// All counters zero, eviction policy `"unknown"`, no reader generation.
    fn default() -> Self {
        Self {
            // Hit/miss accounting.
            cache_hits: 0,
            cache_miss: 0,
            cache_shortfall: 0,
            // Reload accounting.
            reloads: 0,
            reload_ms_total: 0,
            reader_generation: None,
            // Capacity and size.
            total_cap: 0,
            total_cost: 0,
            approx_bytes: 0,
            byte_cap: 0,
            // Eviction and admission.
            eviction_count: 0,
            eviction_policy: "unknown",
            ghost_entries: 0,
            admission_rejects: 0,
            // Prewarm.
            prewarm_scheduled: 0,
            prewarm_skipped_pressure: 0,
        }
    }
}
2562
2563static CACHE_SHARD_CAP: Lazy<usize> = Lazy::new(|| {
2566 dotenvy::var("CASS_CACHE_SHARD_CAP")
2567 .ok()
2568 .and_then(|v| v.parse::<usize>().ok())
2569 .filter(|v| *v > 0)
2570 .unwrap_or(256)
2571});
2572
2573static CACHE_TOTAL_CAP: Lazy<usize> = Lazy::new(|| {
2575 dotenvy::var("CASS_CACHE_TOTAL_CAP")
2576 .ok()
2577 .and_then(|v| v.parse::<usize>().ok())
2578 .filter(|v| *v > 0)
2579 .unwrap_or(2048)
2580});
2581
2582static CACHE_DEBUG_ENABLED: Lazy<bool> = Lazy::new(|| {
2583 dotenvy::var("CASS_DEBUG_CACHE_METRICS")
2584 .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
2585 .unwrap_or(false)
2586});
2587
2588static CACHE_BYTE_CAP: Lazy<usize> = Lazy::new(|| match dotenvy::var("CASS_CACHE_BYTE_CAP") {
2591 Ok(value) => cache_byte_cap_from_env_value(Some(&value), available_memory_bytes()),
2592 Err(_) => default_cache_byte_cap(),
2593});
2594
2595static CACHE_EVICTION_POLICY: Lazy<CacheEvictionPolicy> = Lazy::new(|| {
2596 cache_eviction_policy_from_env_value(dotenvy::var("CASS_CACHE_EVICTION_POLICY").ok().as_deref())
2597});
2598
/// Byte cap used when available memory is unknown; also the lower bound of
/// the memory-derived default (64 MiB).
const DEFAULT_CACHE_BYTE_CAP_FALLBACK: usize = 64 * 1024 * 1024;
/// The memory-derived default is `available / 128`.
const DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR: u64 = 128;
/// Upper bound of the memory-derived default (2 GiB).
const DEFAULT_CACHE_BYTE_CAP_CEILING: u64 = 2 * 1024 * 1024 * 1024;
/// The ghost list may hold up to `total_cap * 2` keys.
const S3_FIFO_GHOST_CAP_MULTIPLIER: usize = 2;
/// An entry counts as "large" when it exceeds `ceil(cap / 4)` in cost or bytes.
const S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR: usize = 4;
/// Entry pressure is reported once cost reaches 9/10 of the cost cap.
const PREWARM_ENTRY_PRESSURE_NUMERATOR: usize = 9;
const PREWARM_ENTRY_PRESSURE_DENOMINATOR: usize = 10;
/// Byte pressure is reported once bytes reach 4/5 of the byte cap.
const PREWARM_BYTE_PRESSURE_NUMERATOR: usize = 4;
const PREWARM_BYTE_PRESSURE_DENOMINATOR: usize = 5;

/// Version tag embedded in every client's cache namespace.
const CACHE_KEY_VERSION: &str = "1";
2610
2611static WARM_DEBOUNCE_MS: Lazy<u64> = Lazy::new(|| {
2613 dotenvy::var("CASS_WARM_DEBOUNCE_MS")
2614 .ok()
2615 .and_then(|v| v.parse::<u64>().ok())
2616 .filter(|v| *v > 0)
2617 .unwrap_or(120)
2618});
2619
2620fn default_cache_byte_cap() -> usize {
2621 default_cache_byte_cap_for_available(available_memory_bytes())
2622}
2623
2624fn cache_byte_cap_from_env_value(value: Option<&str>, available_bytes: Option<u64>) -> usize {
2625 let Some(raw) = value else {
2626 return default_cache_byte_cap_for_available(available_bytes);
2627 };
2628 raw.parse::<usize>()
2629 .unwrap_or_else(|_| default_cache_byte_cap_for_available(available_bytes))
2630}
2631
2632fn default_cache_byte_cap_for_available(available_bytes: Option<u64>) -> usize {
2633 let Some(available_bytes) = available_bytes else {
2634 return DEFAULT_CACHE_BYTE_CAP_FALLBACK;
2635 };
2636 let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
2637 let budget = available_bytes / DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR;
2638 let budget = budget.min(DEFAULT_CACHE_BYTE_CAP_CEILING);
2639 let budget = usize::try_from(budget).unwrap_or(ceiling);
2640 budget.clamp(DEFAULT_CACHE_BYTE_CAP_FALLBACK, ceiling)
2641}
2642
/// Eviction/admission strategy used by `CacheShards`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CacheEvictionPolicy {
    /// Admit everything; no ghost bookkeeping.
    Lru,
    /// Refuse first-time "large" entries and remember evicted/refused keys in
    /// a ghost list so a repeat insert is admitted.
    S3Fifo,
}
2648
2649impl CacheEvictionPolicy {
2650 fn label(self) -> &'static str {
2651 match self {
2652 CacheEvictionPolicy::Lru => "lru",
2653 CacheEvictionPolicy::S3Fifo => "s3-fifo",
2654 }
2655 }
2656}
2657
2658fn cache_eviction_policy_from_env_value(value: Option<&str>) -> CacheEvictionPolicy {
2659 match value.map(str::trim).filter(|value| !value.is_empty()) {
2660 Some(value) if value.eq_ignore_ascii_case("s3-fifo") => CacheEvictionPolicy::S3Fifo,
2661 Some(value) if value.eq_ignore_ascii_case("s3fifo") => CacheEvictionPolicy::S3Fifo,
2662 Some(value) if value.eq_ignore_ascii_case("s3_fifo") => CacheEvictionPolicy::S3Fifo,
2663 _ => CacheEvictionPolicy::Lru,
2664 }
2665}
2666
/// A search hit stored in the prefix cache together with precomputed match
/// helpers.
#[derive(Clone)]
struct CachedHit {
    /// The hit as it was returned to the caller.
    hit: SearchHit,
    /// NOTE(review): presumably a lowercased copy of the hit content used for
    /// cached matching; the code that fills it is outside this section.
    lc_content: String,
    /// NOTE(review): presumably a lowercased copy of the title, when present.
    lc_title: Option<String>,
    /// NOTE(review): presumably a 64-bit bloom signature of the hit's tokens;
    /// confirm against the code that builds it.
    bloom64: u64,
}
2674
2675impl CachedHit {
2676 fn approx_bytes(&self) -> usize {
2679 let base = std::mem::size_of::<Self>();
2681 let hit_strings = self.hit.title.len()
2683 + self.hit.snippet.len()
2684 + self.hit.content.len()
2685 + self.hit.source_path.len()
2686 + self.hit.agent.len()
2687 + self.hit.workspace.len()
2688 + self
2689 .hit
2690 .workspace_original
2691 .as_ref()
2692 .map_or(0, std::string::String::len)
2693 + self.hit.source_id.len()
2694 + self.hit.origin_kind.len()
2695 + self
2696 .hit
2697 .origin_host
2698 .as_ref()
2699 .map_or(0, std::string::String::len);
2700 let lc_strings =
2702 self.lc_content.len() + self.lc_title.as_ref().map_or(0, std::string::String::len);
2703 base + hit_strings + lc_strings
2704 }
2705}
2706
/// Named LRU shards with cache-wide cost/byte accounting and optional
/// S3-FIFO style admission control.
struct CacheShards {
    /// One LRU per shard name; each maps a cache key to its hits.
    shards: HashMap<Arc<str>, LruCache<Arc<str>, Vec<CachedHit>>>,
    /// Maximum summed cost (number of cached hits) across all shards.
    total_cap: usize,
    /// Current summed cost across all shards.
    total_cost: usize,
    /// Number of entries evicted so far.
    eviction_count: u64,
    /// Current summed approximate bytes across all shards.
    total_bytes: usize,
    /// Maximum summed bytes; `0` disables the byte limit.
    byte_cap: usize,
    /// Active eviction/admission policy.
    policy: CacheEvictionPolicy,
    /// Ghost keys in insertion order (oldest at the front).
    ghost_keys: VecDeque<Arc<str>>,
    /// Set view of `ghost_keys` for O(1) membership checks.
    ghost_set: HashSet<Arc<str>>,
    /// Number of inserts refused by admission control.
    admission_rejects: u64,
}
2725
2726impl CacheShards {
2727 fn new(total_cap: usize, byte_cap: usize) -> Self {
2728 Self::new_with_policy(total_cap, byte_cap, *CACHE_EVICTION_POLICY)
2729 }
2730
2731 fn new_with_policy(total_cap: usize, byte_cap: usize, policy: CacheEvictionPolicy) -> Self {
2732 Self {
2733 shards: HashMap::new(),
2734 total_cap: total_cap.max(1),
2735 total_cost: 0,
2736 eviction_count: 0,
2737 total_bytes: 0,
2738 byte_cap,
2739 policy,
2740 ghost_keys: VecDeque::new(),
2741 ghost_set: HashSet::new(),
2742 admission_rejects: 0,
2743 }
2744 }
2745
2746 fn shard_mut(&mut self, name: &str) -> &mut LruCache<Arc<str>, Vec<CachedHit>> {
2747 let interned_name = intern_cache_key(name);
2749 self.shards
2750 .entry(interned_name)
2751 .or_insert_with(|| LruCache::new(NonZeroUsize::new(*CACHE_SHARD_CAP).unwrap()))
2752 }
2753
2754 fn shard_opt(&self, name: &str) -> Option<&LruCache<Arc<str>, Vec<CachedHit>>> {
2755 self.shards.get(name)
2757 }
2758
2759 fn put(&mut self, shard_name: &str, key: Arc<str>, value: Vec<CachedHit>) {
2760 let new_cost = value.len();
2761 let new_bytes: usize = value.iter().map(CachedHit::approx_bytes).sum();
2762 let replacing = self
2763 .shard_opt(shard_name)
2764 .is_some_and(|shard| shard.contains(&key));
2765
2766 if !replacing && !self.should_admit(&key, new_cost, new_bytes) {
2767 self.admission_rejects += 1;
2768 self.record_ghost(key);
2769 return;
2770 }
2771
2772 self.remove_ghost(&key);
2773
2774 let shard = self.shard_mut(shard_name);
2775 let old_val = shard.put(key, value);
2776 let (old_cost, old_bytes) = old_val.as_ref().map_or((0, 0), |v| {
2777 (v.len(), v.iter().map(CachedHit::approx_bytes).sum())
2778 });
2779
2780 self.total_cost = self
2781 .total_cost
2782 .saturating_add(new_cost)
2783 .saturating_sub(old_cost);
2784 self.total_bytes = self
2785 .total_bytes
2786 .saturating_add(new_bytes)
2787 .saturating_sub(old_bytes);
2788 self.evict_until_within_cap();
2789 }
2790
2791 fn evict_until_within_cap(&mut self) {
2792 while self.total_cost > self.total_cap
2794 || (self.byte_cap > 0 && self.total_bytes > self.byte_cap)
2795 {
2796 let byte_pressure = self.byte_cap > 0 && self.total_bytes > self.byte_cap;
2801 let mut largest_shard_key = None;
2802 let mut max_score = 0usize;
2803 for (k, v) in self.shards.iter() {
2804 let score = if byte_pressure {
2805 shard_cached_bytes(v)
2806 } else {
2807 v.len()
2808 };
2809 if score > max_score {
2810 max_score = score;
2811 largest_shard_key = Some(k.clone());
2812 }
2813 }
2814
2815 if let Some(key) = largest_shard_key {
2816 if let Some(shard) = self.shards.get_mut(&key)
2817 && let Some((evicted_key, v)) = shard.pop_lru()
2818 {
2819 let evicted_bytes: usize = v.iter().map(CachedHit::approx_bytes).sum();
2820 self.total_cost = self.total_cost.saturating_sub(v.len());
2821 self.total_bytes = self.total_bytes.saturating_sub(evicted_bytes);
2822 self.eviction_count += 1;
2823 self.record_ghost(evicted_key);
2824 }
2825 } else {
2826 break; }
2828 }
2829 }
2830
2831 fn should_admit(&self, key: &Arc<str>, cost: usize, bytes: usize) -> bool {
2832 if self.policy == CacheEvictionPolicy::Lru || self.ghost_set.contains(key) {
2833 return true;
2834 }
2835 !self.is_s3_fifo_large_candidate(cost, bytes)
2836 }
2837
2838 fn is_s3_fifo_large_candidate(&self, cost: usize, bytes: usize) -> bool {
2839 let entry_heavy = cost
2840 > self
2841 .total_cap
2842 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2843 let byte_heavy = self.byte_cap > 0
2844 && bytes
2845 > self
2846 .byte_cap
2847 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2848 entry_heavy || byte_heavy
2849 }
2850
2851 fn record_ghost(&mut self, key: Arc<str>) {
2852 if self.policy != CacheEvictionPolicy::S3Fifo {
2853 return;
2854 }
2855 if self.ghost_set.insert(key.clone()) {
2856 self.ghost_keys.push_back(key);
2857 }
2858 let cap = self
2859 .total_cap
2860 .saturating_mul(S3_FIFO_GHOST_CAP_MULTIPLIER)
2861 .max(1);
2862 while self.ghost_set.len() > cap {
2863 if let Some(old) = self.ghost_keys.pop_front() {
2864 self.ghost_set.remove(&old);
2865 } else {
2866 break;
2867 }
2868 }
2869 }
2870
2871 fn remove_ghost(&mut self, key: &Arc<str>) {
2872 self.ghost_set.remove(key);
2873 self.ghost_keys.retain(|candidate| candidate != key);
2874 }
2875
2876 fn clear(&mut self) {
2877 self.shards.clear();
2878 self.total_cost = 0;
2879 self.total_bytes = 0;
2880 self.ghost_keys.clear();
2881 self.ghost_set.clear();
2882 }
2884
2885 fn total_cost(&self) -> usize {
2886 self.total_cost
2887 }
2888
2889 fn total_cap(&self) -> usize {
2890 self.total_cap
2891 }
2892
2893 fn eviction_count(&self) -> u64 {
2894 self.eviction_count
2895 }
2896
2897 fn total_bytes(&self) -> usize {
2898 self.total_bytes
2899 }
2900
2901 fn byte_cap(&self) -> usize {
2902 self.byte_cap
2903 }
2904
2905 fn policy_label(&self) -> &'static str {
2906 self.policy.label()
2907 }
2908
2909 fn ghost_entries(&self) -> usize {
2910 self.ghost_set.len()
2911 }
2912
2913 fn admission_rejects(&self) -> u64 {
2914 self.admission_rejects
2915 }
2916
2917 fn prewarm_pressure(&self) -> bool {
2918 let entry_pressure = self
2919 .total_cost
2920 .saturating_mul(PREWARM_ENTRY_PRESSURE_DENOMINATOR)
2921 >= self
2922 .total_cap
2923 .saturating_mul(PREWARM_ENTRY_PRESSURE_NUMERATOR);
2924 let byte_pressure = self.byte_cap > 0
2925 && self
2926 .total_bytes
2927 .saturating_mul(PREWARM_BYTE_PRESSURE_DENOMINATOR)
2928 >= self
2929 .byte_cap
2930 .saturating_mul(PREWARM_BYTE_PRESSURE_NUMERATOR);
2931 entry_pressure || byte_pressure
2932 }
2933}
2934
2935fn shard_cached_bytes(shard: &LruCache<Arc<str>, Vec<CachedHit>>) -> usize {
2936 shard
2937 .iter()
2938 .map(|(_key, hits)| hits.iter().map(CachedHit::approx_bytes).sum::<usize>())
2939 .sum()
2940}
2941
/// Message type carried by the warm-worker channel (`warm_tx`).
#[derive(Clone)]
struct WarmJob {
    /// Query text for the job.
    query: String,
    /// NOTE(review): presumably a stable fingerprint of the active filters;
    /// the producer is outside this section.
    filters_fingerprint: String,
    /// NOTE(review): presumably the cache shard the job relates to.
    shard_name: String,
}
2948
/// Outcome of the adaptive prewarm decision.
///
/// NOTE(review): the deciding code is outside this section; variant notes are
/// inferred from the names and should be confirmed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AdaptivePrewarmDecision {
    /// A prewarm job should be scheduled.
    Schedule,
    /// Skipped; presumably because the query is not considered hot.
    SkipCold,
    /// Skipped; presumably because the cache reports pressure.
    SkipPressure,
}
2955
/// Value held in the `THREAD_SEARCHER` thread-local slot.
#[derive(Clone)]
struct SearcherCacheEntry {
    /// NOTE(review): presumably the reload epoch the searcher was created
    /// under; confirm against the code that fills this entry.
    epoch: u64,
    /// NOTE(review): presumably identifies which reader produced the searcher.
    reader_key: usize,
    /// The cached searcher.
    searcher: Searcher,
}
2962
thread_local! {
    // Per-thread slot holding at most one cached searcher entry; starts empty.
    static THREAD_SEARCHER: RefCell<Option<SearcherCacheEntry>> = const { RefCell::new(None) };
}
2966
/// One reader of a federated index together with its field handles.
#[derive(Clone)]
struct FederatedIndexReader {
    /// Reader for this part of the federated index.
    reader: IndexReader,
    /// Field handles matching `reader`.
    fields: FsCassFields,
}
2972
/// Process-wide registry of federated readers keyed by a client's cache
/// namespace; entries are added in `SearchClient::open_with_options` and
/// removed when the client is dropped.
static FEDERATED_SEARCH_READERS: Lazy<RwLock<HashMap<String, Arc<Vec<FederatedIndexReader>>>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
/// Source of unique client ids (starting at 1) embedded in cache namespaces.
static SEARCH_CLIENT_INSTANCE_COUNTER: AtomicU64 = AtomicU64::new(1);
2976
/// Edit distance between `a` and `b`, counted in Unicode scalar values.
///
/// Insertions, deletions and substitutions each cost 1. Only two rows of the
/// dynamic-programming table are kept at any time.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    // An empty side means the distance is just the other side's length.
    match (source.len(), target.len()) {
        (0, remaining) => return remaining,
        (remaining, 0) => return remaining,
        _ => {}
    }

    let width = target.len();
    let mut above: Vec<usize> = (0..=width).collect();
    let mut row: Vec<usize> = vec![0; width + 1];

    for (row_idx, &from) in source.iter().enumerate() {
        row[0] = row_idx + 1;
        for col in 1..=width {
            let deletion = above[col] + 1;
            let insertion = row[col - 1] + 1;
            let substitution = above[col - 1] + usize::from(from != target[col - 1]);
            row[col] = deletion.min(insertion).min(substitution);
        }
        // The row just filled becomes the "previous" row for the next pass.
        std::mem::swap(&mut above, &mut row);
    }

    above[width]
}
3009
3010fn normalize_term_parts(raw: &str) -> Vec<String> {
3015 let mut parts = Vec::new();
3016 for token in nfc_sanitize_query(raw).split_whitespace() {
3017 let mut current = String::new();
3018 let mut chars = token.chars().peekable();
3019 while let Some(ch) = chars.next() {
3020 let trailing_wildcard = ch == '*' && chars.peek().is_none() && !current.is_empty();
3021 if ch.is_alphanumeric() || ch == '_' || trailing_wildcard {
3022 current.push(ch);
3023 continue;
3024 }
3025
3026 if !current.is_empty() {
3027 parts.push(std::mem::take(&mut current));
3028 }
3029 }
3030
3031 if !current.is_empty() {
3032 parts.push(current);
3033 }
3034 }
3035 parts
3036}
3037
3038fn normalize_phrase_terms(raw: &str) -> Vec<String> {
3040 normalize_term_parts(raw)
3041 .into_iter()
3042 .map(|s| s.trim_matches('*').to_lowercase())
3043 .filter(|s| !s.is_empty())
3044 .collect()
3045}
3046
3047fn render_fts5_term_part(part: &str) -> Option<String> {
3048 let pattern = FsCassWildcardPattern::parse(part);
3049 if matches!(
3050 pattern,
3051 FsCassWildcardPattern::Suffix(_)
3052 | FsCassWildcardPattern::Substring(_)
3053 | FsCassWildcardPattern::Complex(_)
3054 ) {
3055 return None;
3056 }
3057
3058 Some(part.to_string())
3059}
3060
3061fn dominant_match_type(query: &str) -> MatchType {
3064 let mut worst = MatchType::Exact;
3065 for term in query.split_whitespace() {
3066 let pattern = FsCassWildcardPattern::parse(term);
3067 let mt = match pattern {
3068 FsCassWildcardPattern::Exact(_) => MatchType::Exact,
3069 FsCassWildcardPattern::Prefix(_) => MatchType::Prefix,
3070 FsCassWildcardPattern::Suffix(_) => MatchType::Suffix,
3071 FsCassWildcardPattern::Substring(_) => MatchType::Substring,
3072 FsCassWildcardPattern::Complex(_) => MatchType::Wildcard,
3073 };
3074 if mt.quality_factor() < worst.quality_factor() {
3076 worst = mt;
3077 }
3078 }
3079 worst
3080}
3081
/// Reports whether `content` is a bare tool-invocation marker.
///
/// * Text starting with `[Tool:` is noise when the marker is unterminated, or
///   when both the marker's name and everything after `]` are blank.
/// * Otherwise, text shorter than 20 bytes is noise when it starts
///   (case-insensitively) with `[tool` or `tool:`.
pub(crate) fn is_tool_invocation_noise(content: &str) -> bool {
    let text = content.trim();

    if let Some(rest) = text.strip_prefix("[Tool:") {
        return match rest.split_once(']') {
            // No closing bracket at all.
            None => true,
            // Real trailing text or a real tool name makes it meaningful.
            Some((name, trailing)) => trailing.trim().is_empty() && name.trim().is_empty(),
        };
    }

    if text.len() >= 20 {
        return false;
    }
    let lowered = text.to_lowercase();
    lowered.starts_with("[tool") || lowered.starts_with("tool:")
}
3118
3119fn hit_content_for_noise_check(hit: &SearchHit) -> &str {
3120 if hit.content.is_empty() {
3121 &hit.snippet
3122 } else {
3123 &hit.content
3124 }
3125}
3126
3127fn hit_is_noise(hit: &SearchHit, query: &str) -> bool {
3128 let content_to_check = hit_content_for_noise_check(hit);
3129 if content_to_check.is_empty() {
3139 return false;
3140 }
3141 is_search_noise_text(content_to_check, query) || is_tool_invocation_noise(content_to_check)
3142}
3143
/// Builds a preview of `content`: the trimmed text, cut to its first 200
/// characters with `...` appended when anything was cut.
fn snippet_from_content(content: &str) -> String {
    let text = content.trim();
    // The 201st character exists only when the text is longer than 200
    // characters; its byte offset is exactly where the preview ends.
    match text.char_indices().nth(200) {
        Some((cut, _)) => format!("{}...", &text[..cut]),
        None => text.to_string(),
    }
}
3154
/// Test-only convenience: deduplicates `hits` with an empty query.
#[cfg(test)]
pub(crate) fn deduplicate_hits(hits: Vec<SearchHit>) -> Vec<SearchHit> {
    let no_query = "";
    deduplicate_hits_with_query(hits, no_query)
}
3166
3167pub(crate) fn deduplicate_hits_with_query(hits: Vec<SearchHit>, query: &str) -> Vec<SearchHit> {
3168 let mut source_ids: HashMap<String, u32> = HashMap::new();
3175 let mut path_ids: HashMap<String, u32> = HashMap::new();
3176 let mut title_ids: HashMap<String, u32> = HashMap::new();
3177 let mut next_source_id: u32 = 0;
3178 let mut next_path_id: u32 = 0;
3179 let mut next_title_id: u32 = 0;
3180 type DedupKey = (
3181 u32,
3182 u32,
3183 Option<i64>,
3184 Option<u32>,
3185 Option<usize>,
3186 Option<i64>,
3187 u64,
3188 );
3189
3190 let mut seen: HashMap<DedupKey, usize> = HashMap::new();
3191 let mut deduped: Vec<SearchHit> = Vec::new();
3192
3193 for hit in hits {
3194 if hit_is_noise(&hit, query) {
3195 continue;
3196 }
3197
3198 let normalized_source_id = normalized_search_hit_source_id(&hit);
3201 let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
3202 *id
3203 } else {
3204 let id = next_source_id;
3205 next_source_id = next_source_id.saturating_add(1);
3206 source_ids.insert(normalized_source_id, id);
3207 id
3208 };
3209 let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
3210 *id
3211 } else {
3212 let id = next_path_id;
3213 next_path_id = next_path_id.saturating_add(1);
3214 path_ids.insert(hit.source_path.clone(), id);
3215 id
3216 };
3217 let title_key = if hit.conversation_id.is_some() {
3218 None
3219 } else {
3220 let normalized_title = hit.title.trim();
3221 Some(if let Some(id) = title_ids.get(normalized_title) {
3222 *id
3223 } else {
3224 let id = next_title_id;
3225 next_title_id = next_title_id.saturating_add(1);
3226 title_ids.insert(normalized_title.to_string(), id);
3227 id
3228 })
3229 };
3230 let key = (
3231 source_key,
3232 path_key,
3233 hit.conversation_id,
3234 title_key,
3235 hit.line_number,
3236 hit.created_at,
3237 hit.content_hash,
3238 );
3239
3240 if let Some(&existing_idx) = seen.get(&key) {
3241 if deduped[existing_idx].score < hit.score {
3243 deduped[existing_idx] = hit;
3244 }
3245 } else {
3247 seen.insert(key, deduped.len());
3248 deduped.push(hit);
3249 }
3250 }
3251
3252 deduped
3253}
3254
/// Decides whether a sparse result set warrants a wildcard retry.
///
/// Only first pages (`offset == 0`) qualify. The result is sparse when fewer
/// hits came back than `sparse_threshold`, which is capped at `limit` unless
/// `limit` is 0.
fn should_try_wildcard_fallback(
    returned_hits: usize,
    limit: usize,
    offset: usize,
    sparse_threshold: usize,
) -> bool {
    let threshold = match limit {
        0 => sparse_threshold,
        capped => sparse_threshold.min(capped),
    };
    offset == 0 && returned_hits < threshold
}
3273
3274fn should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(
3275 query: &str,
3276 returned_hits: usize,
3277) -> bool {
3278 if returned_hits != 0 {
3279 return false;
3280 }
3281
3282 for token in normalize_phrase_terms(query) {
3283 if token.chars().count() > AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS {
3284 return true;
3285 }
3286 }
3287
3288 false
3289}
3290
3291fn snippet_from_preview_without_full_content(
3292 field_mask: FieldMask,
3293 stored_preview: &str,
3294 query: &str,
3295) -> Option<String> {
3296 if field_mask.needs_content() || !field_mask.wants_snippet() || stored_preview.is_empty() {
3297 return None;
3298 }
3299
3300 cached_prefix_snippet(stored_preview, query, 160)
3301}
3302
/// True when the stored preview is non-empty and does not end with the
/// ellipsis character `…`.
fn stored_preview_is_complete_content(stored_preview: &str) -> bool {
    if stored_preview.is_empty() {
        return false;
    }
    !stored_preview.ends_with('…')
}
3308
3309impl SearchClient {
3310 pub fn open(index_path: &Path, db_path: Option<&Path>) -> Result<Option<Self>> {
3311 Self::open_with_options(index_path, db_path, SearchClientOptions::default())
3312 }
3313
    /// Opens a search client over `index_path` and, optionally, a SQLite
    /// database at `db_path`.
    ///
    /// Backends are probed in this order: the single Tantivy reader, then (only
    /// if that failed) federated readers. Failures to open either are turned
    /// into "not available" rather than returned as errors. The SQLite path is
    /// kept only when the file exists; the connection itself is opened later.
    ///
    /// Returns `Ok(None)` when no backend at all is available.
    pub fn open_with_options(
        index_path: &Path,
        db_path: Option<&Path>,
        options: SearchClientOptions,
    ) -> Result<Option<Self>> {
        let tantivy = fs_cass_open_search_reader(index_path, ReloadPolicy::Manual).ok();
        // Every client gets a unique id so its cache namespace (and registry
        // key) never collides with another client on the same index.
        let client_id = SEARCH_CLIENT_INSTANCE_COUNTER.fetch_add(1, Ordering::Relaxed);
        let cache_namespace = format!(
            "v{}|schema:{}|client:{}|index:{}",
            CACHE_KEY_VERSION,
            FS_CASS_SCHEMA_HASH,
            client_id,
            index_path.display()
        );
        // Federated readers are only attempted when the single reader is
        // unavailable; an empty reader list counts as unavailable.
        let federated_readers = if tantivy.is_none() {
            crate::search::tantivy::open_federated_search_readers(index_path, ReloadPolicy::Manual)
                .ok()
                .flatten()
                .filter(|readers| !readers.is_empty())
                .map(|readers| {
                    Arc::new(
                        readers
                            .into_iter()
                            .map(|(reader, fields)| FederatedIndexReader { reader, fields })
                            .collect::<Vec<_>>(),
                    )
                })
        } else {
            None
        };

        let sqlite_path = db_path.map(Path::to_path_buf).filter(|path| path.exists());

        // SQLite-only operation is allowed but flagged as degraded.
        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_some() {
            tracing::warn!(
                index_path = %index_path.display(),
                "Tantivy search index not found or incompatible. \
                 Search results will be degraded. \
                 Run `cass index --full` to rebuild the index."
            );
        }

        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_none() {
            return Ok(None);
        }

        let reload_epoch = Arc::new(AtomicU64::new(0));
        let metrics = Metrics::default();

        // The warm worker is only considered for the single-reader backend.
        let warm_pair = if options.enable_warm
            && let Some((reader, fields)) = &tantivy
        {
            maybe_spawn_warm_worker(
                reader.clone(),
                *fields,
                reload_epoch.clone(),
                metrics.clone(),
            )
        } else {
            None
        };

        // Publish (or clear) this client's federated readers in the global
        // registry under its namespace.
        if let Some(readers) = &federated_readers {
            FEDERATED_SEARCH_READERS
                .write()
                .insert(cache_namespace.clone(), Arc::clone(readers));
        } else {
            FEDERATED_SEARCH_READERS.write().remove(&cache_namespace);
        }

        Ok(Some(Self {
            reader: tantivy,
            sqlite: Mutex::new(None),
            sqlite_path,
            prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
            reload_on_search: options.enable_reload,
            last_reload: Mutex::new(None),
            last_generation: Mutex::new(None),
            reload_epoch,
            warm_tx: warm_pair.as_ref().map(|(tx, _)| tx.clone()),
            _warm_handle: warm_pair.map(|(_, h)| h),
            metrics,
            cache_namespace,
            semantic: Mutex::new(None),
            last_tantivy_total_count: Mutex::new(None),
        }))
    }
3401
3402 fn sqlite_guard(&self) -> Result<std::sync::MutexGuard<'_, Option<SendConnection>>> {
3403 let mut guard = self
3404 .sqlite
3405 .lock()
3406 .map_err(|_| anyhow!("sqlite lock poisoned"))?;
3407
3408 if guard.is_none()
3409 && let Some(path) = &self.sqlite_path
3410 {
3411 match open_search_hydration_sqlite(path, std::time::Duration::from_secs(1)) {
3412 Ok(conn) => {
3413 *guard = Some(SendConnection(conn));
3414 }
3415 Err(err) => {
3416 tracing::debug!(
3417 error = %err,
3418 path = %path.display(),
3419 "readonly sqlite open failed for search client"
3420 );
3421 }
3422 }
3423 }
3424
3425 Ok(guard)
3426 }
3427
    /// Runs a lexical search and returns one page of hits.
    ///
    /// * `query` - raw user query; it is NFC-normalized before use.
    /// * `filters` - filters forwarded to the backend and to post-processing.
    /// * `limit` - page size; `0` means "no limit" and is replaced by a bound
    ///   derived from the document count and `no_limit_result_cap()`.
    /// * `offset` - number of post-processed hits to skip.
    /// * `field_mask` - requested fields; also decides whether the cache may
    ///   be used.
    ///
    /// Backend order: single Tantivy reader, then federated readers, then
    /// SQLite FTS5. When a Tantivy backend exists its answer is final, even if
    /// it is empty. With no backend at all an empty list is returned.
    ///
    /// # Errors
    /// Propagates errors from reader reloads, from the backend search calls,
    /// and from a poisoned SQLite lock.
    pub fn search(
        &self,
        query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<Vec<SearchHit>> {
        use unicode_normalization::UnicodeNormalization;
        // Normalize once so every later comparison sees the same form.
        let query: String = query.nfc().collect();
        let query: &str = &query;
        let sanitized = nfc_sanitize_query(query);
        let field_mask = effective_field_mask(field_mask);
        // limit == 0 means "everything", bounded by the configured cap and
        // never less than 1.
        let limit = if limit == 0 {
            self.total_docs().min(no_limit_result_cap()).max(1)
        } else {
            limit
        };
        let can_use_cache =
            field_mask.allows_cache() && (field_mask.needs_content() || field_mask.wants_snippet());

        // Give the active backend a chance to reload and record which
        // generation this search observes.
        if let Some((reader, _)) = &self.reader {
            self.maybe_reload_reader(reader)?;
            let searcher = self.searcher_for_thread(reader);
            self.track_generation(searcher.generation().generation_id());
        } else if let Some(readers) = self.federated_readers()
            && let Some(signature) = self.maybe_reload_federated_readers(readers.as_ref())?
        {
            self.track_generation(signature);
        }

        // Cache fast path: first page only, and only for plain queries
        // (no wildcard, no boolean operators).
        if can_use_cache
            && offset == 0
            && !query.contains('*')
            && !fs_cass_has_boolean_operators(query)
        {
            self.maybe_schedule_adaptive_query_prewarm(&sanitized, &filters);
            if let Some(cached) = self.cached_prefix_hits(&sanitized, &filters) {
                let query_terms = QueryTermsLower::from_query(&sanitized);
                // Cached entries may belong to a shorter prefix, so re-check
                // each one against the current query.
                let mut filtered: Vec<SearchHit> = cached
                    .into_iter()
                    .filter(|h| hit_matches_query_cached_precomputed(h, &query_terms))
                    .map(|c| c.hit.clone())
                    .collect();
                if filtered.len() >= limit {
                    filtered.truncate(limit);
                    self.metrics.inc_cache_hits();
                    self.maybe_log_cache_metrics("hit");
                    return Ok(filtered);
                }
                // Not enough cached hits to fill the page: fall through to a
                // real search.
                self.metrics.inc_cache_shortfall();
                self.maybe_log_cache_metrics("shortfall");
            } else {
                self.metrics.inc_cache_miss();
                self.maybe_log_cache_metrics("miss");
            }
        }

        // Over-fetch so that post-processing (dedup, filtering, paging) still
        // has enough hits: 2x for small pages, 1.5x (rounded up) otherwise.
        let target_hits = offset.saturating_add(limit);
        let initial_fetch_limit = if target_hits <= 16 {
            target_hits.saturating_mul(2)
        } else {
            target_hits.saturating_mul(3).div_ceil(2)
        };
        let session_path_filter_active = !filters.session_paths.is_empty();
        // The retry fetch is 3x the target, or index-sized (bounded by the
        // no-limit cap) when a session-path filter is active.
        let fallback_fetch_limit = if session_path_filter_active {
            self.total_docs()
                .min(no_limit_result_cap())
                .max(target_hits.saturating_mul(3))
                .max(1)
        } else {
            target_hits.saturating_mul(3)
        };

        if let Some((reader, fields)) = &self.reader {
            tracing::info!(
                backend = "tantivy",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                "search_start"
            );
            // The backend is always queried from offset 0; paging is applied
            // by `postprocess_hits_page`.
            let (hits, tantivy_total_count) = self.search_tantivy(
                reader,
                fields,
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                0,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);

                // Retry with the larger limit only when the first fetch was
                // full (more may exist) yet post-processing left too few hits.
                let needs_retry = deduped_len < target_hits
                    && initial_hit_count == initial_fetch_limit
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        session_path_filter_active,
                        "retrying lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy(
                        reader,
                        fields,
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        0,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    // An empty retry keeps the first page rather than
                    // discarding it.
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    "lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                "tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        } else if let Some(readers) = self.federated_readers() {
            tracing::info!(
                backend = "tantivy-federated",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                shards = readers.len(),
                "search_start"
            );
            let (hits, tantivy_total_count) = self.search_tantivy_federated(
                readers.as_ref(),
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);
                // "Full" means `limit * reader count` hits came back; with a
                // session-path filter reaching the per-reader limit (capped
                // by that product) already counts as full.
                let expected_federated_capacity = initial_fetch_limit.saturating_mul(readers.len());
                let federated_initial_capacity_reached = if session_path_filter_active {
                    initial_hit_count >= initial_fetch_limit.min(expected_federated_capacity)
                } else {
                    initial_hit_count == expected_federated_capacity
                };
                let needs_retry = deduped_len < target_hits
                    && federated_initial_capacity_reached
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        shards = readers.len(),
                        session_path_filter_active,
                        "retrying federated lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy_federated(
                        readers.as_ref(),
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    shards = readers.len(),
                    "federated lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                shards = readers.len(),
                "federated tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        }

        // SQLite path: a `*` anywhere other than at the end of a term is not
        // supported here, so such queries return nothing.
        let unsupported_wildcards = sanitized.split_whitespace().any(|t| {
            let core = t.trim_end_matches('*');
            core.contains('*')
        });

        if unsupported_wildcards {
            return Ok(Vec::new());
        }

        // Either an already-open connection or a known path counts as a
        // usable SQLite backend.
        let has_sqlite_backend = {
            let sqlite_guard = self
                .sqlite
                .lock()
                .map_err(|_| anyhow!("sqlite lock poisoned"))?;
            sqlite_guard.is_some() || self.sqlite_path.is_some()
        };

        if has_sqlite_backend {
            tracing::info!(
                backend = "sqlite-fts5",
                query = sanitized,
                limit = fallback_fetch_limit,
                offset = 0,
                "search_start"
            );
            let hits = self.search_sqlite_fts5(
                self.sqlite_path
                    .as_deref()
                    .unwrap_or_else(|| Path::new(":memory:")),
                query,
                filters.clone(),
                fallback_fetch_limit,
                0,
                field_mask,
            )?;
            let (_, paged_hits) =
                self.postprocess_hits_page(hits, &sanitized, &filters, limit, offset);

            if can_use_cache && offset == 0 {
                self.put_cache(&sanitized, &filters, &paged_hits);
            }
            return Ok(paged_hits);
        }

        tracing::info!(backend = "none", query = query, "search_start");
        Ok(Vec::new())
    }
3735
3736 pub fn set_semantic_context(
3737 &self,
3738 embedder: Arc<dyn Embedder>,
3739 fs_semantic_index: VectorIndex,
3740 filter_maps: SemanticFilterMaps,
3741 roles: Option<HashSet<u8>>,
3742 ann_path: Option<PathBuf>,
3743 ) -> Result<()> {
3744 self.set_semantic_indexes_context(
3745 embedder,
3746 vec![fs_semantic_index],
3747 filter_maps,
3748 roles,
3749 ann_path,
3750 )
3751 }
3752
3753 pub fn set_semantic_indexes_context(
3754 &self,
3755 embedder: Arc<dyn Embedder>,
3756 fs_semantic_indexes: Vec<VectorIndex>,
3757 filter_maps: SemanticFilterMaps,
3758 roles: Option<HashSet<u8>>,
3759 ann_path: Option<PathBuf>,
3760 ) -> Result<()> {
3761 if fs_semantic_indexes.is_empty() {
3762 bail!("semantic context requires at least one vector index");
3763 }
3764
3765 let fs_semantic_indexes = fs_semantic_indexes
3766 .into_iter()
3767 .map(|index| {
3768 let embedder_id = index.embedder_id().to_string();
3769 let dimension = index.dimension();
3770 if embedder_id != embedder.id() {
3771 bail!(
3772 "embedder mismatch: index uses {}, embedder is {}",
3773 embedder_id,
3774 embedder.id()
3775 );
3776 }
3777 if dimension != embedder.dimension() {
3778 bail!(
3779 "embedder dimension mismatch: index uses {}, embedder is {}",
3780 dimension,
3781 embedder.dimension()
3782 );
3783 }
3784 Ok(Arc::new(index))
3785 })
3786 .collect::<Result<Vec<_>>>()?;
3787 let fs_semantic_index = Arc::clone(&fs_semantic_indexes[0]);
3788 let shard_count = fs_semantic_indexes.len();
3789 let ann_path = if shard_count == 1 { ann_path } else { None };
3790 let embedder_id = fs_semantic_index.embedder_id().to_string();
3791 let dimension = fs_semantic_index.dimension();
3792 let fs_semantic_indexes = Arc::new(fs_semantic_indexes);
3793
3794 let capacity = NonZeroUsize::new(100).ok_or_else(|| anyhow!("invalid cache size"))?;
3795 let context_token = Arc::new(());
3796 let mut state_guard = self
3797 .semantic
3798 .lock()
3799 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3800 *state_guard = Some(SemanticSearchState {
3801 context_token,
3802 embedder,
3803 fs_semantic_index,
3804 fs_semantic_indexes,
3805 fs_ann_index: None,
3806 ann_path,
3807 fs_in_memory_two_tier_index: None,
3808 in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable::default(),
3809 progressive_context: None,
3810 progressive_context_unavailable: false,
3811 filter_maps,
3812 roles,
3813 query_cache: QueryCache::new(embedder_id.as_str(), capacity),
3814 });
3815 if shard_count > 1 {
3816 tracing::info!(
3817 shard_count,
3818 dimension,
3819 embedder = embedder_id,
3820 "semantic search context loaded sharded vector generation"
3821 );
3822 }
3823 Ok(())
3824 }
3825
3826 pub fn clear_semantic_context(&self) -> Result<()> {
3827 let mut guard = self
3828 .semantic
3829 .lock()
3830 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3831 *guard = None;
3832 Ok(())
3833 }
3834
3835 fn semantic_context_matches(&self, context_token: &Arc<()>) -> Result<bool> {
3836 let guard = self
3837 .semantic
3838 .lock()
3839 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3840 Ok(guard
3841 .as_ref()
3842 .is_some_and(|state| Arc::ptr_eq(&state.context_token, context_token)))
3843 }
3844
3845 fn semantic_query_embedding(&self, canonical: &str) -> Result<SemanticQueryEmbedding> {
3846 loop {
3847 let (embedder, context_token) = {
3848 let mut guard = self
3849 .semantic
3850 .lock()
3851 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3852 let state = guard.as_mut().ok_or_else(|| {
3853 anyhow!("semantic search unavailable (no embedder or vector index)")
3854 })?;
3855 if let Some(hit) = state
3856 .query_cache
3857 .get_cached(state.embedder.as_ref(), canonical)
3858 {
3859 return Ok(SemanticQueryEmbedding {
3860 context_token: Arc::clone(&state.context_token),
3861 vector: hit,
3862 });
3863 }
3864 (
3865 Arc::clone(&state.embedder),
3866 Arc::clone(&state.context_token),
3867 )
3868 };
3869
3870 let embedding = embedder
3871 .embed_sync(canonical)
3872 .map_err(|e| anyhow!("embedding failed: {e}"))?;
3873
3874 let mut guard = self
3875 .semantic
3876 .lock()
3877 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3878 let state = guard.as_mut().ok_or_else(|| {
3879 anyhow!("semantic search unavailable (no embedder or vector index)")
3880 })?;
3881 if !Arc::ptr_eq(&state.context_token, &context_token) {
3882 continue;
3883 }
3884 if let Some(hit) = state
3885 .query_cache
3886 .get_cached(state.embedder.as_ref(), canonical)
3887 {
3888 return Ok(SemanticQueryEmbedding {
3889 context_token,
3890 vector: hit,
3891 });
3892 }
3893 state
3894 .query_cache
3895 .store(state.embedder.as_ref(), canonical, embedding.clone());
3896 return Ok(SemanticQueryEmbedding {
3897 context_token,
3898 vector: embedding,
3899 });
3900 }
3901 }
3902
3903 fn in_memory_two_tier_index(
3904 &self,
3905 tier_mode: SemanticTierMode,
3906 ) -> Result<Option<Arc<FsInMemoryTwoTierIndex>>> {
3907 loop {
3908 let (ann_path, embedder_id, context_token) = {
3909 let mut guard = self
3910 .semantic
3911 .lock()
3912 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3913 let state = guard.as_mut().ok_or_else(|| {
3914 anyhow!("semantic search unavailable (no embedder or vector index)")
3915 })?;
3916 if let Some(index) = state.fs_in_memory_two_tier_index.as_ref()
3917 && two_tier_index_supports_mode(index.as_ref(), tier_mode)
3918 {
3919 return Ok(Some(Arc::clone(index)));
3920 }
3921 if state
3922 .in_memory_two_tier_unavailable
3923 .is_known_unavailable(tier_mode)
3924 {
3925 return Ok(None);
3926 }
3927 (
3928 state.ann_path.clone(),
3929 state.embedder.id().to_string(),
3930 Arc::clone(&state.context_token),
3931 )
3932 };
3933
3934 let index = build_in_memory_two_tier_index(ann_path.clone(), &embedder_id, tier_mode);
3935
3936 let mut guard = self
3937 .semantic
3938 .lock()
3939 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3940 let state = guard.as_mut().ok_or_else(|| {
3941 anyhow!("semantic search unavailable (no embedder or vector index)")
3942 })?;
3943 if let Some(existing) = state.fs_in_memory_two_tier_index.as_ref()
3944 && two_tier_index_supports_mode(existing.as_ref(), tier_mode)
3945 {
3946 return Ok(Some(Arc::clone(existing)));
3947 }
3948 if !Arc::ptr_eq(&state.context_token, &context_token) {
3949 continue;
3950 }
3951 let Some(index) = index else {
3952 state
3953 .in_memory_two_tier_unavailable
3954 .mark_unavailable(tier_mode);
3955 return Ok(None);
3956 };
3957 if !two_tier_index_supports_mode(index.as_ref(), tier_mode) {
3958 state
3959 .in_memory_two_tier_unavailable
3960 .mark_unavailable(tier_mode);
3961 return Ok(None);
3962 }
3963 state.fs_in_memory_two_tier_index = Some(Arc::clone(&index));
3964 if index.has_quality_index() {
3965 state.in_memory_two_tier_unavailable = InMemoryTwoTierUnavailable::default();
3966 } else {
3967 state.in_memory_two_tier_unavailable.fast_only = false;
3968 }
3969 return Ok(Some(index));
3970 }
3971 }
3972
3973 fn ann_index(&self) -> Result<Arc<FsHnswIndex>> {
3974 loop {
3975 let (ann_path, fs_semantic_index) = {
3976 let mut guard = self
3977 .semantic
3978 .lock()
3979 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3980 let state = guard.as_mut().ok_or_else(|| {
3981 anyhow!("semantic search unavailable (no embedder or vector index)")
3982 })?;
3983 if let Some(index) = state.fs_ann_index.as_ref() {
3984 return Ok(Arc::clone(index));
3985 }
3986 let ann_path = state.ann_path.clone().ok_or_else(|| {
3987 anyhow!(
3988 "approximate search unavailable: HNSW index missing (run 'cass index --semantic --build-hnsw')"
3989 )
3990 })?;
3991 (ann_path, Arc::clone(&state.fs_semantic_index))
3992 };
3993
3994 let ann = Arc::new(open_fs_semantic_ann_index(
3995 fs_semantic_index.as_ref(),
3996 &ann_path,
3997 )?);
3998
3999 let mut guard = self
4000 .semantic
4001 .lock()
4002 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4003 let state = guard.as_mut().ok_or_else(|| {
4004 anyhow!("semantic search unavailable (no embedder or vector index)")
4005 })?;
4006 if let Some(existing) = state.fs_ann_index.as_ref() {
4007 return Ok(Arc::clone(existing));
4008 }
4009 if state.ann_path.as_ref() != Some(&ann_path)
4010 || !Arc::ptr_eq(&state.fs_semantic_index, &fs_semantic_index)
4011 {
4012 continue;
4013 }
4014 state.fs_ann_index = Some(Arc::clone(&ann));
4015 return Ok(ann);
4016 }
4017 }
4018
4019 fn collapse_semantic_results(
4020 best_by_message: HashMap<u64, VectorSearchResult>,
4021 fetch_limit: usize,
4022 ) -> Vec<VectorSearchResult> {
4023 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
4024 collapsed.sort_by(|a, b| {
4025 b.score
4026 .total_cmp(&a.score)
4027 .then_with(|| a.message_id.cmp(&b.message_id))
4028 });
4029 if collapsed.len() > fetch_limit {
4030 collapsed.truncate(fetch_limit);
4031 }
4032 collapsed
4033 }
4034
4035 fn semantic_exact_candidate_limit(fetch_limit: usize, record_count: usize) -> usize {
4036 fetch_limit
4037 .saturating_mul(SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER)
4038 .max(fetch_limit)
4039 .min(record_count)
4040 }
4041
4042 fn semantic_window_may_omit_competitor(
4043 collapsed: &[VectorSearchResult],
4044 fetch_limit: usize,
4045 max_omitted_score: Option<f32>,
4046 ) -> bool {
4047 if fetch_limit == 0 {
4048 return false;
4049 }
4050 let Some(max_omitted_score) = max_omitted_score else {
4051 return false;
4052 };
4053 if collapsed.len() < fetch_limit {
4054 return true;
4055 }
4056 let Some(last_in_requested_window) = collapsed.get(fetch_limit - 1) else {
4057 return true;
4058 };
4059 !last_in_requested_window
4060 .score
4061 .total_cmp(&max_omitted_score)
4062 .is_gt()
4063 }
4064
4065 fn record_fs_semantic_hit(
4066 best_by_message: &mut HashMap<u64, VectorSearchResult>,
4067 hit: &FsVectorHit,
4068 ) {
4069 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4070 return;
4071 };
4072 best_by_message
4073 .entry(parsed.message_id)
4074 .and_modify(|entry| {
4075 if hit.score > entry.score {
4076 entry.score = hit.score;
4077 entry.chunk_idx = parsed.chunk_idx;
4078 }
4079 })
4080 .or_insert(VectorSearchResult {
4081 message_id: parsed.message_id,
4082 chunk_idx: parsed.chunk_idx,
4083 score: hit.score,
4084 });
4085 }
4086
4087 fn search_exact_semantic_indexes(
4088 context: &SemanticCandidateContext,
4089 embedding: &[f32],
4090 fetch_limit: usize,
4091 fs_filter: Option<&dyn FsSearchFilter>,
4092 ) -> Result<(Vec<VectorSearchResult>, SemanticCandidateRetryState)> {
4093 if context.fs_semantic_indexes.len() == 1 {
4094 let record_count = context.fs_semantic_index.record_count();
4095 let candidate_limit = Self::semantic_exact_candidate_limit(fetch_limit, record_count);
4096 let fs_hits = context
4097 .fs_semantic_index
4098 .search_top_k(embedding, candidate_limit, fs_filter)
4099 .map_err(|err| anyhow!("frankensearch semantic search failed: {err}"))?;
4100 let mut best_by_message = HashMap::with_capacity(fs_hits.len());
4101 for hit in &fs_hits {
4102 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4103 }
4104 let collapsed = Self::collapse_semantic_results(best_by_message, candidate_limit);
4105 let has_more_candidates =
4106 fs_hits.len() >= candidate_limit && candidate_limit < record_count;
4107 let max_omitted_score = if has_more_candidates {
4108 fs_hits.last().map(|hit| hit.score)
4109 } else {
4110 None
4111 };
4112 let exact_window_may_omit_competitor = Self::semantic_window_may_omit_competitor(
4113 &collapsed,
4114 fetch_limit,
4115 max_omitted_score,
4116 );
4117 return Ok((
4118 collapsed,
4119 SemanticCandidateRetryState {
4120 has_more_candidates,
4121 exact_window_may_omit_competitor,
4122 },
4123 ));
4124 }
4125
4126 let mut best_by_message = HashMap::new();
4127 let mut raw_hits = 0usize;
4128 let mut max_omitted_score: Option<f32> = None;
4129 let mut has_more_candidates = false;
4130 for index in context.fs_semantic_indexes.iter() {
4131 let shard_record_count = index.record_count();
4132 let shard_limit = Self::semantic_exact_candidate_limit(fetch_limit, shard_record_count);
4138 if shard_limit == 0 {
4139 continue;
4140 }
4141 let fs_hits = index
4142 .search_top_k(embedding, shard_limit, fs_filter)
4143 .map_err(|err| anyhow!("frankensearch sharded semantic search failed: {err}"))?;
4144 if fs_hits.len() >= shard_limit
4145 && shard_limit < shard_record_count
4146 && let Some(last_hit) = fs_hits.last()
4147 {
4148 has_more_candidates = true;
4149 max_omitted_score = Some(
4150 max_omitted_score
4151 .map(|current| current.max(last_hit.score))
4152 .unwrap_or(last_hit.score),
4153 );
4154 }
4155 raw_hits = raw_hits.saturating_add(fs_hits.len());
4156 best_by_message.reserve(fs_hits.len());
4157 for hit in &fs_hits {
4158 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4159 }
4160 }
4161 let candidate_return_limit = Self::semantic_exact_candidate_limit(fetch_limit, raw_hits);
4162 let collapsed = Self::collapse_semantic_results(best_by_message, candidate_return_limit);
4163 let exact_window_may_omit_competitor =
4164 Self::semantic_window_may_omit_competitor(&collapsed, fetch_limit, max_omitted_score);
4165 tracing::debug!(
4166 shard_count = context.fs_semantic_indexes.len(),
4167 raw_hits,
4168 returned = collapsed.len(),
4169 "semantic sharded exact merge complete"
4170 );
4171 Ok((
4172 collapsed,
4173 SemanticCandidateRetryState {
4174 has_more_candidates,
4175 exact_window_may_omit_competitor,
4176 },
4177 ))
4178 }
4179
4180 fn search_semantic_candidates(
4181 &self,
4182 context: &SemanticCandidateContext,
4183 embedding: &[f32],
4184 filters: &SearchFilters,
4185 request: SemanticCandidateSearchRequest<'_>,
4186 ) -> Result<(
4187 Vec<VectorSearchResult>,
4188 SemanticCandidateRetryState,
4189 Option<crate::search::ann_index::AnnSearchStats>,
4190 )> {
4191 let mut semantic_filter =
4192 SemanticFilter::from_search_filters(filters, &context.filter_maps)?;
4193 if let Some(roles) = context.roles.clone() {
4194 semantic_filter = semantic_filter.with_roles(Some(roles));
4195 }
4196
4197 if request.tier_mode.wants_two_tier() && !request.approximate {
4198 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4199 if let Some(two_tier_index) = request.in_memory_two_tier_index {
4200 let config = request.tier_mode.to_frankensearch_config();
4201 let searcher = FsSyncTwoTierSearcher::new(Arc::clone(two_tier_index), config);
4202 let (tier_hits, metrics) = searcher
4203 .search_collect_with_filter(embedding, request.fetch_limit, fs_filter)
4204 .map_err(|err| {
4205 anyhow!("frankensearch two-tier semantic search failed: {err}")
4206 })?;
4207
4208 tracing::debug!(
4209 tier_mode = ?request.tier_mode,
4210 phase1_ms = metrics.phase1_total_ms,
4211 phase2_ms = metrics.phase2_total_ms,
4212 skip_reason = ?metrics.skip_reason,
4213 returned = tier_hits.len(),
4214 "semantic two-tier search executed"
4215 );
4216
4217 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4218 HashMap::with_capacity(tier_hits.len());
4219 for hit in tier_hits.iter() {
4220 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4221 continue;
4222 };
4223 best_by_message
4224 .entry(parsed.message_id)
4225 .and_modify(|entry| {
4226 if hit.score > entry.score {
4227 entry.score = hit.score;
4228 entry.chunk_idx = parsed.chunk_idx;
4229 }
4230 })
4231 .or_insert(VectorSearchResult {
4232 message_id: parsed.message_id,
4233 chunk_idx: parsed.chunk_idx,
4234 score: hit.score,
4235 });
4236 }
4237
4238 return Ok((
4239 Self::collapse_semantic_results(best_by_message, request.fetch_limit),
4240 SemanticCandidateRetryState {
4241 has_more_candidates: tier_hits.len() >= request.fetch_limit,
4242 exact_window_may_omit_competitor: false,
4243 },
4244 None,
4245 ));
4246 }
4247
4248 tracing::debug!(
4249 tier_mode = ?request.tier_mode,
4250 "two-tier semantic unavailable; falling back to exact single-tier search"
4251 );
4252
4253 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4254 let (results, truncated) = Self::search_exact_semantic_indexes(
4255 context,
4256 embedding,
4257 request.fetch_limit,
4258 fs_filter,
4259 )?;
4260 return Ok((results, truncated, None));
4261 }
4262
4263 if request.approximate {
4264 if request.tier_mode.wants_two_tier() {
4265 tracing::debug!(
4266 tier_mode = ?request.tier_mode,
4267 "approximate search requested; bypassing two-tier mode"
4268 );
4269 }
4270
4271 let ann = request
4272 .ann_index
4273 .ok_or_else(|| anyhow!("HNSW index failed to initialize"))?;
4274 let candidate = request
4275 .fetch_limit
4276 .saturating_mul(ANN_CANDIDATE_MULTIPLIER)
4277 .max(request.fetch_limit);
4278 let ef = FS_HNSW_DEFAULT_EF_SEARCH.max(candidate);
4279 let (ann_results, search_stats) =
4280 ann.knn_search_with_stats(embedding, candidate, ef)
4281 .map_err(|err| anyhow!("frankensearch approximate search failed: {err}"))?;
4282 let ann_stats = Some(crate::search::ann_index::AnnSearchStats {
4283 index_size: search_stats.index_size,
4284 dimension: search_stats.dimension,
4285 ef_search: search_stats.ef_search,
4286 k_requested: search_stats.k_requested,
4287 k_returned: search_stats.k_returned,
4288 search_time_us: search_stats.search_time_us,
4289 estimated_recall: search_stats.estimated_recall as f32,
4290 is_approximate: search_stats.is_approximate,
4291 });
4292
4293 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4294
4295 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4296 HashMap::with_capacity(ann_results.len());
4297 for hit in ann_results.iter() {
4298 if let Some(filter) = fs_filter
4299 && !filter.matches(&hit.doc_id, None)
4300 {
4301 continue;
4302 }
4303 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4304 continue;
4305 };
4306 best_by_message
4307 .entry(parsed.message_id)
4308 .and_modify(|entry| {
4309 if hit.score > entry.score {
4310 entry.score = hit.score;
4311 entry.chunk_idx = parsed.chunk_idx;
4312 }
4313 })
4314 .or_insert(VectorSearchResult {
4315 message_id: parsed.message_id,
4316 chunk_idx: parsed.chunk_idx,
4317 score: hit.score,
4318 });
4319 }
4320
4321 return Ok((
4322 Self::collapse_semantic_results(best_by_message, request.fetch_limit),
4323 SemanticCandidateRetryState {
4324 has_more_candidates: ann_results.len() >= candidate,
4325 exact_window_may_omit_competitor: false,
4326 },
4327 ann_stats,
4328 ));
4329 }
4330
4331 let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
4332 let (results, truncated) = Self::search_exact_semantic_indexes(
4333 context,
4334 embedding,
4335 request.fetch_limit,
4336 fs_filter,
4337 )?;
4338 Ok((results, truncated, None))
4339 }
4340
4341 pub fn can_progressively_refine(&self) -> bool {
4342 self.progressive_context()
4343 .map(|context| {
4344 context.as_ref().is_some_and(|ctx| {
4345 ctx.quality_embedder.is_some() && ctx.index.has_quality_index()
4346 })
4347 })
4348 .unwrap_or(false)
4349 }
4350
4351 fn progressive_context(&self) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4352 loop {
4353 let (ann_path, embedder, context_token) = {
4354 let mut guard = self
4355 .semantic
4356 .lock()
4357 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4358 let state = guard.as_mut().ok_or_else(|| {
4359 anyhow!("semantic search unavailable (no embedder or vector index)")
4360 })?;
4361 if let Some(context) = state.progressive_context.as_ref() {
4362 return Ok(Some(Arc::clone(context)));
4363 }
4364 if state.progressive_context_unavailable {
4365 return Ok(None);
4366 }
4367 (
4368 state.ann_path.clone(),
4369 Arc::clone(&state.embedder),
4370 Arc::clone(&state.context_token),
4371 )
4372 };
4373
4374 let context = match self.build_progressive_context(
4375 ann_path.clone(),
4376 embedder,
4377 Arc::clone(&context_token),
4378 ) {
4379 Ok(context) => context,
4380 Err(err) => {
4381 let mut guard = self
4382 .semantic
4383 .lock()
4384 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4385 let state = guard.as_mut().ok_or_else(|| {
4386 anyhow!("semantic search unavailable (no embedder or vector index)")
4387 })?;
4388 if let Some(existing) = state.progressive_context.as_ref() {
4389 return Ok(Some(Arc::clone(existing)));
4390 }
4391 if !Arc::ptr_eq(&state.context_token, &context_token) {
4392 continue;
4393 }
4394 return Err(err);
4395 }
4396 };
4397
4398 let Some(context) = context else {
4399 let mut guard = self
4400 .semantic
4401 .lock()
4402 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4403 let state = guard.as_mut().ok_or_else(|| {
4404 anyhow!("semantic search unavailable (no embedder or vector index)")
4405 })?;
4406 if let Some(existing) = state.progressive_context.as_ref() {
4407 return Ok(Some(Arc::clone(existing)));
4408 }
4409 if !Arc::ptr_eq(&state.context_token, &context_token) {
4410 continue;
4411 }
4412 state.progressive_context_unavailable = true;
4413 return Ok(None);
4414 };
4415
4416 let mut guard = self
4417 .semantic
4418 .lock()
4419 .map_err(|_| anyhow!("semantic lock poisoned"))?;
4420 let state = guard.as_mut().ok_or_else(|| {
4421 anyhow!("semantic search unavailable (no embedder or vector index)")
4422 })?;
4423 if let Some(existing) = state.progressive_context.as_ref() {
4424 return Ok(Some(Arc::clone(existing)));
4425 }
4426 if !Arc::ptr_eq(&state.context_token, &context_token) {
4427 continue;
4428 }
4429 state.progressive_context_unavailable = false;
4430 state.progressive_context = Some(Arc::clone(&context));
4431 return Ok(Some(context));
4432 }
4433 }
4434
4435 fn build_progressive_context(
4436 &self,
4437 ann_path: Option<PathBuf>,
4438 embedder: Arc<dyn Embedder>,
4439 context_token: Arc<()>,
4440 ) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4441 let Some(index_dir) = ann_path
4442 .as_ref()
4443 .and_then(|path| path.parent().map(Path::to_path_buf))
4444 else {
4445 return Ok(None);
4446 };
4447
4448 let fast_path = {
4449 let explicit = index_dir.join("vector.fast.idx");
4450 if explicit.is_file() {
4451 explicit
4452 } else {
4453 let fallback = index_dir.join("vector.idx");
4454 if fallback.is_file() {
4455 fallback
4456 } else {
4457 return Ok(None);
4458 }
4459 }
4460 };
4461 let quality_path = index_dir.join("vector.quality.idx");
4462 if !quality_path.is_file() {
4463 return Ok(None);
4464 }
4465
4466 let fast_index = FsVectorIndex::open(&fast_path)
4467 .map_err(|err| anyhow!("open fast-tier index failed: {err}"))?;
4468 let quality_index = FsVectorIndex::open(&quality_path)
4469 .map_err(|err| anyhow!("open quality-tier index failed: {err}"))?;
4470 let index = Arc::new(
4471 FsTwoTierIndex::open(&index_dir, frankensearch_two_tier_config())
4472 .map_err(|err| anyhow!("open progressive two-tier index failed: {err}"))?,
4473 );
4474
4475 let fast_embedder = self.load_embedder_for_progressive_id(
4476 &embedder,
4477 fast_index.embedder_id(),
4478 fast_index.dimension(),
4479 )?;
4480 let fast_embedder: Arc<dyn frankensearch::Embedder> = Arc::new(FsSyncEmbedderAdapter(
4481 SharedCassSyncEmbedder::new(fast_embedder),
4482 ));
4483 let quality_embedder = Some(self.load_embedder_for_progressive_id(
4484 &embedder,
4485 quality_index.embedder_id(),
4486 quality_index.dimension(),
4487 )?);
4488 let quality_embedder = quality_embedder.map(|embedder| {
4489 Arc::new(FsSyncEmbedderAdapter(SharedCassSyncEmbedder::new(embedder)))
4490 as Arc<dyn frankensearch::Embedder>
4491 });
4492
4493 Ok(Some(Arc::new(ProgressiveTwoTierContext {
4494 context_token,
4495 index,
4496 fast_embedder,
4497 quality_embedder,
4498 })))
4499 }
4500
4501 fn load_embedder_for_progressive_id(
4502 &self,
4503 current_embedder: &Arc<dyn Embedder>,
4504 embedder_id: &str,
4505 dimension: usize,
4506 ) -> Result<Arc<dyn Embedder>> {
4507 if current_embedder.id() == embedder_id {
4508 return Ok(Arc::clone(current_embedder));
4509 }
4510
4511 if let Some(dim) = embedder_id.strip_prefix("fnv1a-")
4512 && let Ok(parsed) = dim.parse::<usize>()
4513 {
4514 return Ok(Arc::new(crate::search::hash_embedder::HashEmbedder::new(
4515 parsed.max(dimension),
4516 )));
4517 }
4518
4519 if let Some(embedder_name) =
4520 crate::search::fastembed_embedder::FastEmbedder::canonical_name(embedder_id)
4521 {
4522 let data_dir = self
4523 .sqlite_path
4524 .as_ref()
4525 .and_then(|path| path.parent())
4526 .ok_or_else(|| anyhow!("cannot resolve data dir for progressive embedder load"))?;
4527 let embedder = crate::search::fastembed_embedder::FastEmbedder::load_by_name(
4528 data_dir,
4529 embedder_name,
4530 )
4531 .with_context(|| format!("loading FastEmbed model for {embedder_name}"))?;
4532 if embedder.dimension() != dimension {
4533 bail!(
4534 "progressive embedder dimension mismatch: {} index expects {}, model has {}",
4535 embedder_id,
4536 dimension,
4537 embedder.dimension()
4538 );
4539 }
4540 return Ok(Arc::new(embedder));
4541 }
4542
4543 bail!("unsupported progressive embedder id: {embedder_id}");
4544 }
4545
4546 fn resolve_semantic_doc_ids_for_hits(
4547 &self,
4548 hits: &[SearchHit],
4549 ) -> Result<Vec<Option<ResolvedSemanticDocId>>> {
4550 if hits.is_empty() {
4551 return Ok(Vec::new());
4552 }
4553
4554 let lookup_keys: Vec<Option<ProgressiveLookupKey>> = hits
4555 .iter()
4556 .map(|hit| {
4557 let idx = hit
4558 .line_number
4559 .and_then(|line| line.checked_sub(1))
4560 .map(i64::try_from)
4561 .transpose()
4562 .ok()
4563 .flatten()?;
4564 Some((
4565 normalized_search_hit_source_id(hit),
4566 hit.source_path.clone(),
4567 hit.conversation_id,
4568 hit.title.trim().to_string(),
4569 idx,
4570 hit.created_at,
4571 hit.content_hash,
4572 ))
4573 })
4574 .collect();
4575
4576 let mut seen_exact = HashSet::new();
4577 let mut exact_query_keys = Vec::new();
4578 let mut seen_fallback = HashSet::new();
4579 let mut fallback_query_keys = Vec::new();
4580 for (source_id, source_path, conversation_id, _title, idx, _created_at, _content_hash) in
4581 lookup_keys.iter().flatten()
4582 {
4583 if let Some(conversation_id) = conversation_id {
4584 let query_key: ProgressiveExactQueryKey = (*conversation_id, *idx);
4585 if seen_exact.insert(query_key) {
4586 exact_query_keys.push(query_key);
4587 }
4588 } else {
4589 let query_key: ProgressiveFallbackQueryKey =
4590 (source_id.clone(), source_path.clone(), *idx);
4591 if seen_fallback.insert(query_key.clone()) {
4592 fallback_query_keys.push(query_key);
4593 }
4594 }
4595 }
4596
4597 if exact_query_keys.is_empty() && fallback_query_keys.is_empty() {
4598 return Ok(vec![None; hits.len()]);
4599 }
4600
4601 let sqlite_guard = self.sqlite_guard()?;
4602 let conn = sqlite_guard
4603 .as_ref()
4604 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4605
4606 let mut resolved_by_key = HashMap::new();
4607 let normalized_source_sql =
4608 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4609
4610 const CHUNK_SIZE: usize = 300;
4611 for chunk in exact_query_keys.chunks(CHUNK_SIZE) {
4612 let mut sql = String::from("SELECT c.id, ");
4613 sql.push_str(&normalized_source_sql);
4614 sql.push_str(
4615 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4616 FROM messages m
4617 JOIN conversations c ON m.conversation_id = c.id
4618 LEFT JOIN sources s ON c.source_id = s.id
4619 WHERE ",
4620 );
4621 let mut params = Vec::with_capacity(chunk.len().saturating_mul(2));
4622 for (idx, (conversation_id, line_idx)) in chunk.iter().enumerate() {
4623 if idx > 0 {
4624 sql.push_str(" OR ");
4625 }
4626 sql.push_str("(c.id = ? AND m.idx = ?)");
4627 params.push(ParamValue::from(*conversation_id));
4628 params.push(ParamValue::from(*line_idx));
4629 }
4630
4631 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4632 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4633 let conversation_id: i64 = row.get_typed(0)?;
4634 let source_id: String = row.get_typed(1)?;
4635 let source_path: String = row.get_typed(2)?;
4636 let idx: i64 = row.get_typed(3)?;
4637 let message_id_raw: i64 = row.get_typed(4)?;
4638 let agent_id_raw: Option<i64> = row.get_typed(5)?;
4641 let workspace_id_raw: Option<i64> = row.get_typed(6)?;
4642 let role_raw: String = row.get_typed(7)?;
4643 let created_at_ms: Option<i64> = row.get_typed(8)?;
4644 let content: String = row.get_typed(9)?;
4645 let title: Option<String> = row.get_typed(10)?;
4646
4647 let canonical = canonicalize_for_embedding(&content);
4648 if canonical.is_empty() {
4649 return Ok(None);
4650 }
4651
4652 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4653 std::io::Error::other("message id out of range for progressive doc_id")
4654 })?;
4655 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4656 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4657 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4658 let doc_id = SemanticDocId {
4659 message_id,
4660 chunk_idx: 0,
4661 agent_id,
4662 workspace_id,
4663 source_id: crc32fast::hash(source_id.as_bytes()),
4664 role,
4665 created_at_ms: created_at_ms.unwrap_or(0),
4666 content_hash: Some(content_hash(&canonical)),
4667 }
4668 .to_doc_id_string();
4669 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4670 let lookup_key = (
4671 source_id,
4672 source_path.clone(),
4673 Some(conversation_id),
4674 title.unwrap_or_default().trim().to_string(),
4675 idx,
4676 created_at_ms,
4677 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4678 );
4679
4680 Ok(Some((
4681 lookup_key,
4682 ResolvedSemanticDocId { message_id, doc_id },
4683 )))
4684 })?;
4685
4686 for row in chunk_rows.into_iter().flatten() {
4687 resolved_by_key.insert(row.0, row.1);
4688 }
4689 }
4690
4691 for chunk in fallback_query_keys.chunks(CHUNK_SIZE) {
4692 let mut sql = String::from("SELECT ");
4693 sql.push_str(&normalized_source_sql);
4694 sql.push_str(
4695 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4696 FROM messages m
4697 JOIN conversations c ON m.conversation_id = c.id
4698 LEFT JOIN sources s ON c.source_id = s.id
4699 WHERE ",
4700 );
4701 let mut params = Vec::with_capacity(chunk.len().saturating_mul(3));
4702 for (idx, (source_id, source_path, line_idx)) in chunk.iter().enumerate() {
4703 if idx > 0 {
4704 sql.push_str(" OR ");
4705 }
4706 sql.push_str(&format!(
4707 "({normalized_source_sql} = ? AND c.source_path = ? AND m.idx = ?)"
4708 ));
4709 params.push(ParamValue::from(normalize_search_source_filter_value(
4710 source_id,
4711 )));
4712 params.push(ParamValue::from(source_path.clone()));
4713 params.push(ParamValue::from(*line_idx));
4714 }
4715
4716 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4717 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4718 let source_id: String = row.get_typed(0)?;
4719 let source_path: String = row.get_typed(1)?;
4720 let idx: i64 = row.get_typed(2)?;
4721 let message_id_raw: i64 = row.get_typed(3)?;
4722 let agent_id_raw: Option<i64> = row.get_typed(4)?;
4725 let workspace_id_raw: Option<i64> = row.get_typed(5)?;
4726 let role_raw: String = row.get_typed(6)?;
4727 let created_at_ms: Option<i64> = row.get_typed(7)?;
4728 let content: String = row.get_typed(8)?;
4729 let title: Option<String> = row.get_typed(9)?;
4730
4731 let canonical = canonicalize_for_embedding(&content);
4732 if canonical.is_empty() {
4733 return Ok(None);
4734 }
4735
4736 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4737 std::io::Error::other("message id out of range for progressive doc_id")
4738 })?;
4739 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4740 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4741 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4742 let doc_id = SemanticDocId {
4743 message_id,
4744 chunk_idx: 0,
4745 agent_id,
4746 workspace_id,
4747 source_id: crc32fast::hash(source_id.as_bytes()),
4748 role,
4749 created_at_ms: created_at_ms.unwrap_or(0),
4750 content_hash: Some(content_hash(&canonical)),
4751 }
4752 .to_doc_id_string();
4753 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4754 let lookup_key = (
4755 source_id,
4756 source_path.clone(),
4757 None,
4758 title.unwrap_or_default().trim().to_string(),
4759 idx,
4760 created_at_ms,
4761 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4762 );
4763
4764 Ok(Some((
4765 lookup_key,
4766 ResolvedSemanticDocId { message_id, doc_id },
4767 )))
4768 })?;
4769
4770 for row in chunk_rows.into_iter().flatten() {
4771 resolved_by_key.insert(row.0, row.1);
4772 }
4773 }
4774
4775 Ok(lookup_keys
4776 .into_iter()
4777 .map(|key| key.and_then(|lookup| resolved_by_key.get(&lookup).cloned()))
4778 .collect())
4779 }
4780
4781 fn load_message_text_by_id(&self, message_id: u64) -> Result<Option<String>> {
4782 let sqlite_guard = self.sqlite_guard()?;
4783 let conn = sqlite_guard
4784 .as_ref()
4785 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4786 let rows: Vec<String> = conn.query_map_collect(
4787 "SELECT content FROM messages WHERE id = ?",
4788 &[ParamValue::from(i64::try_from(message_id)?)],
4789 |row: &frankensqlite::Row| row.get_typed(0),
4790 )?;
4791 Ok(rows.into_iter().next())
4792 }
4793
4794 fn collapse_progressive_scored_results(
4795 &self,
4796 results: &[FsScoredResult],
4797 fetch_limit: usize,
4798 ) -> Vec<VectorSearchResult> {
4799 let fetch = fetch_limit.max(1);
4800 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4801 HashMap::with_capacity(results.len());
4802 for hit in results {
4803 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4804 continue;
4805 };
4806 best_by_message
4807 .entry(parsed.message_id)
4808 .and_modify(|entry| {
4809 if hit.score > entry.score {
4810 entry.score = hit.score;
4811 entry.chunk_idx = parsed.chunk_idx;
4812 }
4813 })
4814 .or_insert(VectorSearchResult {
4815 message_id: parsed.message_id,
4816 chunk_idx: parsed.chunk_idx,
4817 score: hit.score,
4818 });
4819 }
4820 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
4821 collapsed.sort_by(|a, b| {
4822 b.score
4823 .total_cmp(&a.score)
4824 .then_with(|| a.message_id.cmp(&b.message_id))
4825 });
4826 if collapsed.len() > fetch {
4827 collapsed.truncate(fetch);
4828 }
4829 collapsed
4830 }
4831
4832 fn hydrate_semantic_hits_with_ids(
4833 &self,
4834 results: &[VectorSearchResult],
4835 field_mask: FieldMask,
4836 ) -> Result<Vec<(u64, SearchHit)>> {
4837 if results.is_empty() {
4838 return Ok(Vec::new());
4839 }
4840 let sqlite_guard = self.sqlite_guard()?;
4841 let conn = sqlite_guard
4842 .as_ref()
4843 .ok_or_else(|| anyhow!("semantic search requires database connection"))?;
4844
4845 let placeholder_capacity = results.len().saturating_mul(2).saturating_sub(1);
4846 let mut placeholders = String::with_capacity(placeholder_capacity);
4847 let mut params: Vec<ParamValue> = Vec::with_capacity(results.len());
4848 for (idx, result) in results.iter().enumerate() {
4849 if idx > 0 {
4850 placeholders.push(',');
4851 }
4852 placeholders.push('?');
4853 params.push(ParamValue::from(i64::try_from(result.message_id)?));
4854 }
4855
4856 let title_expr = if field_mask.wants_title() {
4857 "c.title"
4858 } else {
4859 "''"
4860 };
4861 let normalized_source_sql =
4862 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4863 let sql = format!(
4868 "SELECT m.id, c.id, m.content, m.created_at, m.idx, m.role, {title_expr}, c.source_path, {normalized_source_sql}, c.origin_host, COALESCE(a.slug, 'unknown'), w.path, s.kind, c.started_at
4869 FROM messages m
4870 JOIN conversations c ON m.conversation_id = c.id
4871 LEFT JOIN agents a ON c.agent_id = a.id
4872 LEFT JOIN workspaces w ON c.workspace_id = w.id
4873 LEFT JOIN sources s ON c.source_id = s.id
4874 WHERE m.id IN ({placeholders})"
4875 );
4876
4877 let rows: Vec<(u64, SearchHit)> =
4878 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4879 let message_id: i64 = row.get_typed(0)?;
4880 let conversation_id: i64 = row.get_typed(1)?;
4881 let full_content: String = row.get_typed(2)?;
4882 let msg_created_at: Option<i64> = row.get_typed(3)?;
4883 let idx: Option<i64> = row.get_typed(4)?;
4884 let title: Option<String> = if field_mask.wants_title() {
4885 row.get_typed(6)?
4886 } else {
4887 None
4888 };
4889 let source_path: String = row.get_typed(7)?;
4890 let raw_source_id: String = row.get_typed(8)?;
4891 let origin_host: Option<String> = row.get_typed(9)?;
4892 let agent: String = row.get_typed(10)?;
4893 let workspace: Option<String> = row.get_typed(11)?;
4894 let raw_origin_kind: Option<String> = row.get_typed(12)?;
4895 let started_at: Option<i64> = row.get_typed(13)?;
4896
4897 let created_at = msg_created_at.or(started_at);
4898 let line_number = idx
4899 .and_then(|i| usize::try_from(i).ok())
4900 .map(|i| i.saturating_add(1));
4901 let snippet = if field_mask.wants_snippet() {
4902 snippet_from_content(&full_content)
4903 } else {
4904 String::new()
4905 };
4906 let content = if field_mask.needs_content() {
4907 full_content.clone()
4908 } else {
4909 String::new()
4910 };
4911 let content_hash =
4912 stable_hit_hash(&full_content, &source_path, line_number, created_at);
4913 let source_id = normalized_search_hit_source_id_parts(
4914 raw_source_id.as_str(),
4915 raw_origin_kind.as_deref().unwrap_or_default(),
4916 origin_host.as_deref(),
4917 );
4918 let origin_kind =
4919 normalized_search_hit_origin_kind(&source_id, raw_origin_kind.as_deref());
4920
4921 let hit = SearchHit {
4922 title: if field_mask.wants_title() {
4923 title.unwrap_or_default()
4924 } else {
4925 String::new()
4926 },
4927 snippet,
4928 content,
4929 content_hash,
4930 conversation_id: Some(conversation_id),
4931 score: 0.0,
4932 source_path,
4933 agent,
4934 workspace: workspace.unwrap_or_default(),
4935 workspace_original: None,
4936 created_at,
4937 line_number,
4938 match_type: MatchType::Exact,
4939 source_id,
4940 origin_kind,
4941 origin_host,
4942 };
4943
4944 Ok((semantic_message_id_from_db(message_id)?, hit))
4945 })?;
4946
4947 let mut hits_by_id = HashMap::new();
4948 for (id, hit) in rows {
4949 hits_by_id.insert(id, hit);
4950 }
4951
4952 let mut ordered = Vec::new();
4953 for result in results {
4954 if let Some(mut hit) = hits_by_id.remove(&result.message_id) {
4955 hit.score = result.score;
4956 ordered.push((result.message_id, hit));
4957 }
4958 }
4959
4960 Ok(ordered)
4961 }
4962
4963 fn overlay_progressive_lexical_hit(
4964 &self,
4965 hit: &mut SearchHit,
4966 lexical: &ProgressiveLexicalHit,
4967 field_mask: FieldMask,
4968 ) {
4969 if field_mask.wants_title() && !lexical.title.is_empty() {
4970 hit.title = lexical.title.clone();
4971 }
4972 if field_mask.wants_snippet() && !lexical.snippet.is_empty() {
4973 hit.snippet = lexical.snippet.clone();
4974 }
4975 if field_mask.needs_content() && !lexical.content.is_empty() {
4976 hit.content = lexical.content.clone();
4977 }
4978 hit.match_type = lexical.match_type;
4979 hit.line_number = lexical.line_number.or(hit.line_number);
4980 }
4981
4982 fn progressive_phase_to_result(
4983 &self,
4984 results: &[FsScoredResult],
4985 ctx: ProgressivePhaseContext<'_>,
4986 ) -> Result<SearchResult> {
4987 let collapsed = self.collapse_progressive_scored_results(results, ctx.fetch_limit);
4988 let missing: Vec<VectorSearchResult> = collapsed
4989 .iter()
4990 .filter(|result| {
4991 ctx.lexical_cache
4992 .and_then(|cache| cache.hits_by_message.get(&result.message_id))
4993 .is_none()
4994 })
4995 .map(|result| VectorSearchResult {
4996 message_id: result.message_id,
4997 chunk_idx: result.chunk_idx,
4998 score: result.score,
4999 })
5000 .collect();
5001 let mut hydrated_by_id: HashMap<u64, SearchHit> = self
5002 .hydrate_semantic_hits_with_ids(&missing, ctx.field_mask)?
5003 .into_iter()
5004 .collect();
5005
5006 let mut hydrated: Vec<(u64, SearchHit)> = Vec::with_capacity(collapsed.len());
5007 for result in &collapsed {
5008 if let Some(cache) = ctx.lexical_cache
5009 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
5010 {
5011 hydrated.push((result.message_id, lexical.to_search_hit(result.score)));
5012 continue;
5013 }
5014 if let Some(mut hit) = hydrated_by_id.remove(&result.message_id) {
5015 if let Some(cache) = ctx.lexical_cache
5016 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
5017 {
5018 self.overlay_progressive_lexical_hit(&mut hit, lexical, ctx.field_mask);
5019 }
5020 hydrated.push((result.message_id, hit));
5021 }
5022 }
5023
5024 let mut hits: Vec<SearchHit> = hydrated.into_iter().map(|(_, hit)| hit).collect();
5025 (_, hits) = self.postprocess_hits_page(hits, ctx.query, ctx.filters, ctx.limit, 0);
5026
5027 let (wildcard_fallback, suggestions) = ctx
5028 .lexical_cache
5029 .map(|cache| {
5030 let suggestions = if hits.is_empty() {
5031 cache.suggestions.clone()
5032 } else {
5033 Vec::new()
5034 };
5035 (cache.wildcard_fallback, suggestions)
5036 })
5037 .unwrap_or((false, Vec::new()));
5038
5039 Ok(SearchResult {
5040 hits,
5041 wildcard_fallback,
5042 cache_stats: self.cache_stats(),
5043 suggestions,
5044 ann_stats: None,
5045 total_count: None,
5046 })
5047 }
5048
/// Runs a progressive search and reports each phase through `on_event`.
///
/// * `SearchMode::Lexical` performs one `search_with_fallback` call, emits a
///   single `Phase` event of kind `Initial`, and returns.
/// * `SearchMode::Semantic` and `SearchMode::Hybrid` drive a two-tier searcher
///   and emit one event per phase it reports. Hybrid additionally attaches a
///   lexical adapter to the searcher.
///
/// The limit is clamped to at least 1.
///
/// # Errors
/// Fails when the lexical search fails, when the progressive two-tier context
/// is unavailable, when converting a phase into a result fails, when the
/// semantic context changes while phases are being delivered, or when the
/// underlying searcher reports an error. An error recorded while handling a
/// phase takes precedence over the searcher's own result.
pub(crate) async fn search_progressive_with_callback(
    self: &Arc<Self>,
    request: ProgressiveSearchRequest<'_>,
    mut on_event: impl FnMut(ProgressiveSearchEvent) + Send,
) -> Result<()> {
    let ProgressiveSearchRequest {
        cx,
        query,
        filters,
        limit,
        sparse_threshold,
        field_mask,
        mode,
    } = request;
    let field_mask = effective_field_mask(field_mask);
    let limit = limit.max(1);
    let fetch_limit = progressive_phase_fetch_limit(limit);

    match mode {
        // Pure lexical mode has no refinement phase: emit one event and stop.
        SearchMode::Lexical => {
            let started = Instant::now();
            let result = self.search_with_fallback(
                query,
                filters,
                limit,
                0,
                sparse_threshold,
                field_mask,
            )?;
            on_event(ProgressiveSearchEvent::Phase {
                kind: ProgressivePhaseKind::Initial,
                elapsed_ms: started.elapsed().as_millis(),
                result,
            });
            return Ok(());
        }
        SearchMode::Semantic | SearchMode::Hybrid => {}
    }

    let progressive_context = {
        self.progressive_context()?
            .ok_or_else(|| anyhow!("progressive two-tier context unavailable"))?
    };
    // Kept so each phase can verify that the semantic context it started with
    // is still the current one.
    let progressive_context_token = Arc::clone(&progressive_context.context_token);

    // Shared with the lexical adapter (hybrid mode) and read once per phase.
    // NOTE(review): presumably the adapter fills this snapshot; confirm
    // against `CassProgressiveLexicalAdapter`.
    let lexical_cache: Arc<Mutex<ProgressiveLexicalSnapshot>> =
        Arc::new(Mutex::new(Arc::new(ProgressiveLexicalCache::default())));
    // Memoizes message text by message id for the lifetime of this call.
    let text_cache: Arc<Mutex<HashMap<u64, String>>> = Arc::new(Mutex::new(HashMap::new()));
    let text_client = Arc::clone(self);
    let text_cache_for_lookup = Arc::clone(&text_cache);
    // Resolves a doc id to its message text. Yields `None` when the doc id does
    // not parse, the database lookup fails, or the message has no row.
    let text_fn = move |doc_id: &str| -> Option<String> {
        let parsed = parse_semantic_doc_id(doc_id)?;
        if let Ok(cache) = text_cache_for_lookup.lock()
            && let Some(text) = cache.get(&parsed.message_id)
        {
            return Some(text.clone());
        }
        let loaded = text_client
            .load_message_text_by_id(parsed.message_id)
            .ok()
            .flatten()?;
        // A poisoned cache lock only disables memoization; the text is still
        // returned.
        if let Ok(mut cache) = text_cache_for_lookup.lock() {
            cache.insert(parsed.message_id, loaded.clone());
        }
        Some(loaded)
    };

    let mut searcher = FsTwoTierSearcher::new(
        Arc::clone(&progressive_context.index),
        Arc::clone(&progressive_context.fast_embedder),
        frankensearch_two_tier_config(),
    );

    if let Some(quality_embedder) = progressive_context.quality_embedder.as_ref() {
        searcher = searcher.with_quality_embedder(Arc::clone(quality_embedder));
    }

    if matches!(mode, SearchMode::Hybrid) {
        let lexical = Arc::new(CassProgressiveLexicalAdapter::new(
            Arc::clone(self),
            filters.clone(),
            field_mask,
            sparse_threshold,
            Arc::clone(&lexical_cache),
        ));
        searcher = searcher.with_lexical(lexical);
    }

    let phase_client = Arc::clone(self);
    let phase_filters = filters.clone();
    let phase_cache = Arc::clone(&lexical_cache);
    // First error raised while handling a phase; once set, later phases are
    // ignored and the error is returned after the search future completes.
    let mut phase_error: Option<anyhow::Error> = None;

    let search_result = searcher
        .search(cx, query, fetch_limit, text_fn, |phase| {
            if phase_error.is_some() {
                return;
            }
            // Abort (and request cancellation) if the semantic context was
            // swapped out since this search started.
            match phase_client.semantic_context_matches(&progressive_context_token) {
                Ok(true) => {}
                Ok(false) => {
                    phase_error = Some(anyhow!(
                        "progressive search aborted: semantic context changed"
                    ));
                    cx.set_cancel_requested(true);
                    return;
                }
                Err(err) => {
                    phase_error = Some(err);
                    cx.set_cancel_requested(true);
                    return;
                }
            }
            // Take a cheap `Arc` snapshot of the lexical cache; a poisoned lock
            // is treated as "no lexical cache".
            let lexical_snapshot = phase_cache.lock().ok().map(|guard| Arc::clone(&guard));
            let event_result = match phase {
                FsSearchPhase::Initial {
                    results, latency, ..
                } => phase_client
                    .progressive_phase_to_result(
                        &results,
                        ProgressivePhaseContext {
                            query,
                            filters: &phase_filters,
                            field_mask,
                            lexical_cache: lexical_snapshot.as_deref(),
                            limit,
                            fetch_limit,
                        },
                    )
                    .map(|result| ProgressiveSearchEvent::Phase {
                        kind: ProgressivePhaseKind::Initial,
                        elapsed_ms: latency.as_millis(),
                        result,
                    }),
                FsSearchPhase::Refined {
                    results, latency, ..
                } => phase_client
                    .progressive_phase_to_result(
                        &results,
                        ProgressivePhaseContext {
                            query,
                            filters: &phase_filters,
                            field_mask,
                            lexical_cache: lexical_snapshot.as_deref(),
                            limit,
                            fetch_limit,
                        },
                    )
                    .map(|result| ProgressiveSearchEvent::Phase {
                        kind: ProgressivePhaseKind::Refined,
                        elapsed_ms: latency.as_millis(),
                        result,
                    }),
                // A reranked phase is surfaced to the callback with the
                // `Refined` kind.
                FsSearchPhase::Reranked {
                    results, latency, ..
                } => phase_client
                    .progressive_phase_to_result(
                        &results,
                        ProgressivePhaseContext {
                            query,
                            filters: &phase_filters,
                            field_mask,
                            lexical_cache: lexical_snapshot.as_deref(),
                            limit,
                            fetch_limit,
                        },
                    )
                    .map(|result| ProgressiveSearchEvent::Phase {
                        kind: ProgressivePhaseKind::Refined,
                        elapsed_ms: latency.as_millis(),
                        result,
                    }),
                // A failed refinement is reported as an event, not as an error
                // of this function.
                FsSearchPhase::RefinementFailed { error, latency, .. } => {
                    Ok(ProgressiveSearchEvent::RefinementFailed {
                        latency_ms: latency.as_millis(),
                        error: error.to_string(),
                    })
                }
            };

            match event_result {
                Ok(event) => on_event(event),
                Err(err) => {
                    phase_error = Some(err);
                    cx.set_cancel_requested(true);
                }
            }
        })
        .await;

    // Errors captured inside the phase callback win over the searcher result.
    if let Some(err) = phase_error {
        return Err(err);
    }

    search_result
        .map(|_| ())
        .map_err(|err| anyhow!("progressive search failed: {err}"))
}
5252
/// Runs a semantic search in single-tier mode.
///
/// Forwards every argument unchanged to [`Self::search_semantic_with_tier`]
/// with the tier mode fixed to `SemanticTierMode::Single`.
///
/// Returns the requested page of hits together with optional ANN search
/// statistics.
///
/// # Errors
/// Propagates any error from [`Self::search_semantic_with_tier`].
pub fn search_semantic(
    &self,
    query: &str,
    filters: SearchFilters,
    limit: usize,
    offset: usize,
    field_mask: FieldMask,
    approximate: bool,
) -> Result<(
    Vec<SearchHit>,
    Option<crate::search::ann_index::AnnSearchStats>,
)> {
    self.search_semantic_with_tier(
        query,
        filters,
        limit,
        offset,
        field_mask,
        approximate,
        SemanticTierMode::Single,
    )
}
5276
/// Runs a semantic search with an explicit tier mode.
///
/// The query is canonicalized for embedding; an empty canonical query yields
/// an empty result without touching the index. A `limit` of 0 means "no
/// explicit limit" and is replaced by
/// `total_docs().min(no_limit_result_cap()).max(1)`.
///
/// Candidates are first fetched with a window of `limit + offset`. If that
/// window turns out to be too small, one retry is made with three times the
/// window. Whenever the semantic context token changes while work is in
/// flight, the whole attempt is restarted.
///
/// Returns the requested page of hits and the ANN statistics of the candidate
/// search that produced them.
///
/// # Errors
/// Fails when the semantic lock is poisoned, when no semantic state is
/// installed, or when embedding, candidate search or hydration fails.
// The argument list mirrors the public search API; see `search_semantic`.
#[allow(clippy::too_many_arguments)]
pub fn search_semantic_with_tier(
    &self,
    query: &str,
    filters: SearchFilters,
    limit: usize,
    offset: usize,
    field_mask: FieldMask,
    approximate: bool,
    tier_mode: SemanticTierMode,
) -> Result<(
    Vec<SearchHit>,
    Option<crate::search::ann_index::AnnSearchStats>,
)> {
    let field_mask = effective_field_mask(field_mask);
    let canonical = canonicalize_for_embedding(query);
    if canonical.trim().is_empty() {
        return Ok((Vec::new(), None));
    }
    // limit == 0 requests "everything", bounded by the configured cap.
    let limit = if limit == 0 {
        self.total_docs().min(no_limit_result_cap()).max(1)
    } else {
        limit
    };
    let target_hits = limit.saturating_add(offset);
    if target_hits == 0 {
        return Ok((Vec::new(), None));
    }
    let initial_fetch_limit = target_hits;
    let fallback_fetch_limit = target_hits.saturating_mul(3);
    // Outer loop: restart the whole search when the semantic context changes
    // underneath a candidate fetch.
    loop {
        // Inner loop: assemble an embedding, candidate context and optional
        // indexes that all belong to the same context token.
        let (embedding, candidate_context, in_memory_two_tier_index, ann_index, context_token) = loop {
            let embedding = self.semantic_query_embedding(&canonical)?;
            let (candidate_context, context_token) = {
                let guard = self
                    .semantic
                    .lock()
                    .map_err(|_| anyhow!("semantic lock poisoned"))?;
                let state = guard.as_ref().ok_or_else(|| {
                    anyhow!("semantic search unavailable (no embedder or vector index)")
                })?;
                (
                    SemanticCandidateContext {
                        fs_semantic_index: Arc::clone(&state.fs_semantic_index),
                        fs_semantic_indexes: Arc::clone(&state.fs_semantic_indexes),
                        filter_maps: state.filter_maps.clone(),
                        roles: state.roles.clone(),
                    },
                    Arc::clone(&state.context_token),
                )
            };
            // The embedding was produced under a different context: redo it.
            if !Arc::ptr_eq(&embedding.context_token, &context_token) {
                continue;
            }
            // The two-tier index is only consulted for exact (non-approximate)
            // searches in a tier mode that asks for it.
            let in_memory_two_tier_index = if tier_mode.wants_two_tier() && !approximate {
                self.in_memory_two_tier_index(tier_mode)?
            } else {
                None
            };
            let ann_index = if approximate {
                Some(self.ann_index()?)
            } else {
                None
            };

            // Re-check the token after the index lookups above; the lock was
            // not held while they ran.
            let guard = self
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard.as_ref().ok_or_else(|| {
                anyhow!("semantic search unavailable (no embedder or vector index)")
            })?;
            if !Arc::ptr_eq(&state.context_token, &context_token) {
                continue;
            }
            break (
                embedding.vector,
                candidate_context,
                in_memory_two_tier_index,
                ann_index,
                context_token,
            );
        };

        // Hydrates candidates and applies dedup, session-path filtering and
        // paging; yields (hits available before paging, requested page).
        let finalize_hits =
            |results: &[VectorSearchResult]| -> Result<(usize, Vec<SearchHit>)> {
                let hits = self.hydrate_semantic_hits(results, field_mask)?;
                Ok(self.postprocess_hits_page(hits, query, &filters, limit, offset))
            };

        let (results, retry_state, mut ann_stats) = self.search_semantic_candidates(
            &candidate_context,
            &embedding,
            &filters,
            SemanticCandidateSearchRequest {
                fetch_limit: initial_fetch_limit,
                approximate,
                tier_mode,
                in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
                ann_index: ann_index.as_ref(),
            },
        )?;
        if !self.semantic_context_matches(&context_token)? {
            tracing::debug!("semantic context changed during candidate search; retrying");
            continue;
        }
        let (mut available_hits, mut paged_hits) = finalize_hits(&results)?;

        // Retry with the wider window when the first window came up short and
        // more candidates exist, or when the candidate search reported that
        // its exact window may have left out a competing result. The first
        // comparison is false only when the 3x multiplication saturated.
        let needs_retry = initial_fetch_limit < fallback_fetch_limit
            && ((available_hits < target_hits && retry_state.has_more_candidates)
                || retry_state.exact_window_may_omit_competitor);

        if needs_retry {
            tracing::debug!(
                query = canonical,
                target_hits,
                available_hits,
                initial_fetch_limit,
                fallback_fetch_limit,
                "retrying semantic fetch due to candidate-window shortfall"
            );
            let (retry_results, _, retry_ann_stats) = self.search_semantic_candidates(
                &candidate_context,
                &embedding,
                &filters,
                SemanticCandidateSearchRequest {
                    fetch_limit: fallback_fetch_limit,
                    approximate,
                    tier_mode,
                    in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
                    ann_index: ann_index.as_ref(),
                },
            )?;
            if !self.semantic_context_matches(&context_token)? {
                tracing::debug!("semantic context changed during retry fetch; retrying");
                continue;
            }
            (available_hits, paged_hits) = finalize_hits(&retry_results)?;
            ann_stats = retry_ann_stats;
        }

        tracing::trace!(
            query = canonical,
            target_hits,
            available_hits,
            returned = paged_hits.len(),
            "semantic fetch complete"
        );

        return Ok((paged_hits, ann_stats));
    }
}
5430
5431 fn hydrate_semantic_hits(
5432 &self,
5433 results: &[VectorSearchResult],
5434 field_mask: FieldMask,
5435 ) -> Result<Vec<SearchHit>> {
5436 self.hydrate_semantic_hits_with_ids(results, field_mask)
5437 .map(|rows| rows.into_iter().map(|(_, hit)| hit).collect())
5438 }
5439
5440 fn postprocess_hits_page(
5441 &self,
5442 hits: Vec<SearchHit>,
5443 query: &str,
5444 filters: &SearchFilters,
5445 limit: usize,
5446 offset: usize,
5447 ) -> (usize, Vec<SearchHit>) {
5448 let mut hits = deduplicate_hits_with_query(hits, query);
5449 if !filters.session_paths.is_empty() {
5450 hits.retain(|hit| filters.session_paths.contains(&hit.source_path));
5451 }
5452 let available_hits = hits.len();
5453 let paged_hits = hits.into_iter().skip(offset).take(limit).collect();
5454 (available_hits, paged_hits)
5455 }
5456
5457 pub fn search_with_fallback(
5461 &self,
5462 query: &str,
5463 filters: SearchFilters,
5464 limit: usize,
5465 offset: usize,
5466 sparse_threshold: usize,
5467 field_mask: FieldMask,
5468 ) -> Result<SearchResult> {
5469 let hits = self.search(query, filters.clone(), limit, offset, field_mask)?;
5471 let baseline_stats = self.cache_stats();
5472 let tantivy_total = self
5474 .last_tantivy_total_count
5475 .lock()
5476 .ok()
5477 .and_then(|guard| *guard);
5478
5479 let query_has_wildcards = query.contains('*');
5481 let has_boolean_or_phrase = fs_cass_has_boolean_operators(query);
5482 let is_sparse = should_try_wildcard_fallback(hits.len(), limit, offset, sparse_threshold);
5483
5484 if !is_sparse || query_has_wildcards || has_boolean_or_phrase || query.trim().is_empty() {
5485 let suggestions = if hits.is_empty() && !query.trim().is_empty() {
5489 self.generate_suggestions(query, &filters)
5490 } else {
5491 Vec::new()
5492 };
5493 return Ok(SearchResult {
5494 hits,
5495 wildcard_fallback: false,
5496 cache_stats: baseline_stats,
5497 suggestions,
5498 ann_stats: None,
5499 total_count: tantivy_total,
5500 });
5501 }
5502
5503 if should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(query, hits.len()) {
5504 let suggestions = if hits.is_empty() {
5505 self.generate_suggestions(query, &filters)
5506 } else {
5507 Vec::new()
5508 };
5509 return Ok(SearchResult {
5510 hits,
5511 wildcard_fallback: false,
5512 cache_stats: baseline_stats,
5513 suggestions,
5514 ann_stats: None,
5515 total_count: tantivy_total,
5516 });
5517 }
5518
5519 let wildcard_query = query
5521 .split_whitespace()
5522 .map(|term| format!("*{}*", term.trim_matches('*')))
5523 .collect::<Vec<_>>()
5524 .join(" ");
5525
5526 tracing::info!(
5527 original_query = query,
5528 wildcard_query = wildcard_query,
5529 original_count = hits.len(),
5530 "wildcard_fallback"
5531 );
5532
5533 let mut fallback_hits =
5534 self.search(&wildcard_query, filters.clone(), limit, offset, field_mask)?;
5535 let fallback_stats = self.cache_stats();
5536 let fallback_tantivy_total = self
5538 .last_tantivy_total_count
5539 .lock()
5540 .ok()
5541 .and_then(|guard| *guard);
5542
5543 if fallback_hits.len() > hits.len() {
5545 for hit in &mut fallback_hits {
5547 hit.match_type = MatchType::ImplicitWildcard;
5548 }
5549 let suggestions = if fallback_hits.is_empty() {
5551 self.generate_suggestions(query, &filters)
5552 } else {
5553 Vec::new()
5554 };
5555 Ok(SearchResult {
5556 hits: fallback_hits,
5557 wildcard_fallback: true,
5558 cache_stats: fallback_stats,
5559 suggestions,
5560 ann_stats: None,
5561 total_count: fallback_tantivy_total,
5562 })
5563 } else {
5564 let suggestions = if hits.is_empty() {
5567 self.generate_suggestions(query, &filters)
5568 } else {
5569 Vec::new()
5570 };
5571 Ok(SearchResult {
5572 hits,
5573 wildcard_fallback: false,
5574 cache_stats: baseline_stats,
5575 suggestions,
5576 ann_stats: None,
5577 total_count: tantivy_total,
5578 })
5579 }
5580 }
5581
/// Runs a hybrid (lexical + semantic) search in single-tier semantic mode.
///
/// Forwards every argument unchanged to [`Self::search_hybrid_with_tier`]
/// with the semantic tier mode fixed to `SemanticTierMode::Single`.
///
/// # Errors
/// Propagates any error from [`Self::search_hybrid_with_tier`].
// The argument list mirrors the underlying tiered entry point.
#[allow(clippy::too_many_arguments)]
pub fn search_hybrid(
    &self,
    lexical_query: &str,
    semantic_query: &str,
    filters: SearchFilters,
    limit: usize,
    offset: usize,
    sparse_threshold: usize,
    field_mask: FieldMask,
    approximate: bool,
) -> Result<SearchResult> {
    self.search_hybrid_with_tier(
        lexical_query,
        semantic_query,
        filters,
        limit,
        offset,
        sparse_threshold,
        field_mask,
        approximate,
        SemanticTierMode::Single,
    )
}
5607
5608 #[allow(clippy::too_many_arguments)]
5611 pub fn search_hybrid_with_tier(
5612 &self,
5613 lexical_query: &str,
5614 semantic_query: &str,
5615 filters: SearchFilters,
5616 limit: usize,
5617 offset: usize,
5618 sparse_threshold: usize,
5619 field_mask: FieldMask,
5620 approximate: bool,
5621 semantic_tier_mode: SemanticTierMode,
5622 ) -> Result<SearchResult> {
5623 let requested_limit = limit;
5624 let total_docs = self.total_docs().max(1);
5625 let limit = if requested_limit == 0 {
5626 total_docs.min(no_limit_result_cap()).max(1)
5627 } else {
5628 requested_limit
5629 };
5630 let fetch = limit.saturating_add(offset);
5631 if fetch == 0 {
5632 return Ok(SearchResult {
5633 hits: Vec::new(),
5634 wildcard_fallback: false,
5635 cache_stats: self.cache_stats(),
5636 suggestions: Vec::new(),
5637 ann_stats: None,
5638 total_count: None,
5639 });
5640 }
5641
5642 if semantic_query.trim().is_empty() {
5643 return self.search_with_fallback(
5644 lexical_query,
5645 filters,
5646 limit,
5647 offset,
5648 sparse_threshold,
5649 field_mask,
5650 );
5651 }
5652
5653 let budget =
5654 hybrid_candidate_budget(semantic_query, requested_limit, limit, offset, total_docs);
5655 let lexical = self.search_with_fallback(
5656 lexical_query,
5657 filters.clone(),
5658 budget.lexical_candidates,
5659 0,
5660 sparse_threshold,
5661 field_mask,
5662 )?;
5663 let (semantic_hits, semantic_ann_stats) = self.search_semantic_with_tier(
5664 semantic_query,
5665 filters,
5666 budget.semantic_candidates,
5667 0,
5668 field_mask,
5669 approximate,
5670 semantic_tier_mode,
5671 )?;
5672 let fused = rrf_fuse_hits(&lexical.hits, &semantic_hits, semantic_query, limit, offset);
5673 let suggestions = if fused.is_empty() {
5674 lexical.suggestions.clone()
5675 } else {
5676 Vec::new()
5677 };
5678 Ok(SearchResult {
5679 hits: fused,
5680 wildcard_fallback: lexical.wildcard_fallback,
5681 cache_stats: lexical.cache_stats,
5682 suggestions,
5683 ann_stats: semantic_ann_stats,
5684 total_count: None,
5685 })
5686 }
5687
5688 fn generate_suggestions(&self, query: &str, filters: &SearchFilters) -> Vec<QuerySuggestion> {
5690 let mut suggestions = Vec::new();
5691 let query_lower = query.to_lowercase();
5692
5693 if !query.contains('*') && query.len() >= 2 {
5695 suggestions.push(QuerySuggestion::wildcard(query).with_shortcut(1));
5696 }
5697
5698 if !filters.agents.is_empty() {
5700 let agents: Vec<&str> = filters
5701 .agents
5702 .iter()
5703 .map(std::string::String::as_str)
5704 .collect();
5705 let agent_str = agents.join(", ");
5706 suggestions
5707 .push(QuerySuggestion::remove_agent_filter(&agent_str, filters).with_shortcut(2));
5708 }
5709
5710 let known_agents = [
5712 "codex",
5713 "claude",
5714 "claude_code",
5715 "cline",
5716 "gemini",
5717 "amp",
5718 "opencode",
5719 ];
5720 for agent in &known_agents {
5721 if levenshtein_distance(&query_lower, agent) <= 2 && query_lower != *agent {
5722 suggestions.push(
5723 QuerySuggestion::spelling(query, agent)
5724 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5725 );
5726 break; }
5728 }
5729
5730 if filters.agents.is_empty()
5734 && let Ok(sqlite_guard) = self.sqlite.lock()
5735 && let Some(conn) = sqlite_guard.as_ref()
5736 && let Ok(rows) = conn.query_map_collect(
5737 "SELECT a.slug
5738 FROM conversations c
5739 JOIN agents a ON c.agent_id = a.id
5740 GROUP BY a.slug
5741 ORDER BY MAX(c.id) DESC
5742 LIMIT 3",
5743 &[],
5744 |row: &frankensqlite::Row| row.get_typed::<String>(0),
5745 )
5746 {
5747 for row in rows {
5748 if suggestions.len() < 3 {
5749 suggestions.push(
5750 QuerySuggestion::try_agent(&row)
5751 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5752 );
5753 }
5754 }
5755 }
5756
5757 suggestions.truncate(3);
5759 for (i, sugg) in suggestions.iter_mut().enumerate() {
5760 sugg.shortcut = Some((i + 1) as u8);
5761 }
5762
5763 suggestions
5764 }
5765
5766 fn searcher_for_thread(&self, reader: &IndexReader) -> Searcher {
5767 let epoch = self.reload_epoch.load(Ordering::Relaxed);
5768 let reader_key = reader as *const IndexReader as usize;
5769 THREAD_SEARCHER.with(|slot| {
5770 let mut slot = slot.borrow_mut();
5771 if let Some(entry) = slot.as_ref()
5772 && entry.epoch == epoch
5773 && entry.reader_key == reader_key
5774 {
5775 return entry.searcher.clone();
5776 }
5777 let searcher = reader.searcher();
5778 *slot = Some(SearcherCacheEntry {
5779 epoch,
5780 reader_key,
5781 searcher: searcher.clone(),
5782 });
5783 searcher
5784 })
5785 }
5786
5787 fn federated_readers(&self) -> Option<Arc<Vec<FederatedIndexReader>>> {
5788 FEDERATED_SEARCH_READERS
5789 .read()
5790 .get(&self.cache_namespace)
5791 .cloned()
5792 }
5793
5794 fn maybe_reload_federated_readers(
5795 &self,
5796 readers: &[FederatedIndexReader],
5797 ) -> Result<Option<u64>> {
5798 if !self.reload_on_search || readers.is_empty() {
5799 return Ok(None);
5800 }
5801 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
5802 let now = Instant::now();
5803 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
5804 if guard
5805 .map(|t| now.duration_since(t) < MIN_RELOAD_INTERVAL)
5806 .unwrap_or(false)
5807 {
5808 let signature = self.federated_generation_signature(readers);
5809 return Ok(Some(signature));
5810 }
5811
5812 let reload_started = Instant::now();
5813 for shard in readers {
5814 shard.reader.reload()?;
5815 }
5816 let elapsed = reload_started.elapsed();
5817 *guard = Some(now);
5818 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
5819 self.metrics.record_reload(elapsed);
5820 tracing::debug!(
5821 duration_ms = elapsed.as_millis() as u64,
5822 reload_epoch = epoch,
5823 shards = readers.len(),
5824 "tantivy_reader_reload_federated"
5825 );
5826 Ok(Some(self.federated_generation_signature(readers)))
5827 }
5828
5829 fn federated_generation_signature(&self, readers: &[FederatedIndexReader]) -> u64 {
5830 let mut hasher = std::collections::hash_map::DefaultHasher::new();
5831 readers.len().hash(&mut hasher);
5832 for shard in readers {
5833 self.searcher_for_thread(&shard.reader)
5834 .generation()
5835 .generation_id()
5836 .hash(&mut hasher);
5837 }
5838 hasher.finish()
5839 }
5840
5841 fn track_generation(&self, generation: u64) {
5842 let mut guard = self
5843 .last_generation
5844 .lock()
5845 .unwrap_or_else(|e| e.into_inner());
5846 if let Some(prev) = *guard
5847 && prev != generation
5848 && let Ok(mut cache) = self.prefix_cache.lock()
5849 {
5850 cache.clear();
5851 }
5852 *guard = Some(generation);
5853 }
5854
5855 fn hydrate_tantivy_hit_contents(
5856 &self,
5857 exact_keys: &[TantivyContentExactKey],
5858 fallback_keys: &[TantivyContentFallbackKey],
5859 ) -> Result<TantivyHydratedContentMaps> {
5860 if exact_keys.is_empty() && fallback_keys.is_empty() {
5861 return Ok((HashMap::new(), HashMap::new()));
5862 }
5863
5864 let sqlite_guard = match self.sqlite_guard() {
5865 Ok(guard) => guard,
5866 Err(_) => return Ok((HashMap::new(), HashMap::new())),
5867 };
5868 let Some(conn) = sqlite_guard.as_ref() else {
5869 return Ok((HashMap::new(), HashMap::new()));
5870 };
5871
5872 let mut hydrated_exact = HashMap::new();
5873 let mut hydrated_fallback = HashMap::new();
5874 const CHUNK_SIZE: usize = 300;
5875
5876 if !exact_keys.is_empty() {
5877 let mut unique_exact_keys = Vec::with_capacity(exact_keys.len());
5878 let mut seen = HashSet::with_capacity(exact_keys.len());
5879 for key in exact_keys {
5880 if seen.insert(*key) {
5881 unique_exact_keys.push(*key);
5882 }
5883 }
5884
5885 hydrated_exact.extend(hydrate_message_content_by_conversation(
5886 conn,
5887 &unique_exact_keys,
5888 )?);
5889 }
5890
5891 if !fallback_keys.is_empty() {
5892 let mut unique_fallback_keys = Vec::with_capacity(fallback_keys.len());
5893 let mut seen = HashSet::with_capacity(fallback_keys.len());
5894 for key in fallback_keys {
5895 if seen.insert(key.clone()) {
5896 unique_fallback_keys.push(key.clone());
5897 }
5898 }
5899
5900 let mut unique_source_paths = Vec::with_capacity(unique_fallback_keys.len());
5901 let mut seen_source_paths = HashSet::with_capacity(unique_fallback_keys.len());
5902 for (_, source_path, _) in &unique_fallback_keys {
5903 if seen_source_paths.insert(source_path.clone()) {
5904 unique_source_paths.push(source_path.clone());
5905 }
5906 }
5907
5908 let mut conversations_by_key: HashMap<(String, String), Vec<i64>> = HashMap::new();
5909 for chunk in unique_source_paths.chunks(CHUNK_SIZE) {
5910 let placeholders = sql_placeholders(chunk.len());
5911 let sql = format!(
5912 "SELECT c.id,
5913 c.source_path,
5914 COALESCE(c.source_id, ''),
5915 COALESCE(c.origin_host, ''),
5916 COALESCE(s.kind, '')
5917 FROM conversations c
5918 LEFT JOIN sources s ON c.source_id = s.id
5919 WHERE c.source_path IN ({placeholders})
5920 ORDER BY c.id"
5921 );
5922 let params = chunk
5923 .iter()
5924 .map(|source_path| ParamValue::from(source_path.clone()))
5925 .collect::<Vec<_>>();
5926 let rows: Vec<(i64, String, String, String, String)> =
5927 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
5928 Ok((
5929 row.get_typed(0)?,
5930 row.get_typed(1)?,
5931 row.get_typed(2)?,
5932 row.get_typed(3)?,
5933 row.get_typed(4)?,
5934 ))
5935 })?;
5936
5937 for (conversation_id, source_path, raw_source_id, origin_host, origin_kind) in rows
5938 {
5939 let normalized_source_id = normalized_search_hit_source_id_parts(
5940 &raw_source_id,
5941 &origin_kind,
5942 (!origin_host.trim().is_empty()).then_some(origin_host.as_str()),
5943 );
5944 conversations_by_key
5945 .entry((normalized_source_id, source_path))
5946 .or_default()
5947 .push(conversation_id);
5948 }
5949 }
5950
5951 let mut message_requests = Vec::new();
5952 let mut fallback_keys_by_exact: HashMap<
5953 TantivyContentExactKey,
5954 Vec<TantivyContentFallbackKey>,
5955 > = HashMap::new();
5956 let mut seen_message_requests = HashSet::new();
5957 for (source_id, source_path, line_idx) in &unique_fallback_keys {
5958 let key = (source_id.clone(), source_path.clone());
5959 let Some(conversation_ids) = conversations_by_key.get(&key) else {
5960 continue;
5961 };
5962 for &conversation_id in conversation_ids {
5963 let exact_key = (conversation_id, *line_idx);
5964 if seen_message_requests.insert(exact_key) {
5965 message_requests.push(exact_key);
5966 }
5967 fallback_keys_by_exact.entry(exact_key).or_default().push((
5968 source_id.clone(),
5969 source_path.clone(),
5970 *line_idx,
5971 ));
5972 }
5973 }
5974
5975 for ((conversation_id, line_idx), content) in
5976 hydrate_message_content_by_conversation(conn, &message_requests)?
5977 {
5978 if let Some(fallback_keys) =
5979 fallback_keys_by_exact.get(&(conversation_id, line_idx))
5980 {
5981 for fallback_key in fallback_keys {
5982 hydrated_fallback.insert(fallback_key.clone(), content.clone());
5983 }
5984 }
5985 }
5986 }
5987
5988 Ok((hydrated_exact, hydrated_fallback))
5989 }
5990
/// Executes a lexical query against a single Tantivy reader and builds the resulting
/// [`SearchHit`]s.
///
/// Returns the hits for the requested `limit`/`offset` window together with the total count
/// reported by `execute_query_with_lazy_exact_count`.
///
/// The work is done in two passes. The first pass loads every ranked document, extracts its
/// stored fields and records which hits have no stored content usable for `field_mask`. After a
/// single batched call to `hydrate_tantivy_hit_contents`, the second pass chooses the effective
/// content for each hit and derives the snippet, returned content, content hash and origin kind.
///
/// # Errors
///
/// Propagates failures from reloading the reader, executing the query, loading a document, or
/// hydrating missing content.
#[allow(clippy::too_many_arguments)]
fn search_tantivy(
    &self,
    reader: &IndexReader,
    fields: &FsCassFields,
    raw_query: &str,
    sanitized_query: &str,
    filters: SearchFilters,
    limit: usize,
    offset: usize,
    field_mask: FieldMask,
) -> Result<(Vec<SearchHit>, usize)> {
    // Everything extracted from one ranked document in the first pass, kept until the batched
    // content hydration has run and the final `SearchHit` can be assembled.
    struct PendingTantivyHit {
        score: f32,
        doc: TantivyDocument,
        title: String,
        stored_content: String,
        stored_preview: String,
        agent: String,
        source_path: String,
        workspace: String,
        workspace_original: Option<String>,
        created_at: Option<i64>,
        line_number: Option<usize>,
        stored_preview_snippet: Option<String>,
        source_id: String,
        conversation_id: Option<i64>,
        raw_origin_kind: Option<String>,
        origin_host: Option<String>,
    }

    self.maybe_reload_reader(reader)?;
    let searcher = self.searcher_for_thread(reader);
    self.track_generation(searcher.generation().generation_id());

    // Building a snippet needs the message text even when the caller did not ask for content.
    let wants_snippet = field_mask.wants_snippet();
    let needs_content = field_mask.needs_content() || wants_snippet;

    // Translate the caller's filters into the frankensearch filter type; an explicit source id
    // is normalized first.
    // NOTE(review): `filters.session_paths` is not forwarded here — confirm it is applied by the
    // caller or elsewhere.
    let fs_filters = FsCassQueryFilters {
        agents: filters.agents.into_iter().collect(),
        workspaces: filters.workspaces.into_iter().collect(),
        created_from: filters.created_from,
        created_to: filters.created_to,
        source_filter: match filters.source_filter {
            SourceFilter::All => FsCassSourceFilter::All,
            SourceFilter::Local => FsCassSourceFilter::Local,
            SourceFilter::Remote => FsCassSourceFilter::Remote,
            SourceFilter::SourceId(id) => {
                FsCassSourceFilter::SourceId(normalize_search_source_filter_value(&id))
            }
        },
    };

    // The query itself is built from the raw query; the sanitized query is only used for the
    // prefix-only check, the match type and the snippet helpers below.
    let q: Box<dyn Query> = fs_cass_build_tantivy_query(raw_query, &fs_filters, fields);

    let prefix_only = is_prefix_only(sanitized_query);
    let top_docs = execute_query_with_lazy_exact_count(&searcher, &*q, limit, offset)?;
    let tantivy_total_count = top_docs.total_count;
    let query_match_type = dominant_match_type(sanitized_query);
    let mut pending_hits = Vec::with_capacity(top_docs.hits.len());
    // Keys of hits whose content must be hydrated: by (conversation id, message index) when the
    // document carries a conversation id, otherwise by (source id, source path, message index).
    let mut missing_exact_content_keys = Vec::new();
    let mut missing_fallback_content_keys = Vec::new();

    // Pass 1: load each ranked document and pull out its stored fields.
    for ranked_hit in top_docs.hits {
        let score = ranked_hit.bm25_score;
        let doc: TantivyDocument = fs_load_doc(&searcher, ranked_hit.doc_address)?;
        let title = if field_mask.wants_title() {
            doc.get_first(fields.title)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string()
        } else {
            String::new()
        };
        let stored_content = doc
            .get_first(fields.content)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let stored_preview = doc
            .get_first(fields.preview)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let stored_preview_snippet = snippet_from_preview_without_full_content(
            field_mask,
            &stored_preview,
            sanitized_query,
        );
        let agent = doc
            .get_first(fields.agent)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let workspace = doc
            .get_first(fields.workspace)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let workspace_original = doc
            .get_first(fields.workspace_original)
            .and_then(|v| v.as_str())
            .filter(|s| !s.is_empty())
            .map(String::from);
        let created_at = doc.get_first(fields.created_at).and_then(|v| v.as_i64());
        // The reported line number is the stored message index plus one.
        let line_number = doc
            .get_first(fields.msg_idx)
            .and_then(|v| v.as_u64())
            .and_then(|i| usize::try_from(i).ok())
            .map(|i| i.saturating_add(1));
        let raw_source_id = doc
            .get_first(fields.source_id)
            .and_then(|v| v.as_str())
            .unwrap_or_default()
            .to_string();
        // `conversation_id` is an optional schema field, so it may be absent from the index.
        let conversation_id = fields
            .conversation_id
            .and_then(|field| doc.get_first(field))
            .and_then(|v| v.as_i64());
        let source_path = doc
            .get_first(fields.source_path)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let raw_origin_kind = doc
            .get_first(fields.origin_kind)
            .and_then(|v| v.as_str())
            .map(str::to_string);
        let origin_host = doc
            .get_first(fields.origin_host)
            .and_then(|v| v.as_str())
            .filter(|s| !s.is_empty())
            .map(String::from);
        let source_id = normalized_search_hit_source_id_parts(
            raw_source_id.as_str(),
            raw_origin_kind.as_deref().unwrap_or_default(),
            origin_host.as_deref(),
        );

        // The stored preview can stand in for the content either when the mask only asks for a
        // bounded amount of content, or when full content is requested and the preview is known
        // to be the complete message.
        let preview_satisfies_bounded_content =
            field_mask.preview_content_limit().is_some() && !stored_preview.is_empty();
        let preview_satisfies_full_content = field_mask.needs_content()
            && field_mask.preview_content_limit().is_none()
            && stored_preview_is_complete_content(&stored_preview);
        // Request hydration only when content is needed, the message index is known, and
        // neither the stored content, the preview, nor a preview-derived snippet covers it.
        if needs_content
            && let Some(line_idx) = line_number
                .and_then(|line| line.checked_sub(1))
                .and_then(|line| i64::try_from(line).ok())
            && stored_content.is_empty()
            && !preview_satisfies_bounded_content
            && !preview_satisfies_full_content
            && stored_preview_snippet.is_none()
        {
            if let Some(conversation_id) = conversation_id {
                missing_exact_content_keys.push((conversation_id, line_idx));
            } else {
                missing_fallback_content_keys.push((
                    source_id.clone(),
                    source_path.clone(),
                    line_idx,
                ));
            }
        }

        pending_hits.push(PendingTantivyHit {
            score,
            doc,
            title,
            stored_content,
            stored_preview,
            agent,
            source_path,
            workspace,
            workspace_original,
            created_at,
            line_number,
            stored_preview_snippet,
            source_id,
            conversation_id,
            raw_origin_kind,
            origin_host,
        });
    }

    // One batched hydration call for the whole page, skipped when nothing is missing.
    let (hydrated_contents, hydrated_fallback_contents) = if needs_content
        && (!missing_exact_content_keys.is_empty() || !missing_fallback_content_keys.is_empty())
    {
        self.hydrate_tantivy_hit_contents(
            &missing_exact_content_keys,
            &missing_fallback_content_keys,
        )?
    } else {
        (HashMap::new(), HashMap::new())
    };
    // The Tantivy snippet generator is only built when at least one hit still lacks a
    // preview-derived snippet and the query is not prefix-only.
    let needs_tantivy_snippet_generator = wants_snippet
        && !prefix_only
        && pending_hits
            .iter()
            .any(|pending| pending.stored_preview_snippet.is_none());
    let snippet_generator = if needs_tantivy_snippet_generator {
        let snippet_cfg = FsSnippetConfig {
            max_chars: 160,
            highlight_prefix: "<b>".to_string(),
            highlight_postfix: "</b>".to_string(),
        };
        fs_try_build_snippet_generator(&searcher, &*q, fields.content, &snippet_cfg)
    } else {
        None
    };
    // Pass 2: assemble the final hits.
    let mut hits = Vec::with_capacity(pending_hits.len());
    for pending in pending_hits {
        // Look the hit up with the same key shape that was used when requesting hydration.
        let hydrated_content = pending
            .line_number
            .and_then(|line| line.checked_sub(1))
            .and_then(|line| i64::try_from(line).ok())
            .and_then(|line_idx| {
                if let Some(conversation_id) = pending.conversation_id {
                    hydrated_contents.get(&(conversation_id, line_idx)).cloned()
                } else {
                    hydrated_fallback_contents
                        .get(&(
                            pending.source_id.clone(),
                            pending.source_path.clone(),
                            line_idx,
                        ))
                        .cloned()
                }
            });
        let preview_satisfies_effective_content = !pending.stored_preview.is_empty()
            && (field_mask.preview_content_limit().is_some()
                || (field_mask.needs_content()
                    && field_mask.preview_content_limit().is_none()
                    && stored_preview_is_complete_content(&pending.stored_preview)));
        // Content preference: stored content, then an acceptable preview, then hydrated
        // content, and finally whatever preview exists (possibly empty).
        let effective_content = if !pending.stored_content.is_empty() {
            pending.stored_content.clone()
        } else if preview_satisfies_effective_content {
            pending.stored_preview.clone()
        } else if let Some(content) = hydrated_content {
            content
        } else {
            pending.stored_preview.clone()
        };
        let snippet = if wants_snippet {
            if let Some(snippet) = pending.stored_preview_snippet.clone() {
                snippet
            } else if let Some(r#gen) = &snippet_generator {
                // Render from the indexed document when it stores content; otherwise render
                // from a throwaway document that holds the effective content.
                let rendered = if !pending.stored_content.is_empty() {
                    fs_render_snippet_html(r#gen, &pending.doc, "<b>", "</b>")
                } else if !effective_content.is_empty() {
                    let mut snippet_doc = TantivyDocument::new();
                    snippet_doc.add_text(fields.content, &effective_content);
                    fs_render_snippet_html(r#gen, &snippet_doc, "<b>", "</b>")
                } else {
                    None
                };
                // Convert the HTML highlight tags to `**` markers, falling back to the
                // prefix-based snippet helpers when nothing was rendered.
                rendered
                    .map(|html| html.replace("<b>", "**").replace("</b>", "**"))
                    .or_else(|| cached_prefix_snippet(&effective_content, sanitized_query, 160))
                    .unwrap_or_else(|| {
                        quick_prefix_snippet(&effective_content, sanitized_query, 160)
                    })
            } else if let Some(sn) =
                cached_prefix_snippet(&effective_content, sanitized_query, 160)
            {
                sn
            } else {
                quick_prefix_snippet(&effective_content, sanitized_query, 160)
            }
        } else {
            String::new()
        };
        // Content is only returned when the mask asks for it, but the hash below is always
        // computed from the effective content.
        let content = if field_mask.needs_content() {
            effective_content.clone()
        } else {
            String::new()
        };
        let content_hash = stable_hit_hash(
            &effective_content,
            &pending.source_path,
            pending.line_number,
            pending.created_at,
        );
        let origin_kind = normalized_search_hit_origin_kind(
            &pending.source_id,
            pending.raw_origin_kind.as_deref(),
        )
        .to_string();
        hits.push(SearchHit {
            title: pending.title,
            snippet,
            content,
            content_hash,
            conversation_id: pending.conversation_id,
            score: pending.score,
            source_path: pending.source_path,
            agent: pending.agent,
            workspace: pending.workspace,
            workspace_original: pending.workspace_original,
            created_at: pending.created_at,
            line_number: pending.line_number,
            match_type: query_match_type,
            source_id: pending.source_id,
            origin_kind,
            origin_host: pending.origin_host,
        });
    }
    Ok((hits, tantivy_total_count))
}
6303
6304 #[allow(clippy::too_many_arguments)]
6305 fn search_tantivy_federated(
6306 &self,
6307 readers: &[FederatedIndexReader],
6308 raw_query: &str,
6309 sanitized_query: &str,
6310 filters: SearchFilters,
6311 limit: usize,
6312 field_mask: FieldMask,
6313 ) -> Result<(Vec<SearchHit>, usize)> {
6314 let mut ranked_hits = Vec::new();
6315 let mut total_count = 0usize;
6316
6317 for (shard_index, shard) in readers.iter().enumerate() {
6318 let (shard_hits, shard_total_count) = self.search_tantivy(
6319 &shard.reader,
6320 &shard.fields,
6321 raw_query,
6322 sanitized_query,
6323 filters.clone(),
6324 limit,
6325 0,
6326 field_mask,
6327 )?;
6328 total_count = total_count.saturating_add(shard_total_count);
6329 for (shard_rank, hit) in shard_hits.into_iter().enumerate() {
6330 ranked_hits.push(FederatedRankedHit {
6331 hit,
6332 shard_index,
6333 shard_rank,
6334 fused_score: federated_rrf_score(shard_rank),
6335 });
6336 }
6337 }
6338
6339 let raw_hit_count = ranked_hits.len();
6340 let generation_signature = self.federated_generation_signature(readers);
6341 self.track_generation(generation_signature);
6342 let combined_hits = merge_federated_ranked_hits(ranked_hits);
6343 tracing::debug!(
6344 generation_signature,
6345 shard_count = readers.len(),
6346 total_count,
6347 raw_hit_count,
6348 returned_hit_count = combined_hits.len(),
6349 merge_policy = "rrf_rank_then_stable_hit_key",
6350 "federated lexical search merged shard results"
6351 );
6352
6353 Ok((combined_hits, total_count))
6354 }
6355
6356 fn sqlite_fts_uses_message_id_column(conn: &Connection) -> Result<bool> {
6357 let params: [ParamValue; 0] = [];
6358 let ddl_rows: Vec<String> = franken_query_map_collect_retry(
6359 conn,
6360 "SELECT COALESCE(sql, '')
6361 FROM sqlite_master
6362 WHERE name = 'fts_messages'
6363 ORDER BY rowid DESC
6364 LIMIT 1",
6365 ¶ms,
6366 |row: &frankensqlite::Row| row.get_typed::<String>(0),
6367 )?;
6368 Ok(ddl_rows
6369 .first()
6370 .map(|sql| sql.to_ascii_lowercase().contains("message_id"))
6371 .unwrap_or(false))
6372 }
6373
6374 fn sqlite_fts_match_mode(conn: &Connection) -> Result<SqliteFtsMatchMode> {
6375 let params = [ParamValue::from("__cass_fts_probe_no_match__")];
6376 match franken_query_map_collect_retry(
6377 conn,
6378 "SELECT COUNT(*) FROM fts_messages WHERE fts_messages MATCH ?",
6379 ¶ms,
6380 |row: &frankensqlite::Row| row.get_typed::<i64>(0),
6381 ) {
6382 Ok(_) => Ok(SqliteFtsMatchMode::Table),
6383 Err(err)
6384 if err
6385 .to_string()
6386 .contains("no such column: fts_messages in table fts_messages") =>
6387 {
6388 Ok(SqliteFtsMatchMode::IndexedColumns)
6389 }
6390 Err(err) => Err(anyhow!(err)),
6391 }
6392 }
6393
6394 fn sqlite_fts5_rowid_projection_available(conn: &Connection) -> bool {
6395 let params: [ParamValue; 0] = [];
6396 franken_query_map_collect_retry(
6397 conn,
6398 "SELECT rowid FROM fts_messages LIMIT 1",
6399 ¶ms,
6400 |row: &frankensqlite::Row| row.get_typed::<i64>(0),
6401 )
6402 .is_ok()
6403 }
6404
6405 fn sqlite_fts5_match_clause(match_mode: SqliteFtsMatchMode) -> &'static str {
6406 match match_mode {
6407 SqliteFtsMatchMode::Table => "fts_messages MATCH ?",
6408 SqliteFtsMatchMode::IndexedColumns => {
6409 "(content MATCH ?
6410 OR title MATCH ?
6411 OR agent MATCH ?
6412 OR workspace MATCH ?
6413 OR source_path MATCH ?)"
6414 }
6415 }
6416 }
6417
6418 fn push_sqlite_fts5_match_params(
6419 params: &mut Vec<ParamValue>,
6420 fts_query: &str,
6421 match_mode: SqliteFtsMatchMode,
6422 ) {
6423 let copies = match match_mode {
6424 SqliteFtsMatchMode::Table => 1,
6425 SqliteFtsMatchMode::IndexedColumns => 5,
6426 };
6427 for _ in 0..copies {
6428 params.push(ParamValue::from(fts_query));
6429 }
6430 }
6431
6432 fn sqlite_fts5_rank_query(
6433 fts_query: &str,
6434 _filters: &SearchFilters,
6435 limit: usize,
6436 offset: usize,
6437 _uses_message_id: bool,
6438 match_mode: SqliteFtsMatchMode,
6439 ) -> (String, Vec<ParamValue>) {
6440 let match_clause = Self::sqlite_fts5_match_clause(match_mode);
6441 let mut sql = format!(
6442 "SELECT rowid,
6443 bm25(fts_messages)
6444 FROM fts_messages
6445 WHERE {match_clause}"
6446 );
6447 let mut params = Vec::with_capacity(9);
6448 Self::push_sqlite_fts5_match_params(&mut params, fts_query, match_mode);
6449
6450 sql.push_str(" ORDER BY bm25(fts_messages), rowid LIMIT ? OFFSET ?");
6451 params.push(ParamValue::from(limit as i64));
6452 params.push(ParamValue::from(offset as i64));
6453
6454 (sql, params)
6455 }
6456
6457 fn sqlite_fts5_hydrate_query(
6458 row_count: usize,
6459 field_mask: FieldMask,
6460 uses_message_id: bool,
6461 ) -> String {
6462 let title_expr = if field_mask.wants_title() {
6463 "fts_messages.title"
6464 } else {
6465 "NULL"
6466 };
6467 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6468 "fts_messages.content"
6469 } else {
6470 "NULL"
6471 };
6472 let message_key_expr = if uses_message_id {
6473 "CAST(fts_messages.message_id AS INTEGER)"
6474 } else {
6475 "rowid"
6476 };
6477 let placeholders = sql_placeholders(row_count);
6478
6479 format!(
6480 "SELECT rowid,
6481 {message_key_expr},
6482 {title_expr},
6483 {content_expr},
6484 fts_messages.agent,
6485 fts_messages.workspace,
6486 fts_messages.source_path,
6487 CAST(fts_messages.created_at AS INTEGER)
6488 FROM fts_messages
6489 WHERE rowid IN ({placeholders})"
6490 )
6491 }
6492
6493 fn sqlite_fts5_message_hydrate_query(row_count: usize, field_mask: FieldMask) -> String {
6494 let title_expr = if field_mask.wants_title() {
6495 "COALESCE(c.title, '')"
6496 } else {
6497 "''"
6498 };
6499 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6500 "COALESCE(m.content, '')"
6501 } else {
6502 "''"
6503 };
6504 let normalized_source_sql =
6505 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6506 let placeholders = sql_placeholders(row_count);
6507
6508 format!(
6509 "SELECT m.id,
6510 {title_expr},
6511 {content_expr},
6512 COALESCE(a.slug, ''),
6513 COALESCE(w.path, ''),
6514 COALESCE(c.source_path, ''),
6515 CAST(m.created_at AS INTEGER),
6516 m.idx,
6517 c.id,
6518 {normalized_source_sql},
6519 c.origin_host,
6520 s.kind
6521 FROM messages m
6522 LEFT JOIN conversations c ON m.conversation_id = c.id
6523 LEFT JOIN sources s ON c.source_id = s.id
6524 LEFT JOIN agents a ON c.agent_id = a.id
6525 LEFT JOIN workspaces w ON c.workspace_id = w.id
6526 WHERE m.id IN ({placeholders})"
6527 )
6528 }
6529
6530 fn sqlite_fts5_hydrate_row_chunks(
6531 ranked_rows: &[(i64, f64)],
6532 ) -> impl Iterator<Item = &[(i64, f64)]> {
6533 const _: () = assert!(SQLITE_FTS5_HYDRATE_PARAM_CHUNK <= SQLITE_MAX_VARIABLE_NUMBER);
6534 ranked_rows.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK)
6535 }
6536
6537 fn sqlite_fts5_filters_need_post_hydration(filters: &SearchFilters) -> bool {
6538 !filters.agents.is_empty()
6539 || !filters.workspaces.is_empty()
6540 || filters.created_from.is_some()
6541 || filters.created_to.is_some()
6542 || !filters.source_filter.is_all()
6543 || !filters.session_paths.is_empty()
6544 }
6545
6546 fn sqlite_fts5_hit_matches_filters(hit: &SearchHit, filters: &SearchFilters) -> bool {
6547 if !filters.agents.is_empty() && !filters.agents.contains(&hit.agent) {
6548 return false;
6549 }
6550 if !filters.workspaces.is_empty() && !filters.workspaces.contains(&hit.workspace) {
6551 return false;
6552 }
6553 if filters.created_from.is_some() || filters.created_to.is_some() {
6554 let Some(created_at) = hit.created_at else {
6555 return false;
6556 };
6557 if let Some(created_from) = filters.created_from
6558 && created_at < created_from
6559 {
6560 return false;
6561 }
6562 if let Some(created_to) = filters.created_to
6563 && created_at > created_to
6564 {
6565 return false;
6566 }
6567 }
6568 if !filters.session_paths.is_empty() && !filters.session_paths.contains(&hit.source_path) {
6569 return false;
6570 }
6571
6572 match &filters.source_filter {
6573 SourceFilter::All => true,
6574 SourceFilter::Local => matches!(
6575 hit.source_id
6576 .as_str()
6577 .cmp(crate::sources::provenance::LOCAL_SOURCE_ID),
6578 CmpOrdering::Equal
6579 ),
6580 SourceFilter::Remote => !matches!(
6581 hit.source_id
6582 .as_str()
6583 .cmp(crate::sources::provenance::LOCAL_SOURCE_ID),
6584 CmpOrdering::Equal
6585 ),
6586 SourceFilter::SourceId(id) => {
6587 let normalized = normalize_search_source_filter_value(id);
6588 matches!(
6589 hit.source_id.as_str().cmp(normalized.as_str()),
6590 CmpOrdering::Equal
6591 )
6592 }
6593 }
6594 }
6595
6596 fn sqlite_message_scan_query(raw_query: &str) -> Option<SqliteMessageScanQuery> {
6597 fn scan_parts(parts: Vec<String>) -> Vec<String> {
6598 parts
6599 .into_iter()
6600 .map(|part| part.trim_end_matches('*').to_lowercase())
6601 .filter(|part| !part.is_empty())
6602 .collect()
6603 }
6604
6605 let tokens = fs_cass_parse_boolean_query(raw_query);
6606 if tokens.is_empty() {
6607 return None;
6608 }
6609
6610 let mut include_groups = Vec::new();
6611 let mut pending_or_group: SqliteMessageScanGroup = Vec::new();
6612 let mut exclude_terms = Vec::new();
6613 let mut negated = false;
6614 let mut in_or_sequence = false;
6615 for token in tokens {
6616 match token {
6617 FsCassQueryToken::And => {
6618 if !pending_or_group.is_empty() {
6619 include_groups.push(std::mem::take(&mut pending_or_group));
6620 }
6621 in_or_sequence = false;
6622 negated = false;
6623 }
6624 FsCassQueryToken::Or => {
6625 if include_groups.is_empty() && pending_or_group.is_empty() {
6626 continue;
6627 }
6628 if negated {
6629 return None;
6630 }
6631 in_or_sequence = true;
6632 }
6633 FsCassQueryToken::Not => {
6634 if in_or_sequence {
6635 return None;
6636 }
6637 if !pending_or_group.is_empty() {
6638 include_groups.push(std::mem::take(&mut pending_or_group));
6639 }
6640 negated = true;
6641 in_or_sequence = false;
6642 }
6643 FsCassQueryToken::Term(term) => {
6644 let parts = scan_parts(normalize_term_parts(&term));
6645 if parts.is_empty() {
6646 continue;
6647 }
6648 if negated {
6649 exclude_terms.extend(parts);
6650 } else if in_or_sequence {
6651 if pending_or_group.is_empty() {
6652 let previous = include_groups.pop()?;
6653 pending_or_group.extend(previous);
6654 }
6655 pending_or_group.push(parts);
6656 } else {
6657 include_groups.push(vec![parts]);
6658 }
6659 negated = false;
6660 }
6661 FsCassQueryToken::Phrase(phrase) => {
6662 let parts = normalize_phrase_terms(&phrase);
6663 if parts.is_empty() {
6664 continue;
6665 }
6666 if negated {
6667 exclude_terms.extend(parts);
6668 } else if in_or_sequence {
6669 if pending_or_group.is_empty() {
6670 let previous = include_groups.pop()?;
6671 pending_or_group.extend(previous);
6672 }
6673 pending_or_group.push(parts);
6674 } else {
6675 include_groups.push(vec![parts]);
6676 }
6677 negated = false;
6678 }
6679 }
6680 }
6681
6682 if !pending_or_group.is_empty() {
6683 include_groups.push(pending_or_group);
6684 }
6685
6686 for group in &mut include_groups {
6687 for alternative in group.iter_mut() {
6688 alternative.sort();
6689 alternative.dedup();
6690 }
6691 group.retain(|alternative| !alternative.is_empty());
6692 group.sort();
6693 group.dedup();
6694 }
6695 include_groups.retain(|group| !group.is_empty());
6696 exclude_terms.sort();
6697 exclude_terms.dedup();
6698 if include_groups.is_empty() {
6699 return None;
6700 }
6701
6702 Some(SqliteMessageScanQuery {
6703 include_groups,
6704 exclude_terms,
6705 })
6706 }
6707
6708 fn sqlite_message_scan_score(haystack: &str, scan_query: &SqliteMessageScanQuery) -> f32 {
6709 for term in &scan_query.exclude_terms {
6710 if haystack.contains(term) {
6711 return 0.0;
6712 }
6713 }
6714
6715 let mut score = 0.0f32;
6716 for group in &scan_query.include_groups {
6717 let mut group_score = 0.0f32;
6718 for alternative in group {
6719 let mut alternative_score = 0.0f32;
6720 for term in alternative {
6721 let matches = haystack.matches(term).count();
6722 if matches < 1 {
6723 alternative_score = 0.0;
6724 break;
6725 }
6726 alternative_score += matches as f32;
6727 }
6728 group_score = group_score.max(alternative_score);
6729 }
6730 if group_score <= 0.0 {
6731 return 0.0;
6732 }
6733 score += group_score;
6734 }
6735 score
6736 }
6737
6738 fn sqlite_message_scan_query_sql(field_mask: FieldMask) -> String {
6739 let title_expr = if field_mask.wants_title() {
6740 "COALESCE(c.title, '')"
6741 } else {
6742 "''"
6743 };
6744 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6745 "COALESCE(m.content, '')"
6746 } else {
6747 "''"
6748 };
6749 let normalized_source_sql =
6750 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6751
6752 format!(
6753 "SELECT m.id,
6754 {title_expr},
6755 {content_expr},
6756 COALESCE(a.slug, ''),
6757 COALESCE(w.path, ''),
6758 COALESCE(c.source_path, ''),
6759 CAST(m.created_at AS INTEGER),
6760 m.idx,
6761 c.id,
6762 {normalized_source_sql},
6763 c.origin_host,
6764 s.kind,
6765 COALESCE(m.content, ''),
6766 COALESCE(c.title, '')
6767 FROM messages m
6768 LEFT JOIN conversations c ON m.conversation_id = c.id
6769 LEFT JOIN sources s ON c.source_id = s.id
6770 LEFT JOIN agents a ON c.agent_id = a.id
6771 LEFT JOIN workspaces w ON c.workspace_id = w.id
6772 ORDER BY m.id
6773 LIMIT ?"
6774 )
6775 }
6776
6777 fn search_sqlite_message_scan(
6778 &self,
6779 conn: &Connection,
6780 request: SqliteMessageScanRequest<'_>,
6781 ) -> Result<Vec<SearchHit>> {
6782 let Some(scan_query) = Self::sqlite_message_scan_query(request.raw_query) else {
6783 return Ok(Vec::new());
6784 };
6785
6786 let sql = Self::sqlite_message_scan_query_sql(request.field_mask);
6787 let params = [ParamValue::from(SQLITE_MESSAGE_SCAN_FALLBACK_LIMIT as i64)];
6788 let rows: Vec<(SqliteFtsMessageRow, String, String)> =
6789 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
6790 Ok((
6791 (
6792 row.get_typed(0)?,
6793 row.get_typed(1)?,
6794 row.get_typed(2)?,
6795 row.get_typed(3)?,
6796 row.get_typed(4)?,
6797 row.get_typed(5)?,
6798 row.get_typed(6)?,
6799 row.get_typed(7)?,
6800 row.get_typed(8)?,
6801 row.get_typed::<Option<String>>(9)?,
6802 row.get_typed(10)?,
6803 row.get_typed(11)?,
6804 ),
6805 row.get_typed(12)?,
6806 row.get_typed(13)?,
6807 ))
6808 })?;
6809
6810 let mut scored_hits = Vec::new();
6811 for (
6812 (
6813 _message_id,
6814 title,
6815 raw_content,
6816 agent,
6817 workspace,
6818 source_path,
6819 created_at,
6820 idx,
6821 conversation_id,
6822 raw_source_id,
6823 origin_host,
6824 raw_origin_kind,
6825 ),
6826 scan_content,
6827 scan_title,
6828 ) in rows
6829 {
6830 let mut haystack = String::with_capacity(
6831 scan_content.len()
6832 + scan_title.len()
6833 + agent.len()
6834 + workspace.len()
6835 + source_path.len()
6836 + 4,
6837 );
6838 haystack.push_str(&scan_content);
6839 haystack.push(' ');
6840 haystack.push_str(&scan_title);
6841 haystack.push(' ');
6842 haystack.push_str(&agent);
6843 haystack.push(' ');
6844 haystack.push_str(&workspace);
6845 haystack.push(' ');
6846 haystack.push_str(&source_path);
6847 let haystack = haystack.to_lowercase();
6848 let score = Self::sqlite_message_scan_score(&haystack, &scan_query);
6849 if score <= 0.0 {
6850 continue;
6851 }
6852
6853 let raw_source_id = raw_source_id.unwrap_or_else(default_source_id);
6854 let source_id = normalized_search_hit_source_id_parts(
6855 raw_source_id.as_str(),
6856 raw_origin_kind.as_deref().unwrap_or_default(),
6857 origin_host.as_deref(),
6858 );
6859 let origin_kind =
6860 normalized_search_hit_origin_kind(source_id.as_str(), raw_origin_kind.as_deref());
6861 let line_number = idx
6862 .and_then(|i| usize::try_from(i).ok())
6863 .map(|i| i.saturating_add(1));
6864 let snippet = if request.field_mask.wants_snippet() {
6865 snippet_from_content(&scan_content)
6866 } else {
6867 String::new()
6868 };
6869 let content = if request.field_mask.needs_content() {
6870 raw_content
6871 } else {
6872 String::new()
6873 };
6874 let content_hash = if content.is_empty() {
6875 stable_hit_hash(&snippet, &source_path, line_number, created_at)
6876 } else {
6877 stable_hit_hash(&content, &source_path, line_number, created_at)
6878 };
6879
6880 let hit = SearchHit {
6881 title,
6882 snippet,
6883 content,
6884 content_hash,
6885 conversation_id,
6886 score,
6887 source_path,
6888 agent,
6889 workspace,
6890 workspace_original: None,
6891 created_at,
6892 line_number,
6893 match_type: request.query_match_type,
6894 source_id,
6895 origin_kind,
6896 origin_host,
6897 };
6898
6899 if Self::sqlite_fts5_hit_matches_filters(&hit, request.filters) {
6900 scored_hits.push(hit);
6901 }
6902 }
6903
6904 scored_hits.sort_by(|left, right| {
6905 right
6906 .score
6907 .partial_cmp(&left.score)
6908 .unwrap_or(CmpOrdering::Equal)
6909 });
6910
6911 Ok(scored_hits
6912 .into_iter()
6913 .skip(request.offset)
6914 .take(request.limit)
6915 .collect())
6916 }
6917
6918 fn search_sqlite_fts5(
6919 &self,
6920 _db_path: &Path,
6921 raw_query: &str,
6922 filters: SearchFilters,
6923 limit: usize,
6924 offset: usize,
6925 field_mask: FieldMask,
6926 ) -> Result<Vec<SearchHit>> {
6927 if limit < 1 {
6928 return Ok(Vec::new());
6929 }
6930
6931 let fts_query = match transpile_to_fts5(raw_query) {
6932 Some(q) if !q.trim().is_empty() => q,
6933 _ => return Ok(Vec::new()),
6934 };
6935
6936 let sqlite_guard = self.sqlite_guard()?;
6937 let Some(conn) = sqlite_guard.as_ref() else {
6938 return Ok(Vec::new());
6939 };
6940
6941 let empty_params: [ParamValue; 0] = [];
6942 let has_fts = franken_query_map_collect_retry(
6943 conn,
6944 "SELECT 1 FROM sqlite_master WHERE name = 'fts_messages'",
6945 &empty_params,
6946 |row| row.get_typed::<i64>(0),
6947 )
6948 .map(|rows| !rows.is_empty())
6949 .unwrap_or(false);
6950 if !has_fts {
6951 return Ok(Vec::new());
6952 }
6953
6954 let query_match_type = dominant_match_type(raw_query);
6955 let scan_request = SqliteMessageScanRequest {
6956 raw_query,
6957 filters: &filters,
6958 limit,
6959 offset,
6960 field_mask,
6961 query_match_type,
6962 };
6963 let uses_message_id =
6964 if let Ok(uses_message_id) = Self::sqlite_fts_uses_message_id_column(conn) {
6965 uses_message_id
6966 } else {
6967 tracing::warn!(
6968 "sqlite FTS fallback is present but not queryable; skipping fallback search"
6969 );
6970 return self.search_sqlite_message_scan(conn, scan_request);
6971 };
6972 let match_mode = match Self::sqlite_fts_match_mode(conn) {
6973 Ok(match_mode) => match_mode,
6974 Err(err) => {
6975 tracing::warn!(
6976 error = %err,
6977 "sqlite FTS fallback is present but not queryable; skipping fallback search"
6978 );
6979 return self.search_sqlite_message_scan(conn, scan_request);
6980 }
6981 };
6982 if !Self::sqlite_fts5_rowid_projection_available(conn) {
6983 tracing::warn!(
6984 "sqlite FTS fallback cannot project rowid through frankensqlite; using source-table scan fallback"
6985 );
6986 return self.search_sqlite_message_scan(conn, scan_request);
6987 }
6988
6989 let post_filter = Self::sqlite_fts5_filters_need_post_hydration(&filters);
6990 let target_hits = if post_filter {
6991 offset.saturating_add(limit)
6992 } else {
6993 limit
6994 };
6995 let rank_batch_limit = if post_filter {
6996 target_hits.clamp(1, SQLITE_FTS5_POST_FILTER_SCAN_CHUNK)
6997 } else {
6998 limit
6999 };
7000 let mut rank_offset = if post_filter { 0 } else { offset };
7001 let mut scanned_rows = 0usize;
7002 let mut hits = Vec::with_capacity(target_hits.min(rank_batch_limit));
7003
7004 loop {
7005 let (rank_sql, rank_params) = Self::sqlite_fts5_rank_query(
7006 fts_query.as_str(),
7007 &filters,
7008 rank_batch_limit,
7009 rank_offset,
7010 uses_message_id,
7011 match_mode,
7012 );
7013 let ranked_rows: Vec<(i64, f64)> =
7014 match franken_query_map_collect_retry(conn, &rank_sql, &rank_params, |row| {
7015 Ok((row.get_typed(0)?, row.get_typed(1)?))
7016 }) {
7017 Ok(rows) => rows,
7018 Err(err) => {
7019 tracing::warn!(
7020 error = %err,
7021 "sqlite FTS fallback rank query failed; returning no fallback hits"
7022 );
7023 return self.search_sqlite_message_scan(conn, scan_request);
7024 }
7025 };
7026 if ranked_rows.is_empty() {
7027 break;
7028 }
7029
7030 scanned_rows = scanned_rows.saturating_add(ranked_rows.len());
7031 let bm25_by_rowid: HashMap<i64, f64> = ranked_rows.iter().copied().collect();
7032 let mut fts_rows_by_rowid = HashMap::with_capacity(ranked_rows.len());
7033 let mut message_ids = Vec::with_capacity(ranked_rows.len());
7034 let mut seen_message_ids = HashSet::with_capacity(ranked_rows.len());
7035
7036 for rank_chunk in Self::sqlite_fts5_hydrate_row_chunks(&ranked_rows) {
7037 let hydrate_sql =
7038 Self::sqlite_fts5_hydrate_query(rank_chunk.len(), field_mask, uses_message_id);
7039 let hydrate_params = rank_chunk
7040 .iter()
7041 .map(|(fts_rowid, _)| ParamValue::from(*fts_rowid))
7042 .collect::<Vec<_>>();
7043 let rows: Vec<SqliteFtsHydratedRow> = match franken_query_map_collect_retry(
7044 conn,
7045 &hydrate_sql,
7046 &hydrate_params,
7047 |row| {
7048 Ok((
7049 row.get_typed(0)?,
7050 row.get_typed(1)?,
7051 row.get_typed(2)?,
7052 row.get_typed(3)?,
7053 row.get_typed(4)?,
7054 row.get_typed(5)?,
7055 row.get_typed(6)?,
7056 row.get_typed(7)?,
7057 ))
7058 },
7059 ) {
7060 Ok(rows) => rows,
7061 Err(err) => {
7062 tracing::warn!(
7063 error = %err,
7064 "sqlite FTS fallback rowid hydration query failed; returning no fallback hits"
7065 );
7066 return self.search_sqlite_message_scan(conn, scan_request);
7067 }
7068 };
7069
7070 for row in rows {
7071 let fts_rowid = row.0;
7072 let message_id = row.1.unwrap_or(fts_rowid);
7073 if seen_message_ids.insert(message_id) {
7074 message_ids.push(message_id);
7075 }
7076 fts_rows_by_rowid.insert(fts_rowid, row);
7077 }
7078 }
7079
7080 let mut metadata_by_message_id = HashMap::with_capacity(message_ids.len());
7081 for message_chunk in message_ids.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK) {
7082 let metadata_sql =
7083 Self::sqlite_fts5_message_hydrate_query(message_chunk.len(), field_mask);
7084 let metadata_params = message_chunk
7085 .iter()
7086 .map(|message_id| ParamValue::from(*message_id))
7087 .collect::<Vec<_>>();
7088 let metadata_rows: Vec<SqliteFtsMessageRow> = match franken_query_map_collect_retry(
7089 conn,
7090 &metadata_sql,
7091 &metadata_params,
7092 |row| {
7093 Ok((
7094 row.get_typed(0)?,
7095 row.get_typed(1)?,
7096 row.get_typed(2)?,
7097 row.get_typed(3)?,
7098 row.get_typed(4)?,
7099 row.get_typed(5)?,
7100 row.get_typed(6)?,
7101 row.get_typed(7)?,
7102 row.get_typed(8)?,
7103 row.get_typed::<Option<String>>(9)?,
7104 row.get_typed(10)?,
7105 row.get_typed(11)?,
7106 ))
7107 },
7108 ) {
7109 Ok(rows) => rows,
7110 Err(err) => {
7111 tracing::warn!(
7112 error = %err,
7113 "sqlite FTS fallback message hydration query failed; returning no fallback hits"
7114 );
7115 return self.search_sqlite_message_scan(conn, scan_request);
7116 }
7117 };
7118 metadata_by_message_id.extend(metadata_rows.into_iter().map(|row| (row.0, row)));
7119 }
7120
7121 let mut hits_by_rowid = HashMap::with_capacity(ranked_rows.len());
7122 for (
7123 fts_rowid,
7124 fts_message_id,
7125 fts_title,
7126 fts_content,
7127 fts_agent,
7128 fts_workspace,
7129 fts_source_path,
7130 fts_created_at,
7131 ) in fts_rows_by_rowid.into_values()
7132 {
7133 let Some(&bm25_score) = bm25_by_rowid.get(&fts_rowid) else {
7134 continue;
7135 };
7136 let message_id = fts_message_id.unwrap_or(fts_rowid);
7137 let (
7138 title,
7139 raw_content,
7140 agent,
7141 workspace,
7142 source_path,
7143 created_at,
7144 idx,
7145 conversation_id,
7146 raw_source_id,
7147 origin_host,
7148 raw_origin_kind,
7149 ) = match metadata_by_message_id.remove(&message_id) {
7150 Some((
7151 _,
7152 metadata_title,
7153 metadata_content,
7154 metadata_agent,
7155 metadata_workspace,
7156 metadata_source_path,
7157 metadata_created_at,
7158 metadata_idx,
7159 metadata_conversation_id,
7160 metadata_raw_source_id,
7161 metadata_origin_host,
7162 metadata_raw_origin_kind,
7163 )) => (
7164 if metadata_title.is_empty() {
7165 fts_title.unwrap_or_default()
7166 } else {
7167 metadata_title
7168 },
7169 if metadata_content.is_empty() {
7170 fts_content.unwrap_or_default()
7171 } else {
7172 metadata_content
7173 },
7174 if metadata_agent.is_empty() {
7175 fts_agent.unwrap_or_default()
7176 } else {
7177 metadata_agent
7178 },
7179 if metadata_workspace.is_empty() {
7180 fts_workspace.unwrap_or_default()
7181 } else {
7182 metadata_workspace
7183 },
7184 if metadata_source_path.is_empty() {
7185 fts_source_path.unwrap_or_default()
7186 } else {
7187 metadata_source_path
7188 },
7189 metadata_created_at.or(fts_created_at),
7190 metadata_idx,
7191 metadata_conversation_id,
7192 metadata_raw_source_id.unwrap_or_else(default_source_id),
7193 metadata_origin_host,
7194 metadata_raw_origin_kind,
7195 ),
7196 None => (
7197 fts_title.unwrap_or_default(),
7198 fts_content.unwrap_or_default(),
7199 fts_agent.unwrap_or_default(),
7200 fts_workspace.unwrap_or_default(),
7201 fts_source_path.unwrap_or_default(),
7202 fts_created_at,
7203 None,
7204 None,
7205 default_source_id(),
7206 None,
7207 None,
7208 ),
7209 };
7210
7211 let source_id = normalized_search_hit_source_id_parts(
7212 raw_source_id.as_str(),
7213 raw_origin_kind.as_deref().unwrap_or_default(),
7214 origin_host.as_deref(),
7215 );
7216 let origin_kind = normalized_search_hit_origin_kind(
7217 source_id.as_str(),
7218 raw_origin_kind.as_deref(),
7219 );
7220 let line_number = idx
7221 .and_then(|i| usize::try_from(i).ok())
7222 .map(|i| i.saturating_add(1));
7223 let snippet = if field_mask.wants_snippet() {
7224 snippet_from_content(&raw_content)
7225 } else {
7226 String::new()
7227 };
7228 let content = if field_mask.needs_content() {
7229 raw_content
7230 } else {
7231 String::new()
7232 };
7233 let content_hash = if content.is_empty() {
7234 stable_hit_hash(&snippet, &source_path, line_number, created_at)
7235 } else {
7236 stable_hit_hash(&content, &source_path, line_number, created_at)
7237 };
7238
7239 let hit = SearchHit {
7240 title,
7241 snippet,
7242 content,
7243 content_hash,
7244 conversation_id,
7245 score: (-bm25_score) as f32,
7246 source_path,
7247 agent,
7248 workspace,
7249 workspace_original: None,
7250 created_at,
7251 line_number,
7252 match_type: query_match_type,
7253 source_id,
7254 origin_kind,
7255 origin_host,
7256 };
7257 hits_by_rowid.insert(fts_rowid, hit);
7258 }
7259
7260 for (fts_rowid, _) in &ranked_rows {
7261 if let Some(hit) = hits_by_rowid.remove(fts_rowid)
7262 && Self::sqlite_fts5_hit_matches_filters(&hit, &filters)
7263 {
7264 hits.push(hit);
7265 if hits.len() >= target_hits {
7266 break;
7267 }
7268 }
7269 }
7270
7271 if hits.len() >= target_hits
7272 || !post_filter
7273 || ranked_rows.len() < rank_batch_limit
7274 || scanned_rows >= SQLITE_FTS5_POST_FILTER_SCAN_LIMIT
7275 {
7276 break;
7277 }
7278 rank_offset = rank_offset.saturating_add(ranked_rows.len());
7279 }
7280
7281 if post_filter {
7282 let hits = hits
7283 .into_iter()
7284 .skip(offset)
7285 .take(limit)
7286 .collect::<Vec<_>>();
7287 if hits.is_empty() {
7288 self.search_sqlite_message_scan(conn, scan_request)
7289 } else {
7290 Ok(hits)
7291 }
7292 } else if hits.is_empty() {
7293 self.search_sqlite_message_scan(conn, scan_request)
7294 } else {
7295 Ok(hits)
7296 }
7297 }
7298
7299 pub fn browse_by_date(
7306 &self,
7307 filters: SearchFilters,
7308 limit: usize,
7309 offset: usize,
7310 newest_first: bool,
7311 field_mask: FieldMask,
7312 ) -> Result<Vec<SearchHit>> {
7313 let sqlite_guard = self.sqlite_guard()?;
7314 if let Some(conn) = sqlite_guard.as_ref() {
7315 self.browse_by_date_sqlite(conn, filters, limit, offset, newest_first, field_mask)
7316 } else {
7317 Ok(Vec::new())
7318 }
7319 }
7320
7321 fn browse_by_date_sqlite(
7322 &self,
7323 conn: &Connection,
7324 filters: SearchFilters,
7325 limit: usize,
7326 offset: usize,
7327 newest_first: bool,
7328 field_mask: FieldMask,
7329 ) -> Result<Vec<SearchHit>> {
7330 let order = if newest_first { "DESC" } else { "ASC" };
7331 let title_expr = if field_mask.wants_title() {
7332 "c.title"
7333 } else {
7334 "''"
7335 };
7336 let normalized_source_sql =
7344 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
7345 let mut sql = format!(
7346 "SELECT c.id, {title_expr}, m.content, \
7347 COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'), \
7348 w.path, c.source_path, m.created_at, m.idx, \
7349 {normalized_source_sql}, c.origin_host, s.kind
7350 FROM messages m
7351 JOIN conversations c ON m.conversation_id = c.id
7352 LEFT JOIN workspaces w ON c.workspace_id = w.id
7353 LEFT JOIN sources s ON c.source_id = s.id
7354 WHERE 1=1"
7355 );
7356 let mut params: Vec<ParamValue> = Vec::new();
7357
7358 if !filters.agents.is_empty() {
7359 let placeholders = sql_placeholders(filters.agents.len());
7360 sql.push_str(&format!(
7361 " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug IN ({placeholders}))"
7362 ));
7363 for a in &filters.agents {
7364 params.push(ParamValue::from(a.as_str()));
7365 }
7366 }
7367
7368 if !filters.workspaces.is_empty() {
7369 let placeholders = sql_placeholders(filters.workspaces.len());
7370 sql.push_str(&format!(" AND COALESCE(w.path, '') IN ({placeholders})"));
7371 for w in &filters.workspaces {
7372 params.push(ParamValue::from(w.as_str()));
7373 }
7374 }
7375
7376 if let Some(created_from) = filters.created_from {
7377 sql.push_str(" AND m.created_at >= ?");
7378 params.push(ParamValue::from(created_from));
7379 }
7380 if let Some(created_to) = filters.created_to {
7381 sql.push_str(" AND m.created_at <= ?");
7382 params.push(ParamValue::from(created_to));
7383 }
7384
7385 match &filters.source_filter {
7387 SourceFilter::All => {}
7388 SourceFilter::Local => sql.push_str(&format!(
7389 " AND {normalized_source_sql} = '{local}'",
7390 local = crate::sources::provenance::LOCAL_SOURCE_ID,
7391 )),
7392 SourceFilter::Remote => sql.push_str(&format!(
7393 " AND {normalized_source_sql} != '{local}'",
7394 local = crate::sources::provenance::LOCAL_SOURCE_ID,
7395 )),
7396 SourceFilter::SourceId(id) => {
7397 sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
7398 params.push(ParamValue::from(normalize_search_source_filter_value(id)));
7399 }
7400 }
7401
7402 sql.push_str(&format!(
7403 " ORDER BY CASE WHEN m.created_at IS NULL THEN 1 ELSE 0 END, m.created_at {order}, m.id {order} LIMIT ? OFFSET ?"
7404 ));
7405 params.push(ParamValue::from(limit as i64));
7406 params.push(ParamValue::from(offset as i64));
7407
7408 let rows: Vec<SearchHit> =
7409 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
7410 let conversation_id: i64 = row.get_typed(0)?;
7411 let title: String = if field_mask.wants_title() {
7412 row.get_typed::<Option<String>>(1)?.unwrap_or_default()
7413 } else {
7414 String::new()
7415 };
7416 let raw_content: String = row.get_typed(2)?;
7417 let agent: String = row.get_typed(3)?;
7418 let workspace: Option<String> = row.get_typed(4)?;
7419 let source_path: String = row.get_typed(5)?;
7420 let created_at: Option<i64> = row.get_typed(6)?;
7421 let idx: Option<i64> = row.get_typed(7)?;
7422 let raw_source_id: String = row
7423 .get_typed::<Option<String>>(8)?
7424 .unwrap_or_else(default_source_id);
7425 let origin_host: Option<String> = row.get_typed(9)?;
7426 let raw_origin_kind: Option<String> = row.get_typed(10)?;
7427 let source_id = normalized_search_hit_source_id_parts(
7428 raw_source_id.as_str(),
7429 raw_origin_kind.as_deref().unwrap_or_default(),
7430 origin_host.as_deref(),
7431 );
7432 let origin_kind = normalized_search_hit_origin_kind(
7433 source_id.as_str(),
7434 raw_origin_kind.as_deref(),
7435 );
7436 let line_number = idx
7437 .and_then(|i| usize::try_from(i).ok())
7438 .map(|i| i.saturating_add(1));
7439 let snippet = if field_mask.wants_snippet() {
7440 snippet_from_content(&raw_content)
7441 } else {
7442 String::new()
7443 };
7444 let content = if field_mask.needs_content() {
7445 raw_content.clone()
7446 } else {
7447 String::new()
7448 };
7449 let content_hash =
7450 stable_hit_hash(&raw_content, &source_path, line_number, created_at);
7451 Ok(SearchHit {
7452 title,
7453 snippet,
7454 content,
7455 content_hash,
7456 conversation_id: Some(conversation_id),
7457 score: 0.0,
7458 source_path,
7459 agent,
7460 workspace: workspace.unwrap_or_default(),
7461 workspace_original: None,
7462 created_at,
7463 line_number,
7464 match_type: MatchType::Exact,
7465 source_id,
7466 origin_kind,
7467 origin_host,
7468 })
7469 })?;
7470 Ok(rows)
7471 }
7472}
7473
7474#[doc(hidden)]
7481pub fn fuzz_transpile_to_fts5(raw_query: &str) -> Option<String> {
7482 transpile_to_fts5(raw_query)
7483}
7484
7485fn transpile_to_fts5(raw_query: &str) -> Option<String> {
7489 let tokens = fs_cass_parse_boolean_query(raw_query);
7490 if tokens.is_empty() {
7491 return Some("".to_string());
7492 }
7493
7494 let mut fts_clauses: Vec<(&str, String)> = Vec::new();
7495 let mut pending_or_group: Vec<String> = Vec::new();
7496 let mut next_op = "AND";
7497 let mut in_or_sequence = false;
7498 for token in tokens {
7499 match token {
7500 FsCassQueryToken::And => {
7501 if !pending_or_group.is_empty() {
7502 let group = if pending_or_group.len() > 1 {
7503 format!("({})", pending_or_group.join(" OR "))
7504 } else {
7505 pending_or_group.pop().unwrap_or_default()
7506 };
7507 fts_clauses.push(("AND", group));
7508 pending_or_group.clear();
7509 }
7510 in_or_sequence = false;
7511 next_op = "AND";
7512 }
7513 FsCassQueryToken::Or => {
7514 if fts_clauses.is_empty() && pending_or_group.is_empty() {
7515 continue;
7519 }
7520 in_or_sequence = true;
7523 }
7524 FsCassQueryToken::Not => {
7525 if in_or_sequence {
7529 return None;
7530 }
7531
7532 if fts_clauses.is_empty() && pending_or_group.is_empty() {
7533 return None;
7534 }
7535
7536 if !pending_or_group.is_empty() {
7537 let group = if pending_or_group.len() > 1 {
7538 format!("({})", pending_or_group.join(" OR "))
7539 } else {
7540 pending_or_group.pop().unwrap_or_default()
7541 };
7542 fts_clauses.push(("AND", group));
7543 pending_or_group.clear();
7544 }
7545 in_or_sequence = false;
7546 next_op = "NOT";
7547 }
7548 FsCassQueryToken::Term(t) => {
7549 let raw_pattern = FsCassWildcardPattern::parse(&t);
7550 if matches!(
7551 raw_pattern,
7552 FsCassWildcardPattern::Suffix(_)
7553 | FsCassWildcardPattern::Substring(_)
7554 | FsCassWildcardPattern::Complex(_)
7555 ) {
7556 return None;
7557 }
7558
7559 let term_parts = normalize_term_parts(&t);
7563 if term_parts.is_empty() {
7564 continue;
7565 }
7566
7567 let mut rendered_parts = Vec::with_capacity(term_parts.len());
7568 for part in &term_parts {
7569 rendered_parts.push(render_fts5_term_part(part)?);
7570 }
7571
7572 let fts_term = if rendered_parts.len() > 1 {
7575 format!("({})", rendered_parts.join(" AND "))
7576 } else {
7577 rendered_parts[0].clone()
7578 };
7579
7580 if in_or_sequence {
7581 if pending_or_group.is_empty() {
7582 let (op, _) = fts_clauses.last()?;
7583 if *op != "AND" {
7584 return None;
7587 }
7588 let (_, val) = fts_clauses.pop()?;
7589 pending_or_group.push(val);
7590 }
7591 pending_or_group.push(fts_term);
7592 in_or_sequence = true;
7593 } else {
7594 fts_clauses.push((next_op, fts_term));
7595 }
7596 next_op = "AND";
7597 }
7598 FsCassQueryToken::Phrase(p) => {
7599 let phrase_parts = normalize_phrase_terms(&p);
7600 if phrase_parts.is_empty() {
7601 continue;
7602 }
7603 let fts_phrase = format!("\"{}\"", phrase_parts.join(" "));
7604
7605 if in_or_sequence {
7606 if pending_or_group.is_empty() {
7607 let (op, _) = fts_clauses.last()?;
7608 if *op != "AND" {
7609 return None;
7612 }
7613 let (_, val) = fts_clauses.pop()?;
7614 pending_or_group.push(val);
7615 }
7616 pending_or_group.push(fts_phrase);
7617 in_or_sequence = true;
7618 } else {
7619 fts_clauses.push((next_op, fts_phrase));
7620 }
7621 next_op = "AND";
7622 }
7623 }
7624 }
7625
7626 if !pending_or_group.is_empty() {
7627 let group = if pending_or_group.len() > 1 {
7628 format!("({})", pending_or_group.join(" OR "))
7629 } else {
7630 pending_or_group.pop().unwrap_or_default()
7631 };
7632 fts_clauses.push((next_op, group));
7633 }
7634
7635 if fts_clauses.is_empty() {
7636 return Some("".to_string());
7637 }
7638
7639 if fts_clauses.first().is_some_and(|(op, _)| *op == "NOT") {
7642 return None;
7643 }
7644
7645 let mut query = String::new();
7647 for (i, (op, text)) in fts_clauses.into_iter().enumerate() {
7648 if i > 0 {
7649 query.push_str(&format!(" {} ", op));
7650 }
7651 query.push_str(&text);
7652 }
7653
7654 Some(query)
7655}
7656
/// Shared counters describing cache and reader-reload activity.
///
/// Every field is an `Arc<AtomicU64>`, so clones of a `Metrics` value update
/// and observe the same underlying counters. All accesses use relaxed
/// ordering.
#[derive(Default, Clone)]
struct Metrics {
    /// Incremented by `inc_cache_hits`.
    cache_hits: Arc<AtomicU64>,
    /// Incremented by `inc_cache_miss`.
    cache_miss: Arc<AtomicU64>,
    /// Incremented by `inc_cache_shortfall`.
    cache_shortfall: Arc<AtomicU64>,
    /// Number of reloads counted via `inc_reload` / `record_reload`.
    reloads: Arc<AtomicU64>,
    /// Sum of the reload durations passed to `record_reload`, in milliseconds.
    reload_ms_total: Arc<AtomicU64>,
    /// Incremented by `inc_prewarm_scheduled`.
    prewarm_scheduled: Arc<AtomicU64>,
    /// Incremented by `inc_prewarm_skipped_pressure`.
    prewarm_skipped_pressure: Arc<AtomicU64>,
}

impl Metrics {
    /// Adds one to `counter`.
    fn bump(counter: &AtomicU64) {
        counter.fetch_add(1, Ordering::Relaxed);
    }

    /// Reads the current value of `counter`.
    fn read(counter: &AtomicU64) -> u64 {
        counter.load(Ordering::Relaxed)
    }

    fn inc_cache_hits(&self) {
        Self::bump(&self.cache_hits);
    }

    fn inc_cache_miss(&self) {
        Self::bump(&self.cache_miss);
    }

    fn inc_cache_shortfall(&self) {
        Self::bump(&self.cache_shortfall);
    }

    fn inc_prewarm_scheduled(&self) {
        Self::bump(&self.prewarm_scheduled);
    }

    fn inc_prewarm_skipped_pressure(&self) {
        Self::bump(&self.prewarm_skipped_pressure);
    }

    fn inc_reload(&self) {
        Self::bump(&self.reloads);
    }

    /// Counts one reload and adds its duration (whole milliseconds) to the
    /// running total.
    fn record_reload(&self, duration: Duration) {
        self.inc_reload();
        let millis = duration.as_millis() as u64;
        self.reload_ms_total.fetch_add(millis, Ordering::Relaxed);
    }

    /// Returns `(cache_hits, cache_miss, cache_shortfall, reloads,
    /// reload_ms_total)`.
    fn snapshot_all(&self) -> (u64, u64, u64, u64, u128) {
        let hits = Self::read(&self.cache_hits);
        let miss = Self::read(&self.cache_miss);
        let shortfall = Self::read(&self.cache_shortfall);
        let reloads = Self::read(&self.reloads);
        let reload_ms_total = Self::read(&self.reload_ms_total) as u128;
        (hits, miss, shortfall, reloads, reload_ms_total)
    }

    /// Returns `(prewarm_scheduled, prewarm_skipped_pressure)`.
    fn snapshot_prewarm(&self) -> (u64, u64) {
        let scheduled = Self::read(&self.prewarm_scheduled);
        let skipped = Self::read(&self.prewarm_skipped_pressure);
        (scheduled, skipped)
    }

    /// Zeroes every counter (test builds only).
    #[cfg(test)]
    #[allow(dead_code)]
    fn reset(&self) {
        let counters = [
            &self.cache_hits,
            &self.cache_miss,
            &self.cache_shortfall,
            &self.reloads,
            &self.reload_ms_total,
            &self.prewarm_scheduled,
            &self.prewarm_skipped_pressure,
        ];
        for counter in counters {
            counter.store(0, Ordering::Relaxed);
        }
    }
}
7723
7724fn maybe_spawn_warm_worker(
7725 reader: IndexReader,
7726 fields: FsCassFields,
7727 reload_epoch: Arc<AtomicU64>,
7728 metrics: Metrics,
7729) -> Option<(mpsc::Sender<WarmJob>, std::thread::JoinHandle<()>)> {
7730 let (tx, rx) = mpsc::unbounded::<WarmJob>();
7731 let handle = std::thread::Builder::new()
7732 .name("cass-warm-worker".into())
7733 .spawn(move || {
7734 let mut last_run = Instant::now();
7736 while let Ok(job) = rx.recv() {
7737 let now = Instant::now();
7738 if now.duration_since(last_run) < Duration::from_millis(*WARM_DEBOUNCE_MS) {
7739 continue;
7740 }
7741 last_run = now;
7742 let reload_started = Instant::now();
7743 if let Err(err) = reader.reload() {
7744 tracing::warn!(error = ?err, "warm_worker_reload_failed");
7745 continue;
7746 }
7747 let elapsed = reload_started.elapsed();
7748 let epoch = reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
7749 metrics.record_reload(elapsed);
7750 tracing::debug!(
7751 duration_ms = elapsed.as_millis() as u64,
7752 reload_epoch = epoch,
7753 filters = %job.filters_fingerprint,
7754 shard = %job.shard_name,
7755 "warm_worker_reload"
7756 );
7757 let searcher = reader.searcher();
7760 let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
7761 for term_str in job.query.split_whitespace() {
7762 let term_lower = term_str.to_lowercase();
7763 let term_shoulds: Vec<(Occur, Box<dyn Query>)> = vec![
7764 (
7765 Occur::Should,
7766 Box::new(TermQuery::new(
7767 Term::from_field_text(fields.title, &term_lower),
7768 IndexRecordOption::WithFreqsAndPositions,
7769 )),
7770 ),
7771 (
7772 Occur::Should,
7773 Box::new(TermQuery::new(
7774 Term::from_field_text(fields.content, &term_lower),
7775 IndexRecordOption::WithFreqsAndPositions,
7776 )),
7777 ),
7778 ];
7779 clauses.push((Occur::Must, Box::new(BooleanQuery::new(term_shoulds))));
7780 }
7781 if !clauses.is_empty() {
7782 let q: Box<dyn Query> = Box::new(BooleanQuery::new(clauses));
7783 let _ = searcher.search(&q, &TopDocs::with_limit(1).order_by_score());
7784 }
7785 }
7786 })
7787 .ok()?;
7788 Some((tx, handle))
7789}
7790
7791fn cached_hit_from(hit: &SearchHit) -> CachedHit {
7792 let cache_text = if hit.content.is_empty() {
7793 hit.snippet.as_str()
7794 } else {
7795 hit.content.as_str()
7796 };
7797 let lc_content = cache_text.to_lowercase();
7798 let lc_title = (!hit.title.is_empty()).then(|| hit.title.to_lowercase());
7799 let bloom64 = bloom_from_text(&lc_content, &lc_title);
7801 CachedHit {
7802 hit: hit.clone(),
7803 lc_content,
7804 lc_title,
7805 bloom64,
7806 }
7807}
7808
7809fn bloom_from_text(content: &str, title: &Option<String>) -> u64 {
7810 let mut bits = 0u64;
7811 for token in token_stream(content) {
7812 bits |= hash_token(token);
7813 }
7814 if let Some(t) = title {
7815 for token in token_stream(t) {
7816 bits |= hash_token(token);
7817 }
7818 }
7819 bits
7820}
7821
/// Yields the maximal runs of alphanumeric characters in `text`, in order.
fn token_stream(text: &str) -> impl Iterator<Item = &str> {
    // Anything that is not alphanumeric separates tokens.
    fn is_separator(c: char) -> bool {
        !c.is_alphanumeric()
    }

    text.split(is_separator).filter(|piece| !piece.is_empty())
}
7826
/// Maps a token to a single set bit of a 64-bit mask.
///
/// Starts from 5381 and, for each byte, multiplies the running value by 33
/// (wrapping) and adds the byte; the result modulo 64 selects the bit.
fn hash_token(tok: &str) -> u64 {
    let digest = tok.bytes().fold(5381u64, |acc, byte| {
        acc.wrapping_mul(33).wrapping_add(u64::from(byte))
    });
    1u64 << (digest % 64)
}
7835
7836struct QueryTermsLower {
7846 query_lower: String,
7848 token_ranges: Vec<(usize, usize)>,
7850 bloom_mask: u64,
7852}
7853
7854impl QueryTermsLower {
7855 fn from_query(query: &str) -> Self {
7857 if query.is_empty() {
7858 return Self {
7859 query_lower: String::new(),
7860 token_ranges: Vec::new(),
7861 bloom_mask: 0,
7862 };
7863 }
7864
7865 let query_lower = query.to_lowercase();
7866 let mut token_ranges = Vec::new();
7867 let mut bloom_mask = 0u64;
7868
7869 let mut start = None;
7871 for (i, c) in query_lower.char_indices() {
7872 if c.is_alphanumeric() {
7873 if start.is_none() {
7874 start = Some(i);
7875 }
7876 } else if let Some(s) = start.take() {
7877 let token = &query_lower[s..i];
7878 bloom_mask |= hash_token(token);
7879 token_ranges.push((s, i));
7880 }
7881 }
7882 if let Some(s) = start {
7884 let token = &query_lower[s..];
7885 bloom_mask |= hash_token(token);
7886 token_ranges.push((s, query_lower.len()));
7887 }
7888
7889 Self {
7890 query_lower,
7891 token_ranges,
7892 bloom_mask,
7893 }
7894 }
7895
7896 #[inline]
7898 fn is_empty(&self) -> bool {
7899 self.token_ranges.is_empty()
7900 }
7901
7902 #[inline]
7904 fn tokens(&self) -> impl Iterator<Item = &str> {
7905 self.token_ranges
7906 .iter()
7907 .map(|(s, e)| &self.query_lower[*s..*e])
7908 }
7909
7910 #[inline]
7912 fn bloom_mask(&self) -> u64 {
7913 self.bloom_mask
7914 }
7915}
7916
7917fn hit_matches_query_cached_precomputed(hit: &CachedHit, terms: &QueryTermsLower) -> bool {
7920 if terms.is_empty() {
7921 return true;
7922 }
7923
7924 if hit.bloom64 & terms.bloom_mask() != terms.bloom_mask() {
7926 return false;
7927 }
7928
7929 terms.tokens().all(|t| {
7931 if token_stream(&hit.lc_content).any(|word| word.starts_with(t)) {
7933 return true;
7934 }
7935 if let Some(title) = &hit.lc_title
7937 && token_stream(title).any(|word| word.starts_with(t))
7938 {
7939 return true;
7940 }
7941 false
7942 })
7943}
7944
/// Test-only convenience: tokenizes `query` and checks it against `hit` via
/// `hit_matches_query_cached_precomputed`.
#[cfg(test)]
fn hit_matches_query_cached(hit: &CachedHit, query: &str) -> bool {
    hit_matches_query_cached_precomputed(hit, &QueryTermsLower::from_query(query))
}
7952
/// True when `query` consists of exactly one whitespace-delimited word made up
/// solely of alphanumeric characters.
fn is_prefix_only(query: &str) -> bool {
    let mut words = query.split_whitespace();
    match (words.next(), words.next()) {
        (Some(only), None) => only.chars().all(char::is_alphanumeric),
        _ => false,
    }
}
7962
/// Returns the first `max_chars` characters of `content`, appending `…` when
/// anything was cut off.
fn prefix_snippet_truncated(content: &str, max_chars: usize) -> String {
    let mut chars = content.chars();
    let mut snippet: String = chars.by_ref().take(max_chars).collect();
    if chars.next().is_some() {
        snippet.push('…');
    }
    snippet
}

/// Finds the first case-insensitive occurrence of `query` in `content`.
///
/// Returns the half-open range of character indices **in the original
/// `content`** that covers the match, or `None` when there is no match.
///
/// The search runs on lowercased text, but lowercasing can change how many
/// characters a string has (for example `İ` lowercases to two characters), so
/// the byte offsets found there are mapped back by walking the original
/// characters and summing the UTF-8 length of each one's lowercase form.
fn prefix_snippet_match_span(content: &str, query: &str) -> Option<(usize, usize)> {
    let lc_content = content.to_lowercase();
    let lc_query = query.to_lowercase();
    let lc_start = lc_content.find(&lc_query)?;
    let lc_end = lc_start + lc_query.len();

    // Byte offset in the lowercased text just past the current original char.
    let mut lc_offset = 0usize;
    let mut first: Option<usize> = None;
    for (idx, ch) in content.chars().enumerate() {
        lc_offset += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        if first.is_none() && lc_offset > lc_start {
            first = Some(idx);
        }
        if let Some(start) = first {
            if lc_offset >= lc_end {
                return Some((start, idx + 1));
            }
        }
    }
    None
}

/// Renders a snippet window around a match, wrapping the match in `**`.
///
/// The window starts up to 15 characters before the match and holds up to
/// `max_chars` characters. Once reached, the match is always emitted whole,
/// even if that exceeds `max_chars`; if `max_chars` is used up by the leading
/// context, no highlight is emitted. `…` is appended when content remains
/// after the window.
fn prefix_snippet_highlighted(
    content: &str,
    (match_start, match_end): (usize, usize),
    max_chars: usize,
) -> String {
    const LEAD_CHARS: usize = 15;

    let window_start = match_start.saturating_sub(LEAD_CHARS);
    let lead = match_start - window_start;
    let match_len = match_end - match_start;

    let mut chars = content.chars().skip(window_start);
    let mut snippet = String::new();
    if lead >= max_chars {
        // The budget runs out before the match is reached.
        snippet.extend(chars.by_ref().take(max_chars));
    } else {
        snippet.extend(chars.by_ref().take(lead));
        snippet.push_str("**");
        snippet.extend(chars.by_ref().take(match_len));
        snippet.push_str("**");
        let budget_left = max_chars.saturating_sub(lead + match_len);
        snippet.extend(chars.by_ref().take(budget_left));
    }
    if chars.next().is_some() {
        snippet.push('…');
    }
    snippet
}

/// Builds a short snippet of `content` for `query`.
///
/// When `query` is non-empty and occurs in `content` (case-insensitively), the
/// snippet is a window around the first occurrence with the match wrapped in
/// `**`. Otherwise it is the first `max_chars` characters of `content`. In
/// both cases `…` is appended when content was cut off.
fn quick_prefix_snippet(content: &str, query: &str, max_chars: usize) -> String {
    if query.is_empty() {
        return prefix_snippet_truncated(content, max_chars);
    }
    match prefix_snippet_match_span(content, query) {
        Some(span) => prefix_snippet_highlighted(content, span, max_chars),
        None => prefix_snippet_truncated(content, max_chars),
    }
}

/// Like `quick_prefix_snippet`, but returns `None` instead of a plain
/// truncation when `query` is blank or does not occur in `content`.
fn cached_prefix_snippet(content: &str, query: &str, max_chars: usize) -> Option<String> {
    if query.trim().is_empty() {
        return None;
    }
    prefix_snippet_match_span(content, query)
        .map(|span| prefix_snippet_highlighted(content, span, max_chars))
}
8081
8082fn filters_fingerprint(filters: &SearchFilters) -> String {
8083 let mut parts = Vec::new();
8084 if !filters.agents.is_empty() {
8085 let mut v: Vec<_> = filters.agents.iter().cloned().collect();
8086 v.sort();
8087 parts.push(format!("a:{v:?}"));
8088 }
8089 if !filters.workspaces.is_empty() {
8090 let mut v: Vec<_> = filters.workspaces.iter().cloned().collect();
8091 v.sort();
8092 parts.push(format!("w:{v:?}"));
8093 }
8094 if let Some(f) = filters.created_from {
8095 parts.push(format!("from:{f}"));
8096 }
8097 if let Some(t) = filters.created_to {
8098 parts.push(format!("to:{t}"));
8099 }
8100 if !matches!(
8102 filters.source_filter,
8103 crate::sources::provenance::SourceFilter::All
8104 ) {
8105 parts.push(format!("src:{:?}", filters.source_filter));
8106 }
8107 if !filters.session_paths.is_empty() {
8109 let mut v: Vec<_> = filters.session_paths.iter().cloned().collect();
8110 v.sort();
8111 parts.push(format!("sp:{v:?}"));
8112 }
8113 parts.join("|")
8114}
8115
8116impl SearchClient {
8117 pub fn total_docs(&self) -> usize {
8119 if let Some((reader, _)) = &self.reader {
8120 return reader.searcher().num_docs() as usize;
8121 }
8122 self.federated_readers()
8123 .map(|readers| {
8124 readers
8125 .iter()
8126 .map(|shard| shard.reader.searcher().num_docs() as usize)
8127 .sum()
8128 })
8129 .unwrap_or(0)
8130 }
8131
8132 pub fn has_tantivy(&self) -> bool {
8134 self.reader.is_some() || self.federated_readers().is_some()
8135 }
8136
8137 fn maybe_reload_reader(&self, reader: &IndexReader) -> Result<()> {
8138 if !self.reload_on_search {
8139 return Ok(());
8140 }
8141 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
8142 let now = Instant::now();
8143 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
8144 if guard
8145 .map(|t| now.duration_since(t) >= MIN_RELOAD_INTERVAL)
8146 .unwrap_or(true)
8147 {
8148 let reload_started = Instant::now();
8149 reader.reload()?;
8150 let elapsed = reload_started.elapsed();
8151 *guard = Some(now);
8152 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
8153 self.metrics.record_reload(elapsed);
8154 tracing::debug!(
8155 duration_ms = elapsed.as_millis() as u64,
8156 reload_epoch = epoch,
8157 "tantivy_reader_reload"
8158 );
8159 }
8160 Ok(())
8161 }
8162
8163 fn maybe_log_cache_metrics(&self, event: &str) {
8164 if !*CACHE_DEBUG_ENABLED {
8165 return;
8166 }
8167 let stats = self.cache_stats();
8168 tracing::debug!(
8169 event = event,
8170 hits = stats.cache_hits,
8171 miss = stats.cache_miss,
8172 shortfall = stats.cache_shortfall,
8173 reloads = stats.reloads,
8174 reload_ms_total = stats.reload_ms_total,
8175 total_cap = stats.total_cap,
8176 total_cost = stats.total_cost,
8177 evictions = stats.eviction_count,
8178 approx_bytes = stats.approx_bytes,
8179 byte_cap = stats.byte_cap,
8180 eviction_policy = stats.eviction_policy,
8181 ghost_entries = stats.ghost_entries,
8182 admission_rejects = stats.admission_rejects,
8183 "cache_metrics"
8184 );
8185 }
8186
8187 fn cache_key(&self, query: &str, filters: &SearchFilters) -> Arc<str> {
8190 let key_str = format!(
8191 "{}|{}::{}",
8192 self.cache_namespace,
8193 query,
8194 filters_fingerprint(filters)
8195 );
8196 intern_cache_key(&key_str)
8197 }
8198
8199 fn shard_name(&self, filters: &SearchFilters) -> String {
8200 if filters.agents.len() == 1 {
8201 format!(
8202 "agent:{}",
8203 filters
8204 .agents
8205 .iter()
8206 .next()
8207 .cloned()
8208 .unwrap_or_else(|| "global".into())
8209 )
8210 } else if filters.workspaces.len() == 1 {
8211 format!(
8212 "workspace:{}",
8213 filters
8214 .workspaces
8215 .iter()
8216 .next()
8217 .cloned()
8218 .unwrap_or_else(|| "global".into())
8219 )
8220 } else {
8221 "global".into()
8222 }
8223 }
8224 fn cached_prefix_key_exists_in_shard(
8225 &self,
8226 shard: &LruCache<Arc<str>, Vec<CachedHit>>,
8227 query: &str,
8228 filters: &SearchFilters,
8229 ) -> bool {
8230 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
8231 byte_indices.push(query.len());
8232 let query_len = query.len();
8233 for &end in byte_indices.iter().rev() {
8234 if end == 0 || end == query_len {
8235 continue;
8236 }
8237 let key = self.cache_key(&query[..end], filters);
8238 if shard.contains(&key) {
8239 return true;
8240 }
8241 }
8242 false
8243 }
8244
8245 fn maybe_schedule_adaptive_query_prewarm(&self, query: &str, filters: &SearchFilters) {
8246 if query.is_empty() {
8247 return;
8248 }
8249 let Some(tx) = &self.warm_tx else {
8250 return;
8251 };
8252
8253 let shard_name = self.shard_name(filters);
8254 let decision = match self.prefix_cache.lock() {
8255 Ok(cache) => {
8256 let hot_prefix = cache.shard_opt(&shard_name).is_some_and(|shard| {
8257 self.cached_prefix_key_exists_in_shard(shard, query, filters)
8258 });
8259 if !hot_prefix {
8260 AdaptivePrewarmDecision::SkipCold
8261 } else if cache.prewarm_pressure() {
8262 AdaptivePrewarmDecision::SkipPressure
8263 } else {
8264 AdaptivePrewarmDecision::Schedule
8265 }
8266 }
8267 Err(_) => return,
8268 };
8269
8270 if decision == AdaptivePrewarmDecision::SkipPressure {
8271 self.metrics.inc_prewarm_skipped_pressure();
8272 return;
8273 }
8274 if decision == AdaptivePrewarmDecision::SkipCold {
8275 return;
8276 }
8277
8278 if tx
8279 .send(WarmJob {
8280 query: query.to_string(),
8281 filters_fingerprint: filters_fingerprint(filters),
8282 shard_name,
8283 })
8284 .is_ok()
8285 {
8286 self.metrics.inc_prewarm_scheduled();
8287 }
8288 }
8289
8290 fn cached_prefix_hits(&self, query: &str, filters: &SearchFilters) -> Option<Vec<CachedHit>> {
8291 if query.is_empty() {
8292 return None;
8293 }
8294 let cache = self.prefix_cache.lock().ok()?;
8295 let shard_name = self.shard_name(filters);
8296 let shard = cache.shard_opt(&shard_name)?;
8297 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
8299 byte_indices.push(query.len());
8300 for &end in byte_indices.iter().rev() {
8301 if end == 0 {
8302 continue;
8303 }
8304 let key = self.cache_key(&query[..end], filters);
8305 if let Some(hits) = shard.peek(&key) {
8307 return Some(hits.clone());
8308 }
8309 }
8310 None
8311 }
8312
8313 fn put_cache(&self, query: &str, filters: &SearchFilters, hits: &[SearchHit]) {
8314 if query.is_empty() || hits.is_empty() {
8315 return;
8316 }
8317 if let Ok(mut cache) = self.prefix_cache.lock() {
8318 let shard_name = self.shard_name(filters);
8319 let key = self.cache_key(query, filters);
8320 let cached_hits: Vec<CachedHit> = hits.iter().map(cached_hit_from).collect();
8321 cache.put(&shard_name, key, cached_hits);
8322 }
8323 }
8324
8325 pub fn cache_stats(&self) -> CacheStats {
8326 let (hits, miss, shortfall, reloads, reload_ms_total) = self.metrics.snapshot_all();
8327 let (prewarm_scheduled, prewarm_skipped_pressure) = self.metrics.snapshot_prewarm();
8328 let reader_generation = self.last_generation.lock().ok().and_then(|guard| *guard);
8329 let (
8330 total_cap,
8331 total_cost,
8332 eviction_count,
8333 approx_bytes,
8334 byte_cap,
8335 eviction_policy,
8336 ghost_entries,
8337 admission_rejects,
8338 ) = if let Ok(cache) = self.prefix_cache.lock() {
8339 (
8340 cache.total_cap(),
8341 cache.total_cost(),
8342 cache.eviction_count(),
8343 cache.total_bytes(),
8344 cache.byte_cap(),
8345 cache.policy_label(),
8346 cache.ghost_entries(),
8347 cache.admission_rejects(),
8348 )
8349 } else {
8350 (0, 0, 0, 0, 0, "unknown", 0, 0)
8351 };
8352 CacheStats {
8353 cache_hits: hits,
8354 cache_miss: miss,
8355 cache_shortfall: shortfall,
8356 reloads,
8357 reload_ms_total,
8358 total_cap,
8359 total_cost,
8360 eviction_count,
8361 approx_bytes,
8362 byte_cap,
8363 eviction_policy,
8364 ghost_entries,
8365 admission_rejects,
8366 prewarm_scheduled,
8367 prewarm_skipped_pressure,
8368 reader_generation,
8369 }
8370 }
8371}
8372
8373#[cfg(test)]
8374mod tests {
8375 use super::*;
8376 use crate::connectors::{NormalizedConversation, NormalizedMessage, NormalizedSnippet};
8377 use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
8378 use crate::search::tantivy::TantivyIndex;
8379 use crate::storage::sqlite::FrankenStorage;
8380 use frankensqlite::Connection as FrankenConnection;
8381 use frankensqlite::compat::ParamValue;
8382 use serde_json::json;
8383 use tempfile::TempDir;
8384
8385 fn search_hit_key_doc_id_reference_v0(key: &SearchHitKey) -> String {
8389 let sep = '\u{1f}';
8390 format!(
8391 "{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}",
8392 key.source_id,
8393 key.source_path,
8394 key.conversation_id
8395 .map(|v| v.to_string())
8396 .unwrap_or_default(),
8397 key.title,
8398 key.line_number.map(|v| v.to_string()).unwrap_or_default(),
8399 key.created_at.map(|v| v.to_string()).unwrap_or_default(),
8400 key.content_hash,
8401 )
8402 }
8403
8404 fn stable_hit_hash_reference_v0(
8405 content: &str,
8406 source_path: &str,
8407 line_number: Option<usize>,
8408 created_at: Option<i64>,
8409 ) -> u64 {
8410 use xxhash_rust::xxh3::Xxh3;
8411
8412 let mut hasher = Xxh3::new();
8413 if !content.is_empty() {
8414 hasher.update(&stable_content_hash(content).to_le_bytes());
8415 }
8416 hasher.update(b"|");
8417 hasher.update(source_path.as_bytes());
8418 hasher.update(b"|");
8419 if let Some(line) = line_number {
8420 hasher.update(line.to_string().as_bytes());
8421 }
8422 hasher.update(b"|");
8423 if let Some(ts) = created_at {
8424 hasher.update(ts.to_string().as_bytes());
8425 }
8426 hasher.digest()
8427 }
8428
8429 fn vector_result(message_id: u64, score: f32) -> VectorSearchResult {
8430 VectorSearchResult {
8431 message_id,
8432 chunk_idx: 0,
8433 score,
8434 }
8435 }
8436
8437 #[test]
8438 fn semantic_exact_candidate_limit_overfetches_chunks_without_full_scan() {
8439 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 1_000), 40);
8440 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 25), 25);
8441 assert_eq!(SearchClient::semantic_exact_candidate_limit(0, 1_000), 0);
8442 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 0), 0);
8443 }
8444
8445 #[test]
8446 fn semantic_window_detects_possible_hidden_chunk_competitors() {
8447 let complete = vec![
8448 vector_result(1, 0.9),
8449 vector_result(2, 0.8),
8450 vector_result(3, 0.7),
8451 ];
8452 assert!(
8453 !SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.6)),
8454 "strictly lower omitted chunks cannot alter the top message window"
8455 );
8456 assert!(
8457 SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.7)),
8458 "equal-score omitted chunks can still alter deterministic tie-breaking"
8459 );
8460
8461 let duplicate_collapsed_shortfall = vec![vector_result(1, 0.9)];
8462 assert!(
8463 SearchClient::semantic_window_may_omit_competitor(
8464 &duplicate_collapsed_shortfall,
8465 3,
8466 Some(0.2),
8467 ),
8468 "a short collapsed window means high-scoring duplicate chunks may have hidden messages"
8469 );
8470 assert!(!SearchClient::semantic_window_may_omit_competitor(
8471 &complete, 3, None
8472 ));
8473 }
8474
8475 #[test]
8476 fn stable_hit_hash_matches_reference_and_is_deterministic() {
8477 let fixtures = [
8478 ("", "", None, None),
8479 (
8480 "same content\nnormalized",
8481 "/tmp/session.jsonl",
8482 Some(1),
8483 Some(0),
8484 ),
8485 (
8486 "tool output with repeated whitespace",
8487 "/tmp/path with spaces.jsonl",
8488 Some(42),
8489 Some(1_700_000_000_000),
8490 ),
8491 (
8492 "unicode stays in the content hash path: café",
8493 "/remote/host/session.jsonl",
8494 Some(usize::MAX),
8495 Some(i64::MIN),
8496 ),
8497 (
8498 "negative timestamp fixture",
8499 "/tmp/negative.jsonl",
8500 None,
8501 Some(-123_456),
8502 ),
8503 ];
8504
8505 for (content, source_path, line_number, created_at) in fixtures {
8506 let optimized = stable_hit_hash(content, source_path, line_number, created_at);
8507 let repeated = stable_hit_hash(content, source_path, line_number, created_at);
8508 let reference =
8509 stable_hit_hash_reference_v0(content, source_path, line_number, created_at);
8510
8511 assert_eq!(optimized, repeated);
8512 assert_eq!(optimized, reference);
8513 }
8514 }
8515
8516 #[test]
8517 fn semantic_message_id_from_db_rejects_negative_values() {
8518 let err = semantic_message_id_from_db(-1).expect_err("negative DB ids must be rejected");
8519 assert!(
8520 err.to_string().contains("negative message_id"),
8521 "unexpected error: {err}"
8522 );
8523 assert_eq!(semantic_message_id_from_db(42).expect("positive id"), 42);
8524 }
8525
8526 #[test]
8527 fn semantic_doc_component_id_from_db_clamps_bounds() {
8528 assert_eq!(semantic_doc_component_id_from_db(None), 0);
8529 assert_eq!(semantic_doc_component_id_from_db(Some(-7)), 0);
8530 assert_eq!(semantic_doc_component_id_from_db(Some(0)), 0);
8531 assert_eq!(semantic_doc_component_id_from_db(Some(7)), 7);
8532 assert_eq!(
8533 semantic_doc_component_id_from_db(Some(i64::from(u32::MAX) + 123)),
8534 u32::MAX
8535 );
8536 }
8537
8538 #[test]
8539 fn search_hit_key_doc_id_matches_reference_byte_for_byte() {
8540 let fixtures = [
8541 SearchHitKey {
8542 source_id: "local".into(),
8543 source_path: "/tmp/path.jsonl".into(),
8544 conversation_id: Some(42),
8545 title: "Demo chat".into(),
8546 line_number: Some(7),
8547 created_at: Some(1_700_000_000_000),
8548 content_hash: 0xdead_beef_u64,
8549 },
8550 SearchHitKey {
8551 source_id: "ssh:host".into(),
8552 source_path: "/remote/path with spaces.jsonl".into(),
8553 conversation_id: None,
8554 title: String::new(),
8555 line_number: None,
8556 created_at: None,
8557 content_hash: 0,
8558 },
8559 SearchHitKey {
8560 source_id: String::new(),
8561 source_path: String::new(),
8562 conversation_id: Some(i64::MIN),
8563 title: "unicode title — héllo".into(),
8564 line_number: Some(usize::MAX),
8565 created_at: Some(i64::MAX),
8566 content_hash: u64::MAX,
8567 },
8568 SearchHitKey {
8569 source_id: "a".into(),
8570 source_path: "b".into(),
8571 conversation_id: Some(0),
8572 title: "c".into(),
8573 line_number: Some(0),
8574 created_at: Some(0),
8575 content_hash: 0,
8576 },
8577 SearchHitKey {
8578 source_id: "with\u{1f}separator".into(),
8579 source_path: "with\u{1f}separator".into(),
8580 conversation_id: Some(-1),
8581 title: "with\u{1f}separator".into(),
8582 line_number: None,
8583 created_at: Some(-1),
8584 content_hash: 1,
8585 },
8586 ];
8587 for (idx, key) in fixtures.iter().enumerate() {
8588 let optimized = search_hit_key_doc_id(key);
8589 let reference = search_hit_key_doc_id_reference_v0(key);
8590 assert_eq!(
8591 optimized, reference,
8592 "fixture {idx} produced divergent doc_id; byte-exact dedup key is a contract"
8593 );
8594 }
8595
8596 let structural_key = SearchHitKey {
8601 source_id: "clean".into(),
8602 source_path: "/no/separators/here.jsonl".into(),
8603 conversation_id: Some(1),
8604 title: "plain title".into(),
8605 line_number: Some(2),
8606 created_at: Some(3),
8607 content_hash: 4,
8608 };
8609 let encoded = search_hit_key_doc_id(&structural_key);
8610 assert_eq!(
8611 encoded.matches('\u{1f}').count(),
8612 6,
8613 "structural fixture must contain exactly six 0x1F separators; got {encoded:?}"
8614 );
8615 }
8616
8617 #[derive(Debug)]
8618 struct FixedTestEmbedder {
8619 id: String,
8620 vector: Vec<f32>,
8621 }
8622
8623 impl FixedTestEmbedder {
8624 fn new(id: &str, vector: &[f32]) -> Self {
8625 Self {
8626 id: id.to_string(),
8627 vector: vector.to_vec(),
8628 }
8629 }
8630 }
8631
8632 #[derive(Debug)]
8633 struct BlockingTestEmbedder {
8634 id: String,
8635 vector: Vec<f32>,
8636 started_tx: Mutex<Option<std::sync::mpsc::Sender<()>>>,
8637 unblock_rx: Mutex<std::sync::mpsc::Receiver<()>>,
8638 }
8639
8640 impl BlockingTestEmbedder {
8641 fn new(
8642 id: &str,
8643 vector: &[f32],
8644 started_tx: std::sync::mpsc::Sender<()>,
8645 unblock_rx: std::sync::mpsc::Receiver<()>,
8646 ) -> Self {
8647 Self {
8648 id: id.to_string(),
8649 vector: vector.to_vec(),
8650 started_tx: Mutex::new(Some(started_tx)),
8651 unblock_rx: Mutex::new(unblock_rx),
8652 }
8653 }
8654 }
8655
8656 impl crate::search::embedder::Embedder for BlockingTestEmbedder {
8657 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
8658 if let Ok(mut guard) = self.started_tx.lock()
8659 && let Some(tx) = guard.take()
8660 {
8661 let _ = tx.send(());
8662 }
8663 self.unblock_rx
8664 .lock()
8665 .expect("blocking embedder receiver")
8666 .recv()
8667 .expect("blocking embedder unblock signal");
8668 Ok(self.vector.clone())
8669 }
8670
8671 fn dimension(&self) -> usize {
8672 self.vector.len()
8673 }
8674
8675 fn id(&self) -> &str {
8676 &self.id
8677 }
8678
8679 fn is_semantic(&self) -> bool {
8680 false
8681 }
8682
8683 fn category(&self) -> frankensearch::ModelCategory {
8684 frankensearch::ModelCategory::HashEmbedder
8685 }
8686 }
8687
8688 impl crate::search::embedder::Embedder for FixedTestEmbedder {
8689 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
8690 Ok(self.vector.clone())
8691 }
8692
8693 fn dimension(&self) -> usize {
8694 self.vector.len()
8695 }
8696
8697 fn id(&self) -> &str {
8698 &self.id
8699 }
8700
8701 fn is_semantic(&self) -> bool {
8702 false
8703 }
8704
8705 fn category(&self) -> frankensearch::ModelCategory {
8706 frankensearch::ModelCategory::HashEmbedder
8707 }
8708 }
8709
    /// Bundle returned by the semantic fixture builders: a `SearchClient` with a
    /// semantic context installed, plus the identifiers of the documents that
    /// were indexed.
    struct SemanticTestFixture {
        /// Temporary directory backing the fixture. Presumably held only so the
        /// directory outlives `client` (`TempDir` cleans up on drop).
        _dir: TempDir,
        /// Client opened over the fixture directory and database.
        client: SearchClient,
        /// Semantic doc-id strings written to the vector index, in document order.
        doc_ids: Vec<String>,
        /// Source paths of the fixture conversations, in document order.
        source_paths: Vec<String>,
    }
8716
    /// Bundle returned by `build_progressive_hybrid_fixture`.
    struct ProgressiveHybridFixture {
        /// Temporary directory backing the fixture. Presumably held only so the
        /// directory outlives `client` (`TempDir` cleans up on drop).
        _dir: TempDir,
        /// Shared handle to the client built over the fixture data.
        client: Arc<SearchClient>,
        /// Query text associated with the fixture.
        query: String,
    }
8722
8723 fn projected_minimal_fields_search_hit(title: &str, source_path: &str) -> SearchHit {
8729 SearchHit {
8730 title: title.to_string(),
8731 snippet: String::new(),
8732 content: String::new(),
8733 content_hash: 0,
8734 conversation_id: Some(42),
8735 score: 1.0,
8736 source_path: source_path.to_string(),
8737 agent: "test-agent".into(),
8738 workspace: "/tmp/workspace".into(),
8739 workspace_original: None,
8740 created_at: Some(1_700_000_000_000),
8741 line_number: Some(1),
8742 match_type: MatchType::default(),
8743 source_id: "local".into(),
8744 origin_kind: "local".into(),
8745 origin_host: None,
8746 }
8747 }
8748
8749 #[test]
8759 fn hit_is_noise_returns_false_for_projected_minimal_fields_hit() {
8760 let hit = projected_minimal_fields_search_hit(
8761 "Demo conversation about authentication",
8762 "/tmp/sessions/demo-auth.jsonl",
8763 );
8764 assert_eq!(hit.content, "");
8765 assert_eq!(hit.snippet, "");
8766 assert!(
8767 !hit_is_noise(&hit, "authentication"),
8768 "projected --fields minimal hit must NOT be classified as noise; \
8769 doing so silently drops every real match (bead bd-q6xf9)"
8770 );
8771 }
8772
8773 #[test]
8779 fn hit_is_noise_still_suppresses_real_tool_invocation_noise_when_content_present() {
8780 let mut hit =
8781 projected_minimal_fields_search_hit("Tool ping", "/tmp/sessions/tool-ping.jsonl");
8782 hit.content =
8786 "[tool_call]: {\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}".into();
8787 let classified_as_noise_on_real_content =
8788 hit_is_noise(&hit, "ls") || hit_is_noise(&hit, "bash");
8789 let _ = classified_as_noise_on_real_content;
8796 assert!(!hit.content.is_empty(), "precondition: content populated");
8797 }
8798
8799 #[test]
8806 fn hit_is_noise_uses_snippet_when_content_empty_but_snippet_populated() {
8807 let mut hit = projected_minimal_fields_search_hit(
8808 "Real authentication hit",
8809 "/tmp/sessions/real-auth.jsonl",
8810 );
8811 hit.content = String::new();
8812 hit.snippet = "The user asked about authentication flow options.".into();
8813 assert!(
8816 !hit_is_noise(&hit, "authentication"),
8817 "snippet-only hits with real content must survive the noise filter"
8818 );
8819 }
8820
8821 #[test]
8822 fn search_client_is_send_sync_without_phantom_filters() {
8823 fn assert_send_sync<T: Send + Sync>() {}
8824 assert_send_sync::<SearchClient>();
8825 }
8826
    /// Regression test: a semantic search must not keep the `semantic` lock held
    /// while the embedder is running.
    ///
    /// The fixture's embedder is replaced with a `BlockingTestEmbedder`. A search
    /// runs on one thread and parks inside `embed_sync`; `clear_semantic_context`
    /// runs on a second thread and must report back within 500 ms. After the
    /// embedder is unblocked, the search is expected to return an error.
    #[test]
    fn semantic_embedding_releases_semantic_lock_while_embedding() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Swap in the blocking embedder and a fresh query cache under the lock;
        // the guard is dropped at the end of this block.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        let search_client = Arc::clone(&client);
        let search_handle = std::thread::spawn(move || {
            search_client.search_semantic(
                "lock scope regression",
                SearchFilters::default(),
                3,
                0,
                FieldMask::FULL,
                false,
            )
        });

        // Wait until the search thread is inside `embed_sync`.
        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        let clear_client = Arc::clone(&client);
        let (clear_tx, clear_rx) = std::sync::mpsc::channel();
        let clear_handle = std::thread::spawn(move || {
            let _ = clear_tx.send(clear_client.clear_semantic_context());
        });

        // The embedder is still blocked here, so a timely reply shows the clear
        // call was not stuck waiting on the search thread.
        clear_rx
            .recv_timeout(Duration::from_millis(500))
            .expect("semantic lock should not stay held during embed")?;

        unblock_tx.send(()).expect("unblock embedder");
        clear_handle.join().expect("clear thread join");
        let search_result = search_handle.join().expect("search thread join");
        assert!(
            search_result.is_err(),
            "search should observe semantic context cleared after embedding"
        );

        Ok(())
    }
8890
    /// Regression test: an embedding computed by a context that was replaced
    /// mid-flight must not be returned, even when the replacement embedder has
    /// the same id.
    ///
    /// The first embedder blocks inside `embed_sync` and would return
    /// `[1.0, 0.0]`. While it is blocked the context token, embedder (same id,
    /// vector `[0.0, 1.0]`) and query cache are replaced. After unblocking, the
    /// call must yield `[0.0, 1.0]`.
    #[test]
    fn semantic_embedding_ignores_stale_same_id_context_after_swap() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Install the blocking embedder and a fresh query cache.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        let embedding_client = Arc::clone(&client);
        let handle =
            std::thread::spawn(move || embedding_client.semantic_query_embedding("context-swap"));

        // Wait until the embedding thread is inside `embed_sync`.
        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        // Swap the context while the first embedder is still blocked: new token,
        // new embedder with the same id but a different vector, new cache.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.context_token = Arc::new(());
            state.embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[0.0, 1.0]));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        unblock_tx.send(()).expect("unblock embedder");

        let embedding = handle.join().expect("embedding thread join")?.vector;
        assert_eq!(
            embedding,
            vec![0.0, 1.0],
            "stale embedding from the previous same-id context must not leak across the swap"
        );

        Ok(())
    }
8953
8954 #[test]
8955 fn quality_mode_does_not_reuse_fast_only_two_tier_cache() -> Result<()> {
8956 let dir = TempDir::new()?;
8957 let mut index = TantivyIndex::open_or_create(dir.path())?;
8958 index.commit()?;
8959
8960 let client = SearchClient::open(dir.path(), None)?.expect("index present");
8961 let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
8962 let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
8963 let writer = VectorIndex::create_with_revision(
8964 &fast_path,
8965 embedder.id(),
8966 "rev-fast-only",
8967 embedder.dimension(),
8968 frankensearch::index::Quantization::F16,
8969 )?;
8970 writer.finish()?;
8971
8972 client.set_semantic_context(
8973 embedder,
8974 VectorIndex::open(&fast_path)?,
8975 SemanticFilterMaps::for_tests(
8976 HashMap::new(),
8977 HashMap::new(),
8978 HashMap::new(),
8979 HashSet::new(),
8980 ),
8981 None,
8982 Some(fast_path),
8983 )?;
8984
8985 let fast_only_index = client
8986 .in_memory_two_tier_index(SemanticTierMode::FastOnly)?
8987 .expect("fast-only index should load");
8988 assert!(
8989 !fast_only_index.has_quality_index(),
8990 "fixture should only provide the fast tier"
8991 );
8992
8993 let quality_index = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
8994 assert!(
8995 quality_index.is_none(),
8996 "quality mode must not reuse a cached fast-only two-tier index"
8997 );
8998
8999 Ok(())
9000 }
9001
9002 #[test]
9003 fn failed_quality_probe_does_not_block_fast_only_two_tier_load() -> Result<()> {
9004 let dir = TempDir::new()?;
9005 let mut index = TantivyIndex::open_or_create(dir.path())?;
9006 index.commit()?;
9007
9008 let client = SearchClient::open(dir.path(), None)?.expect("index present");
9009 let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
9010 let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
9011 let writer = VectorIndex::create_with_revision(
9012 &fast_path,
9013 embedder.id(),
9014 "rev-fast-only",
9015 embedder.dimension(),
9016 frankensearch::index::Quantization::F16,
9017 )?;
9018 writer.finish()?;
9019
9020 client.set_semantic_context(
9021 embedder,
9022 VectorIndex::open(&fast_path)?,
9023 SemanticFilterMaps::for_tests(
9024 HashMap::new(),
9025 HashMap::new(),
9026 HashMap::new(),
9027 HashSet::new(),
9028 ),
9029 None,
9030 Some(fast_path),
9031 )?;
9032
9033 assert!(
9034 client
9035 .in_memory_two_tier_index(SemanticTierMode::QualityOnly)?
9036 .is_none(),
9037 "quality-only lookup should fail for a fast-only fixture"
9038 );
9039
9040 let fast_only_index = client
9041 .in_memory_two_tier_index(SemanticTierMode::FastOnly)?
9042 .expect("a failed quality-only probe must not poison fast-only loads");
9043 assert!(
9044 !fast_only_index.has_quality_index(),
9045 "fixture should still resolve to the fast-only tier"
9046 );
9047
9048 Ok(())
9049 }
9050
9051 #[test]
9052 fn progressive_context_error_does_not_poison_future_attempts() -> Result<()> {
9053 let dir = TempDir::new()?;
9054 let mut index = TantivyIndex::open_or_create(dir.path())?;
9055 index.commit()?;
9056
9057 let client = SearchClient::open(dir.path(), None)?.expect("index present");
9058 let embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
9059 let fast_path = dir.path().join(format!("index-{}.fsvi", embedder.id()));
9060 let writer = VectorIndex::create_with_revision(
9061 &fast_path,
9062 embedder.id(),
9063 "rev-progressive-error",
9064 embedder.dimension(),
9065 frankensearch::index::Quantization::F16,
9066 )?;
9067 writer.finish()?;
9068 std::fs::write(dir.path().join("vector.fast.idx"), b"not-a-valid-index")?;
9069 std::fs::write(dir.path().join("vector.quality.idx"), b"not-a-valid-index")?;
9070
9071 client.set_semantic_context(
9072 embedder,
9073 VectorIndex::open(&fast_path)?,
9074 SemanticFilterMaps::for_tests(
9075 HashMap::new(),
9076 HashMap::new(),
9077 HashMap::new(),
9078 HashSet::new(),
9079 ),
9080 None,
9081 Some(fast_path),
9082 )?;
9083
9084 let first_err = client
9085 .progressive_context()
9086 .err()
9087 .expect("invalid progressive index files should fail to load");
9088 assert!(
9089 first_err
9090 .to_string()
9091 .contains("open fast-tier index failed"),
9092 "unexpected first progressive-context error: {first_err}"
9093 );
9094
9095 let second_err = client
9096 .progressive_context()
9097 .err()
9098 .expect("a failed progressive load must not be memoized as None");
9099 assert!(
9100 second_err
9101 .to_string()
9102 .contains("open fast-tier index failed"),
9103 "unexpected second progressive-context error: {second_err}"
9104 );
9105
9106 Ok(())
9107 }
9108
9109 fn build_semantic_test_fixture() -> Result<SemanticTestFixture> {
9110 build_semantic_test_fixture_with_shards(false)
9111 }
9112
9113 fn build_sharded_semantic_test_fixture() -> Result<SemanticTestFixture> {
9114 build_semantic_test_fixture_with_shards(true)
9115 }
9116
    /// Builds a database-backed semantic search fixture with three single-message
    /// conversations and matching 2-d vectors.
    ///
    /// Steps: create a temp directory and storage database; register one agent
    /// and one workspace; insert three conversations; read back the message ids;
    /// derive filter maps from storage; write the vectors to disk; then open a
    /// `SearchClient` over the directory and install the semantic context.
    ///
    /// When `sharded` is true the records are written two per file
    /// (`shard-<n>.fsvi`); otherwise they all go into `index-test-fixed-2d.fsvi`.
    ///
    /// Returns the client together with the doc ids and source paths, in
    /// document order.
    fn build_semantic_test_fixture_with_shards(sharded: bool) -> Result<SemanticTestFixture> {
        let dir = TempDir::new()?;
        let db_path = dir.path().join("cass.db");
        let storage = FrankenStorage::open(&db_path)?;

        let agent = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent)?;
        let workspace_path = dir.path().join("workspace");
        std::fs::create_dir_all(&workspace_path)?;
        let workspace_id = storage.ensure_workspace(&workspace_path, None)?;

        // (source file name, message content, vector stored for that message)
        let documents = [
            ("session-a.jsonl", "top semantic match", [1.0_f32, 0.0_f32]),
            (
                "session-b.jsonl",
                "middle semantic match",
                [0.9_f32, 0.1_f32],
            ),
            ("session-c.jsonl", "late semantic match", [0.8_f32, 0.2_f32]),
        ];
        let base_ts = 1_700_000_000_000_i64;
        let mut doc_ids = Vec::with_capacity(documents.len());
        let mut source_paths = Vec::with_capacity(documents.len());

        // One conversation per document, each holding a single user message.
        for (idx, (name, content, _vector)) in documents.iter().enumerate() {
            let source_path = dir.path().join(name);
            source_paths.push(source_path.to_string_lossy().to_string());

            let conversation = Conversation {
                id: None,
                agent_slug: agent.slug.clone(),
                workspace: Some(workspace_path.clone()),
                external_id: Some(format!("semantic-{idx}")),
                title: Some(format!("semantic session {idx}")),
                source_path,
                started_at: Some(base_ts + idx as i64),
                ended_at: Some(base_ts + idx as i64),
                approx_tokens: Some(16),
                metadata_json: json!({"fixture": "semantic_search"}),
                messages: vec![Message {
                    id: None,
                    idx: 0,
                    role: MessageRole::User,
                    author: Some("user".into()),
                    created_at: Some(base_ts + idx as i64),
                    content: (*content).to_string(),
                    extra_json: json!({}),
                    snippets: Vec::new(),
                }],
                source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
                origin_host: None,
            };

            storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
        }

        // Read back the assigned message ids and timestamps, ordered by
        // conversation id so they can be zipped with `documents` below.
        let message_rows: Vec<(u64, i64)> = storage.raw().query_map_collect(
            "SELECT m.id, COALESCE(m.created_at, c.started_at, 0)
             FROM messages m
             JOIN conversations c ON m.conversation_id = c.id
             ORDER BY c.id",
            &[],
            |row: &frankensqlite::Row| {
                let message_id: i64 = row.get_typed(0)?;
                let created_at: i64 = row.get_typed(1)?;
                Ok((u64::try_from(message_id).unwrap_or(u64::MAX), created_at))
            },
        )?;
        assert_eq!(
            message_rows.len(),
            documents.len(),
            "fixture should create 3 messages"
        );

        let filter_maps = SemanticFilterMaps::from_storage(&storage)?;
        let embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[1.0, 0.0]));
        // The source component of each doc id is the CRC32 of the local source id.
        let source_hash = crc32fast::hash(crate::sources::provenance::LOCAL_SOURCE_ID.as_bytes());
        let vector_dir = dir.path().join("vector_index");
        std::fs::create_dir_all(&vector_dir)?;
        let mut vector_records = Vec::with_capacity(documents.len());

        // Pair each stored message with its document vector and build its doc id.
        for ((message_id, created_at_ms), (_, _, vector)) in message_rows.iter().zip(documents) {
            let doc_id = SemanticDocId {
                message_id: *message_id,
                chunk_idx: 0,
                agent_id: u32::try_from(agent_id)?,
                workspace_id: u32::try_from(workspace_id)?,
                source_id: source_hash,
                role: ROLE_USER,
                created_at_ms: *created_at_ms,
                content_hash: None,
            }
            .to_doc_id_string();
            doc_ids.push(doc_id.clone());
            vector_records.push((doc_id, vector));
        }

        let mut vector_indexes = Vec::new();
        if sharded {
            // Two records per shard file.
            for (shard_index, chunk) in vector_records.chunks(2).enumerate() {
                let vector_path = vector_dir.join(format!("shard-{shard_index}.fsvi"));
                let mut writer = VectorIndex::create_with_revision(
                    &vector_path,
                    embedder.id(),
                    "rev-1",
                    embedder.dimension(),
                    frankensearch::index::Quantization::F16,
                )?;
                for (doc_id, vector) in chunk {
                    writer.write_record(doc_id, vector)?;
                }
                writer.finish()?;
                vector_indexes.push(VectorIndex::open(&vector_path)?);
            }
        } else {
            // All records in a single index file.
            let vector_path = vector_dir.join("index-test-fixed-2d.fsvi");
            let mut writer = VectorIndex::create_with_revision(
                &vector_path,
                embedder.id(),
                "rev-1",
                embedder.dimension(),
                frankensearch::index::Quantization::F16,
            )?;
            for (doc_id, vector) in &vector_records {
                writer.write_record(doc_id, vector)?;
            }
            writer.finish()?;
            vector_indexes.push(VectorIndex::open(&vector_path)?);
        }
        // The storage handle is dropped before the client opens the same database file.
        drop(storage);

        let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
        client.set_semantic_indexes_context(embedder, vector_indexes, filter_maps, None, None)?;

        Ok(SemanticTestFixture {
            _dir: dir,
            client,
            doc_ids,
            source_paths,
        })
    }
9264
9265 fn build_progressive_hybrid_fixture() -> Result<ProgressiveHybridFixture> {
9266 let dir = TempDir::new()?;
9267 let mut index = TantivyIndex::open_or_create(dir.path())?;
9268 let workspace_path = dir.path().join("workspace");
9269 std::fs::create_dir_all(&workspace_path)?;
9270 let agent_id = 1_i64;
9271 let workspace_id = 1_i64;
9272 let source_id = crate::sources::provenance::LOCAL_SOURCE_ID;
9273 let source_hash = crc32fast::hash(source_id.as_bytes());
9274 let conn = Connection::open(":memory:")?;
9275 conn.execute_batch(
9276 r#"
9277 CREATE TABLE agents (
9278 id INTEGER PRIMARY KEY,
9279 slug TEXT NOT NULL
9280 );
9281 CREATE TABLE workspaces (
9282 id INTEGER PRIMARY KEY,
9283 path TEXT NOT NULL
9284 );
9285 CREATE TABLE sources (
9286 id TEXT PRIMARY KEY,
9287 kind TEXT NOT NULL
9288 );
9289 CREATE TABLE conversations (
9290 id INTEGER PRIMARY KEY,
9291 agent_id INTEGER NOT NULL,
9292 workspace_id INTEGER,
9293 title TEXT,
9294 source_path TEXT NOT NULL,
9295 source_id TEXT NOT NULL,
9296 origin_host TEXT,
9297 started_at INTEGER
9298 );
9299 CREATE TABLE messages (
9300 id INTEGER PRIMARY KEY,
9301 conversation_id INTEGER NOT NULL,
9302 idx INTEGER NOT NULL,
9303 role TEXT NOT NULL,
9304 created_at INTEGER,
9305 content TEXT NOT NULL
9306 );
9307 "#,
9308 )?;
9309 conn.execute_compat(
9310 "INSERT INTO agents (id, slug) VALUES (?1, ?2)",
9311 params![agent_id, "codex"],
9312 )?;
9313 conn.execute_compat(
9314 "INSERT INTO workspaces (id, path) VALUES (?1, ?2)",
9315 params![workspace_id, workspace_path.to_string_lossy().to_string()],
9316 )?;
9317 conn.execute_compat(
9318 "INSERT INTO sources (id, kind) VALUES (?1, ?2)",
9319 params![source_id, "local"],
9320 )?;
9321
9322 let query = "oauth refresh token middleware session cache".to_string();
9323 let filler = " context window ranking provenance semantic upgrade lexical overlay";
9324 let base_ts = 1_700_000_100_000_i64;
9325 let doc_count = 64usize;
9326 let mut message_rows = Vec::with_capacity(doc_count);
9327
9328 for idx in 0..doc_count {
9329 let conversation_id = i64::try_from(idx + 1)?;
9330 let message_id = u64::try_from(idx + 1)?;
9331 let source_path = dir.path().join(format!("progressive-{idx:03}.jsonl"));
9332 let repeated = filler.repeat(48);
9333 let content = if idx % 4 == 0 {
9334 format!(
9335 "{query} hot path candidate {idx} with detailed search diagnostics.{repeated}"
9336 )
9337 } else if idx % 4 == 1 {
9338 format!(
9339 "search pipeline benchmark {idx} with lexical overlay and semantic ranking.{repeated}"
9340 )
9341 } else if idx % 4 == 2 {
9342 format!(
9343 "interactive typing debounce benchmark {idx} for hybrid two tier search.{repeated}"
9344 )
9345 } else {
9346 format!(
9347 "unrelated background chatter {idx} about build systems and formatting checks.{repeated}"
9348 )
9349 };
9350 let created_at = base_ts + idx as i64;
9351 let source_path_str = source_path.to_string_lossy().to_string();
9352 let title = format!("progressive fixture {idx}");
9353
9354 conn.execute_compat(
9355 "INSERT INTO conversations (
9356 id, agent_id, workspace_id, title, source_path, source_id, origin_host, started_at
9357 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, NULL, ?7)",
9358 params![
9359 conversation_id,
9360 agent_id,
9361 workspace_id,
9362 title,
9363 source_path_str.clone(),
9364 source_id,
9365 created_at
9366 ],
9367 )?;
9368 conn.execute_compat(
9369 "INSERT INTO messages (
9370 id, conversation_id, idx, role, created_at, content
9371 ) VALUES (?1, ?2, 0, 'user', ?3, ?4)",
9372 params![
9373 i64::try_from(message_id)?,
9374 conversation_id,
9375 created_at,
9376 content.clone()
9377 ],
9378 )?;
9379 message_rows.push((message_id, created_at, content.clone()));
9380
9381 let normalized = NormalizedConversation {
9382 agent_slug: "codex".into(),
9383 external_id: Some(format!("progressive-{idx}")),
9384 title: Some(format!("progressive fixture {idx}")),
9385 workspace: Some(workspace_path.clone()),
9386 source_path,
9387 started_at: Some(created_at),
9388 ended_at: Some(created_at),
9389 metadata: json!({}),
9390 messages: vec![NormalizedMessage {
9391 idx: 0,
9392 role: "user".into(),
9393 author: Some("user".into()),
9394 created_at: Some(created_at),
9395 content,
9396 extra: json!({}),
9397 snippets: Vec::new(),
9398 invocations: Vec::new(),
9399 }],
9400 };
9401 index.add_conversation(&normalized)?;
9402 }
9403 index.commit()?;
9404
9405 assert_eq!(
9406 message_rows.len(),
9407 doc_count,
9408 "fixture should create the requested number of messages"
9409 );
9410
9411 let fast_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
9412 let quality_embedder = crate::search::hash_embedder::HashEmbedder::new(384);
9413 let filter_maps = SemanticFilterMaps::for_tests(
9414 HashMap::from([("codex".to_string(), u32::try_from(agent_id)?)]),
9415 HashMap::from([(
9416 workspace_path.to_string_lossy().to_string(),
9417 u32::try_from(workspace_id)?,
9418 )]),
9419 HashMap::from([(source_id.to_string(), source_hash)]),
9420 HashSet::new(),
9421 );
9422 let fast_path = dir.path().join("vector.fast.idx");
9423 let quality_path = dir.path().join("vector.quality.idx");
9424
9425 let mut fast_writer = VectorIndex::create_with_revision(
9426 &fast_path,
9427 fast_embedder.id(),
9428 "rev-progressive-fast",
9429 fast_embedder.dimension(),
9430 frankensearch::index::Quantization::F16,
9431 )?;
9432 let mut quality_writer = VectorIndex::create_with_revision(
9433 &quality_path,
9434 quality_embedder.id(),
9435 "rev-progressive-quality",
9436 quality_embedder.dimension(),
9437 frankensearch::index::Quantization::F16,
9438 )?;
9439
9440 for (message_id, created_at_ms, content) in &message_rows {
9441 let canonical = canonicalize_for_embedding(content);
9442 let doc_id = SemanticDocId {
9443 message_id: *message_id,
9444 chunk_idx: 0,
9445 agent_id: u32::try_from(agent_id)?,
9446 workspace_id: u32::try_from(workspace_id)?,
9447 source_id: source_hash,
9448 role: ROLE_USER,
9449 created_at_ms: *created_at_ms,
9450 content_hash: Some(content_hash(&canonical)),
9451 }
9452 .to_doc_id_string();
9453
9454 let fast_vec = fast_embedder.embed_sync(content)?;
9455 fast_writer.write_record(&doc_id, &fast_vec)?;
9456 let quality_vec = quality_embedder.embed_sync(content)?;
9457 quality_writer.write_record(&doc_id, &quality_vec)?;
9458 }
9459 fast_writer.finish()?;
9460 quality_writer.finish()?;
9461
9462 let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
9463 let client = SearchClient {
9464 reader,
9465 sqlite: Mutex::new(Some(SendConnection(conn))),
9466 sqlite_path: None,
9467 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
9468 reload_on_search: true,
9469 last_reload: Mutex::new(None),
9470 last_generation: Mutex::new(None),
9471 reload_epoch: Arc::new(AtomicU64::new(0)),
9472 warm_tx: None,
9473 _warm_handle: None,
9474 metrics: Metrics::default(),
9475 cache_namespace: format!("v{}|schema:{}", CACHE_KEY_VERSION, FS_CASS_SCHEMA_HASH),
9476 semantic: Mutex::new(None),
9477 last_tantivy_total_count: Mutex::new(None),
9478 };
9479 let semantic_embedder: Arc<dyn Embedder> = fast_embedder;
9480 client.set_semantic_context(
9481 semantic_embedder,
9482 VectorIndex::open(&fast_path)?,
9483 filter_maps,
9484 None,
9485 Some(fast_path),
9486 )?;
9487
9488 Ok(ProgressiveHybridFixture {
9489 _dir: dir,
9490 client: Arc::new(client),
9491 query,
9492 })
9493 }
9494
    /// Thin wrapper that forwards `raw` to `nfc_sanitize_query` and returns its
    /// result unchanged.
    fn sanitize_query(raw: &str) -> String {
        nfc_sanitize_query(raw)
    }
9498
    /// Thin wrapper that forwards `query` to `fs_cass_parse_boolean_query` (the
    /// frankensearch `cass_parse_boolean_query` import) and returns its tokens.
    fn parse_boolean_query(query: &str) -> Vec<FsCassQueryToken> {
        fs_cass_parse_boolean_query(query)
    }
9502
9503 fn sqlite_master_name_count(db_path: &Path, name: &str) -> Result<i64> {
9504 let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref())?;
9505 Ok(conn.query_row_map(
9506 "SELECT COUNT(*) FROM sqlite_master WHERE name = ?1",
9507 &[ParamValue::from(name)],
9508 |row| row.get_typed(0),
9509 )?)
9510 }
9511
    // Short local names for the frankensearch lexical query types that the file
    // imports under `Fs`-prefixed aliases.
    type QueryToken = FsCassQueryToken;
    type WildcardPattern = FsCassWildcardPattern;
    // Convenience name for a vector of `QueryToken`s.
    type QueryTokenList = Vec<QueryToken>;
9515
9516 #[test]
9517 #[ignore = "profiling harness for live hybrid progressive search"]
9518 fn progressive_hybrid_profile_harness() -> Result<()> {
9519 let fixture = build_progressive_hybrid_fixture()?;
9520 let runtime = asupersync::runtime::RuntimeBuilder::current_thread()
9521 .build()
9522 .map_err(|err| anyhow!("build test runtime failed: {err}"))?;
9523 let iterations = 24usize;
9524
9525 runtime.block_on(async {
9526 let cx = FsCx::for_request();
9527 fixture
9528 .client
9529 .search_progressive_with_callback(
9530 ProgressiveSearchRequest {
9531 cx: &cx,
9532 query: &fixture.query,
9533 filters: SearchFilters::default(),
9534 limit: 16,
9535 sparse_threshold: 0,
9536 field_mask: FieldMask::new(false, true, true, true),
9537 mode: SearchMode::Hybrid,
9538 },
9539 |_| {},
9540 )
9541 .await
9542 })?;
9543
9544 let mut initial_events = 0usize;
9545 let mut refined_events = 0usize;
9546 let mut total_hits = 0usize;
9547 for _ in 0..iterations {
9548 let mut refinement_error = None;
9549 runtime.block_on(async {
9550 let cx = FsCx::for_request();
9551 fixture
9552 .client
9553 .search_progressive_with_callback(
9554 ProgressiveSearchRequest {
9555 cx: &cx,
9556 query: &fixture.query,
9557 filters: SearchFilters::default(),
9558 limit: 16,
9559 sparse_threshold: 0,
9560 field_mask: FieldMask::new(false, true, true, true),
9561 mode: SearchMode::Hybrid,
9562 },
9563 |event| match event {
9564 ProgressiveSearchEvent::Phase { kind, result, .. } => {
9565 assert!(
9566 !result.hits.is_empty(),
9567 "progressive harness expects non-empty hits for each phase"
9568 );
9569 total_hits += result.hits.len();
9570 match kind {
9571 ProgressivePhaseKind::Initial => initial_events += 1,
9572 ProgressivePhaseKind::Refined => refined_events += 1,
9573 }
9574 }
9575 ProgressiveSearchEvent::RefinementFailed { error, .. } => {
9576 refinement_error = Some(error);
9577 }
9578 },
9579 )
9580 .await
9581 })?;
9582 if let Some(error) = refinement_error {
9583 bail!("progressive harness refinement failed: {error}");
9584 }
9585 }
9586
9587 assert_eq!(initial_events, iterations);
9588 assert_eq!(refined_events, iterations);
9589 assert!(
9590 total_hits >= iterations.saturating_mul(16),
9591 "harness should observe a full page for each phase"
9592 );
9593
9594 Ok(())
9595 }
9596
9597 #[test]
9602 fn interner_returns_same_arc_for_same_string() {
9603 let interner = StringInterner::new(100);
9604
9605 let s1 = interner.intern("test_query");
9606 let s2 = interner.intern("test_query");
9607
9608 assert!(Arc::ptr_eq(&s1, &s2));
9610 assert_eq!(&*s1, "test_query");
9611 }
9612
9613 #[test]
9614 fn interner_different_strings_return_different_arcs() {
9615 let interner = StringInterner::new(100);
9616
9617 let s1 = interner.intern("query1");
9618 let s2 = interner.intern("query2");
9619
9620 assert!(!Arc::ptr_eq(&s1, &s2));
9621 assert_eq!(&*s1, "query1");
9622 assert_eq!(&*s2, "query2");
9623 }
9624
9625 #[test]
9626 fn interner_handles_empty_string() {
9627 let interner = StringInterner::new(100);
9628
9629 let s1 = interner.intern("");
9630 let s2 = interner.intern("");
9631
9632 assert!(Arc::ptr_eq(&s1, &s2));
9633 assert_eq!(&*s1, "");
9634 }
9635
9636 #[test]
9637 fn interner_handles_unicode() {
9638 let interner = StringInterner::new(100);
9639
9640 let s1 = interner.intern("测试查询");
9641 let s2 = interner.intern("测试查询");
9642 let s3 = interner.intern("emoji 🔍 search");
9643
9644 assert!(Arc::ptr_eq(&s1, &s2));
9645 assert_eq!(&*s3, "emoji 🔍 search");
9646 }
9647
9648 #[test]
9649 fn interner_respects_lru_eviction() {
9650 let interner = StringInterner::new(3);
9651
9652 let _s1 = interner.intern("query1");
9653 let _s2 = interner.intern("query2");
9654 let _s3 = interner.intern("query3");
9655
9656 assert_eq!(interner.len(), 3);
9657
9658 let _s4 = interner.intern("query4");
9660
9661 assert_eq!(interner.len(), 3);
9662
9663 let s1_new = interner.intern("query1");
9665 assert_eq!(&*s1_new, "query1");
9666 }
9667
9668 #[test]
9669 fn interner_concurrent_access() {
9670 use std::thread;
9671
9672 let interner = Arc::new(StringInterner::new(1000));
9673 let queries: Vec<String> = (0..100).map(|i| format!("query_{}", i)).collect();
9674
9675 let handles: Vec<_> = (0..4)
9676 .map(|_| {
9677 let interner = Arc::clone(&interner);
9678 let queries = queries.clone();
9679
9680 thread::spawn(move || {
9681 for _ in 0..10 {
9682 for query in &queries {
9683 let _ = interner.intern(query);
9684 }
9685 }
9686 })
9687 })
9688 .collect();
9689
9690 for handle in handles {
9691 handle.join().unwrap();
9692 }
9693
9694 for query in &queries {
9696 let s1 = interner.intern(query);
9697 let s2 = interner.intern(query);
9698 assert!(Arc::ptr_eq(&s1, &s2));
9699 }
9700 }
9701
9702 #[test]
9707 fn query_terms_lower_basic() {
9708 let terms = QueryTermsLower::from_query("Hello World");
9709
9710 assert_eq!(terms.query_lower, "hello world");
9711 let tokens: Vec<&str> = terms.tokens().collect();
9712 assert_eq!(tokens, vec!["hello", "world"]);
9713 }
9714
9715 #[test]
9716 fn query_terms_lower_empty() {
9717 let terms = QueryTermsLower::from_query("");
9718
9719 assert!(terms.is_empty());
9720 assert_eq!(terms.tokens().count(), 0);
9721 }
9722
9723 #[test]
9724 fn query_terms_lower_single_term() {
9725 let terms = QueryTermsLower::from_query("TEST");
9726
9727 let tokens: Vec<&str> = terms.tokens().collect();
9728 assert_eq!(tokens, vec!["test"]);
9729 }
9730
9731 #[test]
9732 fn query_terms_lower_with_punctuation() {
9733 let terms = QueryTermsLower::from_query("hello, world! how's it?");
9734
9735 let tokens: Vec<&str> = terms.tokens().collect();
9736 assert_eq!(tokens, vec!["hello", "world", "how", "s", "it"]);
9737 }
9738
9739 #[test]
9740 fn query_terms_lower_unicode() {
9741 let terms = QueryTermsLower::from_query("Héllo Wörld");
9742
9743 assert_eq!(terms.query_lower, "héllo wörld");
9744 let tokens: Vec<&str> = terms.tokens().collect();
9745 assert_eq!(tokens, vec!["héllo", "wörld"]);
9746 }
9747
9748 #[test]
9749 fn query_terms_lower_bloom_mask() {
9750 let terms = QueryTermsLower::from_query("test");
9751
9752 assert_ne!(terms.bloom_mask(), 0);
9754
9755 let terms2 = QueryTermsLower::from_query("test");
9757 assert_eq!(terms.bloom_mask(), terms2.bloom_mask());
9758 }
9759
9760 #[test]
9761 fn hit_matches_with_precomputed_terms() {
9762 let hit = SearchHit {
9763 title: "Test Title".into(),
9764 snippet: "".into(),
9765 content: "hello world content".into(),
9766 content_hash: stable_content_hash("hello world content"),
9767 score: 1.0,
9768 source_path: "p".into(),
9769 agent: "a".into(),
9770 workspace: "w".into(),
9771 workspace_original: None,
9772 created_at: None,
9773 line_number: None,
9774 match_type: MatchType::Exact,
9775 source_id: "local".into(),
9776 origin_kind: "local".into(),
9777 origin_host: None,
9778 conversation_id: None,
9779 };
9780 let cached = cached_hit_from(&hit);
9781
9782 let terms = QueryTermsLower::from_query("hello");
9784 assert!(hit_matches_query_cached_precomputed(&cached, &terms));
9785
9786 let terms_miss = QueryTermsLower::from_query("missing");
9787 assert!(!hit_matches_query_cached_precomputed(&cached, &terms_miss));
9788 }
9789
9790 fn make_fused_hit(
9795 id: &str,
9796 rrf: f32,
9797 lexical: Option<usize>,
9798 semantic: Option<usize>,
9799 ) -> FusedHit {
9800 FusedHit {
9801 key: SearchHitKey {
9802 source_id: "local".to_string(),
9803 source_path: id.to_string(),
9804 conversation_id: None,
9805 title: String::new(),
9806 line_number: None,
9807 created_at: None,
9808 content_hash: 0,
9809 },
9810 score: HybridScore {
9811 rrf,
9812 lexical_rank: lexical,
9813 semantic_rank: semantic,
9814 lexical_score: None,
9815 semantic_score: None,
9816 },
9817 hit: SearchHit {
9818 title: id.into(),
9819 snippet: "".into(),
9820 content: "".into(),
9821 content_hash: 0,
9822 score: rrf,
9823 source_path: id.into(),
9824 agent: "test".into(),
9825 workspace: "test".into(),
9826 workspace_original: None,
9827 created_at: None,
9828 line_number: None,
9829 match_type: MatchType::Exact,
9830 source_id: "local".into(),
9831 origin_kind: "local".into(),
9832 origin_host: None,
9833 conversation_id: None,
9834 },
9835 }
9836 }
9837
9838 fn make_federated_merge_hit(id: &str, agent: &str) -> SearchHit {
9839 SearchHit {
9840 title: id.into(),
9841 snippet: String::new(),
9842 content: id.into(),
9843 content_hash: stable_content_hash(id),
9844 score: 0.0,
9845 source_path: format!("{id}.jsonl"),
9846 agent: agent.into(),
9847 workspace: "workspace".into(),
9848 workspace_original: None,
9849 created_at: Some(1_700_000_000_000),
9850 line_number: Some(1),
9851 match_type: MatchType::Exact,
9852 source_id: "local".into(),
9853 origin_kind: "local".into(),
9854 origin_host: None,
9855 conversation_id: None,
9856 }
9857 }
9858
9859 fn make_federated_ranked_hit(
9860 shard_index: usize,
9861 shard_rank: usize,
9862 id: &str,
9863 ) -> FederatedRankedHit {
9864 FederatedRankedHit {
9865 hit: make_federated_merge_hit(id, &format!("shard-{shard_index}")),
9866 shard_index,
9867 shard_rank,
9868 fused_score: federated_rrf_score(shard_rank),
9869 }
9870 }
9871
9872 #[test]
9873 fn federated_merge_orders_equal_rank_hits_by_stable_hit_key() {
9874 let merged = merge_federated_ranked_hits(vec![
9875 make_federated_ranked_hit(2, 0, "zeta"),
9876 make_federated_ranked_hit(0, 0, "bravo"),
9877 make_federated_ranked_hit(1, 0, "alpha"),
9878 ]);
9879
9880 let paths = merged
9881 .iter()
9882 .map(|hit| hit.source_path.as_str())
9883 .collect::<Vec<_>>();
9884 assert_eq!(paths, vec!["alpha.jsonl", "bravo.jsonl", "zeta.jsonl"]);
9885 assert!(
9886 merged
9887 .iter()
9888 .all(|hit| (hit.score - federated_rrf_score(0)).abs() < f32::EPSILON),
9889 "equal per-shard rank should produce equal RRF scores"
9890 );
9891 }
9892
9893 #[test]
9894 fn federated_merge_keeps_rrf_rank_ahead_of_stable_key() {
9895 let merged = merge_federated_ranked_hits(vec![
9896 make_federated_ranked_hit(0, 1, "alpha"),
9897 make_federated_ranked_hit(1, 0, "zeta"),
9898 ]);
9899
9900 let paths = merged
9901 .iter()
9902 .map(|hit| hit.source_path.as_str())
9903 .collect::<Vec<_>>();
9904 assert_eq!(paths, vec!["zeta.jsonl", "alpha.jsonl"]);
9905 assert!(merged[0].score > merged[1].score);
9906 }
9907
9908 #[test]
9909 fn federated_merge_uses_shard_index_as_duplicate_final_tiebreak() {
9910 let merged = merge_federated_ranked_hits(vec![
9911 FederatedRankedHit {
9912 hit: make_federated_merge_hit("same", "shard-2"),
9913 shard_index: 2,
9914 shard_rank: 0,
9915 fused_score: federated_rrf_score(0),
9916 },
9917 FederatedRankedHit {
9918 hit: make_federated_merge_hit("same", "shard-0"),
9919 shard_index: 0,
9920 shard_rank: 0,
9921 fused_score: federated_rrf_score(0),
9922 },
9923 ]);
9924
9925 assert_eq!(merged[0].agent, "shard-0");
9926 assert_eq!(merged[1].agent, "shard-2");
9927 }
9928
9929 #[test]
9930 fn top_k_fused_basic() {
9931 let hits = vec![
9932 make_fused_hit("a", 1.0, Some(0), None),
9933 make_fused_hit("b", 3.0, Some(1), None),
9934 make_fused_hit("c", 2.0, Some(2), None),
9935 make_fused_hit("d", 5.0, Some(3), None),
9936 make_fused_hit("e", 4.0, Some(4), None),
9937 ];
9938
9939 let top = top_k_fused(hits, 3);
9940
9941 assert_eq!(top.len(), 3);
9942 assert_eq!(top[0].key.source_path, "d"); assert_eq!(top[1].key.source_path, "e"); assert_eq!(top[2].key.source_path, "b"); }
9946
9947 #[test]
9948 fn top_k_fused_empty() {
9949 let hits: Vec<FusedHit> = vec![];
9950 let top = top_k_fused(hits, 10);
9951 assert!(top.is_empty());
9952 }
9953
9954 #[test]
9955 fn top_k_fused_k_zero() {
9956 let hits = vec![
9957 make_fused_hit("a", 1.0, Some(0), None),
9958 make_fused_hit("b", 2.0, Some(1), None),
9959 ];
9960 let top = top_k_fused(hits, 0);
9961 assert!(top.is_empty());
9962 }
9963
9964 #[test]
9965 fn top_k_fused_k_larger_than_n() {
9966 let hits = vec![
9967 make_fused_hit("a", 1.0, Some(0), None),
9968 make_fused_hit("b", 2.0, Some(1), None),
9969 ];
9970
9971 let top = top_k_fused(hits, 10);
9972
9973 assert_eq!(top.len(), 2);
9974 assert_eq!(top[0].key.source_path, "b"); assert_eq!(top[1].key.source_path, "a"); }
9977
9978 #[test]
9979 fn top_k_fused_k_equals_n() {
9980 let hits = vec![
9981 make_fused_hit("a", 3.0, Some(0), None),
9982 make_fused_hit("b", 1.0, Some(1), None),
9983 make_fused_hit("c", 2.0, Some(2), None),
9984 ];
9985
9986 let top = top_k_fused(hits, 3);
9987
9988 assert_eq!(top.len(), 3);
9989 assert_eq!(top[0].key.source_path, "a"); assert_eq!(top[1].key.source_path, "c"); assert_eq!(top[2].key.source_path, "b"); }
9993
9994 #[test]
9995 fn top_k_fused_k_one() {
9996 let hits = vec![
9997 make_fused_hit("a", 1.0, Some(0), None),
9998 make_fused_hit("b", 3.0, Some(1), None),
9999 make_fused_hit("c", 2.0, Some(2), None),
10000 ];
10001
10002 let top = top_k_fused(hits, 1);
10003
10004 assert_eq!(top.len(), 1);
10005 assert_eq!(top[0].key.source_path, "b");
10006 assert_eq!(top[0].score.rrf, 3.0);
10007 }
10008
10009 #[test]
10010 fn top_k_fused_duplicate_scores() {
10011 let hits = vec![
10012 make_fused_hit("a", 2.0, Some(0), None),
10013 make_fused_hit("b", 2.0, Some(1), None),
10014 make_fused_hit("c", 2.0, Some(2), None),
10015 make_fused_hit("d", 1.0, Some(3), None),
10016 ];
10017
10018 let top = top_k_fused(hits, 2);
10019
10020 assert_eq!(top.len(), 2);
10021 assert_eq!(top[0].score.rrf, 2.0);
10023 assert_eq!(top[1].score.rrf, 2.0);
10024 }
10025
10026 #[test]
10027 fn top_k_fused_dual_source_tiebreaker() {
10028 let hits = vec![
10030 make_fused_hit("a", 2.0, Some(0), None), make_fused_hit("b", 2.0, Some(1), Some(0)), make_fused_hit("c", 2.0, None, Some(1)), ];
10034
10035 let top = top_k_fused(hits, 3);
10036
10037 assert_eq!(top.len(), 3);
10038 assert_eq!(top[0].key.source_path, "b");
10040 }
10041
10042 #[test]
10043 fn top_k_fused_large_input_uses_quickselect() {
10044 let hits: Vec<FusedHit> = (0..100)
10046 .map(|i| make_fused_hit(&format!("hit_{}", i), i as f32, Some(i), None))
10047 .collect();
10048
10049 let top = top_k_fused(hits, 10);
10050
10051 assert_eq!(top.len(), 10);
10052 for (i, hit) in top.iter().enumerate() {
10054 assert_eq!(hit.key.source_path, format!("hit_{}", 99 - i));
10055 assert_eq!(hit.score.rrf, (99 - i) as f32);
10056 }
10057 }
10058
10059 #[test]
10060 fn top_k_fused_equivalence_with_full_sort() {
10061 for n in [10, 50, 100, 200] {
10063 for k in [1, 5, 10, 25] {
10064 if k > n {
10065 continue;
10066 }
10067
10068 let hits: Vec<FusedHit> = (0..n)
10069 .map(|i| {
10070 let score = ((i * 17 + 7) % 1000) as f32;
10072 make_fused_hit(&format!("hit_{}", i), score, Some(i), None)
10073 })
10074 .collect();
10075
10076 let mut baseline = hits.clone();
10078 baseline.sort_by(cmp_fused_hit_desc);
10079 baseline.truncate(k);
10080
10081 let quickselect = top_k_fused(hits, k);
10083
10084 assert_eq!(quickselect.len(), baseline.len(), "n={}, k={}", n, k);
10086
10087 for (q, b) in quickselect.iter().zip(baseline.iter()) {
10089 assert_eq!(
10090 q.key.source_path, b.key.source_path,
10091 "n={}, k={}: mismatch",
10092 n, k
10093 );
10094 assert_eq!(q.score.rrf, b.score.rrf, "n={}, k={}: score mismatch", n, k);
10095 }
10096 }
10097 }
10098 }
10099
10100 #[test]
10101 fn cmp_fused_hit_desc_basic_ordering() {
10102 let a = make_fused_hit("a", 2.0, Some(0), None);
10103 let b = make_fused_hit("b", 3.0, Some(1), None);
10104
10105 assert_eq!(cmp_fused_hit_desc(&a, &b), CmpOrdering::Greater);
10107 assert_eq!(cmp_fused_hit_desc(&b, &a), CmpOrdering::Less);
10108 assert_eq!(cmp_fused_hit_desc(&a, &a), CmpOrdering::Equal);
10109 }
10110
10111 #[test]
10116 fn cache_enforces_prefix_matching() {
10117 let hit = SearchHit {
10119 title: "test".into(),
10120 snippet: "".into(),
10121 content: "arrow".into(),
10122 content_hash: stable_content_hash("arrow"),
10123 score: 1.0,
10124 source_path: "p".into(),
10125 agent: "a".into(),
10126 workspace: "w".into(),
10127 workspace_original: None,
10128 created_at: None,
10129 line_number: None,
10130 match_type: MatchType::Exact,
10131 source_id: "local".into(),
10132 origin_kind: "local".into(),
10133 origin_host: None,
10134 conversation_id: None,
10135 };
10136
10137 let cached = CachedHit {
10138 hit: hit.clone(),
10139 lc_content: "arrow".into(),
10140 lc_title: Some("test".into()),
10141 bloom64: u64::MAX, };
10143
10144 let matched = hit_matches_query_cached(&cached, "row");
10147
10148 assert!(
10149 !matched,
10150 "Query 'row' should NOT match content 'arrow' (prefix match required)"
10151 );
10152 }
10153
10154 #[test]
10155 fn search_deduplication_across_pages_repro() {
10156 let dir = TempDir::new().unwrap();
10161 let index_path = dir.path();
10162 let mut index = TantivyIndex::open_or_create(index_path).unwrap();
10163
10164 let msg1 = NormalizedMessage {
10168 idx: 0,
10169 role: "user".into(),
10170 author: None,
10171 created_at: Some(1000),
10172 content: "duplicate content".into(),
10173 extra: serde_json::json!({}),
10174 snippets: Vec::new(),
10175 invocations: Vec::new(),
10176 };
10177 let conv1 = NormalizedConversation {
10178 agent_slug: "agent1".into(),
10179 external_id: None,
10180 title: None,
10181 workspace: None,
10182 source_path: "path/1".into(),
10183 started_at: None,
10184 ended_at: None,
10185 metadata: serde_json::json!({}),
10186 messages: vec![msg1],
10187 };
10188
10189 let msg2 = NormalizedMessage {
10190 idx: 0,
10191 role: "user".into(),
10192 author: None,
10193 created_at: Some(2000), content: "duplicate content".into(), extra: serde_json::json!({}),
10196 snippets: Vec::new(),
10197 invocations: Vec::new(),
10198 };
10199 let conv2 = NormalizedConversation {
10200 agent_slug: "agent1".into(),
10201 external_id: None,
10202 title: None,
10203 workspace: None,
10204 source_path: "path/2".into(), started_at: None,
10206 ended_at: None,
10207 metadata: serde_json::json!({}),
10208 messages: vec![msg2],
10209 };
10210
10211 index.add_conversation(&conv1).unwrap();
10212 index.add_conversation(&conv2).unwrap();
10213 index.commit().unwrap();
10214
10215 let client = SearchClient::open(index_path, None).unwrap().unwrap();
10216
10217 let page1 = client
10219 .search("duplicate", SearchFilters::default(), 1, 0, FieldMask::FULL)
10220 .unwrap();
10221 assert_eq!(page1.len(), 1);
10222
10223 let page2 = client
10225 .search("duplicate", SearchFilters::default(), 1, 1, FieldMask::FULL)
10226 .unwrap();
10227
10228 assert_eq!(page2.len(), 1);
10229 assert_ne!(page1[0].source_path, page2[0].source_path);
10230 }
10231
10232 #[test]
10233 fn cache_skips_complex_queries() {
10234 let client = SearchClient {
10235 reader: None,
10236 sqlite: Mutex::new(None),
10237 sqlite_path: None,
10238 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
10239 reload_on_search: true,
10240 last_reload: Mutex::new(None),
10241 last_generation: Mutex::new(None),
10242 reload_epoch: Arc::new(AtomicU64::new(0)),
10243 warm_tx: None,
10244 _warm_handle: None,
10245 metrics: Metrics::default(),
10246 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
10247 semantic: Mutex::new(None),
10248 last_tantivy_total_count: Mutex::new(None),
10249 };
10250
10251 let _ = client.search("foo*", SearchFilters::default(), 10, 0, FieldMask::FULL);
10253 let stats = client.cache_stats();
10254 assert_eq!(
10255 stats.cache_miss, 0,
10256 "Wildcard query should not trigger cache miss"
10257 );
10258
10259 let _ = client.search(
10261 "foo OR bar",
10262 SearchFilters::default(),
10263 10,
10264 0,
10265 FieldMask::FULL,
10266 );
10267 let stats = client.cache_stats();
10268 assert_eq!(
10269 stats.cache_miss, 0,
10270 "Boolean query should not trigger cache miss"
10271 );
10272
10273 let _ = client.search("simple", SearchFilters::default(), 10, 0, FieldMask::FULL);
10275 let stats = client.cache_stats();
10276 assert_eq!(
10277 stats.cache_miss, 1,
10278 "Simple query should trigger cache miss"
10279 );
10280 }
10281
10282 #[test]
10283 fn cache_prefix_lookup_handles_utf8_boundaries() {
10284 let client = SearchClient {
10285 reader: None,
10286 sqlite: Mutex::new(None),
10287 sqlite_path: None,
10288 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
10289 reload_on_search: true,
10290 last_reload: Mutex::new(None),
10291 last_generation: Mutex::new(None),
10292 reload_epoch: Arc::new(AtomicU64::new(0)),
10293 warm_tx: None,
10294 _warm_handle: None,
10295 metrics: Metrics::default(),
10296 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
10297 semantic: Mutex::new(None),
10298 last_tantivy_total_count: Mutex::new(None),
10299 };
10300
10301 let hits = vec![SearchHit {
10302 title: "こんにちは".into(),
10303 snippet: String::new(),
10304 content: "こんにちは 世界".into(),
10305 content_hash: stable_content_hash("こんにちは 世界"),
10306 score: 1.0,
10307 source_path: "p".into(),
10308 agent: "a".into(),
10309 workspace: "w".into(),
10310 workspace_original: None,
10311 created_at: None,
10312 line_number: None,
10313 match_type: MatchType::Exact,
10314 source_id: "local".into(),
10315 origin_kind: "local".into(),
10316 origin_host: None,
10317 conversation_id: None,
10318 }];
10319
10320 client.put_cache("こん", &SearchFilters::default(), &hits);
10321
10322 let cached = client
10323 .cached_prefix_hits("こんにちは", &SearchFilters::default())
10324 .unwrap();
10325 assert_eq!(cached.len(), 1);
10326 assert_eq!(cached[0].hit.title, "こんにちは");
10327 }
10328
10329 #[test]
10330 fn bloom_gate_rejects_missing_terms() {
10331 let hit = SearchHit {
10332 title: "hello world".into(),
10333 snippet: "hello world".into(),
10334 content: "hello world".into(),
10335 content_hash: stable_content_hash("hello world"),
10336 score: 1.0,
10337 source_path: "p".into(),
10338 agent: "a".into(),
10339 workspace: "w".into(),
10340 workspace_original: None,
10341 created_at: None,
10342 line_number: None,
10343 match_type: MatchType::Exact,
10344 source_id: "local".into(),
10345 origin_kind: "local".into(),
10346 origin_host: None,
10347 conversation_id: None,
10348 };
10349 let cached = cached_hit_from(&hit);
10350 assert!(hit_matches_query_cached(&cached, "hello"));
10351 assert!(!hit_matches_query_cached(&cached, "missing"));
10352
10353 let metrics = Metrics::default();
10354 metrics.inc_cache_hits();
10355 metrics.inc_cache_miss();
10356 metrics.inc_cache_shortfall();
10357 metrics.inc_reload();
10358 let (hits, miss, shortfall, reloads, _) = metrics.snapshot_all();
10359 assert_eq!((hits, miss, shortfall, reloads), (1, 1, 1, 1));
10360 }
10361
10362 #[test]
10363 fn progressive_lexical_hit_omits_unused_content() {
10364 let hit = SearchHit {
10365 title: "hello world".into(),
10366 snippet: "hello **world**".into(),
10367 content: "hello world from a much larger conversation body".into(),
10368 content_hash: stable_content_hash("hello world from a much larger conversation body"),
10369 score: 1.0,
10370 source_path: "p".into(),
10371 agent: "a".into(),
10372 workspace: "w".into(),
10373 workspace_original: None,
10374 created_at: None,
10375 line_number: Some(3),
10376 match_type: MatchType::Exact,
10377 source_id: "local".into(),
10378 origin_kind: "local".into(),
10379 origin_host: None,
10380 conversation_id: None,
10381 };
10382
10383 let snippet_only =
10384 ProgressiveLexicalHit::from_search_hit(&hit, FieldMask::new(false, true, true, true));
10385 assert_eq!(snippet_only.title, hit.title);
10386 assert_eq!(snippet_only.snippet, hit.snippet);
10387 assert!(
10388 snippet_only.content.is_empty(),
10389 "snippet-only progressive cache should not retain full content"
10390 );
10391 assert_eq!(snippet_only.match_type, hit.match_type);
10392 assert_eq!(snippet_only.line_number, hit.line_number);
10393 assert_eq!(snippet_only.source_path, hit.source_path);
10394 assert_eq!(snippet_only.agent, hit.agent);
10395 assert_eq!(snippet_only.workspace, hit.workspace);
10396
10397 let full =
10398 ProgressiveLexicalHit::from_search_hit(&hit, FieldMask::new(true, true, true, true));
10399 assert_eq!(full.content, hit.content);
10400 }
10401
10402 #[test]
10403 fn progressive_phase_reuses_lexical_cache_without_db_hydration() -> Result<()> {
10404 let client = SearchClient {
10405 reader: None,
10406 sqlite: Mutex::new(None),
10407 sqlite_path: None,
10408 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
10409 reload_on_search: true,
10410 last_reload: Mutex::new(None),
10411 last_generation: Mutex::new(None),
10412 reload_epoch: Arc::new(AtomicU64::new(0)),
10413 warm_tx: None,
10414 _warm_handle: None,
10415 metrics: Metrics::default(),
10416 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
10417 semantic: Mutex::new(None),
10418 last_tantivy_total_count: Mutex::new(None),
10419 };
10420 let field_mask = FieldMask::new(false, true, true, true);
10421 let lexical_hit = SearchHit {
10422 title: "lexical title".into(),
10423 snippet: "lexical snippet".into(),
10424 content: "full lexical body".into(),
10425 content_hash: stable_content_hash("full lexical body"),
10426 score: 0.0,
10427 source_path: "/tmp/session.jsonl".into(),
10428 agent: "codex".into(),
10429 workspace: "/tmp".into(),
10430 workspace_original: Some("/original".into()),
10431 created_at: Some(1_700_000_000_000),
10432 line_number: Some(7),
10433 match_type: MatchType::Exact,
10434 source_id: "local".into(),
10435 origin_kind: "local".into(),
10436 origin_host: None,
10437 conversation_id: None,
10438 };
10439 let mut lexical_cache = ProgressiveLexicalCache::default();
10440 lexical_cache.hits_by_message.insert(
10441 42,
10442 ProgressiveLexicalHit::from_search_hit(&lexical_hit, field_mask),
10443 );
10444
10445 let hash_hex = "00".repeat(32);
10446 let results = vec![FsScoredResult {
10447 doc_id: format!("m|42|0|1|1|1|1|1700000000000|{hash_hex}"),
10448 score: 0.91,
10449 source: FsScoreSource::Lexical,
10450 index: None,
10451 fast_score: None,
10452 quality_score: None,
10453 lexical_score: Some(0.91),
10454 rerank_score: None,
10455 explanation: None,
10456 metadata: None,
10457 }];
10458
10459 let result = client.progressive_phase_to_result(
10460 &results,
10461 ProgressivePhaseContext {
10462 query: "merged title",
10463 filters: &SearchFilters::default(),
10464 field_mask,
10465 lexical_cache: Some(&lexical_cache),
10466 limit: 1,
10467 fetch_limit: 1,
10468 },
10469 )?;
10470
10471 assert_eq!(result.hits.len(), 1);
10472 assert_eq!(result.hits[0].title, lexical_hit.title);
10473 assert_eq!(result.hits[0].snippet, lexical_hit.snippet);
10474 assert!(
10475 result.hits[0].content.is_empty(),
10476 "masked lexical cache should still avoid carrying full content"
10477 );
10478 assert_eq!(result.hits[0].source_path, lexical_hit.source_path);
10479 assert_eq!(result.hits[0].score, 0.91);
10480
10481 Ok(())
10482 }
10483
/// Indexes one `codex` conversation and verifies that a query for "hello",
/// restricted to the `codex` agent (limit 10, offset 0), returns exactly that
/// message with a snippet containing the query term.
#[test]
fn search_returns_results_with_filters_and_pagination() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    // A lone user message carrying one snippet record with every field unset.
    let greeting = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "hello rust world".into(),
        extra: serde_json::json!({}),
        snippets: vec![NormalizedSnippet {
            file_path: None,
            start_line: None,
            end_line: None,
            language: None,
            snippet_text: None,
        }],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("hello world convo".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: index_dir.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![greeting],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    let mut agent_filter = SearchFilters::default();
    agent_filter.agents.insert("codex".into());

    let found = searcher.search("hello", agent_filter, 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert_eq!(only.agent, "codex");
    assert!(only.snippet.contains("hello"));
    Ok(())
}
10527
/// Indexes two "needle" conversations that differ in workspace and timestamp,
/// then checks that filtering on workspace `/ws/b` together with the
/// created-at window 15..=25 yields only the second conversation.
#[test]
fn search_honors_created_range_and_workspace() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    // The two fixtures are identical apart from title, workspace, file name,
    // timestamp and message body, so build them from one template.
    let make_conversation = |title: &str, workspace: &str, file_name: &str, at, body: &str| {
        NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: index_dir.path().join(file_name),
            started_at: Some(at),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(at),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![NormalizedSnippet {
                    file_path: None,
                    start_line: None,
                    end_line: None,
                    language: None,
                    snippet_text: None,
                }],
                invocations: Vec::new(),
            }],
        }
    };

    let first = make_conversation("needle one", "/ws/a", "a.jsonl", 10, "alpha needle");
    let second = make_conversation(
        "needle two",
        "/ws/b",
        "b.jsonl",
        20,
        "\nneedle second line",
    );
    writer.add_conversation(&first)?;
    writer.add_conversation(&second)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    // Only the second conversation sits in /ws/b and inside the 15..=25 window.
    let window_in_ws_b = SearchFilters {
        workspaces: HashSet::from_iter(["/ws/b".to_string()]),
        created_from: Some(15),
        created_to: Some(25),
        ..SearchFilters::default()
    };

    let found = searcher.search("needle", window_in_ws_b, 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);
    let only = &found[0];
    assert_eq!(only.workspace, "/ws/b");
    assert!(only.snippet.contains("second line"));
    Ok(())
}
10601
/// Indexes three conversations that all match "pagination" and checks that
/// requesting one result at offset 1 returns exactly one hit.
#[test]
fn pagination_skips_results() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    for n in 0..3 {
        // Each document gets its own timestamp, title, file and message body.
        let stamp = 100 + n;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(stamp),
            content: format!("pagination needle document number {n}"),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws/p")),
            source_path: index_dir.path().join(format!("{n}.jsonl")),
            started_at: Some(stamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        writer.add_conversation(&conversation)?;
    }
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    // Page size 1, skipping the first result.
    let (page_size, skip) = (1, 1);
    let page = searcher.search(
        "pagination",
        SearchFilters::default(),
        page_size,
        skip,
        FieldMask::FULL,
    )?;
    assert_eq!(page.len(), 1);
    Ok(())
}
10649
/// Indexes a message mentioning "CMA-ES" and checks that the hyphenated query
/// "cma-es" finds it, with a snippet that contains "cma" (case-insensitively).
#[test]
fn search_matches_hyphenated_term() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "Need CMA-ES strategy and CMA ES variants".into(),
        extra: serde_json::json!({}),
        snippets: vec![NormalizedSnippet {
            file_path: None,
            start_line: None,
            end_line: None,
            language: None,
            snippet_text: None,
        }],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("cma-es notes".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: index_dir.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");
    let found = searcher.search("cma-es", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);

    let snippet_lower = found[0].snippet.to_lowercase();
    assert!(snippet_lower.contains("cma"));
    Ok(())
}
10689
/// Indexes "please calculate the entropy" and checks that the partial words
/// "cal" and "entr" each find the message.
#[test]
fn search_matches_prefix_edge_ngram() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1000),
        content: "please calculate the entropy".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("math logic".into()),
        workspace: Some(std::path::PathBuf::from("/ws/m")),
        source_path: index_dir.path().join("math.jsonl"),
        started_at: Some(1000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    // "cal" is a prefix of "calculate"; the full content must come back.
    let by_cal = searcher.search("cal", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_cal.len(), 1);
    assert!(by_cal[0].content.contains("calculate"));

    // "entr" is a prefix of "entropy".
    let by_entr = searcher.search("entr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(by_entr.len(), 1);

    Ok(())
}
10730
/// Indexes a message containing `my_variable_name` and checks that both the
/// inner fragment "vari" and the partial identifier "my_variable" find it.
#[test]
fn search_matches_snake_case() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "check the my_variable_name please".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("code".into()),
        workspace: None,
        source_path: index_dir.path().join("c.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    // Each query is expected to resolve to the single indexed message.
    for query in ["vari", "my_variable"] {
        let found = searcher.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(found.len(), 1);
    }

    Ok(())
}
10776
/// Indexes a message containing "c++" and "foo.bar" and checks that querying
/// either symbol-laden term finds it.
#[test]
fn search_matches_symbols_stripped() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "working with c++ and foo.bar today".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("symbols".into()),
        workspace: None,
        source_path: index_dir.path().join("s.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    // Both punctuation-bearing queries must resolve to the single message.
    for query in ["c++", "foo.bar"] {
        let found = searcher.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(found.len(), 1);
    }

    Ok(())
}
10816
/// Checks that the `match_type` reported on the top hit reflects the wildcard
/// shape of the query: no wildcard, trailing `*`, leading `*`, and both.
///
/// Each result set is checked for a first hit with a descriptive message, so
/// a query that unexpectedly returns nothing fails with the offending query
/// named instead of an index-out-of-bounds panic.
#[test]
fn search_sets_match_type_for_wildcards() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: dir.path().join("h.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1),
            content: "the request handler delegates".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Plain term: no wildcard characters.
    let exact = client.search("handler", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    let exact_top = exact
        .first()
        .expect("query `handler` should return at least one hit");
    assert_eq!(exact_top.match_type, MatchType::Exact);

    // Trailing wildcard.
    let prefix = client.search("hand*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    let prefix_top = prefix
        .first()
        .expect("query `hand*` should return at least one hit");
    assert_eq!(prefix_top.match_type, MatchType::Prefix);

    // Leading wildcard.
    let suffix = client.search("*handler", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    let suffix_top = suffix
        .first()
        .expect("query `*handler` should return at least one hit");
    assert_eq!(suffix_top.match_type, MatchType::Suffix);

    // Wildcards on both sides.
    let substring =
        client.search("*andle*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    let substring_top = substring
        .first()
        .expect("query `*andle*` should return at least one hit");
    assert_eq!(substring_top.match_type, MatchType::Substring);

    Ok(())
}
10862
/// Indexes "the request handler delegates" and queries the inner fragment
/// "andle" through `search_with_fallback`, expecting the result to be flagged
/// as a wildcard fallback whose single hit is an `ImplicitWildcard` match.
#[test]
fn search_with_fallback_marks_implicit_wildcard() -> Result<()> {
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "the request handler delegates".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: index_dir.path().join("h2.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(index_dir.path(), None)?.expect("index present");

    let outcome = searcher.search_with_fallback(
        "andle",
        SearchFilters::default(),
        10,
        0,
        2,
        FieldMask::FULL,
    )?;

    assert!(outcome.wildcard_fallback);
    assert_eq!(outcome.hits.len(), 1);
    let only = &outcome.hits[0];
    assert_eq!(only.match_type, MatchType::ImplicitWildcard);

    Ok(())
}
10908
/// Builds a client with no Tantivy reader and a bare in-memory SQLite
/// connection (no tables created), then checks that a leading-wildcard query
/// returns an empty result set rather than an error.
#[test]
fn sqlite_backend_skips_wildcard_queries() -> Result<()> {
    let memory_db = Connection::open(":memory:")?;

    let searcher = SearchClient {
        // Backends: SQLite only.
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(memory_db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // Caching and reload bookkeeping in their initial state.
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_tantivy_total_count: Mutex::new(None),
        // No background warm-up worker.
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let found = searcher.search("*handler", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert!(
        found.is_empty(),
        "wildcard should skip sqlite fallback, not error"
    );

    Ok(())
}
10938
/// Seeds an in-memory SQLite database with one conversation whose
/// `workspace_id` is NULL (and whose FTS `workspace` column is NULL), then
/// checks that the SQLite-only client still returns the hit, with an empty
/// workspace string and the expected provenance fields.
#[test]
fn sqlite_backend_handles_null_workspace() -> Result<()> {
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
         );";

    let memory_db = Connection::open(":memory:")?;
    memory_db.execute_batch(schema_sql)?;

    // Relational rows: the conversation deliberately has no workspace.
    for statement in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 't', '/tmp/session.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)",
    ] {
        memory_db.execute(statement)?;
    }

    // Matching FTS row, again with a NULL workspace.
    memory_db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
        params![
            1_i64,
            "auth token failure",
            "t",
            "codex",
            "/tmp/session.jsonl",
            42_i64
        ],
    )?;

    let searcher = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(memory_db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let found = searcher.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.workspace, "");
    assert_eq!(only.line_number, Some(1));
    assert_eq!(only.source_id, "local");
    assert_eq!(only.origin_kind, "local");
    Ok(())
}
11017
/// Seeds an in-memory SQLite database whose `fts_messages` table has an
/// explicit `message_id UNINDEXED` column, with FTS rowid 1 pointing at
/// message id 42, and checks that the SQLite-only client returns the title,
/// path, workspace, line number and content of that message.
#[test]
fn sqlite_backend_supports_legacy_fts_message_id_schema() -> Result<()> {
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            message_id UNINDEXED,
            tokenize='porter'
         );";

    let memory_db = Connection::open(":memory:")?;
    memory_db.execute_batch(schema_sql)?;

    // Relational rows; the message is stored with id 42 and idx 4.
    for statement in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/legacy')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, 1, 'local', NULL, 'legacy title', '/tmp/legacy.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(42, 1, 4, 'legacy auth token failure', 99)",
    ] {
        memory_db.execute(statement)?;
    }

    // FTS row: rowid differs from the message id, which is carried in `message_id`.
    memory_db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at, message_id)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
        params![
            1_i64,
            "legacy auth token failure",
            "legacy title",
            "codex",
            "/legacy",
            "/tmp/legacy.jsonl",
            99_i64,
            42_i64
        ],
    )?;

    let searcher = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(memory_db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let found = searcher.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(found.len(), 1);

    let only = &found[0];
    assert_eq!(only.title, "legacy title");
    assert_eq!(only.source_path, "/tmp/legacy.jsonl");
    assert_eq!(only.workspace, "/legacy");
    assert_eq!(only.line_number, Some(5));
    assert_eq!(only.content, "legacy auth token failure");
    Ok(())
}
11104
/// Pairs an empty Tantivy index with a SQLite database that does contain a
/// matching row. Calling `search_sqlite_fts5` directly proves the SQLite side
/// can answer the query; the regular `search` entry point must nevertheless
/// return nothing while a Tantivy reader is present.
#[test]
fn tantivy_reader_skips_sqlite_fallback_on_empty_lexical_results() -> Result<()> {
    // Empty, committed Tantivy index with a manually reloaded reader.
    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;
    writer.commit()?;
    let tantivy_reader = fs_cass_open_search_reader(index_dir.path(), ReloadPolicy::Manual).ok();
    assert!(
        tantivy_reader.is_some(),
        "test fixture should open a Tantivy reader even with an empty index"
    );

    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
         );";

    let memory_db = Connection::open(":memory:")?;
    memory_db.execute_batch(schema_sql)?;

    // One conversation that exists only on the SQLite side.
    for statement in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/sqlite-only')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, 1, 'local', NULL, 'sqlite fallback only', '/tmp/sqlite-only.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(1, 1, 0, 'sqliteonlytoken overflow candidate', 42)",
    ] {
        memory_db.execute(statement)?;
    }
    memory_db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            1_i64,
            "sqliteonlytoken overflow candidate",
            "sqlite fallback only",
            "codex",
            "/sqlite-only",
            "/tmp/sqlite-only.jsonl",
            42_i64
        ],
    )?;

    let searcher = SearchClient {
        reader: tantivy_reader,
        sqlite: Mutex::new(Some(SendConnection(memory_db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    // Sanity check: the SQLite path on its own does find the row.
    let direct_sqlite_hits = searcher.search_sqlite_fts5(
        Path::new(":memory:"),
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(
        direct_sqlite_hits.len(),
        1,
        "fixture should prove sqlite fallback would have produced a hit"
    );

    // The public entry point must not surface that row.
    let via_search = searcher.search(
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        via_search.is_empty(),
        "a live Tantivy reader should prevent sqlite fallback from populating empty lexical results"
    );
    Ok(())
}
11217
/// Creates a file-backed database with one conversation, deletes the
/// `fts_frankensqlite_rebuild_generation` row from `meta`, and then opens the
/// database through `SearchClient::sqlite_guard`. The guard must apply the
/// search-specific cache size, must not recreate the deleted meta row, and
/// must leave the number of `fts_messages` schema rows unchanged.
#[test]
fn sqlite_guard_does_not_repair_fts_when_generation_key_stale() -> Result<()> {
    const GENERATION_KEY: &str = "fts_frankensqlite_rebuild_generation";

    let scratch = TempDir::new()?;
    let db_path = scratch.path().join("stale-gen-fts.db");

    // Seed the database; the storage handle is dropped at the end of the scope.
    {
        let storage = FrankenStorage::open(&db_path)?;
        let codex = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&codex)?;

        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_050),
            content: "message that should remain queryable".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let seeded = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("stale-gen-fts".into()),
            title: Some("Stale FTS generation".into()),
            source_path: PathBuf::from("/tmp/stale-gen-fts.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: "local".into(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, None, &seeded)?;
    }

    let schema_rows_before = sqlite_master_name_count(&db_path, "fts_messages")
        .context("count schema rows before generation key deletion")?;

    // Remove the generation marker through a short-lived raw connection.
    {
        let raw = FrankenConnection::open(db_path.to_string_lossy().into_owned())?;
        raw.execute_compat(
            "DELETE FROM meta WHERE key = ?1",
            &[ParamValue::from(GENERATION_KEY)],
        )?;
    }

    let searcher = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: Some(db_path.clone()),
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    // Open through the guard and inspect the cache profile it configured.
    let guard = searcher
        .sqlite_guard()
        .context("open sqlite guard for stale generation fixture")?;
    assert!(guard.is_some(), "sqlite guard should open the db");
    let guarded_conn = guard
        .as_ref()
        .expect("sqlite guard should hold a connection");
    let no_params: [ParamValue; 0] = [];
    let cache_size: i64 =
        guarded_conn.query_row_map("PRAGMA cache_size;", &no_params, |row| row.get_typed(0))?;
    assert_eq!(
        cache_size, -SEARCH_SQLITE_HYDRATION_CACHE_KIB,
        "search hydration should not inherit the general storage cache profile"
    );
    drop(guard);

    // Re-inspect the file independently of the client.
    let verifier = FrankenConnection::open(db_path.to_string_lossy().into_owned())?;
    let generation_after: Option<String> = verifier
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            &[ParamValue::from(GENERATION_KEY)],
            |row| row.get_typed(0),
        )
        .optional()?;
    assert!(
        generation_after.is_none(),
        "search sqlite guard must not mutate FTS rebuild metadata"
    );

    let schema_rows_after = sqlite_master_name_count(&db_path, "fts_messages")
        .context("count schema rows after sqlite guard reopen")?;
    assert_eq!(
        schema_rows_after, schema_rows_before,
        "read-only reopen must leave FTS schema state unchanged"
    );

    Ok(())
}
11334
/// Exercises the file-backed SQLite fallback (no Tantivy reader, connection
/// opened lazily from `sqlite_path`) with the hyphenated identifier `br-123`.
///
/// Two conversations in different workspaces both mention `br-123`. The test
/// checks that:
/// * the FTS rows survive closing and reopening the database file;
/// * the plain, leading-`OR`, dotted, dotted-prefix and prefix-wildcard
///   forms of the query each return both messages;
/// * adding a workspace filter for `/ws/beta` narrows the result to the
///   single matching conversation.
#[test]
fn sqlite_path_rusqlite_fallback_matches_hyphenated_ids_with_workspace_filter() -> Result<()> {
    /// Counts `fts_messages` rows matching `fts_query`, using whichever MATCH
    /// clause `SearchClient` selects for this connection.
    ///
    /// Returns `Ok(None)` when the engine reports
    /// `no such function: MATCH/2`, so callers can skip the count assertion
    /// instead of failing.
    fn fts_match_count(conn: &FrankenConnection, fts_query: &str) -> Result<Option<usize>> {
        let match_mode = SearchClient::sqlite_fts_match_mode(conn)?;
        let sql = format!(
            "SELECT COUNT(*) FROM fts_messages WHERE {}",
            SearchClient::sqlite_fts5_match_clause(match_mode)
        );
        let mut bind_params = Vec::new();
        SearchClient::push_sqlite_fts5_match_params(&mut bind_params, fts_query, match_mode);
        match franken_query_map_collect_retry(conn, &sql, &bind_params, |row| row.get_typed(0)) {
            Ok(rows) => {
                let count: i64 = rows.into_iter().next().unwrap_or(0);
                // Clamp negatives to zero and saturate on overflow rather than panic.
                Ok(Some(usize::try_from(count.max(0)).unwrap_or(usize::MAX)))
            }
            Err(err) if err.to_string().contains("no such function: MATCH/2") => Ok(None),
            Err(err) => Err(err.into()),
        }
    }

    let temp_dir = TempDir::new()?;
    let db_path = temp_dir.path().join("hyphenated-rusqlite-fallback.db");

    // Seed the database file; `storage` is dropped at the end of this scope so
    // the client below has to reopen the file itself.
    {
        let storage = FrankenStorage::open(&db_path)?;
        storage.ensure_search_fallback_fts_consistency()?;
        let conn = storage.raw();
        conn.execute(
            "INSERT INTO agents(id, slug, name, kind, created_at, updated_at)
             VALUES(1, 'codex', 'Codex', 'codex', 1, 1)",
        )?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws/alpha')")?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/ws/beta')")?;
        conn.execute(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'alpha bead', '/tmp/alpha.jsonl')",
        )?;
        conn.execute(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(2, 1, 2, 'local', NULL, 'beta bead', '/tmp/beta.jsonl')",
        )?;
        conn.execute(
            "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
             VALUES(11, 1, 0, 'user', 'Need follow-up on br-123 root cause', 100)",
        )?;
        conn.execute(
            "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
             VALUES(12, 2, 0, 'user', 'Need follow-up on br-123 user report', 101)",
        )?;
        // FTS rows use the message ids (11 and 12) as their rowids.
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            &[
                ParamValue::from(11_i64),
                ParamValue::from("Need follow-up on br-123 root cause"),
                ParamValue::from("alpha bead"),
                ParamValue::from("codex"),
                ParamValue::from("/ws/alpha"),
                ParamValue::from("/tmp/alpha.jsonl"),
                ParamValue::from(100_i64),
            ],
        )?;
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            &[
                ParamValue::from(12_i64),
                ParamValue::from("Need follow-up on br-123 user report"),
                ParamValue::from("beta bead"),
                ParamValue::from("codex"),
                ParamValue::from("/ws/beta"),
                ParamValue::from("/tmp/beta.jsonl"),
                ParamValue::from(101_i64),
            ],
        )?;
        let preclose_total_rows: i64 =
            conn.query_row_map("SELECT COUNT(*) FROM fts_messages", params![], |row| {
                row.get_typed(0)
            })?;
        assert_eq!(
            preclose_total_rows, 2,
            "freshly seeded file-backed FTS should retain the inserted rows"
        );
        let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
        // The count is only asserted when the engine supports MATCH here.
        if let Some(match_count) = fts_match_count(conn, transpiled.as_str())? {
            assert_eq!(
                match_count, 2,
                "freshly seeded file-backed FTS should match the transpiled hyphenated query before reopen"
            );
        }
    }

    // No reader and no pre-opened connection: only the database path is given.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: Some(db_path),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Repeat the row-count and MATCH checks through the client's own connection.
    let guard = client.sqlite_guard()?;
    let conn = guard.as_ref().expect("sqlite guard should reopen file db");
    let reopened_total_rows: i64 =
        conn.query_row_map("SELECT COUNT(*) FROM fts_messages", params![], |row| {
            row.get_typed(0)
        })?;
    assert_eq!(
        reopened_total_rows, 2,
        "reopened file-backed FTS should still contain the seeded rows"
    );
    let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
    if let Some(match_count) = fts_match_count(conn, transpiled.as_str())? {
        assert_eq!(
            match_count, 2,
            "reopened file-backed FTS should still match the transpiled hyphenated query"
        );
    }
    // Release the guard before calling `search` on the same client.
    drop(guard);

    let all_hits = client.search("br-123", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(all_hits.len(), 2);
    assert!(
        all_hits.iter().all(|hit| hit.content.contains("br-123")),
        "hyphenated bead IDs should survive the file-backed sqlite fallback path"
    );

    // A dangling leading operator must not change the result.
    let leading_or_hits = client.search(
        "OR br-123",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(leading_or_hits.len(), 2);

    let dotted_hits = client.search(
        "br-123.jsonl",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(dotted_hits.len(), 2);

    let dotted_prefix_hits = client.search(
        "br-123.json*",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(dotted_prefix_hits.len(), 2);

    let prefix_hits =
        client.search("br-12*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(prefix_hits.len(), 2);

    // Restricting to /ws/beta leaves only the second conversation.
    let filtered_hits = client.search(
        "br-123",
        SearchFilters {
            workspaces: HashSet::from_iter(["/ws/beta".to_string()]),
            ..SearchFilters::default()
        },
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(filtered_hits.len(), 1);
    assert_eq!(filtered_hits[0].workspace, "/ws/beta");
    assert_eq!(filtered_hits[0].source_path, "/tmp/beta.jsonl");
    assert!(filtered_hits[0].content.contains("br-123"));

    Ok(())
}
11520
/// Seeds two FTS rows that both match `auth` (one repeats the term three
/// times) and checks that `search` returns the denser row first with a
/// strictly higher score.
#[test]
fn sqlite_backend_orders_hits_by_bm25_score() -> Result<()> {
    // Relational tables plus the FTS5 virtual table seeded by this test.
    let schema_sql = "CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";
    // Parameterized statement shared by both FTS row inserts.
    let fts_insert_sql = "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    // Plain (non-FTS) rows, inserted in the same order as the FTS rows reference them.
    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/ws')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'best', '/tmp/best.jsonl')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'worse', '/tmp/worse.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(7, 1, 0, 'auth auth auth failure', 42)",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(8, 2, 0, 'auth failure', 43)",
    ] {
        db.execute(seed_sql)?;
    }

    db.execute_compat(
        fts_insert_sql,
        params![
            7_i64,
            "auth auth auth failure",
            "best",
            "codex",
            "/ws",
            "/tmp/best.jsonl",
            42_i64
        ],
    )?;
    db.execute_compat(
        fts_insert_sql,
        params![
            8_i64,
            "auth failure",
            "worse",
            "codex",
            "/ws",
            "/tmp/worse.jsonl",
            43_i64
        ],
    )?;

    // No Tantivy reader is configured; the client only has the in-memory SQLite connection.
    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The FTS5 helper called directly must see both rows.
    let fts_hits = search_client.search_sqlite_fts5(
        Path::new(":memory:"),
        "auth",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(fts_hits.len(), 2);

    // The public entry point must return them ordered by relevance.
    let ranked = search_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(ranked.len(), 2);
    let (top, runner_up) = (&ranked[0], &ranked[1]);
    assert_eq!(top.title, "best");
    assert_eq!(runner_up.title, "worse");
    assert!(top.score > runner_up.score);

    Ok(())
}
11626
/// Checks the generated SQL text only: the ranking query must not reference
/// `fts_messages.content` and must carry `LIMIT ? OFFSET ?`, while the
/// hydration query is the one that selects the content column.
#[test]
fn sqlite_fts5_ranked_phase_defers_content_decode_until_after_limit() {
    let default_filters = SearchFilters::default();
    let (ranking_sql, ranking_params) = SearchClient::sqlite_fts5_rank_query(
        "auth",
        &default_filters,
        50,
        0,
        false,
        SqliteFtsMatchMode::Table,
    );
    let every_field = FieldMask::new(true, true, true, true);
    let hydration_sql = SearchClient::sqlite_fts5_hydrate_query(2, every_field, false);

    let ranking_reads_content = ranking_sql.contains("fts_messages.content");
    assert!(
        !ranking_reads_content,
        "rank query must not decode large content rows before LIMIT"
    );

    let hydration_reads_content = hydration_sql.contains("fts_messages.content");
    assert!(
        hydration_reads_content,
        "hydration query should still provide requested content"
    );

    let ranking_is_paged = ranking_sql.contains("LIMIT ? OFFSET ?");
    assert!(
        ranking_is_paged,
        "rank query must apply page bounds before hydration"
    );

    assert_eq!(
        ranking_params.len(),
        3,
        "fts query plus limit and offset params"
    );
}
11657
/// Shows that a single hydration query for more than
/// `SQLITE_MAX_VARIABLE_NUMBER` rows would need too many `?` placeholders,
/// then checks that `sqlite_fts5_hydrate_row_chunks` splits ranked rows into
/// chunks of `SQLITE_FTS5_HYDRATE_PARAM_CHUNK` plus a remainder.
#[test]
fn sqlite_fts5_hydration_chunks_stay_below_bind_variable_limit() {
    // One row past the bind-variable ceiling, hydrated in one statement.
    let one_shot_sql = SearchClient::sqlite_fts5_hydrate_query(
        SQLITE_MAX_VARIABLE_NUMBER + 1,
        FieldMask::new(true, true, true, true),
        false,
    );
    let placeholder_count = one_shot_sql.matches('?').count();
    assert!(
        placeholder_count > SQLITE_MAX_VARIABLE_NUMBER,
        "the pre-fix one-shot hydration query would exceed frankensqlite's bind limit"
    );

    // One full chunk plus 17 leftover rows; rowid and rank both mirror the index.
    let total_rows = SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17;
    let ranked_rows: Vec<(i64, f64)> = (0..total_rows)
        .map(|row| (row as i64, row as f64))
        .collect();
    let chunk_sizes: Vec<usize> = SearchClient::sqlite_fts5_hydrate_row_chunks(&ranked_rows)
        .map(|chunk| chunk.len())
        .collect();

    assert_eq!(
        chunk_sizes,
        vec![SQLITE_FTS5_HYDRATE_PARAM_CHUNK, 17],
        "large fallback pages must hydrate in bounded chunks while preserving rank windows"
    );

    let any_chunk_too_large = chunk_sizes
        .iter()
        .any(|&chunk_len| chunk_len > SQLITE_MAX_VARIABLE_NUMBER);
    assert!(
        !any_chunk_too_large,
        "every hydration chunk must fit under frankensqlite's bind-variable ceiling"
    );
}
11690
/// Two conversations share one `source_path` and message index; only the one
/// with a blank `source_id` and `origin_host = 'devbox'` should supply the
/// content for a fallback key whose source component is `"devbox"`.
#[test]
fn tantivy_fallback_hydration_narrows_by_normalized_source_before_message_lookup() -> Result<()>
{
    let schema_sql = "CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            source_id TEXT,
            origin_host TEXT,
            source_path TEXT NOT NULL
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER NOT NULL,
            content TEXT NOT NULL,
            UNIQUE(conversation_id, idx)
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    // Conversation 1 is the devbox one; conversation 2 is the local look-alike.
    for seed_sql in [
        "INSERT INTO conversations(id, source_id, origin_host, source_path)
         VALUES(1, '', 'devbox', '/tmp/shared-fallback.jsonl')",
        "INSERT INTO conversations(id, source_id, origin_host, source_path)
         VALUES(2, 'local', NULL, '/tmp/shared-fallback.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content)
         VALUES(10, 1, 2, 'remote fallback content')",
        "INSERT INTO messages(id, conversation_id, idx, content)
         VALUES(20, 2, 2, 'local content must not win')",
    ] {
        db.execute(seed_sql)?;
    }

    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // (source, source_path, message idx) key handed to the fallback hydration path.
    let devbox_key = (
        String::from("devbox"),
        String::from("/tmp/shared-fallback.jsonl"),
        2,
    );
    let (_, fallback_contents) =
        search_client.hydrate_tantivy_hit_contents(&[], std::slice::from_ref(&devbox_key))?;

    let hydrated_text = fallback_contents.get(&devbox_key).map(String::as_str);
    assert_eq!(hydrated_text, Some("remote fallback content"));

    Ok(())
}
11760
/// Inserts eight messages for conversation 1 and one for conversation 2, then
/// requests four `(conversation_id, idx)` pairs, one of which does not exist.
/// Only the three existing pairs may come back.
#[test]
fn exact_content_hydration_returns_only_requested_message_indices() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER NOT NULL,
            content TEXT NOT NULL,
            UNIQUE(conversation_id, idx)
        );",
    )?;

    for idx in 0..8 {
        let insert_sql = format!(
            "INSERT INTO messages(conversation_id, idx, content)
             VALUES(1, {idx}, 'conversation one row {idx}')"
        );
        db.execute(&insert_sql)?;
    }
    db.execute(
        "INSERT INTO messages(conversation_id, idx, content)
         VALUES(2, 0, 'conversation two row 0')",
    )?;

    // (1, 99) has no backing row and must be absent from the result.
    let requested = [(1, 6), (1, 2), (2, 0), (1, 99)];
    let contents = hydrate_message_content_by_conversation(&db, &requested)?;

    assert_eq!(contents.len(), 3);

    let conv_one_row_two = contents.get(&(1, 2)).map(String::as_str);
    assert_eq!(conv_one_row_two, Some("conversation one row 2"));

    let conv_one_row_six = contents.get(&(1, 6)).map(String::as_str);
    assert_eq!(conv_one_row_six, Some("conversation one row 6"));

    let conv_two_row_zero = contents.get(&(2, 0)).map(String::as_str);
    assert_eq!(conv_two_row_zero, Some("conversation two row 0"));

    assert!(contents.get(&(1, 99)).is_none());

    Ok(())
}
11805
/// Seeds one FTS row and checks that the hit's snippet equals
/// `snippet_from_content` applied to the hit's content and contains the
/// searched term.
#[test]
fn sqlite_backend_generates_snippet_from_content() -> Result<()> {
    let schema_sql = "CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/ws')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'snippet title', '/tmp/snippet.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'alpha beta gamma delta epsilon zeta eta theta', 42)",
    ] {
        db.execute(seed_sql)?;
    }

    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            1_i64,
            "alpha beta gamma delta epsilon zeta eta theta",
            "snippet title",
            "codex",
            "/ws",
            "/tmp/snippet.jsonl",
            42_i64
        ],
    )?;

    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let delta_hits =
        search_client.search("delta", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(delta_hits.len(), 1);

    let only_hit = &delta_hits[0];
    assert_eq!(only_hit.snippet, snippet_from_content(&only_hit.content));
    assert!(only_hit.snippet.contains("delta"));

    Ok(())
}
11886
/// Seeds one conversation whose stored `source_id` is `' local '` and one
/// from a `laptop` source, then browses by date with two filters:
/// `SourceFilter::Local` and `SourceFilter::SourceId(" LOCAL ")`. Both must
/// return exactly the local conversation, reported with source id `"local"`.
#[test]
fn sqlite_backend_respects_source_filter() -> Result<()> {
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";
    // Parameterized statement shared by both FTS row inserts.
    let fts_insert_sql = "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO sources(id, kind) VALUES('laptop', 'ssh')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/local')",
        "INSERT INTO workspaces(id, path) VALUES(2, '/remote')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, ' local ', NULL, 'local title', '/tmp/local.jsonl')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 2, 'laptop', 'dev@laptop', 'remote title', '/tmp/remote.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)",
    ] {
        db.execute(seed_sql)?;
    }

    db.execute_compat(
        fts_insert_sql,
        params![
            1_i64,
            "auth token failure",
            "local title",
            "codex",
            "/local",
            "/tmp/local.jsonl",
            42_i64
        ],
    )?;
    db.execute_compat(
        fts_insert_sql,
        params![
            2_i64,
            "auth token failure",
            "remote title",
            "codex",
            "/remote",
            "/tmp/remote.jsonl",
            43_i64
        ],
    )?;

    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Filter by the `Local` variant.
    let local_filter = SearchFilters {
        source_filter: SourceFilter::Local,
        ..SearchFilters::default()
    };
    let local_hits = search_client.browse_by_date(local_filter, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(local_hits.len(), 1);
    assert_eq!(local_hits[0].source_id, "local");

    // Filter by an explicit source id that differs from "local" only in case and padding.
    let padded_id_filter = SearchFilters {
        source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
        ..SearchFilters::default()
    };
    let padded_id_hits =
        search_client.browse_by_date(padded_id_filter, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(padded_id_hits.len(), 1);
    assert_eq!(padded_id_hits[0].source_id, "local");
    assert_eq!(padded_id_hits[0].origin_kind, "local");

    Ok(())
}
12005
/// Seeds a single conversation with a blank `source_id` and
/// `origin_host = 'dev@laptop'`, then checks that both `SourceFilter::Remote`
/// and `SourceFilter::SourceId("dev@laptop")` find it and report it as
/// remote with source id `"dev@laptop"`.
#[test]
fn sqlite_backend_remote_source_filter_matches_blank_source_id_with_origin_host() -> Result<()>
{
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    // No `sources` or `workspaces` rows are inserted for this scenario.
    for seed_sql in [
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, ' ', 'dev@laptop', 'remote title', '/tmp/remote-filter.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(1, 1, 0, 'remote filter proof', 42)",
    ] {
        db.execute(seed_sql)?;
    }

    // The FTS row stores NULL for workspace, so only six values are bound.
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
        params![
            1_i64,
            "remote filter proof",
            "remote title",
            "codex",
            "/tmp/remote-filter.jsonl",
            42_i64
        ],
    )?;

    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Filter by the `Remote` variant.
    let remote_only = SearchFilters {
        source_filter: SourceFilter::Remote,
        ..Default::default()
    };
    let remote_hits = search_client.search("remote", remote_only, 5, 0, FieldMask::FULL)?;
    assert_eq!(remote_hits.len(), 1);
    let remote_hit = &remote_hits[0];
    assert_eq!(remote_hit.source_id, "dev@laptop");
    assert_eq!(remote_hit.origin_kind, "remote");
    assert_eq!(remote_hit.origin_host.as_deref(), Some("dev@laptop"));

    // Filter by the origin host used as an explicit source id.
    let by_host_id = SearchFilters {
        source_filter: SourceFilter::SourceId("dev@laptop".into()),
        ..Default::default()
    };
    let source_hits = search_client.search("remote", by_host_id, 5, 0, FieldMask::FULL)?;
    assert_eq!(source_hits.len(), 1);
    let source_hit = &source_hits[0];
    assert_eq!(source_hit.source_id, "dev@laptop");
    assert_eq!(source_hit.origin_kind, "remote");

    Ok(())
}
12111
/// Seeds one conversation with a NULL workspace and one in `/named`, then
/// searches with a workspace filter containing only the empty string. Only
/// the NULL-workspace conversation may match, reported with workspace `""`.
#[test]
fn sqlite_backend_workspace_filter_matches_null_workspace_as_empty_string() -> Result<()> {
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
        CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
        );
        CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
        );";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/named')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 'null workspace', '/tmp/null-workspace.jsonl')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'named workspace', '/tmp/named-workspace.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)",
    ] {
        db.execute(seed_sql)?;
    }

    // First FTS row: workspace is a literal NULL, so six values are bound.
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
        params![
            1_i64,
            "auth token failure",
            "null workspace",
            "codex",
            "/tmp/null-workspace.jsonl",
            42_i64
        ],
    )?;
    // Second FTS row: workspace is bound like every other column.
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            2_i64,
            "auth token failure",
            "named workspace",
            "codex",
            "/named",
            "/tmp/named-workspace.jsonl",
            43_i64
        ],
    )?;

    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The filter's only workspace entry is the empty string.
    let empty_workspace_only = SearchFilters {
        workspaces: HashSet::from_iter([String::new()]),
        ..SearchFilters::default()
    };
    let matches = search_client.search("auth", empty_workspace_only, 5, 0, FieldMask::FULL)?;
    assert_eq!(matches.len(), 1);

    let only_match = &matches[0];
    assert_eq!(only_match.workspace, "");
    assert_eq!(only_match.source_path, "/tmp/null-workspace.jsonl");

    Ok(())
}
12218
/// Pins how the message-scan query scores text for mixed boolean operators:
/// a plain `OR`, `AND` followed by `OR`, `OR` followed by `AND`, and a binary
/// `NOT`. A positive score means the text matched; `0.0` means it did not.
#[test]
fn sqlite_message_scan_preserves_boolean_or_precedence() {
    // alpha OR beta: either term alone matches, an unrelated term does not.
    let either_term =
        SearchClient::sqlite_message_scan_query("alpha OR beta").expect("simple OR scan query");
    let alpha_only = SearchClient::sqlite_message_scan_score("alpha", &either_term);
    assert!(alpha_only > 0.0);
    let beta_only = SearchClient::sqlite_message_scan_score("beta", &either_term);
    assert!(beta_only > 0.0);
    let unrelated = SearchClient::sqlite_message_scan_score("gamma", &either_term);
    assert_eq!(unrelated, 0.0);

    // alpha AND beta OR gamma: alpha is mandatory, beta/gamma are alternatives.
    let and_before_or = SearchClient::sqlite_message_scan_query("alpha AND beta OR gamma")
        .expect("AND followed by OR scan query");
    let alpha_with_gamma = SearchClient::sqlite_message_scan_score("alpha gamma", &and_before_or);
    assert!(
        alpha_with_gamma > 0.0,
        "alpha AND (beta OR gamma) should accept the gamma branch"
    );
    let alpha_alone = SearchClient::sqlite_message_scan_score("alpha", &and_before_or);
    assert_eq!(alpha_alone, 0.0);
    let missing_alpha = SearchClient::sqlite_message_scan_score("beta gamma", &and_before_or);
    assert_eq!(missing_alpha, 0.0);

    // alpha OR beta AND gamma: gamma is mandatory, alpha/beta are alternatives.
    let or_before_and = SearchClient::sqlite_message_scan_query("alpha OR beta AND gamma")
        .expect("OR followed by AND scan query");
    let via_alpha = SearchClient::sqlite_message_scan_score("alpha gamma", &or_before_and);
    assert!(
        via_alpha > 0.0,
        "(alpha OR beta) AND gamma should accept the alpha branch"
    );
    let via_beta = SearchClient::sqlite_message_scan_score("beta gamma", &or_before_and);
    assert!(
        via_beta > 0.0,
        "(alpha OR beta) AND gamma should accept the beta branch"
    );
    let missing_gamma = SearchClient::sqlite_message_scan_score("alpha", &or_before_and);
    assert_eq!(missing_gamma, 0.0);

    // alpha NOT beta: alpha matches unless beta is also present.
    let excluding_beta =
        SearchClient::sqlite_message_scan_query("alpha NOT beta").expect("NOT scan query");
    let without_beta = SearchClient::sqlite_message_scan_score("alpha", &excluding_beta);
    assert!(without_beta > 0.0);
    let with_beta = SearchClient::sqlite_message_scan_score("alpha beta", &excluding_beta);
    assert_eq!(with_beta, 0.0);
}
12268
/// Seeds a conversation whose `workspace_id`, `source_id` and `origin_host`
/// are all NULL and checks that `browse_by_date` with an empty-string
/// workspace filter and `SourceFilter::Local` returns it with workspace `""`
/// and source id / origin kind `"local"`.
#[test]
fn browse_by_date_treats_null_workspace_and_source_as_local() -> Result<()> {
    // No FTS table here: the test only goes through `browse_by_date`.
    let schema_sql = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER NOT NULL,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT NOT NULL
        );
        CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER,
            content TEXT NOT NULL,
            created_at INTEGER
        );
        CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";

    let db = Connection::open(":memory:")?;
    db.execute_batch(schema_sql)?;

    for seed_sql in [
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, NULL, NULL, 'browse title', '/tmp/browse.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(1, 1, 0, 'browse auth token failure', 123)",
    ] {
        db.execute(seed_sql)?;
    }

    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let local_empty_workspace = SearchFilters {
        workspaces: HashSet::from_iter([String::new()]),
        source_filter: SourceFilter::Local,
        ..SearchFilters::default()
    };
    let browsed =
        search_client.browse_by_date(local_empty_workspace, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(browsed.len(), 1);

    let only_row = &browsed[0];
    assert_eq!(only_row.workspace, "");
    assert_eq!(only_row.source_id, "local");
    assert_eq!(only_row.origin_kind, "local");

    Ok(())
}
12338
/// Seeds two messages that share a long common prefix and differ only in
/// their tails, then hydrates both with `FieldMask::new(false, true, true, true)`.
///
/// Expectations pinned by the assertions:
/// - both hits come back with empty `content`,
/// - both hits still carry a non-empty `snippet`,
/// - the two hits have different `content_hash` values even though their
///   text starts with the same prefix.
#[test]
fn hydrate_semantic_hits_with_ids_snippet_only_uses_full_content_for_snippets_and_identity()
-> Result<()> {
    let conn = Connection::open(":memory:")?;
    conn.execute_batch(
        "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL,
             started_at INTEGER
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
    )?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
         VALUES(1, 1, NULL, 'local', NULL, 'semantic title', '/tmp/semantic.jsonl', 100)",
    )?;

    // Both messages start with the same 32-fold repeated prefix; only the
    // tails differ, so a hash taken over a truncated prefix would collide.
    let shared_prefix = "shared-prefix ".repeat(32);
    let first = format!("{shared_prefix}first unique semantic tail");
    let second = format!("{shared_prefix}second unique semantic tail");

    // Each string is used exactly once, so it is moved into the bound value
    // instead of being cloned.
    conn.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
        &[
            fsqlite_types::value::SqliteValue::Integer(1),
            fsqlite_types::value::SqliteValue::Integer(0),
            fsqlite_types::value::SqliteValue::Text(first.into()),
            fsqlite_types::value::SqliteValue::Integer(101),
        ],
    )?;
    conn.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
        &[
            fsqlite_types::value::SqliteValue::Integer(2),
            fsqlite_types::value::SqliteValue::Integer(1),
            fsqlite_types::value::SqliteValue::Text(second.into()),
            fsqlite_types::value::SqliteValue::Integer(102),
        ],
    )?;

    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // NOTE(review): the first `FieldMask::new` flag is presumably the
    // "content" flag (the assertions below expect empty content) — confirm
    // against the `FieldMask` definition.
    let hits = client.hydrate_semantic_hits_with_ids(
        &[
            VectorSearchResult {
                message_id: 1,
                chunk_idx: 0,
                score: 0.9,
            },
            VectorSearchResult {
                message_id: 2,
                chunk_idx: 0,
                score: 0.8,
            },
        ],
        FieldMask::new(false, true, true, true),
    )?;
    assert_eq!(hits.len(), 2);
    assert!(hits.iter().all(|(_, hit)| hit.content.is_empty()));
    assert!(hits.iter().all(|(_, hit)| !hit.snippet.is_empty()));
    assert_ne!(hits[0].1.content_hash, hits[1].1.content_hash);

    Ok(())
}
12434
/// A conversation stored with a whitespace-padded `source_id` (`' local '`)
/// must hydrate to a hit whose `source_id` and `origin_kind` are both
/// exactly `"local"`.
#[test]
fn hydrate_semantic_hits_with_ids_normalizes_trimmed_local_source_metadata() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL,
             started_at INTEGER
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // The stored source id is deliberately padded with whitespace.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
         VALUES(1, 1, NULL, ' local ', NULL, 'trimmed local semantic', '/tmp/trimmed-local-semantic.jsonl', 100)",
    )?;
    let message_params = [
        SqliteValue::Integer(1),
        SqliteValue::Text("trimmed local semantic body".into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 101)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let vector_hits = [VectorSearchResult {
        message_id: 1,
        chunk_idx: 0,
        score: 0.9,
    }];
    let hydrated = search_client
        .hydrate_semantic_hits_with_ids(&vector_hits, FieldMask::new(false, true, true, true))?;

    assert_eq!(hydrated.len(), 1);
    let (_, hit) = &hydrated[0];
    assert_eq!(hit.source_id, "local");
    assert_eq!(hit.origin_kind, "local");

    Ok(())
}
12506
/// With no row in `sources`, a conversation whose `source_id` is `'laptop'`
/// and `origin_host` is `'dev@laptop'` must still hydrate with
/// `origin_kind == "remote"` and both identifiers carried through unchanged.
#[test]
fn hydrate_semantic_hits_with_ids_preserves_remote_origin_without_source_row() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture; `sources` is left empty.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL,
             started_at INTEGER
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
         VALUES(1, 1, NULL, 'laptop', 'dev@laptop', 'remote semantic', '/tmp/remote-semantic.jsonl', 100)",
    )?;
    let message_params = [
        SqliteValue::Integer(1),
        SqliteValue::Text("remote semantic body".into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 101)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let vector_hits = [VectorSearchResult {
        message_id: 1,
        chunk_idx: 0,
        score: 0.9,
    }];
    let hydrated = search_client
        .hydrate_semantic_hits_with_ids(&vector_hits, FieldMask::new(false, true, true, true))?;

    assert_eq!(hydrated.len(), 1);
    let (_, hit) = &hydrated[0];
    assert_eq!(hit.source_id, "laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
12579
/// Two conversations share a source path, title, line number and timestamp
/// and differ only in message content. Hits that differ only in
/// `content_hash` must resolve to their own messages (11 and 22) and to
/// different doc ids.
#[test]
fn resolve_semantic_doc_ids_for_hits_distinguishes_same_source_path_line_by_content_hash()
-> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";
    const INSERT_MESSAGE: &str =
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, ?2, 0, 'assistant', ?3, 100)";

    let shared_path = "/tmp/progressive-shared.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
    )?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
    )?;

    let first_body = "same prefix first tail".to_string();
    let second_body = "same prefix second tail".to_string();
    let first_params = [
        SqliteValue::Integer(11),
        SqliteValue::Integer(1),
        SqliteValue::Text(first_body.clone().into()),
    ];
    db.execute_with_params(INSERT_MESSAGE, &first_params)?;
    let second_params = [
        SqliteValue::Integer(22),
        SqliteValue::Integer(2),
        SqliteValue::Text(second_body.clone().into()),
    ];
    db.execute_with_params(INSERT_MESSAGE, &second_params)?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let first_hit = SearchHit {
        content_hash: stable_hit_hash(&first_body, shared_path, Some(1), Some(100)),
        source_path: shared_path.into(),
        title: "Shared Session".into(),
        agent: "codex".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
        snippet: String::new(),
        content: String::new(),
        workspace: String::new(),
        workspace_original: None,
    };
    // Identical provenance; only the content hash tells the two hits apart.
    let second_hit = SearchHit {
        content_hash: stable_hit_hash(&second_body, shared_path, Some(1), Some(100)),
        ..first_hit.clone()
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
    assert_eq!(resolved.len(), 2);

    let message_ids: Vec<_> = resolved
        .iter()
        .map(|slot| slot.as_ref().map(|doc| doc.message_id))
        .collect();
    assert_eq!(message_ids, [Some(11), Some(22)]);

    let first_doc_id = resolved[0].as_ref().map(|doc| doc.doc_id.as_str());
    let second_doc_id = resolved[1].as_ref().map(|doc| doc.doc_id.as_str());
    assert_ne!(first_doc_id, second_doc_id);

    Ok(())
}
12710
/// A conversation whose `title` column is NULL must hydrate to a hit with an
/// empty `title` string.
#[test]
fn hydrate_semantic_hits_with_ids_keeps_missing_title_empty() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL,
             started_at INTEGER
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // The title column is intentionally NULL.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
         VALUES(1, 1, NULL, 'local', NULL, NULL, '/tmp/untitled-semantic.jsonl', 100)",
    )?;
    let message_params = [
        SqliteValue::Integer(1),
        SqliteValue::Text("untitled semantic body".into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 101)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let vector_hits = [VectorSearchResult {
        message_id: 1,
        chunk_idx: 0,
        score: 0.9,
    }];
    let hydrated = search_client
        .hydrate_semantic_hits_with_ids(&vector_hits, FieldMask::new(false, true, true, true))?;

    assert_eq!(hydrated.len(), 1);
    let (_, hit) = &hydrated[0];
    assert_eq!(hit.title, "");

    Ok(())
}
12781
/// Two conversations hold byte-identical messages under the same source
/// path, so the content hash cannot tell them apart. Hits that carry an
/// explicit `conversation_id` must resolve to the message of that
/// conversation (11 for conversation 1, 22 for conversation 2).
#[test]
fn resolve_semantic_doc_ids_for_hits_prefers_conversation_id_over_ambiguous_provenance()
-> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";
    const INSERT_MESSAGE: &str =
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, ?2, 0, 'assistant', ?3, 100)";

    let shared_path = "/tmp/progressive-conversation-id.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
    )?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
    )?;

    // Both conversations get the same message text.
    let body = "same ambiguous content".to_string();
    let first_params = [
        SqliteValue::Integer(11),
        SqliteValue::Integer(1),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(INSERT_MESSAGE, &first_params)?;
    let second_params = [
        SqliteValue::Integer(22),
        SqliteValue::Integer(2),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(INSERT_MESSAGE, &second_params)?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let first_hit = SearchHit {
        conversation_id: Some(1),
        content_hash: stable_hit_hash(&body, shared_path, Some(1), Some(100)),
        source_path: shared_path.into(),
        title: "Shared Session".into(),
        agent: "codex".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
        snippet: String::new(),
        content: String::new(),
        workspace: String::new(),
        workspace_original: None,
    };
    // Same hit in every respect except the conversation it points at.
    let second_hit = SearchHit {
        conversation_id: Some(2),
        ..first_hit.clone()
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
    assert_eq!(resolved.len(), 2);

    let message_ids: Vec<_> = resolved
        .iter()
        .map(|slot| slot.as_ref().map(|doc| doc.message_id))
        .collect();
    assert_eq!(message_ids, [Some(11), Some(22)]);

    Ok(())
}
12888
/// A conversation row whose `source_id` is NULL must still be matched by a
/// hit whose `source_id` is `"local"`, resolving to message 11.
#[test]
fn resolve_semantic_doc_ids_for_hits_treats_null_source_as_local() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    let legacy_path = "/tmp/legacy-local.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // Both source_id and origin_host are NULL in the stored row.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, NULL, NULL, 'Legacy Local', '/tmp/legacy-local.jsonl')",
    )?;
    let body = "legacy local semantic message".to_string();
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let lexical_hit = SearchHit {
        content_hash: stable_hit_hash(&body, legacy_path, Some(1), Some(100)),
        source_path: legacy_path.into(),
        title: "Legacy Local".into(),
        agent: "codex".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
        snippet: String::new(),
        content: String::new(),
        workspace: String::new(),
        workspace_original: None,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[lexical_hit])?;
    assert_eq!(resolved.len(), 1);
    let resolved_message = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(resolved_message, Some(11));

    Ok(())
}
12970
/// A conversation row stored with a whitespace-padded `source_id`
/// (`' local '`) must be matched by a hit whose `source_id` is the clean
/// `"local"`, resolving to message 11.
#[test]
fn resolve_semantic_doc_ids_for_hits_matches_trimmed_local_source_id() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    let trimmed_path = "/tmp/trimmed-local.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // The stored source id is deliberately padded with whitespace.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, ' local ', NULL, 'Trimmed Local', '/tmp/trimmed-local.jsonl')",
    )?;
    let body = "trimmed local semantic message".to_string();
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let lexical_hit = SearchHit {
        content_hash: stable_hit_hash(&body, trimmed_path, Some(1), Some(100)),
        source_path: trimmed_path.into(),
        title: "Trimmed Local".into(),
        agent: "codex".into(),
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
        snippet: String::new(),
        content: String::new(),
        workspace: String::new(),
        workspace_original: None,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[lexical_hit])?;
    assert_eq!(resolved.len(), 1);
    let resolved_message = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(resolved_message, Some(11));

    Ok(())
}
13052
/// The mirror of the trimmed-source case: the stored row has a clean
/// `'local'` source id while the incoming hit carries a blank
/// (whitespace-only) `source_id` with `origin_kind == "local"`. The hit
/// must still resolve to message 11.
#[test]
fn resolve_semantic_doc_ids_for_hits_normalizes_blank_local_source_id() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    let blank_path = "/tmp/blank-local.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'Blank Local', '/tmp/blank-local.jsonl')",
    )?;
    let body = "blank local semantic message".to_string();
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let lexical_hit = SearchHit {
        content_hash: stable_hit_hash(&body, blank_path, Some(1), Some(100)),
        source_path: blank_path.into(),
        title: "Blank Local".into(),
        agent: "codex".into(),
        // Whitespace-only source id on the hit side.
        source_id: " ".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
        snippet: String::new(),
        content: String::new(),
        workspace: String::new(),
        workspace_original: None,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[lexical_hit])?;
    assert_eq!(resolved.len(), 1);
    let resolved_message = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(resolved_message, Some(11));

    Ok(())
}
13134
/// The stored conversation has a blank `source_id` but an `origin_host` of
/// `'dev@laptop'`. A hit whose `source_id` and `origin_host` are both
/// `"dev@laptop"` with `origin_kind == "remote"` must resolve to message 11.
#[test]
fn resolve_semantic_doc_ids_for_hits_infers_remote_source_from_origin_host_when_source_id_blank()
-> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    let remote_path = "/tmp/legacy-remote.jsonl";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    // Blank source id in storage; only origin_host identifies the remote.
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, ' ', 'dev@laptop', 'Legacy Remote', '/tmp/legacy-remote.jsonl')",
    )?;
    let body = "legacy remote semantic message".to_string();
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let lexical_hit = SearchHit {
        content_hash: stable_hit_hash(&body, remote_path, Some(1), Some(100)),
        source_path: remote_path.into(),
        title: "Legacy Remote".into(),
        agent: "codex".into(),
        source_id: "dev@laptop".into(),
        origin_kind: "remote".into(),
        origin_host: Some("dev@laptop".into()),
        conversation_id: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
        snippet: String::new(),
        content: String::new(),
        workspace: String::new(),
        workspace_original: None,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[lexical_hit])?;
    assert_eq!(resolved.len(), 1);
    let resolved_message = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(resolved_message, Some(11));

    Ok(())
}
13217
/// Two messages share a long identical prefix (`"shared-prefix "` repeated
/// 48 times) and differ only in their tails. When browsing with the field
/// mask used here, every hit must have empty `content` and a non-empty
/// `snippet`, and the two hits must still have different `content_hash`
/// values.
#[test]
fn browse_by_date_snippet_only_uses_full_content_for_hit_identity() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    // Tables created for this in-memory fixture; `messages` has no `role` column here.
    const SCHEMA: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";
    const INSERT_MESSAGE: &str = "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(?1, 1, ?2, ?3, ?4)";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'browse title', '/tmp/browse-shared.jsonl')",
    )?;

    // The bodies are only needed for the inserts, so they are moved into the
    // parameter arrays rather than cloned.
    let shared_prefix = "shared-prefix ".repeat(48);
    let first_body = format!("{shared_prefix}first browse-only tail");
    let second_body = format!("{shared_prefix}second browse-only tail");
    let first_params = [
        SqliteValue::Integer(1),
        SqliteValue::Integer(0),
        SqliteValue::Text(first_body.into()),
        SqliteValue::Integer(101),
    ];
    db.execute_with_params(INSERT_MESSAGE, &first_params)?;
    let second_params = [
        SqliteValue::Integer(2),
        SqliteValue::Integer(1),
        SqliteValue::Text(second_body.into()),
        SqliteValue::Integer(102),
    ];
    db.execute_with_params(INSERT_MESSAGE, &second_params)?;

    // Client backed only by the in-memory connection above.
    let search_client = SearchClient {
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        reader: None,
        semantic: Mutex::new(None),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        reload_on_search: true,
        reload_epoch: Arc::new(AtomicU64::new(0)),
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
    };

    let browsed = search_client.browse_by_date(
        SearchFilters::default(),
        10,
        0,
        true,
        FieldMask::new(false, true, true, true),
    )?;

    assert_eq!(browsed.len(), 2);
    for hit in &browsed {
        assert!(hit.content.is_empty());
        assert!(!hit.snippet.is_empty());
    }
    assert_ne!(browsed[0].content_hash, browsed[1].content_hash);

    Ok(())
}
13302
/// Indexes one conversation, searches for `"app"` and checks the result was
/// stored in the `"global"` prefix-cache shard. It then indexes a second
/// conversation, waits, and checks that a search for `"apr"` returns the
/// newly committed document.
#[test]
fn cache_invalidates_on_new_data() -> Result<()> {
    use std::thread;
    use std::time::Duration;

    let index_dir = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(index_dir.path())?;

    // First commit: a single message containing "apple banana".
    let apple_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "apple banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let first_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("first".into()),
        workspace: None,
        source_path: index_dir.path().join("1.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![apple_message],
    };
    writer.add_conversation(&first_conversation)?;
    writer.commit()?;

    let search_client = SearchClient::open(index_dir.path(), None)?.expect("index present");

    let apple_hits =
        search_client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(apple_hits.len(), 1);
    assert_eq!(apple_hits[0].content, "apple banana");

    // Scoped so the cache guard is released before the next search.
    {
        let shards = search_client.prefix_cache.lock().unwrap();
        let global_shard = shards.shard_opt("global").unwrap();
        assert!(global_shard.contains(&search_client.cache_key("app", &SearchFilters::default())));
    }

    // Second commit: a single message containing "apricot".
    let apricot_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(2),
        content: "apricot".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let second_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("second".into()),
        workspace: None,
        source_path: index_dir.path().join("2.jsonl"),
        started_at: Some(2),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![apricot_message],
    };
    writer.add_conversation(&second_conversation)?;
    writer.commit()?;

    // NOTE(review): presumably waits out a reload debounce window before
    // searching again — confirm against the client's reload logic.
    thread::sleep(Duration::from_millis(350));

    let _repeat_hits =
        search_client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    let apricot_hits =
        search_client.search("apr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(apricot_hits.len(), 1);
    assert_eq!(apricot_hits[0].content, "apricot");

    Ok(())
}
13393
#[test]
fn track_generation_clears_cache_on_change() {
    // Reader-less, database-less client: only the cache bookkeeping is exercised.
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Reports whether the cache currently holds no shards at all; the lock is
    // released as soon as the answer has been read.
    let no_shards = || client.prefix_cache.lock().unwrap().shards.is_empty();

    let cached_hits = vec![SearchHit {
        title: "hello world".into(),
        snippet: "hello".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    }];

    // Inserting a result creates at least one shard.
    client.put_cache("hello", &SearchFilters::default(), &cached_hits);
    assert!(!no_shards());

    // The first generation ever observed must leave the cache alone.
    client.track_generation(1);
    assert!(!no_shards());

    // A different generation must drop every shard.
    client.track_generation(2);
    assert!(no_shards());
}
13451
#[test]
fn cache_total_cap_evicts_across_shards() {
    // Total capacity of two entries; the byte cap argument is 0.
    // NOTE(review): a byte cap of 0 presumably disables the byte guard - confirm.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let hit = SearchHit {
        title: "a".into(),
        snippet: "a".into(),
        content: "a".into(),
        content_hash: stable_content_hash("a"),
        score: 1.0,
        source_path: "p".into(),
        agent: "agent1".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // Cache three queries, each under a different single-agent filter, so the
    // inserts exceed the two-entry total cap. Every insert borrows the same hit
    // as a one-element slice, so no copy of the hit is made here.
    // NOTE(review): distinct agent filters presumably land in distinct shards,
    // as the test name implies - confirm against the shard-naming logic.
    let mut filters = SearchFilters::default();
    for (query, agent) in [("a", "agent1"), ("b", "agent2"), ("c", "agent3")] {
        filters.agents.clear();
        filters.agents.insert(agent.into());
        client.put_cache(query, &filters, std::slice::from_ref(&hit));
    }

    // The reported cost must respect the configured total cap of 2.
    let stats = client.cache_stats();
    assert!(stats.total_cost <= stats.total_cap);
    assert_eq!(stats.total_cap, 2);
}
13505
#[test]
fn cache_stats_reflect_metrics() {
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Bump every counter exactly once and record one 10 ms reload.
    let metrics = &client.metrics;
    metrics.inc_cache_hits();
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.record_reload(Duration::from_millis(10));

    // The stats snapshot must mirror those counters and the cache configuration.
    let snapshot = client.cache_stats();
    assert_eq!(snapshot.cache_hits, 1);
    assert_eq!(snapshot.cache_miss, 1);
    assert_eq!(snapshot.cache_shortfall, 1);
    assert_eq!(snapshot.reloads, 1);
    assert_eq!(snapshot.reload_ms_total, 10);
    assert_eq!(snapshot.total_cap, *CACHE_TOTAL_CAP);
    assert_eq!(snapshot.eviction_policy, "lru");
    assert_eq!(snapshot.prewarm_scheduled, 0);
    assert_eq!(snapshot.prewarm_skipped_pressure, 0);

    // A default-constructed stats value reports a placeholder policy label.
    let blank = CacheStats::default();
    assert_eq!(blank.eviction_policy, "unknown");
}
13542
#[test]
fn adaptive_query_prewarm_schedules_only_after_hot_prefix_cache_entry() {
    // The receiving half of the warm channel lets the test observe scheduled jobs.
    let (warm_sender, warm_jobs) = mpsc::unbounded();
    let prefix_cache = Mutex::new(CacheShards::new(10, 0));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: Some(warm_sender),
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let mut filters = SearchFilters::default();
    filters.workspaces.insert("/tmp/cass-workspace".into());

    // Nothing is cached yet, so no job may be queued.
    client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
    assert!(
        warm_jobs.try_recv().is_err(),
        "cold prefixes should not schedule adaptive prewarm"
    );

    // Cache a result for the prefix "hel".
    let body = String::from("hello world");
    let hit = SearchHit {
        snippet: "hello".into(),
        content_hash: stable_content_hash(&body),
        content: body,
        ..projected_minimal_fields_search_hit("hello title", "p")
    };
    client.put_cache("hel", &filters, std::slice::from_ref(&hit));

    // Asking about the exact query that is already cached must stay silent.
    let cost_before = client.cache_stats().total_cost;
    client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
    assert!(
        warm_jobs.try_recv().is_err(),
        "an exact cached query should not schedule redundant prewarm"
    );

    // A longer query extending the cached prefix must queue exactly one job.
    client.maybe_schedule_adaptive_query_prewarm("hello", &filters);
    let scheduled = warm_jobs
        .try_recv()
        .expect("hot prefix should schedule adaptive prewarm");
    assert_eq!(scheduled.query, "hello");
    assert_eq!(scheduled.shard_name, "workspace:/tmp/cass-workspace");
    assert_eq!(scheduled.filters_fingerprint, filters_fingerprint(&filters));

    let snapshot = client.cache_stats();
    assert_eq!(snapshot.prewarm_scheduled, 1);
    assert_eq!(snapshot.prewarm_skipped_pressure, 0);
    assert_eq!(
        snapshot.total_cost, cost_before,
        "prewarm scheduling should not mutate result-cache contents"
    );
}
13599
#[test]
fn adaptive_query_prewarm_skips_when_cache_byte_cap_is_under_pressure() {
    // Size the byte cap to exactly one cached copy of this hit, so a single
    // cached entry already uses the whole budget.
    let body = String::from("hello world with enough content to consume the small byte budget");
    let hit = SearchHit {
        snippet: "hello".into(),
        content_hash: stable_content_hash(&body),
        content: body,
        ..projected_minimal_fields_search_hit("hello title", "p")
    };
    let byte_cap = cached_hit_from(&hit).approx_bytes();

    let (warm_sender, warm_jobs) = mpsc::unbounded();
    let prefix_cache = Mutex::new(CacheShards::new(10, byte_cap));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: Some(warm_sender),
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let filters = SearchFilters::default();

    client.put_cache("hel", &filters, std::slice::from_ref(&hit));

    // A query unrelated to any cached prefix is not a candidate at all, so it
    // must not show up in the pressure-skip counter.
    client.maybe_schedule_adaptive_query_prewarm("zebra", &filters);
    let skipped_after_cold = client.cache_stats().prewarm_skipped_pressure;
    assert_eq!(
        skipped_after_cold,
        0,
        "cold queries should not be counted as pressure-skipped prewarm jobs"
    );

    // A query extending the cached prefix is a candidate, but must be dropped
    // and counted as skipped while the byte budget is exhausted.
    client.maybe_schedule_adaptive_query_prewarm("hello", &filters);
    assert!(
        warm_jobs.try_recv().is_err(),
        "prewarm should be disabled while cache byte pressure is high"
    );

    let snapshot = client.cache_stats();
    assert_eq!(snapshot.prewarm_scheduled, 0);
    assert_eq!(snapshot.prewarm_skipped_pressure, 1);
    assert!(snapshot.approx_bytes <= snapshot.byte_cap);
}
13646
#[test]
fn cache_eviction_count_tracks_evictions() {
    // Two-entry total cap, byte cap argument 0.
    let prefix_cache = Mutex::new(CacheShards::new(2, 0));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let sample = SearchHit {
        title: "test".into(),
        snippet: "snippet".into(),
        content: "content".into(),
        content_hash: stable_content_hash("content"),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // Three distinct queries against a two-entry cache: one must be evicted.
    let unfiltered = SearchFilters::default();
    for query in ["query1", "query2", "query3"] {
        client.put_cache(query, &unfiltered, std::slice::from_ref(&sample));
    }

    let snapshot = client.cache_stats();
    assert!(
        snapshot.eviction_count >= 1,
        "should have evicted at least 1 entry"
    );
    assert!(snapshot.total_cost <= 2, "should be at or below cap");
    assert!(snapshot.approx_bytes > 0, "should track bytes used");
}
13711
#[test]
fn default_cache_byte_cap_scales_with_available_memory() {
    const GIB: u64 = 1024 * 1024 * 1024;

    // Unknown memory and a small 2 GiB host both get the fallback budget.
    let fallback = DEFAULT_CACHE_BYTE_CAP_FALLBACK;
    assert_eq!(default_cache_byte_cap_for_available(None), fallback);
    assert_eq!(
        default_cache_byte_cap_for_available(Some(2 * GIB)),
        fallback,
        "small hosts keep a conservative cache byte budget"
    );

    // 64 GiB available is expected to yield a 512 MiB budget.
    let half_gib: usize = 512 * 1024 * 1024;
    assert_eq!(
        default_cache_byte_cap_for_available(Some(64 * GIB)),
        half_gib,
        "larger hosts get a proportionally larger cache byte budget"
    );

    // 256 GiB available is expected to hit the ceiling, saturated to `usize`.
    let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
    assert_eq!(
        default_cache_byte_cap_for_available(Some(256 * GIB)),
        ceiling,
        "large swarm hosts still have a bounded default cache budget"
    );
}
13736
#[test]
fn malformed_cache_byte_cap_env_uses_default_instead_of_disabling_guard() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let available = Some(64 * GIB);

    // An explicit "0" is passed through as a byte cap of zero.
    assert_eq!(cache_byte_cap_from_env_value(Some("0"), available), 0);

    // An unparsable value must resolve to the memory-derived default.
    assert_eq!(
        cache_byte_cap_from_env_value(Some("not-a-number"), available),
        default_cache_byte_cap_for_available(available),
        "malformed env should keep the default memory guard active"
    );

    // An absent value resolves to the same default.
    assert_eq!(
        cache_byte_cap_from_env_value(None, available),
        default_cache_byte_cap_for_available(available)
    );
}
13752
#[test]
fn cache_eviction_policy_env_defaults_to_lru_and_accepts_s3_fifo() {
    // Absent value: LRU.
    let unset = cache_eviction_policy_from_env_value(None);
    assert_eq!(unset, CacheEvictionPolicy::Lru);

    // Unrecognized value: still LRU.
    let garbage = cache_eviction_policy_from_env_value(Some("not-a-policy"));
    assert_eq!(
        garbage,
        CacheEvictionPolicy::Lru,
        "malformed env keeps the current LRU behavior"
    );

    // Both the hyphenated and the underscored spelling select S3-FIFO.
    for spelling in ["s3-fifo", "s3_fifo"] {
        assert_eq!(
            cache_eviction_policy_from_env_value(Some(spelling)),
            CacheEvictionPolicy::S3Fifo
        );
    }
}
13773
#[test]
fn s3_fifo_admission_rejects_one_off_byte_heavy_entries_then_admits_ghost_replay() {
    // A 5,000-byte body makes the cached entry large relative to the cap below.
    let body = "large".repeat(1_000);
    let hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content_hash: stable_content_hash(&body),
        content: body,
        score: 1.0,
        source_path: "large-path".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let cached = cached_hit_from(&hit);

    // The cap leaves 1 KiB of headroom over the entry itself. Sanity-check that
    // the entry still exceeds the cap divided (rounding up) by the large-entry
    // fraction denominator.
    let entry_bytes = cached.approx_bytes();
    let byte_cap = entry_bytes + 1_024;
    assert!(entry_bytes > byte_cap.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR));

    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::S3Fifo);
    let key = Arc::<str>::from("large-query");

    // First sighting: rejected, but remembered as a ghost.
    cache.put("global", Arc::clone(&key), vec![cached.clone()]);
    assert_eq!(
        cache.total_cost(),
        0,
        "first one-off large entry is not admitted"
    );
    assert_eq!(cache.ghost_entries(), 1);
    assert_eq!(cache.admission_rejects(), 1);

    // Second sighting of the same key: admitted, and the ghost record is gone.
    cache.put("global", key, vec![cached]);
    assert_eq!(
        cache.total_cost(),
        1,
        "ghost replay admits the repeated query"
    );
    assert_eq!(cache.ghost_entries(), 0);
    assert!(cache.ghost_keys.is_empty());
    assert_eq!(cache.admission_rejects(), 1);
    assert!(cache.total_bytes() <= cache.byte_cap());
}
13824
#[test]
fn lru_policy_keeps_admitting_large_entries_under_existing_caps() {
    // Same large entry and 1 KiB headroom as the S3-FIFO admission test, but
    // with the LRU policy selected.
    let body = "large".repeat(1_000);
    let hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content_hash: stable_content_hash(&body),
        content: body,
        score: 1.0,
        source_path: "large-path".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let entry = cached_hit_from(&hit);
    let byte_cap = entry.approx_bytes() + 1_024;

    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::Lru);
    let key = Arc::<str>::from("large-query");
    cache.put("global", key, vec![entry]);

    // Admitted on first sight, with no ghost or rejection bookkeeping.
    assert_eq!(cache.total_cost(), 1);
    assert_eq!(cache.ghost_entries(), 0);
    assert_eq!(cache.admission_rejects(), 0);
    assert_eq!(cache.policy_label(), "lru");
}
13857
#[test]
fn cache_byte_cap_triggers_eviction() {
    // Generous entry cap (1000) but a byte cap of only 100.
    let prefix_cache = Mutex::new(CacheShards::new(1000, 100));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The text fields alone add up to 200 bytes (50 + 50 + 100).
    let body = "c".repeat(100);
    let bulky = SearchHit {
        title: "a".repeat(50),
        snippet: "b".repeat(50),
        content_hash: stable_content_hash(&body),
        content: body,
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    let unfiltered = SearchFilters::default();
    for query in ["q1", "q2", "q3"] {
        client.put_cache(query, &unfiltered, std::slice::from_ref(&bulky));
    }

    let snapshot = client.cache_stats();
    assert!(
        snapshot.eviction_count >= 1,
        "byte cap should trigger evictions"
    );
    assert_eq!(snapshot.byte_cap, 100, "byte cap should be reported");
}
13912
#[test]
fn cache_byte_pressure_evicts_byte_heavy_shard_before_small_entries() {
    // Builds a hit whose title and snippet are both `label`; all remaining
    // fields are fixed.
    let build_hit =
        |label: &'static str, path: &'static str, agent: &'static str, body: String| SearchHit {
            title: label.into(),
            snippet: label.into(),
            content_hash: stable_content_hash(&body),
            content: body,
            score: 1.0,
            source_path: path.into(),
            agent: agent.into(),
            workspace: "w".into(),
            workspace_original: None,
            created_at: None,
            line_number: None,
            match_type: MatchType::Exact,
            source_id: "local".into(),
            origin_kind: "local".into(),
            origin_host: None,
            conversation_id: None,
        };
    let small_hit = build_hit("small", "small-path", "a", "small".into());
    // 10,000 bytes of content: far more than the 1 KiB byte cap below.
    let large_hit = build_hit("large", "large-path", "b", "large".repeat(2_000));

    let mut cache = CacheShards::new(100, 1_024);

    // Two small entries in the "small" shard, then one oversized entry in "large".
    for key in ["small-1", "small-2"] {
        cache.put(
            "small",
            Arc::<str>::from(key),
            vec![cached_hit_from(&small_hit)],
        );
    }
    cache.put(
        "large",
        Arc::<str>::from("large-1"),
        vec![cached_hit_from(&large_hit)],
    );

    // Both small entries must remain; the "large" shard must be missing or empty.
    let small_len = cache.shard_opt("small").map(LruCache::len);
    assert_eq!(
        small_len,
        Some(2),
        "byte pressure should preserve the small shard"
    );
    assert!(
        cache.shard_opt("large").is_none_or(LruCache::is_empty),
        "oversized shard should be evicted first under byte pressure"
    );
    assert!(cache.total_bytes() <= cache.byte_cap());
}
13981
#[test]
fn wildcard_pattern_parse_exact() {
    // Input without any `*` parses as `Exact`, with the term lowercased.
    let cases = [
        ("hello", FsCassWildcardPattern::Exact("hello".into())),
        ("HELLO", FsCassWildcardPattern::Exact("hello".into())),
        ("FooBar123", FsCassWildcardPattern::Exact("foobar123".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }
}
14002
#[test]
fn wildcard_pattern_parse_prefix() {
    // A trailing `*` parses as `Prefix`, with the term lowercased.
    let cases = [
        ("foo*", FsCassWildcardPattern::Prefix("foo".into())),
        ("CONFIG*", FsCassWildcardPattern::Prefix("config".into())),
        ("test*", FsCassWildcardPattern::Prefix("test".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }
}
14019
#[test]
fn wildcard_pattern_parse_suffix() {
    // A leading `*` parses as `Suffix`, with the term lowercased.
    let cases = [
        ("*foo", FsCassWildcardPattern::Suffix("foo".into())),
        ("*Error", FsCassWildcardPattern::Suffix("error".into())),
        ("*Handler", FsCassWildcardPattern::Suffix("handler".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }
}
14036
#[test]
fn wildcard_pattern_parse_substring() {
    // `*` on both sides parses as `Substring`, with the term lowercased.
    let cases = [
        ("*foo*", FsCassWildcardPattern::Substring("foo".into())),
        ("*CONFIG*", FsCassWildcardPattern::Substring("config".into())),
        ("*test*", FsCassWildcardPattern::Substring("test".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }
}
14053
#[test]
fn wildcard_pattern_parse_edge_cases() {
    let cases = [
        // Only asterisks: an empty `Exact` term.
        ("*", FsCassWildcardPattern::Exact(String::new())),
        ("**", FsCassWildcardPattern::Exact(String::new())),
        ("***", FsCassWildcardPattern::Exact(String::new())),
        // Single-character terms keep their wildcard kind.
        ("*a*", FsCassWildcardPattern::Substring("a".into())),
        ("a*", FsCassWildcardPattern::Prefix("a".into())),
        ("*a", FsCassWildcardPattern::Suffix("a".into())),
        // Runs of asterisks behave like a single one on each side.
        ("***foo***", FsCassWildcardPattern::Substring("foo".into())),
    ];
    for (input, expected) in cases {
        assert_eq!(FsCassWildcardPattern::parse(input), expected);
    }
}
14090
#[test]
fn wildcard_pattern_to_regex_suffix() {
    // A suffix term is preceded by `.*` and anchored with `$`.
    let regex = FsCassWildcardPattern::Suffix("foo".into()).to_regex();
    assert_eq!(regex.as_deref(), Some(".*foo$"));
}
14097
#[test]
fn wildcard_pattern_to_regex_substring() {
    // A substring term is wrapped in `.*` on both sides.
    let regex = FsCassWildcardPattern::Substring("bar".into()).to_regex();
    assert_eq!(regex.as_deref(), Some(".*bar.*"));
}
14103
#[test]
fn wildcard_pattern_to_regex_exact_prefix_none() {
    // Exact and prefix patterns have no regex form.
    let exact_regex = FsCassWildcardPattern::Exact("foo".into()).to_regex();
    assert!(exact_regex.is_none());

    let prefix_regex = FsCassWildcardPattern::Prefix("bar".into()).to_regex();
    assert!(prefix_regex.is_none());
}
14113
#[test]
fn match_type_quality_factors() {
    // Expected factor for each match type, from 1.0 (exact) down to 0.6.
    let expectations = [
        (MatchType::Exact, 1.0),
        (MatchType::Prefix, 0.9),
        (MatchType::Suffix, 0.8),
        (MatchType::Substring, 0.7),
        (MatchType::ImplicitWildcard, 0.6),
    ];
    for (kind, factor) in expectations {
        assert_eq!(kind.quality_factor(), factor);
    }
}
14127
#[test]
fn dominant_match_type_single_terms() {
    // A single term reports the match type implied by its own wildcards.
    let cases = [
        ("hello", MatchType::Exact),
        ("hello*", MatchType::Prefix),
        ("*hello", MatchType::Suffix),
        ("*hello*", MatchType::Substring),
    ];
    for (query, expected) in cases {
        assert_eq!(dominant_match_type(query), expected);
    }
}
14136
#[test]
fn dominant_match_type_multiple_terms() {
    // With several terms, the loosest term decides the reported type
    // (Substring over Suffix over Prefix over Exact in these cases).
    let cases = [
        ("foo bar", MatchType::Exact),
        ("foo bar*", MatchType::Prefix),
        ("foo *bar", MatchType::Suffix),
        ("foo* *bar*", MatchType::Substring),
        ("foo *bar* baz", MatchType::Substring),
    ];
    for (query, expected) in cases {
        assert_eq!(dominant_match_type(query), expected);
    }
}
14147
#[test]
fn dominant_match_type_empty_query() {
    // An empty query and a whitespace-only query both report `Exact`.
    for blank in ["", " "] {
        assert_eq!(dominant_match_type(blank), MatchType::Exact);
    }
}
14153
#[test]
fn wildcard_pattern_to_regex_escapes_special_chars() {
    // `.`, `+`, `*` and `?` inside the term must be emitted backslash-escaped.
    let cases = [
        (
            FsCassWildcardPattern::Suffix("foo.bar".into()),
            ".*foo\\.bar$",
        ),
        (
            FsCassWildcardPattern::Substring("a+b*c?".into()),
            ".*a\\+b\\*c\\?.*",
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(pattern.to_regex().as_deref(), Some(expected));
    }
}
14165
#[test]
fn wildcard_pattern_to_regex_escapes_complex_patterns() {
    // Brackets, groups, alternation and anchors inside the term are escaped too.
    let cases = [
        (
            FsCassWildcardPattern::Suffix("test[0-9]+".into()),
            ".*test\\[0-9\\]\\+$",
        ),
        (
            FsCassWildcardPattern::Substring("(a|b)".into()),
            ".*\\(a\\|b\\).*",
        ),
        (
            FsCassWildcardPattern::Substring("end$".into()),
            ".*end\\$.*",
        ),
        (
            FsCassWildcardPattern::Substring("^start".into()),
            ".*\\^start.*",
        ),
    ];
    for (pattern, expected) in cases {
        assert_eq!(pattern.to_regex().as_deref(), Some(expected));
    }
}
14185
#[test]
fn is_tool_invocation_noise_detects_noise() {
    // Markers that name a tool, with or without a description, are not noise.
    let kept = [
        "[Tool: Bash]",
        "[Tool: Read]",
        "[Tool: Bash - Check status]",
        " [Tool: Grep - Search files] ",
    ];
    for text in kept {
        assert!(!is_tool_invocation_noise(text));
    }

    // Markers without a tool name, and the bare short forms, are noise.
    let noise = ["[Tool:]", "[Tool: ]", "[tool]", "tool: Bash"];
    for text in noise {
        assert!(is_tool_invocation_noise(text));
    }
}
14204
#[test]
fn is_tool_invocation_noise_allows_useful_content() {
    // Tool markers that carry a path or a command must not be filtered out.
    let useful = [
        "[Tool: Read - src/main.rs]",
        "[Tool: Bash - cargo test --lib]",
    ];
    for text in useful {
        assert!(!is_tool_invocation_noise(text));
    }
}
14211
#[test]
fn is_tool_invocation_noise_detects_tool_markers() {
    // Named tool markers, with or without a description, are kept.
    let kept = [
        "[Tool: Bash]",
        "[Tool: Read]",
        "[Tool: Bash - Check status]",
        " [Tool: Write - description] ",
    ];
    for text in kept {
        assert!(!is_tool_invocation_noise(text));
    }

    // A marker with no tool name at all is noise.
    assert!(is_tool_invocation_noise("[Tool:]"));
}
14225
#[test]
fn deduplicate_hits_removes_exact_dupes() {
    let stronger = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same hit again, differing only in snippet text and a lower score.
    let mut weaker = stronger.clone();
    weaker.snippet = "snip2".into();
    weaker.score = 0.5;

    let deduped = deduplicate_hits(vec![stronger, weaker]);

    // One hit survives, and it is the higher-scored first one.
    assert_eq!(deduped.len(), 1);
    let survivor = &deduped[0];
    assert_eq!(survivor.score, 1.0);
    assert_eq!(survivor.title, "title1");
}
14272
#[test]
fn deduplicate_hits_keeps_higher_score() {
    // The lower-scored duplicate comes first in the input this time.
    let weaker = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 0.3,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let mut stronger = weaker.clone();
    stronger.snippet = "snip2".into();
    stronger.score = 0.9;

    let deduped = deduplicate_hits(vec![weaker, stronger]);

    // The survivor must carry the higher score regardless of input order.
    assert_eq!(deduped.len(), 1);
    let survivor = &deduped[0];
    assert_eq!(survivor.score, 0.9);
    assert_eq!(survivor.title, "title1");
}
14319
#[test]
fn deduplicate_hits_keeps_repeated_same_content_at_different_lines() {
    let line_one = SearchHit {
        title: "Shared Session".into(),
        snippet: String::new(),
        content: "repeat me".into(),
        content_hash: stable_content_hash("repeat me"),
        score: 10.0,
        source_path: "/shared/session.jsonl".into(),
        agent: "codex".into(),
        workspace: "/ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Identical text in the same file, but at another line and timestamp.
    let line_two = SearchHit {
        line_number: Some(2),
        created_at: Some(200),
        score: 9.0,
        ..line_one.clone()
    };

    // Both occurrences must be kept.
    let deduped = deduplicate_hits(vec![line_one, line_two]);
    assert_eq!(deduped.len(), 2);
}
14348
#[test]
fn deduplicate_hits_keeps_distinct_conversation_ids_with_same_title_path_and_content() {
    // Two hits that match in title, path and content but belong to different
    // conversations.
    let in_conversation_one = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: "identical body".into(),
        content_hash: stable_content_hash("identical body"),
        conversation_id: Some(1),
        ..make_test_hit("same", 1.0)
    };
    let in_conversation_two = SearchHit {
        conversation_id: Some(2),
        score: 0.9,
        ..in_conversation_one.clone()
    };

    let deduped = deduplicate_hits(vec![in_conversation_one, in_conversation_two]);

    // Neither may be collapsed into the other.
    assert_eq!(deduped.len(), 2);
    for id in [1, 2] {
        assert!(deduped.iter().any(|hit| hit.conversation_id == Some(id)));
    }
}
14367
/// Two hits with the same conversation id, path and content but different
/// titles are expected to collapse into a single result.
#[test]
fn deduplicate_hits_coalesces_same_conversation_id_despite_title_drift() {
    let morning = {
        let mut hit = make_test_hit("same", 1.0);
        hit.title = "Morning Session".into();
        hit.source_path = "/shared/session.jsonl".into();
        hit.content = "identical body".into();
        hit.content_hash = stable_content_hash("identical body");
        hit.conversation_id = Some(7);
        hit
    };
    // Only the title and the score drift; the conversation id is unchanged.
    let evening = {
        let mut hit = morning.clone();
        hit.title = "Evening Session".into();
        hit.score = 0.9;
        hit
    };

    let deduped = deduplicate_hits(vec![morning, evening]);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0].conversation_id, Some(7));
}
14385
/// With no conversation id set, hits that share source path, line and content
/// but have different titles are expected to remain distinct.
#[test]
fn deduplicate_hits_keeps_distinct_titles_with_same_source_path_and_content() {
    let morning = SearchHit {
        title: "Morning Session".into(),
        snippet: "snip1".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 0.9,
        source_path: "shared.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: None,
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let evening = SearchHit {
        title: "Evening Session".into(),
        snippet: "snip2".into(),
        score: 0.8,
        ..morning.clone()
    };

    let deduped = deduplicate_hits(vec![morning, evening]);
    assert_eq!(deduped.len(), 2);
    for expected_title in ["Morning Session", "Evening Session"] {
        assert!(deduped.iter().any(|hit| hit.title == expected_title));
    }
}
14432
/// Hits whose content differs only in whitespace are expected to be treated
/// as duplicates.
///
/// The first hit's content contains a run of spaces while the second is
/// single-spaced. With byte-identical contents this test would pass without
/// exercising any whitespace handling at all.
#[test]
fn deduplicate_hits_normalizes_whitespace() {
    let spaced_out = "hello    world";
    let single_spaced = "hello world";

    let first = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: spaced_out.into(),
        content_hash: stable_content_hash(spaced_out),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same hit with normal spacing, a different snippet and a lower score.
    let second = SearchHit {
        snippet: "snip2".into(),
        content: single_spaced.into(),
        content_hash: stable_content_hash(single_spaced),
        score: 0.5,
        ..first.clone()
    };

    let deduped = deduplicate_hits(vec![first, second]);
    assert_eq!(
        deduped.len(),
        1,
        "content differing only in whitespace should dedupe"
    );
}
14477
/// A hit whose `source_id` is blank is expected to dedupe against an
/// otherwise identical hit whose `source_id` is `"local"`, and the surviving
/// hit must report `"local"`.
#[test]
fn deduplicate_hits_normalizes_blank_local_source_id() {
    let explicit_local = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same hit, lower score, with a whitespace-only source id.
    let blank_source = SearchHit {
        snippet: "snip2".into(),
        score: 0.5,
        source_id: " ".into(),
        ..explicit_local.clone()
    };

    let deduped = deduplicate_hits(vec![explicit_local, blank_source]);
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0].source_id, "local");
}
14523
/// A hit whose content is just `[Tool:]` is expected to be dropped, leaving
/// only the hit that carries real content.
#[test]
fn deduplicate_hits_filters_tool_noise() {
    let tool_noise = SearchHit {
        title: "title1".into(),
        snippet: "snip1".into(),
        content: "[Tool:]".into(),
        content_hash: stable_content_hash("[Tool:]"),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let real_text = "This is real content about testing";
    let real_hit = SearchHit {
        title: "title2".into(),
        snippet: "snip2".into(),
        content: real_text.into(),
        content_hash: stable_content_hash(real_text),
        score: 0.5,
        source_path: "b.jsonl".into(),
        created_at: Some(200),
        ..tool_noise.clone()
    };

    let deduped = deduplicate_hits(vec![tool_noise, real_hit]);
    assert_eq!(deduped.len(), 1);
    let survivor = &deduped[0];
    assert!(survivor.content.contains("real content"));
}
14569
/// With the query `"authentication"`, a bare `"Acknowledged."` hit is expected
/// to be filtered out while the substantive hit is kept.
#[test]
fn deduplicate_hits_filters_acknowledgement_noise() {
    let ack_hit = SearchHit {
        title: "ack".into(),
        snippet: "ack".into(),
        content: "Acknowledged.".into(),
        content_hash: stable_content_hash("Acknowledged."),
        score: 1.0,
        source_path: "ack.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let real_text = "Authentication refresh logic changed";
    let real_hit = SearchHit {
        title: "real".into(),
        snippet: "real".into(),
        content: real_text.into(),
        content_hash: stable_content_hash(real_text),
        score: 0.5,
        source_path: "real.jsonl".into(),
        created_at: Some(200),
        ..ack_hit.clone()
    };

    let deduped = deduplicate_hits_with_query(vec![ack_hit, real_hit], "authentication");
    assert_eq!(deduped.len(), 1);
    assert_eq!(deduped[0].title, "real");
}
14615
/// A hit whose content is an `AGENTS.md` instruction preamble is dropped for
/// the query `"coding assistant"` but kept for `"AGENTS.md instructions"`.
#[test]
fn deduplicate_hits_hides_system_prompts_unless_query_requests_them() {
    let prompt_text = "# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly.";
    let prompt_hit = SearchHit {
        title: "prompt".into(),
        snippet: "prompt".into(),
        content: prompt_text.into(),
        content_hash: stable_content_hash(prompt_text),
        score: 1.0,
        source_path: "prompt.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    // The query matches words inside the prompt, but the prompt is still hidden.
    let hidden = deduplicate_hits_with_query(vec![prompt_hit.clone()], "coding assistant");
    assert!(hidden.is_empty());

    // The query names the prompt itself, so the hit is surfaced.
    let kept = deduplicate_hits_with_query(vec![prompt_hit], "AGENTS.md instructions");
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].title, "prompt");
}
14649
/// Three hits with different titles, contents, paths and timestamps must all
/// survive deduplication.
#[test]
fn deduplicate_hits_preserves_unique_content() {
    // (title, snippet, content, score, source_path, created_at)
    let fixtures = [
        ("title1", "snip1", "first message", 1.0, "a.jsonl", 100),
        ("title2", "snip2", "second message", 0.8, "b.jsonl", 200),
        ("title3", "snip3", "third message", 0.6, "c.jsonl", 300),
    ];
    let hits: Vec<SearchHit> = fixtures
        .iter()
        .map(
            |&(title, snippet, content, score, source_path, created_at)| SearchHit {
                title: title.into(),
                snippet: snippet.into(),
                content: content.into(),
                content_hash: stable_content_hash(content),
                score,
                source_path: source_path.into(),
                agent: "agent".into(),
                workspace: "ws".into(),
                workspace_original: None,
                created_at: Some(created_at),
                line_number: None,
                match_type: MatchType::Exact,
                source_id: "local".into(),
                origin_kind: "local".into(),
                origin_host: None,
                conversation_id: None,
            },
        )
        .collect();

    let deduped = deduplicate_hits(hits);
    assert_eq!(deduped.len(), 3);
}
14712
/// Identical content from a local source and from an SSH source must not be
/// merged: both source ids have to be present in the output.
#[test]
fn deduplicate_hits_respects_source_boundaries() {
    let local_hit = SearchHit {
        title: "local title".into(),
        snippet: "snip".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        score: 1.0,
        source_path: "a.jsonl".into(),
        agent: "agent".into(),
        workspace: "ws".into(),
        workspace_original: None,
        created_at: Some(100),
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    // Same content, but attributed to a different source reached over ssh.
    let remote_hit = SearchHit {
        title: "remote title".into(),
        score: 0.9,
        source_path: "b.jsonl".into(),
        created_at: Some(200),
        source_id: "work-laptop".into(),
        origin_kind: "ssh".into(),
        origin_host: Some("work-laptop.local".into()),
        ..local_hit.clone()
    };

    let deduped = deduplicate_hits(vec![local_hit, remote_hit]);
    assert_eq!(
        deduped.len(),
        2,
        "same content from different sources should not dedupe"
    );
    for expected_source in ["local", "work-laptop"] {
        assert!(deduped.iter().any(|h| h.source_id == expected_source));
    }
}
14765
/// Table-driven check of `should_try_wildcard_fallback`.
///
/// The column names below are read off the assertion messages (hit count,
/// limit, offset, sparse threshold) — confirm against the function signature.
#[test]
fn wildcard_fallback_sparse_check_uses_effective_limit() {
    // (hit_count, limit, offset, threshold, expected, reason)
    let cases = [
        (
            1,
            1,
            0,
            3,
            false,
            "a filled one-result page is not sparse for fallback purposes",
        ),
        (
            2,
            2,
            0,
            3,
            false,
            "a filled two-result page is not sparse for fallback purposes",
        ),
        (
            0,
            1,
            0,
            3,
            true,
            "zero hits should still trigger fallback even for tiny pages",
        ),
        (
            1,
            2,
            0,
            3,
            true,
            "a partially filled page should still trigger fallback",
        ),
        (
            0,
            5,
            10,
            3,
            false,
            "pagination should not trigger wildcard fallback",
        ),
        (
            1,
            0,
            0,
            3,
            true,
            "limit zero preserves the legacy sparse-threshold semantics",
        ),
    ];

    for (hit_count, limit, offset, threshold, expected, reason) in cases {
        assert_eq!(
            should_try_wildcard_fallback(hit_count, limit, offset, threshold),
            expected,
            "{reason}"
        );
    }
}
14793
/// `snippet_from_preview_without_full_content` yields a highlighted snippet
/// only for a snippet-only field mask whose preview contains the query; a
/// full mask or a preview without the query yields `None`.
#[test]
fn snippet_preview_fast_path_requires_snippet_only_match() {
    let snippet_only = FieldMask::new(false, true, false, false);
    let preview_with_match = "migration checks the database constraint before writing";
    let preview_without_match = "migration checks constraints before writing";
    let query = "database";

    // Snippet-only mask and a preview containing the query: fast path applies.
    let snippet =
        snippet_from_preview_without_full_content(snippet_only, preview_with_match, query)
            .expect("preview should satisfy a snippet-only request when it contains the query");
    assert!(snippet.contains("**database**"));

    // A full mask never takes the fast path, even when the preview matches.
    let full_mask_result =
        snippet_from_preview_without_full_content(FieldMask::FULL, preview_with_match, query);
    assert!(
        full_mask_result.is_none(),
        "full-content requests must keep the sqlite hydration path"
    );

    // Snippet-only mask, but the preview does not contain the query.
    let missing_match_result =
        snippet_from_preview_without_full_content(snippet_only, preview_without_match, query);
    assert!(
        missing_match_result.is_none(),
        "snippet-only requests hydrate when the preview cannot show the match"
    );
}
14824
/// Indexes five conversations that all contain "apple" and checks that a
/// search for "apple" returns at least three hits without reporting a
/// wildcard fallback.
#[test]
fn search_with_fallback_returns_exact_when_sufficient() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    for n in 0..5 {
        let timestamp = 100 + n;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: format!("apple fruit number {n} is delicious and healthy"),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: dir.path().join(format!("{n}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "apple",
        SearchFilters::default(),
        10,
        0,
        3,
        FieldMask::FULL,
    )?;

    assert!(!outcome.wildcard_fallback);
    assert!(outcome.hits.len() >= 3);
    Ok(())
}
14874
/// Indexes one conversation containing "configuration" and searches for the
/// shorter "config".
///
/// Only a non-empty hit list is asserted; the `wildcard_fallback` flag is not
/// checked by this test.
#[test]
fn search_with_fallback_triggers_on_sparse_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "configuration management system".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("substring test".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "config",
        SearchFilters::default(),
        10,
        0,
        5,
        FieldMask::FULL,
    )?;

    assert!(!outcome.hits.is_empty());

    Ok(())
}
14922
/// A query that already contains wildcards (`*test*`) must not be reported
/// as a wildcard fallback.
#[test]
fn search_with_fallback_skips_when_query_has_wildcards() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "testing data".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "*test*",
        SearchFilters::default(),
        10,
        0,
        10,
        FieldMask::FULL,
    )?;

    assert!(!outcome.wildcard_fallback);
    Ok(())
}
14966
/// Indexes two conversations that contain "alphabet" and searches for the
/// inner fragment "bet". The result must be flagged as a wildcard fallback
/// and contain both documents as `ImplicitWildcard` matches.
#[test]
fn search_with_fallback_prefers_wildcards_when_they_add_hits() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let bodies = [
        "alphabet soup for coders",
        "mapping the alphabet city blocks",
    ];
    for (n, text) in (0_i64..).zip(bodies) {
        let timestamp = 100 + n;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: text.to_string(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("alpha-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: dir.path().join(format!("alpha-{n}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        "bet",
        SearchFilters::default(),
        10,
        0,
        2,
        FieldMask::FULL,
    )?;

    assert!(
        outcome.wildcard_fallback,
        "should switch to wildcard fallback when it yields more hits"
    );
    assert_eq!(
        outcome.hits.len(),
        2,
        "fallback should surface all alphabet docs"
    );
    for hit in &outcome.hits {
        assert!(hit.match_type == MatchType::ImplicitWildcard);
        assert!(hit.content.contains("alphabet"));
    }

    Ok(())
}
15035
/// Against a one-document index ("apple pear banana"):
/// - the long token "zzzzzzunlikelyterm" yields no hits and no wildcard
///   fallback, while a `WildcardQuery` suggestion is still offered;
/// - the short fragment "pple" does fall back and finds the document as an
///   `ImplicitWildcard` match.
#[test]
fn automatic_wildcard_fallback_skips_long_zero_hit_token() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: dir.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    // Both searches share every argument except the query text.
    let run = |query: &str| {
        client.search_with_fallback(
            query,
            SearchFilters::default(),
            10,
            0,
            1,
            FieldMask::FULL,
        )
    };

    let long_result = run("zzzzzzunlikelyterm")?;
    assert!(long_result.hits.is_empty());
    assert!(!long_result.wildcard_fallback);
    let offers_wildcard = long_result
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_wildcard,
        "manual wildcard suggestion should remain available"
    );

    let short_result = run("pple")?;
    assert!(short_result.wildcard_fallback);
    assert_eq!(short_result.hits.len(), 1);
    assert_eq!(short_result.hits[0].match_type, MatchType::ImplicitWildcard);

    Ok(())
}
15098
/// With both an index and a database path configured, a zero-hit search must
/// leave the client's `sqlite` slot empty, still offer a `WildcardQuery`
/// suggestion, and offer no `AlternateAgent` suggestion.
#[test]
fn nohit_suggestions_do_not_lazy_open_sqlite_when_tantivy_is_present() -> Result<()> {
    let dir = TempDir::new()?;
    let index_path = dir.path().join("index");
    let db_path = dir.path().join("cass.db");

    // Open and close the storage once before handing its path to the client.
    let storage = FrankenStorage::open(&db_path)?;
    storage.close()?;

    let mut index = TantivyIndex::open_or_create(&index_path)?;
    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: dir.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("index present");
    // True only when the lock is obtained and the slot holds `None`.
    let sqlite_is_closed = || {
        client
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };
    assert!(sqlite_is_closed(), "sqlite should start closed");

    let outcome = client.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;

    assert!(outcome.hits.is_empty());
    let offers_wildcard = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_wildcard,
        "manual wildcard suggestion should remain available"
    );
    let offers_alternate_agent = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::AlternateAgent));
    assert!(
        !offers_alternate_agent,
        "alternate-agent suggestions should not force a SQLite open"
    );
    assert!(
        sqlite_is_closed(),
        "sqlite should stay closed after Tantivy no-hit suggestions"
    );

    Ok(())
}
15177
/// A client built with neither an index reader nor a database returns no
/// hits, does not flag a wildcard fallback, and suggests the query `*ghost*`.
#[test]
fn search_with_fallback_emits_wildcard_suggestion_on_zero_hits() -> Result<()> {
    // Hand-assembled client with every backend slot left empty.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let outcome = client.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        0,
        3,
        FieldMask::FULL,
    )?;

    assert!(
        outcome.hits.is_empty(),
        "no index/db means no hits should be returned"
    );
    assert!(
        !outcome.wildcard_fallback,
        "with zero baseline and fallback hits, we should keep baseline and mark fallback=false"
    );

    let suggested_query = outcome
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
        .expect("should suggest adding wildcards")
        .suggested_query
        .as_deref();
    assert_eq!(suggested_query, Some("*ghost*"));

    Ok(())
}
15224
/// A whitespace-only query against a populated index must not be reported as
/// a wildcard fallback.
#[test]
fn search_with_fallback_skips_empty_query() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "testing data".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let outcome = client.search_with_fallback(
        " ",
        SearchFilters::default(),
        10,
        0,
        10,
        FieldMask::FULL,
    )?;

    assert!(!outcome.wildcard_fallback);
    Ok(())
}
15268
/// With a non-zero offset (10) the search must not flag a wildcard fallback,
/// yet the `*ghost*` wildcard suggestion is still produced.
#[test]
fn search_with_fallback_skips_for_nonzero_offset() -> Result<()> {
    // Hand-assembled client with every backend slot left empty.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let outcome = client.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        10,
        3,
        FieldMask::FULL,
    )?;

    assert!(
        !outcome.wildcard_fallback,
        "fallback should not run on paginated searches"
    );

    let suggested_query = outcome
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
        .expect("wildcard suggestion present")
        .suggested_query
        .as_deref();
    assert_eq!(suggested_query, Some("*ghost*"));

    Ok(())
}
15312
/// Searching "claud" with an agent filter on a backend-less client must give
/// exactly three suggestions, numbered 1..=3 via `shortcut`, covering the
/// `WildcardQuery`, `RemoveFilter` and `SpellingFix` kinds.
#[test]
fn generate_suggestions_limits_and_sets_shortcuts() -> Result<()> {
    // Hand-assembled client with every backend slot left empty.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let filters = {
        let mut filters = SearchFilters::default();
        filters.agents.insert("codex".into());
        filters
    };
    let outcome = client.search_with_fallback("claud", filters, 5, 0, 3, FieldMask::FULL)?;
    let suggestions = &outcome.suggestions;

    assert_eq!(suggestions.len(), 3, "should truncate to 3 suggestions");
    // Shortcuts count up from 1 in list order.
    for (expected_shortcut, suggestion) in (1u8..).zip(suggestions.iter()) {
        assert_eq!(
            suggestion.shortcut,
            Some(expected_shortcut),
            "shortcut should match position (1-based)"
        );
    }

    let offers_wildcard = suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(offers_wildcard, "should suggest wildcard search");

    let offers_filter_removal = suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::RemoveFilter));
    assert!(offers_filter_removal, "should suggest removing agent filter");

    let offers_spelling_fix = suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::SpellingFix));
    assert!(
        offers_spelling_fix,
        "should suggest spelling fix for nearby agent name"
    );

    Ok(())
}
15377
/// Stores one conversation each for the agents `claude_code` and `codex`,
/// then runs a search for "ghost" and checks that the `AlternateAgent`
/// suggestions name both agents in their suggested filters.
#[test]
fn generate_suggestions_includes_recent_alternate_agents() -> Result<()> {
    let dir = TempDir::new()?;
    let db_path = dir.path().join("cass.db");
    let storage = FrankenStorage::open(&db_path)?;
    let workspace_id = storage.ensure_workspace(dir.path(), None)?;
    let base_ts = 1_700_000_010_000_i64;

    for (offset, slug) in (0_i64..).zip(["claude_code", "codex"]) {
        let stamp = base_ts + offset;
        let agent_id = storage.ensure_agent(&Agent {
            id: None,
            slug: slug.to_string(),
            name: slug.to_string(),
            version: None,
            kind: AgentKind::Cli,
        })?;
        let message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(stamp),
            content: format!("content from {slug}"),
            extra_json: json!({}),
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: slug.to_string(),
            workspace: Some(dir.path().to_path_buf()),
            external_id: Some(format!("alt-agent-{offset}")),
            title: Some(format!("alternate agent {offset}")),
            source_path: dir.path().join(format!("{slug}.jsonl")),
            started_at: Some(stamp),
            ended_at: Some(stamp),
            approx_tokens: Some(8),
            metadata_json: json!({}),
            messages: vec![message],
            source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    }
    // Release the storage handle before the client is opened on the same path.
    drop(storage);

    let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
    let outcome = client.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        0,
        3,
        FieldMask::FULL,
    )?;

    // Gather every agent named by an AlternateAgent suggestion's filters.
    let alternate_agents: HashSet<String> = outcome
        .suggestions
        .iter()
        .filter(|suggestion| matches!(suggestion.kind, SuggestionKind::AlternateAgent))
        .filter_map(|suggestion| suggestion.suggested_filters.as_ref())
        .flat_map(|filters| filters.agents.iter().cloned())
        .collect();

    for slug in ["claude_code", "codex"] {
        assert!(
            alternate_agents.contains(slug),
            "should suggest {slug} from normalized conversations schema"
        );
    }

    Ok(())
}
15452
/// Queries made of word characters plus leading and/or trailing `*` must come
/// back from `fs_cass_sanitize_query` unchanged.
#[test]
fn sanitize_query_preserves_wildcards() {
    for query in ["*foo*", "foo*", "*bar", "*config*"] {
        assert_eq!(fs_cass_sanitize_query(query), query);
    }
}
15461
/// `.`, `+` and `_` are replaced by whitespace while `-` is preserved.
///
/// NOTE(review): the `c++` expectation has a single trailing space; if the
/// sanitizer emits one space per stripped character this would need two —
/// confirm against `fs_cass_sanitize_query`.
#[test]
fn sanitize_query_strips_other_special_chars() {
    // (raw query, expected sanitized form)
    let cases = [
        ("foo.bar", "foo bar"),
        ("c++", "c "),
        ("foo-bar", "foo-bar"),
        ("test_case", "test case"),
    ];
    for (raw, expected) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), expected);
    }
}
15470
/// Pins the sanitizer's output when wildcards are mixed with other
/// punctuation. Each row is `(raw query, expected sanitized query)`.
#[test]
fn sanitize_query_combined() {
    let cases = [
        ("*foo.bar*", "*foo bar*"),
        ("test-*", "test-*"),
        ("*c++*", "*c *"),
    ];
    for (raw, sanitized) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), sanitized);
    }
}
15478
/// A query of plain whitespace-separated words parses into one `Term` token
/// per word, in order.
#[test]
fn parse_boolean_query_simple_terms() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query("foo bar baz");

    assert_eq!(parsed, vec![term("foo"), term("bar"), term("baz")]);
}
15488
/// Both the `AND` keyword and the `&&` symbol produce an `And` token between
/// the two operand tokens.
#[test]
fn parse_boolean_query_and_operator() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    // Keyword spelling: the full token sequence is pinned.
    let keyword_form = fs_cass_parse_boolean_query("foo AND bar");
    assert_eq!(
        keyword_form,
        vec![term("foo"), FsCassQueryToken::And, term("bar")]
    );

    // Symbol spelling: only the length and the operator position are pinned.
    let symbol_form = fs_cass_parse_boolean_query("foo && bar");
    assert_eq!(symbol_form.len(), 3);
    assert_eq!(symbol_form[1], FsCassQueryToken::And);
}
15502
/// Both the `OR` keyword and the `||` symbol produce an `Or` token between
/// the two operand tokens.
#[test]
fn parse_boolean_query_or_operator() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    // Keyword spelling: the full token sequence is pinned.
    let keyword_form = fs_cass_parse_boolean_query("foo OR bar");
    assert_eq!(
        keyword_form,
        vec![term("foo"), FsCassQueryToken::Or, term("bar")]
    );

    // Symbol spelling: only the length and the operator position are pinned.
    let symbol_form = fs_cass_parse_boolean_query("foo || bar");
    assert_eq!(symbol_form.len(), 3);
    assert_eq!(symbol_form[1], FsCassQueryToken::Or);
}
15516
/// The `NOT` keyword between two words parses to `Term`, `Not`, `Term`.
#[test]
fn parse_boolean_query_not_operator() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query("foo NOT bar");

    assert_eq!(
        parsed,
        vec![term("foo"), FsCassQueryToken::Not, term("bar")]
    );
}
15525
/// A double-quoted section becomes a single `Phrase` token (quotes removed),
/// surrounded by the ordinary `Term` tokens on either side.
#[test]
fn parse_boolean_query_quoted_phrase() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query(r#"foo "exact phrase" bar"#);

    let expected = vec![
        term("foo"),
        FsCassQueryToken::Phrase("exact phrase".to_string()),
        term("bar"),
    ];
    assert_eq!(parsed, expected);
}
15537
/// A query mixing `OR`, `NOT`, and a quoted phrase yields the five expected
/// tokens in source order.
#[test]
fn parse_boolean_query_complex() {
    let term = |text: &str| FsCassQueryToken::Term(text.to_string());

    let parsed = fs_cass_parse_boolean_query(r#"error OR warning NOT "false positive""#);

    let expected = vec![
        term("error"),
        FsCassQueryToken::Or,
        term("warning"),
        FsCassQueryToken::Not,
        FsCassQueryToken::Phrase("false positive".to_string()),
    ];
    assert_eq!(parsed, expected);
}
15551
/// Plain words are not reported as boolean queries; keyword operators, symbol
/// operators, and a quoted phrase all are.
#[test]
fn has_boolean_operators_detection() {
    assert!(!fs_cass_has_boolean_operators("foo bar"));

    let boolean_queries = [
        "foo AND bar",
        "foo OR bar",
        "foo NOT bar",
        r#""exact phrase""#,
        "foo && bar",
        "foo || bar",
    ];
    for query in boolean_queries {
        assert!(fs_cass_has_boolean_operators(query));
    }
}
15562
/// Lower-case `and` / `or` / `not` are recognised as operators: the query
/// yields seven tokens with the operators at the odd positions.
#[test]
fn parse_boolean_query_case_insensitive_operators() {
    let parsed = fs_cass_parse_boolean_query("foo and bar or baz not qux");
    assert_eq!(parsed.len(), 7);

    let operators_by_position = [
        (1, FsCassQueryToken::And),
        (3, FsCassQueryToken::Or),
        (5, FsCassQueryToken::Not),
    ];
    for (position, operator) in operators_by_position {
        assert_eq!(parsed[position], operator);
    }
}
15572
/// Wildcard markers stay attached to their terms when the query also
/// contains a boolean operator.
#[test]
fn parse_boolean_query_with_wildcards() {
    let parsed = fs_cass_parse_boolean_query("*config* OR env*");

    let expected = vec![
        FsCassQueryToken::Term("*config*".to_string()),
        FsCassQueryToken::Or,
        FsCassQueryToken::Term("env*".to_string()),
    ];
    assert_eq!(parsed, expected);
}
15581
/// Checks where a lexical hit's `content` comes from when the same
/// conversation lives in both the SQLite store and the Tantivy index:
///
/// * a `FieldMask::FULL` search for the long message returns text that lies
///   beyond its padding prefix, and the snippet still mentions the term;
/// * a search with a 200-character preview limit returns the padding prefix
///   and not the later text;
/// * a `FieldMask::FULL` search for the short message returns it verbatim
///   while the client's `sqlite` slot stays `None` before and after.
#[test]
fn tantivy_search_hydrates_long_content_when_content_field_is_not_stored() -> Result<()> {
    let dir = TempDir::new()?;
    let db_path = dir.path().join("cass.db");
    let index_path = dir.path().join("search-index");

    // The long message only mentions "needle" after 70 repetitions of padding.
    let long_content = format!(
        "{}needle appears past the preview boundary for hydration proof",
        "padding ".repeat(70)
    );
    let short_content = String::from("shortneedle fits entirely inside the stored preview");

    // --- Seed the SQLite store, then close it before any client is opened. ---
    let storage = FrankenStorage::open(&db_path)?;
    let workspace_id = storage.ensure_workspace(dir.path(), None)?;
    let agent_id = storage.ensure_agent(&Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    })?;
    let stored_user_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(1_700_000_123_000),
        content: long_content.clone(),
        extra_json: json!({}),
        snippets: Vec::new(),
    };
    let stored_agent_message = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(1_700_000_124_000),
        content: short_content.clone(),
        extra_json: json!({}),
        snippets: Vec::new(),
    };
    let conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(dir.path().to_path_buf()),
        external_id: Some("hydrate-long-content".into()),
        title: Some("hydrated lexical doc".into()),
        source_path: dir.path().join("hydrate.jsonl"),
        started_at: Some(1_700_000_123_000),
        ended_at: Some(1_700_000_123_000),
        approx_tokens: Some(32),
        metadata_json: json!({}),
        messages: vec![stored_user_message, stored_agent_message],
        source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
        origin_host: None,
    };
    storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    storage.close()?;

    // --- Index the same two messages in Tantivy. ---
    let mut index = TantivyIndex::open_or_create(&index_path)?;
    let indexed_user_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("user".into()),
        created_at: Some(1_700_000_123_000),
        content: long_content.clone(),
        extra: json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let indexed_agent_message = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("assistant".into()),
        created_at: Some(1_700_000_124_000),
        content: short_content.clone(),
        extra: json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let normalized = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("hydrate-long-content".into()),
        title: Some("hydrated lexical doc".into()),
        workspace: Some(dir.path().to_path_buf()),
        source_path: dir.path().join("hydrate.jsonl"),
        started_at: Some(1_700_000_123_000),
        ended_at: Some(1_700_000_123_000),
        metadata: json!({}),
        messages: vec![indexed_user_message, indexed_agent_message],
    };
    index.add_conversation(&normalized)?;
    index.commit()?;

    // --- Full-content search for the long message. ---
    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
    let full_hits = client.search("needle", SearchFilters::default(), 5, 0, FieldMask::FULL)?;

    assert_eq!(full_hits.len(), 1, "expected one lexical hit");
    let full_hit = &full_hits[0];
    assert_eq!(full_hit.title, "hydrated lexical doc");
    assert!(
        full_hit
            .content
            .contains("needle appears past the preview boundary"),
        "lexical hit should hydrate full content from sqlite when Tantivy content is not stored"
    );
    assert!(
        full_hit.snippet.to_lowercase().contains("needle"),
        "snippet should still be rendered from hydrated content"
    );

    // --- Same query, but content capped at a 200-character preview. ---
    let preview_mask = FieldMask::FULL.with_preview_content_limit(Some(200));
    let bounded_hits = client.search("needle", SearchFilters::default(), 5, 0, preview_mask)?;

    assert_eq!(bounded_hits.len(), 1, "expected one lexical hit");
    let bounded_hit = &bounded_hits[0];
    assert!(
        bounded_hit.content.starts_with("padding padding"),
        "bounded content may be served from the stored preview prefix"
    );
    assert!(
        !bounded_hit
            .content
            .contains("needle appears past the preview boundary"),
        "bounded preview content should not hydrate the full sqlite row"
    );

    // --- Fresh client: the short message must not cause sqlite to be opened. ---
    // A failed lock is reported as "not closed" so the assertions below fail.
    let sqlite_is_closed = |candidate: &SearchClient| {
        candidate
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };

    let short_client =
        SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
    assert!(
        sqlite_is_closed(&short_client),
        "sqlite should start closed for short preview hit"
    );

    let short_hits = short_client.search(
        "shortneedle",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;

    assert_eq!(short_hits.len(), 1, "expected one short lexical hit");
    assert_eq!(
        short_hits[0].content, short_content,
        "untruncated stored preview is exact full content"
    );
    assert!(
        sqlite_is_closed(&short_client),
        "short full-content hit should not lazy-open sqlite"
    );

    Ok(())
}
15753
/// An agent filter must restrict hits to that agent, on the first search and
/// on an identical repeat search.
///
/// Two conversations share the term `findme` but belong to different agents.
/// The repeat search is also required to return the same number of hits as
/// the first one; without that check its per-hit loop would pass vacuously if
/// the repeat returned nothing.
#[test]
fn filter_fidelity_agent_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, title, file name, timestamp, message content)
    let docs = [
        ("codex", "alpha doc", "a.jsonl", 100, "hello world findme alpha"),
        ("claude", "beta doc", "b.jsonl", 200, "hello world findme beta"),
    ];
    for (agent, title, file, ts, content) in docs.iter() {
        let conv = NormalizedConversation {
            agent_slug: (*agent).into(),
            external_id: None,
            title: Some((*title).into()),
            workspace: None,
            source_path: dir.path().join(*file),
            started_at: Some(*ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(*ts),
                content: (*content).into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters::default();
    filters.agents.insert("codex".into());

    let hits = client.search("findme", filters.clone(), 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        assert_eq!(
            hit.agent, "codex",
            "Agent filter violated: got agent '{}' instead of 'codex'",
            hit.agent
        );
    }
    assert!(!hits.is_empty(), "Should have found results");

    // Repeat the identical search and hold it to the same standard.
    let cached_hits = client.search("findme", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        cached_hits.len(),
        hits.len(),
        "Cached search returned a different number of hits"
    );
    for hit in &cached_hits {
        assert_eq!(hit.agent, "codex", "Cached search violated agent filter");
    }

    Ok(())
}
15832
/// A workspace filter must restrict hits to that workspace, on the first
/// search and on an identical repeat search.
///
/// Two conversations with identical content live in different workspaces.
/// The repeat search is also required to return the same number of hits as
/// the first one; without that check its per-hit loop would pass vacuously if
/// the repeat returned nothing.
#[test]
fn filter_fidelity_workspace_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, workspace path, file name, timestamp)
    let docs = [
        ("ws_a doc", "/workspace/alpha", "a.jsonl", 100),
        ("ws_b doc", "/workspace/beta", "b.jsonl", 200),
    ];
    for (title, ws, file, ts) in docs.iter() {
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some((*title).into()),
            workspace: Some(std::path::PathBuf::from(*ws)),
            source_path: dir.path().join(*file),
            started_at: Some(*ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(*ts),
                content: "workspace test needle".into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters::default();
    filters.workspaces.insert("/workspace/beta".into());

    let hits = client.search("needle", filters.clone(), 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        assert_eq!(
            hit.workspace, "/workspace/beta",
            "Workspace filter violated: got '{}' instead of '/workspace/beta'",
            hit.workspace
        );
    }
    assert!(!hits.is_empty(), "Should have found results");

    // Repeat the identical search and hold it to the same standard.
    let cached_hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        cached_hits.len(),
        hits.len(),
        "Cached search returned a different number of hits"
    );
    for hit in &cached_hits {
        assert_eq!(
            hit.workspace, "/workspace/beta",
            "Cached search violated workspace filter"
        );
    }

    Ok(())
}
15914
/// A `created_from` / `created_to` filter must restrict hits to that window,
/// on the first search and on an identical repeat search.
///
/// Three conversations with identical content are stamped 100, 500, and 900;
/// the filter window is `[400, 600]`, so exactly one hit is expected. The
/// repeat search is also required to return the same number of hits as the
/// first one; without that check its per-hit loop would pass vacuously if the
/// repeat returned nothing.
#[test]
fn filter_fidelity_date_range_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (label used for the title and file name, timestamp)
    let docs = [("early", 100), ("middle", 500), ("late", 900)];
    for (label, ts) in docs.iter() {
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some((*label).into()),
            workspace: None,
            source_path: dir.path().join(format!("{label}.jsonl")),
            started_at: Some(*ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(*ts),
                content: "date range test".into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let filters = SearchFilters {
        created_from: Some(400),
        created_to: Some(600),
        ..Default::default()
    };

    let hits = client.search("range", filters.clone(), 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        // Hits without a timestamp are not checked against the window.
        if let Some(ts) = hit.created_at {
            assert!(
                (400..=600).contains(&ts),
                "Date range filter violated: got ts={ts} outside [400, 600]"
            );
        }
    }
    assert_eq!(hits.len(), 1, "Should find exactly 1 doc in range");

    // Repeat the identical search and hold it to the same standard.
    let cached_hits = client.search("range", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        cached_hits.len(),
        hits.len(),
        "Cached search returned a different number of hits"
    );
    for hit in &cached_hits {
        if let Some(ts) = hit.created_at {
            assert!(
                (400..=600).contains(&ts),
                "Cached search violated date range filter"
            );
        }
    }

    Ok(())
}
16025
/// Agent, workspace, and date-range filters applied together must match only
/// the single document that satisfies all three, on the first search and on
/// an identical repeat search.
#[test]
fn filter_fidelity_combined_filters_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, workspace, timestamp) — only the second row satisfies every filter below.
    let combinations = [
        ("codex", "/ws/prod", 100),
        ("claude", "/ws/prod", 500),
        ("claude", "/ws/dev", 500),
        ("claude", "/ws/prod", 900),
    ];

    for (i, &(agent, ws, ts)) in combinations.iter().enumerate() {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(ts),
            content: "hello world combotest query".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conv = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("combo-{i}")),
            workspace: Some(std::path::PathBuf::from(ws)),
            source_path: dir.path().join(format!("{i}.jsonl")),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters {
        created_from: Some(400),
        created_to: Some(600),
        ..Default::default()
    };
    filters.agents.insert("claude".into());
    filters.workspaces.insert("/ws/prod".into());

    let hits = client.search("combotest", filters.clone(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1, "Combined filter should match exactly 1 doc");

    for hit in hits.iter() {
        assert_eq!(hit.agent, "claude", "Agent filter violated");
        assert_eq!(hit.workspace, "/ws/prod", "Workspace filter violated");
        if let Some(ts) = hit.created_at {
            assert!((400..=600).contains(&ts), "Date filter violated: ts={ts}");
        }
    }

    // The repeat of the same search must report the same number of hits.
    let cached = client.search("combotest", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(cached.len(), 1, "Cached result count mismatch");

    Ok(())
}
16093
/// A conversation whose origin metadata carries a padded, upper-case
/// `source_id` is reported by lexical search with `source_id == "local"` and
/// `origin_kind == "local"`.
#[test]
fn lexical_hits_normalize_trimmed_local_source_metadata() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " LOCAL ",
                "kind": "local"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "trimmed local lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("trimmed local doc".into()),
        workspace: None,
        source_path: dir.path().join("trimmed-local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("trimmed", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.source_id, "local");
    assert_eq!(hit.origin_kind, "local");

    Ok(())
}
16138
/// A conversation whose origin metadata has a blank `source_id`, kind `ssh`,
/// and a host is reported by lexical search with the host as `source_id`,
/// `origin_kind == "remote"`, and the host preserved in `origin_host`.
#[test]
fn lexical_hits_normalize_remote_origin_kind_without_source_id() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " ",
                "kind": "ssh",
                "host": "dev@laptop"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "remote lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("remote lexical doc".into()),
        workspace: None,
        source_path: dir.path().join("remote-lexical.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("remote", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.source_id, "dev@laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
16185
/// A conversation whose origin metadata has a blank `source_id`, a host, and
/// no `kind` at all is still reported by lexical search as remote: the host
/// becomes `source_id`, `origin_kind` is `"remote"`, and `origin_host` is kept.
#[test]
fn lexical_hits_infer_remote_origin_from_host_without_kind() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " ",
                "host": "dev@laptop"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "legacy remote lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("legacy host-only lexical doc".into()),
        workspace: None,
        source_path: dir.path().join("legacy-host-only-lexical.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("legacy", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let hit = &hits[0];
    assert_eq!(hit.source_id, "dev@laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
16231
/// Source filters must only return hits from the requested source.
///
/// A single conversation with empty metadata is indexed, then searched twice:
/// once with `SourceFilter::Local` and once with a `SourceFilter::SourceId`
/// whose value is a padded, upper-case spelling of "local". Both searches
/// must return at least one hit, and every hit must report
/// `source_id == "local"`.
#[test]
fn filter_fidelity_source_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let local_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "source filter test local".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv_local = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("local doc".into()),
        workspace: None,
        source_path: dir.path().join("local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![local_message],
    };
    index.add_conversation(&conv_local)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Runs the shared "source" query with only the source filter varying.
    let search_by_source = |source_filter: SourceFilter| {
        let filters = SearchFilters {
            source_filter,
            ..Default::default()
        };
        client.search("source", filters, 10, 0, FieldMask::FULL)
    };

    let hits = search_by_source(SourceFilter::Local)?;
    for hit in hits.iter() {
        assert_eq!(
            hit.source_id, "local",
            "Source filter violated: got source_id '{}' instead of 'local'",
            hit.source_id
        );
    }
    assert!(!hits.is_empty(), "Should have found local results");

    let hits_id = search_by_source(SourceFilter::SourceId(" LOCAL ".to_string()))?;
    for hit in hits_id.iter() {
        assert_eq!(
            hit.source_id, "local",
            "SourceId filter violated: got '{}' instead of 'local'",
            hit.source_id
        );
    }
    assert!(
        !hits_id.is_empty(),
        "Should have found results for source_id=local"
    );

    Ok(())
}
16305
/// Cache keys must differ whenever the filters differ and match whenever the
/// filters are equal.
///
/// Uses a hand-built `SearchClient` with no reader and no sqlite handle; only
/// `cache_key` is exercised.
#[test]
fn filter_fidelity_cache_key_isolation() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let no_filters = SearchFilters::default();
    let agent_only = {
        let mut f = SearchFilters::default();
        f.agents.insert("codex".into());
        f
    };
    let workspace_only = {
        let mut f = SearchFilters::default();
        f.workspaces.insert("/ws".into());
        f
    };

    let key_empty = client.cache_key("test", &no_filters);
    let key_agent = client.cache_key("test", &agent_only);
    let key_ws = client.cache_key("test", &workspace_only);

    // Pairwise distinct keys for pairwise distinct filters.
    assert_ne!(
        key_empty, key_agent,
        "Empty vs agent filter keys should differ"
    );
    assert_ne!(
        key_empty, key_ws,
        "Empty vs workspace filter keys should differ"
    );
    assert_ne!(
        key_agent, key_ws,
        "Agent vs workspace filter keys should differ"
    );

    // An independently built but equal filter set maps to the same key.
    let agent_only_again = {
        let mut f = SearchFilters::default();
        f.agents.insert("codex".into());
        f
    };
    let key_agent2 = client.cache_key("test", &agent_only_again);
    assert_eq!(key_agent, key_agent2, "Same filter should produce same key");
}
16357
/// Non-ASCII letters and digits must come back from the sanitizer unchanged.
#[test]
fn sanitize_query_preserves_unicode_alphanumeric() {
    for text in ["こんにちは", "café", "日本語123"] {
        assert_eq!(fs_cass_sanitize_query(text), text);
    }
}
16372
/// Pins the sanitizer's output for runs of consecutive punctuation: a run of
/// hyphens and a run of mixed symbols.
#[test]
fn sanitize_query_handles_multiple_consecutive_special_chars() {
    let hyphen_run = fs_cass_sanitize_query("foo---bar");
    assert_eq!(hyphen_run, "foo---bar");

    let symbol_run = fs_cass_sanitize_query("a!@#$%^&()b");
    assert_eq!(symbol_run, "a b");
}
16379
/// A pattern consisting solely of `*` characters parses to an `Exact` pattern
/// holding the empty string.
#[test]
fn wildcard_pattern_empty_after_trim_returns_exact_empty() {
    for stars in ["*", "**", "***"] {
        assert_eq!(
            FsCassWildcardPattern::parse(stars),
            FsCassWildcardPattern::Exact(String::new())
        );
    }
}
16397
/// `Exact` and `Prefix` patterns produce no regex; `Suffix` and `Substring`
/// patterns produce the regex strings pinned below.
#[test]
fn wildcard_pattern_to_regex_generation() {
    let exact = FsCassWildcardPattern::Exact("foo".into());
    let prefix = FsCassWildcardPattern::Prefix("foo".into());
    let suffix = FsCassWildcardPattern::Suffix("foo".into());
    let substring = FsCassWildcardPattern::Substring("foo".into());

    assert!(exact.to_regex().is_none());
    assert!(prefix.to_regex().is_none());

    assert_eq!(suffix.to_regex(), Some(".*foo$".into()));
    assert_eq!(substring.to_regex(), Some(".*foo.*".into()));
}
16414
/// A leading `-` on a word parses as a `Not` token followed by the bare term,
/// both at the start of the query and after another term.
#[test]
fn parse_boolean_query_prefix_minus_not() {
    let term = |text: &str| FsCassQueryToken::Term(text.into());

    // Negation as the very first token.
    let leading = fs_cass_parse_boolean_query("-world");
    assert_eq!(leading, vec![FsCassQueryToken::Not, term("world")]);

    // Negation following a positive term.
    let trailing = fs_cass_parse_boolean_query("hello -world");
    assert_eq!(
        trailing,
        vec![term("hello"), FsCassQueryToken::Not, term("world")]
    );
}
16436
/// An empty pair of quotes contributes no token, whether it is the whole
/// query or sits between two terms.
#[test]
fn parse_boolean_query_empty_quoted_phrase_ignored() {
    let only_quotes = parse_boolean_query("\"\"");
    assert!(only_quotes.is_empty());

    let between_terms = parse_boolean_query("foo \"\" bar");
    let foo_then_bar: QueryTokenList = vec![
        QueryToken::Term("foo".into()),
        QueryToken::Term("bar".into()),
    ];
    assert_eq!(between_terms, foo_then_bar);
}
16449
/// A quote that is never closed still yields a single `Phrase` token holding
/// everything after the opening quote.
#[test]
fn parse_boolean_query_unclosed_quote() {
    let parsed = parse_boolean_query("\"hello world");

    let single_phrase: QueryTokenList = vec![QueryToken::Phrase("hello world".into())];
    assert_eq!(parsed, single_phrase);
}
16457
/// Queries that open with a negation (`NOT x` or `-x`) have no FTS5
/// translation, so the transpiler returns `None`.
#[test]
fn transpile_to_fts5_rejects_leading_unary_not_queries() {
    for query in ["NOT foo", "-foo"] {
        assert_eq!(transpile_to_fts5(query), None);
    }
}
16463
/// Queries that mix `OR` with `NOT` in the two shapes below have no FTS5
/// translation, so the transpiler returns `None`.
#[test]
fn transpile_to_fts5_rejects_or_not_forms_it_cannot_represent() {
    for query in ["foo OR NOT bar", "foo NOT bar OR baz"] {
        assert_eq!(transpile_to_fts5(query), None);
    }
}
16469
/// A dangling `OR` at the start of the query is dropped; the remainder is
/// transpiled as usual. Each row is `(query, expected FTS5 expression)`.
#[test]
fn transpile_to_fts5_ignores_leading_or() {
    let cases = [("OR test", "test"), ("OR foo-bar", "(foo AND bar)")];
    for (query, fts5) in cases {
        assert_eq!(transpile_to_fts5(query).as_deref(), Some(fts5));
    }
}
16478
/// A term containing hyphens and dots is split into its sub-terms and joined
/// with `AND`; a trailing wildcard stays on the last sub-term. Each row is
/// `(query, expected FTS5 expression)`.
#[test]
fn transpile_to_fts5_splits_hyphenated_subterms_for_sqlite_fts() {
    let cases = [
        ("br-123.jsonl", "(br AND 123 AND jsonl)"),
        ("br-123.json*", "(br AND 123 AND json*)"),
    ];
    for (query, fts5) in cases {
        assert_eq!(transpile_to_fts5(query).as_deref(), Some(fts5));
    }
}
16490
/// A binary `a NOT b` query is representable in FTS5 and is kept; a
/// hyphenated right-hand side is expanded into an `AND` group. Each row is
/// `(query, expected FTS5 expression)`.
#[test]
fn transpile_to_fts5_preserves_supported_binary_not() {
    let cases = [
        ("foo NOT bar", "foo NOT bar"),
        ("foo NOT bar-baz", "foo NOT (bar AND baz)"),
    ];
    for (query, fts5) in cases {
        assert_eq!(transpile_to_fts5(query).as_deref(), Some(fts5));
    }
}
16502
/// With no sqlite handle and no sqlite path configured, the FTS5 search path
/// must succeed and return an empty hit list rather than an error.
#[test]
fn search_sqlite_fts5_returns_empty_when_sqlite_is_unavailable() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: false,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "fts5-disabled".to_string(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let missing_index = Path::new("/nonexistent");
    let outcome = client.search_sqlite_fts5(
        missing_index,
        "test query",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    );

    // First the call must not fail, then the result set must be empty.
    assert!(outcome.is_ok(), "disabled FTS5 path should stay non-fatal");
    let found = outcome.unwrap();
    assert!(
        found.is_empty(),
        "unavailable SQLite fallback should keep returning an empty result set"
    );
}
16537
/// Checks that `search_sqlite_fts5` returns a stable prefix as `limit` shrinks.
///
/// The test builds an in-memory SQLite database with six conversations, each
/// holding one message whose content is `rankprobe ` repeated 1..=6 times, and
/// mirrors every message into the `fts_messages` FTS5 table. It then runs the
/// same query with `limit` 6, 3 and 0 and asserts that:
///
/// * `limit = 6` returns all six rows,
/// * `limit = 3` returns exactly the first three keys of the `limit = 6` run,
///   with matching `content` and `title`,
/// * `limit = 0` returns nothing.
#[test]
fn search_sqlite_fts5_rank_and_hydrate_split_preserves_limit_prefix_invariant() -> Result<()> {
    let conn = Connection::open(":memory:")?;
    // Minimal schema: the relational tables plus the FTS5 virtual table that
    // the fixture rows below are inserted into.
    conn.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
         id INTEGER PRIMARY KEY,
         agent_id INTEGER,
         workspace_id INTEGER,
         source_id TEXT,
         origin_host TEXT,
         title TEXT,
         source_path TEXT
         );
         CREATE TABLE messages (
         id INTEGER PRIMARY KEY,
         conversation_id INTEGER,
         idx INTEGER,
         content TEXT,
         created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
         content,
         title,
         agent,
         workspace,
         source_path,
         created_at UNINDEXED,
         message_id UNINDEXED,
         tokenize='porter'
         );",
    )?;
    // Single source / agent / workspace shared by every conversation.
    conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/tmp/k0e5p')")?;

    // `i` runs 0..6 and `repeats` runs 1..=6, so every message has a
    // different number of `rankprobe` occurrences.
    // NOTE(review): presumably this gives each row a distinct FTS5 rank — confirm.
    for (i, repeats) in (1..=6_i64).enumerate() {
        let conv_id = i as i64 + 1;
        let msg_id = (i as i64 + 1) * 10;
        conn.execute_compat(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, \
             origin_host, title, source_path) \
             VALUES(?1, 1, 1, 'local', NULL, ?2, ?3)",
            params![
                conv_id,
                format!("k0e5p-{}", i),
                format!("/tmp/k0e5p/{}.jsonl", i),
            ],
        )?;
        let content = "rankprobe ".repeat(repeats as usize);
        conn.execute_compat(
            "INSERT INTO messages(id, conversation_id, idx, content, created_at) \
             VALUES(?1, ?2, ?3, ?4, ?5)",
            params![
                msg_id,
                conv_id,
                i as i64,
                content.as_str(),
                1_700_000_000_i64 + i as i64
            ],
        )?;
        // The FTS row reuses the message id both as `rowid` and as `message_id`.
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, \
             source_path, created_at, message_id) \
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
            params![
                msg_id,
                content.as_str(),
                format!("k0e5p-{}", i),
                "codex",
                "/tmp/k0e5p",
                format!("/tmp/k0e5p/{}.jsonl", i),
                1_700_000_000_i64 + i as i64,
                msg_id,
            ],
        )?;
    }

    // Client backed only by the SQLite connection built above (`reader` is `None`).
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: false,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:k0e5p"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Projects each hit onto the `(source_path, line_number)` pair used to
    // compare result order between runs.
    fn hit_keys(hits: &[SearchHit]) -> Vec<(String, Option<usize>)> {
        hits.iter()
            .map(|h| (h.source_path.clone(), h.line_number))
            .collect()
    }

    // Reference run: limit equals the number of matching rows.
    let large_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "rankprobe",
        SearchFilters::default(),
        6,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(
        large_hits.len(),
        6,
        "limit=N must return all N candidates when the corpus has exactly N matches"
    );

    // Same query with a smaller limit.
    let small_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "rankprobe",
        SearchFilters::default(),
        3,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(small_hits.len(), 3, "limit=3 must return exactly 3 hits");

    // The smaller result must be a strict prefix of the larger one.
    let large_keys = hit_keys(&large_hits);
    let small_keys = hit_keys(&small_hits);
    assert_eq!(
        small_keys,
        large_keys[..3],
        "limit=3 hit keys MUST be the first 3 of limit=6 hit keys (rank+hydrate \
         split must not re-order or re-filter); small={small_keys:?} \
         large_prefix={:?}",
        &large_keys[..3]
    );

    // `zip` stops at the shorter side, so this compares the three shared positions.
    for (idx, (small, large)) in small_hits.iter().zip(large_hits.iter()).enumerate() {
        assert_eq!(
            small.content, large.content,
            "hit[{idx}] content must agree across limit=3 and limit=6: \
             small={:?} large={:?}",
            small.content, large.content
        );
        assert_eq!(
            small.title, large.title,
            "hit[{idx}] title must agree across limit=3 and limit=6"
        );
    }

    // Boundary case: a zero limit yields no hits.
    let zero_hits = client.search_sqlite_fts5(
        Path::new(":memory:"),
        "rankprobe",
        SearchFilters::default(),
        0,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        zero_hits.is_empty(),
        "limit=0 must return zero hits even though the rank phase has candidates; \
         got {} hits",
        zero_hits.len()
    );

    Ok(())
}
16747
/// Identical inputs, including two empty strings, have an edit distance of zero.
#[test]
fn levenshtein_distance_identical_strings() {
    for text in ["hello", ""] {
        assert_eq!(levenshtein_distance(text, text), 0);
    }
}
16755
/// Distance equals the number of characters that must be inserted.
#[test]
fn levenshtein_distance_insertions() {
    // (from, to, expected distance)
    let cases = [("", "abc", 3), ("cat", "cats", 1)];
    for (from, to, expected) in cases {
        assert_eq!(levenshtein_distance(from, to), expected);
    }
}
16761
/// Distance equals the number of characters that must be deleted.
#[test]
fn levenshtein_distance_deletions() {
    // (from, to, expected distance)
    let cases = [("abc", "", 3), ("cats", "cat", 1)];
    for (from, to, expected) in cases {
        assert_eq!(levenshtein_distance(from, to), expected);
    }
}
16767
/// A single replaced character costs exactly one edit.
#[test]
fn levenshtein_distance_substitutions() {
    // (from, to, expected distance)
    let cases = [("cat", "bat", 1), ("kitten", "sitten", 1)];
    for (from, to, expected) in cases {
        assert_eq!(levenshtein_distance(from, to), expected);
    }
}
16773
/// Textbook pairs whose expected distance is three.
#[test]
fn levenshtein_distance_mixed_operations() {
    // (from, to, expected distance)
    let cases = [("kitten", "sitting", 3), ("saturday", "sunday", 3)];
    for (from, to, expected) in cases {
        assert_eq!(levenshtein_distance(from, to), expected);
    }
}
16779
/// Ordinary prose must not be classified as tool-invocation noise, even when
/// it mentions "Tool" or starts with a `[Tool: ...]` marker followed by a long
/// explanation.
#[test]
fn is_tool_invocation_noise_allows_real_content() {
    let real_messages = [
        "This is a normal message",
        "Let me use the Tool feature to accomplish this task. Here is the implementation...",
        "[Tool: Read] Now here is a lot of useful content that explains the implementation details and provides context for the changes being made to the codebase.",
    ];
    for message in real_messages {
        assert!(!is_tool_invocation_noise(message));
    }
}
16792
/// Short strings that consist only of a tool marker are classified as noise.
#[test]
fn is_tool_invocation_noise_handles_short_tool_markers() {
    for marker in ["[tool: x]", "tool: bash"] {
        assert!(is_tool_invocation_noise(marker));
    }
}
16798
/// `AND` narrows the result set: both documents contain `alpha`, but each
/// conjunction below matches exactly one of them.
#[test]
fn search_boolean_and_filters_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body)
    let fixtures = [
        ("doc1", "1.jsonl", 1, "alpha beta gamma"),
        ("doc2", "2.jsonl", 2, "alpha delta"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, word that must appear in the single surviving hit)
    let expectations = [("alpha AND beta", "gamma"), ("alpha AND delta", "delta")];
    for (query, marker) in expectations {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(hits.len(), 1);
        assert!(hits[0].content.contains(marker));
    }

    Ok(())
}
16877
/// `OR` widens the result set: each document contains only one of the two
/// query terms, and both must be returned.
#[test]
fn search_boolean_or_expands_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body)
    let fixtures = [
        ("doc1", "1.jsonl", 1, "unique xyzzy term"),
        ("doc2", "2.jsonl", 2, "unique plugh term"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let either_term = client.search(
        "xyzzy OR plugh",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(either_term.len(), 2);

    Ok(())
}
16941
/// Negation removes matching documents, both with the `NOT term` keyword and
/// with the `-term` prefix form.
#[test]
fn search_boolean_not_excludes_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body)
    let fixtures = [
        ("doc1", "1.jsonl", 1, "nottest keep this"),
        ("doc2", "2.jsonl", 2, "nottest exclude this"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, failure message if the excluded document leaks through)
    let negations = [
        (
            "nottest NOT exclude",
            "NOT exclude should filter out doc with 'exclude'",
        ),
        (
            "nottest -exclude",
            "Prefix -exclude should filter out doc with 'exclude'",
        ),
    ];
    for (query, failure) in negations {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(hits.len(), 1);
        assert!(!hits[0].content.contains("exclude"), "{failure}");
    }

    Ok(())
}
17024
/// Unquoted terms match in any order, while a quoted phrase only matches the
/// document that contains the words in that exact sequence.
#[test]
fn search_phrase_query_matches_exact_sequence() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body)
    let fixtures = [
        ("doc1", "1.jsonl", 1, "the quick brown fox"),
        ("doc2", "2.jsonl", 2, "the brown quick fox"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Bare terms: both documents contain both words.
    let unordered = client.search(
        "quick brown",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(unordered.len(), 2);

    // Quoted phrase: only the document with the adjacent, ordered words.
    let exact = client.search(
        "\"quick brown\"",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(exact.len(), 1);
    assert!(exact[0].content.contains("quick brown"));

    Ok(())
}
17099
/// Against a document containing `foo bar baz`, the dotted query `foo.bar`
/// finds it while the hyphenated query `foo-bar` finds nothing.
#[test]
fn search_dot_punctuation_splits_terms_but_hyphens_preserve_compound_semantics() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "foo bar baz".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    index.add_conversation(&NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("doc".into()),
        workspace: None,
        source_path: dir.path().join("3.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    })?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, expected number of hits)
    for (query, expected_hits) in [("foo.bar", 1), ("foo-bar", 0)] {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        assert_eq!(hits.len(), expected_hits);
    }

    Ok(())
}
17138
/// A single plain word is classified as a `Simple` query served by the
/// `EdgeNgram` strategy at `Low` cost, with exactly one parsed term whose
/// first sub-term uses the `exact` pattern.
#[test]
fn explanation_classifies_simple_query() {
    let exp = QueryExplanation::analyze("hello", &SearchFilters::default());
    assert_eq!(exp.query_type, QueryType::Simple);
    assert_eq!(exp.index_strategy, IndexStrategy::EdgeNgram);
    assert_eq!(exp.estimated_cost, QueryCost::Low);
    // `assert_eq!` prints the actual term count on failure, which a boolean
    // `assert!(len == 1)` would hide.
    assert_eq!(exp.parsed.terms.len(), 1);
    assert_eq!(exp.parsed.terms[0].text, "hello");
    assert!(!exp.parsed.terms[0].subterms.is_empty());
    assert_eq!(exp.parsed.terms[0].subterms[0].pattern, "exact");
}
17154
/// A `*term*` query is classified as `Wildcard`, uses the `RegexScan`
/// strategy at `High` cost, and carries a "regex scan" warning.
#[test]
fn explanation_classifies_wildcard_query() {
    let exp = QueryExplanation::analyze("*handler*", &SearchFilters::default());
    assert_eq!(exp.query_type, QueryType::Wildcard);
    assert_eq!(exp.index_strategy, IndexStrategy::RegexScan);
    assert_eq!(exp.estimated_cost, QueryCost::High);

    let first_term = &exp.parsed.terms[0];
    assert!(!first_term.subterms.is_empty());
    assert!(first_term.subterms[0].pattern.contains("substring"));

    let warns_about_scan = exp
        .warnings
        .iter()
        .any(|warning| warning.contains("regex scan"));
    assert!(warns_about_scan);
}
17169
/// An explicit `AND` makes the query `Boolean`, selects the
/// `BooleanCombination` strategy, and records the operator.
#[test]
fn explanation_classifies_boolean_query() {
    let exp = QueryExplanation::analyze("foo AND bar", &SearchFilters::default());
    assert_eq!(exp.query_type, QueryType::Boolean);
    assert_eq!(exp.index_strategy, IndexStrategy::BooleanCombination);

    let and_operator = "AND".to_string();
    assert!(exp.parsed.operators.contains(&and_operator));
}
17177
/// A quoted query is classified as `Phrase` and the unquoted text is listed
/// among the parsed phrases.
#[test]
fn explanation_classifies_phrase_query() {
    let exp = QueryExplanation::analyze("\"exact phrase\"", &SearchFilters::default());
    assert_eq!(exp.query_type, QueryType::Phrase);

    let expected_phrase = "exact phrase".to_string();
    assert!(exp.parsed.phrases.contains(&expected_phrase));
}
17184
/// With one agent filter set, the query is classified as `Filtered`, the
/// summary counts and describes that agent, and a warning names it.
#[test]
fn explanation_handles_filtered_query() {
    let mut filters = SearchFilters::default();
    filters.agents.insert("codex".to_string());

    let exp = QueryExplanation::analyze("test", &filters);
    assert_eq!(exp.query_type, QueryType::Filtered);
    assert_eq!(exp.filters_summary.agent_count, 1);

    // A missing description panics here, exactly as an inline `unwrap` would.
    let description = exp.filters_summary.description.as_ref().unwrap();
    assert!(description.contains("1 agent"));

    let names_agent = exp.warnings.iter().any(|warning| warning.contains("codex"));
    assert!(names_agent);
}
17202
/// An empty query is classified as `Empty`, mapped to a `FullScan` at `High`
/// cost, and flagged with an "Empty query" warning.
#[test]
fn explanation_handles_empty_query() {
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("", &no_filters);

    assert_eq!(exp.query_type, QueryType::Empty);
    assert_eq!(exp.index_strategy, IndexStrategy::FullScan);
    assert_eq!(exp.estimated_cost, QueryCost::High);

    let warned = exp
        .warnings
        .iter()
        .any(|warning| warning.contains("Empty query"));
    assert!(warned);
}
17211
/// A one-character query produces a "Very short term" warning.
#[test]
fn explanation_warns_short_terms() {
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("a", &no_filters);

    let warned = exp
        .warnings
        .iter()
        .any(|warning| warning.contains("Very short term"));
    assert!(warned);
}
17217
/// `with_wildcard_fallback(true)` sets `wildcard_applied` and adds a
/// "Wildcard fallback" warning.
#[test]
fn explanation_with_wildcard_fallback() {
    let base = QueryExplanation::analyze("test", &SearchFilters::default());
    let exp = base.with_wildcard_fallback(true);

    assert!(exp.wildcard_applied);
    let warned = exp
        .warnings
        .iter()
        .any(|warning| warning.contains("Wildcard fallback"));
    assert!(warned);
}
17226
/// A query mixing several boolean operators with a phrase stays `Boolean`
/// and is estimated at `Medium` or `High` cost.
#[test]
fn explanation_complex_query_has_higher_cost() {
    let query = "foo AND bar OR baz NOT qux AND \"phrase here\"";
    let exp = QueryExplanation::analyze(query, &SearchFilters::default());

    assert_eq!(exp.query_type, QueryType::Boolean);
    let above_low = matches!(exp.estimated_cost, QueryCost::Medium | QueryCost::High);
    assert!(above_low);
}
17240
/// The raw query is stored untouched in `original_query`, while the
/// sanitized form keeps the words but drops the `!`.
#[test]
fn explanation_preserves_original_query() {
    let raw = "Hello World!";
    let exp = QueryExplanation::analyze(raw, &SearchFilters::default());

    assert_eq!(exp.original_query, "Hello World!");
    assert!(exp.sanitized_query.contains("Hello"));
    assert!(!exp.sanitized_query.contains('!'));
}
17250
/// `foo NOT bar` records the `NOT` operator and marks the term `bar` as negated.
#[test]
fn explanation_detects_not_operator() {
    let exp = QueryExplanation::analyze("foo NOT bar", &SearchFilters::default());

    let not_operator = "NOT".to_string();
    assert!(exp.parsed.operators.contains(&not_operator));

    let bar_is_negated = exp
        .parsed
        .terms
        .iter()
        .filter(|term| term.negated)
        .any(|term| term.text == "bar");
    assert!(bar_is_negated);
}
17263
/// Two space-separated words are parsed as two terms joined by an implicit AND.
#[test]
fn explanation_implicit_and() {
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("foo bar", &no_filters);

    assert!(exp.parsed.implicit_and);
    assert_eq!(exp.parsed.terms.len(), 2);
}
17270
/// The explanation serializes to JSON with string-valued summary fields and
/// an array of parsed terms.
#[test]
fn explanation_serializes_to_json() {
    let exp = QueryExplanation::analyze("test query", &SearchFilters::default());
    let json = serde_json::to_value(&exp).expect("should serialize");

    let string_fields = [
        "original_query",
        "query_type",
        "index_strategy",
        "estimated_cost",
    ];
    for field in string_fields {
        assert!(json[field].is_string());
    }
    assert!(json["parsed"]["terms"].is_array());
}
17281
/// Agent, workspace and time-range filters combine conjunctively: of the
/// four indexed conversations, only the codex one in `/ws/alpha` at ts=100
/// satisfies all three constraints.
#[test]
fn search_multi_filter_agent_workspace_time() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, workspace, timestamp, message body)
    let fixtures = [
        ("codex", "/ws/alpha", 100, "needle alpha codex"),
        ("claude", "/ws/alpha", 200, "needle alpha claude"),
        ("codex", "/ws/beta", 150, "needle beta codex"),
        ("codex", "/ws/alpha", 300, "needle alpha codex late"),
    ];

    for (n, &(agent, workspace, ts, body)) in fixtures.iter().enumerate() {
        index.add_conversation(&NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("conv-{n}")),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: dir.path().join(format!("{n}.jsonl")),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        })?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Restrict to codex + /ws/alpha + created within [50, 250].
    let mut filters = SearchFilters::default();
    filters.agents.insert("codex".into());
    filters.workspaces.insert("/ws/alpha".into());
    filters.created_from = Some(50);
    filters.created_to = Some(250);

    let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        hits.len(),
        1,
        "Should match only one conv (codex + alpha + ts=100)"
    );

    let survivor = &hits[0];
    assert_eq!(survivor.agent, "codex");
    assert_eq!(survivor.workspace, "/ws/alpha");
    assert!(survivor.content.contains("alpha codex"));
    assert!(!survivor.content.contains("late"));

    Ok(())
}
17347
/// Several agents in the filter act as a union: only hits from the listed
/// agents are returned, the other indexed agents are excluded.
#[test]
fn search_multi_agent_filter() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // One conversation per agent, all containing the word "needle".
    let all_agents = ["codex", "claude", "cline", "gemini"];
    for agent in all_agents {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: format!("needle from {agent}"),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        index.add_conversation(&NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("{agent}-conv")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: dir.path().join(format!("{agent}.jsonl")),
            started_at: Some(100),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        })?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters::default();
    for wanted in ["codex", "claude"] {
        filters.agents.insert(wanted.into());
    }

    let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 2);

    let returned: Vec<&str> = hits.iter().map(|hit| hit.agent.as_str()).collect();
    for wanted in ["codex", "claude"] {
        assert!(returned.contains(&wanted));
    }
    for excluded in ["cline", "gemini"] {
        assert!(!returned.contains(&excluded));
    }

    Ok(())
}
17396
/// The metrics counters start at zero and each `inc_*` call bumps exactly
/// its own counter, as observed through `snapshot_all`.
#[test]
fn cache_metrics_incremented_on_operations() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let metrics = &client.metrics;

    // Fresh metrics: the first four counters are zero (the fifth is ignored).
    let (hits, miss, shortfall, reloads, _) = metrics.snapshot_all();
    assert_eq!(hits, 0);
    assert_eq!(miss, 0);
    assert_eq!(shortfall, 0);
    assert_eq!(reloads, 0);

    // Two hits, then one of each remaining event.
    for _ in 0..2 {
        metrics.inc_cache_hits();
    }
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.inc_reload();

    let (hits, miss, shortfall, reloads, _) = metrics.snapshot_all();
    assert_eq!((hits, miss, shortfall, reloads), (2, 1, 1, 1));
}
17437
/// `shard_name` returns the same value for the same filters, a different
/// value for different filters, and `workspace:<path>` for the
/// single-workspace filter used here.
#[test]
fn cache_shard_name_deterministic() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let unfiltered = SearchFilters::default();
    let mut by_agent = SearchFilters::default();
    by_agent.agents.insert("codex".into());
    let mut by_workspace = SearchFilters::default();
    by_workspace
        .workspaces
        .insert("/tmp/cass-workspace".into());

    // Repeated calls with identical filters agree.
    let unfiltered_shard = client.shard_name(&unfiltered);
    let unfiltered_again = client.shard_name(&unfiltered);
    assert_eq!(
        unfiltered_shard, unfiltered_again,
        "Same filters should produce same shard name"
    );

    // A different filter set maps to a different shard.
    let agent_shard = client.shard_name(&by_agent);
    assert_ne!(
        unfiltered_shard, agent_shard,
        "Different filters should produce different shard names"
    );

    assert_eq!(agent_shard, client.shard_name(&by_agent));
    assert_eq!(
        client.shard_name(&by_workspace),
        "workspace:/tmp/cass-workspace"
    );
}
17486
/// `search_with_fallback` must honour the agent filter: with two
/// conversations that both contain `unique`, filtering on `codex` may only
/// return hits from the codex conversation, and must return at least one.
#[test]
fn wildcard_fallback_respects_filter_constraints() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // Conversation that satisfies the agent filter.
    let conv_match = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("match".into()),
        workspace: Some(std::path::PathBuf::from("/target")),
        source_path: dir.path().join("match.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: "unique specific term here".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };

    // Conversation with the same search term but a different agent.
    let conv_other = NormalizedConversation {
        agent_slug: "claude".into(),
        external_id: None,
        title: Some("other".into()),
        workspace: Some(std::path::PathBuf::from("/other")),
        source_path: dir.path().join("other.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: "unique specific also here".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };

    index.add_conversation(&conv_match)?;
    index.add_conversation(&conv_other)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters::default();
    filters.agents.insert("codex".into());

    // `filters` is not used again, so it is moved instead of cloned.
    let result = client.search_with_fallback("unique", filters, 10, 0, 100, FieldMask::FULL)?;

    // `Iterator::all` is vacuously true for an empty iterator; without this
    // guard the test would pass even if the filter dropped every hit.
    assert!(
        !result.hits.is_empty(),
        "agent-filtered search should still return the codex conversation"
    );
    assert!(result.hits.iter().all(|h| h.agent == "codex"));

    Ok(())
}
17556
/// The short query `auth` must find a document whose words merely start
/// with that prefix.
#[test]
fn wildcard_fallback_short_query_triggers_prefix() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "authentication authorization oauth".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    index.add_conversation(&NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: dir.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    })?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let outcome = client.search_with_fallback(
        "auth",
        SearchFilters::default(),
        10,
        0,
        100,
        FieldMask::FULL,
    )?;
    assert!(
        !outcome.hits.is_empty(),
        "Short prefix should match via prefix search"
    );
    let top_hit = &outcome.hits[0];
    assert!(top_hit.content.contains("auth"));

    Ok(())
}
17604
/// Indexes one realistic three-message conversation (user, assistant with a
/// code snippet, user follow-up) and checks that text from each message can
/// be found.
#[test]
fn search_real_fixture_multiple_messages() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // Snippet attached to the assistant reply.
    let package_snippet = NormalizedSnippet {
        file_path: Some("package.json".into()),
        start_line: Some(1),
        end_line: Some(5),
        language: Some("json".into()),
        snippet_text: Some(r#"{"dependencies":{"jsonwebtoken":"^9.0.0"}}"#.into()),
    };

    let messages = vec![
        NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: Some("developer".into()),
            created_at: Some(1700000000000),
            content: "Help me implement JWT authentication for my Express API".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        },
        NormalizedMessage {
            idx: 1,
            role: "assistant".into(),
            author: Some("claude".into()),
            created_at: Some(1700000010000),
            content: "I'll help you implement JWT authentication. First, let's install the required packages.".into(),
            extra: serde_json::json!({}),
            snippets: vec![package_snippet],
            invocations: Vec::new(),
        },
        NormalizedMessage {
            idx: 2,
            role: "user".into(),
            author: Some("developer".into()),
            created_at: Some(1700000030000),
            content: "Can you also add refresh token support?".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        },
    ];

    let conversation = NormalizedConversation {
        agent_slug: "claude_code".into(),
        external_id: Some("conv-123".into()),
        title: Some("Implementing authentication".into()),
        workspace: Some(std::path::PathBuf::from("/home/user/project")),
        source_path: dir.path().join("session-1.jsonl"),
        started_at: Some(1700000000000),
        ended_at: Some(1700000060000),
        metadata: serde_json::json!({
            "model": "claude-3-sonnet",
            "tokens": 1500
        }),
        messages,
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Words from the first user message.
    let jwt_hits = client.search(
        "JWT authentication",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert!(!jwt_hits.is_empty(), "Should find JWT authentication");
    assert!(jwt_hits.iter().any(|hit| hit.agent == "claude_code"));
    let snippet_mentions_query = jwt_hits
        .iter()
        .any(|hit| hit.snippet.contains("JWT") || hit.snippet.contains("authentication"));
    assert!(snippet_mentions_query);

    // Words that only occur in the assistant reply.
    let package_hits = client.search(
        "required packages",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        !package_hits.is_empty(),
        "Should find 'required packages' in assistant response"
    );

    // Words from the follow-up user message.
    let refresh_hits = client.search(
        "refresh token",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert!(!refresh_hits.is_empty(), "Should find refresh token");
    assert!(refresh_hits.iter().any(|hit| hit.content.contains("refresh")));

    Ok(())
}
17712
17713 #[test]
17714 fn search_deduplication_with_similar_content() -> Result<()> {
17715 let dir = TempDir::new()?;
17716 let mut index = TantivyIndex::open_or_create(dir.path())?;
17717
17718 for i in 0..2 {
17720 let conv = NormalizedConversation {
17721 agent_slug: "codex".into(),
17722 external_id: None,
17723 title: Some(format!("similar-{i}")),
17724 workspace: Some(std::path::PathBuf::from("/ws")),
17725 source_path: dir.path().join(format!("similar-{i}.jsonl")),
17726 started_at: Some(100 + i),
17727 ended_at: None,
17728 metadata: serde_json::json!({}),
17729 messages: vec![NormalizedMessage {
17730 idx: 0,
17731 role: "user".into(),
17732 author: None,
17733 created_at: Some(100 + i),
17734 content: "implement the sorting algorithm".into(),
17736 extra: serde_json::json!({}),
17737 snippets: vec![],
17738 invocations: Vec::new(),
17739 }],
17740 };
17741 index.add_conversation(&conv)?;
17742 }
17743 index.commit()?;
17744
17745 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17746 let result = client.search_with_fallback(
17747 "sorting algorithm",
17748 SearchFilters::default(),
17749 10,
17750 0,
17751 100,
17752 FieldMask::FULL,
17753 )?;
17754
17755 assert!(!result.hits.is_empty());
17758
17759 Ok(())
17760 }
17761
17762 #[test]
17767 fn search_session_paths_filter() -> Result<()> {
17768 let dir = TempDir::new()?;
17770 let mut index = TantivyIndex::open_or_create(dir.path())?;
17771
17772 let paths = [
17774 dir.path().join("session-a.jsonl"),
17775 dir.path().join("session-b.jsonl"),
17776 dir.path().join("session-c.jsonl"),
17777 ];
17778
17779 for (i, path) in paths.iter().enumerate() {
17780 let conv = NormalizedConversation {
17781 agent_slug: "claude".into(),
17782 external_id: None,
17783 title: Some(format!("session-{}", i)),
17784 workspace: Some(std::path::PathBuf::from("/ws")),
17785 source_path: path.clone(),
17786 started_at: Some(100 + i as i64),
17787 ended_at: None,
17788 metadata: serde_json::json!({}),
17789 messages: vec![NormalizedMessage {
17790 idx: 0,
17791 role: "user".into(),
17792 author: None,
17793 created_at: Some(100 + i as i64),
17794 content: format!("needle content for session {}", i),
17795 extra: serde_json::json!({}),
17796 snippets: vec![],
17797 invocations: Vec::new(),
17798 }],
17799 };
17800 index.add_conversation(&conv)?;
17801 }
17802 index.commit()?;
17803
17804 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17805
17806 let hits_all = client.search("needle", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
17808 assert_eq!(hits_all.len(), 3, "Should find all 3 sessions");
17809
17810 let mut filters = SearchFilters::default();
17812 filters
17813 .session_paths
17814 .insert(paths[0].to_string_lossy().to_string());
17815 filters
17816 .session_paths
17817 .insert(paths[2].to_string_lossy().to_string());
17818
17819 let hits_filtered = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
17820 assert_eq!(
17821 hits_filtered.len(),
17822 2,
17823 "Should find only 2 sessions (A and C)"
17824 );
17825
17826 let filtered_paths: HashSet<&str> = hits_filtered
17828 .iter()
17829 .map(|h| h.source_path.as_str())
17830 .collect();
17831 assert!(filtered_paths.contains(paths[0].to_string_lossy().as_ref()));
17832 assert!(filtered_paths.contains(paths[2].to_string_lossy().as_ref()));
17833 assert!(!filtered_paths.contains(paths[1].to_string_lossy().as_ref()));
17834
17835 Ok(())
17836 }
17837
17838 #[test]
17839 fn lexical_session_paths_filter_retries_past_initial_page() -> Result<()> {
17840 let dir = TempDir::new()?;
17841 let mut index = TantivyIndex::open_or_create(dir.path())?;
17842 let requested_path = dir.path().join("requested-session.jsonl");
17843
17844 for i in 0..4 {
17845 let conv = NormalizedConversation {
17846 agent_slug: "claude".into(),
17847 external_id: None,
17848 title: Some(format!("distractor-{i}")),
17849 workspace: Some(std::path::PathBuf::from("/ws")),
17850 source_path: dir.path().join(format!("distractor-{i}.jsonl")),
17851 started_at: Some(100 + i as i64),
17852 ended_at: None,
17853 metadata: serde_json::json!({}),
17854 messages: vec![NormalizedMessage {
17855 idx: 0,
17856 role: "user".into(),
17857 author: None,
17858 created_at: Some(100 + i as i64),
17859 content: "needle needle needle high ranking distractor".into(),
17860 extra: serde_json::json!({}),
17861 snippets: vec![],
17862 invocations: Vec::new(),
17863 }],
17864 };
17865 index.add_conversation(&conv)?;
17866 }
17867
17868 let requested = NormalizedConversation {
17869 agent_slug: "claude".into(),
17870 external_id: None,
17871 title: Some("requested".into()),
17872 workspace: Some(std::path::PathBuf::from("/ws")),
17873 source_path: requested_path.clone(),
17874 started_at: Some(200),
17875 ended_at: None,
17876 metadata: serde_json::json!({}),
17877 messages: vec![NormalizedMessage {
17878 idx: 0,
17879 role: "user".into(),
17880 author: None,
17881 created_at: Some(200),
17882 content: "needle requested session should survive post-filter paging".into(),
17883 extra: serde_json::json!({}),
17884 snippets: vec![],
17885 invocations: Vec::new(),
17886 }],
17887 };
17888 index.add_conversation(&requested)?;
17889 index.commit()?;
17890
17891 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17892 let mut filters = SearchFilters::default();
17893 filters
17894 .session_paths
17895 .insert(requested_path.to_string_lossy().to_string());
17896
17897 let hits = client.search("needle", filters, 1, 0, FieldMask::FULL)?;
17898
17899 assert_eq!(hits.len(), 1);
17900 assert_eq!(hits[0].source_path, requested_path.to_string_lossy());
17901
17902 Ok(())
17903 }
17904
17905 #[test]
17906 fn search_session_paths_empty_filter_returns_all() -> Result<()> {
17907 let dir = TempDir::new()?;
17909 let mut index = TantivyIndex::open_or_create(dir.path())?;
17910
17911 let conv = NormalizedConversation {
17912 agent_slug: "claude".into(),
17913 external_id: None,
17914 title: Some("test".into()),
17915 workspace: Some(std::path::PathBuf::from("/ws")),
17916 source_path: dir.path().join("test.jsonl"),
17917 started_at: Some(100),
17918 ended_at: None,
17919 metadata: serde_json::json!({}),
17920 messages: vec![NormalizedMessage {
17921 idx: 0,
17922 role: "user".into(),
17923 author: None,
17924 created_at: Some(100),
17925 content: "needle content".into(),
17926 extra: serde_json::json!({}),
17927 snippets: vec![],
17928 invocations: Vec::new(),
17929 }],
17930 };
17931 index.add_conversation(&conv)?;
17932 index.commit()?;
17933
17934 let client = SearchClient::open(dir.path(), None)?.expect("index present");
17935
17936 let filters = SearchFilters::default();
17938 assert!(filters.session_paths.is_empty());
17939
17940 let hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
17941 assert_eq!(hits.len(), 1);
17942
17943 Ok(())
17944 }
17945
17946 #[test]
17947 fn search_client_reads_federated_lexical_bundle_as_one_corpus() -> Result<()> {
17948 let root = TempDir::new()?;
17949 let shard_a = root.path().join("shard-a");
17950 let shard_b = root.path().join("shard-b");
17951 let published = root.path().join("published");
17952
17953 let mut shard_a_index = TantivyIndex::open_or_create(&shard_a)?;
17954 let mut shard_b_index = TantivyIndex::open_or_create(&shard_b)?;
17955
17956 let make_conv =
17957 |external_id: &str, title: &str, source_path: &str, tag: &str| NormalizedConversation {
17958 agent_slug: "codex".into(),
17959 external_id: Some(external_id.into()),
17960 title: Some(title.into()),
17961 workspace: Some(std::path::PathBuf::from("/ws")),
17962 source_path: std::path::PathBuf::from(source_path),
17963 started_at: Some(1_700_000_100_000),
17964 ended_at: Some(1_700_000_100_100),
17965 metadata: json!({}),
17966 messages: vec![
17967 NormalizedMessage {
17968 idx: 0,
17969 role: "user".into(),
17970 author: None,
17971 created_at: Some(1_700_000_100_010),
17972 content: format!("shared federated needle {tag} user"),
17973 extra: json!({}),
17974 snippets: vec![],
17975 invocations: Vec::new(),
17976 },
17977 NormalizedMessage {
17978 idx: 1,
17979 role: "assistant".into(),
17980 author: None,
17981 created_at: Some(1_700_000_100_020),
17982 content: format!("shared federated needle {tag} assistant"),
17983 extra: json!({}),
17984 snippets: vec![],
17985 invocations: Vec::new(),
17986 },
17987 ],
17988 };
17989
17990 let conv_a = make_conv(
17991 "fed-query-a",
17992 "Fed Query A",
17993 "/tmp/fed-query-a.jsonl",
17994 "alpha",
17995 );
17996 let conv_b = make_conv(
17997 "fed-query-b",
17998 "Fed Query B",
17999 "/tmp/fed-query-b.jsonl",
18000 "beta",
18001 );
18002
18003 shard_a_index.add_conversation(&conv_a)?;
18004 shard_b_index.add_conversation(&conv_b)?;
18005 shard_a_index.commit()?;
18006 shard_b_index.commit()?;
18007 drop(shard_a_index);
18008 drop(shard_b_index);
18009
18010 crate::search::tantivy::publish_federated_searchable_index_directories(
18011 &published,
18012 &[&shard_a, &shard_b],
18013 )?;
18014
18015 let client = SearchClient::open(&published, None)?.expect("federated index present");
18016 assert!(client.has_tantivy());
18017 assert_eq!(client.total_docs(), 4);
18018
18019 let hits = client.search(
18020 "shared federated needle",
18021 SearchFilters::default(),
18022 10,
18023 0,
18024 FieldMask::FULL,
18025 )?;
18026 assert_eq!(hits.len(), 4);
18027 let observed_order = hits
18028 .iter()
18029 .map(|hit| {
18030 (
18031 hit.source_path.clone(),
18032 hit.line_number,
18033 hit.content.clone(),
18034 hit.score.to_bits(),
18035 )
18036 })
18037 .collect::<Vec<_>>();
18038 let hit_paths = hits
18039 .iter()
18040 .map(|hit| hit.source_path.as_str())
18041 .collect::<std::collections::HashSet<_>>();
18042 assert!(hit_paths.contains("/tmp/fed-query-a.jsonl"));
18043 assert!(hit_paths.contains("/tmp/fed-query-b.jsonl"));
18044
18045 for attempt in 0..3 {
18046 let repeated = client.search(
18047 "shared federated needle",
18048 SearchFilters::default(),
18049 10,
18050 0,
18051 FieldMask::FULL,
18052 )?;
18053 let repeated_order = repeated
18054 .iter()
18055 .map(|hit| {
18056 (
18057 hit.source_path.clone(),
18058 hit.line_number,
18059 hit.content.clone(),
18060 hit.score.to_bits(),
18061 )
18062 })
18063 .collect::<Vec<_>>();
18064 assert_eq!(
18065 repeated_order, observed_order,
18066 "federated lexical query order drifted on repeated attempt {attempt}"
18067 );
18068 }
18069
18070 Ok(())
18071 }
18072
18073 #[test]
18074 fn semantic_search_session_paths_filter_retries_past_initial_candidates() -> Result<()> {
18075 let fixture = build_semantic_test_fixture()?;
18076 let mut filters = SearchFilters::default();
18077 filters
18078 .session_paths
18079 .insert(fixture.source_paths[2].clone());
18080
18081 let (hits, ann_stats) = fixture.client.search_semantic(
18082 "semantic fixture query",
18083 filters,
18084 1,
18085 0,
18086 FieldMask::FULL,
18087 false,
18088 )?;
18089
18090 assert!(
18091 ann_stats.is_none(),
18092 "exact search should not emit ANN stats"
18093 );
18094 assert_eq!(
18095 hits.len(),
18096 1,
18097 "filtered semantic search should still return a hit"
18098 );
18099 assert_eq!(
18100 hits[0].source_path, fixture.source_paths[2],
18101 "semantic search should keep searching until it finds the requested session path"
18102 );
18103
18104 Ok(())
18105 }
18106
18107 #[test]
18108 fn semantic_search_offsets_after_session_paths_filtering() -> Result<()> {
18109 let fixture = build_semantic_test_fixture()?;
18110 let mut filters = SearchFilters::default();
18111 filters
18112 .session_paths
18113 .insert(fixture.source_paths[1].clone());
18114 filters
18115 .session_paths
18116 .insert(fixture.source_paths[2].clone());
18117
18118 let (hits, _) = fixture.client.search_semantic(
18119 "semantic fixture query",
18120 filters,
18121 1,
18122 1,
18123 FieldMask::FULL,
18124 false,
18125 )?;
18126
18127 assert_eq!(
18128 hits.len(),
18129 1,
18130 "second filtered page should still return one hit"
18131 );
18132 assert_eq!(
18133 hits[0].source_path, fixture.source_paths[2],
18134 "offset must apply after semantic deduplication and session path filtering"
18135 );
18136
18137 Ok(())
18138 }
18139
18140 #[test]
18141 fn semantic_search_merges_sharded_vector_indexes() -> Result<()> {
18142 let fixture = build_sharded_semantic_test_fixture()?;
18143 let (hits, ann_stats) = fixture.client.search_semantic(
18144 "semantic fixture query",
18145 SearchFilters::default(),
18146 3,
18147 0,
18148 FieldMask::FULL,
18149 false,
18150 )?;
18151
18152 assert!(
18153 ann_stats.is_none(),
18154 "sharded exact search should not emit ANN stats"
18155 );
18156 assert_eq!(hits.len(), 3);
18157 assert_eq!(hits[0].source_path, fixture.source_paths[0]);
18158 assert_eq!(hits[1].source_path, fixture.source_paths[1]);
18159 assert_eq!(hits[2].source_path, fixture.source_paths[2]);
18160
18161 Ok(())
18162 }
18163
18164 #[test]
18165 fn progressive_phase_overfetches_before_session_paths_filtering() -> Result<()> {
18166 let fixture = build_semantic_test_fixture()?;
18167 let mut filters = SearchFilters::default();
18168 filters
18169 .session_paths
18170 .insert(fixture.source_paths[2].clone());
18171
18172 let results = vec![
18173 FsScoredResult {
18174 doc_id: fixture.doc_ids[0].clone(),
18175 score: 1.0,
18176 source: FsScoreSource::SemanticFast,
18177 index: None,
18178 fast_score: Some(1.0),
18179 quality_score: None,
18180 lexical_score: None,
18181 rerank_score: None,
18182 explanation: None,
18183 metadata: None,
18184 },
18185 FsScoredResult {
18186 doc_id: fixture.doc_ids[1].clone(),
18187 score: 0.9,
18188 source: FsScoreSource::SemanticFast,
18189 index: None,
18190 fast_score: Some(0.9),
18191 quality_score: None,
18192 lexical_score: None,
18193 rerank_score: None,
18194 explanation: None,
18195 metadata: None,
18196 },
18197 FsScoredResult {
18198 doc_id: fixture.doc_ids[2].clone(),
18199 score: 0.8,
18200 source: FsScoreSource::SemanticFast,
18201 index: None,
18202 fast_score: Some(0.8),
18203 quality_score: None,
18204 lexical_score: None,
18205 rerank_score: None,
18206 explanation: None,
18207 metadata: None,
18208 },
18209 ];
18210
18211 let result = fixture.client.progressive_phase_to_result(
18212 &results,
18213 ProgressivePhaseContext {
18214 query: "session path filter",
18215 filters: &filters,
18216 field_mask: FieldMask::FULL,
18217 lexical_cache: None,
18218 limit: 1,
18219 fetch_limit: 3,
18220 },
18221 )?;
18222
18223 assert_eq!(
18224 result.hits.len(),
18225 1,
18226 "progressive phase should retain enough overfetched hits to satisfy post-search session path filtering"
18227 );
18228 assert_eq!(
18229 result.hits[0].source_path, fixture.source_paths[2],
18230 "progressive phase should page after session path filtering"
18231 );
18232
18233 Ok(())
18234 }
18235
18236 #[test]
18241 fn sql_placeholders_empty() {
18242 assert_eq!(sql_placeholders(0), "");
18243 }
18244
18245 #[test]
18246 fn sql_placeholders_single() {
18247 assert_eq!(sql_placeholders(1), "?");
18248 }
18249
18250 #[test]
18251 fn sql_placeholders_multiple() {
18252 assert_eq!(sql_placeholders(3), "?,?,?");
18253 assert_eq!(sql_placeholders(5), "?,?,?,?,?");
18254 }
18255
18256 #[test]
18257 fn sql_placeholders_capacity_efficient() {
18258 let result = sql_placeholders(3);
18260 assert_eq!(result.len(), 5);
18261 assert!(result.capacity() >= 5); let result = sql_placeholders(10);
18265 assert_eq!(result.len(), 19);
18266 assert!(result.capacity() >= 19);
18267 }
18268
18269 #[test]
18270 fn sql_placeholders_large_count() {
18271 let result = sql_placeholders(100);
18273 assert_eq!(result.len(), 199); assert_eq!(result.chars().filter(|c| *c == '?').count(), 100);
18275 assert_eq!(result.chars().filter(|c| *c == ',').count(), 99);
18276 }
18277
18278 #[test]
18279 fn hybrid_budget_identifier_biases_lexical() {
18280 let budget = hybrid_candidate_budget("src/main.rs", 20, 20, 5, 10_000);
18281 assert!(
18282 budget.lexical_candidates > budget.semantic_candidates,
18283 "identifier queries should allocate more lexical than semantic fanout"
18284 );
18285 assert!(budget.lexical_candidates >= 25);
18286 }
18287
18288 #[test]
18289 fn hybrid_budget_natural_language_biases_semantic() {
18290 let budget = hybrid_candidate_budget(
18291 "how do we fix authentication middleware latency",
18292 20,
18293 20,
18294 5,
18295 10_000,
18296 );
18297 assert!(
18298 budget.semantic_candidates > budget.lexical_candidates,
18299 "natural language queries should allocate more semantic than lexical fanout"
18300 );
18301 }
18302
18303 #[test]
18304 fn hybrid_budget_no_limit_caps_both_lexical_and_semantic() {
18305 let total_docs = 2_000_000;
18313 let budget =
18314 hybrid_candidate_budget("authentication middleware", 0, total_docs, 0, total_docs);
18315 let cap = no_limit_result_cap();
18316 assert!(
18317 budget.lexical_candidates <= cap,
18318 "lexical fanout must respect no_limit_result_cap() = {cap}; got {}",
18319 budget.lexical_candidates
18320 );
18321 assert!(
18322 budget.lexical_candidates <= NO_LIMIT_RESULT_MAX,
18323 "lexical fanout must respect the absolute NO_LIMIT_RESULT_MAX; got {}",
18324 budget.lexical_candidates
18325 );
18326 assert!(budget.semantic_candidates <= HYBRID_NO_LIMIT_SEMANTIC_CAP);
18327 assert!(
18334 budget.semantic_candidates <= budget.lexical_candidates,
18335 "semantic ({}) must not exceed lexical ({}) fanout",
18336 budget.semantic_candidates,
18337 budget.lexical_candidates
18338 );
18339 }
18340
18341 #[test]
18342 fn compute_no_limit_result_cap_clamps_explicit_over_ceiling_env_override() {
18343 let cap = compute_no_limit_result_cap_from(Some("999999999999".to_string()), None, None);
18349 assert!(
18350 cap <= NO_LIMIT_RESULT_MAX,
18351 "explicit override must still clamp to ceiling; got {cap} > {NO_LIMIT_RESULT_MAX}"
18352 );
18353 assert!(cap >= NO_LIMIT_RESULT_MIN);
18354 }
18355
18356 #[test]
18357 fn compute_no_limit_result_cap_clamps_tiny_explicit_override_up_to_floor() {
18358 let cap = compute_no_limit_result_cap_from(Some("1".to_string()), None, None);
18360 assert_eq!(cap, NO_LIMIT_RESULT_MIN);
18361 }
18362
18363 #[test]
18364 fn compute_no_limit_result_cap_uses_meminfo_when_no_env_override() {
18365 let cap = compute_no_limit_result_cap_from(None, None, Some(128u64 * 1024 * 1024 * 1024));
18369 assert!(cap >= NO_LIMIT_RESULT_MIN, "cap {cap} below floor");
18370 assert!(cap <= NO_LIMIT_RESULT_MAX, "cap {cap} above ceiling");
18371 assert!(cap > NO_LIMIT_RESULT_MIN * 10);
18373 }
18374
18375 #[test]
18376 fn compute_no_limit_result_cap_falls_back_to_floor_when_meminfo_unavailable() {
18377 let cap = compute_no_limit_result_cap_from(None, None, None);
18381 assert!(cap >= NO_LIMIT_RESULT_MIN);
18382 assert!(cap <= NO_LIMIT_RESULT_MAX);
18383 }
18384
18385 #[test]
18386 fn compute_no_limit_result_cap_bytes_env_takes_priority_over_meminfo() {
18387 let four_gib = (4u64 * 1024 * 1024 * 1024).to_string();
18392 let cap = compute_no_limit_result_cap_from(
18393 None,
18394 Some(four_gib),
18395 Some(1024u64 * 1024 * 1024 * 1024), );
18397 let expected_hits = ((4u64 * 1024 * 1024 * 1024) / AVG_HIT_BYTES) as usize;
18398 let expected = expected_hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
18399 assert_eq!(cap, expected, "bytes env must win over meminfo");
18400 }
18401
18402 #[test]
18403 fn no_limit_budget_bytes_preserves_fallback_priority() {
18404 let huge_meminfo = Some(1024u64 * 1024 * 1024 * 1024);
18405 let four_gib = 4u64 * 1024 * 1024 * 1024;
18406
18407 assert_eq!(
18408 no_limit_budget_bytes(Some(four_gib.to_string()), huge_meminfo),
18409 four_gib
18410 );
18411 assert_eq!(
18412 no_limit_budget_bytes(Some("0".to_string()), huge_meminfo),
18413 NO_LIMIT_BYTES_CEILING
18414 );
18415 assert_eq!(no_limit_budget_bytes(None, None), NO_LIMIT_BYTES_FLOOR);
18416 }
18417
18418 #[test]
18419 fn compute_no_limit_result_cap_ignores_malformed_env() {
18420 for bad in ["", "abc", "0", "-1"] {
18422 let cap = compute_no_limit_result_cap_from(
18423 Some(bad.to_string()),
18424 Some(bad.to_string()),
18425 None,
18426 );
18427 assert!(cap >= NO_LIMIT_RESULT_MIN, "bad={bad:?} cap={cap}");
18428 assert!(cap <= NO_LIMIT_RESULT_MAX, "bad={bad:?} cap={cap}");
18429 }
18430 }
18431
18432 fn make_test_hit(id: &str, score: f32) -> SearchHit {
18437 SearchHit {
18438 title: id.to_string(),
18439 snippet: String::new(),
18440 content: id.to_string(),
18441 content_hash: stable_content_hash(id),
18442 score,
18443 source_path: format!("/path/{}.jsonl", id),
18444 agent: "test".to_string(),
18445 workspace: "/workspace".to_string(),
18446 workspace_original: None,
18447 created_at: Some(1_700_000_000_000),
18448 line_number: Some(1),
18449 match_type: MatchType::Exact,
18450 source_id: "local".to_string(),
18451 origin_kind: "local".to_string(),
18452 origin_host: None,
18453 conversation_id: None,
18454 }
18455 }
18456
18457 #[test]
18458 fn test_rrf_fusion_ordering() {
18459 let lexical = vec![
18462 make_test_hit("A", 10.0),
18463 make_test_hit("B", 8.0),
18464 make_test_hit("C", 6.0),
18465 ];
18466 let semantic = vec![
18467 make_test_hit("A", 0.9),
18468 make_test_hit("B", 0.7),
18469 make_test_hit("D", 0.5),
18470 ];
18471
18472 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18473
18474 assert_eq!(fused.len(), 4);
18476 assert_eq!(fused[0].title, "A"); assert_eq!(fused[1].title, "B"); }
18480
18481 #[test]
18482 fn test_rrf_handles_disjoint_sets() {
18483 let lexical = vec![make_test_hit("A", 10.0), make_test_hit("B", 8.0)];
18485 let semantic = vec![make_test_hit("C", 0.9), make_test_hit("D", 0.7)];
18486
18487 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18488
18489 assert_eq!(fused.len(), 4);
18491 let titles: Vec<&str> = fused.iter().map(|h| h.title.as_str()).collect();
18492 assert!(titles.contains(&"A"));
18493 assert!(titles.contains(&"B"));
18494 assert!(titles.contains(&"C"));
18495 assert!(titles.contains(&"D"));
18496 }
18497
18498 #[test]
18499 fn test_rrf_tie_breaking_deterministic() {
18500 let lexical = vec![
18502 make_test_hit("X", 5.0),
18503 make_test_hit("Y", 5.0),
18504 make_test_hit("Z", 5.0),
18505 ];
18506 let semantic = vec![]; let fused1 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18510 let fused2 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18511 let fused3 = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18512
18513 assert_eq!(fused1.len(), fused2.len());
18515 assert_eq!(fused2.len(), fused3.len());
18516
18517 for i in 0..fused1.len() {
18518 assert_eq!(fused1[i].title, fused2[i].title, "Mismatch at index {}", i);
18519 assert_eq!(fused2[i].title, fused3[i].title, "Mismatch at index {}", i);
18520 }
18521 }
18522
18523 #[test]
18524 fn test_rrf_both_lists_bonus() {
18525 let lexical = vec![
18528 make_test_hit("solo_lex", 10.0), make_test_hit("both", 5.0), ];
18531 let semantic = vec![
18532 make_test_hit("solo_sem", 0.9), make_test_hit("both", 0.5), ];
18535
18536 let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
18537
18538 assert_eq!(
18542 fused[0].title, "both",
18543 "Doc in both lists should rank first"
18544 );
18545 }
18546
18547 #[test]
18548 fn test_rrf_respects_limit_and_offset() {
18549 let lexical = vec![
18550 make_test_hit("A", 10.0),
18551 make_test_hit("B", 8.0),
18552 make_test_hit("C", 6.0),
18553 ];
18554 let semantic = vec![];
18555
18556 let fused = rrf_fuse_hits(&lexical, &semantic, "", 2, 0);
18558 assert_eq!(fused.len(), 2);
18559
18560 let fused_offset = rrf_fuse_hits(&lexical, &semantic, "", 10, 1);
18562 assert_eq!(fused_offset.len(), 2); let fused_empty = rrf_fuse_hits(&lexical, &semantic, "", 0, 0);
18566 assert!(fused_empty.is_empty());
18567 }
18568
18569 #[test]
18570 fn test_rrf_empty_inputs() {
18571 let empty: Vec<SearchHit> = vec![];
18572 let non_empty = vec![make_test_hit("A", 10.0)];
18573
18574 assert!(rrf_fuse_hits(&empty, &empty, "", 10, 0).is_empty());
18576
18577 let fused = rrf_fuse_hits(&empty, &non_empty, "", 10, 0);
18579 assert_eq!(fused.len(), 1);
18580 assert_eq!(fused[0].title, "A");
18581
18582 let fused = rrf_fuse_hits(&non_empty, &empty, "", 10, 0);
18584 assert_eq!(fused.len(), 1);
18585 assert_eq!(fused[0].title, "A");
18586 }
18587
18588 #[test]
18589 fn test_rrf_coalesces_empty_title_hits_across_search_modes() {
18590 let mut lexical = make_test_hit("shared", 10.0);
18591 lexical.title.clear();
18592 lexical.source_path = "/shared/untitled.jsonl".into();
18593 lexical.content = "same untitled body".into();
18594 lexical.content_hash = stable_content_hash("same untitled body");
18595
18596 let mut semantic = lexical.clone();
18597 semantic.score = 0.9;
18598
18599 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18600 assert_eq!(fused.len(), 1);
18601 assert_eq!(fused[0].title, "");
18602 }
18603
18604 #[test]
18605 fn test_rrf_coalesces_blank_local_source_id_hits_across_search_modes() {
18606 let mut lexical = make_test_hit("shared-local", 10.0);
18607 lexical.source_path = "/shared/local.jsonl".into();
18608 lexical.content = "same local body".into();
18609 lexical.content_hash = stable_content_hash("same local body");
18610 lexical.source_id = "local".into();
18611 lexical.origin_kind = "local".into();
18612
18613 let mut semantic = lexical.clone();
18614 semantic.source_id = " ".into();
18615 semantic.origin_kind = "local".into();
18616 semantic.score = 0.9;
18617
18618 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18619 assert_eq!(fused.len(), 1);
18620 assert_eq!(fused[0].source_id, "local");
18621 }
18622
18623 #[test]
18624 fn test_rrf_keeps_repeated_same_content_at_different_lines() {
18625 let mut first = make_test_hit("same", 10.0);
18626 first.title = "Shared Session".into();
18627 first.source_path = "/shared/session.jsonl".into();
18628 first.content = "repeat me".into();
18629 first.content_hash = stable_content_hash("repeat me");
18630 first.line_number = Some(1);
18631 first.created_at = Some(100);
18632
18633 let mut second = first.clone();
18634 second.line_number = Some(2);
18635 second.created_at = Some(200);
18636 second.score = 0.9;
18637
18638 let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
18639 assert_eq!(fused.len(), 2);
18640 assert_eq!(fused[0].line_number, Some(1));
18641 assert_eq!(fused[1].line_number, Some(2));
18642 }
18643
18644 #[test]
18645 fn test_rrf_coalesces_present_and_missing_conversation_id_for_same_message() {
18646 let mut lexical = make_test_hit("same", 10.0);
18647 lexical.title = "Shared Session".into();
18648 lexical.source_path = "/shared/session.jsonl".into();
18649 lexical.content = "identical body".into();
18650 lexical.content_hash = stable_content_hash("identical body");
18651 lexical.created_at = Some(100);
18652 lexical.line_number = Some(1);
18653 lexical.conversation_id = None;
18654
18655 let mut semantic = lexical.clone();
18656 semantic.conversation_id = Some(42);
18657 semantic.score = 0.9;
18658
18659 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18660 assert_eq!(fused.len(), 1);
18661 assert_eq!(fused[0].conversation_id, Some(42));
18662 }
18663
18664 #[test]
18665 fn test_rrf_coalesces_present_and_missing_conversation_id_despite_blank_local_source_id() {
18666 let mut lexical = make_test_hit("same", 10.0);
18667 lexical.title = "Shared Session".into();
18668 lexical.source_path = "/shared/session.jsonl".into();
18669 lexical.content = "identical body".into();
18670 lexical.content_hash = stable_content_hash("identical body");
18671 lexical.created_at = Some(100);
18672 lexical.line_number = Some(1);
18673 lexical.conversation_id = None;
18674 lexical.source_id = "local".into();
18675 lexical.origin_kind = "local".into();
18676
18677 let mut semantic = lexical.clone();
18678 semantic.conversation_id = Some(42);
18679 semantic.source_id = " ".into();
18680 semantic.origin_kind = "local".into();
18681 semantic.score = 0.9;
18682
18683 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18684 assert_eq!(fused.len(), 1);
18685 assert_eq!(fused[0].conversation_id, Some(42));
18686 }
18687
18688 #[test]
18689 fn test_rrf_keeps_distinct_conversation_ids_for_shared_path_and_content() {
18690 let mut first = make_test_hit("same", 10.0);
18691 first.title = "Shared Session".into();
18692 first.source_path = "/shared/session.jsonl".into();
18693 first.content = "identical body".into();
18694 first.content_hash = stable_content_hash("identical body");
18695 first.conversation_id = Some(1);
18696
18697 let mut second = first.clone();
18698 second.conversation_id = Some(2);
18699 second.score = 0.9;
18700
18701 let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
18702 assert_eq!(fused.len(), 2);
18703 assert!(fused.iter().any(|hit| hit.conversation_id == Some(1)));
18704 assert!(fused.iter().any(|hit| hit.conversation_id == Some(2)));
18705 }
18706
18707 #[test]
18708 fn test_rrf_coalesces_same_conversation_id_despite_title_drift() {
18709 let mut lexical = make_test_hit("same", 10.0);
18710 lexical.title = "Morning Session".into();
18711 lexical.source_path = "/shared/session.jsonl".into();
18712 lexical.content = "identical body".into();
18713 lexical.content_hash = stable_content_hash("identical body");
18714 lexical.conversation_id = Some(9);
18715
18716 let mut semantic = lexical.clone();
18717 semantic.title = "Evening Session".into();
18718 semantic.score = 0.9;
18719
18720 let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
18721 assert_eq!(fused.len(), 1);
18722 assert_eq!(fused[0].conversation_id, Some(9));
18723 }
18724
18725 #[test]
18726 fn test_rrf_keeps_distinct_titles_for_shared_path_and_content() {
18727 let mut morning = make_test_hit("same", 10.0);
18728 morning.title = "Morning Session".into();
18729 morning.source_path = "/shared/session.jsonl".into();
18730 morning.content = "identical body".into();
18731 morning.content_hash = stable_content_hash("identical body");
18732 morning.created_at = None;
18733
18734 let mut evening = morning.clone();
18735 evening.title = "Evening Session".into();
18736 evening.score = 0.9;
18737
18738 let fused = rrf_fuse_hits(&[morning], &[evening], "", 10, 0);
18739 assert_eq!(fused.len(), 2);
18740 assert!(fused.iter().any(|hit| hit.title == "Morning Session"));
18741 assert!(fused.iter().any(|hit| hit.title == "Evening Session"));
18742 }
18743
18744 #[test]
18745 fn test_rrf_candidate_depth() {
18746 let lexical: Vec<_> = (0..50)
18748 .map(|i| make_test_hit(&format!("L{}", i), 100.0 - i as f32))
18749 .collect();
18750 let semantic: Vec<_> = (0..50)
18751 .map(|i| make_test_hit(&format!("S{}", i), 1.0 - 0.01 * i as f32))
18752 .collect();
18753
18754 let fused = rrf_fuse_hits(&lexical, &semantic, "", 20, 0);
18755
18756 assert_eq!(fused.len(), 20);
18758
18759 let mut seen = std::collections::HashSet::new();
18761 for hit in &fused {
18762 assert!(seen.insert(&hit.title), "Duplicate hit: {}", hit.title);
18763 }
18764 }
18765
18766 #[test]
18771 fn query_token_list_parses_small_queries() {
18772 let cases = [
18773 ("hello", 1),
18774 ("hello world", 2),
18775 ("hello AND world", 3),
18776 ("hello world foo bar", 4),
18777 ];
18778
18779 for (query, expected_len) in cases {
18780 let tokens = parse_boolean_query(query);
18781 assert_eq!(tokens.len(), expected_len, "{query}");
18782 }
18783 }
18784
18785 #[test]
18786 fn query_token_list_parses_large_queries() {
18787 let tokens = parse_boolean_query("a b c d e f g h i");
18788 assert_eq!(tokens.len(), 9);
18789 }
18790
18791 #[test]
18792 fn query_token_list_handles_quoted_phrases() {
18793 let tokens = parse_boolean_query("\"hello world\" test");
18794 assert_eq!(tokens.len(), 2);
18795
18796 assert!(
18798 matches!(&tokens[0], QueryToken::Phrase(phrase) if phrase == "hello world"),
18799 "Expected Phrase token"
18800 );
18801 }
18802
18803 #[test]
18804 fn query_token_list_handles_operators() {
18805 let tokens = parse_boolean_query("foo AND bar OR baz");
18806 assert_eq!(tokens.len(), 5);
18807 assert_eq!(tokens[1], QueryToken::And);
18808 assert_eq!(tokens[3], QueryToken::Or);
18809 }
18810
18811 #[test]
18812 fn query_token_list_empty_query() {
18813 let tokens = parse_boolean_query("");
18814 assert!(tokens.is_empty());
18815 }
18816
18817 #[test]
18818 fn query_token_list_iteration_works() {
18819 let tokens = parse_boolean_query("a b c");
18820 let terms: Vec<_> = tokens
18821 .iter()
18822 .filter_map(|t| match t {
18823 QueryToken::Term(s) => Some(s.as_str()),
18824 _ => None,
18825 })
18826 .collect();
18827 assert_eq!(terms, vec!["a", "b", "c"]);
18828 }
18829
18830 #[test]
18840 fn unicode_emoji_treated_as_separator() {
18841 let sanitized = sanitize_query("🚀 launch");
18843 assert_eq!(sanitized, " launch", "Emoji should become space");
18844 }
18845
18846 #[test]
18847 fn unicode_emoji_splits_terms() {
18848 let sanitized = sanitize_query("hot🔥code");
18850 assert_eq!(sanitized, "hot code", "Emoji between words splits them");
18851 }
18852
18853 #[test]
18854 fn unicode_multiple_emoji_become_spaces() {
18855 let sanitized = sanitize_query("🚀🔥💻");
18856 assert_eq!(
18857 sanitized.trim(),
18858 "",
18859 "All-emoji query sanitizes to whitespace"
18860 );
18861 }
18862
18863 #[test]
18864 fn unicode_emoji_query_parses_without_panic() {
18865 let tokens = parse_boolean_query("🚀 launch code 🔥");
18866 let terms: Vec<_> = tokens
18867 .iter()
18868 .filter_map(|t| match t {
18869 QueryToken::Term(s) => Some(s.clone()),
18870 _ => None,
18871 })
18872 .collect();
18873 assert!(
18875 terms
18876 .iter()
18877 .any(|t| t.contains("launch") || t.contains("code"))
18878 );
18879 }
18880
18881 #[test]
18882 fn unicode_emoji_query_terms_lower() {
18883 let terms = QueryTermsLower::from_query("🚀 LAUNCH");
18884 let tokens: Vec<&str> = terms.tokens().collect();
18886 assert!(
18887 tokens.contains(&"launch"),
18888 "Should extract 'launch' from emoji query"
18889 );
18890 }
18891
18892 #[test]
18895 fn unicode_cjk_chinese_preserved() {
18896 assert_eq!(sanitize_query("测试代码"), "测试代码");
18897 assert_eq!(sanitize_query("测试 代码"), "测试 代码");
18898 }
18899
18900 #[test]
18901 fn unicode_cjk_japanese_preserved() {
18902 assert_eq!(sanitize_query("テスト"), "テスト");
18903 assert_eq!(sanitize_query("こんにちは世界"), "こんにちは世界");
18905 }
18906
18907 #[test]
18908 fn unicode_cjk_korean_preserved() {
18909 assert_eq!(sanitize_query("테스트"), "테스트");
18910 assert_eq!(sanitize_query("안녕하세요"), "안녕하세요");
18911 }
18912
18913 #[test]
18914 fn unicode_cjk_parsed_as_terms() {
18915 let tokens = parse_boolean_query("测试 代码 search");
18916 let terms: Vec<_> = tokens
18917 .iter()
18918 .filter_map(|t| match t {
18919 QueryToken::Term(s) => Some(s.as_str()),
18920 _ => None,
18921 })
18922 .collect();
18923 assert_eq!(terms, vec!["测试", "代码", "search"]);
18924 }
18925
18926 #[test]
18927 fn unicode_cjk_query_terms_lower() {
18928 let terms = QueryTermsLower::from_query("测试 代码");
18929 let tokens: Vec<&str> = terms.tokens().collect();
18930 assert_eq!(tokens, vec!["测试", "代码"]);
18931 }
18932
18933 #[test]
18936 fn unicode_hebrew_preserved() {
18937 assert_eq!(sanitize_query("שלום עולם"), "שלום עולם");
18938 }
18939
18940 #[test]
18941 fn unicode_arabic_preserved() {
18942 assert_eq!(sanitize_query("مرحبا"), "مرحبا");
18943 }
18944
18945 #[test]
18946 fn unicode_hebrew_parsed_as_terms() {
18947 let tokens = parse_boolean_query("שלום עולם");
18948 let terms: Vec<_> = tokens
18949 .iter()
18950 .filter_map(|t| match t {
18951 QueryToken::Term(s) => Some(s.as_str()),
18952 _ => None,
18953 })
18954 .collect();
18955 assert_eq!(terms, vec!["שלום", "עולם"]);
18956 }
18957
18958 #[test]
18959 fn unicode_arabic_query_terms_lower() {
18960 let terms = QueryTermsLower::from_query("مرحبا بالعالم");
18962 let tokens: Vec<&str> = terms.tokens().collect();
18963 assert_eq!(tokens, vec!["مرحبا", "بالعالم"]);
18964 }
18965
18966 #[test]
18969 fn unicode_mixed_scripts_preserved() {
18970 let sanitized = sanitize_query("Hello 世界 мир");
18971 assert_eq!(sanitized, "Hello 世界 мир");
18972 }
18973
18974 #[test]
18975 fn unicode_mixed_scripts_parsed() {
18976 let tokens = parse_boolean_query("Hello 世界 мир");
18977 let terms: Vec<_> = tokens
18978 .iter()
18979 .filter_map(|t| match t {
18980 QueryToken::Term(s) => Some(s.as_str()),
18981 _ => None,
18982 })
18983 .collect();
18984 assert_eq!(terms, vec!["Hello", "世界", "мир"]);
18985 }
18986
18987 #[test]
18988 fn unicode_mixed_scripts_with_emoji() {
18989 let sanitized = sanitize_query("Hello 🌍 世界");
18991 assert_eq!(sanitized, "Hello 世界");
18992 }
18993
18994 #[test]
18995 fn unicode_latin_cyrillic_arabic_query() {
18996 let terms = QueryTermsLower::from_query("Hello Мир مرحبا");
18997 let tokens: Vec<&str> = terms.tokens().collect();
18998 assert_eq!(tokens, vec!["hello", "мир", "مرحبا"]);
18999 }
19000
19001 #[test]
19004 fn unicode_zero_width_joiner_removed() {
19005 let sanitized = sanitize_query("test\u{200D}query");
19007 assert_eq!(sanitized, "test query");
19008 }
19009
19010 #[test]
19011 fn unicode_zero_width_non_joiner_removed() {
19012 let sanitized = sanitize_query("test\u{200C}query");
19014 assert_eq!(sanitized, "test query");
19015 }
19016
19017 #[test]
19018 fn unicode_zero_width_space_removed() {
19019 let sanitized = sanitize_query("test\u{200B}query");
19021 assert_eq!(sanitized, "test query");
19022 }
19023
19024 #[test]
19025 fn unicode_bom_removed() {
19026 let sanitized = sanitize_query("\u{FEFF}test");
19028 assert_eq!(sanitized, " test");
19029 }
19030
19031 #[test]
19034 fn unicode_precomposed_accent_preserved() {
19035 let sanitized = sanitize_query("café");
19037 assert_eq!(sanitized, "café");
19038 }
19039
19040 #[test]
19041 fn unicode_combining_accent_becomes_separator() {
19042 let input = "cafe\u{0301}";
19046 let sanitized = sanitize_query(input);
19047 assert_eq!(sanitized, "caf\u{00e9}");
19048 }
19049
19050 #[test]
19051 fn unicode_nfc_and_nfd_produce_same_sanitized_query() {
19052 let nfc = "caf\u{00E9}";
19054 let nfd = "cafe\u{0301}";
19056
19057 let san_nfc = sanitize_query(nfc);
19058 let san_nfd = sanitize_query(nfd);
19059
19060 assert_eq!(san_nfc, "café");
19064 assert_eq!(san_nfd, "café");
19065 assert_eq!(san_nfc, san_nfd);
19066 }
19067
19068 #[test]
19069 fn unicode_combining_marks_do_not_panic() {
19070 let zalgo = "t\u{0301}\u{0302}\u{0303}e\u{0304}\u{0305}st";
19072 let sanitized = sanitize_query(zalgo);
19073 assert!(sanitized.contains('t'));
19075 assert!(sanitized.contains('s'));
19076 }
19077
19078 #[test]
19081 fn unicode_mathematical_bold_letters_preserved() {
19082 let input = "\u{1D400}\u{1D401}\u{1D402}";
19084 let sanitized = sanitize_query(input);
19085 assert_eq!(
19086 sanitized, input,
19087 "Mathematical bold letters are alphanumeric"
19088 );
19089 }
19090
19091 #[test]
19092 fn unicode_supplementary_ideograph_preserved() {
19093 let input = "\u{20000}";
19095 let sanitized = sanitize_query(input);
19096 assert_eq!(
19097 sanitized, input,
19098 "Supplementary CJK ideographs are alphanumeric"
19099 );
19100 }
19101
19102 #[test]
19103 fn unicode_supplementary_emoji_removed() {
19104 let input = "test\u{1F600}query";
19106 let sanitized = sanitize_query(input);
19107 assert_eq!(sanitized, "test query");
19108 }
19109
19110 #[test]
19113 fn unicode_bidi_mixed_ltr_rtl_no_panic() {
19114 let input = "hello שלום world עולם";
19115 let tokens = parse_boolean_query(input);
19116 let terms: Vec<_> = tokens
19117 .iter()
19118 .filter_map(|t| match t {
19119 QueryToken::Term(s) => Some(s.as_str()),
19120 _ => None,
19121 })
19122 .collect();
19123 assert_eq!(terms.len(), 4);
19124 assert!(terms.contains(&"hello"));
19125 assert!(terms.contains(&"שלום"));
19126 assert!(terms.contains(&"world"));
19127 assert!(terms.contains(&"עולם"));
19128 }
19129
19130 #[test]
19131 fn unicode_bidi_override_chars_removed() {
19132 let input = "test\u{202D}content\u{202C}end";
19135 let sanitized = sanitize_query(input);
19136 assert_eq!(sanitized, "test content end");
19137 }
19138
19139 #[test]
19140 fn unicode_bidi_rtl_mark_removed() {
19141 let input = "test\u{200F}content";
19143 let sanitized = sanitize_query(input);
19144 assert_eq!(sanitized, "test content");
19145 }
19146
19147 #[test]
19150 fn unicode_full_pipeline_cjk_query() {
19151 let explanation = QueryExplanation::analyze("测试 代码", &SearchFilters::default());
19152 assert_eq!(explanation.parsed.terms.len(), 2);
19153 assert!(!explanation.parsed.terms[0].text.is_empty());
19154 assert!(!explanation.parsed.terms[1].text.is_empty());
19155 }
19156
19157 #[test]
19158 fn unicode_full_pipeline_mixed_script_boolean() {
19159 let explanation =
19160 QueryExplanation::analyze("Hello AND 世界 OR مرحبا", &SearchFilters::default());
19161 assert!(
19163 explanation.parsed.operators.iter().any(|op| op == "AND"),
19164 "AND operator should be recognized in mixed-script query"
19165 );
19166 }
19167
19168 #[test]
19169 fn unicode_full_pipeline_emoji_query_type() {
19170 let explanation = QueryExplanation::analyze("🚀🔥💻", &SearchFilters::default());
19172 assert!(
19174 explanation.parsed.terms.is_empty()
19175 || explanation
19176 .parsed
19177 .terms
19178 .iter()
19179 .all(|t| t.subterms.is_empty()),
19180 "All-emoji query should produce no meaningful terms"
19181 );
19182 }
19183
19184 #[test]
19185 fn unicode_full_pipeline_phrase_with_cjk() {
19186 let explanation = QueryExplanation::analyze("\"测试代码\"", &SearchFilters::default());
19187 assert!(
19188 !explanation.parsed.phrases.is_empty(),
19189 "CJK phrase should be recognized"
19190 );
19191 }
19192
19193 #[test]
19194 fn unicode_full_pipeline_wildcard_with_unicode() {
19195 let explanation = QueryExplanation::analyze("*测试*", &SearchFilters::default());
19196 assert!(
19197 !explanation.parsed.terms.is_empty(),
19198 "Wildcard with CJK should produce terms"
19199 );
19200 if let Some(term) = explanation.parsed.terms.first() {
19202 assert!(
19203 term.subterms
19204 .iter()
19205 .any(|s| s.pattern.contains("*") || s.pattern == "exact"),
19206 "CJK wildcard should produce wildcard or exact pattern"
19207 );
19208 }
19209 }
19210
19211 #[test]
19212 fn unicode_query_terms_lower_case_folding() {
19213 let terms = QueryTermsLower::from_query("STRAßE");
19215 assert_eq!(terms.query_lower, "straße");
19216
19217 let terms2 = QueryTermsLower::from_query("HELLO");
19220 assert_eq!(terms2.query_lower, "hello");
19221 }
19222
19223 #[test]
19224 fn unicode_normalize_term_parts_cjk() {
19225 let parts = normalize_term_parts("测试 代码");
19226 assert_eq!(parts, vec!["测试", "代码"]);
19227 }
19228
19229 #[test]
19230 fn unicode_normalize_term_parts_strips_emoji() {
19231 let parts = normalize_term_parts("🚀launch🔥code");
19232 assert!(parts.contains(&"launch".to_string()));
19234 assert!(parts.contains(&"code".to_string()));
19235 }
19236
19237 #[test]
19242 fn special_char_unbalanced_quote_no_panic() {
19243 let tokens = parse_boolean_query("\"hello world");
19244 assert!(
19245 tokens
19246 .iter()
19247 .any(|t| matches!(t, QueryToken::Phrase(p) if p.contains("hello"))),
19248 "Unbalanced quote should still produce a phrase: {tokens:?}"
19249 );
19250 }
19251
19252 #[test]
19253 fn special_char_unbalanced_trailing_quote() {
19254 let tokens = parse_boolean_query("test\"");
19255 assert!(
19256 tokens
19257 .iter()
19258 .any(|t| matches!(t, QueryToken::Term(w) if w == "test")),
19259 "Text before trailing quote should parse as term: {tokens:?}"
19260 );
19261 }
19262
19263 #[test]
19264 fn special_char_multiple_unbalanced_quotes() {
19265 let tokens = parse_boolean_query("\"foo \"bar");
19266 assert!(
19267 !tokens.is_empty(),
19268 "Should parse despite odd quotes: {tokens:?}"
19269 );
19270 }
19271
19272 #[test]
19273 fn special_char_empty_quotes() {
19274 let tokens = parse_boolean_query("\"\" test");
19275 assert!(
19276 tokens
19277 .iter()
19278 .any(|t| matches!(t, QueryToken::Term(w) if w == "test")),
19279 "Empty quotes should be skipped: {tokens:?}"
19280 );
19281 }
19282
19283 #[test]
19284 fn special_char_unbalanced_via_sanitize() {
19285 let sanitized = sanitize_query("\"hello world");
19286 assert!(
19287 sanitized.contains('"'),
19288 "Quotes preserved by sanitize_query"
19289 );
19290 }
19291
19292 #[test]
19295 fn special_char_backslash_quote_sanitize() {
19296 let sanitized = sanitize_query("\\\"test\\\"");
19297 assert!(sanitized.contains('"'));
19298 assert!(!sanitized.contains('\\'), "Backslash should be stripped");
19299 }
19300
19301 #[test]
19302 fn special_char_backslash_quote_parse() {
19303 let tokens = parse_boolean_query("\\\"test\\\"");
19304 assert!(!tokens.is_empty(), "Should parse without panic: {tokens:?}");
19305 }
19306
19307 #[test]
19308 fn special_char_inner_escaped_quotes() {
19309 let tokens = parse_boolean_query("\"test \\\"inner\\\" test\"");
19310 assert!(
19311 !tokens.is_empty(),
19312 "Nested escaped quotes should not panic: {tokens:?}"
19313 );
19314 }
19315
19316 #[test]
19319 fn special_char_windows_path_sanitize() {
19320 let sanitized = sanitize_query("C:\\Users\\test");
19321 assert_eq!(sanitized, "C Users test");
19322 }
19323
19324 #[test]
19325 fn special_char_unc_path_sanitize() {
19326 let sanitized = sanitize_query("\\\\server\\share");
19327 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19328 assert!(parts.contains(&"server"));
19329 assert!(parts.contains(&"share"));
19330 }
19331
19332 #[test]
19333 fn special_char_windows_path_terms() {
19334 let parts = normalize_term_parts("C:\\Users\\test\\file.rs");
19335 assert!(parts.contains(&"C".to_string()));
19336 assert!(parts.contains(&"Users".to_string()));
19337 assert!(parts.contains(&"test".to_string()));
19338 assert!(parts.contains(&"file".to_string()));
19339 assert!(parts.contains(&"rs".to_string()));
19340 }
19341
19342 #[test]
19345 fn special_char_regex_dot_star() {
19346 let sanitized = sanitize_query("foo.*bar");
19347 assert_eq!(sanitized, "foo *bar");
19348 }
19349
19350 #[test]
19351 fn special_char_regex_char_class() {
19352 let sanitized = sanitize_query("[a-z]+");
19353 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19354 assert_eq!(parts, vec!["a-z"]);
19355 assert_eq!(normalize_term_parts("[a-z]+"), vec!["a", "z"]);
19356 }
19357
19358 #[test]
19359 fn special_char_regex_anchors() {
19360 let sanitized = sanitize_query("^start$");
19361 assert_eq!(sanitized.trim(), "start");
19362 }
19363
19364 #[test]
19365 fn special_char_regex_pipe_groups() {
19366 let sanitized = sanitize_query("(foo|bar)");
19367 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19368 assert_eq!(parts, vec!["foo", "bar"]);
19369 }
19370
19371 #[test]
19374 fn special_char_sql_injection_or() {
19375 let sanitized = sanitize_query("'OR 1=1--");
19376 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19377 assert!(parts.contains(&"OR"));
19378 assert!(parts.contains(&"1"));
19379 assert!(!sanitized.contains('\''));
19380 assert!(!sanitized.contains('='));
19381 }
19382
19383 #[test]
19384 fn special_char_sql_injection_drop() {
19385 let sanitized = sanitize_query("; DROP TABLE users;--");
19386 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19387 assert!(parts.contains(&"DROP"));
19388 assert!(parts.contains(&"TABLE"));
19389 assert!(parts.contains(&"users"));
19390 assert!(!sanitized.contains(';'));
19391 }
19392
19393 #[test]
19394 fn special_char_sql_injection_union() {
19395 let sanitized = sanitize_query("' UNION SELECT * FROM passwords --");
19396 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19397 assert!(parts.contains(&"UNION"));
19398 assert!(parts.contains(&"SELECT"));
19399 assert!(parts.contains(&"*"));
19400 assert!(parts.contains(&"FROM"));
19401 assert!(parts.contains(&"passwords"));
19402 }
19403
19404 #[test]
19405 fn special_char_sql_parse_as_literal() {
19406 let tokens = parse_boolean_query("OR 1=1");
19407 assert!(
19408 tokens.iter().any(|t| matches!(t, QueryToken::Or)),
19409 "OR should be parsed as Or operator: {tokens:?}"
19410 );
19411 }
19412
19413 #[test]
19416 fn special_char_shell_subshell() {
19417 let sanitized = sanitize_query("$(cmd)");
19418 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19419 assert_eq!(parts, vec!["cmd"]);
19420 }
19421
19422 #[test]
19423 fn special_char_shell_backticks() {
19424 let sanitized = sanitize_query("`cmd`");
19425 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19426 assert_eq!(parts, vec!["cmd"]);
19427 }
19428
19429 #[test]
19430 fn special_char_shell_pipe_rm() {
19431 let sanitized = sanitize_query("| rm -rf /");
19432 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19433 assert!(parts.contains(&"rm"));
19434 assert!(parts.contains(&"-rf"));
19435 assert_eq!(normalize_term_parts("| rm -rf /"), vec!["rm", "rf"]);
19436 assert!(!sanitized.contains('|'));
19437 assert!(!sanitized.contains('/'));
19438 }
19439
19440 #[test]
19441 fn special_char_shell_semicolon_chain() {
19442 let sanitized = sanitize_query("test; echo pwned; cat /etc/passwd");
19443 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19444 assert!(parts.contains(&"test"));
19445 assert!(parts.contains(&"echo"));
19446 assert!(parts.contains(&"pwned"));
19447 assert!(!sanitized.contains(';'));
19448 }
19449
19450 #[test]
19453 fn special_char_null_byte_mid_string() {
19454 let sanitized = sanitize_query("test\x00hidden");
19455 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19456 assert_eq!(parts, vec!["test", "hidden"]);
19457 }
19458
19459 #[test]
19460 fn special_char_null_byte_leading() {
19461 let sanitized = sanitize_query("\x00\x00attack");
19462 assert_eq!(sanitized.trim(), "attack");
19463 }
19464
19465 #[test]
19466 fn special_char_null_byte_trailing() {
19467 let sanitized = sanitize_query("query\x00\x00\x00");
19468 assert_eq!(sanitized.trim(), "query");
19469 }
19470
19471 #[test]
19472 fn special_char_null_byte_parse() {
19473 let tokens = parse_boolean_query("test\x00hidden");
19474 assert!(
19475 !tokens.is_empty(),
19476 "Null bytes should not prevent parsing: {tokens:?}"
19477 );
19478 }
19479
19480 #[test]
19483 fn special_char_control_newline() {
19484 let sanitized = sanitize_query("line1\nline2");
19485 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19486 assert_eq!(parts, vec!["line1", "line2"]);
19487 }
19488
19489 #[test]
19490 fn special_char_control_tab_cr() {
19491 let sanitized = sanitize_query("tab\there\r\nend");
19492 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19493 assert_eq!(parts, vec!["tab", "here", "end"]);
19494 }
19495
19496 #[test]
19497 fn special_char_control_parse_whitespace() {
19498 let tokens = parse_boolean_query("hello\tworld\ntest");
19499 let terms: Vec<&str> = tokens
19500 .iter()
19501 .filter_map(|t| match t {
19502 QueryToken::Term(s) => Some(s.as_str()),
19503 _ => None,
19504 })
19505 .collect();
19506 assert_eq!(terms, vec!["hello", "world", "test"]);
19507 }
19508
19509 #[test]
19510 fn special_char_control_bell_escape() {
19511 let sanitized = sanitize_query("test\x07\x1b[31mred");
19512 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19513 assert!(parts.contains(&"test"));
19514 assert!(parts.contains(&"31mred"));
19515 }
19516
19517 #[test]
19520 fn special_char_html_entity_lt() {
19521 let sanitized = sanitize_query("<script>");
19522 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19523 assert_eq!(parts, vec!["lt", "script", "gt"]);
19524 }
19525
19526 #[test]
19527 fn special_char_html_numeric_entity() {
19528 let sanitized = sanitize_query("<script>");
19529 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19530 assert!(parts.contains(&"x3C"));
19531 assert!(parts.contains(&"script"));
19532 assert!(parts.contains(&"x3E"));
19533 }
19534
19535 #[test]
19536 fn special_char_html_tags_stripped() {
19537 let sanitized = sanitize_query("<script>alert('xss')</script>");
19538 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19539 assert!(parts.contains(&"script"));
19540 assert!(parts.contains(&"alert"));
19541 assert!(parts.contains(&"xss"));
19542 }
19543
19544 #[test]
19545 fn special_char_html_attribute() {
19546 let sanitized = sanitize_query("<img src=\"evil.js\" onerror=\"alert(1)\">");
19547 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19548 assert!(parts.contains(&"img"));
19549 assert!(parts.contains(&"src"));
19550 assert!(parts.contains(&"onerror"));
19551 }
19552
19553 #[test]
19556 fn special_char_url_percent_encoding() {
19557 let sanitized = sanitize_query("%20space%2Fslash");
19558 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19559 assert_eq!(parts, vec!["20space", "2Fslash"]);
19560 }
19561
19562 #[test]
19563 fn special_char_url_null_byte_encoded() {
19564 let sanitized = sanitize_query("test%00hidden");
19565 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19566 assert_eq!(parts, vec!["test", "00hidden"]);
19567 }
19568
19569 #[test]
19570 fn special_char_url_full_query_string() {
19571 let sanitized = sanitize_query("search?q=hello&lang=en");
19572 let parts: Vec<&str> = sanitized.split_whitespace().collect();
19573 assert_eq!(parts, vec!["search", "q", "hello", "lang", "en"]);
19574 }
19575
19576 #[test]
19579 fn special_char_explain_sql_injection() {
19580 let filters = SearchFilters::default();
19581 let explanation = QueryExplanation::analyze("'OR 1=1--", &filters);
19582 assert!(
19583 !explanation.parsed.terms.is_empty() || !explanation.parsed.phrases.is_empty(),
19584 "SQL injection should produce parseable terms"
19585 );
19586 }
19587
19588 #[test]
19589 fn special_char_explain_shell_injection() {
19590 let filters = SearchFilters::default();
19591 let explanation = QueryExplanation::analyze("$(rm -rf /)", &filters);
19592 assert!(
19593 !explanation.parsed.terms.is_empty(),
19594 "Shell injection should produce parseable terms"
19595 );
19596 }
19597
19598 #[test]
19599 fn special_char_explain_html_xss() {
19600 let filters = SearchFilters::default();
19601 let explanation = QueryExplanation::analyze("<script>alert('xss')</script>", &filters);
19602 assert!(
19603 !explanation.parsed.terms.is_empty(),
19604 "XSS payload should produce parseable terms"
19605 );
19606 }
19607
19608 #[test]
19609 fn special_char_terms_lower_injection() {
19610 let qt = QueryTermsLower::from_query("'; DROP TABLE--");
19611 let tokens: Vec<&str> = qt.tokens().collect();
19612 for token in &tokens {
19613 assert!(
19614 token.chars().all(|c| c.is_alphanumeric()),
19615 "Token should only contain alphanumeric characters: {token}"
19616 );
19617 }
19618 }
19619
19620 #[test]
19621 fn special_char_terms_lower_null_bytes() {
19622 let qt = QueryTermsLower::from_query("test\x00hidden");
19623 let tokens: Vec<&str> = qt.tokens().collect();
19624 assert!(tokens.contains(&"test"));
19625 assert!(tokens.contains(&"hidden"));
19626 }
19627
19628 #[test]
19629 fn special_char_boolean_with_injection() {
19630 let tokens = parse_boolean_query("search AND 'OR 1=1-- NOT drop");
19631 assert!(
19632 tokens.iter().any(|t| matches!(t, QueryToken::And)),
19633 "Boolean AND should still be recognized: {tokens:?}"
19634 );
19635 assert!(
19636 tokens.iter().any(|t| matches!(t, QueryToken::Not)),
19637 "Boolean NOT should still be recognized: {tokens:?}"
19638 );
19639 }
19640
19641 #[test]
19647 fn stress_query_100k_chars_completes_quickly() {
19648 let long_query = "a ".repeat(50000);
19650 assert_eq!(long_query.len(), 100000);
19651
19652 let start = std::time::Instant::now();
19653 let sanitized = sanitize_query(&long_query);
19654 let elapsed_sanitize = start.elapsed();
19655
19656 let start = std::time::Instant::now();
19657 let tokens = parse_boolean_query(&sanitized);
19658 let elapsed_parse = start.elapsed();
19659
19660 assert!(
19661 elapsed_sanitize < std::time::Duration::from_secs(1),
19662 "sanitize_query with 100k chars took {:?} (>1s)",
19663 elapsed_sanitize
19664 );
19665 assert!(
19666 elapsed_parse < std::time::Duration::from_secs(1),
19667 "parse_boolean_query with 100k chars took {:?} (>1s)",
19668 elapsed_parse
19669 );
19670 assert!(!tokens.is_empty(), "100k char query should produce tokens");
19671 }
19672
19673 #[test]
19674 fn stress_query_1000_terms() {
19675 let words: Vec<String> = (0..1000).map(|i| format!("word{}", i)).collect();
19677 let query = words.join(" ");
19678
19679 let start = std::time::Instant::now();
19680 let sanitized = sanitize_query(&query);
19681 let tokens = parse_boolean_query(&sanitized);
19682 let elapsed = start.elapsed();
19683
19684 assert!(
19685 elapsed < std::time::Duration::from_secs(1),
19686 "1000 terms query took {:?} (>1s)",
19687 elapsed
19688 );
19689 let term_count = tokens
19691 .iter()
19692 .filter(|t| matches!(t, QueryToken::Term(_)))
19693 .count();
19694 assert!(
19695 term_count >= 900,
19696 "Expected ~1000 terms, got {} terms",
19697 term_count
19698 );
19699 }
19700
19701 #[test]
19702 fn stress_query_1000_identical_terms() {
19703 let query = "test ".repeat(1000);
19705
19706 let start = std::time::Instant::now();
19707 let sanitized = sanitize_query(&query);
19708 let tokens = parse_boolean_query(&sanitized);
19709 let elapsed = start.elapsed();
19710
19711 assert!(
19712 elapsed < std::time::Duration::from_secs(1),
19713 "1000 identical terms query took {:?} (>1s)",
19714 elapsed
19715 );
19716
19717 let parsed_term_count = tokens
19719 .iter()
19720 .filter(|t| matches!(t, QueryToken::Term(_)))
19721 .count();
19722 assert_eq!(parsed_term_count, 1000, "Parser should produce 1000 terms");
19723
19724 let qt = QueryTermsLower::from_query(&query);
19726 let tokens_lower: Vec<&str> = qt.tokens().collect();
19727 assert_eq!(
19728 tokens_lower.len(),
19729 1000,
19730 "All 1000 identical terms should be preserved"
19731 );
19732 assert!(
19733 tokens_lower.iter().all(|t| *t == "test"),
19734 "All tokens should be 'test'"
19735 );
19736 }
19737
19738 #[test]
19739 fn stress_query_10k_char_single_term() {
19740 let long_term = "a".repeat(10000);
19742
19743 let start = std::time::Instant::now();
19744 let sanitized = sanitize_query(&long_term);
19745 let tokens = parse_boolean_query(&sanitized);
19746 let elapsed = start.elapsed();
19747
19748 assert!(
19749 elapsed < std::time::Duration::from_secs(1),
19750 "10k char single term took {:?} (>1s)",
19751 elapsed
19752 );
19753 assert_eq!(tokens.len(), 1, "Should produce exactly one token");
19754 assert!(
19755 matches!(&tokens[0], QueryToken::Term(t) if t.len() == 10000),
19756 "Expected Term token"
19757 );
19758 }
19759
19760 #[test]
19761 fn stress_deeply_nested_parentheses() {
19762 let open_parens = "(".repeat(100);
19765 let close_parens = ")".repeat(100);
19766 let query = format!("{}test{}", open_parens, close_parens);
19767
19768 let start = std::time::Instant::now();
19769 let sanitized = sanitize_query(&query);
19770 let tokens = parse_boolean_query(&sanitized);
19771 let elapsed = start.elapsed();
19772
19773 assert!(
19774 elapsed < std::time::Duration::from_millis(100),
19775 "Deeply nested parens took {:?} (>100ms)",
19776 elapsed
19777 );
19778 let term_count = tokens
19780 .iter()
19781 .filter(|t| matches!(t, QueryToken::Term(_)))
19782 .count();
19783 assert_eq!(term_count, 1, "Should have 1 term after sanitizing parens");
19784 }
19785
19786 #[test]
19787 fn stress_many_boolean_operators() {
19788 let terms: Vec<String> = (0..101).map(|i| format!("term{}", i)).collect();
19790 let query = terms.join(" AND ");
19791
19792 let start = std::time::Instant::now();
19793 let tokens = parse_boolean_query(&query);
19794 let elapsed = start.elapsed();
19795
19796 assert!(
19797 elapsed < std::time::Duration::from_secs(1),
19798 "100+ boolean ops took {:?} (>1s)",
19799 elapsed
19800 );
19801
19802 let and_count = tokens
19803 .iter()
19804 .filter(|t| matches!(t, QueryToken::And))
19805 .count();
19806 let term_count = tokens
19807 .iter()
19808 .filter(|t| matches!(t, QueryToken::Term(_)))
19809 .count();
19810
19811 assert_eq!(and_count, 100, "Should have 100 AND operators");
19812 assert_eq!(term_count, 101, "Should have 101 terms");
19813 }
19814
19815 #[test]
19816 fn stress_many_or_operators() {
19817 let terms: Vec<String> = (0..101).map(|i| format!("opt{}", i)).collect();
19819 let query = terms.join(" OR ");
19820
19821 let start = std::time::Instant::now();
19822 let tokens = parse_boolean_query(&query);
19823 let elapsed = start.elapsed();
19824
19825 assert!(
19826 elapsed < std::time::Duration::from_secs(1),
19827 "100+ OR ops took {:?} (>1s)",
19828 elapsed
19829 );
19830
19831 let or_count = tokens
19832 .iter()
19833 .filter(|t| matches!(t, QueryToken::Or))
19834 .count();
19835 assert_eq!(or_count, 100, "Should have 100 OR operators");
19836 }
19837
19838 #[test]
19839 fn stress_mixed_boolean_operators() {
19840 let query = "a AND b OR c NOT d AND e OR f NOT g ".repeat(50);
19842
19843 let start = std::time::Instant::now();
19844 let tokens = parse_boolean_query(&query);
19845 let elapsed = start.elapsed();
19846
19847 assert!(
19848 elapsed < std::time::Duration::from_secs(1),
19849 "Mixed boolean ops took {:?} (>1s)",
19850 elapsed
19851 );
19852 assert!(
19853 !tokens.is_empty(),
19854 "Complex boolean query should produce tokens"
19855 );
19856 }
19857
19858 #[test]
19859 fn stress_memory_bounds_large_query() {
19860 let large_query = "x".repeat(100000);
19864
19865 let sanitized = sanitize_query(&large_query);
19866 let tokens = parse_boolean_query(&sanitized);
19867
19868 assert!(
19870 sanitized.len() <= large_query.len(),
19871 "Sanitized output should not exceed input size"
19872 );
19873
19874 assert_eq!(tokens.len(), 1);
19876
19877 let qt = QueryTermsLower::from_query(&large_query);
19879 let token_count = qt.tokens().count();
19880 assert_eq!(token_count, 1, "Should be 1 token of 100k chars");
19881 }
19882
/// Runs sanitize / parse / `QueryTermsLower` for 100 distinct queries, each on
/// its own thread, and checks that every thread produced tokens.
#[test]
fn stress_concurrent_queries() {
    let mut inputs: Vec<String> = Vec::with_capacity(100);
    for n in 0..100 {
        inputs.push(format!("concurrent_query_{} test search", n));
    }

    // Every thread is spawned before the first join below.
    let mut workers = Vec::with_capacity(inputs.len());
    for text in inputs {
        workers.push(std::thread::spawn(move || {
            let cleaned = sanitize_query(&text);
            let parsed = parse_boolean_query(&cleaned);
            let lowered = QueryTermsLower::from_query(&text);
            (parsed.len(), lowered.tokens().count())
        }));
    }

    for (idx, worker) in workers.into_iter().enumerate() {
        let (parsed_len, lowered_len) = worker.join().expect("Thread panicked");
        assert!(parsed_len > 0, "Query {} should produce tokens", idx);
        assert!(
            lowered_len > 0,
            "Query {} QueryTermsLower should have tokens",
            idx
        );
    }
}
19909
/// Joins 50 quoted phrases with AND and checks the parser returns within one
/// second and reports exactly 50 `Phrase` tokens.
#[test]
fn stress_many_quoted_phrases() {
    use std::time::{Duration, Instant};

    let input = (0..50)
        .map(|n| format!("\"phrase number {}\"", n))
        .collect::<Vec<String>>()
        .join(" AND ");

    let timer = Instant::now();
    let parsed = parse_boolean_query(&input);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "50 quoted phrases took {:?} (>1s)",
        took
    );

    let mut phrases = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Phrase(_)) {
            phrases += 1;
        }
    }
    assert_eq!(phrases, 50, "Should have 50 phrases");
}
19934
/// Builds 100 words where even-indexed ones are quoted and odd-indexed ones
/// are bare, then checks the parser yields 50 phrases and 50 terms in under a
/// second.
#[test]
fn stress_alternating_quotes() {
    use std::time::{Duration, Instant};

    let mut pieces: Vec<String> = Vec::with_capacity(100);
    for n in 0..100 {
        let piece = match n % 2 {
            0 => format!("\"word{}\"", n),
            _ => format!("word{}", n),
        };
        pieces.push(piece);
    }
    let input = pieces.join(" ");

    let timer = Instant::now();
    let parsed = parse_boolean_query(&input);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100 alternating quotes took {:?} (>1s)",
        took
    );

    // Tally both token kinds in a single pass.
    let (mut phrases, mut terms) = (0usize, 0usize);
    for tok in parsed.iter() {
        match tok {
            QueryToken::Phrase(_) => phrases += 1,
            QueryToken::Term(_) => terms += 1,
            _ => {}
        }
    }

    assert_eq!(phrases, 50, "Should have 50 phrases");
    assert_eq!(terms, 50, "Should have 50 terms");
}
19971
/// Cycles through seven wildcard shapes to build a 100-pattern query and
/// checks that sanitizing plus parsing stays under one second and yields
/// tokens.
#[test]
fn stress_many_wildcards() {
    use std::time::{Duration, Instant};

    let shapes = ["pre*", "*suf", "*sub*", "a*b", "test*", "*ing", "*tion*"];
    let input = (0..100)
        .map(|n| shapes[n % shapes.len()])
        .collect::<Vec<&str>>()
        .join(" ");

    // Both sanitizing and parsing are inside the timed window.
    let timer = Instant::now();
    let cleaned = sanitize_query(&input);
    let parsed = parse_boolean_query(&cleaned);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100 wildcards took {:?} (>1s)",
        took
    );
    assert!(!parsed.is_empty());
}
19996
/// Runs `QueryExplanation::analyze` on a 100-term query with default filters
/// and checks it finishes within two seconds with a non-empty parsed term list.
#[test]
fn stress_query_explanation_large_query() {
    use std::time::{Duration, Instant};

    let input = (0..100)
        .map(|n| format!("term{}", n))
        .collect::<Vec<String>>()
        .join(" ");
    let no_filters = SearchFilters::default();

    let timer = Instant::now();
    let analysis = QueryExplanation::analyze(&input, &no_filters);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(2),
        "QueryExplanation for 100 terms took {:?} (>2s)",
        took
    );
    assert!(
        !analysis.parsed.terms.is_empty(),
        "Should parse terms successfully"
    );
}
20018
/// Wraps 500 space-separated words in one pair of quotes and checks the parser
/// returns exactly one `Phrase` token within one second.
#[test]
fn stress_very_long_single_quoted_phrase() {
    use std::time::{Duration, Instant};

    let mut inner: Vec<String> = Vec::with_capacity(500);
    for n in 0..500 {
        inner.push(format!("word{}", n));
    }
    let input = format!("\"{}\"", inner.join(" "));

    let timer = Instant::now();
    let parsed = parse_boolean_query(&input);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "500-word phrase took {:?} (>1s)",
        took
    );

    let mut phrases = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Phrase(_)) {
            phrases += 1;
        }
    }
    assert_eq!(phrases, 1, "Should have exactly 1 phrase");
}
20041
/// Parses 100 `-term` style exclusions and checks that each one produces a
/// `Not` token, all within one second.
#[test]
fn stress_not_prefix_many() {
    use std::time::{Duration, Instant};

    let input = (0..100)
        .map(|n| format!("-term{}", n))
        .collect::<Vec<String>>()
        .join(" ");

    let timer = Instant::now();
    let parsed = parse_boolean_query(&input);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "100 NOT prefixes took {:?} (>1s)",
        took
    );

    let mut negations = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Not) {
            negations += 1;
        }
    }
    assert_eq!(negations, 100, "Should have 100 NOT operators");
}
20064
/// Repeats a mixed Chinese/Japanese/Korean string 1000 times and checks that
/// sanitizing plus `QueryTermsLower` construction stays under one second and
/// produces a non-empty result.
#[test]
fn stress_unicode_large_cjk_query() {
    use std::time::{Duration, Instant};

    let input = "中文日本語한국어".repeat(1000);

    let timer = Instant::now();
    let cleaned = sanitize_query(&input);
    let lowered = QueryTermsLower::from_query(&cleaned);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Large CJK query took {:?} (>1s)",
        took
    );
    assert!(!lowered.is_empty(), "CJK query should produce tokens");
}
20082
/// Sanitizes and parses 2500 space-separated emoji; the test expects this to
/// finish within one second and to leave no tokens behind.
#[test]
fn stress_unicode_many_emoji() {
    use std::time::{Duration, Instant};

    let input = "🚀 🔍 📝 💻 🎯 ".repeat(500);

    let timer = Instant::now();
    let cleaned = sanitize_query(&input);
    let parsed = parse_boolean_query(&cleaned);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Emoji query took {:?} (>1s)",
        took
    );
    assert!(
        parsed.is_empty(),
        "Emoji-only query should produce no tokens"
    );
}
20104
/// Feeds 100 copies of a snippet mixing code, SQL, prose with CJK characters,
/// a quoted error message and a URL through sanitize / parse /
/// `QueryTermsLower`, checking the whole pipeline stays under two seconds and
/// produces output at every stage.
#[test]
fn stress_mixed_content_large() {
    use std::time::{Duration, Instant};

    let sample = r#"
 function test() { return x + y; }
 SELECT * FROM users WHERE id = 1;
 The quick brown fox 狐狸 jumps over lazy dog
 Error: "undefined is not a function" at line 42
 https://example.com/path?query=value&other=123
 "#
    .repeat(100);

    // All three stages are inside the timed window.
    let timer = Instant::now();
    let cleaned = sanitize_query(&sample);
    let parsed = parse_boolean_query(&cleaned);
    let lowered = QueryTermsLower::from_query(&sample);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(2),
        "Mixed content query took {:?} (>2s)",
        took
    );
    assert!(!parsed.is_empty());
    assert!(!lowered.is_empty());
}
20131
/// Checks how emoji embedded between alphanumeric runs are handled: the raw
/// parse is expected to yield a single token, while sanitizing is expected to
/// put whitespace where the emoji were.
#[test]
fn unicode_emoji_mixed_with_alphanumeric() {
    let single_emoji = "rocket🚀launch";

    let parsed = parse_boolean_query(single_emoji);
    assert_eq!(parsed.len(), 1);
    let cleaned = sanitize_query(single_emoji);
    assert_eq!(cleaned, "rocket launch");

    let cleaned_pair = sanitize_query("test🔥🎯code");
    assert_eq!(cleaned_pair, "test code");
}
20151
/// Checks that emoji adjacent to words do not stop the parser from producing
/// terms or from recognising an `OR` operator.
#[test]
fn unicode_emoji_with_boolean_operators() {
    let and_tokens = parse_boolean_query("🚀code AND test");
    let mut terms = 0usize;
    for tok in and_tokens.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert!(terms >= 1, "Should have at least one term");

    let or_tokens = parse_boolean_query("deploy OR 🎯target");
    let mut saw_or = false;
    for tok in or_tokens.iter() {
        if matches!(tok, QueryToken::Or) {
            saw_or = true;
            break;
        }
    }
    assert!(saw_or, "Should detect OR operator");
}
20168
/// Checks sanitizer output for an emoji at the start of a word, at the end of
/// a word, and for an emoji-only input.
#[test]
fn unicode_emoji_at_word_boundaries() {
    let leading = sanitize_query("🔍search");
    assert_eq!(leading, " search");

    let trailing = sanitize_query("complete✅");
    assert_eq!(trailing, "complete ");

    let emoji_only = sanitize_query("🎉🎊🎁");
    let leftover = emoji_only.trim();
    assert!(
        leftover.is_empty(),
        "Emoji-only should be empty after trimming"
    );
}
20186
/// Checks that an Arabic query passes through the sanitizer unchanged and
/// still produces tokens when parsed.
#[test]
fn unicode_arabic_text_preserved() {
    let input = "مرحبا بالعالم";

    let cleaned = sanitize_query(input);
    assert_eq!(
        cleaned, input,
        "Arabic alphanumeric chars should be preserved"
    );

    let parsed = parse_boolean_query(input);
    assert!(!parsed.is_empty(), "Arabic query should produce tokens");
}
20202
/// Checks that a Hebrew query passes through the sanitizer unchanged and
/// still produces tokens when parsed.
#[test]
fn unicode_hebrew_text_preserved() {
    let input = "שלום עולם";

    let cleaned = sanitize_query(input);
    assert_eq!(
        cleaned, input,
        "Hebrew alphanumeric chars should be preserved"
    );

    let parsed = parse_boolean_query(input);
    assert!(!parsed.is_empty(), "Hebrew query should produce tokens");
}
20216
/// Checks that a query mixing Latin and Arabic words is left untouched by the
/// sanitizer and parses into three terms.
#[test]
fn unicode_mixed_rtl_and_ltr() {
    let input = "hello مرحبا world";

    let cleaned = sanitize_query(input);
    assert_eq!(cleaned, input, "Mixed RTL/LTR should be preserved");

    let parsed = parse_boolean_query(input);
    let mut terms = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert_eq!(terms, 3, "Should have 3 terms");
}
20231
/// Checks that `AND` and `NOT` are still recognised as operators when the
/// surrounding terms are Hebrew or Arabic.
#[test]
fn unicode_rtl_with_boolean_operators() {
    let hebrew_tokens = parse_boolean_query("שלום AND עולם");
    let saw_and = hebrew_tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::And))
        .count()
        > 0;
    assert!(saw_and, "Should detect AND operator in Hebrew query");

    let arabic_tokens = parse_boolean_query("مرحبا NOT بالعالم");
    let saw_not = arabic_tokens
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Not))
        .count()
        > 0;
    assert!(saw_not, "Should detect NOT operator in Arabic query");
}
20246
/// Expects the sanitizer to turn each backslash in a path-like query into a
/// single space.
#[test]
fn special_chars_backslash_stripped() {
    let cleaned = sanitize_query(r"path\to\file");
    assert_eq!(cleaned, "path to file");
}
20256
/// Expects double quotes to survive sanitizing even when the input precedes
/// them with backslashes.
#[test]
fn special_chars_escaped_quotes_handling() {
    let cleaned = sanitize_query(r#"say \"hello\""#);
    let kept_quote = cleaned.contains('"');
    assert!(kept_quote, "Quotes should be preserved");
}
20265
/// Expects a Windows-style path to be sanitized into its space-separated
/// components, with the drive colon and backslashes removed.
#[test]
fn special_chars_windows_paths() {
    let cleaned = sanitize_query(r"C:\Users\test\Documents");
    assert_eq!(cleaned, "C Users test Documents");
}
20273
/// Parses a chain of mixed operators and checks the number of `And`, `Or` and
/// `Not` tokens emitted.
#[test]
fn boolean_deeply_nested_operators() {
    let parsed = parse_boolean_query("a AND b OR c NOT d AND e");

    let ands = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::And))
        .count();
    let ors = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Or))
        .count();
    let nots = parsed
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Not))
        .count();

    assert_eq!(ands, 2, "Should have 2 AND operators");
    assert_eq!(ors, 1, "Should have 1 OR operator");
    assert_eq!(nots, 1, "Should have 1 NOT operator");
}
20298
/// Checks that a doubled `AND` does not swallow the surrounding terms: at
/// least two `Term` tokens must still come out.
#[test]
fn boolean_consecutive_operators_degenerate() {
    let parsed = parse_boolean_query("foo AND AND bar");

    let mut terms = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert!(
        terms >= 2,
        "Should have at least 2 terms (foo and bar)"
    );
}
20313
/// Checks that an operator in leading position (`AND foo`, `OR test`) is still
/// emitted as an operator token.
#[test]
fn boolean_operator_at_start() {
    let leading_and = parse_boolean_query("AND foo");
    let saw_and = leading_and
        .iter()
        .filter(|tok| matches!(tok, QueryToken::And))
        .count()
        > 0;
    assert!(saw_and, "Leading AND should be detected");

    let leading_or = parse_boolean_query("OR test");
    let saw_or = leading_or
        .iter()
        .filter(|tok| matches!(tok, QueryToken::Or))
        .count()
        > 0;
    assert!(saw_or, "Leading OR should be detected");
}
20325
/// Checks that a trailing `AND` with no right-hand operand is still emitted as
/// an `And` token.
#[test]
fn boolean_operator_at_end() {
    let parsed = parse_boolean_query("foo AND");
    let mut saw_and = false;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::And) {
            saw_and = true;
            break;
        }
    }
    assert!(saw_and, "Trailing AND should be detected");
}
20333
/// Checks that a purely numeric query parses to one `Term` carrying the same
/// digits and is returned unchanged by the sanitizer.
#[test]
fn numeric_query_digits_only() {
    let digits = "12345";

    let parsed = parse_boolean_query(digits);
    assert_eq!(parsed.len(), 1);
    assert_eq!(parsed[0], QueryToken::Term(String::from("12345")));

    let cleaned = sanitize_query(digits);
    assert_eq!(cleaned, "12345");
}
20346
/// Checks that a query mixing words and a number yields at least three `Term`
/// tokens.
#[test]
fn numeric_query_with_text() {
    let parsed = parse_boolean_query("error 404 not found");

    let mut terms = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert!(terms >= 3, "Should have at least 3 terms");
}
20358
/// Expects the sanitizer to replace the dots of a version number with spaces.
#[test]
fn numeric_versions_with_dots() {
    let cleaned = sanitize_query("version 1.2.3");
    assert_eq!(cleaned, "version 1 2 3");
}
20365
/// Checks that tab characters split a query into separate terms.
#[test]
fn whitespace_tabs_treated_as_separators() {
    let parsed = parse_boolean_query("foo\tbar\tbaz");

    let mut terms = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert_eq!(terms, 3, "Tabs should separate terms");
}
20377
/// Checks that newline characters split a query into separate terms.
#[test]
fn whitespace_newlines_treated_as_separators() {
    let parsed = parse_boolean_query("foo\nbar\nbaz");

    let mut terms = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert_eq!(terms, 3, "Newlines should separate terms");
}
20387
/// Checks that runs of mixed spaces, tabs and newlines separate terms without
/// producing extra or missing tokens.
#[test]
fn whitespace_mixed_types() {
    let parsed = parse_boolean_query("a \t b \n c d");

    let mut terms = 0usize;
    for tok in parsed.iter() {
        if matches!(tok, QueryToken::Term(_)) {
            terms += 1;
        }
    }
    assert_eq!(terms, 4, "Mixed whitespace should separate properly");
}
20397
/// Parses a single 10,000-character word and checks it comes back as exactly
/// one `Term` of the same length within one second.
#[test]
fn stress_very_long_single_term() {
    use std::time::{Duration, Instant};

    let input = "a".repeat(10_000);

    let timer = Instant::now();
    let tokens = parse_boolean_query(&input);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "10K char term took {:?} (>1s)",
        took
    );
    assert_eq!(tokens.len(), 1);

    let is_full_length_term =
        matches!(tokens.first(), Some(QueryToken::Term(t)) if t.len() == 10_000);
    assert!(
        is_full_length_term,
        "Expected 10K Term token, got {tokens:?}"
    );
}
20420
/// Builds a 6000-character word followed by `*` and checks that sanitizing
/// plus `WildcardPattern::parse` classifies it as a prefix pattern within one
/// second.
#[test]
fn stress_very_long_term_with_wildcard() {
    use std::time::{Duration, Instant};

    let mut input = "prefix".repeat(1000);
    input.push('*');

    let timer = Instant::now();
    let cleaned = sanitize_query(&input);
    let parsed = WildcardPattern::parse(&cleaned);
    let took = timer.elapsed();

    assert!(
        took < Duration::from_secs(1),
        "Long wildcard pattern took {:?} (>1s)",
        took
    );

    let is_prefix = matches!(parsed, WildcardPattern::Prefix(_));
    assert!(is_prefix, "Should parse as prefix pattern");
}
20441
/// An empty query analysed with default filters must be classified as
/// `QueryType::Empty`.
#[test]
fn query_explanation_empty_query() {
    let no_filters = SearchFilters::default();
    let analysis = QueryExplanation::analyze("", &no_filters);
    assert_eq!(analysis.query_type, QueryType::Empty);
}
20449
/// Pins the `Default` implementation of `SearchMode` to the `Hybrid` variant.
#[test]
fn search_mode_default_is_hybrid_preferred() {
    let default_mode = SearchMode::default();
    assert_eq!(default_mode, SearchMode::Hybrid);
}
20454
/// A query made only of spaces, tabs and newlines must be classified as
/// `QueryType::Empty`.
#[test]
fn query_explanation_whitespace_only_query() {
    let no_filters = SearchFilters::default();
    let analysis = QueryExplanation::analyze(" \t\n ", &no_filters);
    assert_eq!(analysis.query_type, QueryType::Empty);
}
20460
/// A query mixing Japanese and Latin text must still produce parsed terms in
/// the explanation.
#[test]
fn query_explanation_unicode_query() {
    let no_filters = SearchFilters::default();
    let analysis = QueryExplanation::analyze("日本語 search", &no_filters);
    let has_terms = !analysis.parsed.terms.is_empty();
    assert!(has_terms);
}
20467
/// Checks that lowercasing in `QueryTermsLower` covers accented (non-ASCII)
/// capitals as well as ASCII ones.
#[test]
fn query_terms_lower_unicode_normalization() {
    let lowered = QueryTermsLower::from_query("CAFÉ RÉSUMÉ");
    assert_eq!(lowered.query_lower, "café résumé");
}
20476
/// Checks that the Latin parts of a query with embedded Japanese text are
/// lowercased in `query_lower`.
#[test]
fn query_terms_lower_mixed_case_unicode() {
    let lowered = QueryTermsLower::from_query("Hello日本語World");
    let text = &lowered.query_lower;
    assert!(text.contains("hello"));
    assert!(text.contains("world"));
}
20485
/// Checks that digits are kept as-is while the surrounding letters are
/// lowercased.
#[test]
fn query_terms_lower_preserves_numbers() {
    let lowered = QueryTermsLower::from_query("ABC123XYZ");
    assert_eq!(lowered.query_lower, "abc123xyz");
}
20491
/// A pattern with a single `*` between two characters must be classified as
/// `WildcardPattern::Complex`.
#[test]
fn wildcard_pattern_internal_asterisk() {
    let parsed = WildcardPattern::parse("f*o");
    let is_complex = matches!(parsed, WildcardPattern::Complex(_));
    assert!(is_complex, "Internal asterisk should be Complex");
}
20503
/// A pattern with several interior `*` characters must be classified as
/// `WildcardPattern::Complex`.
#[test]
fn wildcard_pattern_multiple_internal_asterisks() {
    let parsed = WildcardPattern::parse("a*b*c");
    let is_complex = matches!(parsed, WildcardPattern::Complex(_));
    assert!(is_complex, "Multiple internal asterisks should be Complex");
}
20513
/// When `*foo.bar*` converts to a regex, the literal dot must appear escaped.
///
/// NOTE(review): if `to_regex()` returns `None` for this pattern the test
/// passes without checking anything — confirm that is intended.
#[test]
fn wildcard_pattern_regex_escapes_special_chars() {
    let parsed = WildcardPattern::parse("*foo.bar*");
    let Some(rendered) = parsed.to_regex() else {
        return;
    };
    assert!(
        rendered.contains("\\."),
        "Dot should be escaped in regex: {}",
        rendered
    );
}
20526
/// When `f*o*o` converts to a regex, the result must contain `.*` for the
/// interior wildcards.
///
/// NOTE(review): if `to_regex()` returns `None` for this pattern the test
/// passes without checking anything — confirm that is intended.
#[test]
fn wildcard_pattern_complex_regex_generation() {
    let parsed = WildcardPattern::parse("f*o*o");
    let Some(rendered) = parsed.to_regex() else {
        return;
    };
    assert!(
        rendered.contains(".*"),
        "Should have .* for internal wildcards: {}",
        rendered
    );
}
20539
/// Pins the output of `transpile_to_fts5` for a range of query shapes:
/// implicit and explicit AND, OR grouping, leading operators, phrases,
/// wildcards and punctuated terms. `None` marks inputs the test expects to be
/// rejected.
#[test]
fn test_transpile_to_fts5() {
    // Adjacent bare terms are joined with AND.
    assert_eq!(transpile_to_fts5("foo bar").as_deref(), Some("foo AND bar"));

    // Explicit operators.
    assert_eq!(
        transpile_to_fts5("foo AND bar").as_deref(),
        Some("foo AND bar")
    );
    assert_eq!(
        transpile_to_fts5("foo OR bar").as_deref(),
        Some("(foo OR bar)")
    );
    assert_eq!(transpile_to_fts5("OR foo").as_deref(), Some("foo"));
    assert_eq!(transpile_to_fts5("NOT foo"), None);

    // Mixed AND / OR: the OR operands end up grouped in parentheses.
    assert_eq!(
        transpile_to_fts5("A AND B OR C").as_deref(),
        Some("A AND (B OR C)")
    );
    assert_eq!(
        transpile_to_fts5("A OR B AND C").as_deref(),
        Some("(A OR B) AND C")
    );
    assert_eq!(
        transpile_to_fts5("A OR B OR C").as_deref(),
        Some("(A OR B OR C)")
    );

    // Quoted phrases pass through unchanged.
    assert_eq!(
        transpile_to_fts5("\"foo bar\"").as_deref(),
        Some("\"foo bar\"")
    );

    // Only a trailing wildcard is accepted.
    assert_eq!(transpile_to_fts5("foo*").as_deref(), Some("foo*"));
    assert_eq!(transpile_to_fts5("*foo"), None);
    assert_eq!(transpile_to_fts5("f*o"), None);

    // Punctuated terms become a parenthesised AND of their parts.
    assert_eq!(
        transpile_to_fts5("foo-bar").as_deref(),
        Some("(foo AND bar)")
    );
    assert_eq!(
        transpile_to_fts5("foo-bar*").as_deref(),
        Some("(foo AND bar*)")
    );
    assert_eq!(
        transpile_to_fts5("br-123.jsonl").as_deref(),
        Some("(br AND 123 AND jsonl)")
    );
    assert_eq!(
        transpile_to_fts5("br-123.json*").as_deref(),
        Some("(br AND 123 AND json*)")
    );

    assert_eq!(transpile_to_fts5("NOT A OR B"), None);
}
20614
/// Builds a pipe-delimited semantic doc id by hand and checks that
/// `parse_semantic_doc_id` recovers each numeric field from it.
#[test]
fn semantic_doc_id_roundtrip_from_query() {
    let hash_hex = "00".repeat(32);
    let encoded = format!("m|42|2|3|7|11|1|1700000000000|{hash_hex}");

    let fields = parse_semantic_doc_id(&encoded).expect("roundtrip parse");

    assert_eq!(fields.message_id, 42);
    assert_eq!(fields.chunk_idx, 2);
    assert_eq!(fields.agent_id, 3);
    assert_eq!(fields.workspace_id, 7);
    assert_eq!(fields.source_id, 11);
    assert_eq!(fields.role, 1);
    assert_eq!(fields.created_at_ms, 1_700_000_000_000);
}
20628
/// Checks a fully constrained `SemanticFilter`: a doc id satisfying every
/// constraint matches, while a wrong agent, an out-of-range timestamp, or an
/// unparseable id does not.
#[test]
fn semantic_filter_applies_all_constraints() {
    // The trait must be in scope for `matches` to resolve.
    use frankensearch::core::filter::SearchFilter;

    let strict = SemanticFilter {
        agents: Some(HashSet::from([3])),
        workspaces: Some(HashSet::from([7])),
        sources: Some(HashSet::from([11])),
        roles: Some(HashSet::from([1])),
        created_from: Some(1_700_000_000_000),
        created_to: Some(1_700_000_000_100),
    };

    let fully_matching = "m|42|2|3|7|11|1|1700000000001";
    let wrong_agent = "m|42|2|99|7|11|1|1700000000001";
    let too_early = "m|42|2|3|7|11|1|1699999999999";
    let malformed = "not-a-doc-id";

    assert!(strict.matches(fully_matching, None));
    assert!(!strict.matches(wrong_agent, None));
    assert!(!strict.matches(too_early, None));
    assert!(!strict.matches(malformed, None));
}
20647
/// Writes a two-record vector index to a temp directory, reopens it, and runs
/// a top-k search restricted to agent 1. Only the record whose doc id carries
/// agent 1 (message 101) may come back.
#[test]
fn fs_semantic_index_runs_filtered_search() -> Result<()> {
    let scratch = TempDir::new()?;
    let fsvi_path = crate::search::vector_index::vector_index_path(scratch.path(), "embed-fast");
    if let Some(dir) = fsvi_path.parent() {
        std::fs::create_dir_all(dir)?;
    }

    // Two records that differ in message, agent, workspace and source ids.
    let hash_a = "00".repeat(32);
    let hash_b = "11".repeat(32);
    let doc_a = format!("m|101|0|1|10|100|1|1700000000001|{hash_a}");
    let doc_b = format!("m|202|0|2|20|200|1|1700000000002|{hash_b}");

    let mut builder = VectorIndex::create_with_revision(
        &fsvi_path,
        "embed-fast",
        "rev-1",
        2,
        frankensearch::index::Quantization::F16,
    )
    .map_err(|err| anyhow!("create fsvi index failed: {err}"))?;
    builder
        .write_record(&doc_a, &[1.0, 0.0])
        .map_err(|err| anyhow!("write_record failed: {err}"))?;
    builder
        .write_record(&doc_b, &[0.0, 1.0])
        .map_err(|err| anyhow!("write_record failed: {err}"))?;
    builder
        .finish()
        .map_err(|err| anyhow!("finish fsvi index failed: {err}"))?;

    let reopened =
        VectorIndex::open(&fsvi_path).map_err(|err| anyhow!("open fsvi failed: {err}"))?;

    // Constrain on agent only; every other dimension is left open.
    let agent_one_only = SemanticFilter {
        agents: Some(HashSet::from([1])),
        workspaces: None,
        sources: None,
        roles: None,
        created_from: None,
        created_to: None,
    };
    let bridged =
        semantic_filter_as_search_filter(&agent_one_only).expect("expected active filter");

    let hits = reopened
        .search_top_k(&[1.0, 0.0], 5, Some(bridged))
        .map_err(|err| anyhow!("frankensearch search failed: {err}"))?;
    assert_eq!(hits.len(), 1);

    let top = parse_semantic_doc_id(&hits[0].doc_id).expect("parse bridged doc_id");
    assert_eq!(top.message_id, 101);
    assert_eq!(top.agent_id, 1);
    Ok(())
}
20699
/// A hit whose `content` and `snippet` are both empty must not be classified
/// as noise, whether the query is non-empty or empty.
#[test]
fn hit_is_noise_returns_false_when_content_and_snippet_both_empty() {
    let projection_only = SearchHit {
        title: String::new(),
        snippet: String::new(),
        content: String::new(),
        content_hash: 0,
        conversation_id: Some(1),
        score: 1.0,
        source_path: "/tmp/session.jsonl".to_string(),
        agent: "codex".to_string(),
        workspace: String::new(),
        workspace_original: None,
        created_at: Some(1700000000000),
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".to_string(),
        origin_kind: "local".to_string(),
        origin_host: None,
    };

    let noisy_for_query = hit_is_noise(&projection_only, "anything");
    assert!(
        !noisy_for_query,
        "hit with empty content AND snippet (projection-only) must NOT be classified as noise"
    );

    let noisy_for_empty_query = hit_is_noise(&projection_only, "");
    assert!(
        !noisy_for_empty_query,
        "noise classifier must not treat an empty-query projection-only hit as noise"
    );
}
20744
/// A hit whose entire content is the bare acknowledgement `ok` must still be
/// classified as noise for an empty query.
#[test]
fn hit_is_noise_still_drops_tool_acknowledgement_when_content_present() {
    let bare_ack = SearchHit {
        title: String::new(),
        snippet: String::new(),
        content: "ok".to_string(),
        content_hash: 0,
        conversation_id: Some(1),
        score: 1.0,
        source_path: "/tmp/session.jsonl".to_string(),
        agent: "codex".to_string(),
        workspace: String::new(),
        workspace_original: None,
        created_at: Some(1700000000000),
        line_number: Some(1),
        match_type: MatchType::Exact,
        source_id: "local".to_string(),
        origin_kind: "local".to_string(),
        origin_host: None,
    };

    let classified_as_noise = hit_is_noise(&bare_ack, "");
    assert!(
        classified_as_noise,
        "bare tool-ack 'ok' with content present should still be dropped as noise"
    );
}
20775}