1use anyhow::{Context, Result, anyhow, bail};
2use crossbeam_channel as mpsc;
3use frankensearch::lexical::{
4 BooleanQuery, CASS_SCHEMA_HASH as FS_CASS_SCHEMA_HASH, CassFields as FsCassFields,
5 CassQueryFilters as FsCassQueryFilters, CassQueryToken as FsCassQueryToken,
6 CassSourceFilter as FsCassSourceFilter, CassWildcardPattern as FsCassWildcardPattern, Count,
7 IndexReader, IndexRecordOption, LexicalDocHit as FsLexicalDocHit,
8 LexicalSearchResult as FsLexicalSearchResult, Occur, Query, ReloadPolicy, Searcher,
9 SnippetConfig as FsSnippetConfig, TantivyDocument, Term, TermQuery, TopDocs, Value,
10 cass_build_tantivy_query as fs_cass_build_tantivy_query,
11 cass_has_boolean_operators as fs_cass_has_boolean_operators,
12 cass_open_search_reader as fs_cass_open_search_reader,
13 cass_parse_boolean_query as fs_cass_parse_boolean_query,
14 cass_sanitize_query as fs_cass_sanitize_query, load_doc as fs_load_doc,
15 render_snippet_html as fs_render_snippet_html,
16 try_build_snippet_generator as fs_try_build_snippet_generator,
17};
18use frankensearch::{
19 Cx as FsCx, InMemoryTwoTierIndex as FsInMemoryTwoTierIndex,
20 InMemoryVectorIndex as FsInMemoryVectorIndex, LexicalSearch as FsLexicalSearch,
21 QueryClass as FsQueryClass, RrfConfig as FsRrfConfig, ScoreSource as FsScoreSource,
22 ScoredResult as FsScoredResult, SearchError as FsSearchError, SearchFuture as FsSearchFuture,
23 SearchPhase as FsSearchPhase, SyncEmbedderAdapter as FsSyncEmbedderAdapter,
24 SyncTwoTierSearcher as FsSyncTwoTierSearcher, TwoTierConfig as FsTwoTierConfig,
25 TwoTierIndex as FsTwoTierIndex, TwoTierSearcher as FsTwoTierSearcher, VectorHit as FsVectorHit,
26 candidate_count as fs_candidate_count,
27 core::filter::SearchFilter as FsSearchFilter,
28 index::{
29 HNSW_DEFAULT_EF_SEARCH as FS_HNSW_DEFAULT_EF_SEARCH, HnswIndex as FsHnswIndex,
30 VectorIndex as FsVectorIndex,
31 },
32 rrf_fuse as fs_rrf_fuse,
33};
34use lru::LruCache;
35use once_cell::sync::Lazy;
36use parking_lot::RwLock;
37use std::cell::RefCell;
38use std::cmp::Ordering as CmpOrdering;
39use std::collections::{HashMap, HashSet, VecDeque};
40use std::hash::{Hash, Hasher};
41use std::num::NonZeroUsize;
42use std::path::{Path, PathBuf};
43use std::sync::atomic::{AtomicU64, Ordering};
44use std::sync::{Arc, Mutex};
45use std::time::{Duration, Instant};
46
47use frankensqlite::Connection;
48#[cfg(test)]
49use frankensqlite::compat::OptionalExtension;
50use frankensqlite::compat::{ConnectionExt, ParamValue, RowExt};
51#[cfg(test)]
52use frankensqlite::params;
53
/// Newtype wrapper around a [`Connection`].
///
/// The wrapper is marked `Send` by an `unsafe impl` in this file and derefs
/// to the inner connection.
struct SendConnection(Connection);
62
/// Exact content lookup key: `(conversation_id, line_idx)`, as destructured by
/// `hydrate_message_content_by_conversation`.
type TantivyContentExactKey = (i64, i64);
/// Fallback content lookup key.
/// NOTE(review): the meaning of the two strings and the integer is not visible
/// in this chunk — confirm against the code that builds these keys.
type TantivyContentFallbackKey = (String, String, i64);
/// Pair of hydrated-content maps: one keyed by the exact key, one by the
/// fallback key.
type TantivyHydratedContentMaps = (
    HashMap<TantivyContentExactKey, String>,
    HashMap<TantivyContentFallbackKey, String>,
);
/// Row shape for SQLite FTS hydration queries.
/// NOTE(review): the column order is defined by the SELECT that produces these
/// rows, which is not visible in this chunk — confirm before relying on it.
type SqliteFtsHydratedRow = (
    i64,
    String,
    String,
    String,
    String,
    String,
    Option<i64>,
    Option<i64>,
    Option<i64>,
    Option<String>,
    Option<String>,
    Option<String>,
);
83
/// Chunk size for bound parameters in SQLite FTS5 hydration queries.
/// NOTE(review): not used in this chunk; presumably kept below
/// `SQLITE_MAX_VARIABLE_NUMBER` for headroom — confirm at the call site.
const SQLITE_FTS5_HYDRATE_PARAM_CHUNK: usize = 30_000;
/// NOTE(review): presumably the maximum number of bound parameters per
/// statement supported by the SQLite engine in use — confirm against the
/// storage layer.
const SQLITE_MAX_VARIABLE_NUMBER: usize = 32_766;
/// Value, in KiB, substituted into `PRAGMA cache_size = -N` when opening the
/// search hydration connection.
const SEARCH_SQLITE_HYDRATION_CACHE_KIB: i64 = 4_096;
/// NOTE(review): not used in this chunk; presumably an over-fetch factor for
/// exact semantic chunk search — confirm at the call site.
const SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER: usize = 4;
91
// SAFETY: NOTE(review) — the invariant that makes it sound to move a
// `Connection` to another thread is not visible in this chunk. Confirm that
// every `SendConnection` is only ever accessed by one thread at a time.
unsafe impl Send for SendConnection {}
95
/// Lets a `SendConnection` be used wherever a `&Connection` is expected.
impl std::ops::Deref for SendConnection {
    type Target = Connection;
    /// Borrows the wrapped connection.
    fn deref(&self) -> &Connection {
        &self.0
    }
}
102
103fn open_search_hydration_sqlite(path: &Path, timeout: Duration) -> Result<Connection> {
104 let conn =
105 crate::storage::sqlite::open_franken_raw_readonly_connection_with_timeout(path, timeout)?;
106 conn.execute("PRAGMA query_only = 1;")
107 .with_context(|| "setting search hydration query_only")?;
108 conn.execute("PRAGMA busy_timeout = 5000;")
109 .with_context(|| "setting search hydration busy_timeout")?;
110 conn.execute(&format!(
111 "PRAGMA cache_size = -{SEARCH_SQLITE_HYDRATION_CACHE_KIB};"
112 ))
113 .with_context(|| "setting search hydration cache_size")?;
114 Ok(conn)
115}
116
117fn nfc_sanitize_query(raw: &str) -> String {
121 use unicode_normalization::UnicodeNormalization;
122 let nfc: String = raw.nfc().collect();
123 fs_cass_sanitize_query(&nfc)
124}
125
126fn franken_query_map_collect_retry<T, F>(
127 conn: &Connection,
128 sql: &str,
129 params: &[ParamValue],
130 map: F,
131) -> Result<Vec<T>, frankensqlite::FrankenError>
132where
133 F: Copy + Fn(&frankensqlite::Row) -> Result<T, frankensqlite::FrankenError>,
134{
135 let deadline = Instant::now() + Duration::from_secs(2);
136 let mut backoff = Duration::from_millis(4);
137 loop {
138 match conn.query_map_collect(sql, params, |row| map(row)) {
139 Ok(values) => return Ok(values),
140 Err(err) if crate::storage::sqlite::retryable_franken_error(&err) => {
141 let now = Instant::now();
142 if now >= deadline {
143 return Err(err);
144 }
145 let remaining = deadline.saturating_duration_since(now);
146 crate::storage::sqlite::sleep_with_franken_retry_backoff(
147 &mut backoff,
148 remaining,
149 Duration::from_millis(64),
150 );
151 }
152 Err(err) => return Err(err),
153 }
154 }
155}
156
157fn hydrate_message_content_by_conversation(
158 conn: &Connection,
159 requests: &[TantivyContentExactKey],
160) -> Result<HashMap<TantivyContentExactKey, String>> {
161 if requests.is_empty() {
162 return Ok(HashMap::new());
163 }
164
165 let mut wanted_by_conversation: HashMap<i64, HashSet<i64>> = HashMap::new();
166 for &(conversation_id, line_idx) in requests {
167 wanted_by_conversation
168 .entry(conversation_id)
169 .or_default()
170 .insert(line_idx);
171 }
172
173 let mut conversation_ids = wanted_by_conversation.keys().copied().collect::<Vec<_>>();
174 conversation_ids.sort_unstable();
175 let mut hydrated = HashMap::with_capacity(requests.len());
176
177 for conversation_id in conversation_ids {
178 let Some(wanted_indices) = wanted_by_conversation.get(&conversation_id) else {
179 continue;
180 };
181 let mut wanted_indices = wanted_indices.iter().copied().collect::<Vec<_>>();
182 wanted_indices.sort_unstable();
183 let placeholders = sql_placeholders(wanted_indices.len());
184 let sql = format!(
185 "SELECT m.conversation_id, m.idx, m.content
186 FROM messages m INDEXED BY sqlite_autoindex_messages_1
187 WHERE m.conversation_id = ? AND m.idx IN ({placeholders})
188 ORDER BY m.idx"
189 );
190 let mut params = Vec::with_capacity(wanted_indices.len() + 1);
191 params.push(ParamValue::from(conversation_id));
192 params.extend(wanted_indices.iter().copied().map(ParamValue::from));
193 let rows: Vec<(i64, i64, String)> =
194 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
195 Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?))
196 })?;
197 for (conversation_id, line_idx, content) in rows {
198 hydrated.insert((conversation_id, line_idx), content);
199 }
200 }
201
202 Ok(hydrated)
203}
204
/// Converts a database `message_id` into the unsigned id used by the
/// semantic index.
///
/// # Errors
/// Returns an I/O error with the message `negative message_id` when the value
/// does not fit in a `u64` (i.e. it is negative).
fn semantic_message_id_from_db(message_id: i64) -> std::io::Result<u64> {
    match u64::try_from(message_id) {
        Ok(unsigned) => Ok(unsigned),
        Err(_) => Err(std::io::Error::other("negative message_id")),
    }
}
208
/// Maps an optional database integer onto a `u32` component id.
///
/// `None` and negative values become `0`; values above `u32::MAX` saturate
/// to `u32::MAX`.
fn semantic_doc_component_id_from_db(raw: Option<i64>) -> u32 {
    match raw {
        None => 0,
        Some(value) if value <= 0 => 0,
        Some(value) => match u32::try_from(value) {
            Ok(id) => id,
            Err(_) => u32::MAX,
        },
    }
}
213
214use crate::search::canonicalize::{canonicalize_for_embedding, content_hash, is_search_noise_text};
215use crate::search::embedder::Embedder;
216use crate::search::vector_index::{
217 ROLE_USER, SemanticDocId, SemanticFilter, SemanticFilterMaps, VectorIndex, VectorSearchResult,
218 parse_semantic_doc_id, role_code_from_str,
219};
220use crate::sources::provenance::SourceFilter;
221
222pub struct StringInterner {
233 cache: RwLock<LruCache<Arc<str>, Arc<str>>>,
234}
235
236impl StringInterner {
237 pub fn new(capacity: usize) -> Self {
239 Self {
240 cache: RwLock::new(LruCache::new(
241 NonZeroUsize::new(capacity).expect("capacity must be > 0"),
242 )),
243 }
244 }
245
246 pub fn intern(&self, s: &str) -> Arc<str> {
252 {
254 let cache = self.cache.read();
255 if let Some(arc) = cache.peek(s) {
258 return Arc::clone(arc);
259 }
260 }
261
262 let mut cache = self.cache.write();
264
265 if let Some(arc) = cache.get(s) {
268 return Arc::clone(arc);
269 }
270
271 let arc: Arc<str> = Arc::from(s);
273 cache.put(Arc::clone(&arc), Arc::clone(&arc));
274 arc
275 }
276
277 #[allow(dead_code)]
279 pub fn len(&self) -> usize {
280 self.cache.read().len()
281 }
282
283 #[allow(dead_code)]
285 pub fn is_empty(&self) -> bool {
286 self.cache.read().is_empty()
287 }
288}
289
/// Lazily-initialized interner shared by `intern_cache_key`, created with a
/// capacity of 10,000 entries.
static CACHE_KEY_INTERNER: Lazy<StringInterner> = Lazy::new(|| StringInterner::new(10_000));
293
/// Interns `s` through the shared `CACHE_KEY_INTERNER` and returns the
/// resulting `Arc<str>`.
#[inline]
fn intern_cache_key(s: &str) -> Arc<str> {
    CACHE_KEY_INTERNER.intern(s)
}
299
/// Builds a comma-separated list of `count` SQL `?` placeholders.
///
/// `0` yields an empty string, `1` yields `"?"`, `3` yields `"?,?,?"`.
#[inline]
pub fn sql_placeholders(count: usize) -> String {
    match count {
        0 => String::new(),
        n => {
            // n question marks plus n - 1 commas.
            let mut list = String::with_capacity(n.saturating_mul(2).saturating_sub(1));
            list.push('?');
            for _ in 1..n {
                list.push_str(",?");
            }
            list
        }
    }
}
331
/// Filters that narrow a search.
///
/// An empty set or `None` means "no restriction" for that dimension, as
/// reflected by how `QueryExplanation` inspects these fields.
#[derive(Debug, Clone, Default, PartialEq, Eq, serde::Serialize)]
pub struct SearchFilters {
    /// Agent identifiers to restrict to.
    pub agents: HashSet<String>,
    /// Workspaces to restrict to.
    pub workspaces: HashSet<String>,
    /// Lower bound of the creation-time range.
    /// NOTE(review): units and inclusivity are not visible here — confirm.
    pub created_from: Option<i64>,
    /// Upper bound of the creation-time range.
    /// NOTE(review): units and inclusivity are not visible here — confirm.
    pub created_to: Option<i64>,
    /// Source/provenance filter; omitted from serialized output when it
    /// matches all sources.
    #[serde(skip_serializing_if = "SourceFilter::is_all")]
    pub source_filter: SourceFilter,
    /// Session paths to restrict to; omitted from serialized output when empty.
    #[serde(skip_serializing_if = "HashSet::is_empty")]
    pub session_paths: HashSet<String>,
}
345
/// Which retrieval strategy a search uses. Serialized in `snake_case` and
/// usable as a CLI value enum; the default is `Hybrid`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize, clap::ValueEnum)]
#[serde(rename_all = "snake_case")]
pub enum SearchMode {
    /// Lexical search only.
    Lexical,
    /// Semantic search only.
    Semantic,
    /// Lexical and semantic search combined (default).
    #[default]
    Hybrid,
}
357
358impl SearchMode {
359 pub fn next(self) -> Self {
360 match self {
361 SearchMode::Lexical => SearchMode::Semantic,
362 SearchMode::Semantic => SearchMode::Hybrid,
363 SearchMode::Hybrid => SearchMode::Lexical,
364 }
365 }
366}
367
/// Tiering behavior for semantic search. Serialized in `snake_case`; the
/// default is `Single`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SemanticTierMode {
    /// The only mode for which `wants_two_tier` returns `false` (default).
    #[default]
    Single,
    /// Two-tier mode that uses the base two-tier configuration unchanged.
    Progressive,
    /// Two-tier mode with `fast_only` enabled in the configuration.
    FastOnly,
    /// Two-tier mode with `fast_only` disabled and `quality_weight` set to 1.0.
    QualityOnly,
}
383
384impl SemanticTierMode {
385 const fn wants_two_tier(self) -> bool {
386 !matches!(self, Self::Single)
387 }
388
389 fn to_frankensearch_config(self) -> FsTwoTierConfig {
390 let mut config = frankensearch_two_tier_config();
391 match self {
392 Self::Single | Self::Progressive => {}
393 Self::FastOnly => {
394 config.fast_only = true;
395 }
396 Self::QualityOnly => {
397 config.fast_only = false;
398 config.quality_weight = 1.0;
399 }
400 }
401 config
402 }
403}
404
/// NOTE(review): not used in this chunk; presumably the capacity of the
/// progressive-search embedding cache — confirm at the call site.
const PROGRESSIVE_EMBEDDING_CACHE_CAPACITY: usize = 64;
/// NOTE(review): not used in this chunk; presumably an over-fetch factor for
/// ANN candidates — confirm at the call site.
const ANN_CANDIDATE_MULTIPLIER: usize = 4;
/// Minimum planning window used by `hybrid_candidate_budget` when no limit
/// was requested.
const HYBRID_NO_LIMIT_PLANNING_WINDOW: usize = 64;
/// Baseline ceiling on semantic candidates for no-limit hybrid searches (the
/// effective ceiling is raised when `offset + planning window` is larger).
const HYBRID_NO_LIMIT_SEMANTIC_CAP: usize = 2048;
/// NOTE(review): not used in this chunk; presumably the longest token (in
/// characters) eligible for automatic wildcard fallback — confirm.
const AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS: usize = 16;

/// Lower clamp bound for the no-limit result cap.
pub const NO_LIMIT_RESULT_MIN: usize = 1_000;
/// Upper clamp bound for the no-limit result cap.
pub const NO_LIMIT_RESULT_MAX: usize = 1_000_000;

/// Assumed bytes per hit (80 KiB); divides the byte budget to obtain a hit
/// count.
const AVG_HIT_BYTES: u64 = 80 * 1024;

/// Upper bound (16 GiB) on the byte budget derived from available memory.
const NO_LIMIT_BYTES_CEILING: u64 = 16 * 1024 * 1024 * 1024;

/// Lower bound (256 MiB) on the byte budget derived from available memory;
/// also the fallback budget when neither an override nor a memory reading
/// is available.
const NO_LIMIT_BYTES_FLOOR: u64 = 256 * 1024 * 1024;

/// Available memory is divided by this value to obtain the byte budget.
const NO_LIMIT_RAM_DIVISOR: u64 = 16;
456
/// Reads `MemAvailable` from `/proc/meminfo` and returns it in bytes.
///
/// Returns `None` when the file cannot be read, the `MemAvailable:` line is
/// missing, or its first field does not parse as an unsigned integer. The
/// kilobyte figure is multiplied by 1024 with saturation.
fn available_memory_bytes() -> Option<u64> {
    let meminfo = std::fs::read_to_string("/proc/meminfo").ok()?;
    let value_part = meminfo
        .lines()
        .find_map(|line| line.strip_prefix("MemAvailable:"))?;
    let kilobytes = value_part.split_whitespace().next()?.parse::<u64>().ok()?;
    Some(kilobytes.saturating_mul(1024))
}
467
468fn no_limit_result_cap() -> usize {
469 static CAP: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
470 *CAP.get_or_init(|| {
471 compute_no_limit_result_cap_from(
472 std::env::var("CASS_SEARCH_NO_LIMIT_CAP").ok(),
473 std::env::var("CASS_SEARCH_NO_LIMIT_BYTES").ok(),
474 available_memory_bytes(),
475 )
476 })
477}
478
479fn compute_no_limit_result_cap_from(
486 cap_env: Option<String>,
487 bytes_env: Option<String>,
488 available_bytes: Option<u64>,
489) -> usize {
490 if let Some(hits) = cap_env
494 .and_then(|v| v.parse::<usize>().ok())
495 .filter(|v| *v > 0)
496 {
497 return hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
498 }
499
500 let budget_bytes = no_limit_budget_bytes(bytes_env, available_bytes);
501 let hits = (budget_bytes / AVG_HIT_BYTES) as usize;
502 hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX)
503}
504
505fn no_limit_budget_bytes(bytes_env: Option<String>, available_bytes: Option<u64>) -> u64 {
506 bytes_env
507 .and_then(|v| v.parse::<u64>().ok())
508 .filter(|v| *v > 0)
509 .or_else(|| no_limit_available_memory_budget(available_bytes))
510 .unwrap_or(NO_LIMIT_BYTES_FLOOR)
511}
512
513fn no_limit_available_memory_budget(available_bytes: Option<u64>) -> Option<u64> {
514 available_bytes.map(|avail| {
515 (avail / NO_LIMIT_RAM_DIVISOR).clamp(NO_LIMIT_BYTES_FLOOR, NO_LIMIT_BYTES_CEILING)
516 })
517}
518
/// Base two-tier configuration, built once on first use from
/// `FsTwoTierConfig::optimized()` followed by `with_env_overrides()`.
static FRANKENSEARCH_TWO_TIER_CONFIG: Lazy<FsTwoTierConfig> =
    Lazy::new(|| FsTwoTierConfig::optimized().with_env_overrides());
521
/// Returns a fresh clone of the shared base two-tier configuration, so
/// callers can modify their copy freely.
fn frankensearch_two_tier_config() -> FsTwoTierConfig {
    FRANKENSEARCH_TWO_TIER_CONFIG.clone()
}
525
/// Fetch size for a progressive search phase: three times `limit`
/// (saturating), treating a `limit` of zero as one.
#[inline]
const fn progressive_phase_fetch_limit(limit: usize) -> usize {
    if limit == 0 {
        3
    } else {
        limit.saturating_mul(3)
    }
}
531
/// Candidate counts chosen by `hybrid_candidate_budget` for the two stages
/// of a hybrid search.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct HybridCandidateBudget {
    /// Number of candidates budgeted for the lexical stage.
    lexical_candidates: usize,
    /// Number of candidates budgeted for the semantic stage.
    semantic_candidates: usize,
}
537
/// Returns the `(lexical, semantic)` candidate multipliers for a query class.
///
/// Identifier queries weight the lexical stage more heavily, natural-language
/// queries weight the semantic stage more heavily, short keywords are
/// balanced, and empty queries get zero for both.
#[inline]
const fn hybrid_stage_multipliers(query_class: FsQueryClass) -> (usize, usize) {
    match query_class {
        FsQueryClass::Identifier => (6, 2),
        FsQueryClass::ShortKeyword => (4, 4),
        FsQueryClass::NaturalLanguage => (2, 8),
        FsQueryClass::Empty => (0, 0),
    }
}
551
552#[inline]
553fn hybrid_candidate_budget(
554 query: &str,
555 requested_limit: usize,
556 effective_limit: usize,
557 offset: usize,
558 total_docs: usize,
559) -> HybridCandidateBudget {
560 let query_class = FsQueryClass::classify(query);
561 let (lex_mult, sem_mult) = hybrid_stage_multipliers(query_class);
562 let total_docs = total_docs.max(1);
563
564 if requested_limit == 0 {
567 let planning_window = HYBRID_NO_LIMIT_PLANNING_WINDOW.max(offset.saturating_add(1));
568 let lexical = effective_limit.min(total_docs).min(no_limit_result_cap());
573 let semantic = fs_candidate_count(planning_window, 0, sem_mult)
581 .max(planning_window)
582 .min(HYBRID_NO_LIMIT_SEMANTIC_CAP.max(offset.saturating_add(planning_window)))
583 .min(total_docs)
584 .min(lexical);
585 return HybridCandidateBudget {
586 lexical_candidates: lexical,
587 semantic_candidates: semantic,
588 };
589 }
590
591 let lexical = fs_candidate_count(requested_limit, offset, lex_mult.max(1))
592 .max(requested_limit.saturating_add(offset))
593 .min(total_docs);
594 let semantic = fs_candidate_count(requested_limit, offset, sem_mult.max(1))
595 .max(requested_limit.saturating_add(offset))
596 .min(total_docs);
597
598 HybridCandidateBudget {
599 lexical_candidates: lexical,
600 semantic_candidates: semantic,
601 }
602}
603
/// High-level classification of a query, as produced by
/// `QueryExplanation::classify_query`. Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryType {
    /// Plain terms with no operators, phrases, wildcards or filters.
    Simple,
    /// Contains at least one quoted phrase.
    Phrase,
    /// Contains at least one explicit boolean operator.
    Boolean,
    /// Contains at least one wildcard term.
    Wildcard,
    /// At least one filter (agent, workspace, time range or source) is active.
    Filtered,
    /// The sanitized query is empty.
    Empty,
}
625
/// Index access strategy reported by `QueryExplanation::determine_strategy`.
/// Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum IndexStrategy {
    /// Reported for a single exact or prefix term with nothing else in the
    /// query.
    EdgeNgram,
    /// Reported when any term has a leading wildcard (suffix or substring
    /// pattern).
    RegexScan,
    /// Reported for multiple terms, compound terms, phrases or explicit
    /// operators.
    BooleanCombination,
    /// NOTE(review): never produced by the code visible in this chunk —
    /// confirm where it is used.
    RangeScan,
    /// Reported when the sanitized query is empty.
    FullScan,
}
641
/// Coarse cost estimate produced by `QueryExplanation::estimate_cost`.
/// Serialized in `snake_case`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum QueryCost {
    /// Complexity score of 2 or less, with no time filter.
    Low,
    /// Complexity score between 3 and 6, with no time filter.
    Medium,
    /// Regex or full scans, a time filter, or a complexity score above 6.
    High,
}
653
/// One whitespace-separated piece of a sanitized query term.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedSubTerm {
    /// The sub-term text after sanitization (wildcard markers included).
    pub text: String,
    /// Pattern label: `exact`, `prefix (*)`, `suffix (*)`, `substring (*)`
    /// or `complex (*)`.
    pub pattern: String,
}
660
/// A term token from the parsed query together with its sanitized sub-terms.
#[derive(Debug, Clone, serde::Serialize)]
pub struct ParsedTerm {
    /// The term text as produced by the boolean-query parser.
    pub text: String,
    /// Whether a `NOT` operator immediately preceded this term.
    pub negated: bool,
    /// The term split on whitespace after sanitization; never empty.
    pub subterms: Vec<ParsedSubTerm>,
}
671
/// Structured view of a query, filled in by `QueryExplanation::analyze`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct ParsedQuery {
    /// Term tokens in query order.
    pub terms: Vec<ParsedTerm>,
    /// Quoted phrases, sanitized, lowercased and with `*` trimmed from each word.
    pub phrases: Vec<String>,
    /// Explicit operators in query order: `"AND"`, `"OR"` or `"NOT"`.
    pub operators: Vec<String>,
    /// True when there is more than one term and no explicit operator.
    pub implicit_and: bool,
}
684
/// Explanation of how a query was interpreted: its parsed structure,
/// classification, estimated cost, filter summary and warnings.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QueryExplanation {
    /// The query exactly as passed to `analyze`.
    pub original_query: String,
    /// The query after NFC normalization and sanitization.
    pub sanitized_query: String,
    /// Parsed terms, phrases and operators.
    pub parsed: ParsedQuery,
    /// High-level classification of the query.
    pub query_type: QueryType,
    /// Index access strategy reported for the query.
    pub index_strategy: IndexStrategy,
    /// Whether wildcard fallback was applied; `analyze` sets this to `false`
    /// and `with_wildcard_fallback` updates it.
    pub wildcard_applied: bool,
    /// Coarse cost estimate.
    pub estimated_cost: QueryCost,
    /// Summary of the active filters.
    pub filters_summary: FiltersSummary,
    /// Human-readable warnings about the query.
    pub warnings: Vec<String>,
}
707
/// Compact summary of a `SearchFilters` value, built by
/// `QueryExplanation::summarize_filters`.
#[derive(Debug, Clone, Default, serde::Serialize)]
pub struct FiltersSummary {
    /// Number of agents in the agent filter.
    pub agent_count: usize,
    /// Number of workspaces in the workspace filter.
    pub workspace_count: usize,
    /// True when `created_from` or `created_to` is set.
    pub has_time_filter: bool,
    /// `"Filtering by: ..."` text, or `None` when none of the summarized
    /// filters is active.
    pub description: Option<String>,
}
720
721impl QueryExplanation {
722 pub fn analyze(query: &str, filters: &SearchFilters) -> Self {
724 let sanitized = nfc_sanitize_query(query);
725 let tokens = fs_cass_parse_boolean_query(query);
727
728 let mut parsed = ParsedQuery::default();
730 let mut has_explicit_operator = false;
731 let mut next_negated = false;
732
733 for token in &tokens {
734 match token {
735 FsCassQueryToken::Term(t) => {
736 let parts: Vec<String> = nfc_sanitize_query(t)
737 .split_whitespace()
738 .map(|s| s.to_string())
739 .collect();
740 if parts.is_empty() {
741 next_negated = false;
742 continue;
743 }
744 let mut subterms = Vec::new();
745 for part in parts {
746 let pattern = FsCassWildcardPattern::parse(&part);
747 let pattern_str = match &pattern {
748 FsCassWildcardPattern::Exact(_) => "exact",
749 FsCassWildcardPattern::Prefix(_) => "prefix (*)",
750 FsCassWildcardPattern::Suffix(_) => "suffix (*)",
751 FsCassWildcardPattern::Substring(_) => "substring (*)",
752 FsCassWildcardPattern::Complex(_) => "complex (*)",
753 };
754 subterms.push(ParsedSubTerm {
755 text: part,
756 pattern: pattern_str.to_string(),
757 });
758 }
759 parsed.terms.push(ParsedTerm {
760 text: t.clone(),
761 negated: next_negated,
762 subterms,
763 });
764 next_negated = false;
765 }
766 FsCassQueryToken::Phrase(p) => {
767 let parts: Vec<String> = nfc_sanitize_query(p)
768 .split_whitespace()
769 .map(|s| s.trim_matches('*').to_lowercase())
770 .filter(|s| !s.is_empty())
771 .collect();
772 if !parts.is_empty() {
773 parsed.phrases.push(parts.join(" "));
774 }
775 next_negated = false;
776 }
777 FsCassQueryToken::And => {
778 parsed.operators.push("AND".to_string());
779 has_explicit_operator = true;
780 }
781 FsCassQueryToken::Or => {
782 parsed.operators.push("OR".to_string());
783 has_explicit_operator = true;
784 }
785 FsCassQueryToken::Not => {
786 parsed.operators.push("NOT".to_string());
787 has_explicit_operator = true;
788 next_negated = true;
789 }
790 }
791 }
792
793 parsed.implicit_and = !has_explicit_operator && parsed.terms.len() > 1;
795
796 let query_type = Self::classify_query(&parsed, filters, &sanitized);
798
799 let index_strategy = Self::determine_strategy(&parsed, &sanitized);
801
802 let estimated_cost = Self::estimate_cost(&parsed, &index_strategy, filters);
804
805 let filters_summary = Self::summarize_filters(filters);
807
808 let warnings = Self::generate_warnings(&parsed, &sanitized, filters);
810
811 Self {
812 original_query: query.to_string(),
813 sanitized_query: sanitized,
814 parsed,
815 query_type,
816 index_strategy,
817 wildcard_applied: false, estimated_cost,
819 filters_summary,
820 warnings,
821 }
822 }
823
824 fn classify_query(parsed: &ParsedQuery, filters: &SearchFilters, sanitized: &str) -> QueryType {
825 if sanitized.trim().is_empty() {
826 return QueryType::Empty;
827 }
828
829 let has_filters = !filters.agents.is_empty()
831 || !filters.workspaces.is_empty()
832 || filters.created_from.is_some()
833 || filters.created_to.is_some()
834 || !filters.source_filter.is_all();
835
836 if has_filters {
837 return QueryType::Filtered;
838 }
839
840 if !parsed.operators.is_empty() {
842 return QueryType::Boolean;
843 }
844
845 if !parsed.phrases.is_empty() {
847 return QueryType::Phrase;
848 }
849
850 let has_wildcards = parsed
852 .terms
853 .iter()
854 .flat_map(|t| &t.subterms)
855 .any(|t| t.pattern != "exact");
856 if has_wildcards {
857 return QueryType::Wildcard;
858 }
859
860 QueryType::Simple
861 }
862
863 fn determine_strategy(parsed: &ParsedQuery, sanitized: &str) -> IndexStrategy {
864 if sanitized.trim().is_empty() {
865 return IndexStrategy::FullScan;
866 }
867
868 let has_leading_wildcard = parsed
870 .terms
871 .iter()
872 .flat_map(|t| &t.subterms)
873 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
874
875 if has_leading_wildcard {
876 return IndexStrategy::RegexScan;
877 }
878
879 let has_compound_terms = parsed.terms.iter().any(|t| t.subterms.len() > 1);
882
883 if !parsed.operators.is_empty()
884 || parsed.terms.len() > 1
885 || !parsed.phrases.is_empty()
886 || has_compound_terms
887 {
888 return IndexStrategy::BooleanCombination;
889 }
890
891 IndexStrategy::EdgeNgram
893 }
894
895 fn estimate_cost(
896 parsed: &ParsedQuery,
897 strategy: &IndexStrategy,
898 filters: &SearchFilters,
899 ) -> QueryCost {
900 if matches!(strategy, IndexStrategy::RegexScan) {
902 return QueryCost::High;
903 }
904
905 if matches!(strategy, IndexStrategy::FullScan) {
907 return QueryCost::High;
908 }
909
910 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
912
913 let term_count: usize = parsed.terms.iter().map(|t| t.subterms.len()).sum();
915 let operator_count = parsed.operators.len();
916 let phrase_count = parsed.phrases.len();
917
918 let complexity = term_count + operator_count * 2 + phrase_count * 2;
919
920 if complexity > 6 || has_time_filter {
921 QueryCost::High
922 } else if complexity > 2 {
923 QueryCost::Medium
924 } else {
925 QueryCost::Low
926 }
927 }
928
929 fn summarize_filters(filters: &SearchFilters) -> FiltersSummary {
930 let agent_count = filters.agents.len();
931 let workspace_count = filters.workspaces.len();
932 let has_time_filter = filters.created_from.is_some() || filters.created_to.is_some();
933
934 let mut parts = Vec::new();
935 if agent_count > 0 {
936 parts.push(format!(
937 "{} agent{}",
938 agent_count,
939 if agent_count > 1 { "s" } else { "" }
940 ));
941 }
942 if workspace_count > 0 {
943 parts.push(format!(
944 "{} workspace{}",
945 workspace_count,
946 if workspace_count > 1 { "s" } else { "" }
947 ));
948 }
949 if has_time_filter {
950 parts.push("time range".to_string());
951 }
952
953 let description = if parts.is_empty() {
954 None
955 } else {
956 Some(format!("Filtering by: {}", parts.join(", ")))
957 };
958
959 FiltersSummary {
960 agent_count,
961 workspace_count,
962 has_time_filter,
963 description,
964 }
965 }
966
967 fn generate_warnings(
968 parsed: &ParsedQuery,
969 sanitized: &str,
970 filters: &SearchFilters,
971 ) -> Vec<String> {
972 let mut warnings = Vec::new();
973
974 let has_leading_wildcard = parsed
976 .terms
977 .iter()
978 .flat_map(|t| &t.subterms)
979 .any(|t| t.pattern == "suffix (*)" || t.pattern == "substring (*)");
980 if has_leading_wildcard {
981 warnings.push(
982 "Leading wildcards (*foo) require regex scan and may be slow on large indexes"
983 .to_string(),
984 );
985 }
986
987 for term in &parsed.terms {
989 for sub in &term.subterms {
990 if sub.text.trim_matches('*').len() < 2 {
991 warnings.push(format!(
992 "Very short term '{}' may match many documents",
993 sub.text
994 ));
995 }
996 }
997 }
998
999 if sanitized.trim().is_empty() {
1001 warnings.push("Empty query will return all documents (expensive)".to_string());
1002 }
1003
1004 if parsed.operators.len() > 3 {
1006 warnings.push("Complex boolean query may have unexpected precedence".to_string());
1007 }
1008
1009 if let Some(agent) = filters.agents.iter().next()
1011 && filters.agents.len() == 1
1012 && filters.workspaces.is_empty()
1013 {
1014 warnings.push(format!(
1015 "Searching only in agent '{}' - results from other agents will be excluded",
1016 agent
1017 ));
1018 }
1019
1020 warnings
1021 }
1022
1023 pub fn with_wildcard_fallback(mut self, applied: bool) -> Self {
1025 self.wildcard_applied = applied;
1026 if applied
1027 && !self
1028 .warnings
1029 .iter()
1030 .any(|w| w.contains("wildcard fallback"))
1031 {
1032 self.warnings.push(
1033 "Wildcard fallback was applied automatically due to sparse exact matches"
1034 .to_string(),
1035 );
1036 }
1037 self
1038 }
1039}
1040
/// How a hit matched the query. Serialized in `snake_case`; the default is
/// `Exact`.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum MatchType {
    /// Exact term match (default).
    #[default]
    Exact,
    /// Prefix wildcard match.
    Prefix,
    /// Suffix wildcard match.
    Suffix,
    /// Substring wildcard match.
    Substring,
    /// Other explicit wildcard match.
    Wildcard,
    /// NOTE(review): presumably a match found via automatically applied
    /// wildcard fallback — confirm where this variant is assigned.
    ImplicitWildcard,
}
1060
1061impl MatchType {
1062 pub fn quality_factor(self) -> f32 {
1064 match self {
1065 MatchType::Exact => 1.0,
1066 MatchType::Prefix => 0.9,
1067 MatchType::Suffix => 0.8,
1068 MatchType::Substring => 0.7,
1069 MatchType::Wildcard => 0.65,
1070 MatchType::ImplicitWildcard => 0.6,
1071 }
1072 }
1073}
1074
/// Category of a `QuerySuggestion`. Serialized in `snake_case`.
#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize)]
#[serde(rename_all = "snake_case")]
pub enum SuggestionKind {
    /// Proposes a corrected spelling of the query.
    SpellingFix,
    /// Proposes wrapping the query in wildcards.
    WildcardQuery,
    /// Proposes dropping an active filter.
    RemoveFilter,
    /// Proposes searching within a different agent.
    AlternateAgent,
    /// NOTE(review): no constructor for this kind is visible in this chunk —
    /// presumably proposes widening the time range; confirm.
    BroaderDateRange,
}
1090
/// A suggested follow-up action for a search.
#[derive(Debug, Clone, serde::Serialize)]
pub struct QuerySuggestion {
    /// What kind of suggestion this is.
    pub kind: SuggestionKind,
    /// Human-readable message describing the suggestion.
    pub message: String,
    /// Replacement query text, when the suggestion changes the query.
    pub suggested_query: Option<String>,
    /// Replacement filters, when the suggestion changes the filters.
    pub suggested_filters: Option<SearchFilters>,
    /// Optional shortcut key, set through `with_shortcut`.
    /// NOTE(review): how the key is interpreted is not visible here — confirm.
    pub shortcut: Option<u8>,
}
1105
1106impl QuerySuggestion {
1107 fn spelling(_query: &str, corrected: &str) -> Self {
1108 Self {
1109 kind: SuggestionKind::SpellingFix,
1110 message: format!("Did you mean: \"{corrected}\"?"),
1111 suggested_query: Some(corrected.to_string()),
1112 suggested_filters: None,
1113 shortcut: None,
1114 }
1115 }
1116
1117 fn wildcard(query: &str) -> Self {
1118 let wildcard_query = format!("*{}*", query.trim_matches('*'));
1119 Self {
1120 kind: SuggestionKind::WildcardQuery,
1121 message: format!("Try broader search: \"{wildcard_query}\""),
1122 suggested_query: Some(wildcard_query),
1123 suggested_filters: None,
1124 shortcut: None,
1125 }
1126 }
1127
1128 fn remove_agent_filter(current_agent: &str, current_filters: &SearchFilters) -> Self {
1129 let mut filters = current_filters.clone();
1132 filters.agents.clear();
1133 Self {
1134 kind: SuggestionKind::RemoveFilter,
1135 message: format!("Remove agent filter (currently: {current_agent})"),
1136 suggested_query: None,
1137 suggested_filters: Some(filters),
1138 shortcut: None,
1139 }
1140 }
1141
1142 fn try_agent(agent_slug: &str) -> Self {
1143 let mut filters = SearchFilters::default();
1144 filters.agents.insert(agent_slug.to_string());
1145 Self {
1146 kind: SuggestionKind::AlternateAgent,
1147 message: format!("Try searching in: {agent_slug}"),
1148 suggested_query: None,
1149 suggested_filters: Some(filters),
1150 shortcut: None,
1151 }
1152 }
1153
1154 fn with_shortcut(mut self, key: u8) -> Self {
1155 self.shortcut = Some(key);
1156 self
1157 }
1158}
1159
/// Bit set describing which hit fields a caller wants populated, plus an
/// optional character limit for previewed content.
#[derive(Debug, Clone, Copy)]
pub struct FieldMask {
    flags: u8,
    preview_content_chars: Option<usize>,
}

impl FieldMask {
    const CONTENT: u8 = 1 << 0;
    const SNIPPET: u8 = 1 << 1;
    const TITLE: u8 = 1 << 2;
    const CACHE: u8 = 1 << 3;

    /// Every field requested, caching allowed, no preview limit.
    pub const FULL: Self = Self {
        flags: Self::CONTENT | Self::SNIPPET | Self::TITLE | Self::CACHE,
        preview_content_chars: None,
    };

    /// Builds a mask from individual switches, with no preview limit.
    pub fn new(
        wants_content: bool,
        wants_snippet: bool,
        wants_title: bool,
        allows_cache: bool,
    ) -> Self {
        let bit_if = |enabled: bool, bit: u8| -> u8 {
            if enabled {
                bit
            } else {
                0
            }
        };
        let flags = bit_if(wants_content, Self::CONTENT)
            | bit_if(wants_snippet, Self::SNIPPET)
            | bit_if(wants_title, Self::TITLE)
            | bit_if(allows_cache, Self::CACHE);
        Self {
            flags,
            preview_content_chars: None,
        }
    }

    /// Sets (or clears) the preview content limit. Setting a limit also
    /// turns caching off; passing `None` leaves the cache flag as it was.
    pub fn with_preview_content_limit(self, max_chars: Option<usize>) -> Self {
        let flags = match max_chars {
            Some(_) => self.flags & !Self::CACHE,
            None => self.flags,
        };
        Self {
            flags,
            preview_content_chars: max_chars,
        }
    }

    /// Whether the given flag bit is set.
    fn has_flag(self, bit: u8) -> bool {
        (self.flags & bit) != 0
    }

    /// Whether hit content is requested.
    pub fn needs_content(self) -> bool {
        self.has_flag(Self::CONTENT)
    }

    /// Whether a snippet is requested.
    pub fn wants_snippet(self) -> bool {
        self.has_flag(Self::SNIPPET)
    }

    /// Whether the title is requested.
    pub fn wants_title(self) -> bool {
        self.has_flag(Self::TITLE)
    }

    /// Whether results may be cached.
    pub fn allows_cache(self) -> bool {
        self.has_flag(Self::CACHE)
    }

    /// The preview content limit in characters, if one was set.
    pub fn preview_content_limit(self) -> Option<usize> {
        self.preview_content_chars
    }
}
1230
/// A single search result.
///
/// `content_hash` and `conversation_id` are never serialized; the optional
/// `workspace_original` and `origin_host` fields are omitted from serialized
/// output when `None`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchHit {
    /// Title associated with the hit.
    pub title: String,
    /// Snippet text for the hit.
    pub snippet: String,
    /// Content text for the hit.
    pub content: String,
    /// Hash of the content (not serialized).
    #[serde(skip_serializing)]
    pub content_hash: u64,
    /// Database conversation id, when known (not serialized).
    #[serde(skip_serializing)]
    pub conversation_id: Option<i64>,
    /// Relevance score.
    pub score: f32,
    /// Path of the source the hit came from.
    pub source_path: String,
    /// Agent the hit belongs to.
    pub agent: String,
    /// Workspace the hit belongs to.
    pub workspace: String,
    /// NOTE(review): presumably the workspace path before any rewriting —
    /// confirm against the code that populates it.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub workspace_original: Option<String>,
    /// Creation timestamp, when known.
    /// NOTE(review): units are not visible in this chunk — confirm.
    pub created_at: Option<i64>,
    /// Line number of the hit within its source, when known.
    pub line_number: Option<usize>,
    /// How the hit matched the query.
    #[serde(default)]
    pub match_type: MatchType,
    /// Identifier of the source the hit originated from.
    #[serde(default = "default_source_id")]
    pub source_id: String,
    /// Kind of origin for the hit.
    #[serde(default = "default_source_id")]
    pub origin_kind: String,
    /// Host the hit originated from, when applicable.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub origin_host: Option<String>,
}
1264
/// Whether caller-supplied field masks are honored, read once from the
/// `CASS_LAZY_FIELDS` variable via `dotenvy`.
///
/// Enabled when the variable is unset or unreadable; disabled only when its
/// value is `0` or `false` (ASCII case-insensitive).
static LAZY_FIELDS_ENABLED: Lazy<bool> = Lazy::new(|| {
    dotenvy::var("CASS_LAZY_FIELDS")
        .ok()
        .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
        .unwrap_or(true)
});
1271
/// Serde default for `SearchHit::source_id` and `SearchHit::origin_kind`:
/// the string `local`.
fn default_source_id() -> String {
    String::from("local")
}
1275
1276fn effective_field_mask(field_mask: FieldMask) -> FieldMask {
1277 if *LAZY_FIELDS_ENABLED {
1278 field_mask
1279 } else {
1280 FieldMask::FULL
1281 }
1282}
1283
1284fn execute_query_with_lazy_exact_count(
1285 searcher: &Searcher,
1286 query: &dyn Query,
1287 limit: usize,
1288 offset: usize,
1289) -> Result<FsLexicalSearchResult> {
1290 let top_docs = searcher.search(
1291 query,
1292 &TopDocs::with_limit(limit)
1293 .and_offset(offset)
1294 .order_by_score(),
1295 )?;
1296 let page_saturated = top_docs.len() == limit;
1297 let total_count = if page_saturated {
1298 searcher.search(query, &Count)?
1299 } else {
1300 offset.saturating_add(top_docs.len())
1301 };
1302 let hits = top_docs
1303 .into_iter()
1304 .enumerate()
1305 .map(|(rank, (bm25_score, doc_address))| FsLexicalDocHit {
1306 bm25_score,
1307 rank,
1308 doc_address,
1309 })
1310 .collect();
1311
1312 Ok(FsLexicalSearchResult { hits, total_count })
1313}
1314
/// Outcome of a search: the hits plus metadata about how they were produced.
#[derive(Debug, Clone)]
pub struct SearchResult {
    /// The hits returned for this search.
    pub hits: Vec<SearchHit>,
    /// Whether wildcard fallback was applied.
    pub wildcard_fallback: bool,
    /// Cache statistics for this search.
    pub cache_stats: CacheStats,
    /// Follow-up suggestions for the user.
    pub suggestions: Vec<QuerySuggestion>,
    /// ANN search statistics, when available.
    pub ann_stats: Option<crate::search::ann_index::AnnSearchStats>,
    /// Total number of matches, when known.
    pub total_count: Option<usize>,
}
1335
/// Which phase of a progressive search a result belongs to.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ProgressivePhaseKind {
    /// The first set of results.
    Initial,
    /// A refined set of results.
    Refined,
}
1341
/// Event emitted during a progressive search.
// `Phase` carries a whole `SearchResult` and is much larger than
// `RefinementFailed`; the size lint is silenced rather than boxing.
#[allow(clippy::large_enum_variant)]
#[derive(Debug, Clone)]
pub enum ProgressiveSearchEvent {
    /// A phase finished and produced results.
    Phase {
        /// Which phase produced the result.
        kind: ProgressivePhaseKind,
        /// The results for this phase.
        result: SearchResult,
        /// Elapsed time in milliseconds.
        /// NOTE(review): the reference start point is not visible here — confirm.
        elapsed_ms: u128,
    },
    /// Refinement failed.
    RefinementFailed {
        /// Latency in milliseconds when the failure was reported.
        latency_ms: u128,
        /// Error description.
        error: String,
    },
}
1357
/// Parameters for a progressive search.
#[derive(Debug, Clone)]
pub(crate) struct ProgressiveSearchRequest<'a> {
    /// Borrowed frankensearch context.
    pub(crate) cx: &'a FsCx,
    /// The query text.
    pub(crate) query: &'a str,
    /// Filters to apply.
    pub(crate) filters: SearchFilters,
    /// Maximum number of hits requested.
    pub(crate) limit: usize,
    /// NOTE(review): presumably the hit count below which results are
    /// considered sparse — confirm at the call site.
    pub(crate) sparse_threshold: usize,
    /// Which hit fields to populate.
    pub(crate) field_mask: FieldMask,
    /// Search mode to use.
    pub(crate) mode: SearchMode,
}
1368
/// Hashable identity key built from a subset of `SearchHit` fields.
/// NOTE(review): presumably used to de-duplicate hits — confirm at the use
/// site, which is not visible in this chunk.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct SearchHitKey {
    source_id: String,
    source_path: String,
    conversation_id: Option<i64>,
    title: String,
    line_number: Option<usize>,
    created_at: Option<i64>,
    content_hash: u64,
}
1379
1380fn normalized_search_source_id_sql_expr(
1381 source_id_column: &str,
1382 origin_kind_column: &str,
1383 origin_host_column: &str,
1384) -> String {
1385 format!(
1386 "CASE \
1387 WHEN TRIM(COALESCE({source_id_column}, '')) != '' THEN \
1388 CASE \
1389 WHEN LOWER(TRIM(COALESCE({source_id_column}, ''))) = '{local}' THEN '{local}' \
1390 ELSE TRIM(COALESCE({source_id_column}, '')) \
1391 END \
1392 WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) IN ('ssh', 'remote') THEN \
1393 CASE \
1394 WHEN TRIM(COALESCE({origin_host_column}, '')) = '' THEN 'remote' \
1395 ELSE TRIM(COALESCE({origin_host_column}, '')) \
1396 END \
1397 WHEN LOWER(TRIM(COALESCE({origin_kind_column}, ''))) = '{local}' THEN '{local}' \
1398 WHEN TRIM(COALESCE({origin_host_column}, '')) != '' THEN TRIM(COALESCE({origin_host_column}, '')) \
1399 ELSE '{local}' \
1400 END",
1401 local = crate::sources::provenance::LOCAL_SOURCE_ID,
1402 )
1403}
1404
1405fn normalize_search_source_filter_value(source_id: &str) -> String {
1406 let trimmed = source_id.trim();
1407 if trimmed.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1408 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1409 } else {
1410 trimmed.to_string()
1411 }
1412}
1413
1414fn normalized_search_hit_source_id_parts(
1415 source_id: &str,
1416 origin_kind: &str,
1417 origin_host: Option<&str>,
1418) -> String {
1419 let trimmed_source_id = source_id.trim();
1420 if !trimmed_source_id.is_empty() {
1421 if trimmed_source_id.eq_ignore_ascii_case(crate::sources::provenance::LOCAL_SOURCE_ID) {
1422 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1423 }
1424 return trimmed_source_id.to_string();
1425 }
1426
1427 let trimmed_origin_host = origin_host.map(str::trim).filter(|value| !value.is_empty());
1428 let trimmed_origin_kind = origin_kind.trim();
1429 if trimmed_origin_kind.eq_ignore_ascii_case("ssh")
1430 || trimmed_origin_kind.eq_ignore_ascii_case("remote")
1431 {
1432 return trimmed_origin_host.unwrap_or("remote").to_string();
1433 }
1434 if let Some(origin_host) = trimmed_origin_host {
1435 return origin_host.to_string();
1436 }
1437
1438 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1439}
1440
1441fn normalized_search_hit_origin_kind(source_id: &str, origin_kind: Option<&str>) -> String {
1442 if let Some(kind) = origin_kind.map(str::trim).filter(|value| !value.is_empty()) {
1443 if kind.eq_ignore_ascii_case("local") {
1444 return crate::sources::provenance::LOCAL_SOURCE_ID.to_string();
1445 }
1446 if kind.eq_ignore_ascii_case("ssh") || kind.eq_ignore_ascii_case("remote") {
1447 return "remote".to_string();
1448 }
1449 return kind.to_ascii_lowercase();
1450 }
1451
1452 if source_id == crate::sources::provenance::LOCAL_SOURCE_ID {
1453 crate::sources::provenance::LOCAL_SOURCE_ID.to_string()
1454 } else {
1455 "remote".to_string()
1456 }
1457}
1458
1459fn normalized_search_hit_source_id(hit: &SearchHit) -> String {
1460 normalized_search_hit_source_id_parts(
1461 hit.source_id.as_str(),
1462 hit.origin_kind.as_str(),
1463 hit.origin_host.as_deref(),
1464 )
1465}
1466
1467impl SearchHitKey {
1468 fn from_hit(hit: &SearchHit) -> Self {
1469 Self {
1470 source_id: normalized_search_hit_source_id(hit),
1471 source_path: hit.source_path.clone(),
1472 conversation_id: hit.conversation_id,
1473 title: if hit.conversation_id.is_some() {
1474 String::new()
1475 } else {
1476 hit.title.trim().to_string()
1477 },
1478 line_number: hit.line_number,
1479 created_at: hit.created_at,
1480 content_hash: hit.content_hash,
1481 }
1482 }
1483}
1484
1485impl Ord for SearchHitKey {
1486 fn cmp(&self, other: &Self) -> CmpOrdering {
1487 self.source_id
1488 .cmp(&other.source_id)
1489 .then_with(|| self.source_path.cmp(&other.source_path))
1490 .then_with(|| self.conversation_id.cmp(&other.conversation_id))
1491 .then_with(|| self.title.cmp(&other.title))
1492 .then_with(|| self.line_number.cmp(&other.line_number))
1493 .then_with(|| self.created_at.cmp(&other.created_at))
1494 .then_with(|| self.content_hash.cmp(&other.content_hash))
1495 }
1496}
1497
impl PartialOrd for SearchHitKey {
    /// Delegates to `Ord::cmp`, so the partial order always agrees with the total order.
    fn partial_cmp(&self, other: &Self) -> Option<CmpOrdering> {
        Some(self.cmp(other))
    }
}
1503
/// Smoothing constant `k` in the federated reciprocal-rank score
/// `1 / (k + rank + 1)` computed by `federated_rrf_score`.
const FEDERATED_RRF_K: f32 = 60.0;
1505
/// A hit from one shard of a federated search, annotated for cross-shard merging.
#[derive(Debug)]
struct FederatedRankedHit {
    /// The hit itself; its `score` is overwritten with `fused_score` on merge.
    hit: SearchHit,
    /// Index of the shard that produced the hit (last tie-breaker when merging).
    shard_index: usize,
    /// Rank of the hit within its shard (lower sorts first on score ties).
    shard_rank: usize,
    /// Score used for the merged ordering (higher sorts first).
    fused_score: f32,
}
1513
1514fn federated_rrf_score(shard_rank: usize) -> f32 {
1515 1.0 / (FEDERATED_RRF_K + shard_rank as f32 + 1.0)
1516}
1517
1518fn merge_federated_ranked_hits(mut ranked_hits: Vec<FederatedRankedHit>) -> Vec<SearchHit> {
1519 ranked_hits.sort_by(|a, b| {
1520 b.fused_score
1521 .total_cmp(&a.fused_score)
1522 .then_with(|| a.shard_rank.cmp(&b.shard_rank))
1523 .then_with(|| SearchHitKey::from_hit(&a.hit).cmp(&SearchHitKey::from_hit(&b.hit)))
1524 .then_with(|| a.shard_index.cmp(&b.shard_index))
1525 });
1526 ranked_hits
1527 .into_iter()
1528 .map(|mut ranked| {
1529 ranked.hit.score = ranked.fused_score;
1530 ranked.hit
1531 })
1532 .collect()
1533}
1534
/// Test-only score record for a hit fused from lexical and semantic rankings.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Default, Clone)]
struct HybridScore {
    /// Fused score; the primary (descending) sort key in `cmp_fused_hit_desc`.
    rrf: f32,
    /// Rank in the lexical list, if the hit appeared there.
    lexical_rank: Option<usize>,
    /// Rank in the semantic list, if the hit appeared there.
    semantic_rank: Option<usize>,
    /// Lexical score, if any.
    lexical_score: Option<f32>,
    /// Semantic score, if any.
    semantic_score: Option<f32>,
}
1545
/// Test-only pairing of a hit with its identity key and hybrid score.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone)]
struct FusedHit {
    /// Identity key; the final tie-breaker in `cmp_fused_hit_desc`.
    key: SearchHitKey,
    /// Fused score details.
    score: HybridScore,
    /// The underlying hit.
    hit: SearchHit,
}
1554
1555pub(crate) fn stable_content_hash(content: &str) -> u64 {
1565 use xxhash_rust::xxh3::Xxh3;
1566 let mut hasher = Xxh3::new();
1567 let mut first = true;
1568 for token in content.split_whitespace() {
1569 if !first {
1570 hasher.update(b" ");
1571 }
1572 hasher.update(token.as_bytes());
1573 first = false;
1574 }
1575 hasher.digest()
1576}
1577
1578fn stable_hit_hash(
1579 content: &str,
1580 source_path: &str,
1581 line_number: Option<usize>,
1582 created_at: Option<i64>,
1583) -> u64 {
1584 use xxhash_rust::xxh3::Xxh3;
1585 let mut hasher = Xxh3::new();
1586 if !content.is_empty() {
1589 hasher.update(&stable_content_hash(content).to_le_bytes());
1590 }
1591 hasher.update(b"|");
1592 hasher.update(source_path.as_bytes());
1593 hasher.update(b"|");
1594 if let Some(line) = line_number {
1595 let mut buf = itoa::Buffer::new();
1596 hasher.update(buf.format(line).as_bytes());
1597 }
1598 hasher.update(b"|");
1599 if let Some(ts) = created_at {
1600 let mut buf = itoa::Buffer::new();
1601 hasher.update(buf.format(ts).as_bytes());
1602 }
1603 hasher.digest()
1604}
1605
1606fn search_hit_key_doc_id(key: &SearchHitKey) -> String {
1607 use std::fmt::Write as _;
1615 const SEP: char = '\u{1f}';
1616 let capacity = key.source_id.len()
1618 + key.source_path.len()
1619 + key.title.len()
1620 + 6 + 3 * 20 + 20; let mut out = String::with_capacity(capacity);
1624 out.push_str(&key.source_id);
1625 out.push(SEP);
1626 out.push_str(&key.source_path);
1627 out.push(SEP);
1628 if let Some(v) = key.conversation_id {
1629 let _ = write!(out, "{v}");
1630 }
1631 out.push(SEP);
1632 out.push_str(&key.title);
1633 out.push(SEP);
1634 if let Some(v) = key.line_number {
1635 let _ = write!(out, "{v}");
1636 }
1637 out.push(SEP);
1638 if let Some(v) = key.created_at {
1639 let _ = write!(out, "{v}");
1640 }
1641 out.push(SEP);
1642 let _ = write!(out, "{}", key.content_hash);
1643 out
1644}
1645
1646fn search_hit_doc_id(hit: &SearchHit) -> String {
1647 search_hit_key_doc_id(&SearchHitKey::from_hit(hit))
1648}
1649
/// Test-only comparator that sorts fused hits best-first.
///
/// Order: fused score descending; then hits present in both the lexical and
/// semantic rankings before hits present in only one; then ascending key.
#[cfg(test)]
fn cmp_fused_hit_desc(a: &FusedHit, b: &FusedHit) -> CmpOrdering {
    let in_both = |candidate: &FusedHit| {
        candidate.score.lexical_rank.is_some() && candidate.score.semantic_rank.is_some()
    };

    let by_score = b.score.rrf.total_cmp(&a.score.rrf);
    if by_score.is_ne() {
        return by_score;
    }
    // `true > false`, so comparing b-to-a puts dual-source hits first.
    let by_coverage = in_both(b).cmp(&in_both(a));
    if by_coverage.is_ne() {
        return by_coverage;
    }
    a.key.cmp(&b.key)
}
1667
/// Test-only cutoff used by `top_k_fused`: inputs with fewer elements than this
/// are fully sorted rather than partitioned with `select_nth_unstable_by`.
#[cfg(test)]
#[allow(dead_code)]
const QUICKSELECT_THRESHOLD: usize = 64;
1672
/// Test-only helper returning the best `k` fused hits, sorted best-first by
/// `cmp_fused_hit_desc`.
///
/// Returns an empty vector when there are no hits or `k == 0`. When `k` is
/// smaller than the input and the input has at least `QUICKSELECT_THRESHOLD`
/// elements, the top `k` are first partitioned out so that only they are sorted.
#[cfg(test)]
#[allow(dead_code)]
fn top_k_fused(mut hits: Vec<FusedHit>, k: usize) -> Vec<FusedHit> {
    if hits.is_empty() || k == 0 {
        return Vec::new();
    }

    let total = hits.len();
    let partition_first = k < total && total >= QUICKSELECT_THRESHOLD;
    if partition_first {
        // After selection the first `k` slots hold the top `k` in arbitrary order.
        hits.select_nth_unstable_by(k - 1, cmp_fused_hit_desc);
        hits.truncate(k);
    }

    hits.sort_by(cmp_fused_hit_desc);
    // No-op when already truncated or when `k >= total`.
    hits.truncate(k);
    hits
}
1713
/// Fuses lexical and semantic hit lists with reciprocal-rank fusion, removes
/// noise and duplicates, and returns one page of the merged ranking.
///
/// * `lexical` / `semantic` – candidate hits from each retriever, in rank order.
/// * `query` – passed to `hit_is_noise` to drop noise hits.
/// * `limit` / `offset` – page selection applied *after* fusion and deduplication.
///
/// Returns an empty vector when `limit == 0` or there are no candidates. Each
/// returned hit's `score` is its fused score (summed across merged duplicates).
pub fn rrf_fuse_hits(
    lexical: &[SearchHit],
    semantic: &[SearchHit],
    query: &str,
    limit: usize,
    offset: usize,
) -> Vec<SearchHit> {
    if limit == 0 {
        return Vec::new();
    }
    let total_candidates = lexical.len().saturating_add(semantic.len());
    if total_candidates == 0 {
        return Vec::new();
    }

    let mut lexical_scored = Vec::with_capacity(lexical.len());
    let mut semantic_scored = Vec::with_capacity(semantic.len());
    // Maps fusion doc ids back to a representative hit.
    let mut hit_by_doc_id: HashMap<String, SearchHit> = HashMap::with_capacity(total_candidates);

    for hit in lexical {
        let doc_id = search_hit_doc_id(hit);
        // A later lexical hit with the same doc id replaces the earlier one.
        hit_by_doc_id.insert(doc_id.clone(), hit.clone());
        lexical_scored.push(FsScoredResult {
            doc_id,
            score: hit.score,
            source: FsScoreSource::Lexical,
            index: None,
            fast_score: None,
            quality_score: None,
            lexical_score: Some(hit.score),
            rerank_score: None,
            explanation: None,
            metadata: None,
        });
    }

    for (idx, hit) in semantic.iter().enumerate() {
        let doc_id = search_hit_doc_id(hit);
        // Keep the lexical copy of a hit when both retrievers returned it.
        hit_by_doc_id
            .entry(doc_id.clone())
            .or_insert_with(|| hit.clone());
        semantic_scored.push(FsVectorHit {
            index: u32::try_from(idx).unwrap_or(u32::MAX),
            score: hit.score,
            doc_id,
        });
    }

    // Fuse every candidate with no offset; paging happens after deduplication below.
    let fused = fs_rrf_fuse(
        &lexical_scored,
        &semantic_scored,
        total_candidates,
        0,
        &FsRrfConfig::default(),
    );

    /// Bookkeeping for the conversation-agnostic (fallback) dedup key.
    #[derive(Clone, Copy)]
    struct CompatSlot {
        /// Index into `unique_hits` of the hit this slot merges into.
        index: usize,
        /// Conversation id seen for this slot so far, if any.
        conversation_id: Option<i64>,
        /// Set once two different conversation ids were seen; disables fallback merging.
        ambiguous: bool,
    }

    // Intern strings to small integer ids so dedup keys are cheap to hash.
    let mut source_ids: HashMap<String, u32> = HashMap::new();
    let mut path_ids: HashMap<String, u32> = HashMap::new();
    let mut title_ids: HashMap<String, u32> = HashMap::new();
    let mut next_source_id: u32 = 0;
    let mut next_path_id: u32 = 0;
    let mut next_title_id: u32 = 0;
    // (source, path, conversation id, title-if-no-conversation, line, created_at, content hash)
    type CompatExactKey = (
        u32,
        u32,
        Option<i64>,
        Option<u32>,
        Option<usize>,
        Option<i64>,
        u64,
    );
    // (source, path, title, line, created_at, content hash) — ignores the conversation id.
    type CompatFallbackKey = (u32, u32, u32, Option<usize>, Option<i64>, u64);

    let mut exact_seen: HashMap<CompatExactKey, usize> = HashMap::with_capacity(fused.len());
    let mut fallback_seen: HashMap<CompatFallbackKey, CompatSlot> =
        HashMap::with_capacity(fused.len());
    let mut unique_hits: Vec<SearchHit> = Vec::with_capacity(fused.len());

    // Records a newly seen conversation id on a slot, or marks the slot ambiguous
    // when it conflicts with the one already recorded.
    let update_slot = |slot: &mut CompatSlot, conversation_id: Option<i64>| {
        if slot.ambiguous {
            return;
        }
        match (slot.conversation_id, conversation_id) {
            (Some(existing), Some(current)) if existing != current => slot.ambiguous = true,
            (None, Some(current)) => slot.conversation_id = Some(current),
            _ => {}
        }
    };

    for fused_hit in fused {
        let mut hit = match hit_by_doc_id.remove(&fused_hit.doc_id) {
            Some(hit) => hit,
            None => continue,
        };
        if hit_is_noise(&hit, query) {
            continue;
        }

        let normalized_source_id = normalized_search_hit_source_id(&hit);
        let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
            *id
        } else {
            let id = next_source_id;
            next_source_id = next_source_id.saturating_add(1);
            source_ids.insert(normalized_source_id, id);
            id
        };
        let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
            *id
        } else {
            let id = next_path_id;
            next_path_id = next_path_id.saturating_add(1);
            path_ids.insert(hit.source_path.clone(), id);
            id
        };
        let normalized_title = hit.title.trim();
        let fallback_title_key = if let Some(id) = title_ids.get(normalized_title) {
            *id
        } else {
            let id = next_title_id;
            next_title_id = next_title_id.saturating_add(1);
            title_ids.insert(normalized_title.to_string(), id);
            id
        };
        // Same rule as `SearchHitKey::from_hit`: the title only matters for exact
        // identity when there is no conversation id.
        let exact_title_key = if hit.conversation_id.is_some() {
            None
        } else {
            Some(fallback_title_key)
        };
        let exact_key = (
            source_key,
            path_key,
            hit.conversation_id,
            exact_title_key,
            hit.line_number,
            hit.created_at,
            hit.content_hash,
        );
        let fallback_key = (
            source_key,
            path_key,
            fallback_title_key,
            hit.line_number,
            hit.created_at,
            hit.content_hash,
        );

        // Prefer an exact-key match; otherwise merge through the fallback key
        // unless the slot is ambiguous or names a different conversation.
        let merged_idx = exact_seen.get(&exact_key).copied().or_else(|| {
            fallback_seen.get(&fallback_key).and_then(|slot| {
                if slot.ambiguous {
                    return None;
                }
                match (slot.conversation_id, hit.conversation_id) {
                    (Some(existing), Some(current)) if existing != current => None,
                    _ => Some(slot.index),
                }
            })
        });

        if let Some(existing_idx) = merged_idx {
            // Duplicate: fold its score into the hit already kept.
            exact_seen.insert(exact_key, existing_idx);
            let slot = fallback_seen.entry(fallback_key).or_insert(CompatSlot {
                index: existing_idx,
                conversation_id: hit.conversation_id,
                ambiguous: false,
            });
            update_slot(slot, hit.conversation_id);
            // Backfill the conversation id if the kept hit lacked one.
            if unique_hits[existing_idx].conversation_id.is_none() && hit.conversation_id.is_some()
            {
                unique_hits[existing_idx].conversation_id = hit.conversation_id;
            }
            unique_hits[existing_idx].score += fused_hit.rrf_score as f32;
            continue;
        }

        // First occurrence: keep the hit with its fused score.
        hit.score = fused_hit.rrf_score as f32;
        let index = unique_hits.len();
        unique_hits.push(hit);
        exact_seen.insert(exact_key, index);
        match fallback_seen.get_mut(&fallback_key) {
            Some(slot) => update_slot(slot, unique_hits[index].conversation_id),
            None => {
                fallback_seen.insert(
                    fallback_key,
                    CompatSlot {
                        index,
                        conversation_id: unique_hits[index].conversation_id,
                        ambiguous: false,
                    },
                );
            }
        }
    }

    // Merged scores may have changed the order; re-sort best-first with a
    // deterministic key tie-break.
    unique_hits.sort_by(|a, b| {
        b.score
            .total_cmp(&a.score)
            .then_with(|| SearchHitKey::from_hit(a).cmp(&SearchHitKey::from_hit(b)))
    });

    let start = offset.min(unique_hits.len());
    unique_hits.into_iter().skip(start).take(limit).collect()
}
1933
/// LRU cache of query embeddings, valid for a single embedder at a time.
struct QueryCache {
    /// Id of the embedder whose embeddings are currently cached.
    embedder_id: String,
    /// Cached embeddings keyed by the canonical query string.
    embeddings: LruCache<String, Vec<f32>>,
}
1938
1939impl QueryCache {
1940 fn new(embedder_id: &str, capacity: NonZeroUsize) -> Self {
1941 Self {
1942 embedder_id: embedder_id.to_string(),
1943 embeddings: LruCache::new(capacity),
1944 }
1945 }
1946
1947 fn align_embedder(&mut self, embedder: &dyn Embedder) {
1948 if self.embedder_id != embedder.id() {
1949 self.embedder_id = embedder.id().to_string();
1950 self.embeddings.clear();
1951 }
1952 }
1953
1954 fn get_cached(&mut self, embedder: &dyn Embedder, canonical: &str) -> Option<Vec<f32>> {
1955 self.align_embedder(embedder);
1956 self.embeddings.get(canonical).cloned()
1957 }
1958
1959 fn store(&mut self, embedder: &dyn Embedder, canonical: &str, embedding: Vec<f32>) {
1960 self.align_embedder(embedder);
1961 self.embeddings.put(canonical.to_string(), embedding);
1962 }
1963}
1964
1965fn semantic_filter_as_search_filter(filter: &SemanticFilter) -> Option<&dyn FsSearchFilter> {
1968 let unrestricted = filter.agents.is_none()
1969 && filter.workspaces.is_none()
1970 && filter.sources.is_none()
1971 && filter.roles.is_none()
1972 && filter.created_from.is_none()
1973 && filter.created_to.is_none();
1974 if unrestricted { None } else { Some(filter) }
1975}
1976
1977fn open_fs_semantic_ann_index(fs_index: &FsVectorIndex, ann_path: &Path) -> Result<FsHnswIndex> {
1978 if !ann_path.is_file() {
1979 bail!(
1980 "approximate search unavailable: HNSW index not found at {}",
1981 ann_path.display()
1982 );
1983 }
1984
1985 let ann = FsHnswIndex::load(ann_path, fs_index)
1986 .map_err(|err| anyhow!("open HNSW index failed: {err}"))?;
1987 let matches = ann
1988 .matches_vector_index(fs_index)
1989 .map_err(|err| anyhow!("validate HNSW index failed: {err}"))?;
1990 if !matches {
1991 bail!(
1992 "approximate search unavailable: HNSW index at {} is stale for current semantic index (run 'cass index --semantic --build-hnsw')",
1993 ann_path.display()
1994 );
1995 }
1996
1997 Ok(ann)
1998}
1999
/// Semantic-search resources held by a `SearchClient`.
struct SemanticSearchState {
    /// Shared token for this state (NOTE(review): presumably compared by pointer
    /// to detect that the semantic context was replaced — confirm).
    context_token: Arc<()>,
    /// Embedder used to embed queries.
    embedder: Arc<dyn Embedder>,
    /// Primary vector index.
    fs_semantic_index: Arc<FsVectorIndex>,
    /// All vector indexes (relationship to the primary is not visible here — TODO confirm).
    fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
    /// HNSW index, when one is held.
    fs_ann_index: Option<Arc<FsHnswIndex>>,
    /// Path of the HNSW index file, when known.
    ann_path: Option<PathBuf>,
    /// In-memory two-tier index, when one is held.
    fs_in_memory_two_tier_index: Option<Arc<FsInMemoryTwoTierIndex>>,
    /// Remembers which tier modes were found unavailable.
    in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable,
    /// Progressive two-tier search context, when one is held.
    progressive_context: Option<Arc<ProgressiveTwoTierContext>>,
    /// Flag recording that the progressive context is unavailable.
    progressive_context_unavailable: bool,
    /// Lookup maps used when applying semantic filters.
    filter_maps: SemanticFilterMaps,
    /// Role codes to restrict to, if any.
    roles: Option<HashSet<u8>>,
    /// Cache of query embeddings.
    query_cache: QueryCache,
}
2015
/// Records which semantic tier modes are known to have no usable in-memory
/// two-tier index.
#[derive(Debug, Clone, Copy, Default)]
struct InMemoryTwoTierUnavailable {
    /// Set when `SemanticTierMode::FastOnly` was marked unavailable.
    fast_only: bool,
    /// Set when `Progressive` or `QualityOnly` was marked unavailable.
    quality: bool,
}
2021
2022impl InMemoryTwoTierUnavailable {
2023 fn is_known_unavailable(self, tier_mode: SemanticTierMode) -> bool {
2024 match tier_mode {
2025 SemanticTierMode::Single => false,
2026 SemanticTierMode::FastOnly => self.fast_only,
2027 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => self.quality,
2028 }
2029 }
2030
2031 fn mark_unavailable(&mut self, tier_mode: SemanticTierMode) {
2032 match tier_mode {
2033 SemanticTierMode::Single => {}
2034 SemanticTierMode::FastOnly => {
2035 self.fast_only = true;
2036 }
2037 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly => {
2038 self.quality = true;
2039 }
2040 }
2041 }
2042}
2043
/// Resources for running a progressive two-tier search.
struct ProgressiveTwoTierContext {
    /// Token for the context this was built from (NOTE(review): presumably
    /// matched against `SemanticSearchState::context_token` — confirm).
    context_token: Arc<()>,
    /// Two-tier index searched by the progressive searcher.
    index: Arc<FsTwoTierIndex>,
    /// Embedder for the fast tier.
    fast_embedder: Arc<dyn frankensearch::Embedder>,
    /// Embedder for the quality tier, when one is configured.
    quality_embedder: Option<Arc<dyn frankensearch::Embedder>>,
}
2050
/// Cloneable subset of `SemanticSearchState` holding the same-named index,
/// filter-map and role fields.
#[derive(Clone)]
struct SemanticCandidateContext {
    /// Primary vector index.
    fs_semantic_index: Arc<FsVectorIndex>,
    /// All vector indexes.
    fs_semantic_indexes: Arc<Vec<Arc<FsVectorIndex>>>,
    /// Lookup maps used when applying semantic filters.
    filter_maps: SemanticFilterMaps,
    /// Role codes to restrict to, if any.
    roles: Option<HashSet<u8>>,
}
2058
/// Parameters for one semantic candidate retrieval.
struct SemanticCandidateSearchRequest<'a> {
    /// Number of candidates to fetch.
    fetch_limit: usize,
    /// Whether approximate search was requested.
    approximate: bool,
    /// Tier mode to search with.
    tier_mode: SemanticTierMode,
    /// In-memory two-tier index to use, if available.
    in_memory_two_tier_index: Option<&'a Arc<FsInMemoryTwoTierIndex>>,
    /// HNSW index to use, if available.
    ann_index: Option<&'a Arc<FsHnswIndex>>,
}
2066
/// Flags describing whether a semantic candidate fetch may need to be retried.
/// NOTE(review): exact retry policy lives in the consumer, which is not visible here.
#[derive(Debug, Clone, Copy, Default)]
struct SemanticCandidateRetryState {
    /// Presumably set when more candidates exist beyond the fetched window — confirm.
    has_more_candidates: bool,
    /// Presumably set when an exact-search window could have excluded a
    /// competing candidate — confirm.
    exact_window_may_omit_competitor: bool,
}
2072
/// A query embedding paired with a context token.
struct SemanticQueryEmbedding {
    /// Token of the semantic context the vector was produced under
    /// (NOTE(review): presumably used to detect a context swap — confirm).
    context_token: Arc<()>,
    /// The embedding vector.
    vector: Vec<f32>,
}
2077
/// Embedder wrapper that adds an LRU cache in front of single-text embedding.
struct SharedCassSyncEmbedder {
    /// Wrapped embedder that does the actual work.
    inner: Arc<dyn Embedder>,
    /// Cache from input text to embedding, guarded by a mutex.
    cache: Mutex<LruCache<String, Vec<f32>>>,
}
2082
2083impl SharedCassSyncEmbedder {
2084 fn new(inner: Arc<dyn Embedder>) -> Self {
2085 let cache_capacity =
2086 NonZeroUsize::new(PROGRESSIVE_EMBEDDING_CACHE_CAPACITY).expect("cache capacity > 0");
2087 Self {
2088 inner,
2089 cache: Mutex::new(LruCache::new(cache_capacity)),
2090 }
2091 }
2092}
2093
2094impl Embedder for SharedCassSyncEmbedder {
2095 fn embed_sync(&self, text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
2096 if let Ok(mut cache) = self.cache.lock()
2097 && let Some(embedding) = cache.get(text).cloned()
2098 {
2099 return Ok(embedding);
2100 }
2101
2102 let embedding = self.inner.embed_sync(text)?;
2103 if let Ok(mut cache) = self.cache.lock() {
2104 cache.put(text.to_owned(), embedding.clone());
2105 }
2106 Ok(embedding)
2107 }
2108
2109 fn embed_batch_sync(
2110 &self,
2111 texts: &[&str],
2112 ) -> crate::search::embedder::EmbedderResult<Vec<Vec<f32>>> {
2113 self.inner.embed_batch_sync(texts)
2114 }
2115
2116 fn dimension(&self) -> usize {
2117 self.inner.dimension()
2118 }
2119
2120 fn id(&self) -> &str {
2121 self.inner.id()
2122 }
2123
2124 fn model_name(&self) -> &str {
2125 self.inner.model_name()
2126 }
2127
2128 fn is_ready(&self) -> bool {
2129 self.inner.is_ready()
2130 }
2131
2132 fn is_semantic(&self) -> bool {
2133 self.inner.is_semantic()
2134 }
2135
2136 fn category(&self) -> frankensearch::ModelCategory {
2137 self.inner.category()
2138 }
2139
2140 fn tier(&self) -> frankensearch::ModelTier {
2141 self.inner.tier()
2142 }
2143
2144 fn supports_mrl(&self) -> bool {
2145 self.inner.supports_mrl()
2146 }
2147}
2148
2149fn build_in_memory_two_tier_index(
2150 ann_path: Option<PathBuf>,
2151 embedder_id: &str,
2152 tier_mode: SemanticTierMode,
2153) -> Option<Arc<FsInMemoryTwoTierIndex>> {
2154 let index_dir = ann_path
2155 .as_ref()
2156 .and_then(|path| path.parent().map(Path::to_path_buf));
2157 let Some(index_dir) = index_dir else {
2158 tracing::debug!("two-tier semantic unavailable: ann/index directory path missing");
2159 return None;
2160 };
2161
2162 match FsInMemoryTwoTierIndex::from_dir(&index_dir) {
2163 Ok(index) => return Some(Arc::new(index)),
2164 Err(err) => {
2165 tracing::debug!(
2166 dir = %index_dir.display(),
2167 error = %err,
2168 "two-tier semantic index load failed; considering fallback"
2169 );
2170 }
2171 }
2172
2173 if !matches!(tier_mode, SemanticTierMode::FastOnly) {
2174 return None;
2175 }
2176
2177 let fallback_fast = index_dir.join(format!("index-{embedder_id}.fsvi"));
2178 if !fallback_fast.is_file() {
2179 return None;
2180 }
2181
2182 match FsInMemoryVectorIndex::from_fsvi(&fallback_fast) {
2183 Ok(fast) => Some(Arc::new(FsInMemoryTwoTierIndex::new(fast, None))),
2184 Err(err) => {
2185 tracing::debug!(
2186 path = %fallback_fast.display(),
2187 error = %err,
2188 "fast-only semantic fallback index load failed"
2189 );
2190 None
2191 }
2192 }
2193}
2194
2195fn two_tier_index_supports_mode(
2196 index: &FsInMemoryTwoTierIndex,
2197 tier_mode: SemanticTierMode,
2198) -> bool {
2199 !matches!(
2200 tier_mode,
2201 SemanticTierMode::Progressive | SemanticTierMode::QualityOnly
2202 ) || index.has_quality_index()
2203}
2204
/// Semantic-index identity resolved for a lexical hit.
#[derive(Debug, Clone)]
struct ResolvedSemanticDocId {
    /// Message id; used as the key of `ProgressiveLexicalCache::hits_by_message`.
    message_id: u64,
    /// Document id passed to the fusion layer for this hit.
    doc_id: String,
}
2210
// Key/row shapes used when resolving semantic doc ids for lexical hits.
// NOTE(review): the meaning of each tuple position is defined by the resolver,
// which is not visible in this part of the file — confirm there before relying on it.

/// Full lookup key for a hit.
type ProgressiveLookupKey = (String, String, Option<i64>, String, i64, Option<i64>, u64);
/// Key for the exact lookup query.
type ProgressiveExactQueryKey = (i64, i64);
/// Key for the fallback lookup query.
type ProgressiveFallbackQueryKey = (String, String, i64);
/// One resolved row: the lookup key with its resolved doc id, or `None`.
type ResolvedSemanticLookupRow = Option<(ProgressiveLookupKey, ResolvedSemanticDocId)>;
2215
/// Snapshot of a lexical `SearchHit` kept for progressive search.
///
/// It carries every `SearchHit` field except the score, which is supplied when
/// converting back with `to_search_hit`. The `title`, `snippet` and `content`
/// fields are empty when the field mask did not request them.
#[derive(Debug, Clone)]
struct ProgressiveLexicalHit {
    /// Title, or empty if not requested by the field mask.
    title: String,
    /// Snippet, or empty if not requested by the field mask.
    snippet: String,
    /// Content, or empty if not needed by the field mask.
    content: String,
    content_hash: u64,
    conversation_id: Option<i64>,
    source_path: String,
    agent: String,
    workspace: String,
    workspace_original: Option<String>,
    created_at: Option<i64>,
    match_type: MatchType,
    line_number: Option<usize>,
    source_id: String,
    origin_kind: String,
    origin_host: Option<String>,
}
2234
2235impl ProgressiveLexicalHit {
2236 fn from_search_hit(hit: &SearchHit, field_mask: FieldMask) -> Self {
2237 Self {
2238 title: if field_mask.wants_title() {
2239 hit.title.clone()
2240 } else {
2241 String::new()
2242 },
2243 snippet: if field_mask.wants_snippet() {
2244 hit.snippet.clone()
2245 } else {
2246 String::new()
2247 },
2248 content: if field_mask.needs_content() {
2249 hit.content.clone()
2250 } else {
2251 String::new()
2252 },
2253 content_hash: hit.content_hash,
2254 conversation_id: hit.conversation_id,
2255 source_path: hit.source_path.clone(),
2256 agent: hit.agent.clone(),
2257 workspace: hit.workspace.clone(),
2258 workspace_original: hit.workspace_original.clone(),
2259 created_at: hit.created_at,
2260 match_type: hit.match_type,
2261 line_number: hit.line_number,
2262 source_id: hit.source_id.clone(),
2263 origin_kind: hit.origin_kind.clone(),
2264 origin_host: hit.origin_host.clone(),
2265 }
2266 }
2267
2268 fn to_search_hit(&self, score: f32) -> SearchHit {
2269 SearchHit {
2270 title: self.title.clone(),
2271 snippet: self.snippet.clone(),
2272 content: self.content.clone(),
2273 content_hash: self.content_hash,
2274 conversation_id: self.conversation_id,
2275 score,
2276 source_path: self.source_path.clone(),
2277 agent: self.agent.clone(),
2278 workspace: self.workspace.clone(),
2279 workspace_original: self.workspace_original.clone(),
2280 created_at: self.created_at,
2281 line_number: self.line_number,
2282 match_type: self.match_type,
2283 source_id: self.source_id.clone(),
2284 origin_kind: self.origin_kind.clone(),
2285 origin_host: self.origin_host.clone(),
2286 }
2287 }
2288}
2289
/// Lexical results captured by `CassProgressiveLexicalAdapter::search`.
#[derive(Debug, Default)]
struct ProgressiveLexicalCache {
    /// Captured lexical hits keyed by resolved message id (first hit per id wins).
    hits_by_message: HashMap<u64, ProgressiveLexicalHit>,
    /// `wildcard_fallback` flag copied from the lexical search result.
    wildcard_fallback: bool,
    /// Suggestions copied from the lexical search result.
    suggestions: Vec<QuerySuggestion>,
}
2296
/// Borrowed, copyable inputs for handling a progressive phase.
#[derive(Clone, Copy)]
struct ProgressivePhaseContext<'a> {
    /// Query text.
    query: &'a str,
    /// Filters of the request.
    filters: &'a SearchFilters,
    /// Selects which hit fields are populated.
    field_mask: FieldMask,
    /// Captured lexical results, if any.
    lexical_cache: Option<&'a ProgressiveLexicalCache>,
    /// Number of hits requested by the caller.
    limit: usize,
    /// Number of candidates fetched (NOTE(review): presumably >= `limit` — confirm).
    fetch_limit: usize,
}
2306
/// Shared, immutable snapshot of the most recently captured lexical results.
type ProgressiveLexicalSnapshot = Arc<ProgressiveLexicalCache>;

/// Adapter exposing `SearchClient`'s lexical search through frankensearch's
/// `LexicalSearch` trait, publishing each search's hits into `shared`.
struct CassProgressiveLexicalAdapter {
    /// Client that performs the lexical search and doc-id resolution.
    client: Arc<SearchClient>,
    /// Filters applied to every search.
    filters: SearchFilters,
    /// Field mask applied to every search and to captured hits.
    field_mask: FieldMask,
    /// Threshold forwarded to `search_with_fallback`.
    sparse_threshold: usize,
    /// Slot that receives the snapshot of each search's results.
    shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
}
2316
2317impl CassProgressiveLexicalAdapter {
2318 fn new(
2319 client: Arc<SearchClient>,
2320 filters: SearchFilters,
2321 field_mask: FieldMask,
2322 sparse_threshold: usize,
2323 shared: Arc<Mutex<ProgressiveLexicalSnapshot>>,
2324 ) -> Self {
2325 Self {
2326 client,
2327 filters,
2328 field_mask,
2329 sparse_threshold,
2330 shared,
2331 }
2332 }
2333}
2334
2335impl FsLexicalSearch for CassProgressiveLexicalAdapter {
2336 fn search<'a>(
2337 &'a self,
2338 cx: &'a FsCx,
2339 query: &'a str,
2340 limit: usize,
2341 ) -> FsSearchFuture<'a, Vec<FsScoredResult>> {
2342 Box::pin(async move {
2343 if cx.is_cancel_requested() {
2344 return Err(FsSearchError::Cancelled {
2345 phase: "lexical".to_string(),
2346 reason: "cancel requested".to_string(),
2347 });
2348 }
2349
2350 let result = self
2351 .client
2352 .search_with_fallback(
2353 query,
2354 self.filters.clone(),
2355 limit,
2356 0,
2357 self.sparse_threshold,
2358 self.field_mask,
2359 )
2360 .map_err(|err| FsSearchError::SubsystemError {
2361 subsystem: "cass_lexical_adapter",
2362 source: Box::new(std::io::Error::other(err.to_string())),
2363 })?;
2364
2365 let resolved = self
2366 .client
2367 .resolve_semantic_doc_ids_for_hits(&result.hits)
2368 .map_err(|err| FsSearchError::SubsystemError {
2369 subsystem: "cass_lexical_adapter",
2370 source: Box::new(std::io::Error::other(err.to_string())),
2371 })?;
2372
2373 let mut scored = Vec::with_capacity(result.hits.len());
2374 let mut hits_by_message = HashMap::with_capacity(result.hits.len());
2375
2376 for (hit, resolved_doc) in result.hits.iter().zip(resolved) {
2377 let Some(resolved_doc) = resolved_doc else {
2378 continue;
2379 };
2380 hits_by_message
2381 .entry(resolved_doc.message_id)
2382 .or_insert_with(|| {
2383 ProgressiveLexicalHit::from_search_hit(hit, self.field_mask)
2384 });
2385 scored.push(FsScoredResult {
2386 doc_id: resolved_doc.doc_id,
2387 score: hit.score,
2388 source: FsScoreSource::Lexical,
2389 index: None,
2390 fast_score: None,
2391 quality_score: None,
2392 lexical_score: Some(hit.score),
2393 rerank_score: None,
2394 explanation: None,
2395 metadata: None,
2396 });
2397 }
2398
2399 if let Ok(mut guard) = self.shared.lock() {
2400 *guard = Arc::new(ProgressiveLexicalCache {
2401 hits_by_message,
2402 wildcard_fallback: result.wildcard_fallback,
2403 suggestions: result.suggestions,
2404 });
2405 }
2406
2407 Ok(scored)
2408 })
2409 }
2410
2411 fn index_document<'a>(
2412 &'a self,
2413 _cx: &'a FsCx,
2414 _doc: &'a frankensearch::IndexableDocument,
2415 ) -> FsSearchFuture<'a, ()> {
2416 Box::pin(async move {
2417 Err(FsSearchError::SubsystemError {
2418 subsystem: "cass_lexical_adapter",
2419 source: Box::new(std::io::Error::other("cass lexical adapter is read-only")),
2420 })
2421 })
2422 }
2423
2424 fn commit<'a>(&'a self, _cx: &'a FsCx) -> FsSearchFuture<'a, ()> {
2425 Box::pin(async move { Ok(()) })
2426 }
2427
2428 fn doc_count(&self) -> usize {
2429 self.client.total_docs()
2430 }
2431}
2432
/// Search entry point holding the lexical reader, optional SQLite connection,
/// caches, reload/warm-up bookkeeping and optional semantic state.
///
/// NOTE(review): field notes below are derived from names and types only;
/// confirm against the methods that use them.
pub struct SearchClient {
    /// Index reader and its field handles, when a lexical index is open.
    reader: Option<(IndexReader, FsCassFields)>,
    /// SQLite connection, when one is open.
    sqlite: Mutex<Option<SendConnection>>,
    /// Path of the SQLite database, when known.
    sqlite_path: Option<PathBuf>,
    /// Sharded prefix cache.
    prefix_cache: Mutex<CacheShards>,
    /// Whether the reader is reloaded as part of searching.
    reload_on_search: bool,
    /// Time of the last reload, if any.
    last_reload: Mutex<Option<Instant>>,
    /// Last observed generation, if any.
    last_generation: Mutex<Option<u64>>,
    /// Shared reload counter.
    reload_epoch: Arc<AtomicU64>,
    /// Sender for warm-up jobs, when warming is enabled.
    warm_tx: Option<mpsc::Sender<WarmJob>>,
    /// Handle of the warm-up thread, kept alive with the client.
    _warm_handle: Option<std::thread::JoinHandle<()>>,
    /// Metrics counters.
    metrics: Metrics,
    /// Key under which this client is registered in `FEDERATED_SEARCH_READERS`
    /// (removed again on drop).
    cache_namespace: String,
    /// Semantic search state, when configured.
    semantic: Mutex<Option<SemanticSearchState>>,
    /// Total count recorded by the most recent Tantivy search, if any.
    last_tantivy_total_count: Mutex<Option<usize>>,
}
2452
/// Feature switches for constructing a `SearchClient`.
#[derive(Debug, Clone, Copy)]
pub struct SearchClientOptions {
    /// Enables reader reloading.
    pub enable_reload: bool,
    /// Enables cache warming.
    pub enable_warm: bool,
}

impl Default for SearchClientOptions {
    /// Both reloading and warming are enabled by default.
    fn default() -> Self {
        let enabled = true;
        SearchClientOptions {
            enable_reload: enabled,
            enable_warm: enabled,
        }
    }
}
2467
2468impl Drop for SearchClient {
2469 fn drop(&mut self) {
2470 FEDERATED_SEARCH_READERS
2471 .write()
2472 .remove(&self.cache_namespace);
2473 }
2474}
2475
/// Snapshot of cache and reload counters reported alongside search results.
///
/// NOTE(review): per-field notes paraphrase the field names; the producer of
/// these values is not visible here.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CacheStats {
    /// Number of cache hits.
    pub cache_hits: u64,
    /// Number of cache misses.
    pub cache_miss: u64,
    /// Number of cache shortfalls.
    pub cache_shortfall: u64,
    /// Number of reader reloads.
    pub reloads: u64,
    /// Total time spent reloading, in milliseconds.
    pub reload_ms_total: u128,
    /// Entry capacity across the whole cache.
    pub total_cap: usize,
    /// Current total cost across the whole cache.
    pub total_cost: usize,
    /// Number of evictions.
    pub eviction_count: u64,
    /// Approximate bytes held by the cache.
    pub approx_bytes: usize,
    /// Byte capacity of the cache.
    pub byte_cap: usize,
    /// Name of the eviction policy; `"unknown"` by default.
    pub eviction_policy: &'static str,
    /// Number of ghost entries.
    pub ghost_entries: usize,
    /// Number of admission rejections.
    pub admission_rejects: u64,
    /// Number of prewarm jobs scheduled.
    pub prewarm_scheduled: u64,
    /// Number of prewarm jobs skipped under pressure.
    pub prewarm_skipped_pressure: u64,
    /// Reader generation, when known.
    pub reader_generation: Option<u64>,
}

impl Default for CacheStats {
    /// All counters and sizes are zero, the policy is `"unknown"` and no reader
    /// generation is recorded.
    fn default() -> Self {
        CacheStats {
            // Non-numeric defaults first; they are why `Default` is not derived.
            eviction_policy: "unknown",
            reader_generation: None,
            // Hit/miss and reload counters.
            cache_hits: 0,
            cache_miss: 0,
            cache_shortfall: 0,
            reloads: 0,
            reload_ms_total: 0,
            // Capacity and occupancy.
            total_cap: 0,
            total_cost: 0,
            approx_bytes: 0,
            byte_cap: 0,
            // Eviction, admission and prewarm counters.
            eviction_count: 0,
            ghost_entries: 0,
            admission_rejects: 0,
            prewarm_scheduled: 0,
            prewarm_skipped_pressure: 0,
        }
    }
}
2527
2528static CACHE_SHARD_CAP: Lazy<usize> = Lazy::new(|| {
2531 dotenvy::var("CASS_CACHE_SHARD_CAP")
2532 .ok()
2533 .and_then(|v| v.parse::<usize>().ok())
2534 .filter(|v| *v > 0)
2535 .unwrap_or(256)
2536});
2537
2538static CACHE_TOTAL_CAP: Lazy<usize> = Lazy::new(|| {
2540 dotenvy::var("CASS_CACHE_TOTAL_CAP")
2541 .ok()
2542 .and_then(|v| v.parse::<usize>().ok())
2543 .filter(|v| *v > 0)
2544 .unwrap_or(2048)
2545});
2546
2547static CACHE_DEBUG_ENABLED: Lazy<bool> = Lazy::new(|| {
2548 dotenvy::var("CASS_DEBUG_CACHE_METRICS")
2549 .map(|v| v == "1" || v.eq_ignore_ascii_case("true"))
2550 .unwrap_or(false)
2551});
2552
/// Byte budget for the cache, resolved once.
///
/// When `CASS_CACHE_BYTE_CAP` is set it is interpreted by
/// `cache_byte_cap_from_env_value`; otherwise the memory-derived default from
/// `default_cache_byte_cap` is used.
static CACHE_BYTE_CAP: Lazy<usize> = Lazy::new(|| match dotenvy::var("CASS_CACHE_BYTE_CAP") {
    Ok(value) => cache_byte_cap_from_env_value(Some(&value), available_memory_bytes()),
    Err(_) => default_cache_byte_cap(),
});
2559
/// Process-wide eviction policy, resolved once from `CASS_CACHE_EVICTION_POLICY`
/// via `cache_eviction_policy_from_env_value` (unset means the parser's default).
static CACHE_EVICTION_POLICY: Lazy<CacheEvictionPolicy> = Lazy::new(|| {
    cache_eviction_policy_from_env_value(dotenvy::var("CASS_CACHE_EVICTION_POLICY").ok().as_deref())
});
2563
/// Byte cap used when available memory is unknown; also the lower bound of the
/// memory-derived default (64 MiB).
const DEFAULT_CACHE_BYTE_CAP_FALLBACK: usize = 64 * 1024 * 1024;
/// The default byte cap is `available_memory / 128` before clamping.
const DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR: u64 = 128;
/// Upper bound of the memory-derived default byte cap (2 GiB).
const DEFAULT_CACHE_BYTE_CAP_CEILING: u64 = 2 * 1024 * 1024 * 1024;
/// The ghost-key list may hold up to `total_cap * 2` keys.
const S3_FIFO_GHOST_CAP_MULTIPLIER: usize = 2;
/// Under S3-FIFO an entry larger than a quarter of either cap counts as "large".
const S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR: usize = 4;
/// Prewarm entry pressure is reported at 9/10 of the total cap.
const PREWARM_ENTRY_PRESSURE_NUMERATOR: usize = 9;
const PREWARM_ENTRY_PRESSURE_DENOMINATOR: usize = 10;
/// Prewarm byte pressure is reported at 4/5 of the byte cap.
const PREWARM_BYTE_PRESSURE_NUMERATOR: usize = 4;
const PREWARM_BYTE_PRESSURE_DENOMINATOR: usize = 5;

/// Version tag embedded in every client's cache namespace string.
const CACHE_KEY_VERSION: &str = "1";
2575
/// Warm-up debounce interval in milliseconds, read once from
/// `CASS_WARM_DEBOUNCE_MS`.
///
/// Missing, unparsable, or zero values fall back to 120.
/// NOTE(review): the consumer is outside this section — presumably the warm
/// worker; confirm.
static WARM_DEBOUNCE_MS: Lazy<u64> = Lazy::new(|| {
    dotenvy::var("CASS_WARM_DEBOUNCE_MS")
        .ok()
        .and_then(|v| v.parse::<u64>().ok())
        .filter(|v| *v > 0)
        .unwrap_or(120)
});
2584
/// Returns the default cache byte cap derived from the memory currently
/// reported by `available_memory_bytes`.
fn default_cache_byte_cap() -> usize {
    default_cache_byte_cap_for_available(available_memory_bytes())
}
2588
2589fn cache_byte_cap_from_env_value(value: Option<&str>, available_bytes: Option<u64>) -> usize {
2590 let Some(raw) = value else {
2591 return default_cache_byte_cap_for_available(available_bytes);
2592 };
2593 raw.parse::<usize>()
2594 .unwrap_or_else(|_| default_cache_byte_cap_for_available(available_bytes))
2595}
2596
2597fn default_cache_byte_cap_for_available(available_bytes: Option<u64>) -> usize {
2598 let Some(available_bytes) = available_bytes else {
2599 return DEFAULT_CACHE_BYTE_CAP_FALLBACK;
2600 };
2601 let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
2602 let budget = available_bytes / DEFAULT_CACHE_BYTE_CAP_MEMORY_FRACTION_DENOMINATOR;
2603 let budget = budget.min(DEFAULT_CACHE_BYTE_CAP_CEILING);
2604 let budget = usize::try_from(budget).unwrap_or(ceiling);
2605 budget.clamp(DEFAULT_CACHE_BYTE_CAP_FALLBACK, ceiling)
2606}
2607
/// Eviction/admission strategy used by `CacheShards`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum CacheEvictionPolicy {
    /// Every insert is admitted and no ghost keys are recorded.
    Lru,
    /// Large first-time entries are rejected unless their key is in the ghost
    /// list; rejected and evicted keys are remembered as ghosts.
    S3Fifo,
}
2613
2614impl CacheEvictionPolicy {
2615 fn label(self) -> &'static str {
2616 match self {
2617 CacheEvictionPolicy::Lru => "lru",
2618 CacheEvictionPolicy::S3Fifo => "s3-fifo",
2619 }
2620 }
2621}
2622
2623fn cache_eviction_policy_from_env_value(value: Option<&str>) -> CacheEvictionPolicy {
2624 match value.map(str::trim).filter(|value| !value.is_empty()) {
2625 Some(value) if value.eq_ignore_ascii_case("s3-fifo") => CacheEvictionPolicy::S3Fifo,
2626 Some(value) if value.eq_ignore_ascii_case("s3fifo") => CacheEvictionPolicy::S3Fifo,
2627 Some(value) if value.eq_ignore_ascii_case("s3_fifo") => CacheEvictionPolicy::S3Fifo,
2628 _ => CacheEvictionPolicy::Lru,
2629 }
2630}
2631
/// A search hit stored in the prefix cache together with precomputed data used
/// when re-filtering cached results.
#[derive(Clone)]
struct CachedHit {
    /// The hit as it was returned to the caller.
    hit: SearchHit,
    /// Presumably the lowercased hit content — TODO confirm at the construction site.
    lc_content: String,
    /// Presumably the lowercased hit title, when one exists — TODO confirm.
    lc_title: Option<String>,
    /// 64-bit filter word; NOTE(review): its construction is not visible in this section.
    bloom64: u64,
}
2639
2640impl CachedHit {
2641 fn approx_bytes(&self) -> usize {
2644 let base = std::mem::size_of::<Self>();
2646 let hit_strings = self.hit.title.len()
2648 + self.hit.snippet.len()
2649 + self.hit.content.len()
2650 + self.hit.source_path.len()
2651 + self.hit.agent.len()
2652 + self.hit.workspace.len()
2653 + self
2654 .hit
2655 .workspace_original
2656 .as_ref()
2657 .map_or(0, std::string::String::len)
2658 + self.hit.source_id.len()
2659 + self.hit.origin_kind.len()
2660 + self
2661 .hit
2662 .origin_host
2663 .as_ref()
2664 .map_or(0, std::string::String::len);
2665 let lc_strings =
2667 self.lc_content.len() + self.lc_title.as_ref().map_or(0, std::string::String::len);
2668 base + hit_strings + lc_strings
2669 }
2670}
2671
/// Sharded cache of search hits with global entry and byte budgets.
struct CacheShards {
    /// Per-shard LRU caches, keyed by shard name.
    shards: HashMap<Arc<str>, LruCache<Arc<str>, Vec<CachedHit>>>,
    /// Global limit on `total_cost` (always at least 1).
    total_cap: usize,
    /// Sum of the hit counts of all cached values.
    total_cost: usize,
    /// Number of entries evicted so far; not reset by `clear`.
    eviction_count: u64,
    /// Sum of `CachedHit::approx_bytes` over all cached hits.
    total_bytes: usize,
    /// Global byte budget; `0` disables byte-based eviction.
    byte_cap: usize,
    /// Eviction/admission policy in effect.
    policy: CacheEvictionPolicy,
    /// Ghost keys in insertion order, oldest first (S3-FIFO only).
    ghost_keys: VecDeque<Arc<str>>,
    /// Membership set mirroring `ghost_keys`.
    ghost_set: HashSet<Arc<str>>,
    /// Number of inserts rejected by the admission filter; not reset by `clear`.
    admission_rejects: u64,
}
2690
2691impl CacheShards {
2692 fn new(total_cap: usize, byte_cap: usize) -> Self {
2693 Self::new_with_policy(total_cap, byte_cap, *CACHE_EVICTION_POLICY)
2694 }
2695
2696 fn new_with_policy(total_cap: usize, byte_cap: usize, policy: CacheEvictionPolicy) -> Self {
2697 Self {
2698 shards: HashMap::new(),
2699 total_cap: total_cap.max(1),
2700 total_cost: 0,
2701 eviction_count: 0,
2702 total_bytes: 0,
2703 byte_cap,
2704 policy,
2705 ghost_keys: VecDeque::new(),
2706 ghost_set: HashSet::new(),
2707 admission_rejects: 0,
2708 }
2709 }
2710
2711 fn shard_mut(&mut self, name: &str) -> &mut LruCache<Arc<str>, Vec<CachedHit>> {
2712 let interned_name = intern_cache_key(name);
2714 self.shards
2715 .entry(interned_name)
2716 .or_insert_with(|| LruCache::new(NonZeroUsize::new(*CACHE_SHARD_CAP).unwrap()))
2717 }
2718
2719 fn shard_opt(&self, name: &str) -> Option<&LruCache<Arc<str>, Vec<CachedHit>>> {
2720 self.shards.get(name)
2722 }
2723
2724 fn put(&mut self, shard_name: &str, key: Arc<str>, value: Vec<CachedHit>) {
2725 let new_cost = value.len();
2726 let new_bytes: usize = value.iter().map(CachedHit::approx_bytes).sum();
2727 let replacing = self
2728 .shard_opt(shard_name)
2729 .is_some_and(|shard| shard.contains(&key));
2730
2731 if !replacing && !self.should_admit(&key, new_cost, new_bytes) {
2732 self.admission_rejects += 1;
2733 self.record_ghost(key);
2734 return;
2735 }
2736
2737 self.remove_ghost(&key);
2738
2739 let shard = self.shard_mut(shard_name);
2740 let old_val = shard.put(key, value);
2741 let (old_cost, old_bytes) = old_val.as_ref().map_or((0, 0), |v| {
2742 (v.len(), v.iter().map(CachedHit::approx_bytes).sum())
2743 });
2744
2745 self.total_cost = self
2746 .total_cost
2747 .saturating_add(new_cost)
2748 .saturating_sub(old_cost);
2749 self.total_bytes = self
2750 .total_bytes
2751 .saturating_add(new_bytes)
2752 .saturating_sub(old_bytes);
2753 self.evict_until_within_cap();
2754 }
2755
2756 fn evict_until_within_cap(&mut self) {
2757 while self.total_cost > self.total_cap
2759 || (self.byte_cap > 0 && self.total_bytes > self.byte_cap)
2760 {
2761 let byte_pressure = self.byte_cap > 0 && self.total_bytes > self.byte_cap;
2766 let mut largest_shard_key = None;
2767 let mut max_score = 0usize;
2768 for (k, v) in self.shards.iter() {
2769 let score = if byte_pressure {
2770 shard_cached_bytes(v)
2771 } else {
2772 v.len()
2773 };
2774 if score > max_score {
2775 max_score = score;
2776 largest_shard_key = Some(k.clone());
2777 }
2778 }
2779
2780 if let Some(key) = largest_shard_key {
2781 if let Some(shard) = self.shards.get_mut(&key)
2782 && let Some((evicted_key, v)) = shard.pop_lru()
2783 {
2784 let evicted_bytes: usize = v.iter().map(CachedHit::approx_bytes).sum();
2785 self.total_cost = self.total_cost.saturating_sub(v.len());
2786 self.total_bytes = self.total_bytes.saturating_sub(evicted_bytes);
2787 self.eviction_count += 1;
2788 self.record_ghost(evicted_key);
2789 }
2790 } else {
2791 break; }
2793 }
2794 }
2795
2796 fn should_admit(&self, key: &Arc<str>, cost: usize, bytes: usize) -> bool {
2797 if self.policy == CacheEvictionPolicy::Lru || self.ghost_set.contains(key) {
2798 return true;
2799 }
2800 !self.is_s3_fifo_large_candidate(cost, bytes)
2801 }
2802
2803 fn is_s3_fifo_large_candidate(&self, cost: usize, bytes: usize) -> bool {
2804 let entry_heavy = cost
2805 > self
2806 .total_cap
2807 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2808 let byte_heavy = self.byte_cap > 0
2809 && bytes
2810 > self
2811 .byte_cap
2812 .div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR);
2813 entry_heavy || byte_heavy
2814 }
2815
2816 fn record_ghost(&mut self, key: Arc<str>) {
2817 if self.policy != CacheEvictionPolicy::S3Fifo {
2818 return;
2819 }
2820 if self.ghost_set.insert(key.clone()) {
2821 self.ghost_keys.push_back(key);
2822 }
2823 let cap = self
2824 .total_cap
2825 .saturating_mul(S3_FIFO_GHOST_CAP_MULTIPLIER)
2826 .max(1);
2827 while self.ghost_set.len() > cap {
2828 if let Some(old) = self.ghost_keys.pop_front() {
2829 self.ghost_set.remove(&old);
2830 } else {
2831 break;
2832 }
2833 }
2834 }
2835
2836 fn remove_ghost(&mut self, key: &Arc<str>) {
2837 self.ghost_set.remove(key);
2838 self.ghost_keys.retain(|candidate| candidate != key);
2839 }
2840
2841 fn clear(&mut self) {
2842 self.shards.clear();
2843 self.total_cost = 0;
2844 self.total_bytes = 0;
2845 self.ghost_keys.clear();
2846 self.ghost_set.clear();
2847 }
2849
2850 fn total_cost(&self) -> usize {
2851 self.total_cost
2852 }
2853
2854 fn total_cap(&self) -> usize {
2855 self.total_cap
2856 }
2857
2858 fn eviction_count(&self) -> u64 {
2859 self.eviction_count
2860 }
2861
2862 fn total_bytes(&self) -> usize {
2863 self.total_bytes
2864 }
2865
2866 fn byte_cap(&self) -> usize {
2867 self.byte_cap
2868 }
2869
2870 fn policy_label(&self) -> &'static str {
2871 self.policy.label()
2872 }
2873
2874 fn ghost_entries(&self) -> usize {
2875 self.ghost_set.len()
2876 }
2877
2878 fn admission_rejects(&self) -> u64 {
2879 self.admission_rejects
2880 }
2881
2882 fn prewarm_pressure(&self) -> bool {
2883 let entry_pressure = self
2884 .total_cost
2885 .saturating_mul(PREWARM_ENTRY_PRESSURE_DENOMINATOR)
2886 >= self
2887 .total_cap
2888 .saturating_mul(PREWARM_ENTRY_PRESSURE_NUMERATOR);
2889 let byte_pressure = self.byte_cap > 0
2890 && self
2891 .total_bytes
2892 .saturating_mul(PREWARM_BYTE_PRESSURE_DENOMINATOR)
2893 >= self
2894 .byte_cap
2895 .saturating_mul(PREWARM_BYTE_PRESSURE_NUMERATOR);
2896 entry_pressure || byte_pressure
2897 }
2898}
2899
2900fn shard_cached_bytes(shard: &LruCache<Arc<str>, Vec<CachedHit>>) -> usize {
2901 shard
2902 .iter()
2903 .map(|(_key, hits)| hits.iter().map(CachedHit::approx_bytes).sum::<usize>())
2904 .sum()
2905}
2906
/// Description of one cache warm-up request.
///
/// NOTE(review): the producer and consumer are outside this section —
/// presumably sent over `warm_tx` to the warm worker; confirm.
#[derive(Clone)]
struct WarmJob {
    /// Query text to warm.
    query: String,
    /// Fingerprint of the filters the query was issued with.
    filters_fingerprint: String,
    /// Name of the cache shard the result belongs to.
    shard_name: String,
}
2913
/// Outcome of the adaptive prewarm check.
///
/// NOTE(review): the decision logic is outside this section; variant notes are
/// inferred from the names.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum AdaptivePrewarmDecision {
    /// A prewarm job should be scheduled.
    Schedule,
    /// Skipped, presumably because the query is not hot enough — TODO confirm.
    SkipCold,
    /// Skipped because the cache is under pressure (see `CacheShards::prewarm_pressure`).
    SkipPressure,
}
2920
/// A `Searcher` cached per thread together with the values used to decide
/// whether it can be reused.
///
/// NOTE(review): the reuse check is outside this section; field notes are
/// inferred from the names.
#[derive(Clone)]
struct SearcherCacheEntry {
    /// Reload epoch the searcher was created under.
    epoch: u64,
    /// Identifies the reader the searcher came from (presumably an address-derived key; confirm).
    reader_key: usize,
    /// The cached searcher.
    searcher: Searcher,
}
2927
// Per-thread slot holding at most one cached searcher entry; starts empty.
thread_local! {
    static THREAD_SEARCHER: RefCell<Option<SearcherCacheEntry>> = const { RefCell::new(None) };
}
2931
/// One shard of a federated lexical index: a reader together with the field
/// handles that belong to it.
#[derive(Clone)]
struct FederatedIndexReader {
    /// Reader for this shard.
    reader: IndexReader,
    /// Field handles for this shard.
    fields: FsCassFields,
}
2937
/// Process-wide registry of federated readers, keyed by a client's cache
/// namespace string (see `SearchClient::open_with_options`).
static FEDERATED_SEARCH_READERS: Lazy<RwLock<HashMap<String, Arc<Vec<FederatedIndexReader>>>>> =
    Lazy::new(|| RwLock::new(HashMap::new()));
/// Source of per-client ids embedded in the cache namespace; starts at 1 and is
/// incremented for every `open_with_options` call.
static SEARCH_CLIENT_INSTANCE_COUNTER: AtomicU64 = AtomicU64::new(1);
2941
/// Computes the Levenshtein edit distance between `a` and `b`.
///
/// The distance is measured in Unicode scalar values (`char`s), not bytes, and
/// counts single-character insertions, deletions, and substitutions. Only two
/// rows of the dynamic-programming table are kept in memory.
fn levenshtein_distance(a: &str, b: &str) -> usize {
    let source: Vec<char> = a.chars().collect();
    let target: Vec<char> = b.chars().collect();

    // If either side is empty the distance is the length of the other.
    match (source.len(), target.len()) {
        (0, remaining) | (remaining, 0) => return remaining,
        _ => {}
    }

    let width = target.len() + 1;
    let mut above: Vec<usize> = (0..width).collect();
    let mut row: Vec<usize> = vec![0; width];

    for (row_idx, &src) in source.iter().enumerate() {
        row[0] = row_idx + 1;
        for (col, &dst) in target.iter().enumerate() {
            let deletion = above[col + 1] + 1;
            let insertion = row[col] + 1;
            let substitution = above[col] + usize::from(src != dst);
            row[col + 1] = deletion.min(insertion).min(substitution);
        }
        // The row just filled becomes the "previous" row for the next character.
        std::mem::swap(&mut above, &mut row);
    }

    above[target.len()]
}
2974
2975fn normalize_term_parts(raw: &str) -> Vec<String> {
2980 let mut parts = Vec::new();
2981 for token in nfc_sanitize_query(raw).split_whitespace() {
2982 let mut current = String::new();
2983 let mut chars = token.chars().peekable();
2984 while let Some(ch) = chars.next() {
2985 let trailing_wildcard = ch == '*' && chars.peek().is_none() && !current.is_empty();
2986 if ch.is_alphanumeric() || ch == '_' || trailing_wildcard {
2987 current.push(ch);
2988 continue;
2989 }
2990
2991 if !current.is_empty() {
2992 parts.push(std::mem::take(&mut current));
2993 }
2994 }
2995
2996 if !current.is_empty() {
2997 parts.push(current);
2998 }
2999 }
3000 parts
3001}
3002
3003fn normalize_phrase_terms(raw: &str) -> Vec<String> {
3005 normalize_term_parts(raw)
3006 .into_iter()
3007 .map(|s| s.trim_matches('*').to_lowercase())
3008 .filter(|s| !s.is_empty())
3009 .collect()
3010}
3011
3012fn render_fts5_term_part(part: &str) -> Option<String> {
3013 let pattern = FsCassWildcardPattern::parse(part);
3014 if matches!(
3015 pattern,
3016 FsCassWildcardPattern::Suffix(_)
3017 | FsCassWildcardPattern::Substring(_)
3018 | FsCassWildcardPattern::Complex(_)
3019 ) {
3020 return None;
3021 }
3022
3023 Some(part.to_string())
3024}
3025
3026fn dominant_match_type(query: &str) -> MatchType {
3029 let mut worst = MatchType::Exact;
3030 for term in query.split_whitespace() {
3031 let pattern = FsCassWildcardPattern::parse(term);
3032 let mt = match pattern {
3033 FsCassWildcardPattern::Exact(_) => MatchType::Exact,
3034 FsCassWildcardPattern::Prefix(_) => MatchType::Prefix,
3035 FsCassWildcardPattern::Suffix(_) => MatchType::Suffix,
3036 FsCassWildcardPattern::Substring(_) => MatchType::Substring,
3037 FsCassWildcardPattern::Complex(_) => MatchType::Wildcard,
3038 };
3039 if mt.quality_factor() < worst.quality_factor() {
3041 worst = mt;
3042 }
3043 }
3044 worst
3045}
3046
/// Reports whether `content` is just a tool-invocation marker with nothing
/// useful in it.
///
/// After trimming:
/// * Text starting with `[Tool:` is noise when the bracket is never closed, or
///   when both the text inside the brackets and the text after the first `]`
///   are blank.
/// * Otherwise, text shorter than 20 bytes is noise when, lowercased, it
///   starts with `[tool` or `tool:`.
pub(crate) fn is_tool_invocation_noise(content: &str) -> bool {
    let text = content.trim();

    if let Some(after_prefix) = text.strip_prefix("[Tool:") {
        return match after_prefix.split_once(']') {
            // Unterminated marker: nothing but the marker itself.
            None => true,
            Some((inner, trailing)) => trailing.trim().is_empty() && inner.trim().is_empty(),
        };
    }

    if text.len() >= 20 {
        return false;
    }
    let lowered = text.to_lowercase();
    lowered.starts_with("[tool") || lowered.starts_with("tool:")
}
3083
3084fn hit_content_for_noise_check(hit: &SearchHit) -> &str {
3085 if hit.content.is_empty() {
3086 &hit.snippet
3087 } else {
3088 &hit.content
3089 }
3090}
3091
3092fn hit_is_noise(hit: &SearchHit, query: &str) -> bool {
3093 let content_to_check = hit_content_for_noise_check(hit);
3094 if content_to_check.is_empty() {
3104 return false;
3105 }
3106 is_search_noise_text(content_to_check, query) || is_tool_invocation_noise(content_to_check)
3107}
3108
/// Builds a plain preview of `content`: the trimmed text, cut to its first
/// 200 characters with `...` appended when anything was cut off.
fn snippet_from_content(content: &str) -> String {
    let trimmed = content.trim();
    // The byte offset of the 201st character, if there is one, is where the
    // preview has to end.
    match trimmed.char_indices().nth(200) {
        Some((cut, _)) => format!("{}...", &trimmed[..cut]),
        None => trimmed.to_owned(),
    }
}
3119
/// Test-only shorthand for `deduplicate_hits_with_query` with an empty query.
#[cfg(test)]
pub(crate) fn deduplicate_hits(hits: Vec<SearchHit>) -> Vec<SearchHit> {
    deduplicate_hits_with_query(hits, "")
}
3131
3132pub(crate) fn deduplicate_hits_with_query(hits: Vec<SearchHit>, query: &str) -> Vec<SearchHit> {
3133 let mut source_ids: HashMap<String, u32> = HashMap::new();
3140 let mut path_ids: HashMap<String, u32> = HashMap::new();
3141 let mut title_ids: HashMap<String, u32> = HashMap::new();
3142 let mut next_source_id: u32 = 0;
3143 let mut next_path_id: u32 = 0;
3144 let mut next_title_id: u32 = 0;
3145 type DedupKey = (
3146 u32,
3147 u32,
3148 Option<i64>,
3149 Option<u32>,
3150 Option<usize>,
3151 Option<i64>,
3152 u64,
3153 );
3154
3155 let mut seen: HashMap<DedupKey, usize> = HashMap::new();
3156 let mut deduped: Vec<SearchHit> = Vec::new();
3157
3158 for hit in hits {
3159 if hit_is_noise(&hit, query) {
3160 continue;
3161 }
3162
3163 let normalized_source_id = normalized_search_hit_source_id(&hit);
3166 let source_key = if let Some(id) = source_ids.get(normalized_source_id.as_str()) {
3167 *id
3168 } else {
3169 let id = next_source_id;
3170 next_source_id = next_source_id.saturating_add(1);
3171 source_ids.insert(normalized_source_id, id);
3172 id
3173 };
3174 let path_key = if let Some(id) = path_ids.get(hit.source_path.as_str()) {
3175 *id
3176 } else {
3177 let id = next_path_id;
3178 next_path_id = next_path_id.saturating_add(1);
3179 path_ids.insert(hit.source_path.clone(), id);
3180 id
3181 };
3182 let title_key = if hit.conversation_id.is_some() {
3183 None
3184 } else {
3185 let normalized_title = hit.title.trim();
3186 Some(if let Some(id) = title_ids.get(normalized_title) {
3187 *id
3188 } else {
3189 let id = next_title_id;
3190 next_title_id = next_title_id.saturating_add(1);
3191 title_ids.insert(normalized_title.to_string(), id);
3192 id
3193 })
3194 };
3195 let key = (
3196 source_key,
3197 path_key,
3198 hit.conversation_id,
3199 title_key,
3200 hit.line_number,
3201 hit.created_at,
3202 hit.content_hash,
3203 );
3204
3205 if let Some(&existing_idx) = seen.get(&key) {
3206 if deduped[existing_idx].score < hit.score {
3208 deduped[existing_idx] = hit;
3209 }
3210 } else {
3212 seen.insert(key, deduped.len());
3213 deduped.push(hit);
3214 }
3215 }
3216
3217 deduped
3218}
3219
/// Decides whether a sparse result set warrants a wildcard fallback search.
///
/// Only the first page (`offset == 0`) qualifies. The result counts as sparse
/// when `returned_hits` is below `sparse_threshold`, which is capped at
/// `limit` unless `limit` is 0.
fn should_try_wildcard_fallback(
    returned_hits: usize,
    limit: usize,
    offset: usize,
    sparse_threshold: usize,
) -> bool {
    let threshold = match limit {
        0 => sparse_threshold,
        capped => sparse_threshold.min(capped),
    };

    offset == 0 && returned_hits < threshold
}
3238
3239fn should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(
3240 query: &str,
3241 returned_hits: usize,
3242) -> bool {
3243 if returned_hits != 0 {
3244 return false;
3245 }
3246
3247 for token in normalize_phrase_terms(query) {
3248 if token.chars().count() > AUTOMATIC_WILDCARD_FALLBACK_MAX_TOKEN_CHARS {
3249 return true;
3250 }
3251 }
3252
3253 false
3254}
3255
3256fn snippet_from_preview_without_full_content(
3257 field_mask: FieldMask,
3258 stored_preview: &str,
3259 query: &str,
3260) -> Option<String> {
3261 if field_mask.needs_content() || !field_mask.wants_snippet() || stored_preview.is_empty() {
3262 return None;
3263 }
3264
3265 cached_prefix_snippet(stored_preview, query, 160)
3266}
3267
/// Reports whether a stored preview holds the whole content: it must be
/// non-empty and must not end with the `…` truncation marker.
fn stored_preview_is_complete_content(stored_preview: &str) -> bool {
    !(stored_preview.is_empty() || stored_preview.ends_with('…'))
}
3273
3274impl SearchClient {
    /// Opens a search client for `index_path` (and optionally the database at
    /// `db_path`) using `SearchClientOptions::default()`.
    ///
    /// See [`Self::open_with_options`] for the return value.
    pub fn open(index_path: &Path, db_path: Option<&Path>) -> Result<Option<Self>> {
        Self::open_with_options(index_path, db_path, SearchClientOptions::default())
    }
3278
    /// Opens a search client with explicit options.
    ///
    /// Backends are probed in this order: the single lexical index at
    /// `index_path`, then (only if that failed) federated readers for the same
    /// path, and finally the SQLite database at `db_path` when that path
    /// exists. Errors from opening the lexical readers are discarded and treated
    /// as "backend unavailable".
    ///
    /// Returns `Ok(None)` when no backend is available at all.
    pub fn open_with_options(
        index_path: &Path,
        db_path: Option<&Path>,
        options: SearchClientOptions,
    ) -> Result<Option<Self>> {
        let tantivy = fs_cass_open_search_reader(index_path, ReloadPolicy::Manual).ok();
        // Every client gets its own id, so cache namespaces never collide even
        // for the same index path.
        let client_id = SEARCH_CLIENT_INSTANCE_COUNTER.fetch_add(1, Ordering::Relaxed);
        let cache_namespace = format!(
            "v{}|schema:{}|client:{}|index:{}",
            CACHE_KEY_VERSION,
            FS_CASS_SCHEMA_HASH,
            client_id,
            index_path.display()
        );
        // Federated readers are only considered when the single index could not
        // be opened; an empty reader list counts as "none".
        let federated_readers = if tantivy.is_none() {
            crate::search::tantivy::open_federated_search_readers(index_path, ReloadPolicy::Manual)
                .ok()
                .flatten()
                .filter(|readers| !readers.is_empty())
                .map(|readers| {
                    Arc::new(
                        readers
                            .into_iter()
                            .map(|(reader, fields)| FederatedIndexReader { reader, fields })
                            .collect::<Vec<_>>(),
                    )
                })
        } else {
            None
        };

        // Only keep the database path if the file is actually there.
        let sqlite_path = db_path.map(Path::to_path_buf).filter(|path| path.exists());

        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_some() {
            tracing::warn!(
                index_path = %index_path.display(),
                "Tantivy search index not found or incompatible. \
                 Search results will be degraded. \
                 Run `cass index --full` to rebuild the index."
            );
        }

        if tantivy.is_none() && federated_readers.is_none() && sqlite_path.is_none() {
            return Ok(None);
        }

        let reload_epoch = Arc::new(AtomicU64::new(0));
        let metrics = Metrics::default();

        // The warm worker is only started for the single-index backend, and
        // only when warming is enabled in the options.
        let warm_pair = if options.enable_warm
            && let Some((reader, fields)) = &tantivy
        {
            maybe_spawn_warm_worker(
                reader.clone(),
                *fields,
                reload_epoch.clone(),
                metrics.clone(),
            )
        } else {
            None
        };

        // Publish (or clear) this client's federated readers in the global
        // registry under its namespace.
        if let Some(readers) = &federated_readers {
            FEDERATED_SEARCH_READERS
                .write()
                .insert(cache_namespace.clone(), Arc::clone(readers));
        } else {
            FEDERATED_SEARCH_READERS.write().remove(&cache_namespace);
        }

        Ok(Some(Self {
            reader: tantivy,
            // The SQLite connection is opened lazily by `sqlite_guard`.
            sqlite: Mutex::new(None),
            sqlite_path,
            prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
            reload_on_search: options.enable_reload,
            last_reload: Mutex::new(None),
            last_generation: Mutex::new(None),
            reload_epoch,
            warm_tx: warm_pair.as_ref().map(|(tx, _)| tx.clone()),
            _warm_handle: warm_pair.map(|(_, h)| h),
            metrics,
            cache_namespace,
            semantic: Mutex::new(None),
            last_tantivy_total_count: Mutex::new(None),
        }))
    }
3366
3367 fn sqlite_guard(&self) -> Result<std::sync::MutexGuard<'_, Option<SendConnection>>> {
3368 let mut guard = self
3369 .sqlite
3370 .lock()
3371 .map_err(|_| anyhow!("sqlite lock poisoned"))?;
3372
3373 if guard.is_none()
3374 && let Some(path) = &self.sqlite_path
3375 {
3376 match open_search_hydration_sqlite(path, std::time::Duration::from_secs(1)) {
3377 Ok(conn) => {
3378 *guard = Some(SendConnection(conn));
3379 }
3380 Err(err) => {
3381 tracing::debug!(
3382 error = %err,
3383 path = %path.display(),
3384 "readonly sqlite open failed for search client"
3385 );
3386 }
3387 }
3388 }
3389
3390 Ok(guard)
3391 }
3392
    /// Runs a lexical search and returns one page of hits.
    ///
    /// * `query` — raw user query; it is NFC-normalized and sanitized here.
    /// * `filters` — filters forwarded to the backend and used for cache keys.
    /// * `limit` — page size; `0` means "no limit" and is replaced by the
    ///   document count, capped by `no_limit_result_cap()` and at least 1.
    /// * `offset` — number of post-processed hits to skip.
    /// * `field_mask` — requested fields, passed through `effective_field_mask`.
    ///
    /// Backends are tried in order: the single lexical index, then federated
    /// readers, then SQLite FTS5. When a lexical backend is present it is
    /// authoritative: zero hits from it is returned as an empty result without
    /// consulting SQLite.
    ///
    /// # Errors
    ///
    /// Propagates errors from reader reloads and from the backend searches, and
    /// fails if the SQLite mutex is poisoned.
    pub fn search(
        &self,
        query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<Vec<SearchHit>> {
        use unicode_normalization::UnicodeNormalization;
        // Normalize to NFC first so equivalent Unicode spellings behave the same.
        let query: String = query.nfc().collect();
        let query: &str = &query;
        let sanitized = nfc_sanitize_query(query);
        let field_mask = effective_field_mask(field_mask);
        let limit = if limit == 0 {
            self.total_docs().min(no_limit_result_cap()).max(1)
        } else {
            limit
        };
        let can_use_cache =
            field_mask.allows_cache() && (field_mask.needs_content() || field_mask.wants_snippet());

        // Give the reader a chance to reload and record which generation this
        // search runs against.
        if let Some((reader, _)) = &self.reader {
            self.maybe_reload_reader(reader)?;
            let searcher = self.searcher_for_thread(reader);
            self.track_generation(searcher.generation().generation_id());
        } else if let Some(readers) = self.federated_readers()
            && let Some(signature) = self.maybe_reload_federated_readers(readers.as_ref())?
        {
            self.track_generation(signature);
        }

        // Prefix-cache fast path: first page only, and never for wildcard or
        // boolean queries.
        if can_use_cache
            && offset == 0
            && !query.contains('*')
            && !fs_cass_has_boolean_operators(query)
        {
            self.maybe_schedule_adaptive_query_prewarm(&sanitized, &filters);
            if let Some(cached) = self.cached_prefix_hits(&sanitized, &filters) {
                let query_terms = QueryTermsLower::from_query(&sanitized);
                let mut filtered: Vec<SearchHit> = cached
                    .into_iter()
                    .filter(|h| hit_matches_query_cached_precomputed(h, &query_terms))
                    .map(|c| c.hit.clone())
                    .collect();
                if filtered.len() >= limit {
                    filtered.truncate(limit);
                    self.metrics.inc_cache_hits();
                    self.maybe_log_cache_metrics("hit");
                    return Ok(filtered);
                }
                // Cached entry exists but cannot fill the page: fall through to
                // a real search.
                self.metrics.inc_cache_shortfall();
                self.maybe_log_cache_metrics("shortfall");
            } else {
                self.metrics.inc_cache_miss();
                self.maybe_log_cache_metrics("miss");
            }
        }

        // Over-fetch so that post-processing still leaves enough hits for the
        // requested page: 2x for small targets, 1.5x (rounded up) otherwise.
        let target_hits = offset.saturating_add(limit);
        let initial_fetch_limit = if target_hits <= 16 {
            target_hits.saturating_mul(2)
        } else {
            target_hits.saturating_mul(3).div_ceil(2)
        };
        let session_path_filter_active = !filters.session_paths.is_empty();
        // The retry fetch is 3x the target; with a session-path filter it may
        // grow up to the (capped) document count.
        let fallback_fetch_limit = if session_path_filter_active {
            self.total_docs()
                .min(no_limit_result_cap())
                .max(target_hits.saturating_mul(3))
                .max(1)
        } else {
            target_hits.saturating_mul(3)
        };

        if let Some((reader, fields)) = &self.reader {
            tracing::info!(
                backend = "tantivy",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                "search_start"
            );
            // The backend is always queried from offset 0; paging happens in
            // `postprocess_hits_page`.
            let (hits, tantivy_total_count) = self.search_tantivy(
                reader,
                fields,
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                0,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);

                // Retry with the larger limit only if the first fetch was
                // saturated (more may exist) and post-processing left too few.
                let needs_retry = deduped_len < target_hits
                    && initial_hit_count == initial_fetch_limit
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        session_path_filter_active,
                        "retrying lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy(
                        reader,
                        fields,
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        0,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    // An empty retry keeps the page built from the first fetch.
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    "lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                "tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        } else if let Some(readers) = self.federated_readers() {
            tracing::info!(
                backend = "tantivy-federated",
                query = sanitized,
                limit = initial_fetch_limit,
                offset = 0,
                shards = readers.len(),
                "search_start"
            );
            let (hits, tantivy_total_count) = self.search_tantivy_federated(
                readers.as_ref(),
                query,
                &sanitized,
                filters.clone(),
                initial_fetch_limit,
                field_mask,
            )?;
            if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                *tc = Some(tantivy_total_count);
            }
            if !hits.is_empty() {
                let initial_hit_count = hits.len();
                let page_hits = |raw_hits: Vec<SearchHit>| {
                    self.postprocess_hits_page(raw_hits, &sanitized, &filters, limit, offset)
                };

                let (mut deduped_len, mut paged_hits) = page_hits(hits);
                // "Saturated" here means every shard returned a full fetch, or,
                // with a session-path filter, at least one fetch limit's worth
                // of hits came back overall.
                let expected_federated_capacity = initial_fetch_limit.saturating_mul(readers.len());
                let federated_initial_capacity_reached = if session_path_filter_active {
                    initial_hit_count >= initial_fetch_limit.min(expected_federated_capacity)
                } else {
                    initial_hit_count == expected_federated_capacity
                };
                let needs_retry = deduped_len < target_hits
                    && federated_initial_capacity_reached
                    && initial_fetch_limit < fallback_fetch_limit;

                if needs_retry {
                    tracing::debug!(
                        query = sanitized,
                        target_hits,
                        deduped_len,
                        initial_fetch_limit,
                        fallback_fetch_limit,
                        shards = readers.len(),
                        session_path_filter_active,
                        "retrying federated lexical fetch due to dedup or session-path shortfall"
                    );
                    let (retry_hits, retry_total_count) = self.search_tantivy_federated(
                        readers.as_ref(),
                        query,
                        &sanitized,
                        filters.clone(),
                        fallback_fetch_limit,
                        field_mask,
                    )?;
                    if let Ok(mut tc) = self.last_tantivy_total_count.lock() {
                        *tc = Some(retry_total_count);
                    }
                    if !retry_hits.is_empty() {
                        (deduped_len, paged_hits) = page_hits(retry_hits);
                    }
                }

                tracing::trace!(
                    query = sanitized,
                    target_hits,
                    deduped_len,
                    returned = paged_hits.len(),
                    shards = readers.len(),
                    "federated lexical fetch complete"
                );

                if can_use_cache && offset == 0 {
                    self.put_cache(&sanitized, &filters, &paged_hits);
                }
                return Ok(paged_hits);
            }
            tracing::debug!(
                query = sanitized,
                shards = readers.len(),
                "federated tantivy returned zero hits; skipping sqlite fallback because tantivy is authoritative when available"
            );
            return Ok(Vec::new());
        }

        // SQLite path: a `*` anywhere other than the end of a token is not
        // supported here, so such queries return nothing.
        let unsupported_wildcards = sanitized.split_whitespace().any(|t| {
            let core = t.trim_end_matches('*');
            core.contains('*')
        });

        if unsupported_wildcards {
            return Ok(Vec::new());
        }

        let has_sqlite_backend = {
            let sqlite_guard = self
                .sqlite
                .lock()
                .map_err(|_| anyhow!("sqlite lock poisoned"))?;
            sqlite_guard.is_some() || self.sqlite_path.is_some()
        };

        if has_sqlite_backend {
            tracing::info!(
                backend = "sqlite-fts5",
                query = sanitized,
                limit = fallback_fetch_limit,
                offset = 0,
                "search_start"
            );
            let hits = self.search_sqlite_fts5(
                self.sqlite_path
                    .as_deref()
                    .unwrap_or_else(|| Path::new(":memory:")),
                query,
                filters.clone(),
                fallback_fetch_limit,
                0,
                field_mask,
            )?;
            let (_, paged_hits) =
                self.postprocess_hits_page(hits, &sanitized, &filters, limit, offset);

            if can_use_cache && offset == 0 {
                self.put_cache(&sanitized, &filters, &paged_hits);
            }
            return Ok(paged_hits);
        }

        // NOTE(review): this is the only `search_start` event that logs the
        // normalized `query` rather than `sanitized` — confirm that is intended.
        tracing::info!(backend = "none", query = query, "search_start");
        Ok(Vec::new())
    }
3700
    /// Installs the semantic search context for a single vector index.
    ///
    /// Shorthand for [`Self::set_semantic_indexes_context`] with a one-element
    /// index list; see that method for validation and errors.
    pub fn set_semantic_context(
        &self,
        embedder: Arc<dyn Embedder>,
        fs_semantic_index: VectorIndex,
        filter_maps: SemanticFilterMaps,
        roles: Option<HashSet<u8>>,
        ann_path: Option<PathBuf>,
    ) -> Result<()> {
        self.set_semantic_indexes_context(
            embedder,
            vec![fs_semantic_index],
            filter_maps,
            roles,
            ann_path,
        )
    }
3717
3718 pub fn set_semantic_indexes_context(
3719 &self,
3720 embedder: Arc<dyn Embedder>,
3721 fs_semantic_indexes: Vec<VectorIndex>,
3722 filter_maps: SemanticFilterMaps,
3723 roles: Option<HashSet<u8>>,
3724 ann_path: Option<PathBuf>,
3725 ) -> Result<()> {
3726 if fs_semantic_indexes.is_empty() {
3727 bail!("semantic context requires at least one vector index");
3728 }
3729
3730 let fs_semantic_indexes = fs_semantic_indexes
3731 .into_iter()
3732 .map(|index| {
3733 let embedder_id = index.embedder_id().to_string();
3734 let dimension = index.dimension();
3735 if embedder_id != embedder.id() {
3736 bail!(
3737 "embedder mismatch: index uses {}, embedder is {}",
3738 embedder_id,
3739 embedder.id()
3740 );
3741 }
3742 if dimension != embedder.dimension() {
3743 bail!(
3744 "embedder dimension mismatch: index uses {}, embedder is {}",
3745 dimension,
3746 embedder.dimension()
3747 );
3748 }
3749 Ok(Arc::new(index))
3750 })
3751 .collect::<Result<Vec<_>>>()?;
3752 let fs_semantic_index = Arc::clone(&fs_semantic_indexes[0]);
3753 let shard_count = fs_semantic_indexes.len();
3754 let ann_path = if shard_count == 1 { ann_path } else { None };
3755 let embedder_id = fs_semantic_index.embedder_id().to_string();
3756 let dimension = fs_semantic_index.dimension();
3757 let fs_semantic_indexes = Arc::new(fs_semantic_indexes);
3758
3759 let capacity = NonZeroUsize::new(100).ok_or_else(|| anyhow!("invalid cache size"))?;
3760 let context_token = Arc::new(());
3761 let mut state_guard = self
3762 .semantic
3763 .lock()
3764 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3765 *state_guard = Some(SemanticSearchState {
3766 context_token,
3767 embedder,
3768 fs_semantic_index,
3769 fs_semantic_indexes,
3770 fs_ann_index: None,
3771 ann_path,
3772 fs_in_memory_two_tier_index: None,
3773 in_memory_two_tier_unavailable: InMemoryTwoTierUnavailable::default(),
3774 progressive_context: None,
3775 progressive_context_unavailable: false,
3776 filter_maps,
3777 roles,
3778 query_cache: QueryCache::new(embedder_id.as_str(), capacity),
3779 });
3780 if shard_count > 1 {
3781 tracing::info!(
3782 shard_count,
3783 dimension,
3784 embedder = embedder_id,
3785 "semantic search context loaded sharded vector generation"
3786 );
3787 }
3788 Ok(())
3789 }
3790
3791 pub fn clear_semantic_context(&self) -> Result<()> {
3792 let mut guard = self
3793 .semantic
3794 .lock()
3795 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3796 *guard = None;
3797 Ok(())
3798 }
3799
3800 fn semantic_context_matches(&self, context_token: &Arc<()>) -> Result<bool> {
3801 let guard = self
3802 .semantic
3803 .lock()
3804 .map_err(|_| anyhow!("semantic lock poisoned"))?;
3805 Ok(guard
3806 .as_ref()
3807 .is_some_and(|state| Arc::ptr_eq(&state.context_token, context_token)))
3808 }
3809
/// Returns the embedding for `canonical`, served from the query cache when possible.
///
/// The embedder runs while the semantic lock is released. If the context token has
/// changed by the time the lock is re-acquired, the freshly computed embedding is
/// discarded and the lookup restarts against the current state.
///
/// # Errors
///
/// Fails if the semantic lock is poisoned, if no semantic state is installed, or if
/// `embed_sync` reports an error.
fn semantic_query_embedding(&self, canonical: &str) -> Result<SemanticQueryEmbedding> {
    loop {
        // Phase 1: under the lock, try the cache; otherwise capture the embedder and token.
        let (embedder, context_token) = {
            let mut guard = self
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard.as_mut().ok_or_else(|| {
                anyhow!("semantic search unavailable (no embedder or vector index)")
            })?;
            if let Some(hit) = state
                .query_cache
                .get_cached(state.embedder.as_ref(), canonical)
            {
                return Ok(SemanticQueryEmbedding {
                    context_token: Arc::clone(&state.context_token),
                    vector: hit,
                });
            }
            (
                Arc::clone(&state.embedder),
                Arc::clone(&state.context_token),
            )
        };

        // Phase 2: the guard above has been dropped, so embedding does not hold the lock.
        let embedding = embedder
            .embed_sync(canonical)
            .map_err(|e| anyhow!("embedding failed: {e}"))?;

        // Phase 3: re-lock and publish, unless the context was replaced in the meantime.
        let mut guard = self
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        let state = guard.as_mut().ok_or_else(|| {
            anyhow!("semantic search unavailable (no embedder or vector index)")
        })?;
        if !Arc::ptr_eq(&state.context_token, &context_token) {
            // A different context is installed now; start over with its embedder.
            continue;
        }
        // Another caller may have cached the same query while the lock was released.
        if let Some(hit) = state
            .query_cache
            .get_cached(state.embedder.as_ref(), canonical)
        {
            return Ok(SemanticQueryEmbedding {
                context_token,
                vector: hit,
            });
        }
        state
            .query_cache
            .store(state.embedder.as_ref(), canonical, embedding.clone());
        return Ok(SemanticQueryEmbedding {
            context_token,
            vector: embedding,
        });
    }
}
3867
/// Returns the in-memory two-tier index usable for `tier_mode`, building it on first use.
///
/// Returns `Ok(None)` when the mode is already recorded as unavailable, when the build
/// produces no index, or when the built index does not support `tier_mode`; the last two
/// cases also record the mode as unavailable. The build runs with the semantic lock
/// released, and the loop restarts if the context token changed meanwhile.
///
/// # Errors
///
/// Fails if the semantic lock is poisoned or no semantic state is installed.
fn in_memory_two_tier_index(
    &self,
    tier_mode: SemanticTierMode,
) -> Result<Option<Arc<FsInMemoryTwoTierIndex>>> {
    loop {
        // Under the lock: answer from cached state if possible, else capture build inputs.
        let (ann_path, embedder_id, context_token) = {
            let mut guard = self
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard.as_mut().ok_or_else(|| {
                anyhow!("semantic search unavailable (no embedder or vector index)")
            })?;
            if let Some(index) = state.fs_in_memory_two_tier_index.as_ref()
                && two_tier_index_supports_mode(index.as_ref(), tier_mode)
            {
                return Ok(Some(Arc::clone(index)));
            }
            if state
                .in_memory_two_tier_unavailable
                .is_known_unavailable(tier_mode)
            {
                return Ok(None);
            }
            (
                state.ann_path.clone(),
                state.embedder.id().to_string(),
                Arc::clone(&state.context_token),
            )
        };

        // The guard has been dropped here, so the build does not hold the lock.
        let index = build_in_memory_two_tier_index(ann_path.clone(), &embedder_id, tier_mode);

        let mut guard = self
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        let state = guard.as_mut().ok_or_else(|| {
            anyhow!("semantic search unavailable (no embedder or vector index)")
        })?;
        // Prefer an index that was installed while the lock was released.
        if let Some(existing) = state.fs_in_memory_two_tier_index.as_ref()
            && two_tier_index_supports_mode(existing.as_ref(), tier_mode)
        {
            return Ok(Some(Arc::clone(existing)));
        }
        if !Arc::ptr_eq(&state.context_token, &context_token) {
            // The context was replaced; the index built above belongs to the old one.
            continue;
        }
        let Some(index) = index else {
            state
                .in_memory_two_tier_unavailable
                .mark_unavailable(tier_mode);
            return Ok(None);
        };
        if !two_tier_index_supports_mode(index.as_ref(), tier_mode) {
            state
                .in_memory_two_tier_unavailable
                .mark_unavailable(tier_mode);
            return Ok(None);
        }
        state.fs_in_memory_two_tier_index = Some(Arc::clone(&index));
        // An index with a quality tier clears every recorded unavailability; otherwise
        // only the `fast_only` flag is cleared.
        if index.has_quality_index() {
            state.in_memory_two_tier_unavailable = InMemoryTwoTierUnavailable::default();
        } else {
            state.in_memory_two_tier_unavailable.fast_only = false;
        }
        return Ok(Some(index));
    }
}
3937
/// Returns the HNSW index for the current semantic state, opening it on first use.
///
/// The index is opened via `open_fs_semantic_ann_index` with the semantic lock released.
/// If another caller installed an index in the meantime, that one is returned; if the
/// ANN path or the semantic index changed, the loop restarts.
///
/// # Errors
///
/// Fails if the semantic lock is poisoned, if no semantic state is installed, if the
/// state has no `ann_path`, or if opening the index fails.
fn ann_index(&self) -> Result<Arc<FsHnswIndex>> {
    loop {
        // Under the lock: return the cached index, or capture what is needed to open one.
        let (ann_path, fs_semantic_index) = {
            let mut guard = self
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard.as_mut().ok_or_else(|| {
                anyhow!("semantic search unavailable (no embedder or vector index)")
            })?;
            if let Some(index) = state.fs_ann_index.as_ref() {
                return Ok(Arc::clone(index));
            }
            let ann_path = state.ann_path.clone().ok_or_else(|| {
                anyhow!(
                    "approximate search unavailable: HNSW index missing (run 'cass index --semantic --build-hnsw')"
                )
            })?;
            (ann_path, Arc::clone(&state.fs_semantic_index))
        };

        // The guard has been dropped here, so opening the index does not hold the lock.
        let ann = Arc::new(open_fs_semantic_ann_index(
            fs_semantic_index.as_ref(),
            &ann_path,
        )?);

        let mut guard = self
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        let state = guard.as_mut().ok_or_else(|| {
            anyhow!("semantic search unavailable (no embedder or vector index)")
        })?;
        // Another caller may have installed an index while the lock was released.
        if let Some(existing) = state.fs_ann_index.as_ref() {
            return Ok(Arc::clone(existing));
        }
        // Only publish if the state still refers to the same path and semantic index.
        if state.ann_path.as_ref() != Some(&ann_path)
            || !Arc::ptr_eq(&state.fs_semantic_index, &fs_semantic_index)
        {
            continue;
        }
        state.fs_ann_index = Some(Arc::clone(&ann));
        return Ok(ann);
    }
}
3983
3984 fn collapse_semantic_results(
3985 best_by_message: HashMap<u64, VectorSearchResult>,
3986 fetch_limit: usize,
3987 ) -> Vec<VectorSearchResult> {
3988 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
3989 collapsed.sort_by(|a, b| {
3990 b.score
3991 .total_cmp(&a.score)
3992 .then_with(|| a.message_id.cmp(&b.message_id))
3993 });
3994 if collapsed.len() > fetch_limit {
3995 collapsed.truncate(fetch_limit);
3996 }
3997 collapsed
3998 }
3999
4000 fn semantic_exact_candidate_limit(fetch_limit: usize, record_count: usize) -> usize {
4001 fetch_limit
4002 .saturating_mul(SEMANTIC_EXACT_CHUNK_OVERFETCH_MULTIPLIER)
4003 .max(fetch_limit)
4004 .min(record_count)
4005 }
4006
4007 fn semantic_window_may_omit_competitor(
4008 collapsed: &[VectorSearchResult],
4009 fetch_limit: usize,
4010 max_omitted_score: Option<f32>,
4011 ) -> bool {
4012 if fetch_limit == 0 {
4013 return false;
4014 }
4015 let Some(max_omitted_score) = max_omitted_score else {
4016 return false;
4017 };
4018 if collapsed.len() < fetch_limit {
4019 return true;
4020 }
4021 let Some(last_in_requested_window) = collapsed.get(fetch_limit - 1) else {
4022 return true;
4023 };
4024 !last_in_requested_window
4025 .score
4026 .total_cmp(&max_omitted_score)
4027 .is_gt()
4028 }
4029
4030 fn record_fs_semantic_hit(
4031 best_by_message: &mut HashMap<u64, VectorSearchResult>,
4032 hit: &FsVectorHit,
4033 ) {
4034 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4035 return;
4036 };
4037 best_by_message
4038 .entry(parsed.message_id)
4039 .and_modify(|entry| {
4040 if hit.score > entry.score {
4041 entry.score = hit.score;
4042 entry.chunk_idx = parsed.chunk_idx;
4043 }
4044 })
4045 .or_insert(VectorSearchResult {
4046 message_id: parsed.message_id,
4047 chunk_idx: parsed.chunk_idx,
4048 score: hit.score,
4049 });
4050 }
4051
4052 fn search_exact_semantic_indexes(
4053 context: &SemanticCandidateContext,
4054 embedding: &[f32],
4055 fetch_limit: usize,
4056 fs_filter: Option<&dyn FsSearchFilter>,
4057 ) -> Result<(Vec<VectorSearchResult>, SemanticCandidateRetryState)> {
4058 if context.fs_semantic_indexes.len() == 1 {
4059 let record_count = context.fs_semantic_index.record_count();
4060 let candidate_limit = Self::semantic_exact_candidate_limit(fetch_limit, record_count);
4061 let fs_hits = context
4062 .fs_semantic_index
4063 .search_top_k(embedding, candidate_limit, fs_filter)
4064 .map_err(|err| anyhow!("frankensearch semantic search failed: {err}"))?;
4065 let mut best_by_message = HashMap::with_capacity(fs_hits.len());
4066 for hit in &fs_hits {
4067 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4068 }
4069 let collapsed = Self::collapse_semantic_results(best_by_message, candidate_limit);
4070 let has_more_candidates =
4071 fs_hits.len() >= candidate_limit && candidate_limit < record_count;
4072 let max_omitted_score = if has_more_candidates {
4073 fs_hits.last().map(|hit| hit.score)
4074 } else {
4075 None
4076 };
4077 let exact_window_may_omit_competitor = Self::semantic_window_may_omit_competitor(
4078 &collapsed,
4079 fetch_limit,
4080 max_omitted_score,
4081 );
4082 return Ok((
4083 collapsed,
4084 SemanticCandidateRetryState {
4085 has_more_candidates,
4086 exact_window_may_omit_competitor,
4087 },
4088 ));
4089 }
4090
4091 let mut best_by_message = HashMap::new();
4092 let mut raw_hits = 0usize;
4093 let mut max_omitted_score: Option<f32> = None;
4094 let mut has_more_candidates = false;
4095 for index in context.fs_semantic_indexes.iter() {
4096 let shard_record_count = index.record_count();
4097 let shard_limit = Self::semantic_exact_candidate_limit(fetch_limit, shard_record_count);
4103 if shard_limit == 0 {
4104 continue;
4105 }
4106 let fs_hits = index
4107 .search_top_k(embedding, shard_limit, fs_filter)
4108 .map_err(|err| anyhow!("frankensearch sharded semantic search failed: {err}"))?;
4109 if fs_hits.len() >= shard_limit
4110 && shard_limit < shard_record_count
4111 && let Some(last_hit) = fs_hits.last()
4112 {
4113 has_more_candidates = true;
4114 max_omitted_score = Some(
4115 max_omitted_score
4116 .map(|current| current.max(last_hit.score))
4117 .unwrap_or(last_hit.score),
4118 );
4119 }
4120 raw_hits = raw_hits.saturating_add(fs_hits.len());
4121 best_by_message.reserve(fs_hits.len());
4122 for hit in &fs_hits {
4123 Self::record_fs_semantic_hit(&mut best_by_message, hit);
4124 }
4125 }
4126 let candidate_return_limit = Self::semantic_exact_candidate_limit(fetch_limit, raw_hits);
4127 let collapsed = Self::collapse_semantic_results(best_by_message, candidate_return_limit);
4128 let exact_window_may_omit_competitor =
4129 Self::semantic_window_may_omit_competitor(&collapsed, fetch_limit, max_omitted_score);
4130 tracing::debug!(
4131 shard_count = context.fs_semantic_indexes.len(),
4132 raw_hits,
4133 returned = collapsed.len(),
4134 "semantic sharded exact merge complete"
4135 );
4136 Ok((
4137 collapsed,
4138 SemanticCandidateRetryState {
4139 has_more_candidates,
4140 exact_window_may_omit_competitor,
4141 },
4142 ))
4143 }
4144
/// Retrieves semantic candidates for `embedding` using the strategy selected by `request`.
///
/// A `SemanticFilter` is built from `filters` and the context's filter maps; when the
/// context carries `roles`, they are applied to the filter via `with_roles`. One of three
/// paths then runs:
///
/// 1. Two-tier mode without `approximate`: uses the supplied in-memory two-tier index, or
///    falls back to the exact search when none was supplied.
/// 2. `approximate`: queries the supplied HNSW index and applies the filter to the
///    returned hits afterwards.
/// 3. Otherwise: the exact search.
///
/// Returns the collapsed per-message results, the retry state, and ANN statistics (only
/// on the approximate path).
///
/// # Errors
///
/// Fails if the filter cannot be built, if the approximate path is requested without an
/// ANN index in `request`, or if the underlying search fails.
fn search_semantic_candidates(
    &self,
    context: &SemanticCandidateContext,
    embedding: &[f32],
    filters: &SearchFilters,
    request: SemanticCandidateSearchRequest<'_>,
) -> Result<(
    Vec<VectorSearchResult>,
    SemanticCandidateRetryState,
    Option<crate::search::ann_index::AnnSearchStats>,
)> {
    let mut semantic_filter =
        SemanticFilter::from_search_filters(filters, &context.filter_maps)?;
    if let Some(roles) = context.roles.clone() {
        semantic_filter = semantic_filter.with_roles(Some(roles));
    }

    // Path 1: two-tier search (only when approximate search was not requested).
    if request.tier_mode.wants_two_tier() && !request.approximate {
        let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
        if let Some(two_tier_index) = request.in_memory_two_tier_index {
            let config = request.tier_mode.to_frankensearch_config();
            let searcher = FsSyncTwoTierSearcher::new(Arc::clone(two_tier_index), config);
            let (tier_hits, metrics) = searcher
                .search_collect_with_filter(embedding, request.fetch_limit, fs_filter)
                .map_err(|err| {
                    anyhow!("frankensearch two-tier semantic search failed: {err}")
                })?;

            tracing::debug!(
                tier_mode = ?request.tier_mode,
                phase1_ms = metrics.phase1_total_ms,
                phase2_ms = metrics.phase2_total_ms,
                skip_reason = ?metrics.skip_reason,
                returned = tier_hits.len(),
                "semantic two-tier search executed"
            );

            // Keep only the best-scoring chunk per message; unparseable doc ids are skipped.
            let mut best_by_message: HashMap<u64, VectorSearchResult> =
                HashMap::with_capacity(tier_hits.len());
            for hit in tier_hits.iter() {
                let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
                    continue;
                };
                best_by_message
                    .entry(parsed.message_id)
                    .and_modify(|entry| {
                        if hit.score > entry.score {
                            entry.score = hit.score;
                            entry.chunk_idx = parsed.chunk_idx;
                        }
                    })
                    .or_insert(VectorSearchResult {
                        message_id: parsed.message_id,
                        chunk_idx: parsed.chunk_idx,
                        score: hit.score,
                    });
            }

            return Ok((
                Self::collapse_semantic_results(best_by_message, request.fetch_limit),
                SemanticCandidateRetryState {
                    // A full page of raw hits is treated as "there may be more".
                    has_more_candidates: tier_hits.len() >= request.fetch_limit,
                    exact_window_may_omit_competitor: false,
                },
                None,
            ));
        }

        tracing::debug!(
            tier_mode = ?request.tier_mode,
            "two-tier semantic unavailable; falling back to exact single-tier search"
        );

        let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
        let (results, truncated) = Self::search_exact_semantic_indexes(
            context,
            embedding,
            request.fetch_limit,
            fs_filter,
        )?;
        return Ok((results, truncated, None));
    }

    // Path 2: approximate (HNSW) search.
    if request.approximate {
        if request.tier_mode.wants_two_tier() {
            tracing::debug!(
                tier_mode = ?request.tier_mode,
                "approximate search requested; bypassing two-tier mode"
            );
        }

        let ann = request
            .ann_index
            .ok_or_else(|| anyhow!("HNSW index failed to initialize"))?;
        // Over-fetch by ANN_CANDIDATE_MULTIPLIER, never requesting fewer than fetch_limit.
        let candidate = request
            .fetch_limit
            .saturating_mul(ANN_CANDIDATE_MULTIPLIER)
            .max(request.fetch_limit);
        // Use at least the default `ef`, raised to the candidate count when that is larger.
        let ef = FS_HNSW_DEFAULT_EF_SEARCH.max(candidate);
        let (ann_results, search_stats) =
            ann.knn_search_with_stats(embedding, candidate, ef)
                .map_err(|err| anyhow!("frankensearch approximate search failed: {err}"))?;
        // Copy the library's statistics into the crate-local stats type.
        let ann_stats = Some(crate::search::ann_index::AnnSearchStats {
            index_size: search_stats.index_size,
            dimension: search_stats.dimension,
            ef_search: search_stats.ef_search,
            k_requested: search_stats.k_requested,
            k_returned: search_stats.k_returned,
            search_time_us: search_stats.search_time_us,
            estimated_recall: search_stats.estimated_recall as f32,
            is_approximate: search_stats.is_approximate,
        });

        let fs_filter = semantic_filter_as_search_filter(&semantic_filter);

        // The filter is applied here, after the ANN search has returned its hits.
        let mut best_by_message: HashMap<u64, VectorSearchResult> =
            HashMap::with_capacity(ann_results.len());
        for hit in ann_results.iter() {
            if let Some(filter) = fs_filter
                && !filter.matches(&hit.doc_id, None)
            {
                continue;
            }
            let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
                continue;
            };
            best_by_message
                .entry(parsed.message_id)
                .and_modify(|entry| {
                    if hit.score > entry.score {
                        entry.score = hit.score;
                        entry.chunk_idx = parsed.chunk_idx;
                    }
                })
                .or_insert(VectorSearchResult {
                    message_id: parsed.message_id,
                    chunk_idx: parsed.chunk_idx,
                    score: hit.score,
                });
        }

        return Ok((
            Self::collapse_semantic_results(best_by_message, request.fetch_limit),
            SemanticCandidateRetryState {
                // Measured on the unfiltered ANN result count.
                has_more_candidates: ann_results.len() >= candidate,
                exact_window_may_omit_competitor: false,
            },
            ann_stats,
        ));
    }

    // Path 3: plain exact search.
    let fs_filter = semantic_filter_as_search_filter(&semantic_filter);
    let (results, truncated) = Self::search_exact_semantic_indexes(
        context,
        embedding,
        request.fetch_limit,
        fs_filter,
    )?;
    Ok((results, truncated, None))
}
4305
4306 pub fn can_progressively_refine(&self) -> bool {
4307 self.progressive_context()
4308 .map(|context| {
4309 context.as_ref().is_some_and(|ctx| {
4310 ctx.quality_embedder.is_some() && ctx.index.has_quality_index()
4311 })
4312 })
4313 .unwrap_or(false)
4314 }
4315
/// Returns the progressive two-tier context for the current semantic state, building it
/// on first use.
///
/// Returns `Ok(None)` when the context is already recorded as unavailable or when the
/// build yields nothing (which is then recorded). The build runs with the semantic lock
/// released. After it finishes, a context installed by another caller takes precedence,
/// and the loop restarts if the context token changed.
///
/// # Errors
///
/// Fails if the semantic lock is poisoned, if no semantic state is installed, or if
/// `build_progressive_context` fails while the same context is still installed and no
/// other caller has provided a progressive context.
fn progressive_context(&self) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
    loop {
        // Under the lock: answer from cached state if possible, else capture build inputs.
        let (ann_path, embedder, context_token) = {
            let mut guard = self
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard.as_mut().ok_or_else(|| {
                anyhow!("semantic search unavailable (no embedder or vector index)")
            })?;
            if let Some(context) = state.progressive_context.as_ref() {
                return Ok(Some(Arc::clone(context)));
            }
            if state.progressive_context_unavailable {
                return Ok(None);
            }
            (
                state.ann_path.clone(),
                Arc::clone(&state.embedder),
                Arc::clone(&state.context_token),
            )
        };

        // The guard has been dropped here, so the build does not hold the lock.
        let context = match self.build_progressive_context(
            ann_path.clone(),
            embedder,
            Arc::clone(&context_token),
        ) {
            Ok(context) => context,
            Err(err) => {
                // Only surface the error if nobody else succeeded and the context is
                // still the one the build was started for.
                let mut guard = self
                    .semantic
                    .lock()
                    .map_err(|_| anyhow!("semantic lock poisoned"))?;
                let state = guard.as_mut().ok_or_else(|| {
                    anyhow!("semantic search unavailable (no embedder or vector index)")
                })?;
                if let Some(existing) = state.progressive_context.as_ref() {
                    return Ok(Some(Arc::clone(existing)));
                }
                if !Arc::ptr_eq(&state.context_token, &context_token) {
                    continue;
                }
                return Err(err);
            }
        };

        // The build completed without producing a context: record that, unless the state
        // changed while the lock was released.
        let Some(context) = context else {
            let mut guard = self
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard.as_mut().ok_or_else(|| {
                anyhow!("semantic search unavailable (no embedder or vector index)")
            })?;
            if let Some(existing) = state.progressive_context.as_ref() {
                return Ok(Some(Arc::clone(existing)));
            }
            if !Arc::ptr_eq(&state.context_token, &context_token) {
                continue;
            }
            state.progressive_context_unavailable = true;
            return Ok(None);
        };

        // Publish the freshly built context.
        let mut guard = self
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        let state = guard.as_mut().ok_or_else(|| {
            anyhow!("semantic search unavailable (no embedder or vector index)")
        })?;
        if let Some(existing) = state.progressive_context.as_ref() {
            return Ok(Some(Arc::clone(existing)));
        }
        if !Arc::ptr_eq(&state.context_token, &context_token) {
            continue;
        }
        state.progressive_context_unavailable = false;
        state.progressive_context = Some(Arc::clone(&context));
        return Ok(Some(context));
    }
}
4399
4400 fn build_progressive_context(
4401 &self,
4402 ann_path: Option<PathBuf>,
4403 embedder: Arc<dyn Embedder>,
4404 context_token: Arc<()>,
4405 ) -> Result<Option<Arc<ProgressiveTwoTierContext>>> {
4406 let Some(index_dir) = ann_path
4407 .as_ref()
4408 .and_then(|path| path.parent().map(Path::to_path_buf))
4409 else {
4410 return Ok(None);
4411 };
4412
4413 let fast_path = {
4414 let explicit = index_dir.join("vector.fast.idx");
4415 if explicit.is_file() {
4416 explicit
4417 } else {
4418 let fallback = index_dir.join("vector.idx");
4419 if fallback.is_file() {
4420 fallback
4421 } else {
4422 return Ok(None);
4423 }
4424 }
4425 };
4426 let quality_path = index_dir.join("vector.quality.idx");
4427 if !quality_path.is_file() {
4428 return Ok(None);
4429 }
4430
4431 let fast_index = FsVectorIndex::open(&fast_path)
4432 .map_err(|err| anyhow!("open fast-tier index failed: {err}"))?;
4433 let quality_index = FsVectorIndex::open(&quality_path)
4434 .map_err(|err| anyhow!("open quality-tier index failed: {err}"))?;
4435 let index = Arc::new(
4436 FsTwoTierIndex::open(&index_dir, frankensearch_two_tier_config())
4437 .map_err(|err| anyhow!("open progressive two-tier index failed: {err}"))?,
4438 );
4439
4440 let fast_embedder = self.load_embedder_for_progressive_id(
4441 &embedder,
4442 fast_index.embedder_id(),
4443 fast_index.dimension(),
4444 )?;
4445 let fast_embedder: Arc<dyn frankensearch::Embedder> = Arc::new(FsSyncEmbedderAdapter(
4446 SharedCassSyncEmbedder::new(fast_embedder),
4447 ));
4448 let quality_embedder = Some(self.load_embedder_for_progressive_id(
4449 &embedder,
4450 quality_index.embedder_id(),
4451 quality_index.dimension(),
4452 )?);
4453 let quality_embedder = quality_embedder.map(|embedder| {
4454 Arc::new(FsSyncEmbedderAdapter(SharedCassSyncEmbedder::new(embedder)))
4455 as Arc<dyn frankensearch::Embedder>
4456 });
4457
4458 Ok(Some(Arc::new(ProgressiveTwoTierContext {
4459 context_token,
4460 index,
4461 fast_embedder,
4462 quality_embedder,
4463 })))
4464 }
4465
4466 fn load_embedder_for_progressive_id(
4467 &self,
4468 current_embedder: &Arc<dyn Embedder>,
4469 embedder_id: &str,
4470 dimension: usize,
4471 ) -> Result<Arc<dyn Embedder>> {
4472 if current_embedder.id() == embedder_id {
4473 return Ok(Arc::clone(current_embedder));
4474 }
4475
4476 if let Some(dim) = embedder_id.strip_prefix("fnv1a-")
4477 && let Ok(parsed) = dim.parse::<usize>()
4478 {
4479 return Ok(Arc::new(crate::search::hash_embedder::HashEmbedder::new(
4480 parsed.max(dimension),
4481 )));
4482 }
4483
4484 if let Some(embedder_name) =
4485 crate::search::fastembed_embedder::FastEmbedder::canonical_name(embedder_id)
4486 {
4487 let data_dir = self
4488 .sqlite_path
4489 .as_ref()
4490 .and_then(|path| path.parent())
4491 .ok_or_else(|| anyhow!("cannot resolve data dir for progressive embedder load"))?;
4492 let embedder = crate::search::fastembed_embedder::FastEmbedder::load_by_name(
4493 data_dir,
4494 embedder_name,
4495 )
4496 .with_context(|| format!("loading FastEmbed model for {embedder_name}"))?;
4497 if embedder.dimension() != dimension {
4498 bail!(
4499 "progressive embedder dimension mismatch: {} index expects {}, model has {}",
4500 embedder_id,
4501 dimension,
4502 embedder.dimension()
4503 );
4504 }
4505 return Ok(Arc::new(embedder));
4506 }
4507
4508 bail!("unsupported progressive embedder id: {embedder_id}");
4509 }
4510
4511 fn resolve_semantic_doc_ids_for_hits(
4512 &self,
4513 hits: &[SearchHit],
4514 ) -> Result<Vec<Option<ResolvedSemanticDocId>>> {
4515 if hits.is_empty() {
4516 return Ok(Vec::new());
4517 }
4518
4519 let lookup_keys: Vec<Option<ProgressiveLookupKey>> = hits
4520 .iter()
4521 .map(|hit| {
4522 let idx = hit
4523 .line_number
4524 .and_then(|line| line.checked_sub(1))
4525 .map(i64::try_from)
4526 .transpose()
4527 .ok()
4528 .flatten()?;
4529 Some((
4530 normalized_search_hit_source_id(hit),
4531 hit.source_path.clone(),
4532 hit.conversation_id,
4533 hit.title.trim().to_string(),
4534 idx,
4535 hit.created_at,
4536 hit.content_hash,
4537 ))
4538 })
4539 .collect();
4540
4541 let mut seen_exact = HashSet::new();
4542 let mut exact_query_keys = Vec::new();
4543 let mut seen_fallback = HashSet::new();
4544 let mut fallback_query_keys = Vec::new();
4545 for (source_id, source_path, conversation_id, _title, idx, _created_at, _content_hash) in
4546 lookup_keys.iter().flatten()
4547 {
4548 if let Some(conversation_id) = conversation_id {
4549 let query_key: ProgressiveExactQueryKey = (*conversation_id, *idx);
4550 if seen_exact.insert(query_key) {
4551 exact_query_keys.push(query_key);
4552 }
4553 } else {
4554 let query_key: ProgressiveFallbackQueryKey =
4555 (source_id.clone(), source_path.clone(), *idx);
4556 if seen_fallback.insert(query_key.clone()) {
4557 fallback_query_keys.push(query_key);
4558 }
4559 }
4560 }
4561
4562 if exact_query_keys.is_empty() && fallback_query_keys.is_empty() {
4563 return Ok(vec![None; hits.len()]);
4564 }
4565
4566 let sqlite_guard = self.sqlite_guard()?;
4567 let conn = sqlite_guard
4568 .as_ref()
4569 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4570
4571 let mut resolved_by_key = HashMap::new();
4572 let normalized_source_sql =
4573 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4574
4575 const CHUNK_SIZE: usize = 300;
4576 for chunk in exact_query_keys.chunks(CHUNK_SIZE) {
4577 let mut sql = String::from("SELECT c.id, ");
4578 sql.push_str(&normalized_source_sql);
4579 sql.push_str(
4580 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4581 FROM messages m
4582 JOIN conversations c ON m.conversation_id = c.id
4583 LEFT JOIN sources s ON c.source_id = s.id
4584 WHERE ",
4585 );
4586 let mut params = Vec::with_capacity(chunk.len().saturating_mul(2));
4587 for (idx, (conversation_id, line_idx)) in chunk.iter().enumerate() {
4588 if idx > 0 {
4589 sql.push_str(" OR ");
4590 }
4591 sql.push_str("(c.id = ? AND m.idx = ?)");
4592 params.push(ParamValue::from(*conversation_id));
4593 params.push(ParamValue::from(*line_idx));
4594 }
4595
4596 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4597 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4598 let conversation_id: i64 = row.get_typed(0)?;
4599 let source_id: String = row.get_typed(1)?;
4600 let source_path: String = row.get_typed(2)?;
4601 let idx: i64 = row.get_typed(3)?;
4602 let message_id_raw: i64 = row.get_typed(4)?;
4603 let agent_id_raw: Option<i64> = row.get_typed(5)?;
4606 let workspace_id_raw: Option<i64> = row.get_typed(6)?;
4607 let role_raw: String = row.get_typed(7)?;
4608 let created_at_ms: Option<i64> = row.get_typed(8)?;
4609 let content: String = row.get_typed(9)?;
4610 let title: Option<String> = row.get_typed(10)?;
4611
4612 let canonical = canonicalize_for_embedding(&content);
4613 if canonical.is_empty() {
4614 return Ok(None);
4615 }
4616
4617 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4618 std::io::Error::other("message id out of range for progressive doc_id")
4619 })?;
4620 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4621 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4622 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4623 let doc_id = SemanticDocId {
4624 message_id,
4625 chunk_idx: 0,
4626 agent_id,
4627 workspace_id,
4628 source_id: crc32fast::hash(source_id.as_bytes()),
4629 role,
4630 created_at_ms: created_at_ms.unwrap_or(0),
4631 content_hash: Some(content_hash(&canonical)),
4632 }
4633 .to_doc_id_string();
4634 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4635 let lookup_key = (
4636 source_id,
4637 source_path.clone(),
4638 Some(conversation_id),
4639 title.unwrap_or_default().trim().to_string(),
4640 idx,
4641 created_at_ms,
4642 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4643 );
4644
4645 Ok(Some((
4646 lookup_key,
4647 ResolvedSemanticDocId { message_id, doc_id },
4648 )))
4649 })?;
4650
4651 for row in chunk_rows.into_iter().flatten() {
4652 resolved_by_key.insert(row.0, row.1);
4653 }
4654 }
4655
4656 for chunk in fallback_query_keys.chunks(CHUNK_SIZE) {
4657 let mut sql = String::from("SELECT ");
4658 sql.push_str(&normalized_source_sql);
4659 sql.push_str(
4660 ", c.source_path, m.idx, m.id, c.agent_id, c.workspace_id, m.role, m.created_at, m.content, c.title
4661 FROM messages m
4662 JOIN conversations c ON m.conversation_id = c.id
4663 LEFT JOIN sources s ON c.source_id = s.id
4664 WHERE ",
4665 );
4666 let mut params = Vec::with_capacity(chunk.len().saturating_mul(3));
4667 for (idx, (source_id, source_path, line_idx)) in chunk.iter().enumerate() {
4668 if idx > 0 {
4669 sql.push_str(" OR ");
4670 }
4671 sql.push_str(&format!(
4672 "({normalized_source_sql} = ? AND c.source_path = ? AND m.idx = ?)"
4673 ));
4674 params.push(ParamValue::from(normalize_search_source_filter_value(
4675 source_id,
4676 )));
4677 params.push(ParamValue::from(source_path.clone()));
4678 params.push(ParamValue::from(*line_idx));
4679 }
4680
4681 let chunk_rows: Vec<ResolvedSemanticLookupRow> =
4682 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4683 let source_id: String = row.get_typed(0)?;
4684 let source_path: String = row.get_typed(1)?;
4685 let idx: i64 = row.get_typed(2)?;
4686 let message_id_raw: i64 = row.get_typed(3)?;
4687 let agent_id_raw: Option<i64> = row.get_typed(4)?;
4690 let workspace_id_raw: Option<i64> = row.get_typed(5)?;
4691 let role_raw: String = row.get_typed(6)?;
4692 let created_at_ms: Option<i64> = row.get_typed(7)?;
4693 let content: String = row.get_typed(8)?;
4694 let title: Option<String> = row.get_typed(9)?;
4695
4696 let canonical = canonicalize_for_embedding(&content);
4697 if canonical.is_empty() {
4698 return Ok(None);
4699 }
4700
4701 let message_id = u64::try_from(message_id_raw).map_err(|_| {
4702 std::io::Error::other("message id out of range for progressive doc_id")
4703 })?;
4704 let agent_id = semantic_doc_component_id_from_db(agent_id_raw);
4705 let workspace_id = semantic_doc_component_id_from_db(workspace_id_raw);
4706 let role = role_code_from_str(&role_raw).unwrap_or(ROLE_USER);
4707 let doc_id = SemanticDocId {
4708 message_id,
4709 chunk_idx: 0,
4710 agent_id,
4711 workspace_id,
4712 source_id: crc32fast::hash(source_id.as_bytes()),
4713 role,
4714 created_at_ms: created_at_ms.unwrap_or(0),
4715 content_hash: Some(content_hash(&canonical)),
4716 }
4717 .to_doc_id_string();
4718 let line_number = usize::try_from(idx).ok().map(|line| line.saturating_add(1));
4719 let lookup_key = (
4720 source_id,
4721 source_path.clone(),
4722 None,
4723 title.unwrap_or_default().trim().to_string(),
4724 idx,
4725 created_at_ms,
4726 stable_hit_hash(&content, &source_path, line_number, created_at_ms),
4727 );
4728
4729 Ok(Some((
4730 lookup_key,
4731 ResolvedSemanticDocId { message_id, doc_id },
4732 )))
4733 })?;
4734
4735 for row in chunk_rows.into_iter().flatten() {
4736 resolved_by_key.insert(row.0, row.1);
4737 }
4738 }
4739
4740 Ok(lookup_keys
4741 .into_iter()
4742 .map(|key| key.and_then(|lookup| resolved_by_key.get(&lookup).cloned()))
4743 .collect())
4744 }
4745
4746 fn load_message_text_by_id(&self, message_id: u64) -> Result<Option<String>> {
4747 let sqlite_guard = self.sqlite_guard()?;
4748 let conn = sqlite_guard
4749 .as_ref()
4750 .ok_or_else(|| anyhow!("progressive search requires database connection"))?;
4751 let rows: Vec<String> = conn.query_map_collect(
4752 "SELECT content FROM messages WHERE id = ?",
4753 &[ParamValue::from(i64::try_from(message_id)?)],
4754 |row: &frankensqlite::Row| row.get_typed(0),
4755 )?;
4756 Ok(rows.into_iter().next())
4757 }
4758
4759 fn collapse_progressive_scored_results(
4760 &self,
4761 results: &[FsScoredResult],
4762 fetch_limit: usize,
4763 ) -> Vec<VectorSearchResult> {
4764 let fetch = fetch_limit.max(1);
4765 let mut best_by_message: HashMap<u64, VectorSearchResult> =
4766 HashMap::with_capacity(results.len());
4767 for hit in results {
4768 let Some(parsed) = parse_semantic_doc_id(&hit.doc_id) else {
4769 continue;
4770 };
4771 best_by_message
4772 .entry(parsed.message_id)
4773 .and_modify(|entry| {
4774 if hit.score > entry.score {
4775 entry.score = hit.score;
4776 entry.chunk_idx = parsed.chunk_idx;
4777 }
4778 })
4779 .or_insert(VectorSearchResult {
4780 message_id: parsed.message_id,
4781 chunk_idx: parsed.chunk_idx,
4782 score: hit.score,
4783 });
4784 }
4785 let mut collapsed: Vec<VectorSearchResult> = best_by_message.into_values().collect();
4786 collapsed.sort_by(|a, b| {
4787 b.score
4788 .total_cmp(&a.score)
4789 .then_with(|| a.message_id.cmp(&b.message_id))
4790 });
4791 if collapsed.len() > fetch {
4792 collapsed.truncate(fetch);
4793 }
4794 collapsed
4795 }
4796
4797 fn hydrate_semantic_hits_with_ids(
4798 &self,
4799 results: &[VectorSearchResult],
4800 field_mask: FieldMask,
4801 ) -> Result<Vec<(u64, SearchHit)>> {
4802 if results.is_empty() {
4803 return Ok(Vec::new());
4804 }
4805 let sqlite_guard = self.sqlite_guard()?;
4806 let conn = sqlite_guard
4807 .as_ref()
4808 .ok_or_else(|| anyhow!("semantic search requires database connection"))?;
4809
4810 let placeholder_capacity = results.len().saturating_mul(2).saturating_sub(1);
4811 let mut placeholders = String::with_capacity(placeholder_capacity);
4812 let mut params: Vec<ParamValue> = Vec::with_capacity(results.len());
4813 for (idx, result) in results.iter().enumerate() {
4814 if idx > 0 {
4815 placeholders.push(',');
4816 }
4817 placeholders.push('?');
4818 params.push(ParamValue::from(i64::try_from(result.message_id)?));
4819 }
4820
4821 let title_expr = if field_mask.wants_title() {
4822 "c.title"
4823 } else {
4824 "''"
4825 };
4826 let normalized_source_sql =
4827 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
4828 let sql = format!(
4833 "SELECT m.id, c.id, m.content, m.created_at, m.idx, m.role, {title_expr}, c.source_path, {normalized_source_sql}, c.origin_host, COALESCE(a.slug, 'unknown'), w.path, s.kind, c.started_at
4834 FROM messages m
4835 JOIN conversations c ON m.conversation_id = c.id
4836 LEFT JOIN agents a ON c.agent_id = a.id
4837 LEFT JOIN workspaces w ON c.workspace_id = w.id
4838 LEFT JOIN sources s ON c.source_id = s.id
4839 WHERE m.id IN ({placeholders})"
4840 );
4841
4842 let rows: Vec<(u64, SearchHit)> =
4843 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
4844 let message_id: i64 = row.get_typed(0)?;
4845 let conversation_id: i64 = row.get_typed(1)?;
4846 let full_content: String = row.get_typed(2)?;
4847 let msg_created_at: Option<i64> = row.get_typed(3)?;
4848 let idx: Option<i64> = row.get_typed(4)?;
4849 let title: Option<String> = if field_mask.wants_title() {
4850 row.get_typed(6)?
4851 } else {
4852 None
4853 };
4854 let source_path: String = row.get_typed(7)?;
4855 let raw_source_id: String = row.get_typed(8)?;
4856 let origin_host: Option<String> = row.get_typed(9)?;
4857 let agent: String = row.get_typed(10)?;
4858 let workspace: Option<String> = row.get_typed(11)?;
4859 let raw_origin_kind: Option<String> = row.get_typed(12)?;
4860 let started_at: Option<i64> = row.get_typed(13)?;
4861
4862 let created_at = msg_created_at.or(started_at);
4863 let line_number = idx
4864 .and_then(|i| usize::try_from(i).ok())
4865 .map(|i| i.saturating_add(1));
4866 let snippet = if field_mask.wants_snippet() {
4867 snippet_from_content(&full_content)
4868 } else {
4869 String::new()
4870 };
4871 let content = if field_mask.needs_content() {
4872 full_content.clone()
4873 } else {
4874 String::new()
4875 };
4876 let content_hash =
4877 stable_hit_hash(&full_content, &source_path, line_number, created_at);
4878 let source_id = normalized_search_hit_source_id_parts(
4879 raw_source_id.as_str(),
4880 raw_origin_kind.as_deref().unwrap_or_default(),
4881 origin_host.as_deref(),
4882 );
4883 let origin_kind =
4884 normalized_search_hit_origin_kind(&source_id, raw_origin_kind.as_deref());
4885
4886 let hit = SearchHit {
4887 title: if field_mask.wants_title() {
4888 title.unwrap_or_default()
4889 } else {
4890 String::new()
4891 },
4892 snippet,
4893 content,
4894 content_hash,
4895 conversation_id: Some(conversation_id),
4896 score: 0.0,
4897 source_path,
4898 agent,
4899 workspace: workspace.unwrap_or_default(),
4900 workspace_original: None,
4901 created_at,
4902 line_number,
4903 match_type: MatchType::Exact,
4904 source_id,
4905 origin_kind,
4906 origin_host,
4907 };
4908
4909 Ok((semantic_message_id_from_db(message_id)?, hit))
4910 })?;
4911
4912 let mut hits_by_id = HashMap::new();
4913 for (id, hit) in rows {
4914 hits_by_id.insert(id, hit);
4915 }
4916
4917 let mut ordered = Vec::new();
4918 for result in results {
4919 if let Some(mut hit) = hits_by_id.remove(&result.message_id) {
4920 hit.score = result.score;
4921 ordered.push((result.message_id, hit));
4922 }
4923 }
4924
4925 Ok(ordered)
4926 }
4927
4928 fn overlay_progressive_lexical_hit(
4929 &self,
4930 hit: &mut SearchHit,
4931 lexical: &ProgressiveLexicalHit,
4932 field_mask: FieldMask,
4933 ) {
4934 if field_mask.wants_title() && !lexical.title.is_empty() {
4935 hit.title = lexical.title.clone();
4936 }
4937 if field_mask.wants_snippet() && !lexical.snippet.is_empty() {
4938 hit.snippet = lexical.snippet.clone();
4939 }
4940 if field_mask.needs_content() && !lexical.content.is_empty() {
4941 hit.content = lexical.content.clone();
4942 }
4943 hit.match_type = lexical.match_type;
4944 hit.line_number = lexical.line_number.or(hit.line_number);
4945 }
4946
4947 fn progressive_phase_to_result(
4948 &self,
4949 results: &[FsScoredResult],
4950 ctx: ProgressivePhaseContext<'_>,
4951 ) -> Result<SearchResult> {
4952 let collapsed = self.collapse_progressive_scored_results(results, ctx.fetch_limit);
4953 let missing: Vec<VectorSearchResult> = collapsed
4954 .iter()
4955 .filter(|result| {
4956 ctx.lexical_cache
4957 .and_then(|cache| cache.hits_by_message.get(&result.message_id))
4958 .is_none()
4959 })
4960 .map(|result| VectorSearchResult {
4961 message_id: result.message_id,
4962 chunk_idx: result.chunk_idx,
4963 score: result.score,
4964 })
4965 .collect();
4966 let mut hydrated_by_id: HashMap<u64, SearchHit> = self
4967 .hydrate_semantic_hits_with_ids(&missing, ctx.field_mask)?
4968 .into_iter()
4969 .collect();
4970
4971 let mut hydrated: Vec<(u64, SearchHit)> = Vec::with_capacity(collapsed.len());
4972 for result in &collapsed {
4973 if let Some(cache) = ctx.lexical_cache
4974 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
4975 {
4976 hydrated.push((result.message_id, lexical.to_search_hit(result.score)));
4977 continue;
4978 }
4979 if let Some(mut hit) = hydrated_by_id.remove(&result.message_id) {
4980 if let Some(cache) = ctx.lexical_cache
4981 && let Some(lexical) = cache.hits_by_message.get(&result.message_id)
4982 {
4983 self.overlay_progressive_lexical_hit(&mut hit, lexical, ctx.field_mask);
4984 }
4985 hydrated.push((result.message_id, hit));
4986 }
4987 }
4988
4989 let mut hits: Vec<SearchHit> = hydrated.into_iter().map(|(_, hit)| hit).collect();
4990 (_, hits) = self.postprocess_hits_page(hits, ctx.query, ctx.filters, ctx.limit, 0);
4991
4992 let (wildcard_fallback, suggestions) = ctx
4993 .lexical_cache
4994 .map(|cache| {
4995 let suggestions = if hits.is_empty() {
4996 cache.suggestions.clone()
4997 } else {
4998 Vec::new()
4999 };
5000 (cache.wildcard_fallback, suggestions)
5001 })
5002 .unwrap_or((false, Vec::new()));
5003
5004 Ok(SearchResult {
5005 hits,
5006 wildcard_fallback,
5007 cache_stats: self.cache_stats(),
5008 suggestions,
5009 ann_stats: None,
5010 total_count: None,
5011 })
5012 }
5013
5014 pub(crate) async fn search_progressive_with_callback(
5015 self: &Arc<Self>,
5016 request: ProgressiveSearchRequest<'_>,
5017 mut on_event: impl FnMut(ProgressiveSearchEvent) + Send,
5018 ) -> Result<()> {
5019 let ProgressiveSearchRequest {
5020 cx,
5021 query,
5022 filters,
5023 limit,
5024 sparse_threshold,
5025 field_mask,
5026 mode,
5027 } = request;
5028 let field_mask = effective_field_mask(field_mask);
5029 let limit = limit.max(1);
5030 let fetch_limit = progressive_phase_fetch_limit(limit);
5031
5032 match mode {
5033 SearchMode::Lexical => {
5034 let started = Instant::now();
5035 let result = self.search_with_fallback(
5036 query,
5037 filters,
5038 limit,
5039 0,
5040 sparse_threshold,
5041 field_mask,
5042 )?;
5043 on_event(ProgressiveSearchEvent::Phase {
5044 kind: ProgressivePhaseKind::Initial,
5045 elapsed_ms: started.elapsed().as_millis(),
5046 result,
5047 });
5048 return Ok(());
5049 }
5050 SearchMode::Semantic | SearchMode::Hybrid => {}
5051 }
5052
5053 let progressive_context = {
5054 self.progressive_context()?
5055 .ok_or_else(|| anyhow!("progressive two-tier context unavailable"))?
5056 };
5057 let progressive_context_token = Arc::clone(&progressive_context.context_token);
5058
5059 let lexical_cache: Arc<Mutex<ProgressiveLexicalSnapshot>> =
5060 Arc::new(Mutex::new(Arc::new(ProgressiveLexicalCache::default())));
5061 let text_cache: Arc<Mutex<HashMap<u64, String>>> = Arc::new(Mutex::new(HashMap::new()));
5062 let text_client = Arc::clone(self);
5063 let text_cache_for_lookup = Arc::clone(&text_cache);
5064 let text_fn = move |doc_id: &str| -> Option<String> {
5065 let parsed = parse_semantic_doc_id(doc_id)?;
5066 if let Ok(cache) = text_cache_for_lookup.lock()
5067 && let Some(text) = cache.get(&parsed.message_id)
5068 {
5069 return Some(text.clone());
5070 }
5071 let loaded = text_client
5072 .load_message_text_by_id(parsed.message_id)
5073 .ok()
5074 .flatten()?;
5075 if let Ok(mut cache) = text_cache_for_lookup.lock() {
5076 cache.insert(parsed.message_id, loaded.clone());
5077 }
5078 Some(loaded)
5079 };
5080
5081 let mut searcher = FsTwoTierSearcher::new(
5082 Arc::clone(&progressive_context.index),
5083 Arc::clone(&progressive_context.fast_embedder),
5084 frankensearch_two_tier_config(),
5085 );
5086
5087 if let Some(quality_embedder) = progressive_context.quality_embedder.as_ref() {
5088 searcher = searcher.with_quality_embedder(Arc::clone(quality_embedder));
5089 }
5090
5091 if matches!(mode, SearchMode::Hybrid) {
5092 let lexical = Arc::new(CassProgressiveLexicalAdapter::new(
5093 Arc::clone(self),
5094 filters.clone(),
5095 field_mask,
5096 sparse_threshold,
5097 Arc::clone(&lexical_cache),
5098 ));
5099 searcher = searcher.with_lexical(lexical);
5100 }
5101
5102 let phase_client = Arc::clone(self);
5103 let phase_filters = filters.clone();
5104 let phase_cache = Arc::clone(&lexical_cache);
5105 let mut phase_error: Option<anyhow::Error> = None;
5106
5107 let search_result = searcher
5108 .search(cx, query, fetch_limit, text_fn, |phase| {
5109 if phase_error.is_some() {
5110 return;
5111 }
5112 match phase_client.semantic_context_matches(&progressive_context_token) {
5113 Ok(true) => {}
5114 Ok(false) => {
5115 phase_error = Some(anyhow!(
5116 "progressive search aborted: semantic context changed"
5117 ));
5118 cx.set_cancel_requested(true);
5119 return;
5120 }
5121 Err(err) => {
5122 phase_error = Some(err);
5123 cx.set_cancel_requested(true);
5124 return;
5125 }
5126 }
5127 let lexical_snapshot = phase_cache.lock().ok().map(|guard| Arc::clone(&guard));
5128 let event_result = match phase {
5129 FsSearchPhase::Initial {
5130 results, latency, ..
5131 } => phase_client
5132 .progressive_phase_to_result(
5133 &results,
5134 ProgressivePhaseContext {
5135 query,
5136 filters: &phase_filters,
5137 field_mask,
5138 lexical_cache: lexical_snapshot.as_deref(),
5139 limit,
5140 fetch_limit,
5141 },
5142 )
5143 .map(|result| ProgressiveSearchEvent::Phase {
5144 kind: ProgressivePhaseKind::Initial,
5145 elapsed_ms: latency.as_millis(),
5146 result,
5147 }),
5148 FsSearchPhase::Refined {
5149 results, latency, ..
5150 } => phase_client
5151 .progressive_phase_to_result(
5152 &results,
5153 ProgressivePhaseContext {
5154 query,
5155 filters: &phase_filters,
5156 field_mask,
5157 lexical_cache: lexical_snapshot.as_deref(),
5158 limit,
5159 fetch_limit,
5160 },
5161 )
5162 .map(|result| ProgressiveSearchEvent::Phase {
5163 kind: ProgressivePhaseKind::Refined,
5164 elapsed_ms: latency.as_millis(),
5165 result,
5166 }),
5167 FsSearchPhase::Reranked {
5173 results, latency, ..
5174 } => phase_client
5175 .progressive_phase_to_result(
5176 &results,
5177 ProgressivePhaseContext {
5178 query,
5179 filters: &phase_filters,
5180 field_mask,
5181 lexical_cache: lexical_snapshot.as_deref(),
5182 limit,
5183 fetch_limit,
5184 },
5185 )
5186 .map(|result| ProgressiveSearchEvent::Phase {
5187 kind: ProgressivePhaseKind::Refined,
5188 elapsed_ms: latency.as_millis(),
5189 result,
5190 }),
5191 FsSearchPhase::RefinementFailed { error, latency, .. } => {
5192 Ok(ProgressiveSearchEvent::RefinementFailed {
5193 latency_ms: latency.as_millis(),
5194 error: error.to_string(),
5195 })
5196 }
5197 };
5198
5199 match event_result {
5200 Ok(event) => on_event(event),
5201 Err(err) => {
5202 phase_error = Some(err);
5203 cx.set_cancel_requested(true);
5204 }
5205 }
5206 })
5207 .await;
5208
5209 if let Some(err) = phase_error {
5210 return Err(err);
5211 }
5212
5213 search_result
5214 .map(|_| ())
5215 .map_err(|err| anyhow!("progressive search failed: {err}"))
5216 }
5217
5218 pub fn search_semantic(
5220 &self,
5221 query: &str,
5222 filters: SearchFilters,
5223 limit: usize,
5224 offset: usize,
5225 field_mask: FieldMask,
5226 approximate: bool,
5227 ) -> Result<(
5228 Vec<SearchHit>,
5229 Option<crate::search::ann_index::AnnSearchStats>,
5230 )> {
5231 self.search_semantic_with_tier(
5232 query,
5233 filters,
5234 limit,
5235 offset,
5236 field_mask,
5237 approximate,
5238 SemanticTierMode::Single,
5239 )
5240 }
5241
5242 #[allow(clippy::too_many_arguments)]
5244 pub fn search_semantic_with_tier(
5245 &self,
5246 query: &str,
5247 filters: SearchFilters,
5248 limit: usize,
5249 offset: usize,
5250 field_mask: FieldMask,
5251 approximate: bool,
5252 tier_mode: SemanticTierMode,
5253 ) -> Result<(
5254 Vec<SearchHit>,
5255 Option<crate::search::ann_index::AnnSearchStats>,
5256 )> {
5257 let field_mask = effective_field_mask(field_mask);
5258 let canonical = canonicalize_for_embedding(query);
5259 if canonical.trim().is_empty() {
5260 return Ok((Vec::new(), None));
5261 }
5262 let limit = if limit == 0 {
5263 self.total_docs().min(no_limit_result_cap()).max(1)
5264 } else {
5265 limit
5266 };
5267 let target_hits = limit.saturating_add(offset);
5268 if target_hits == 0 {
5269 return Ok((Vec::new(), None));
5270 }
5271 let initial_fetch_limit = target_hits;
5272 let fallback_fetch_limit = target_hits.saturating_mul(3);
5273 loop {
5274 let (embedding, candidate_context, in_memory_two_tier_index, ann_index, context_token) = loop {
5275 let embedding = self.semantic_query_embedding(&canonical)?;
5276 let (candidate_context, context_token) = {
5277 let guard = self
5278 .semantic
5279 .lock()
5280 .map_err(|_| anyhow!("semantic lock poisoned"))?;
5281 let state = guard.as_ref().ok_or_else(|| {
5282 anyhow!("semantic search unavailable (no embedder or vector index)")
5283 })?;
5284 (
5285 SemanticCandidateContext {
5286 fs_semantic_index: Arc::clone(&state.fs_semantic_index),
5287 fs_semantic_indexes: Arc::clone(&state.fs_semantic_indexes),
5288 filter_maps: state.filter_maps.clone(),
5289 roles: state.roles.clone(),
5290 },
5291 Arc::clone(&state.context_token),
5292 )
5293 };
5294 if !Arc::ptr_eq(&embedding.context_token, &context_token) {
5295 continue;
5296 }
5297 let in_memory_two_tier_index = if tier_mode.wants_two_tier() && !approximate {
5298 self.in_memory_two_tier_index(tier_mode)?
5299 } else {
5300 None
5301 };
5302 let ann_index = if approximate {
5303 Some(self.ann_index()?)
5304 } else {
5305 None
5306 };
5307
5308 let guard = self
5309 .semantic
5310 .lock()
5311 .map_err(|_| anyhow!("semantic lock poisoned"))?;
5312 let state = guard.as_ref().ok_or_else(|| {
5313 anyhow!("semantic search unavailable (no embedder or vector index)")
5314 })?;
5315 if !Arc::ptr_eq(&state.context_token, &context_token) {
5316 continue;
5317 }
5318 break (
5319 embedding.vector,
5320 candidate_context,
5321 in_memory_two_tier_index,
5322 ann_index,
5323 context_token,
5324 );
5325 };
5326
5327 let finalize_hits =
5328 |results: &[VectorSearchResult]| -> Result<(usize, Vec<SearchHit>)> {
5329 let hits = self.hydrate_semantic_hits(results, field_mask)?;
5330 Ok(self.postprocess_hits_page(hits, query, &filters, limit, offset))
5331 };
5332
5333 let (results, retry_state, mut ann_stats) = self.search_semantic_candidates(
5334 &candidate_context,
5335 &embedding,
5336 &filters,
5337 SemanticCandidateSearchRequest {
5338 fetch_limit: initial_fetch_limit,
5339 approximate,
5340 tier_mode,
5341 in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
5342 ann_index: ann_index.as_ref(),
5343 },
5344 )?;
5345 if !self.semantic_context_matches(&context_token)? {
5346 tracing::debug!("semantic context changed during candidate search; retrying");
5347 continue;
5348 }
5349 let (mut available_hits, mut paged_hits) = finalize_hits(&results)?;
5350
5351 let needs_retry = initial_fetch_limit < fallback_fetch_limit
5352 && ((available_hits < target_hits && retry_state.has_more_candidates)
5353 || retry_state.exact_window_may_omit_competitor);
5354
5355 if needs_retry {
5356 tracing::debug!(
5357 query = canonical,
5358 target_hits,
5359 available_hits,
5360 initial_fetch_limit,
5361 fallback_fetch_limit,
5362 "retrying semantic fetch due to candidate-window shortfall"
5363 );
5364 let (retry_results, _, retry_ann_stats) = self.search_semantic_candidates(
5365 &candidate_context,
5366 &embedding,
5367 &filters,
5368 SemanticCandidateSearchRequest {
5369 fetch_limit: fallback_fetch_limit,
5370 approximate,
5371 tier_mode,
5372 in_memory_two_tier_index: in_memory_two_tier_index.as_ref(),
5373 ann_index: ann_index.as_ref(),
5374 },
5375 )?;
5376 if !self.semantic_context_matches(&context_token)? {
5377 tracing::debug!("semantic context changed during retry fetch; retrying");
5378 continue;
5379 }
5380 (available_hits, paged_hits) = finalize_hits(&retry_results)?;
5381 ann_stats = retry_ann_stats;
5382 }
5383
5384 tracing::trace!(
5385 query = canonical,
5386 target_hits,
5387 available_hits,
5388 returned = paged_hits.len(),
5389 "semantic fetch complete"
5390 );
5391
5392 return Ok((paged_hits, ann_stats));
5393 }
5394 }
5395
5396 fn hydrate_semantic_hits(
5397 &self,
5398 results: &[VectorSearchResult],
5399 field_mask: FieldMask,
5400 ) -> Result<Vec<SearchHit>> {
5401 self.hydrate_semantic_hits_with_ids(results, field_mask)
5402 .map(|rows| rows.into_iter().map(|(_, hit)| hit).collect())
5403 }
5404
5405 fn postprocess_hits_page(
5406 &self,
5407 hits: Vec<SearchHit>,
5408 query: &str,
5409 filters: &SearchFilters,
5410 limit: usize,
5411 offset: usize,
5412 ) -> (usize, Vec<SearchHit>) {
5413 let mut hits = deduplicate_hits_with_query(hits, query);
5414 if !filters.session_paths.is_empty() {
5415 hits.retain(|hit| filters.session_paths.contains(&hit.source_path));
5416 }
5417 let available_hits = hits.len();
5418 let paged_hits = hits.into_iter().skip(offset).take(limit).collect();
5419 (available_hits, paged_hits)
5420 }
5421
5422 pub fn search_with_fallback(
5426 &self,
5427 query: &str,
5428 filters: SearchFilters,
5429 limit: usize,
5430 offset: usize,
5431 sparse_threshold: usize,
5432 field_mask: FieldMask,
5433 ) -> Result<SearchResult> {
5434 let hits = self.search(query, filters.clone(), limit, offset, field_mask)?;
5436 let baseline_stats = self.cache_stats();
5437 let tantivy_total = self
5439 .last_tantivy_total_count
5440 .lock()
5441 .ok()
5442 .and_then(|guard| *guard);
5443
5444 let query_has_wildcards = query.contains('*');
5446 let has_boolean_or_phrase = fs_cass_has_boolean_operators(query);
5447 let is_sparse = should_try_wildcard_fallback(hits.len(), limit, offset, sparse_threshold);
5448
5449 if !is_sparse || query_has_wildcards || has_boolean_or_phrase || query.trim().is_empty() {
5450 let suggestions = if hits.is_empty() && !query.trim().is_empty() {
5454 self.generate_suggestions(query, &filters)
5455 } else {
5456 Vec::new()
5457 };
5458 return Ok(SearchResult {
5459 hits,
5460 wildcard_fallback: false,
5461 cache_stats: baseline_stats,
5462 suggestions,
5463 ann_stats: None,
5464 total_count: tantivy_total,
5465 });
5466 }
5467
5468 if should_skip_automatic_wildcard_fallback_for_long_zero_hit_query(query, hits.len()) {
5469 let suggestions = if hits.is_empty() {
5470 self.generate_suggestions(query, &filters)
5471 } else {
5472 Vec::new()
5473 };
5474 return Ok(SearchResult {
5475 hits,
5476 wildcard_fallback: false,
5477 cache_stats: baseline_stats,
5478 suggestions,
5479 ann_stats: None,
5480 total_count: tantivy_total,
5481 });
5482 }
5483
5484 let wildcard_query = query
5486 .split_whitespace()
5487 .map(|term| format!("*{}*", term.trim_matches('*')))
5488 .collect::<Vec<_>>()
5489 .join(" ");
5490
5491 tracing::info!(
5492 original_query = query,
5493 wildcard_query = wildcard_query,
5494 original_count = hits.len(),
5495 "wildcard_fallback"
5496 );
5497
5498 let mut fallback_hits =
5499 self.search(&wildcard_query, filters.clone(), limit, offset, field_mask)?;
5500 let fallback_stats = self.cache_stats();
5501 let fallback_tantivy_total = self
5503 .last_tantivy_total_count
5504 .lock()
5505 .ok()
5506 .and_then(|guard| *guard);
5507
5508 if fallback_hits.len() > hits.len() {
5510 for hit in &mut fallback_hits {
5512 hit.match_type = MatchType::ImplicitWildcard;
5513 }
5514 let suggestions = if fallback_hits.is_empty() {
5516 self.generate_suggestions(query, &filters)
5517 } else {
5518 Vec::new()
5519 };
5520 Ok(SearchResult {
5521 hits: fallback_hits,
5522 wildcard_fallback: true,
5523 cache_stats: fallback_stats,
5524 suggestions,
5525 ann_stats: None,
5526 total_count: fallback_tantivy_total,
5527 })
5528 } else {
5529 let suggestions = if hits.is_empty() {
5532 self.generate_suggestions(query, &filters)
5533 } else {
5534 Vec::new()
5535 };
5536 Ok(SearchResult {
5537 hits,
5538 wildcard_fallback: false,
5539 cache_stats: baseline_stats,
5540 suggestions,
5541 ann_stats: None,
5542 total_count: tantivy_total,
5543 })
5544 }
5545 }
5546
5547 #[allow(clippy::too_many_arguments)]
5549 pub fn search_hybrid(
5550 &self,
5551 lexical_query: &str,
5552 semantic_query: &str,
5553 filters: SearchFilters,
5554 limit: usize,
5555 offset: usize,
5556 sparse_threshold: usize,
5557 field_mask: FieldMask,
5558 approximate: bool,
5559 ) -> Result<SearchResult> {
5560 self.search_hybrid_with_tier(
5561 lexical_query,
5562 semantic_query,
5563 filters,
5564 limit,
5565 offset,
5566 sparse_threshold,
5567 field_mask,
5568 approximate,
5569 SemanticTierMode::Single,
5570 )
5571 }
5572
5573 #[allow(clippy::too_many_arguments)]
5576 pub fn search_hybrid_with_tier(
5577 &self,
5578 lexical_query: &str,
5579 semantic_query: &str,
5580 filters: SearchFilters,
5581 limit: usize,
5582 offset: usize,
5583 sparse_threshold: usize,
5584 field_mask: FieldMask,
5585 approximate: bool,
5586 semantic_tier_mode: SemanticTierMode,
5587 ) -> Result<SearchResult> {
5588 let requested_limit = limit;
5589 let total_docs = self.total_docs().max(1);
5590 let limit = if requested_limit == 0 {
5591 total_docs.min(no_limit_result_cap()).max(1)
5592 } else {
5593 requested_limit
5594 };
5595 let fetch = limit.saturating_add(offset);
5596 if fetch == 0 {
5597 return Ok(SearchResult {
5598 hits: Vec::new(),
5599 wildcard_fallback: false,
5600 cache_stats: self.cache_stats(),
5601 suggestions: Vec::new(),
5602 ann_stats: None,
5603 total_count: None,
5604 });
5605 }
5606
5607 if semantic_query.trim().is_empty() {
5608 return self.search_with_fallback(
5609 lexical_query,
5610 filters,
5611 limit,
5612 offset,
5613 sparse_threshold,
5614 field_mask,
5615 );
5616 }
5617
5618 let budget =
5619 hybrid_candidate_budget(semantic_query, requested_limit, limit, offset, total_docs);
5620 let lexical = self.search_with_fallback(
5621 lexical_query,
5622 filters.clone(),
5623 budget.lexical_candidates,
5624 0,
5625 sparse_threshold,
5626 field_mask,
5627 )?;
5628 let (semantic_hits, semantic_ann_stats) = self.search_semantic_with_tier(
5629 semantic_query,
5630 filters,
5631 budget.semantic_candidates,
5632 0,
5633 field_mask,
5634 approximate,
5635 semantic_tier_mode,
5636 )?;
5637 let fused = rrf_fuse_hits(&lexical.hits, &semantic_hits, semantic_query, limit, offset);
5638 let suggestions = if fused.is_empty() {
5639 lexical.suggestions.clone()
5640 } else {
5641 Vec::new()
5642 };
5643 Ok(SearchResult {
5644 hits: fused,
5645 wildcard_fallback: lexical.wildcard_fallback,
5646 cache_stats: lexical.cache_stats,
5647 suggestions,
5648 ann_stats: semantic_ann_stats,
5649 total_count: None,
5650 })
5651 }
5652
5653 fn generate_suggestions(&self, query: &str, filters: &SearchFilters) -> Vec<QuerySuggestion> {
5655 let mut suggestions = Vec::new();
5656 let query_lower = query.to_lowercase();
5657
5658 if !query.contains('*') && query.len() >= 2 {
5660 suggestions.push(QuerySuggestion::wildcard(query).with_shortcut(1));
5661 }
5662
5663 if !filters.agents.is_empty() {
5665 let agents: Vec<&str> = filters
5666 .agents
5667 .iter()
5668 .map(std::string::String::as_str)
5669 .collect();
5670 let agent_str = agents.join(", ");
5671 suggestions
5672 .push(QuerySuggestion::remove_agent_filter(&agent_str, filters).with_shortcut(2));
5673 }
5674
5675 let known_agents = [
5677 "codex",
5678 "claude",
5679 "claude_code",
5680 "cline",
5681 "gemini",
5682 "amp",
5683 "opencode",
5684 ];
5685 for agent in &known_agents {
5686 if levenshtein_distance(&query_lower, agent) <= 2 && query_lower != *agent {
5687 suggestions.push(
5688 QuerySuggestion::spelling(query, agent)
5689 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5690 );
5691 break; }
5693 }
5694
5695 if filters.agents.is_empty()
5699 && let Ok(sqlite_guard) = self.sqlite.lock()
5700 && let Some(conn) = sqlite_guard.as_ref()
5701 && let Ok(rows) = conn.query_map_collect(
5702 "SELECT a.slug
5703 FROM conversations c
5704 JOIN agents a ON c.agent_id = a.id
5705 GROUP BY a.slug
5706 ORDER BY MAX(c.id) DESC
5707 LIMIT 3",
5708 &[],
5709 |row: &frankensqlite::Row| row.get_typed::<String>(0),
5710 )
5711 {
5712 for row in rows {
5713 if suggestions.len() < 3 {
5714 suggestions.push(
5715 QuerySuggestion::try_agent(&row)
5716 .with_shortcut(suggestions.len().min(2) as u8 + 1),
5717 );
5718 }
5719 }
5720 }
5721
5722 suggestions.truncate(3);
5724 for (i, sugg) in suggestions.iter_mut().enumerate() {
5725 sugg.shortcut = Some((i + 1) as u8);
5726 }
5727
5728 suggestions
5729 }
5730
5731 fn searcher_for_thread(&self, reader: &IndexReader) -> Searcher {
5732 let epoch = self.reload_epoch.load(Ordering::Relaxed);
5733 let reader_key = reader as *const IndexReader as usize;
5734 THREAD_SEARCHER.with(|slot| {
5735 let mut slot = slot.borrow_mut();
5736 if let Some(entry) = slot.as_ref()
5737 && entry.epoch == epoch
5738 && entry.reader_key == reader_key
5739 {
5740 return entry.searcher.clone();
5741 }
5742 let searcher = reader.searcher();
5743 *slot = Some(SearcherCacheEntry {
5744 epoch,
5745 reader_key,
5746 searcher: searcher.clone(),
5747 });
5748 searcher
5749 })
5750 }
5751
5752 fn federated_readers(&self) -> Option<Arc<Vec<FederatedIndexReader>>> {
5753 FEDERATED_SEARCH_READERS
5754 .read()
5755 .get(&self.cache_namespace)
5756 .cloned()
5757 }
5758
5759 fn maybe_reload_federated_readers(
5760 &self,
5761 readers: &[FederatedIndexReader],
5762 ) -> Result<Option<u64>> {
5763 if !self.reload_on_search || readers.is_empty() {
5764 return Ok(None);
5765 }
5766 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
5767 let now = Instant::now();
5768 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
5769 if guard
5770 .map(|t| now.duration_since(t) < MIN_RELOAD_INTERVAL)
5771 .unwrap_or(false)
5772 {
5773 let signature = self.federated_generation_signature(readers);
5774 return Ok(Some(signature));
5775 }
5776
5777 let reload_started = Instant::now();
5778 for shard in readers {
5779 shard.reader.reload()?;
5780 }
5781 let elapsed = reload_started.elapsed();
5782 *guard = Some(now);
5783 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
5784 self.metrics.record_reload(elapsed);
5785 tracing::debug!(
5786 duration_ms = elapsed.as_millis() as u64,
5787 reload_epoch = epoch,
5788 shards = readers.len(),
5789 "tantivy_reader_reload_federated"
5790 );
5791 Ok(Some(self.federated_generation_signature(readers)))
5792 }
5793
5794 fn federated_generation_signature(&self, readers: &[FederatedIndexReader]) -> u64 {
5795 let mut hasher = std::collections::hash_map::DefaultHasher::new();
5796 readers.len().hash(&mut hasher);
5797 for shard in readers {
5798 self.searcher_for_thread(&shard.reader)
5799 .generation()
5800 .generation_id()
5801 .hash(&mut hasher);
5802 }
5803 hasher.finish()
5804 }
5805
5806 fn track_generation(&self, generation: u64) {
5807 let mut guard = self
5808 .last_generation
5809 .lock()
5810 .unwrap_or_else(|e| e.into_inner());
5811 if let Some(prev) = *guard
5812 && prev != generation
5813 && let Ok(mut cache) = self.prefix_cache.lock()
5814 {
5815 cache.clear();
5816 }
5817 *guard = Some(generation);
5818 }
5819
5820 fn hydrate_tantivy_hit_contents(
5821 &self,
5822 exact_keys: &[TantivyContentExactKey],
5823 fallback_keys: &[TantivyContentFallbackKey],
5824 ) -> Result<TantivyHydratedContentMaps> {
5825 if exact_keys.is_empty() && fallback_keys.is_empty() {
5826 return Ok((HashMap::new(), HashMap::new()));
5827 }
5828
5829 let sqlite_guard = match self.sqlite_guard() {
5830 Ok(guard) => guard,
5831 Err(_) => return Ok((HashMap::new(), HashMap::new())),
5832 };
5833 let Some(conn) = sqlite_guard.as_ref() else {
5834 return Ok((HashMap::new(), HashMap::new()));
5835 };
5836
5837 let mut hydrated_exact = HashMap::new();
5838 let mut hydrated_fallback = HashMap::new();
5839 const CHUNK_SIZE: usize = 300;
5840
5841 if !exact_keys.is_empty() {
5842 let mut unique_exact_keys = Vec::with_capacity(exact_keys.len());
5843 let mut seen = HashSet::with_capacity(exact_keys.len());
5844 for key in exact_keys {
5845 if seen.insert(*key) {
5846 unique_exact_keys.push(*key);
5847 }
5848 }
5849
5850 hydrated_exact.extend(hydrate_message_content_by_conversation(
5851 conn,
5852 &unique_exact_keys,
5853 )?);
5854 }
5855
5856 if !fallback_keys.is_empty() {
5857 let mut unique_fallback_keys = Vec::with_capacity(fallback_keys.len());
5858 let mut seen = HashSet::with_capacity(fallback_keys.len());
5859 for key in fallback_keys {
5860 if seen.insert(key.clone()) {
5861 unique_fallback_keys.push(key.clone());
5862 }
5863 }
5864
5865 let mut unique_source_paths = Vec::with_capacity(unique_fallback_keys.len());
5866 let mut seen_source_paths = HashSet::with_capacity(unique_fallback_keys.len());
5867 for (_, source_path, _) in &unique_fallback_keys {
5868 if seen_source_paths.insert(source_path.clone()) {
5869 unique_source_paths.push(source_path.clone());
5870 }
5871 }
5872
5873 let mut conversations_by_key: HashMap<(String, String), Vec<i64>> = HashMap::new();
5874 for chunk in unique_source_paths.chunks(CHUNK_SIZE) {
5875 let placeholders = sql_placeholders(chunk.len());
5876 let sql = format!(
5877 "SELECT c.id,
5878 c.source_path,
5879 COALESCE(c.source_id, ''),
5880 COALESCE(c.origin_host, ''),
5881 COALESCE(s.kind, '')
5882 FROM conversations c
5883 LEFT JOIN sources s ON c.source_id = s.id
5884 WHERE c.source_path IN ({placeholders})
5885 ORDER BY c.id"
5886 );
5887 let params = chunk
5888 .iter()
5889 .map(|source_path| ParamValue::from(source_path.clone()))
5890 .collect::<Vec<_>>();
5891 let rows: Vec<(i64, String, String, String, String)> =
5892 franken_query_map_collect_retry(conn, &sql, ¶ms, |row| {
5893 Ok((
5894 row.get_typed(0)?,
5895 row.get_typed(1)?,
5896 row.get_typed(2)?,
5897 row.get_typed(3)?,
5898 row.get_typed(4)?,
5899 ))
5900 })?;
5901
5902 for (conversation_id, source_path, raw_source_id, origin_host, origin_kind) in rows
5903 {
5904 let normalized_source_id = normalized_search_hit_source_id_parts(
5905 &raw_source_id,
5906 &origin_kind,
5907 (!origin_host.trim().is_empty()).then_some(origin_host.as_str()),
5908 );
5909 conversations_by_key
5910 .entry((normalized_source_id, source_path))
5911 .or_default()
5912 .push(conversation_id);
5913 }
5914 }
5915
5916 let mut message_requests = Vec::new();
5917 let mut fallback_keys_by_exact: HashMap<
5918 TantivyContentExactKey,
5919 Vec<TantivyContentFallbackKey>,
5920 > = HashMap::new();
5921 let mut seen_message_requests = HashSet::new();
5922 for (source_id, source_path, line_idx) in &unique_fallback_keys {
5923 let key = (source_id.clone(), source_path.clone());
5924 let Some(conversation_ids) = conversations_by_key.get(&key) else {
5925 continue;
5926 };
5927 for &conversation_id in conversation_ids {
5928 let exact_key = (conversation_id, *line_idx);
5929 if seen_message_requests.insert(exact_key) {
5930 message_requests.push(exact_key);
5931 }
5932 fallback_keys_by_exact.entry(exact_key).or_default().push((
5933 source_id.clone(),
5934 source_path.clone(),
5935 *line_idx,
5936 ));
5937 }
5938 }
5939
5940 for ((conversation_id, line_idx), content) in
5941 hydrate_message_content_by_conversation(conn, &message_requests)?
5942 {
5943 if let Some(fallback_keys) =
5944 fallback_keys_by_exact.get(&(conversation_id, line_idx))
5945 {
5946 for fallback_key in fallback_keys {
5947 hydrated_fallback.insert(fallback_key.clone(), content.clone());
5948 }
5949 }
5950 }
5951 }
5952
5953 Ok((hydrated_exact, hydrated_fallback))
5954 }
5955
    /// Executes a lexical query against a single Tantivy index and builds the page of hits.
    ///
    /// The Tantivy query is built from `raw_query` plus `filters`; `sanitized_query` is only
    /// used for prefix-only detection, match-type classification and snippet rendering.
    /// `limit`/`offset` select the page, and `field_mask` decides which optional fields
    /// (title, content, snippet) are populated on each hit.
    ///
    /// Returns the hits in the order produced by the query execution, together with the
    /// total match count that execution reported.
    ///
    /// # Errors
    ///
    /// Propagates failures from reloading the reader, executing the query, loading a stored
    /// document, or hydrating content that is missing from the index.
    #[allow(clippy::too_many_arguments)]
    fn search_tantivy(
        &self,
        reader: &IndexReader,
        fields: &FsCassFields,
        raw_query: &str,
        sanitized_query: &str,
        filters: SearchFilters,
        limit: usize,
        offset: usize,
        field_mask: FieldMask,
    ) -> Result<(Vec<SearchHit>, usize)> {
        // Everything read from one stored document during the first pass, kept until content
        // hydration has run and the final `SearchHit` can be assembled.
        struct PendingTantivyHit {
            score: f32,
            doc: TantivyDocument,
            title: String,
            stored_content: String,
            stored_preview: String,
            agent: String,
            source_path: String,
            workspace: String,
            workspace_original: Option<String>,
            created_at: Option<i64>,
            line_number: Option<usize>,
            stored_preview_snippet: Option<String>,
            source_id: String,
            conversation_id: Option<i64>,
            raw_origin_kind: Option<String>,
            origin_host: Option<String>,
        }

        self.maybe_reload_reader(reader)?;
        let searcher = self.searcher_for_thread(reader);
        self.track_generation(searcher.generation().generation_id());

        let wants_snippet = field_mask.wants_snippet();
        // A snippet is cut from the content, so asking for a snippet implies needing content.
        let needs_content = field_mask.needs_content() || wants_snippet;

        let fs_filters = FsCassQueryFilters {
            agents: filters.agents.into_iter().collect(),
            workspaces: filters.workspaces.into_iter().collect(),
            created_from: filters.created_from,
            created_to: filters.created_to,
            source_filter: match filters.source_filter {
                SourceFilter::All => FsCassSourceFilter::All,
                SourceFilter::Local => FsCassSourceFilter::Local,
                SourceFilter::Remote => FsCassSourceFilter::Remote,
                SourceFilter::SourceId(id) => {
                    FsCassSourceFilter::SourceId(normalize_search_source_filter_value(&id))
                }
            },
        };

        let q: Box<dyn Query> = fs_cass_build_tantivy_query(raw_query, &fs_filters, fields);

        let prefix_only = is_prefix_only(sanitized_query);
        let top_docs = execute_query_with_lazy_exact_count(&searcher, &*q, limit, offset)?;
        let tantivy_total_count = top_docs.total_count;
        let query_match_type = dominant_match_type(sanitized_query);
        let mut pending_hits = Vec::with_capacity(top_docs.hits.len());
        // Hits whose content must be fetched from outside the index: keyed by
        // (conversation_id, line_idx) when the document stores a conversation id, otherwise by
        // (source_id, source_path, line_idx).
        let mut missing_exact_content_keys = Vec::new();
        let mut missing_fallback_content_keys = Vec::new();

        // Pass 1: read the stored fields of every ranked document and note which hits lack
        // usable content.
        for ranked_hit in top_docs.hits {
            let score = ranked_hit.bm25_score;
            let doc: TantivyDocument = fs_load_doc(&searcher, ranked_hit.doc_address)?;
            let title = if field_mask.wants_title() {
                doc.get_first(fields.title)
                    .and_then(|v| v.as_str())
                    .unwrap_or("")
                    .to_string()
            } else {
                String::new()
            };
            let stored_content = doc
                .get_first(fields.content)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let stored_preview = doc
                .get_first(fields.preview)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let stored_preview_snippet = snippet_from_preview_without_full_content(
                field_mask,
                &stored_preview,
                sanitized_query,
            );
            let agent = doc
                .get_first(fields.agent)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let workspace = doc
                .get_first(fields.workspace)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let workspace_original = doc
                .get_first(fields.workspace_original)
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(String::from);
            let created_at = doc.get_first(fields.created_at).and_then(|v| v.as_i64());
            // `msg_idx` is stored zero-based; hits expose it as a one-based line number.
            let line_number = doc
                .get_first(fields.msg_idx)
                .and_then(|v| v.as_u64())
                .and_then(|i| usize::try_from(i).ok())
                .map(|i| i.saturating_add(1));
            let raw_source_id = doc
                .get_first(fields.source_id)
                .and_then(|v| v.as_str())
                .unwrap_or_default()
                .to_string();
            // `fields.conversation_id` is optional, so a hit may come back without one.
            let conversation_id = fields
                .conversation_id
                .and_then(|field| doc.get_first(field))
                .and_then(|v| v.as_i64());
            let source_path = doc
                .get_first(fields.source_path)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();
            let raw_origin_kind = doc
                .get_first(fields.origin_kind)
                .and_then(|v| v.as_str())
                .map(str::to_string);
            let origin_host = doc
                .get_first(fields.origin_host)
                .and_then(|v| v.as_str())
                .filter(|s| !s.is_empty())
                .map(String::from);
            let source_id = normalized_search_hit_source_id_parts(
                raw_source_id.as_str(),
                raw_origin_kind.as_deref().unwrap_or_default(),
                origin_host.as_deref(),
            );

            let preview_satisfies_bounded_content =
                field_mask.preview_content_limit().is_some() && !stored_preview.is_empty();
            let preview_satisfies_full_content = field_mask.needs_content()
                && field_mask.preview_content_limit().is_none()
                && stored_preview_is_complete_content(&stored_preview);
            // Queue hydration only when nothing already on the document can stand in for the
            // content: no stored content, no adequate preview, and no preview-derived snippet.
            if needs_content
                && let Some(line_idx) = line_number
                    .and_then(|line| line.checked_sub(1))
                    .and_then(|line| i64::try_from(line).ok())
                && stored_content.is_empty()
                && !preview_satisfies_bounded_content
                && !preview_satisfies_full_content
                && stored_preview_snippet.is_none()
            {
                if let Some(conversation_id) = conversation_id {
                    missing_exact_content_keys.push((conversation_id, line_idx));
                } else {
                    missing_fallback_content_keys.push((
                        source_id.clone(),
                        source_path.clone(),
                        line_idx,
                    ));
                }
            }

            pending_hits.push(PendingTantivyHit {
                score,
                doc,
                title,
                stored_content,
                stored_preview,
                agent,
                source_path,
                workspace,
                workspace_original,
                created_at,
                line_number,
                stored_preview_snippet,
                source_id,
                conversation_id,
                raw_origin_kind,
                origin_host,
            });
        }

        // Fetch all missing content in one batched call instead of once per hit.
        let (hydrated_contents, hydrated_fallback_contents) = if needs_content
            && (!missing_exact_content_keys.is_empty() || !missing_fallback_content_keys.is_empty())
        {
            self.hydrate_tantivy_hit_contents(
                &missing_exact_content_keys,
                &missing_fallback_content_keys,
            )?
        } else {
            (HashMap::new(), HashMap::new())
        };
        // The snippet generator is only built when at least one hit still needs a snippet
        // and the query is not prefix-only.
        let needs_tantivy_snippet_generator = wants_snippet
            && !prefix_only
            && pending_hits
                .iter()
                .any(|pending| pending.stored_preview_snippet.is_none());
        let snippet_generator = if needs_tantivy_snippet_generator {
            let snippet_cfg = FsSnippetConfig {
                max_chars: 160,
                highlight_prefix: "<b>".to_string(),
                highlight_postfix: "</b>".to_string(),
            };
            fs_try_build_snippet_generator(&searcher, &*q, fields.content, &snippet_cfg)
        } else {
            None
        };
        // Pass 2: choose the effective content per hit, render its snippet, and assemble the
        // final `SearchHit`.
        let mut hits = Vec::with_capacity(pending_hits.len());
        for pending in pending_hits {
            // Look up hydrated content under the same key shape that pass 1 queued.
            let hydrated_content = pending
                .line_number
                .and_then(|line| line.checked_sub(1))
                .and_then(|line| i64::try_from(line).ok())
                .and_then(|line_idx| {
                    if let Some(conversation_id) = pending.conversation_id {
                        hydrated_contents.get(&(conversation_id, line_idx)).cloned()
                    } else {
                        hydrated_fallback_contents
                            .get(&(
                                pending.source_id.clone(),
                                pending.source_path.clone(),
                                line_idx,
                            ))
                            .cloned()
                    }
                });
            let preview_satisfies_effective_content = !pending.stored_preview.is_empty()
                && (field_mask.preview_content_limit().is_some()
                    || (field_mask.needs_content()
                        && field_mask.preview_content_limit().is_none()
                        && stored_preview_is_complete_content(&pending.stored_preview)));
            // Precedence: stored content, then an adequate preview, then hydrated content,
            // and finally whatever preview exists (possibly empty).
            let effective_content = if !pending.stored_content.is_empty() {
                pending.stored_content.clone()
            } else if preview_satisfies_effective_content {
                pending.stored_preview.clone()
            } else if let Some(content) = hydrated_content {
                content
            } else {
                pending.stored_preview.clone()
            };
            let snippet = if wants_snippet {
                if let Some(snippet) = pending.stored_preview_snippet.clone() {
                    snippet
                } else if let Some(r#gen) = &snippet_generator {
                    let rendered = if !pending.stored_content.is_empty() {
                        fs_render_snippet_html(r#gen, &pending.doc, "<b>", "</b>")
                    } else if !effective_content.is_empty() {
                        // The stored document has no content field value, so render against
                        // a throwaway document that carries the effective content.
                        let mut snippet_doc = TantivyDocument::new();
                        snippet_doc.add_text(fields.content, &effective_content);
                        fs_render_snippet_html(r#gen, &snippet_doc, "<b>", "</b>")
                    } else {
                        None
                    };
                    // Convert the HTML highlight tags to `**` markers; fall back to the
                    // prefix-snippet helpers when nothing was rendered.
                    rendered
                        .map(|html| html.replace("<b>", "**").replace("</b>", "**"))
                        .or_else(|| cached_prefix_snippet(&effective_content, sanitized_query, 160))
                        .unwrap_or_else(|| {
                            quick_prefix_snippet(&effective_content, sanitized_query, 160)
                        })
                } else if let Some(sn) =
                    cached_prefix_snippet(&effective_content, sanitized_query, 160)
                {
                    sn
                } else {
                    quick_prefix_snippet(&effective_content, sanitized_query, 160)
                }
            } else {
                String::new()
            };
            let content = if field_mask.needs_content() {
                effective_content.clone()
            } else {
                String::new()
            };
            // The hash is derived from the effective content even when the content itself is
            // not returned on the hit.
            let content_hash = stable_hit_hash(
                &effective_content,
                &pending.source_path,
                pending.line_number,
                pending.created_at,
            );
            let origin_kind = normalized_search_hit_origin_kind(
                &pending.source_id,
                pending.raw_origin_kind.as_deref(),
            )
            .to_string();
            hits.push(SearchHit {
                title: pending.title,
                snippet,
                content,
                content_hash,
                conversation_id: pending.conversation_id,
                score: pending.score,
                source_path: pending.source_path,
                agent: pending.agent,
                workspace: pending.workspace,
                workspace_original: pending.workspace_original,
                created_at: pending.created_at,
                line_number: pending.line_number,
                match_type: query_match_type,
                source_id: pending.source_id,
                origin_kind,
                origin_host: pending.origin_host,
            });
        }
        Ok((hits, tantivy_total_count))
    }
6268
6269 #[allow(clippy::too_many_arguments)]
6270 fn search_tantivy_federated(
6271 &self,
6272 readers: &[FederatedIndexReader],
6273 raw_query: &str,
6274 sanitized_query: &str,
6275 filters: SearchFilters,
6276 limit: usize,
6277 field_mask: FieldMask,
6278 ) -> Result<(Vec<SearchHit>, usize)> {
6279 let mut ranked_hits = Vec::new();
6280 let mut total_count = 0usize;
6281
6282 for (shard_index, shard) in readers.iter().enumerate() {
6283 let (shard_hits, shard_total_count) = self.search_tantivy(
6284 &shard.reader,
6285 &shard.fields,
6286 raw_query,
6287 sanitized_query,
6288 filters.clone(),
6289 limit,
6290 0,
6291 field_mask,
6292 )?;
6293 total_count = total_count.saturating_add(shard_total_count);
6294 for (shard_rank, hit) in shard_hits.into_iter().enumerate() {
6295 ranked_hits.push(FederatedRankedHit {
6296 hit,
6297 shard_index,
6298 shard_rank,
6299 fused_score: federated_rrf_score(shard_rank),
6300 });
6301 }
6302 }
6303
6304 let raw_hit_count = ranked_hits.len();
6305 let generation_signature = self.federated_generation_signature(readers);
6306 self.track_generation(generation_signature);
6307 let combined_hits = merge_federated_ranked_hits(ranked_hits);
6308 tracing::debug!(
6309 generation_signature,
6310 shard_count = readers.len(),
6311 total_count,
6312 raw_hit_count,
6313 returned_hit_count = combined_hits.len(),
6314 merge_policy = "rrf_rank_then_stable_hit_key",
6315 "federated lexical search merged shard results"
6316 );
6317
6318 Ok((combined_hits, total_count))
6319 }
6320
6321 fn sqlite_fts_uses_message_id_column(conn: &Connection) -> Result<bool> {
6322 let params: [ParamValue; 0] = [];
6323 let ddl_rows: Vec<String> = franken_query_map_collect_retry(
6324 conn,
6325 "SELECT COALESCE(sql, '')
6326 FROM sqlite_master
6327 WHERE name = 'fts_messages'
6328 ORDER BY rowid DESC
6329 LIMIT 1",
6330 ¶ms,
6331 |row: &frankensqlite::Row| row.get_typed::<String>(0),
6332 )?;
6333 Ok(ddl_rows
6334 .first()
6335 .map(|sql| sql.to_ascii_lowercase().contains("message_id"))
6336 .unwrap_or(false))
6337 }
6338
6339 fn sqlite_fts5_rank_query(
6340 fts_query: &str,
6341 filters: &SearchFilters,
6342 limit: usize,
6343 offset: usize,
6344 uses_message_id: bool,
6345 ) -> (String, Vec<ParamValue>) {
6346 let normalized_source_sql =
6347 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6348 let created_at_expr = "CAST(fts_messages.created_at AS INTEGER)";
6349 let message_key_expr = if uses_message_id {
6350 "CAST(fts_messages.message_id AS INTEGER)"
6351 } else {
6352 "fts_messages.rowid"
6353 };
6354
6355 let mut sql = format!(
6356 "SELECT fts_messages.rowid,
6357 bm25(fts_messages)
6358 FROM fts_messages
6359 LEFT JOIN messages m ON {message_key_expr} = m.id
6360 LEFT JOIN conversations c ON m.conversation_id = c.id
6361 LEFT JOIN sources s ON c.source_id = s.id
6362 WHERE fts_messages MATCH ?"
6363 );
6364 let mut params = Vec::with_capacity(filters.agents.len() + filters.workspaces.len() + 5);
6365 params.push(ParamValue::from(fts_query));
6366
6367 if !filters.agents.is_empty() {
6368 let placeholders = sql_placeholders(filters.agents.len());
6369 sql.push_str(&format!(" AND fts_messages.agent IN ({placeholders})"));
6370 for agent in &filters.agents {
6371 params.push(ParamValue::from(agent.as_str()));
6372 }
6373 }
6374
6375 if !filters.workspaces.is_empty() {
6376 let placeholders = sql_placeholders(filters.workspaces.len());
6377 sql.push_str(&format!(
6378 " AND COALESCE(fts_messages.workspace, '') IN ({placeholders})"
6379 ));
6380 for workspace in &filters.workspaces {
6381 params.push(ParamValue::from(workspace.as_str()));
6382 }
6383 }
6384
6385 if let Some(created_from) = filters.created_from {
6386 sql.push_str(&format!(" AND {created_at_expr} >= ?"));
6387 params.push(ParamValue::from(created_from));
6388 }
6389 if let Some(created_to) = filters.created_to {
6390 sql.push_str(&format!(" AND {created_at_expr} <= ?"));
6391 params.push(ParamValue::from(created_to));
6392 }
6393
6394 match &filters.source_filter {
6395 SourceFilter::All => {}
6396 SourceFilter::Local => sql.push_str(&format!(
6397 " AND {normalized_source_sql} = '{local}'",
6398 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6399 )),
6400 SourceFilter::Remote => sql.push_str(&format!(
6401 " AND {normalized_source_sql} != '{local}'",
6402 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6403 )),
6404 SourceFilter::SourceId(id) => {
6405 sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
6406 params.push(ParamValue::from(normalize_search_source_filter_value(id)));
6407 }
6408 }
6409
6410 sql.push_str(&format!(
6411 " ORDER BY bm25(fts_messages), {message_key_expr}, fts_messages.rowid LIMIT ? OFFSET ?"
6412 ));
6413 params.push(ParamValue::from(limit as i64));
6414 params.push(ParamValue::from(offset as i64));
6415
6416 (sql, params)
6417 }
6418
6419 fn sqlite_fts5_hydrate_query(
6420 row_count: usize,
6421 field_mask: FieldMask,
6422 uses_message_id: bool,
6423 ) -> String {
6424 let title_expr = if field_mask.wants_title() {
6425 "fts_messages.title"
6426 } else {
6427 "''"
6428 };
6429 let content_expr = if field_mask.needs_content() || field_mask.wants_snippet() {
6430 "fts_messages.content"
6431 } else {
6432 "''"
6433 };
6434 let normalized_source_sql =
6435 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6436 let created_at_expr = "CAST(fts_messages.created_at AS INTEGER)";
6437 let message_key_expr = if uses_message_id {
6438 "CAST(fts_messages.message_id AS INTEGER)"
6439 } else {
6440 "fts_messages.rowid"
6441 };
6442 let placeholders = sql_placeholders(row_count);
6443
6444 format!(
6445 "SELECT fts_messages.rowid,
6446 {title_expr},
6447 {content_expr},
6448 fts_messages.agent,
6449 COALESCE(fts_messages.workspace, ''),
6450 fts_messages.source_path,
6451 {created_at_expr},
6452 m.idx,
6453 c.id,
6454 {normalized_source_sql},
6455 c.origin_host,
6456 s.kind
6457 FROM fts_messages
6458 LEFT JOIN messages m ON {message_key_expr} = m.id
6459 LEFT JOIN conversations c ON m.conversation_id = c.id
6460 LEFT JOIN sources s ON c.source_id = s.id
6461 WHERE fts_messages.rowid IN ({placeholders})"
6462 )
6463 }
6464
6465 fn sqlite_fts5_hydrate_row_chunks(
6466 ranked_rows: &[(i64, f64)],
6467 ) -> impl Iterator<Item = &[(i64, f64)]> {
6468 const _: () = assert!(SQLITE_FTS5_HYDRATE_PARAM_CHUNK <= SQLITE_MAX_VARIABLE_NUMBER);
6469 ranked_rows.chunks(SQLITE_FTS5_HYDRATE_PARAM_CHUNK)
6470 }
6471
6472 fn search_sqlite_fts5(
6473 &self,
6474 _db_path: &Path,
6475 raw_query: &str,
6476 filters: SearchFilters,
6477 limit: usize,
6478 offset: usize,
6479 field_mask: FieldMask,
6480 ) -> Result<Vec<SearchHit>> {
6481 let fts_query = match transpile_to_fts5(raw_query) {
6482 Some(q) if !q.trim().is_empty() => q,
6483 _ => return Ok(Vec::new()),
6484 };
6485
6486 let sqlite_guard = self.sqlite_guard()?;
6487 let Some(conn) = sqlite_guard.as_ref() else {
6488 return Ok(Vec::new());
6489 };
6490
6491 let empty_params: [ParamValue; 0] = [];
6492 let has_fts = franken_query_map_collect_retry(
6493 conn,
6494 "SELECT 1 FROM sqlite_master WHERE name = 'fts_messages'",
6495 &empty_params,
6496 |row| row.get_typed::<i64>(0),
6497 )
6498 .map(|rows| !rows.is_empty())
6499 .unwrap_or(false);
6500 if !has_fts {
6501 return Ok(Vec::new());
6502 }
6503
6504 let query_match_type = dominant_match_type(raw_query);
6505 let uses_message_id =
6506 if let Ok(uses_message_id) = Self::sqlite_fts_uses_message_id_column(conn) {
6507 uses_message_id
6508 } else {
6509 tracing::warn!(
6510 "sqlite FTS fallback is present but not queryable; skipping fallback search"
6511 );
6512 return Ok(Vec::new());
6513 };
6514 let (rank_sql, rank_params) = Self::sqlite_fts5_rank_query(
6515 fts_query.as_str(),
6516 &filters,
6517 limit,
6518 offset,
6519 uses_message_id,
6520 );
6521 let ranked_rows: Vec<(i64, f64)> =
6522 match franken_query_map_collect_retry(conn, &rank_sql, &rank_params, |row| {
6523 Ok((row.get_typed(0)?, row.get_typed(1)?))
6524 }) {
6525 Ok(rows) => rows,
6526 Err(err) => {
6527 tracing::warn!(
6528 error = %err,
6529 "sqlite FTS fallback rank query failed; returning no fallback hits"
6530 );
6531 return Ok(Vec::new());
6532 }
6533 };
6534 if ranked_rows.is_empty() {
6535 return Ok(Vec::new());
6536 }
6537
6538 let bm25_by_rowid: HashMap<i64, f64> = ranked_rows.iter().copied().collect();
6539 let mut hits_by_rowid = HashMap::with_capacity(ranked_rows.len());
6540 for rank_chunk in Self::sqlite_fts5_hydrate_row_chunks(&ranked_rows) {
6541 let hydrate_sql =
6542 Self::sqlite_fts5_hydrate_query(rank_chunk.len(), field_mask, uses_message_id);
6543 let hydrate_params = rank_chunk
6544 .iter()
6545 .map(|(fts_rowid, _)| ParamValue::from(*fts_rowid))
6546 .collect::<Vec<_>>();
6547 let rows: Vec<SqliteFtsHydratedRow> =
6548 match franken_query_map_collect_retry(conn, &hydrate_sql, &hydrate_params, |row| {
6549 Ok((
6550 row.get_typed(0)?,
6551 row.get_typed(1)?,
6552 row.get_typed(2)?,
6553 row.get_typed(3)?,
6554 row.get_typed(4)?,
6555 row.get_typed(5)?,
6556 row.get_typed(6)?,
6557 row.get_typed(7)?,
6558 row.get_typed(8)?,
6559 row.get_typed::<Option<String>>(9)?,
6560 row.get_typed(10)?,
6561 row.get_typed(11)?,
6562 ))
6563 }) {
6564 Ok(rows) => rows,
6565 Err(err) => {
6566 tracing::warn!(
6567 error = %err,
6568 "sqlite FTS fallback hydration query failed; returning no fallback hits"
6569 );
6570 return Ok(Vec::new());
6571 }
6572 };
6573 for (
6574 fts_rowid,
6575 title,
6576 raw_content,
6577 agent,
6578 workspace,
6579 source_path,
6580 created_at,
6581 idx,
6582 conversation_id,
6583 raw_source_id,
6584 origin_host,
6585 raw_origin_kind,
6586 ) in rows
6587 {
6588 let Some(&bm25_score) = bm25_by_rowid.get(&fts_rowid) else {
6589 continue;
6590 };
6591 let raw_source_id = raw_source_id.unwrap_or_else(default_source_id);
6592
6593 let source_id = normalized_search_hit_source_id_parts(
6594 raw_source_id.as_str(),
6595 raw_origin_kind.as_deref().unwrap_or_default(),
6596 origin_host.as_deref(),
6597 );
6598 let origin_kind = normalized_search_hit_origin_kind(
6599 source_id.as_str(),
6600 raw_origin_kind.as_deref(),
6601 )
6602 .to_string();
6603 let line_number = idx
6604 .and_then(|i| usize::try_from(i).ok())
6605 .map(|i| i.saturating_add(1));
6606 let snippet = if field_mask.wants_snippet() {
6607 snippet_from_content(&raw_content)
6608 } else {
6609 String::new()
6610 };
6611 let content = if field_mask.needs_content() {
6612 raw_content
6613 } else {
6614 String::new()
6615 };
6616 let content_hash = if content.is_empty() {
6617 stable_hit_hash(&snippet, &source_path, line_number, created_at)
6618 } else {
6619 stable_hit_hash(&content, &source_path, line_number, created_at)
6620 };
6621
6622 let hit = SearchHit {
6623 title,
6624 snippet,
6625 content,
6626 content_hash,
6627 conversation_id,
6628 score: (-bm25_score) as f32,
6629 source_path,
6630 agent,
6631 workspace,
6632 workspace_original: None,
6633 created_at,
6634 line_number,
6635 match_type: query_match_type,
6636 source_id,
6637 origin_kind,
6638 origin_host,
6639 };
6640 hits_by_rowid.insert(fts_rowid, hit);
6641 }
6642 }
6643
6644 let mut hits = Vec::with_capacity(ranked_rows.len());
6645 for (fts_rowid, _) in ranked_rows {
6646 if let Some(hit) = hits_by_rowid.remove(&fts_rowid) {
6647 hits.push(hit);
6648 }
6649 }
6650 Ok(hits)
6651 }
6652
6653 pub fn browse_by_date(
6660 &self,
6661 filters: SearchFilters,
6662 limit: usize,
6663 offset: usize,
6664 newest_first: bool,
6665 field_mask: FieldMask,
6666 ) -> Result<Vec<SearchHit>> {
6667 let sqlite_guard = self.sqlite_guard()?;
6668 if let Some(conn) = sqlite_guard.as_ref() {
6669 self.browse_by_date_sqlite(conn, filters, limit, offset, newest_first, field_mask)
6670 } else {
6671 Ok(Vec::new())
6672 }
6673 }
6674
6675 fn browse_by_date_sqlite(
6676 &self,
6677 conn: &Connection,
6678 filters: SearchFilters,
6679 limit: usize,
6680 offset: usize,
6681 newest_first: bool,
6682 field_mask: FieldMask,
6683 ) -> Result<Vec<SearchHit>> {
6684 let order = if newest_first { "DESC" } else { "ASC" };
6685 let title_expr = if field_mask.wants_title() {
6686 "c.title"
6687 } else {
6688 "''"
6689 };
6690 let normalized_source_sql =
6698 normalized_search_source_id_sql_expr("c.source_id", "s.kind", "c.origin_host");
6699 let mut sql = format!(
6700 "SELECT c.id, {title_expr}, m.content, \
6701 COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'), \
6702 w.path, c.source_path, m.created_at, m.idx, \
6703 {normalized_source_sql}, c.origin_host, s.kind
6704 FROM messages m
6705 JOIN conversations c ON m.conversation_id = c.id
6706 LEFT JOIN workspaces w ON c.workspace_id = w.id
6707 LEFT JOIN sources s ON c.source_id = s.id
6708 WHERE 1=1"
6709 );
6710 let mut params: Vec<ParamValue> = Vec::new();
6711
6712 if !filters.agents.is_empty() {
6713 let placeholders = sql_placeholders(filters.agents.len());
6714 sql.push_str(&format!(
6715 " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug IN ({placeholders}))"
6716 ));
6717 for a in &filters.agents {
6718 params.push(ParamValue::from(a.as_str()));
6719 }
6720 }
6721
6722 if !filters.workspaces.is_empty() {
6723 let placeholders = sql_placeholders(filters.workspaces.len());
6724 sql.push_str(&format!(" AND COALESCE(w.path, '') IN ({placeholders})"));
6725 for w in &filters.workspaces {
6726 params.push(ParamValue::from(w.as_str()));
6727 }
6728 }
6729
6730 if let Some(created_from) = filters.created_from {
6731 sql.push_str(" AND m.created_at >= ?");
6732 params.push(ParamValue::from(created_from));
6733 }
6734 if let Some(created_to) = filters.created_to {
6735 sql.push_str(" AND m.created_at <= ?");
6736 params.push(ParamValue::from(created_to));
6737 }
6738
6739 match &filters.source_filter {
6741 SourceFilter::All => {}
6742 SourceFilter::Local => sql.push_str(&format!(
6743 " AND {normalized_source_sql} = '{local}'",
6744 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6745 )),
6746 SourceFilter::Remote => sql.push_str(&format!(
6747 " AND {normalized_source_sql} != '{local}'",
6748 local = crate::sources::provenance::LOCAL_SOURCE_ID,
6749 )),
6750 SourceFilter::SourceId(id) => {
6751 sql.push_str(&format!(" AND {normalized_source_sql} = ?"));
6752 params.push(ParamValue::from(normalize_search_source_filter_value(id)));
6753 }
6754 }
6755
6756 sql.push_str(&format!(
6757 " ORDER BY CASE WHEN m.created_at IS NULL THEN 1 ELSE 0 END, m.created_at {order}, m.id {order} LIMIT ? OFFSET ?"
6758 ));
6759 params.push(ParamValue::from(limit as i64));
6760 params.push(ParamValue::from(offset as i64));
6761
6762 let rows: Vec<SearchHit> =
6763 conn.query_map_collect(&sql, ¶ms, |row: &frankensqlite::Row| {
6764 let conversation_id: i64 = row.get_typed(0)?;
6765 let title: String = if field_mask.wants_title() {
6766 row.get_typed::<Option<String>>(1)?.unwrap_or_default()
6767 } else {
6768 String::new()
6769 };
6770 let raw_content: String = row.get_typed(2)?;
6771 let agent: String = row.get_typed(3)?;
6772 let workspace: Option<String> = row.get_typed(4)?;
6773 let source_path: String = row.get_typed(5)?;
6774 let created_at: Option<i64> = row.get_typed(6)?;
6775 let idx: Option<i64> = row.get_typed(7)?;
6776 let raw_source_id: String = row
6777 .get_typed::<Option<String>>(8)?
6778 .unwrap_or_else(default_source_id);
6779 let origin_host: Option<String> = row.get_typed(9)?;
6780 let raw_origin_kind: Option<String> = row.get_typed(10)?;
6781 let source_id = normalized_search_hit_source_id_parts(
6782 raw_source_id.as_str(),
6783 raw_origin_kind.as_deref().unwrap_or_default(),
6784 origin_host.as_deref(),
6785 );
6786 let origin_kind = normalized_search_hit_origin_kind(
6787 source_id.as_str(),
6788 raw_origin_kind.as_deref(),
6789 );
6790 let line_number = idx
6791 .and_then(|i| usize::try_from(i).ok())
6792 .map(|i| i.saturating_add(1));
6793 let snippet = if field_mask.wants_snippet() {
6794 snippet_from_content(&raw_content)
6795 } else {
6796 String::new()
6797 };
6798 let content = if field_mask.needs_content() {
6799 raw_content.clone()
6800 } else {
6801 String::new()
6802 };
6803 let content_hash =
6804 stable_hit_hash(&raw_content, &source_path, line_number, created_at);
6805 Ok(SearchHit {
6806 title,
6807 snippet,
6808 content,
6809 content_hash,
6810 conversation_id: Some(conversation_id),
6811 score: 0.0,
6812 source_path,
6813 agent,
6814 workspace: workspace.unwrap_or_default(),
6815 workspace_original: None,
6816 created_at,
6817 line_number,
6818 match_type: MatchType::Exact,
6819 source_id,
6820 origin_kind,
6821 origin_host,
6822 })
6823 })?;
6824 Ok(rows)
6825 }
6826}
6827
/// Public, doc-hidden entry point that forwards `raw_query` unchanged to the private
/// `transpile_to_fts5` and returns its result as-is.
// NOTE(review): the name suggests this exists for fuzz targets — confirm against callers.
#[doc(hidden)]
pub fn fuzz_transpile_to_fts5(raw_query: &str) -> Option<String> {
    transpile_to_fts5(raw_query)
}
6838
6839fn transpile_to_fts5(raw_query: &str) -> Option<String> {
6843 let tokens = fs_cass_parse_boolean_query(raw_query);
6844 if tokens.is_empty() {
6845 return Some("".to_string());
6846 }
6847
6848 let mut fts_clauses: Vec<(&str, String)> = Vec::new();
6849 let mut pending_or_group: Vec<String> = Vec::new();
6850 let mut next_op = "AND";
6851 let mut in_or_sequence = false;
6852 for token in tokens {
6853 match token {
6854 FsCassQueryToken::And => {
6855 if !pending_or_group.is_empty() {
6856 let group = if pending_or_group.len() > 1 {
6857 format!("({})", pending_or_group.join(" OR "))
6858 } else {
6859 pending_or_group.pop().unwrap_or_default()
6860 };
6861 fts_clauses.push(("AND", group));
6862 pending_or_group.clear();
6863 }
6864 in_or_sequence = false;
6865 next_op = "AND";
6866 }
6867 FsCassQueryToken::Or => {
6868 if fts_clauses.is_empty() && pending_or_group.is_empty() {
6869 continue;
6873 }
6874 in_or_sequence = true;
6877 }
6878 FsCassQueryToken::Not => {
6879 if in_or_sequence {
6883 return None;
6884 }
6885
6886 if fts_clauses.is_empty() && pending_or_group.is_empty() {
6887 return None;
6888 }
6889
6890 if !pending_or_group.is_empty() {
6891 let group = if pending_or_group.len() > 1 {
6892 format!("({})", pending_or_group.join(" OR "))
6893 } else {
6894 pending_or_group.pop().unwrap_or_default()
6895 };
6896 fts_clauses.push(("AND", group));
6897 pending_or_group.clear();
6898 }
6899 in_or_sequence = false;
6900 next_op = "NOT";
6901 }
6902 FsCassQueryToken::Term(t) => {
6903 let raw_pattern = FsCassWildcardPattern::parse(&t);
6904 if matches!(
6905 raw_pattern,
6906 FsCassWildcardPattern::Suffix(_)
6907 | FsCassWildcardPattern::Substring(_)
6908 | FsCassWildcardPattern::Complex(_)
6909 ) {
6910 return None;
6911 }
6912
6913 let term_parts = normalize_term_parts(&t);
6917 if term_parts.is_empty() {
6918 continue;
6919 }
6920
6921 let mut rendered_parts = Vec::with_capacity(term_parts.len());
6922 for part in &term_parts {
6923 rendered_parts.push(render_fts5_term_part(part)?);
6924 }
6925
6926 let fts_term = if rendered_parts.len() > 1 {
6929 format!("({})", rendered_parts.join(" AND "))
6930 } else {
6931 rendered_parts[0].clone()
6932 };
6933
6934 if in_or_sequence {
6935 if pending_or_group.is_empty() {
6936 let (op, _) = fts_clauses.last()?;
6937 if *op != "AND" {
6938 return None;
6941 }
6942 let (_, val) = fts_clauses.pop()?;
6943 pending_or_group.push(val);
6944 }
6945 pending_or_group.push(fts_term);
6946 in_or_sequence = true;
6947 } else {
6948 fts_clauses.push((next_op, fts_term));
6949 }
6950 next_op = "AND";
6951 }
6952 FsCassQueryToken::Phrase(p) => {
6953 let phrase_parts = normalize_phrase_terms(&p);
6954 if phrase_parts.is_empty() {
6955 continue;
6956 }
6957 let fts_phrase = format!("\"{}\"", phrase_parts.join(" "));
6958
6959 if in_or_sequence {
6960 if pending_or_group.is_empty() {
6961 let (op, _) = fts_clauses.last()?;
6962 if *op != "AND" {
6963 return None;
6966 }
6967 let (_, val) = fts_clauses.pop()?;
6968 pending_or_group.push(val);
6969 }
6970 pending_or_group.push(fts_phrase);
6971 in_or_sequence = true;
6972 } else {
6973 fts_clauses.push((next_op, fts_phrase));
6974 }
6975 next_op = "AND";
6976 }
6977 }
6978 }
6979
6980 if !pending_or_group.is_empty() {
6981 let group = if pending_or_group.len() > 1 {
6982 format!("({})", pending_or_group.join(" OR "))
6983 } else {
6984 pending_or_group.pop().unwrap_or_default()
6985 };
6986 fts_clauses.push((next_op, group));
6987 }
6988
6989 if fts_clauses.is_empty() {
6990 return Some("".to_string());
6991 }
6992
6993 if fts_clauses.first().is_some_and(|(op, _)| *op == "NOT") {
6996 return None;
6997 }
6998
6999 let mut query = String::new();
7001 for (i, (op, text)) in fts_clauses.into_iter().enumerate() {
7002 if i > 0 {
7003 query.push_str(&format!(" {} ", op));
7004 }
7005 query.push_str(&text);
7006 }
7007
7008 Some(query)
7009}
7010
/// Counters for cache effectiveness, reader reloads and prewarm scheduling.
///
/// Every counter is an `Arc<AtomicU64>`, so cloning a `Metrics` yields a handle onto the
/// same underlying counters rather than an independent copy.
#[derive(Default, Clone)]
struct Metrics {
    /// Incremented by [`Metrics::inc_cache_hits`].
    cache_hits: Arc<AtomicU64>,
    /// Incremented by [`Metrics::inc_cache_miss`].
    cache_miss: Arc<AtomicU64>,
    /// Incremented by [`Metrics::inc_cache_shortfall`].
    cache_shortfall: Arc<AtomicU64>,
    /// Number of reloads recorded.
    reloads: Arc<AtomicU64>,
    /// Sum of recorded reload durations, in whole milliseconds.
    reload_ms_total: Arc<AtomicU64>,
    /// Incremented by [`Metrics::inc_prewarm_scheduled`].
    prewarm_scheduled: Arc<AtomicU64>,
    /// Incremented by [`Metrics::inc_prewarm_skipped_pressure`].
    prewarm_skipped_pressure: Arc<AtomicU64>,
}

impl Metrics {
    /// Adds one to the cache-hit counter.
    fn inc_cache_hits(&self) {
        self.cache_hits.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the cache-miss counter.
    fn inc_cache_miss(&self) {
        self.cache_miss.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the cache-shortfall counter.
    fn inc_cache_shortfall(&self) {
        self.cache_shortfall.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the prewarm-scheduled counter.
    fn inc_prewarm_scheduled(&self) {
        self.prewarm_scheduled.fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the prewarm-skipped-under-pressure counter.
    fn inc_prewarm_skipped_pressure(&self) {
        self.prewarm_skipped_pressure
            .fetch_add(1, Ordering::Relaxed);
    }

    /// Adds one to the reload counter.
    fn inc_reload(&self) {
        self.reloads.fetch_add(1, Ordering::Relaxed);
    }

    /// Records one reload and adds its duration, in whole milliseconds, to the running
    /// total.
    fn record_reload(&self, duration: Duration) {
        self.inc_reload();
        // `as_millis` returns a u128. Saturate rather than truncate with `as`, which would
        // silently wrap a very large duration to a small number.
        let elapsed_ms = u64::try_from(duration.as_millis()).unwrap_or(u64::MAX);
        self.reload_ms_total.fetch_add(elapsed_ms, Ordering::Relaxed);
    }

    /// Returns `(cache_hits, cache_miss, cache_shortfall, reloads, reload_ms_total)`.
    ///
    /// Each counter is loaded separately, so the tuple is not an atomic snapshot.
    fn snapshot_all(&self) -> (u64, u64, u64, u64, u128) {
        (
            self.cache_hits.load(Ordering::Relaxed),
            self.cache_miss.load(Ordering::Relaxed),
            self.cache_shortfall.load(Ordering::Relaxed),
            self.reloads.load(Ordering::Relaxed),
            u128::from(self.reload_ms_total.load(Ordering::Relaxed)),
        )
    }

    /// Returns `(prewarm_scheduled, prewarm_skipped_pressure)`.
    fn snapshot_prewarm(&self) -> (u64, u64) {
        (
            self.prewarm_scheduled.load(Ordering::Relaxed),
            self.prewarm_skipped_pressure.load(Ordering::Relaxed),
        )
    }

    /// Zeroes every counter (test builds only).
    #[cfg(test)]
    #[allow(dead_code)]
    fn reset(&self) {
        self.cache_hits.store(0, Ordering::Relaxed);
        self.cache_miss.store(0, Ordering::Relaxed);
        self.cache_shortfall.store(0, Ordering::Relaxed);
        self.reloads.store(0, Ordering::Relaxed);
        self.reload_ms_total.store(0, Ordering::Relaxed);
        self.prewarm_scheduled.store(0, Ordering::Relaxed);
        self.prewarm_skipped_pressure.store(0, Ordering::Relaxed);
    }
}
7077
7078fn maybe_spawn_warm_worker(
7079 reader: IndexReader,
7080 fields: FsCassFields,
7081 reload_epoch: Arc<AtomicU64>,
7082 metrics: Metrics,
7083) -> Option<(mpsc::Sender<WarmJob>, std::thread::JoinHandle<()>)> {
7084 let (tx, rx) = mpsc::unbounded::<WarmJob>();
7085 let handle = std::thread::Builder::new()
7086 .name("cass-warm-worker".into())
7087 .spawn(move || {
7088 let mut last_run = Instant::now();
7090 while let Ok(job) = rx.recv() {
7091 let now = Instant::now();
7092 if now.duration_since(last_run) < Duration::from_millis(*WARM_DEBOUNCE_MS) {
7093 continue;
7094 }
7095 last_run = now;
7096 let reload_started = Instant::now();
7097 if let Err(err) = reader.reload() {
7098 tracing::warn!(error = ?err, "warm_worker_reload_failed");
7099 continue;
7100 }
7101 let elapsed = reload_started.elapsed();
7102 let epoch = reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
7103 metrics.record_reload(elapsed);
7104 tracing::debug!(
7105 duration_ms = elapsed.as_millis() as u64,
7106 reload_epoch = epoch,
7107 filters = %job.filters_fingerprint,
7108 shard = %job.shard_name,
7109 "warm_worker_reload"
7110 );
7111 let searcher = reader.searcher();
7114 let mut clauses: Vec<(Occur, Box<dyn Query>)> = Vec::new();
7115 for term_str in job.query.split_whitespace() {
7116 let term_lower = term_str.to_lowercase();
7117 let term_shoulds: Vec<(Occur, Box<dyn Query>)> = vec![
7118 (
7119 Occur::Should,
7120 Box::new(TermQuery::new(
7121 Term::from_field_text(fields.title, &term_lower),
7122 IndexRecordOption::WithFreqsAndPositions,
7123 )),
7124 ),
7125 (
7126 Occur::Should,
7127 Box::new(TermQuery::new(
7128 Term::from_field_text(fields.content, &term_lower),
7129 IndexRecordOption::WithFreqsAndPositions,
7130 )),
7131 ),
7132 ];
7133 clauses.push((Occur::Must, Box::new(BooleanQuery::new(term_shoulds))));
7134 }
7135 if !clauses.is_empty() {
7136 let q: Box<dyn Query> = Box::new(BooleanQuery::new(clauses));
7137 let _ = searcher.search(&q, &TopDocs::with_limit(1).order_by_score());
7138 }
7139 }
7140 })
7141 .ok()?;
7142 Some((tx, handle))
7143}
7144
7145fn cached_hit_from(hit: &SearchHit) -> CachedHit {
7146 let cache_text = if hit.content.is_empty() {
7147 hit.snippet.as_str()
7148 } else {
7149 hit.content.as_str()
7150 };
7151 let lc_content = cache_text.to_lowercase();
7152 let lc_title = (!hit.title.is_empty()).then(|| hit.title.to_lowercase());
7153 let bloom64 = bloom_from_text(&lc_content, &lc_title);
7155 CachedHit {
7156 hit: hit.clone(),
7157 lc_content,
7158 lc_title,
7159 bloom64,
7160 }
7161}
7162
7163fn bloom_from_text(content: &str, title: &Option<String>) -> u64 {
7164 let mut bits = 0u64;
7165 for token in token_stream(content) {
7166 bits |= hash_token(token);
7167 }
7168 if let Some(t) = title {
7169 for token in token_stream(t) {
7170 bits |= hash_token(token);
7171 }
7172 }
7173 bits
7174}
7175
/// Iterates over the maximal runs of alphanumeric characters in `text`.
///
/// Every non-alphanumeric character acts as a separator; empty pieces between
/// adjacent separators are skipped.
fn token_stream(text: &str) -> impl Iterator<Item = &str> {
    fn is_separator(ch: char) -> bool {
        !ch.is_alphanumeric()
    }

    text.split(is_separator).filter(|piece| !piece.is_empty())
}
7180
/// Maps a token to a single bit of a 64-bit Bloom mask.
///
/// The token's bytes are folded with the multiply-by-33-and-add recurrence,
/// starting from 5381 and using wrapping arithmetic; the result modulo 64
/// selects which bit is set.
fn hash_token(tok: &str) -> u64 {
    let digest = tok.bytes().fold(5381u64, |acc, byte| {
        acc.wrapping_mul(33).wrapping_add(u64::from(byte))
    });
    1u64 << (digest % 64)
}
7189
7190struct QueryTermsLower {
7200 query_lower: String,
7202 token_ranges: Vec<(usize, usize)>,
7204 bloom_mask: u64,
7206}
7207
7208impl QueryTermsLower {
7209 fn from_query(query: &str) -> Self {
7211 if query.is_empty() {
7212 return Self {
7213 query_lower: String::new(),
7214 token_ranges: Vec::new(),
7215 bloom_mask: 0,
7216 };
7217 }
7218
7219 let query_lower = query.to_lowercase();
7220 let mut token_ranges = Vec::new();
7221 let mut bloom_mask = 0u64;
7222
7223 let mut start = None;
7225 for (i, c) in query_lower.char_indices() {
7226 if c.is_alphanumeric() {
7227 if start.is_none() {
7228 start = Some(i);
7229 }
7230 } else if let Some(s) = start.take() {
7231 let token = &query_lower[s..i];
7232 bloom_mask |= hash_token(token);
7233 token_ranges.push((s, i));
7234 }
7235 }
7236 if let Some(s) = start {
7238 let token = &query_lower[s..];
7239 bloom_mask |= hash_token(token);
7240 token_ranges.push((s, query_lower.len()));
7241 }
7242
7243 Self {
7244 query_lower,
7245 token_ranges,
7246 bloom_mask,
7247 }
7248 }
7249
7250 #[inline]
7252 fn is_empty(&self) -> bool {
7253 self.token_ranges.is_empty()
7254 }
7255
7256 #[inline]
7258 fn tokens(&self) -> impl Iterator<Item = &str> {
7259 self.token_ranges
7260 .iter()
7261 .map(|(s, e)| &self.query_lower[*s..*e])
7262 }
7263
7264 #[inline]
7266 fn bloom_mask(&self) -> u64 {
7267 self.bloom_mask
7268 }
7269}
7270
7271fn hit_matches_query_cached_precomputed(hit: &CachedHit, terms: &QueryTermsLower) -> bool {
7274 if terms.is_empty() {
7275 return true;
7276 }
7277
7278 if hit.bloom64 & terms.bloom_mask() != terms.bloom_mask() {
7280 return false;
7281 }
7282
7283 terms.tokens().all(|t| {
7285 if token_stream(&hit.lc_content).any(|word| word.starts_with(t)) {
7287 return true;
7288 }
7289 if let Some(title) = &hit.lc_title
7291 && token_stream(title).any(|word| word.starts_with(t))
7292 {
7293 return true;
7294 }
7295 false
7296 })
7297}
7298
/// Test-only convenience: tokenizes `query` and checks it against `hit`.
#[cfg(test)]
fn hit_matches_query_cached(hit: &CachedHit, query: &str) -> bool {
    hit_matches_query_cached_precomputed(hit, &QueryTermsLower::from_query(query))
}
7306
/// Returns `true` when `query` consists of exactly one whitespace-separated
/// token and every character of that token is alphanumeric.
fn is_prefix_only(query: &str) -> bool {
    let mut words = query.split_whitespace();
    match (words.next(), words.next()) {
        (Some(only), None) => only.chars().all(char::is_alphanumeric),
        _ => false,
    }
}
7316
/// Builds a short snippet of `content` with the first case-insensitive
/// occurrence of `query` wrapped in `**` markers.
///
/// * With an empty `query`, or when `query` does not occur in `content`, the
///   result is the first `max_chars` characters of `content`.
/// * Otherwise the snippet starts up to 15 characters before the match and
///   keeps emitting characters until `max_chars` characters have been taken.
///   The highlighted match is always emitted whole once it is reached, even if
///   that exceeds `max_chars`. If `max_chars` is used up by the leading context
///   the match is not reached at all.
///
/// A trailing `…` is appended whenever content remains after the snippet.
/// The `**` markers and the ellipsis do not count towards `max_chars`.
fn quick_prefix_snippet(content: &str, query: &str, max_chars: usize) -> String {
    /// Characters of context kept in front of the match.
    const CONTEXT_CHARS: usize = 15;

    /// First `max_chars` characters of `text`, with `…` when truncated.
    fn leading_excerpt(text: &str, max_chars: usize) -> String {
        let mut chars = text.chars();
        let excerpt: String = chars.by_ref().take(max_chars).collect();
        if chars.next().is_some() {
            format!("{excerpt}…")
        } else {
            excerpt
        }
    }

    /// Maps the byte span `lc_start..lc_end` of `content.to_lowercase()` back to
    /// a half-open range of character indices in `content`.
    ///
    /// Lowercasing can turn one character into several (U+0130 becomes `i` plus
    /// U+0307), so character counts taken from the lowercase string cannot be
    /// applied to the original directly. Walking the original and summing the
    /// UTF-8 length of each character's lowercase form keeps both in step.
    fn original_char_span(content: &str, lc_start: usize, lc_end: usize) -> (usize, usize) {
        let mut lc_offset = 0usize;
        let mut first = 0usize;
        let mut end = 0usize;
        for (idx, ch) in content.chars().enumerate() {
            if lc_offset >= lc_end {
                break;
            }
            if lc_offset <= lc_start {
                first = idx;
            }
            end = idx + 1;
            lc_offset += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        }
        (first, end)
    }

    if query.is_empty() {
        return leading_excerpt(content, max_chars);
    }

    let lc_content = content.to_lowercase();
    let lc_query = query.to_lowercase();
    let Some(pos) = lc_content.find(&lc_query) else {
        return leading_excerpt(content, max_chars);
    };

    let (match_start, match_end) = original_char_span(content, pos, pos + lc_query.len());
    let window_start = match_start.saturating_sub(CONTEXT_CHARS);
    let lead = match_start - window_start;

    let mut chars = content.chars().skip(window_start);
    let mut snippet = String::new();
    let mut taken = 0usize;

    // Leading context, cut short if the budget is smaller than the lead.
    for ch in chars.by_ref().take(lead.min(max_chars)) {
        snippet.push(ch);
        taken += 1;
    }

    // The match is only reached when budget is left after the leading context.
    if taken == lead && taken < max_chars {
        snippet.push_str("**");
        for ch in chars.by_ref().take(match_end - match_start) {
            snippet.push(ch);
            taken += 1;
        }
        snippet.push_str("**");

        // Trailing context with whatever budget remains (possibly none).
        for ch in chars.by_ref().take(max_chars.saturating_sub(taken)) {
            snippet.push(ch);
        }
    }

    if chars.next().is_some() {
        format!("{snippet}…")
    } else {
        snippet
    }
}
7385
/// Builds a highlighted snippet for a cached hit, or `None` when there is
/// nothing to highlight.
///
/// Returns `None` when `query` is empty or whitespace-only, or when `query`
/// (compared case-insensitively, untrimmed) does not occur in `content`.
/// Otherwise the snippet starts up to 15 characters before the match, wraps the
/// match in `**`, and keeps emitting characters until `max_chars` characters
/// have been taken. The match is emitted whole once reached; if the leading
/// context uses up `max_chars`, the match is not reached. A trailing `…` marks
/// remaining content. Markers and ellipsis do not count towards `max_chars`.
fn cached_prefix_snippet(content: &str, query: &str, max_chars: usize) -> Option<String> {
    /// Characters of context kept in front of the match.
    const CONTEXT_CHARS: usize = 15;

    /// Maps the byte span `lc_start..lc_end` of `content.to_lowercase()` back to
    /// a half-open range of character indices in `content`.
    ///
    /// Lowercasing can turn one character into several (U+0130 becomes `i` plus
    /// U+0307), so character counts taken from the lowercase string cannot be
    /// applied to the original directly. Walking the original and summing the
    /// UTF-8 length of each character's lowercase form keeps both in step.
    fn original_char_span(content: &str, lc_start: usize, lc_end: usize) -> (usize, usize) {
        let mut lc_offset = 0usize;
        let mut first = 0usize;
        let mut end = 0usize;
        for (idx, ch) in content.chars().enumerate() {
            if lc_offset >= lc_end {
                break;
            }
            if lc_offset <= lc_start {
                first = idx;
            }
            end = idx + 1;
            lc_offset += ch.to_lowercase().map(char::len_utf8).sum::<usize>();
        }
        (first, end)
    }

    if query.trim().is_empty() {
        return None;
    }

    let lc_content = content.to_lowercase();
    let lc_query = query.to_lowercase();
    let pos = lc_content.find(&lc_query)?;

    let (match_start, match_end) = original_char_span(content, pos, pos + lc_query.len());
    let window_start = match_start.saturating_sub(CONTEXT_CHARS);
    let lead = match_start - window_start;

    let mut chars = content.chars().skip(window_start);
    let mut snippet = String::new();
    let mut taken = 0usize;

    // Leading context, cut short if the budget is smaller than the lead.
    for ch in chars.by_ref().take(lead.min(max_chars)) {
        snippet.push(ch);
        taken += 1;
    }

    // The match is only reached when budget is left after the leading context.
    if taken == lead && taken < max_chars {
        snippet.push_str("**");
        for ch in chars.by_ref().take(match_end - match_start) {
            snippet.push(ch);
            taken += 1;
        }
        snippet.push_str("**");

        // Trailing context with whatever budget remains (possibly none).
        for ch in chars.by_ref().take(max_chars.saturating_sub(taken)) {
            snippet.push(ch);
        }
    }

    Some(if chars.next().is_some() {
        format!("{snippet}…")
    } else {
        snippet
    })
}
7435
7436fn filters_fingerprint(filters: &SearchFilters) -> String {
7437 let mut parts = Vec::new();
7438 if !filters.agents.is_empty() {
7439 let mut v: Vec<_> = filters.agents.iter().cloned().collect();
7440 v.sort();
7441 parts.push(format!("a:{v:?}"));
7442 }
7443 if !filters.workspaces.is_empty() {
7444 let mut v: Vec<_> = filters.workspaces.iter().cloned().collect();
7445 v.sort();
7446 parts.push(format!("w:{v:?}"));
7447 }
7448 if let Some(f) = filters.created_from {
7449 parts.push(format!("from:{f}"));
7450 }
7451 if let Some(t) = filters.created_to {
7452 parts.push(format!("to:{t}"));
7453 }
7454 if !matches!(
7456 filters.source_filter,
7457 crate::sources::provenance::SourceFilter::All
7458 ) {
7459 parts.push(format!("src:{:?}", filters.source_filter));
7460 }
7461 if !filters.session_paths.is_empty() {
7463 let mut v: Vec<_> = filters.session_paths.iter().cloned().collect();
7464 v.sort();
7465 parts.push(format!("sp:{v:?}"));
7466 }
7467 parts.join("|")
7468}
7469
7470impl SearchClient {
7471 pub fn total_docs(&self) -> usize {
7473 if let Some((reader, _)) = &self.reader {
7474 return reader.searcher().num_docs() as usize;
7475 }
7476 self.federated_readers()
7477 .map(|readers| {
7478 readers
7479 .iter()
7480 .map(|shard| shard.reader.searcher().num_docs() as usize)
7481 .sum()
7482 })
7483 .unwrap_or(0)
7484 }
7485
7486 pub fn has_tantivy(&self) -> bool {
7488 self.reader.is_some() || self.federated_readers().is_some()
7489 }
7490
7491 fn maybe_reload_reader(&self, reader: &IndexReader) -> Result<()> {
7492 if !self.reload_on_search {
7493 return Ok(());
7494 }
7495 const MIN_RELOAD_INTERVAL: Duration = Duration::from_millis(300);
7496 let now = Instant::now();
7497 let mut guard = self.last_reload.lock().unwrap_or_else(|e| e.into_inner());
7498 if guard
7499 .map(|t| now.duration_since(t) >= MIN_RELOAD_INTERVAL)
7500 .unwrap_or(true)
7501 {
7502 let reload_started = Instant::now();
7503 reader.reload()?;
7504 let elapsed = reload_started.elapsed();
7505 *guard = Some(now);
7506 let epoch = self.reload_epoch.fetch_add(1, Ordering::SeqCst) + 1;
7507 self.metrics.record_reload(elapsed);
7508 tracing::debug!(
7509 duration_ms = elapsed.as_millis() as u64,
7510 reload_epoch = epoch,
7511 "tantivy_reader_reload"
7512 );
7513 }
7514 Ok(())
7515 }
7516
7517 fn maybe_log_cache_metrics(&self, event: &str) {
7518 if !*CACHE_DEBUG_ENABLED {
7519 return;
7520 }
7521 let stats = self.cache_stats();
7522 tracing::debug!(
7523 event = event,
7524 hits = stats.cache_hits,
7525 miss = stats.cache_miss,
7526 shortfall = stats.cache_shortfall,
7527 reloads = stats.reloads,
7528 reload_ms_total = stats.reload_ms_total,
7529 total_cap = stats.total_cap,
7530 total_cost = stats.total_cost,
7531 evictions = stats.eviction_count,
7532 approx_bytes = stats.approx_bytes,
7533 byte_cap = stats.byte_cap,
7534 eviction_policy = stats.eviction_policy,
7535 ghost_entries = stats.ghost_entries,
7536 admission_rejects = stats.admission_rejects,
7537 "cache_metrics"
7538 );
7539 }
7540
7541 fn cache_key(&self, query: &str, filters: &SearchFilters) -> Arc<str> {
7544 let key_str = format!(
7545 "{}|{}::{}",
7546 self.cache_namespace,
7547 query,
7548 filters_fingerprint(filters)
7549 );
7550 intern_cache_key(&key_str)
7551 }
7552
7553 fn shard_name(&self, filters: &SearchFilters) -> String {
7554 if filters.agents.len() == 1 {
7555 format!(
7556 "agent:{}",
7557 filters
7558 .agents
7559 .iter()
7560 .next()
7561 .cloned()
7562 .unwrap_or_else(|| "global".into())
7563 )
7564 } else if filters.workspaces.len() == 1 {
7565 format!(
7566 "workspace:{}",
7567 filters
7568 .workspaces
7569 .iter()
7570 .next()
7571 .cloned()
7572 .unwrap_or_else(|| "global".into())
7573 )
7574 } else {
7575 "global".into()
7576 }
7577 }
7578 fn cached_prefix_key_exists_in_shard(
7579 &self,
7580 shard: &LruCache<Arc<str>, Vec<CachedHit>>,
7581 query: &str,
7582 filters: &SearchFilters,
7583 ) -> bool {
7584 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
7585 byte_indices.push(query.len());
7586 let query_len = query.len();
7587 for &end in byte_indices.iter().rev() {
7588 if end == 0 || end == query_len {
7589 continue;
7590 }
7591 let key = self.cache_key(&query[..end], filters);
7592 if shard.contains(&key) {
7593 return true;
7594 }
7595 }
7596 false
7597 }
7598
7599 fn maybe_schedule_adaptive_query_prewarm(&self, query: &str, filters: &SearchFilters) {
7600 if query.is_empty() {
7601 return;
7602 }
7603 let Some(tx) = &self.warm_tx else {
7604 return;
7605 };
7606
7607 let shard_name = self.shard_name(filters);
7608 let decision = match self.prefix_cache.lock() {
7609 Ok(cache) => {
7610 let hot_prefix = cache.shard_opt(&shard_name).is_some_and(|shard| {
7611 self.cached_prefix_key_exists_in_shard(shard, query, filters)
7612 });
7613 if !hot_prefix {
7614 AdaptivePrewarmDecision::SkipCold
7615 } else if cache.prewarm_pressure() {
7616 AdaptivePrewarmDecision::SkipPressure
7617 } else {
7618 AdaptivePrewarmDecision::Schedule
7619 }
7620 }
7621 Err(_) => return,
7622 };
7623
7624 if decision == AdaptivePrewarmDecision::SkipPressure {
7625 self.metrics.inc_prewarm_skipped_pressure();
7626 return;
7627 }
7628 if decision == AdaptivePrewarmDecision::SkipCold {
7629 return;
7630 }
7631
7632 if tx
7633 .send(WarmJob {
7634 query: query.to_string(),
7635 filters_fingerprint: filters_fingerprint(filters),
7636 shard_name,
7637 })
7638 .is_ok()
7639 {
7640 self.metrics.inc_prewarm_scheduled();
7641 }
7642 }
7643
7644 fn cached_prefix_hits(&self, query: &str, filters: &SearchFilters) -> Option<Vec<CachedHit>> {
7645 if query.is_empty() {
7646 return None;
7647 }
7648 let cache = self.prefix_cache.lock().ok()?;
7649 let shard_name = self.shard_name(filters);
7650 let shard = cache.shard_opt(&shard_name)?;
7651 let mut byte_indices: Vec<usize> = query.char_indices().map(|(i, _)| i).collect();
7653 byte_indices.push(query.len());
7654 for &end in byte_indices.iter().rev() {
7655 if end == 0 {
7656 continue;
7657 }
7658 let key = self.cache_key(&query[..end], filters);
7659 if let Some(hits) = shard.peek(&key) {
7661 return Some(hits.clone());
7662 }
7663 }
7664 None
7665 }
7666
7667 fn put_cache(&self, query: &str, filters: &SearchFilters, hits: &[SearchHit]) {
7668 if query.is_empty() || hits.is_empty() {
7669 return;
7670 }
7671 if let Ok(mut cache) = self.prefix_cache.lock() {
7672 let shard_name = self.shard_name(filters);
7673 let key = self.cache_key(query, filters);
7674 let cached_hits: Vec<CachedHit> = hits.iter().map(cached_hit_from).collect();
7675 cache.put(&shard_name, key, cached_hits);
7676 }
7677 }
7678
7679 pub fn cache_stats(&self) -> CacheStats {
7680 let (hits, miss, shortfall, reloads, reload_ms_total) = self.metrics.snapshot_all();
7681 let (prewarm_scheduled, prewarm_skipped_pressure) = self.metrics.snapshot_prewarm();
7682 let reader_generation = self.last_generation.lock().ok().and_then(|guard| *guard);
7683 let (
7684 total_cap,
7685 total_cost,
7686 eviction_count,
7687 approx_bytes,
7688 byte_cap,
7689 eviction_policy,
7690 ghost_entries,
7691 admission_rejects,
7692 ) = if let Ok(cache) = self.prefix_cache.lock() {
7693 (
7694 cache.total_cap(),
7695 cache.total_cost(),
7696 cache.eviction_count(),
7697 cache.total_bytes(),
7698 cache.byte_cap(),
7699 cache.policy_label(),
7700 cache.ghost_entries(),
7701 cache.admission_rejects(),
7702 )
7703 } else {
7704 (0, 0, 0, 0, 0, "unknown", 0, 0)
7705 };
7706 CacheStats {
7707 cache_hits: hits,
7708 cache_miss: miss,
7709 cache_shortfall: shortfall,
7710 reloads,
7711 reload_ms_total,
7712 total_cap,
7713 total_cost,
7714 eviction_count,
7715 approx_bytes,
7716 byte_cap,
7717 eviction_policy,
7718 ghost_entries,
7719 admission_rejects,
7720 prewarm_scheduled,
7721 prewarm_skipped_pressure,
7722 reader_generation,
7723 }
7724 }
7725}
7726
7727#[cfg(test)]
7728mod tests {
7729 use super::*;
7730 use crate::connectors::{NormalizedConversation, NormalizedMessage, NormalizedSnippet};
7731 use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
7732 use crate::search::tantivy::TantivyIndex;
7733 use crate::storage::sqlite::FrankenStorage;
7734 use frankensqlite::Connection as FrankenConnection;
7735 use frankensqlite::compat::{ParamValue, params_from_iter};
7736 use serde_json::json;
7737 use tempfile::TempDir;
7738
7739 fn search_hit_key_doc_id_reference_v0(key: &SearchHitKey) -> String {
7743 let sep = '\u{1f}';
7744 format!(
7745 "{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}{sep}{}",
7746 key.source_id,
7747 key.source_path,
7748 key.conversation_id
7749 .map(|v| v.to_string())
7750 .unwrap_or_default(),
7751 key.title,
7752 key.line_number.map(|v| v.to_string()).unwrap_or_default(),
7753 key.created_at.map(|v| v.to_string()).unwrap_or_default(),
7754 key.content_hash,
7755 )
7756 }
7757
    /// Reference implementation of the stable hit hash, used by tests to check
    /// the production `stable_hit_hash` against.
    ///
    /// Feeds an XXH3 hasher, in this order: the little-endian bytes of
    /// `stable_content_hash(content)` (skipped for empty content), `|`, the
    /// source path bytes, `|`, the decimal line number (if any), `|`, and the
    /// decimal timestamp (if any). The exact byte sequence is what the
    /// comparison test relies on, so the statement order must not change.
    fn stable_hit_hash_reference_v0(
        content: &str,
        source_path: &str,
        line_number: Option<usize>,
        created_at: Option<i64>,
    ) -> u64 {
        use xxhash_rust::xxh3::Xxh3;

        let mut hasher = Xxh3::new();
        // Empty content contributes no bytes at all, not a hash of "".
        if !content.is_empty() {
            hasher.update(&stable_content_hash(content).to_le_bytes());
        }
        hasher.update(b"|");
        hasher.update(source_path.as_bytes());
        hasher.update(b"|");
        if let Some(line) = line_number {
            hasher.update(line.to_string().as_bytes());
        }
        hasher.update(b"|");
        if let Some(ts) = created_at {
            hasher.update(ts.to_string().as_bytes());
        }
        hasher.digest()
    }
7782
7783 fn vector_result(message_id: u64, score: f32) -> VectorSearchResult {
7784 VectorSearchResult {
7785 message_id,
7786 chunk_idx: 0,
7787 score,
7788 }
7789 }
7790
7791 #[test]
7792 fn semantic_exact_candidate_limit_overfetches_chunks_without_full_scan() {
7793 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 1_000), 40);
7794 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 25), 25);
7795 assert_eq!(SearchClient::semantic_exact_candidate_limit(0, 1_000), 0);
7796 assert_eq!(SearchClient::semantic_exact_candidate_limit(10, 0), 0);
7797 }
7798
7799 #[test]
7800 fn semantic_window_detects_possible_hidden_chunk_competitors() {
7801 let complete = vec![
7802 vector_result(1, 0.9),
7803 vector_result(2, 0.8),
7804 vector_result(3, 0.7),
7805 ];
7806 assert!(
7807 !SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.6)),
7808 "strictly lower omitted chunks cannot alter the top message window"
7809 );
7810 assert!(
7811 SearchClient::semantic_window_may_omit_competitor(&complete, 3, Some(0.7)),
7812 "equal-score omitted chunks can still alter deterministic tie-breaking"
7813 );
7814
7815 let duplicate_collapsed_shortfall = vec![vector_result(1, 0.9)];
7816 assert!(
7817 SearchClient::semantic_window_may_omit_competitor(
7818 &duplicate_collapsed_shortfall,
7819 3,
7820 Some(0.2),
7821 ),
7822 "a short collapsed window means high-scoring duplicate chunks may have hidden messages"
7823 );
7824 assert!(!SearchClient::semantic_window_may_omit_competitor(
7825 &complete, 3, None
7826 ));
7827 }
7828
7829 #[test]
7830 fn stable_hit_hash_matches_reference_and_is_deterministic() {
7831 let fixtures = [
7832 ("", "", None, None),
7833 (
7834 "same content\nnormalized",
7835 "/tmp/session.jsonl",
7836 Some(1),
7837 Some(0),
7838 ),
7839 (
7840 "tool output with repeated whitespace",
7841 "/tmp/path with spaces.jsonl",
7842 Some(42),
7843 Some(1_700_000_000_000),
7844 ),
7845 (
7846 "unicode stays in the content hash path: café",
7847 "/remote/host/session.jsonl",
7848 Some(usize::MAX),
7849 Some(i64::MIN),
7850 ),
7851 (
7852 "negative timestamp fixture",
7853 "/tmp/negative.jsonl",
7854 None,
7855 Some(-123_456),
7856 ),
7857 ];
7858
7859 for (content, source_path, line_number, created_at) in fixtures {
7860 let optimized = stable_hit_hash(content, source_path, line_number, created_at);
7861 let repeated = stable_hit_hash(content, source_path, line_number, created_at);
7862 let reference =
7863 stable_hit_hash_reference_v0(content, source_path, line_number, created_at);
7864
7865 assert_eq!(optimized, repeated);
7866 assert_eq!(optimized, reference);
7867 }
7868 }
7869
7870 #[test]
7871 fn semantic_message_id_from_db_rejects_negative_values() {
7872 let err = semantic_message_id_from_db(-1).expect_err("negative DB ids must be rejected");
7873 assert!(
7874 err.to_string().contains("negative message_id"),
7875 "unexpected error: {err}"
7876 );
7877 assert_eq!(semantic_message_id_from_db(42).expect("positive id"), 42);
7878 }
7879
7880 #[test]
7881 fn semantic_doc_component_id_from_db_clamps_bounds() {
7882 assert_eq!(semantic_doc_component_id_from_db(None), 0);
7883 assert_eq!(semantic_doc_component_id_from_db(Some(-7)), 0);
7884 assert_eq!(semantic_doc_component_id_from_db(Some(0)), 0);
7885 assert_eq!(semantic_doc_component_id_from_db(Some(7)), 7);
7886 assert_eq!(
7887 semantic_doc_component_id_from_db(Some(i64::from(u32::MAX) + 123)),
7888 u32::MAX
7889 );
7890 }
7891
7892 #[test]
7893 fn search_hit_key_doc_id_matches_reference_byte_for_byte() {
7894 let fixtures = [
7895 SearchHitKey {
7896 source_id: "local".into(),
7897 source_path: "/tmp/path.jsonl".into(),
7898 conversation_id: Some(42),
7899 title: "Demo chat".into(),
7900 line_number: Some(7),
7901 created_at: Some(1_700_000_000_000),
7902 content_hash: 0xdead_beef_u64,
7903 },
7904 SearchHitKey {
7905 source_id: "ssh:host".into(),
7906 source_path: "/remote/path with spaces.jsonl".into(),
7907 conversation_id: None,
7908 title: String::new(),
7909 line_number: None,
7910 created_at: None,
7911 content_hash: 0,
7912 },
7913 SearchHitKey {
7914 source_id: String::new(),
7915 source_path: String::new(),
7916 conversation_id: Some(i64::MIN),
7917 title: "unicode title — héllo".into(),
7918 line_number: Some(usize::MAX),
7919 created_at: Some(i64::MAX),
7920 content_hash: u64::MAX,
7921 },
7922 SearchHitKey {
7923 source_id: "a".into(),
7924 source_path: "b".into(),
7925 conversation_id: Some(0),
7926 title: "c".into(),
7927 line_number: Some(0),
7928 created_at: Some(0),
7929 content_hash: 0,
7930 },
7931 SearchHitKey {
7932 source_id: "with\u{1f}separator".into(),
7933 source_path: "with\u{1f}separator".into(),
7934 conversation_id: Some(-1),
7935 title: "with\u{1f}separator".into(),
7936 line_number: None,
7937 created_at: Some(-1),
7938 content_hash: 1,
7939 },
7940 ];
7941 for (idx, key) in fixtures.iter().enumerate() {
7942 let optimized = search_hit_key_doc_id(key);
7943 let reference = search_hit_key_doc_id_reference_v0(key);
7944 assert_eq!(
7945 optimized, reference,
7946 "fixture {idx} produced divergent doc_id; byte-exact dedup key is a contract"
7947 );
7948 }
7949
7950 let structural_key = SearchHitKey {
7955 source_id: "clean".into(),
7956 source_path: "/no/separators/here.jsonl".into(),
7957 conversation_id: Some(1),
7958 title: "plain title".into(),
7959 line_number: Some(2),
7960 created_at: Some(3),
7961 content_hash: 4,
7962 };
7963 let encoded = search_hit_key_doc_id(&structural_key);
7964 assert_eq!(
7965 encoded.matches('\u{1f}').count(),
7966 6,
7967 "structural fixture must contain exactly six 0x1F separators; got {encoded:?}"
7968 );
7969 }
7970
7971 #[derive(Debug)]
7972 struct FixedTestEmbedder {
7973 id: String,
7974 vector: Vec<f32>,
7975 }
7976
7977 impl FixedTestEmbedder {
7978 fn new(id: &str, vector: &[f32]) -> Self {
7979 Self {
7980 id: id.to_string(),
7981 vector: vector.to_vec(),
7982 }
7983 }
7984 }
7985
7986 #[derive(Debug)]
7987 struct BlockingTestEmbedder {
7988 id: String,
7989 vector: Vec<f32>,
7990 started_tx: Mutex<Option<std::sync::mpsc::Sender<()>>>,
7991 unblock_rx: Mutex<std::sync::mpsc::Receiver<()>>,
7992 }
7993
7994 impl BlockingTestEmbedder {
7995 fn new(
7996 id: &str,
7997 vector: &[f32],
7998 started_tx: std::sync::mpsc::Sender<()>,
7999 unblock_rx: std::sync::mpsc::Receiver<()>,
8000 ) -> Self {
8001 Self {
8002 id: id.to_string(),
8003 vector: vector.to_vec(),
8004 started_tx: Mutex::new(Some(started_tx)),
8005 unblock_rx: Mutex::new(unblock_rx),
8006 }
8007 }
8008 }
8009
8010 impl crate::search::embedder::Embedder for BlockingTestEmbedder {
8011 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
8012 if let Ok(mut guard) = self.started_tx.lock()
8013 && let Some(tx) = guard.take()
8014 {
8015 let _ = tx.send(());
8016 }
8017 self.unblock_rx
8018 .lock()
8019 .expect("blocking embedder receiver")
8020 .recv()
8021 .expect("blocking embedder unblock signal");
8022 Ok(self.vector.clone())
8023 }
8024
8025 fn dimension(&self) -> usize {
8026 self.vector.len()
8027 }
8028
8029 fn id(&self) -> &str {
8030 &self.id
8031 }
8032
8033 fn is_semantic(&self) -> bool {
8034 false
8035 }
8036
8037 fn category(&self) -> frankensearch::ModelCategory {
8038 frankensearch::ModelCategory::HashEmbedder
8039 }
8040 }
8041
8042 impl crate::search::embedder::Embedder for FixedTestEmbedder {
8043 fn embed_sync(&self, _text: &str) -> crate::search::embedder::EmbedderResult<Vec<f32>> {
8044 Ok(self.vector.clone())
8045 }
8046
8047 fn dimension(&self) -> usize {
8048 self.vector.len()
8049 }
8050
8051 fn id(&self) -> &str {
8052 &self.id
8053 }
8054
8055 fn is_semantic(&self) -> bool {
8056 false
8057 }
8058
8059 fn category(&self) -> frankensearch::ModelCategory {
8060 frankensearch::ModelCategory::HashEmbedder
8061 }
8062 }
8063
    /// Bundle of state shared by the semantic-search tests.
    struct SemanticTestFixture {
        /// Temporary directory owned by the fixture; never read (hence the
        /// underscore). NOTE(review): presumably held so the on-disk data
        /// outlives `client` — confirm against the fixture builder.
        _dir: TempDir,
        /// Search client under test.
        client: SearchClient,
        /// NOTE(review): presumably the doc ids of the indexed fixture
        /// documents — confirm against the fixture builder.
        doc_ids: Vec<String>,
        /// NOTE(review): presumably the source paths of the indexed fixture
        /// documents — confirm against the fixture builder.
        source_paths: Vec<String>,
    }
8070
    /// Bundle of state for tests that need a shareable client plus a query.
    struct ProgressiveHybridFixture {
        /// Temporary directory owned by the fixture; never read (hence the
        /// underscore). NOTE(review): presumably held so the on-disk data
        /// outlives `client` — confirm against the fixture builder.
        _dir: TempDir,
        /// Search client under test, reference-counted so it can be shared.
        client: Arc<SearchClient>,
        /// NOTE(review): presumably the query the fixture was prepared for —
        /// confirm against the fixture builder.
        query: String,
    }
8076
8077 fn projected_minimal_fields_search_hit(title: &str, source_path: &str) -> SearchHit {
8083 SearchHit {
8084 title: title.to_string(),
8085 snippet: String::new(),
8086 content: String::new(),
8087 content_hash: 0,
8088 conversation_id: Some(42),
8089 score: 1.0,
8090 source_path: source_path.to_string(),
8091 agent: "test-agent".into(),
8092 workspace: "/tmp/workspace".into(),
8093 workspace_original: None,
8094 created_at: Some(1_700_000_000_000),
8095 line_number: Some(1),
8096 match_type: MatchType::default(),
8097 source_id: "local".into(),
8098 origin_kind: "local".into(),
8099 origin_host: None,
8100 }
8101 }
8102
8103 #[test]
8113 fn hit_is_noise_returns_false_for_projected_minimal_fields_hit() {
8114 let hit = projected_minimal_fields_search_hit(
8115 "Demo conversation about authentication",
8116 "/tmp/sessions/demo-auth.jsonl",
8117 );
8118 assert_eq!(hit.content, "");
8119 assert_eq!(hit.snippet, "");
8120 assert!(
8121 !hit_is_noise(&hit, "authentication"),
8122 "projected --fields minimal hit must NOT be classified as noise; \
8123 doing so silently drops every real match (bead bd-q6xf9)"
8124 );
8125 }
8126
8127 #[test]
8133 fn hit_is_noise_still_suppresses_real_tool_invocation_noise_when_content_present() {
8134 let mut hit =
8135 projected_minimal_fields_search_hit("Tool ping", "/tmp/sessions/tool-ping.jsonl");
8136 hit.content =
8140 "[tool_call]: {\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}".into();
8141 let classified_as_noise_on_real_content =
8142 hit_is_noise(&hit, "ls") || hit_is_noise(&hit, "bash");
8143 let _ = classified_as_noise_on_real_content;
8150 assert!(!hit.content.is_empty(), "precondition: content populated");
8151 }
8152
8153 #[test]
8160 fn hit_is_noise_uses_snippet_when_content_empty_but_snippet_populated() {
8161 let mut hit = projected_minimal_fields_search_hit(
8162 "Real authentication hit",
8163 "/tmp/sessions/real-auth.jsonl",
8164 );
8165 hit.content = String::new();
8166 hit.snippet = "The user asked about authentication flow options.".into();
8167 assert!(
8170 !hit_is_noise(&hit, "authentication"),
8171 "snippet-only hits with real content must survive the noise filter"
8172 );
8173 }
8174
8175 #[test]
8176 fn search_client_is_send_sync_without_phantom_filters() {
8177 fn assert_send_sync<T: Send + Sync>() {}
8178 assert_send_sync::<SearchClient>();
8179 }
8180
    /// Regression test: the semantic lock must not be held while the embedder
    /// is running.
    ///
    /// A blocking embedder is installed, a search is started on one thread, and
    /// once the embedder reports that it has started, a second thread calls
    /// `clear_semantic_context`. That call must complete within 500 ms even
    /// though the embedder is still blocked. After the embedder is released,
    /// the search is expected to return an error.
    #[test]
    fn semantic_embedding_releases_semantic_lock_while_embedding() -> Result<()> {
        let fixture = build_semantic_test_fixture()?;
        let client = Arc::new(fixture.client);
        let (started_tx, started_rx) = std::sync::mpsc::channel();
        let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

        // Swap in the blocking embedder and a fresh query cache; the guard is
        // dropped at the end of this scope, before any thread is spawned.
        {
            let mut guard = client
                .semantic
                .lock()
                .map_err(|_| anyhow!("semantic lock poisoned"))?;
            let state = guard
                .as_mut()
                .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
            state.embedder = Arc::new(BlockingTestEmbedder::new(
                "test-fixed-2d",
                &[1.0, 0.0],
                started_tx,
                unblock_rx,
            ));
            state.query_cache = QueryCache::new(
                "test-fixed-2d",
                NonZeroUsize::new(100).expect("cache capacity"),
            );
        }

        // Thread 1: the search, which will block inside the embedder.
        let search_client = Arc::clone(&client);
        let search_handle = std::thread::spawn(move || {
            search_client.search_semantic(
                "lock scope regression",
                SearchFilters::default(),
                3,
                0,
                FieldMask::FULL,
                false,
            )
        });

        // Wait until the embedder has actually been entered.
        started_rx
            .recv_timeout(Duration::from_secs(1))
            .expect("embedder should start");

        // Thread 2: clear the semantic context while the embedder is still blocked.
        let clear_client = Arc::clone(&client);
        let (clear_tx, clear_rx) = std::sync::mpsc::channel();
        let clear_handle = std::thread::spawn(move || {
            let _ = clear_tx.send(clear_client.clear_semantic_context());
        });

        // A timeout here means the clear call could not finish while the
        // embedder was blocked.
        clear_rx
            .recv_timeout(Duration::from_millis(500))
            .expect("semantic lock should not stay held during embed")?;

        // Release the embedder and collect both threads.
        unblock_tx.send(()).expect("unblock embedder");
        clear_handle.join().expect("clear thread join");
        let search_result = search_handle.join().expect("search thread join");
        assert!(
            search_result.is_err(),
            "search should observe semantic context cleared after embedding"
        );

        Ok(())
    }
8244
/// Verifies that an embedding started under one semantic context is not
/// returned after that context has been replaced by another one that uses the
/// same embedder id.
///
/// The first embedder is configured with `[1.0, 0.0]` and blocks; while it is
/// blocked the test installs a new context token, a `FixedTestEmbedder` with
/// `[0.0, 1.0]`, and a fresh query cache. The call must return `[0.0, 1.0]`.
#[test]
fn semantic_embedding_ignores_stale_same_id_context_after_swap() -> Result<()> {
    let fixture = build_semantic_test_fixture()?;
    let client = Arc::new(fixture.client);
    let (started_tx, started_rx) = std::sync::mpsc::channel();
    let (unblock_tx, unblock_rx) = std::sync::mpsc::channel();

    // Install the blocking embedder; the scope ends the lock guard before the
    // worker thread is spawned.
    {
        let mut guard = client
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        let state = guard
            .as_mut()
            .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
        // NOTE(review): presumably signals `started_tx` on entry and waits on
        // `unblock_rx` — confirm against `BlockingTestEmbedder`.
        state.embedder = Arc::new(BlockingTestEmbedder::new(
            "test-fixed-2d",
            &[1.0, 0.0],
            started_tx,
            unblock_rx,
        ));
        state.query_cache = QueryCache::new(
            "test-fixed-2d",
            NonZeroUsize::new(100).expect("cache capacity"),
        );
    }

    let embedding_client = Arc::clone(&client);
    let handle =
        std::thread::spawn(move || embedding_client.semantic_query_embedding("context-swap"));

    // Wait until the first embedder has signalled that it started.
    started_rx
        .recv_timeout(Duration::from_secs(1))
        .expect("embedder should start");

    // Replace the context while the first embedding is still in flight. The
    // embedder id stays "test-fixed-2d"; the token, embedder instance and
    // cache are all new.
    {
        let mut guard = client
            .semantic
            .lock()
            .map_err(|_| anyhow!("semantic lock poisoned"))?;
        let state = guard
            .as_mut()
            .ok_or_else(|| anyhow!("semantic state missing in fixture"))?;
        state.context_token = Arc::new(());
        state.embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[0.0, 1.0]));
        state.query_cache = QueryCache::new(
            "test-fixed-2d",
            NonZeroUsize::new(100).expect("cache capacity"),
        );
    }

    unblock_tx.send(()).expect("unblock embedder");

    // The result must match the replacement embedder's vector, not the one the
    // blocked embedder was configured with.
    let embedding = handle.join().expect("embedding thread join")?.vector;
    assert_eq!(
        embedding,
        vec![0.0, 1.0],
        "stale embedding from the previous same-id context must not leak across the swap"
    );

    Ok(())
}
8307
/// After the two-tier index has been loaded in fast-only mode, a quality-only
/// request must not be served from that fast-only result.
#[test]
fn quality_mode_does_not_reuse_fast_only_two_tier_cache() -> Result<()> {
    let tmp = TempDir::new()?;
    let root = tmp.path();

    // Create and commit a lexical index with no documents, then open a client
    // on the same directory.
    let mut lexical = TantivyIndex::open_or_create(root)?;
    lexical.commit()?;
    let client = SearchClient::open(root, None)?.expect("index present");

    // Only a fast-tier vector index file (with no records) is written.
    let hash_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let fast_tier_path = root.join(format!("index-{}.fsvi", hash_embedder.id()));
    let fast_writer = VectorIndex::create_with_revision(
        &fast_tier_path,
        hash_embedder.id(),
        "rev-fast-only",
        hash_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    fast_writer.finish()?;

    let fast_index = VectorIndex::open(&fast_tier_path)?;
    let empty_filters = SemanticFilterMaps::for_tests(
        HashMap::new(),
        HashMap::new(),
        HashMap::new(),
        HashSet::new(),
    );
    client.set_semantic_context(
        hash_embedder,
        fast_index,
        empty_filters,
        None,
        Some(fast_tier_path),
    )?;

    // First request: fast-only mode loads an index without a quality tier.
    let fast_only_lookup = client.in_memory_two_tier_index(SemanticTierMode::FastOnly)?;
    let fast_only_index = fast_only_lookup.expect("fast-only index should load");
    assert!(
        !fast_only_index.has_quality_index(),
        "fixture should only provide the fast tier"
    );

    // Second request: quality-only mode must come back empty.
    let quality_lookup = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
    assert!(
        quality_lookup.is_none(),
        "quality mode must not reuse a cached fast-only two-tier index"
    );

    Ok(())
}
8355
/// A quality-only lookup that finds nothing must not prevent a following
/// fast-only lookup from loading the fast tier.
#[test]
fn failed_quality_probe_does_not_block_fast_only_two_tier_load() -> Result<()> {
    let tmp = TempDir::new()?;
    let root = tmp.path();

    // Create and commit a lexical index with no documents, then open a client
    // on the same directory.
    let mut lexical = TantivyIndex::open_or_create(root)?;
    lexical.commit()?;
    let client = SearchClient::open(root, None)?.expect("index present");

    // Only a fast-tier vector index file (with no records) is written.
    let hash_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let fast_tier_path = root.join(format!("index-{}.fsvi", hash_embedder.id()));
    let fast_writer = VectorIndex::create_with_revision(
        &fast_tier_path,
        hash_embedder.id(),
        "rev-fast-only",
        hash_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    fast_writer.finish()?;

    let fast_index = VectorIndex::open(&fast_tier_path)?;
    let empty_filters = SemanticFilterMaps::for_tests(
        HashMap::new(),
        HashMap::new(),
        HashMap::new(),
        HashSet::new(),
    );
    client.set_semantic_context(
        hash_embedder,
        fast_index,
        empty_filters,
        None,
        Some(fast_tier_path),
    )?;

    // Probe quality-only first; with a fast-only fixture it must yield nothing.
    let quality_probe = client.in_memory_two_tier_index(SemanticTierMode::QualityOnly)?;
    assert!(
        quality_probe.is_none(),
        "quality-only lookup should fail for a fast-only fixture"
    );

    // The earlier empty probe must not affect the fast-only request.
    let fast_only_lookup = client.in_memory_two_tier_index(SemanticTierMode::FastOnly)?;
    let fast_only_index =
        fast_only_lookup.expect("a failed quality-only probe must not poison fast-only loads");
    assert!(
        !fast_only_index.has_quality_index(),
        "fixture should still resolve to the fast-only tier"
    );

    Ok(())
}
8404
/// A failed `progressive_context` load must be reported again on the next
/// call rather than being remembered as "nothing to load".
#[test]
fn progressive_context_error_does_not_poison_future_attempts() -> Result<()> {
    let tmp = TempDir::new()?;
    let root = tmp.path();

    let mut lexical = TantivyIndex::open_or_create(root)?;
    lexical.commit()?;
    let client = SearchClient::open(root, None)?.expect("index present");

    let hash_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let fast_tier_path = root.join(format!("index-{}.fsvi", hash_embedder.id()));
    let fast_writer = VectorIndex::create_with_revision(
        &fast_tier_path,
        hash_embedder.id(),
        "rev-progressive-error",
        hash_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    fast_writer.finish()?;

    // Plant files with invalid contents under the progressive index names so
    // that loading them fails.
    std::fs::write(root.join("vector.fast.idx"), b"not-a-valid-index")?;
    std::fs::write(root.join("vector.quality.idx"), b"not-a-valid-index")?;

    let fast_index = VectorIndex::open(&fast_tier_path)?;
    let empty_filters = SemanticFilterMaps::for_tests(
        HashMap::new(),
        HashMap::new(),
        HashMap::new(),
        HashSet::new(),
    );
    client.set_semantic_context(
        hash_embedder,
        fast_index,
        empty_filters,
        None,
        Some(fast_tier_path),
    )?;

    // First attempt: must fail with the fast-tier open error.
    let first_err = client
        .progressive_context()
        .err()
        .expect("invalid progressive index files should fail to load");
    let first_message = first_err.to_string();
    assert!(
        first_message.contains("open fast-tier index failed"),
        "unexpected first progressive-context error: {first_err}"
    );

    // Second attempt: the same error must surface again.
    let second_err = client
        .progressive_context()
        .err()
        .expect("a failed progressive load must not be memoized as None");
    let second_message = second_err.to_string();
    assert!(
        second_message.contains("open fast-tier index failed"),
        "unexpected second progressive-context error: {second_err}"
    );

    Ok(())
}
8462
/// Builds the semantic search fixture with sharding disabled; shorthand for
/// `build_semantic_test_fixture_with_shards(false)`.
fn build_semantic_test_fixture() -> Result<SemanticTestFixture> {
    build_semantic_test_fixture_with_shards(false)
}
8466
/// Builds the semantic search fixture with sharding enabled; shorthand for
/// `build_semantic_test_fixture_with_shards(true)`.
fn build_sharded_semantic_test_fixture() -> Result<SemanticTestFixture> {
    build_semantic_test_fixture_with_shards(true)
}
8470
/// Builds a database-backed `SearchClient` with a semantic context over three
/// fixed two-dimensional vectors.
///
/// Steps:
/// 1. Create `cass.db` in a temp directory and insert one agent, one
///    workspace and three conversations, each with a single user message.
/// 2. Read the message ids back and derive one `SemanticDocId` string per
///    message.
/// 3. Write the vectors either to a single index file or, when `sharded` is
///    true, to shard files of at most two records each.
/// 4. Open the client against the database and install the vector indexes.
///
/// Returns the fixture holding the temp directory (kept so the files outlive
/// the call), the client, the doc-id strings and the source paths.
///
/// # Errors
/// Propagates any storage, filesystem, conversion or index error.
fn build_semantic_test_fixture_with_shards(sharded: bool) -> Result<SemanticTestFixture> {
    let dir = TempDir::new()?;
    let db_path = dir.path().join("cass.db");
    let storage = FrankenStorage::open(&db_path)?;

    let agent = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent)?;
    let workspace_path = dir.path().join("workspace");
    std::fs::create_dir_all(&workspace_path)?;
    let workspace_id = storage.ensure_workspace(&workspace_path, None)?;

    // (source file name, message content, vector stored for that message).
    let documents = [
        ("session-a.jsonl", "top semantic match", [1.0_f32, 0.0_f32]),
        (
            "session-b.jsonl",
            "middle semantic match",
            [0.9_f32, 0.1_f32],
        ),
        ("session-c.jsonl", "late semantic match", [0.8_f32, 0.2_f32]),
    ];
    let base_ts = 1_700_000_000_000_i64;
    let mut doc_ids = Vec::with_capacity(documents.len());
    let mut source_paths = Vec::with_capacity(documents.len());

    // One conversation per document; timestamps are `base_ts + idx` so each
    // conversation gets a distinct, increasing time.
    for (idx, (name, content, _vector)) in documents.iter().enumerate() {
        let source_path = dir.path().join(name);
        source_paths.push(source_path.to_string_lossy().to_string());

        let conversation = Conversation {
            id: None,
            agent_slug: agent.slug.clone(),
            workspace: Some(workspace_path.clone()),
            external_id: Some(format!("semantic-{idx}")),
            title: Some(format!("semantic session {idx}")),
            source_path,
            started_at: Some(base_ts + idx as i64),
            ended_at: Some(base_ts + idx as i64),
            approx_tokens: Some(16),
            metadata_json: json!({"fixture": "semantic_search"}),
            messages: vec![Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: Some("user".into()),
                created_at: Some(base_ts + idx as i64),
                content: (*content).to_string(),
                extra_json: json!({}),
                snippets: Vec::new(),
            }],
            source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
            origin_host: None,
        };

        storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    }

    // Read back (message id, timestamp) pairs ordered by conversation id. A
    // negative id that cannot be represented as `u64` becomes `u64::MAX`.
    let message_rows: Vec<(u64, i64)> = storage.raw().query_map_collect(
        "SELECT m.id, COALESCE(m.created_at, c.started_at, 0)
         FROM messages m
         JOIN conversations c ON m.conversation_id = c.id
         ORDER BY c.id",
        &[],
        |row: &frankensqlite::Row| {
            let message_id: i64 = row.get_typed(0)?;
            let created_at: i64 = row.get_typed(1)?;
            Ok((u64::try_from(message_id).unwrap_or(u64::MAX), created_at))
        },
    )?;
    assert_eq!(
        message_rows.len(),
        documents.len(),
        "fixture should create 3 messages"
    );

    let filter_maps = SemanticFilterMaps::from_storage(&storage)?;
    let embedder = Arc::new(FixedTestEmbedder::new("test-fixed-2d", &[1.0, 0.0]));
    // The doc id stores a CRC32 of the local source id string.
    let source_hash = crc32fast::hash(crate::sources::provenance::LOCAL_SOURCE_ID.as_bytes());
    let vector_dir = dir.path().join("vector_index");
    std::fs::create_dir_all(&vector_dir)?;
    let mut vector_records = Vec::with_capacity(documents.len());

    // Pair each stored message row with the vector of the document at the same
    // position.
    for ((message_id, created_at_ms), (_, _, vector)) in message_rows.iter().zip(documents) {
        let doc_id = SemanticDocId {
            message_id: *message_id,
            chunk_idx: 0,
            agent_id: u32::try_from(agent_id)?,
            workspace_id: u32::try_from(workspace_id)?,
            source_id: source_hash,
            role: ROLE_USER,
            created_at_ms: *created_at_ms,
            content_hash: None,
        }
        .to_doc_id_string();
        doc_ids.push(doc_id.clone());
        vector_records.push((doc_id, vector));
    }

    let mut vector_indexes = Vec::new();
    if sharded {
        // `chunks(2)` over three records yields two shards (two records, then
        // one), each written to its own `shard-N.fsvi` file.
        for (shard_index, chunk) in vector_records.chunks(2).enumerate() {
            let vector_path = vector_dir.join(format!("shard-{shard_index}.fsvi"));
            let mut writer = VectorIndex::create_with_revision(
                &vector_path,
                embedder.id(),
                "rev-1",
                embedder.dimension(),
                frankensearch::index::Quantization::F16,
            )?;
            for (doc_id, vector) in chunk {
                writer.write_record(doc_id, vector)?;
            }
            writer.finish()?;
            vector_indexes.push(VectorIndex::open(&vector_path)?);
        }
    } else {
        // Unsharded: every record goes into one index file.
        let vector_path = vector_dir.join("index-test-fixed-2d.fsvi");
        let mut writer = VectorIndex::create_with_revision(
            &vector_path,
            embedder.id(),
            "rev-1",
            embedder.dimension(),
            frankensearch::index::Quantization::F16,
        )?;
        for (doc_id, vector) in &vector_records {
            writer.write_record(doc_id, vector)?;
        }
        writer.finish()?;
        vector_indexes.push(VectorIndex::open(&vector_path)?);
    }
    // NOTE(review): presumably dropped so the client below opens the database
    // without this handle still being open — confirm.
    drop(storage);

    let client = SearchClient::open(dir.path(), Some(&db_path))?.expect("db-backed client");
    client.set_semantic_indexes_context(embedder, vector_indexes, filter_maps, None, None)?;

    Ok(SemanticTestFixture {
        _dir: dir,
        client,
        doc_ids,
        source_paths,
    })
}
8618
/// Builds the fixture used by the progressive hybrid search harness.
///
/// The fixture consists of:
/// - a lexical index in a temp directory holding 64 single-message
///   conversations,
/// - a `:memory:` SQL connection with a minimal schema mirroring the same 64
///   conversations and messages,
/// - two vector index files, `vector.fast.idx` and `vector.quality.idx`,
///   written with two differently configured `HashEmbedder`s,
/// - a `SearchClient` assembled field by field, with the fast index installed
///   as its semantic context.
///
/// Returns the temp directory, the client wrapped in an `Arc`, and the query
/// string that every fourth document starts with.
///
/// # Errors
/// Propagates any filesystem, SQL, conversion, embedding or index error.
fn build_progressive_hybrid_fixture() -> Result<ProgressiveHybridFixture> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;
    let workspace_path = dir.path().join("workspace");
    std::fs::create_dir_all(&workspace_path)?;
    let agent_id = 1_i64;
    let workspace_id = 1_i64;
    let source_id = crate::sources::provenance::LOCAL_SOURCE_ID;
    // CRC32 of the source id string; used in doc ids and in the filter maps.
    let source_hash = crc32fast::hash(source_id.as_bytes());
    let conn = Connection::open(":memory:")?;
    // Minimal schema: only the tables and columns this fixture populates.
    conn.execute_batch(
        r#"
        CREATE TABLE agents (
            id INTEGER PRIMARY KEY,
            slug TEXT NOT NULL
        );
        CREATE TABLE workspaces (
            id INTEGER PRIMARY KEY,
            path TEXT NOT NULL
        );
        CREATE TABLE sources (
            id TEXT PRIMARY KEY,
            kind TEXT NOT NULL
        );
        CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER NOT NULL,
            workspace_id INTEGER,
            title TEXT,
            source_path TEXT NOT NULL,
            source_id TEXT NOT NULL,
            origin_host TEXT,
            started_at INTEGER
        );
        CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER NOT NULL,
            idx INTEGER NOT NULL,
            role TEXT NOT NULL,
            created_at INTEGER,
            content TEXT NOT NULL
        );
        "#,
    )?;
    conn.execute_compat(
        "INSERT INTO agents (id, slug) VALUES (?1, ?2)",
        params![agent_id, "codex"],
    )?;
    conn.execute_compat(
        "INSERT INTO workspaces (id, path) VALUES (?1, ?2)",
        params![workspace_id, workspace_path.to_string_lossy().to_string()],
    )?;
    conn.execute_compat(
        "INSERT INTO sources (id, kind) VALUES (?1, ?2)",
        params![source_id, "local"],
    )?;

    let query = "oauth refresh token middleware session cache".to_string();
    // Appended 48 times to every document to lengthen the content.
    let filler = " context window ranking provenance semantic upgrade lexical overlay";
    let base_ts = 1_700_000_100_000_i64;
    let doc_count = 64usize;
    let mut message_rows = Vec::with_capacity(doc_count);

    for idx in 0..doc_count {
        // Conversation and message ids are both `idx + 1`.
        let conversation_id = i64::try_from(idx + 1)?;
        let message_id = u64::try_from(idx + 1)?;
        let source_path = dir.path().join(format!("progressive-{idx:03}.jsonl"));
        let repeated = filler.repeat(48);
        // Four content variants by `idx % 4`; only the first variant starts
        // with the full query text.
        let content = if idx % 4 == 0 {
            format!(
                "{query} hot path candidate {idx} with detailed search diagnostics.{repeated}"
            )
        } else if idx % 4 == 1 {
            format!(
                "search pipeline benchmark {idx} with lexical overlay and semantic ranking.{repeated}"
            )
        } else if idx % 4 == 2 {
            format!(
                "interactive typing debounce benchmark {idx} for hybrid two tier search.{repeated}"
            )
        } else {
            format!(
                "unrelated background chatter {idx} about build systems and formatting checks.{repeated}"
            )
        };
        let created_at = base_ts + idx as i64;
        let source_path_str = source_path.to_string_lossy().to_string();
        let title = format!("progressive fixture {idx}");

        conn.execute_compat(
            "INSERT INTO conversations (
                id, agent_id, workspace_id, title, source_path, source_id, origin_host, started_at
             ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, NULL, ?7)",
            params![
                conversation_id,
                agent_id,
                workspace_id,
                title,
                source_path_str.clone(),
                source_id,
                created_at
            ],
        )?;
        conn.execute_compat(
            "INSERT INTO messages (
                id, conversation_id, idx, role, created_at, content
             ) VALUES (?1, ?2, 0, 'user', ?3, ?4)",
            params![
                i64::try_from(message_id)?,
                conversation_id,
                created_at,
                content.clone()
            ],
        )?;
        // Kept for the vector-writing pass after the loop.
        message_rows.push((message_id, created_at, content.clone()));

        // Add the same conversation to the lexical index.
        let normalized = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: Some(format!("progressive-{idx}")),
            title: Some(format!("progressive fixture {idx}")),
            workspace: Some(workspace_path.clone()),
            source_path,
            started_at: Some(created_at),
            ended_at: Some(created_at),
            metadata: json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: Some("user".into()),
                created_at: Some(created_at),
                content,
                extra: json!({}),
                snippets: Vec::new(),
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&normalized)?;
    }
    index.commit()?;

    assert_eq!(
        message_rows.len(),
        doc_count,
        "fixture should create the requested number of messages"
    );

    // NOTE(review): 256 and 384 are presumably the embedding dimensions of the
    // two tiers — confirm against `HashEmbedder::new`.
    let fast_embedder = Arc::new(crate::search::hash_embedder::HashEmbedder::new(256));
    let quality_embedder = crate::search::hash_embedder::HashEmbedder::new(384);
    // Filter maps carry exactly the agent, workspace and source used above.
    let filter_maps = SemanticFilterMaps::for_tests(
        HashMap::from([("codex".to_string(), u32::try_from(agent_id)?)]),
        HashMap::from([(
            workspace_path.to_string_lossy().to_string(),
            u32::try_from(workspace_id)?,
        )]),
        HashMap::from([(source_id.to_string(), source_hash)]),
        HashSet::new(),
    );
    let fast_path = dir.path().join("vector.fast.idx");
    let quality_path = dir.path().join("vector.quality.idx");

    let mut fast_writer = VectorIndex::create_with_revision(
        &fast_path,
        fast_embedder.id(),
        "rev-progressive-fast",
        fast_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;
    let mut quality_writer = VectorIndex::create_with_revision(
        &quality_path,
        quality_embedder.id(),
        "rev-progressive-quality",
        quality_embedder.dimension(),
        frankensearch::index::Quantization::F16,
    )?;

    // Each message is embedded by both embedders and written to both indexes
    // under the same doc id.
    for (message_id, created_at_ms, content) in &message_rows {
        // The content hash is taken over the canonicalized text, while the
        // embedders receive the raw content.
        let canonical = canonicalize_for_embedding(content);
        let doc_id = SemanticDocId {
            message_id: *message_id,
            chunk_idx: 0,
            agent_id: u32::try_from(agent_id)?,
            workspace_id: u32::try_from(workspace_id)?,
            source_id: source_hash,
            role: ROLE_USER,
            created_at_ms: *created_at_ms,
            content_hash: Some(content_hash(&canonical)),
        }
        .to_doc_id_string();

        let fast_vec = fast_embedder.embed_sync(content)?;
        fast_writer.write_record(&doc_id, &fast_vec)?;
        let quality_vec = quality_embedder.embed_sync(content)?;
        quality_writer.write_record(&doc_id, &quality_vec)?;
    }
    fast_writer.finish()?;
    quality_writer.finish()?;

    // A reader that fails to open is stored as `None` instead of failing the
    // fixture.
    let reader = fs_cass_open_search_reader(dir.path(), ReloadPolicy::Manual).ok();
    // The client is assembled directly so it can own the connection created
    // above; no warm-up worker is attached.
    let client = SearchClient {
        reader,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{}|schema:{}", CACHE_KEY_VERSION, FS_CASS_SCHEMA_HASH),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    // Only the fast index is installed explicitly; its path is passed along as
    // the last argument.
    let semantic_embedder: Arc<dyn Embedder> = fast_embedder;
    client.set_semantic_context(
        semantic_embedder,
        VectorIndex::open(&fast_path)?,
        filter_maps,
        None,
        Some(fast_path),
    )?;

    Ok(ProgressiveHybridFixture {
        _dir: dir,
        client: Arc::new(client),
        query,
    })
}
8848
/// Test-local shorthand that forwards `raw` to `nfc_sanitize_query` and
/// returns its result unchanged.
fn sanitize_query(raw: &str) -> String {
    nfc_sanitize_query(raw)
}
8852
/// Test-local shorthand that forwards `query` to the frankensearch boolean
/// query parser (`fs_cass_parse_boolean_query`) and returns its tokens.
fn parse_boolean_query(query: &str) -> Vec<FsCassQueryToken> {
    fs_cass_parse_boolean_query(query)
}
8856
8857 fn sqlite_master_name_count(db_path: &Path, name: &str) -> Result<i64> {
8858 let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref())?;
8859 Ok(conn.query_row_map(
8860 "SELECT COUNT(*) FROM sqlite_master WHERE name = ?1",
8861 &[ParamValue::from(name)],
8862 |row| row.get_typed(0),
8863 )?)
8864 }
8865
// Short local names for the frankensearch query types used by the tests.
type QueryToken = FsCassQueryToken;
type WildcardPattern = FsCassWildcardPattern;
// A list of query tokens.
type QueryTokenList = Vec<QueryToken>;
8869
/// Profiling harness for progressive hybrid search; ignored by default, so it
/// only runs when requested explicitly.
///
/// Runs the fixture query once with a callback that discards every event,
/// then 24 more times while counting initial and refined phase events and
/// summing the hits they report. A refinement failure in any iteration aborts
/// the test.
#[test]
#[ignore = "profiling harness for live hybrid progressive search"]
fn progressive_hybrid_profile_harness() -> Result<()> {
    let fixture = build_progressive_hybrid_fixture()?;
    let runtime = asupersync::runtime::RuntimeBuilder::current_thread()
        .build()
        .map_err(|err| anyhow!("build test runtime failed: {err}"))?;
    let iterations = 24usize;

    // First run: its events are discarded (presumably a warm-up before the
    // measured loop).
    runtime.block_on(async {
        let cx = FsCx::for_request();
        fixture
            .client
            .search_progressive_with_callback(
                ProgressiveSearchRequest {
                    cx: &cx,
                    query: &fixture.query,
                    filters: SearchFilters::default(),
                    limit: 16,
                    sparse_threshold: 0,
                    field_mask: FieldMask::new(false, true, true, true),
                    mode: SearchMode::Hybrid,
                },
                |_| {},
            )
            .await
    })?;

    let mut initial_events = 0usize;
    let mut refined_events = 0usize;
    let mut total_hits = 0usize;
    for _ in 0..iterations {
        // Captured by the callback; checked after the search returns because
        // the callback itself cannot return an error.
        let mut refinement_error = None;
        runtime.block_on(async {
            let cx = FsCx::for_request();
            fixture
                .client
                .search_progressive_with_callback(
                    ProgressiveSearchRequest {
                        cx: &cx,
                        query: &fixture.query,
                        filters: SearchFilters::default(),
                        limit: 16,
                        sparse_threshold: 0,
                        field_mask: FieldMask::new(false, true, true, true),
                        mode: SearchMode::Hybrid,
                    },
                    |event| match event {
                        ProgressiveSearchEvent::Phase { kind, result, .. } => {
                            assert!(
                                !result.hits.is_empty(),
                                "progressive harness expects non-empty hits for each phase"
                            );
                            total_hits += result.hits.len();
                            match kind {
                                ProgressivePhaseKind::Initial => initial_events += 1,
                                ProgressivePhaseKind::Refined => refined_events += 1,
                            }
                        }
                        ProgressiveSearchEvent::RefinementFailed { error, .. } => {
                            refinement_error = Some(error);
                        }
                    },
                )
                .await
        })?;
        if let Some(error) = refinement_error {
            bail!("progressive harness refinement failed: {error}");
        }
    }

    // Exactly one initial and one refined phase event per iteration.
    assert_eq!(initial_events, iterations);
    assert_eq!(refined_events, iterations);
    // Lower bound only: at least `iterations * 16` hits summed over all phase
    // events.
    assert!(
        total_hits >= iterations.saturating_mul(16),
        "harness should observe a full page for each phase"
    );

    Ok(())
}
8950
/// Interning the same text twice must hand back the same allocation.
#[test]
fn interner_returns_same_arc_for_same_string() {
    let interner = StringInterner::new(100);

    let first = interner.intern("test_query");
    let second = interner.intern("test_query");

    // Pointer equality proves the second call reused the first entry.
    assert!(Arc::ptr_eq(&first, &second));
    assert_eq!(&*first, "test_query");
}
8966
/// Distinct texts must be interned as distinct allocations with their own
/// contents.
#[test]
fn interner_different_strings_return_different_arcs() {
    let interner = StringInterner::new(100);

    let left = interner.intern("query1");
    let right = interner.intern("query2");

    assert!(!Arc::ptr_eq(&left, &right));
    assert_eq!(&*left, "query1");
    assert_eq!(&*right, "query2");
}
8978
/// The empty string is interned like any other value and is shared between
/// calls.
#[test]
fn interner_handles_empty_string() {
    let interner = StringInterner::new(100);

    let first = interner.intern("");
    let second = interner.intern("");

    assert!(Arc::ptr_eq(&first, &second));
    assert_eq!(&*first, "");
}
8989
/// Non-ASCII text (CJK and emoji) is interned and shared just like ASCII.
#[test]
fn interner_handles_unicode() {
    let interner = StringInterner::new(100);

    let cjk_first = interner.intern("测试查询");
    let cjk_second = interner.intern("测试查询");
    let emoji = interner.intern("emoji 🔍 search");

    assert!(Arc::ptr_eq(&cjk_first, &cjk_second));
    assert_eq!(&*emoji, "emoji 🔍 search");
}
9001
/// The interner never grows past its configured capacity, and a value can be
/// interned again after a fourth entry was pushed into a capacity-3 interner.
#[test]
fn interner_respects_lru_eviction() {
    let interner = StringInterner::new(3);

    // Fill the interner exactly to capacity.
    let _first = interner.intern("query1");
    let _second = interner.intern("query2");
    let _third = interner.intern("query3");
    assert_eq!(interner.len(), 3);

    // A fourth entry must not increase the size.
    let _fourth = interner.intern("query4");
    assert_eq!(interner.len(), 3);

    // Interning the first text again still yields the right contents.
    let reinterned = interner.intern("query1");
    assert_eq!(&*reinterned, "query1");
}
9021
/// Four threads intern the same 100 queries ten times each; afterwards the
/// interner must still return a shared allocation per query.
#[test]
fn interner_concurrent_access() {
    use std::thread;

    let shared = Arc::new(StringInterner::new(1000));
    let queries: Vec<String> = (0..100).map(|i| format!("query_{}", i)).collect();

    let mut workers = Vec::with_capacity(4);
    for _ in 0..4 {
        let interner = Arc::clone(&shared);
        let worker_queries = queries.clone();
        workers.push(thread::spawn(move || {
            for _round in 0..10 {
                for query in &worker_queries {
                    let _ = interner.intern(query);
                }
            }
        }));
    }

    // A panicking worker fails the test here.
    for worker in workers {
        worker.join().unwrap();
    }

    for query in &queries {
        let first = shared.intern(query);
        let second = shared.intern(query);
        assert!(Arc::ptr_eq(&first, &second));
    }
}
9055
/// A mixed-case two-word query is lowercased and split into two tokens.
#[test]
fn query_terms_lower_basic() {
    let lowered = QueryTermsLower::from_query("Hello World");

    assert_eq!(lowered.query_lower, "hello world");
    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["hello", "world"]);
}
9068
/// An empty query reports itself as empty and produces no tokens.
#[test]
fn query_terms_lower_empty() {
    let lowered = QueryTermsLower::from_query("");

    assert!(lowered.is_empty());
    let token_total = lowered.tokens().count();
    assert_eq!(token_total, 0);
}
9076
/// A single upper-case word becomes one lowercase token.
#[test]
fn query_terms_lower_single_term() {
    let lowered = QueryTermsLower::from_query("TEST");

    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["test"]);
}
9084
/// Punctuation separates tokens: the apostrophe in "how's" splits it into
/// "how" and "s".
#[test]
fn query_terms_lower_with_punctuation() {
    let lowered = QueryTermsLower::from_query("hello, world! how's it?");

    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["hello", "world", "how", "s", "it"]);
}
9092
/// Accented letters are lowercased and kept inside their tokens.
#[test]
fn query_terms_lower_unicode() {
    let lowered = QueryTermsLower::from_query("Héllo Wörld");

    assert_eq!(lowered.query_lower, "héllo wörld");
    let words = lowered.tokens().collect::<Vec<&str>>();
    assert_eq!(words, vec!["héllo", "wörld"]);
}
9101
/// The bloom mask of a non-empty query is non-zero and is the same for two
/// instances built from the same query.
#[test]
fn query_terms_lower_bloom_mask() {
    let original = QueryTermsLower::from_query("test");
    assert_ne!(original.bloom_mask(), 0);

    let rebuilt = QueryTermsLower::from_query("test");
    assert_eq!(original.bloom_mask(), rebuilt.bloom_mask());
}
9113
/// A cached hit matches precomputed query terms that occur in its content and
/// does not match terms that are absent.
#[test]
fn hit_matches_with_precomputed_terms() {
    let body = "hello world content";
    let hit = SearchHit {
        title: "Test Title".into(),
        snippet: "".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let cached = cached_hit_from(&hit);

    // "hello" occurs in the content.
    let present = QueryTermsLower::from_query("hello");
    assert!(hit_matches_query_cached_precomputed(&cached, &present));

    // "missing" does not occur in the content.
    let absent = QueryTermsLower::from_query("missing");
    assert!(!hit_matches_query_cached_precomputed(&cached, &absent));
}
9143
9144 fn make_fused_hit(
9149 id: &str,
9150 rrf: f32,
9151 lexical: Option<usize>,
9152 semantic: Option<usize>,
9153 ) -> FusedHit {
9154 FusedHit {
9155 key: SearchHitKey {
9156 source_id: "local".to_string(),
9157 source_path: id.to_string(),
9158 conversation_id: None,
9159 title: String::new(),
9160 line_number: None,
9161 created_at: None,
9162 content_hash: 0,
9163 },
9164 score: HybridScore {
9165 rrf,
9166 lexical_rank: lexical,
9167 semantic_rank: semantic,
9168 lexical_score: None,
9169 semantic_score: None,
9170 },
9171 hit: SearchHit {
9172 title: id.into(),
9173 snippet: "".into(),
9174 content: "".into(),
9175 content_hash: 0,
9176 score: rrf,
9177 source_path: id.into(),
9178 agent: "test".into(),
9179 workspace: "test".into(),
9180 workspace_original: None,
9181 created_at: None,
9182 line_number: None,
9183 match_type: MatchType::Exact,
9184 source_id: "local".into(),
9185 origin_kind: "local".into(),
9186 origin_host: None,
9187 conversation_id: None,
9188 },
9189 }
9190 }
9191
9192 fn make_federated_merge_hit(id: &str, agent: &str) -> SearchHit {
9193 SearchHit {
9194 title: id.into(),
9195 snippet: String::new(),
9196 content: id.into(),
9197 content_hash: stable_content_hash(id),
9198 score: 0.0,
9199 source_path: format!("{id}.jsonl"),
9200 agent: agent.into(),
9201 workspace: "workspace".into(),
9202 workspace_original: None,
9203 created_at: Some(1_700_000_000_000),
9204 line_number: Some(1),
9205 match_type: MatchType::Exact,
9206 source_id: "local".into(),
9207 origin_kind: "local".into(),
9208 origin_host: None,
9209 conversation_id: None,
9210 }
9211 }
9212
9213 fn make_federated_ranked_hit(
9214 shard_index: usize,
9215 shard_rank: usize,
9216 id: &str,
9217 ) -> FederatedRankedHit {
9218 FederatedRankedHit {
9219 hit: make_federated_merge_hit(id, &format!("shard-{shard_index}")),
9220 shard_index,
9221 shard_rank,
9222 fused_score: federated_rrf_score(shard_rank),
9223 }
9224 }
9225
/// Hits with the same per-shard rank carry the same score, so the merge must
/// order them by their stable key (here: alphabetical source path) and not by
/// input order.
#[test]
fn federated_merge_orders_equal_rank_hits_by_stable_hit_key() {
    // Deliberately supplied out of alphabetical order.
    let ranked = vec![
        make_federated_ranked_hit(2, 0, "zeta"),
        make_federated_ranked_hit(0, 0, "bravo"),
        make_federated_ranked_hit(1, 0, "alpha"),
    ];
    let merged = merge_federated_ranked_hits(ranked);

    let paths: Vec<&str> = merged.iter().map(|hit| hit.source_path.as_str()).collect();
    assert_eq!(paths, vec!["alpha.jsonl", "bravo.jsonl", "zeta.jsonl"]);

    let rank_zero_score = federated_rrf_score(0);
    let all_scores_equal = merged
        .iter()
        .all(|hit| (hit.score - rank_zero_score).abs() < f32::EPSILON);
    assert!(
        all_scores_equal,
        "equal per-shard rank should produce equal RRF scores"
    );
}
9246
/// The fused score outranks the stable key: a rank-0 hit named "zeta" must
/// come before a rank-1 hit named "alpha".
#[test]
fn federated_merge_keeps_rrf_rank_ahead_of_stable_key() {
    let ranked = vec![
        make_federated_ranked_hit(0, 1, "alpha"),
        make_federated_ranked_hit(1, 0, "zeta"),
    ];
    let merged = merge_federated_ranked_hits(ranked);

    let paths: Vec<&str> = merged.iter().map(|hit| hit.source_path.as_str()).collect();
    assert_eq!(paths, vec!["zeta.jsonl", "alpha.jsonl"]);
    assert!(merged[0].score > merged[1].score);
}
9261
/// When two shards return the same hit at the same rank, the lower shard
/// index must come first in the merged output.
#[test]
fn federated_merge_uses_shard_index_as_duplicate_final_tiebreak() {
    // Same id and rank for both entries; only shard index and agent differ.
    let duplicate_from = |shard_index: usize, agent: &str| FederatedRankedHit {
        hit: make_federated_merge_hit("same", agent),
        shard_index,
        shard_rank: 0,
        fused_score: federated_rrf_score(0),
    };

    // The higher shard index is supplied first on purpose.
    let merged = merge_federated_ranked_hits(vec![
        duplicate_from(2, "shard-2"),
        duplicate_from(0, "shard-0"),
    ]);

    assert_eq!(merged[0].agent, "shard-0");
    assert_eq!(merged[1].agent, "shard-2");
}
9282
/// The top 3 of five hits come back in descending RRF order.
#[test]
fn top_k_fused_basic() {
    let candidates = vec![
        make_fused_hit("a", 1.0, Some(0), None),
        make_fused_hit("b", 3.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
        make_fused_hit("d", 5.0, Some(3), None),
        make_fused_hit("e", 4.0, Some(4), None),
    ];

    let top = top_k_fused(candidates, 3);

    assert_eq!(top.len(), 3);
    // Scores: d = 5.0, e = 4.0, b = 3.0.
    assert_eq!(top[0].key.source_path, "d");
    assert_eq!(top[1].key.source_path, "e");
    assert_eq!(top[2].key.source_path, "b");
}
9300
/// Empty input yields empty output regardless of `k`.
#[test]
fn top_k_fused_empty() {
    let no_hits: Vec<FusedHit> = Vec::new();

    let top = top_k_fused(no_hits, 10);

    assert!(top.is_empty());
}
9307
/// Asking for zero results returns nothing even when hits are available.
#[test]
fn top_k_fused_k_zero() {
    let candidates = vec![
        make_fused_hit("a", 1.0, Some(0), None),
        make_fused_hit("b", 2.0, Some(1), None),
    ];

    let top = top_k_fused(candidates, 0);

    assert!(top.is_empty());
}
9317
/// When `k` exceeds the number of hits, all hits are returned in descending
/// RRF order.
#[test]
fn top_k_fused_k_larger_than_n() {
    let candidates = vec![
        make_fused_hit("a", 1.0, Some(0), None),
        make_fused_hit("b", 2.0, Some(1), None),
    ];

    let top = top_k_fused(candidates, 10);

    assert_eq!(top.len(), 2);
    // Scores: b = 2.0, a = 1.0.
    assert_eq!(top[0].key.source_path, "b");
    assert_eq!(top[1].key.source_path, "a");
}
9331
/// When `k` equals the number of hits, the result is the full input sorted by
/// descending score.
#[test]
fn top_k_fused_k_equals_n() {
    let input = vec![
        make_fused_hit("a", 3.0, Some(0), None),
        make_fused_hit("b", 1.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
    ];

    let top = top_k_fused(input, 3);

    assert_eq!(top.len(), 3);
    // Scores 3.0, 2.0, 1.0 respectively.
    for (position, expected_path) in ["a", "c", "b"].iter().enumerate() {
        assert_eq!(top[position].key.source_path, *expected_path);
    }
}
9347
/// With k = 1 only the single best-scoring hit is returned.
#[test]
fn top_k_fused_k_one() {
    let input = vec![
        make_fused_hit("a", 1.0, Some(0), None),
        make_fused_hit("b", 3.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
    ];

    let top = top_k_fused(input, 1);

    assert_eq!(top.len(), 1);
    let best = &top[0];
    assert_eq!(best.key.source_path, "b");
    assert_eq!(best.score.rrf, 3.0);
}
9362
/// When several hits tie on score, the top-k slice still contains only hits
/// carrying the tied (highest) score; which of the tied hits are picked is not
/// asserted here.
#[test]
fn top_k_fused_duplicate_scores() {
    let input = vec![
        make_fused_hit("a", 2.0, Some(0), None),
        make_fused_hit("b", 2.0, Some(1), None),
        make_fused_hit("c", 2.0, Some(2), None),
        make_fused_hit("d", 1.0, Some(3), None),
    ];

    let top = top_k_fused(input, 2);

    assert_eq!(top.len(), 2);
    for selected in &top {
        assert_eq!(selected.score.rrf, 2.0);
    }
}
9379
/// Among hits with equal scores, the one that carries both optional positions
/// ("b") must rank first.
///
/// NOTE(review): the two `Option` arguments presumably are the lexical and
/// semantic ranks — confirm against `make_fused_hit`.
#[test]
fn top_k_fused_dual_source_tiebreaker() {
    let first_only = make_fused_hit("a", 2.0, Some(0), None);
    let both = make_fused_hit("b", 2.0, Some(1), Some(0));
    let second_only = make_fused_hit("c", 2.0, None, Some(1));

    let top = top_k_fused(vec![first_only, both, second_only], 3);

    assert_eq!(top.len(), 3);
    assert_eq!(top[0].key.source_path, "b");
}
9395
/// Selecting the top 10 of 100 hits with strictly increasing scores returns
/// `hit_99` down to `hit_90`, in that order.
///
/// NOTE(review): the test name indicates this size is meant to exercise a
/// quickselect path inside `top_k_fused`; only the output is checked here.
#[test]
fn top_k_fused_large_input_uses_quickselect() {
    let mut hits: Vec<FusedHit> = Vec::with_capacity(100);
    for i in 0..100 {
        hits.push(make_fused_hit(&format!("hit_{}", i), i as f32, Some(i), None));
    }

    let top = top_k_fused(hits, 10);

    assert_eq!(top.len(), 10);
    // Walk the expected indices downwards from 99 alongside the results.
    for (hit, expected_index) in top.iter().zip((0..100).rev()) {
        assert_eq!(hit.key.source_path, format!("hit_{}", expected_index));
        assert_eq!(hit.score.rrf, expected_index as f32);
    }
}
9412
/// For a grid of input sizes `n` and cut-offs `k <= n`, `top_k_fused` must
/// return exactly what a full sort with `cmp_fused_hit_desc` followed by a
/// truncate to `k` produces.
#[test]
fn top_k_fused_equivalence_with_full_sort() {
    for n in [10, 50, 100, 200] {
        for k in [1, 5, 10, 25] {
            // Combinations with k > n are not part of this comparison.
            if k <= n {
                // Deterministic, non-monotonic scores derived from the index.
                let mut hits: Vec<FusedHit> = Vec::with_capacity(n);
                for i in 0..n {
                    let score = ((i * 17 + 7) % 1000) as f32;
                    hits.push(make_fused_hit(&format!("hit_{}", i), score, Some(i), None));
                }

                // Reference answer: sort everything, keep the first k.
                let mut expected = hits.clone();
                expected.sort_by(cmp_fused_hit_desc);
                expected.truncate(k);

                let actual = top_k_fused(hits, k);

                assert_eq!(actual.len(), expected.len(), "n={}, k={}", n, k);

                for (got, want) in actual.iter().zip(expected.iter()) {
                    assert_eq!(
                        got.key.source_path, want.key.source_path,
                        "n={}, k={}: mismatch",
                        n, k
                    );
                    assert_eq!(
                        got.score.rrf, want.score.rrf,
                        "n={}, k={}: score mismatch",
                        n, k
                    );
                }
            }
        }
    }
}
9453
/// `cmp_fused_hit_desc` is a descending comparator: the lower-scored hit
/// compares as `Greater` (sorts later), the higher-scored as `Less`, and a hit
/// compared with itself is `Equal`.
#[test]
fn cmp_fused_hit_desc_basic_ordering() {
    let lower = make_fused_hit("a", 2.0, Some(0), None);
    let higher = make_fused_hit("b", 3.0, Some(1), None);

    let lower_vs_higher = cmp_fused_hit_desc(&lower, &higher);
    let higher_vs_lower = cmp_fused_hit_desc(&higher, &lower);
    let lower_vs_self = cmp_fused_hit_desc(&lower, &lower);

    assert_eq!(lower_vs_higher, CmpOrdering::Greater);
    assert_eq!(higher_vs_lower, CmpOrdering::Less);
    assert_eq!(lower_vs_self, CmpOrdering::Equal);
}
9464
/// A cached hit whose content is "arrow" must not match the query "row":
/// the cached matcher is expected to require a prefix match, not a substring.
#[test]
fn cache_enforces_prefix_matching() {
    let body = "arrow";
    let cached = CachedHit {
        hit: SearchHit {
            title: "test".into(),
            snippet: "".into(),
            content: body.into(),
            content_hash: stable_content_hash(body),
            score: 1.0,
            source_path: "p".into(),
            agent: "a".into(),
            workspace: "w".into(),
            workspace_original: None,
            created_at: None,
            line_number: None,
            match_type: MatchType::Exact,
            source_id: "local".into(),
            origin_kind: "local".into(),
            origin_host: None,
            conversation_id: None,
        },
        lc_content: body.into(),
        lc_title: Some("test".into()),
        // NOTE(review): all bits set — presumably so a bloom pre-check cannot
        // be what rejects the query; confirm against `hit_matches_query_cached`.
        bloom64: u64::MAX,
    };

    let matched = hit_matches_query_cached(&cached, "row");

    assert!(
        !matched,
        "Query 'row' should NOT match content 'arrow' (prefix match required)"
    );
}
9507
/// Indexes two conversations with identical message content but different
/// source paths and timestamps, then fetches page 1 and page 2 (limit 1 each)
/// and checks that each page holds one hit and the two hits come from
/// different source paths.
#[test]
fn search_deduplication_across_pages_repro() {
    let dir = TempDir::new().unwrap();
    let index_path = dir.path();
    let mut index = TantivyIndex::open_or_create(index_path).unwrap();

    let first_conv = NormalizedConversation {
        agent_slug: "agent1".into(),
        external_id: None,
        title: None,
        workspace: None,
        source_path: "path/1".into(),
        started_at: None,
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1000),
            content: "duplicate content".into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        }],
    };

    // Same content, different timestamp and source path.
    let second_conv = NormalizedConversation {
        agent_slug: "agent1".into(),
        external_id: None,
        title: None,
        workspace: None,
        source_path: "path/2".into(),
        started_at: None,
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(2000),
            content: "duplicate content".into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        }],
    };

    for conversation in [&first_conv, &second_conv] {
        index.add_conversation(conversation).unwrap();
    }
    index.commit().unwrap();

    let client = SearchClient::open(index_path, None).unwrap().unwrap();

    // One hit per page; only the offset varies between the two requests.
    let fetch_page = |offset| {
        client
            .search(
                "duplicate",
                SearchFilters::default(),
                1,
                offset,
                FieldMask::FULL,
            )
            .unwrap()
    };

    let first_page = fetch_page(0);
    assert_eq!(first_page.len(), 1);

    let second_page = fetch_page(1);
    assert_eq!(second_page.len(), 1);

    assert_ne!(first_page[0].source_path, second_page[0].source_path);
}
9585
/// Wildcard and boolean queries must not be counted as prefix-cache misses,
/// while a plain single-term query must be.
///
/// The client is built by hand with no reader and no SQLite connection; the
/// search results themselves are discarded and only the cache-miss counter is
/// inspected after each query.
#[test]
fn cache_skips_complex_queries() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // (query, cumulative cache misses expected afterwards, failure message).
    // Order matters: the counter is cumulative across the three searches.
    let scenarios = [
        ("foo*", 0, "Wildcard query should not trigger cache miss"),
        ("foo OR bar", 0, "Boolean query should not trigger cache miss"),
        ("simple", 1, "Simple query should trigger cache miss"),
    ];

    for (query, expected_misses, reason) in scenarios {
        let _ = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL);
        let stats = client.cache_stats();
        assert_eq!(stats.cache_miss, expected_misses, "{}", reason);
    }
}
9635
/// Stores a hit in the prefix cache under the multi-byte query "こん" and then
/// looks it up with the longer query "こんにちは"; the lookup must succeed and
/// return that one hit.
#[test]
fn cache_prefix_lookup_handles_utf8_boundaries() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let japanese_hit = SearchHit {
        title: "こんにちは".into(),
        snippet: String::new(),
        content: "こんにちは 世界".into(),
        content_hash: stable_content_hash("こんにちは 世界"),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let hits = vec![japanese_hit];

    // Seed the cache with the short prefix...
    client.put_cache("こん", &SearchFilters::default(), &hits);

    // ...and look it up with a longer query that extends that prefix.
    let cached = client
        .cached_prefix_hits("こんにちは", &SearchFilters::default())
        .unwrap();

    assert_eq!(cached.len(), 1);
    assert_eq!(cached[0].hit.title, "こんにちは");
}
9682
/// Two independent checks:
/// 1. a cached hit built from "hello world" matches the query "hello" and
///    rejects the query "missing";
/// 2. each `Metrics` increment helper bumps its own counter by exactly one, as
///    observed through `snapshot_all`.
#[test]
fn bloom_gate_rejects_missing_terms() {
    let text = "hello world";
    let source_hit = SearchHit {
        title: text.into(),
        snippet: text.into(),
        content: text.into(),
        content_hash: stable_content_hash(text),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let cached = cached_hit_from(&source_hit);
    assert!(hit_matches_query_cached(&cached, "hello"));
    assert!(!hit_matches_query_cached(&cached, "missing"));

    let metrics = Metrics::default();
    metrics.inc_cache_hits();
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.inc_reload();

    // The fifth snapshot field is not part of this check.
    let snapshot = metrics.snapshot_all();
    let (hit_count, miss_count, shortfall_count, reload_count, _) = snapshot;
    assert_eq!(
        (hit_count, miss_count, shortfall_count, reload_count),
        (1, 1, 1, 1)
    );
}
9715
/// `ProgressiveLexicalHit::from_search_hit` must drop the full content when the
/// field mask's first flag is `false`, while keeping title, snippet, match
/// type, line number, source path, agent and workspace. With the first flag
/// `true` the content is retained.
#[test]
fn progressive_lexical_hit_omits_unused_content() {
    let body = "hello world from a much larger conversation body";
    let hit = SearchHit {
        title: "hello world".into(),
        snippet: "hello **world**".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        score: 1.0,
        source_path: "p".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: Some(3),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    let without_content_mask = FieldMask::new(false, true, true, true);
    let snippet_only = ProgressiveLexicalHit::from_search_hit(&hit, without_content_mask);

    assert!(
        snippet_only.content.is_empty(),
        "snippet-only progressive cache should not retain full content"
    );
    // Everything else must be carried over untouched.
    assert_eq!(snippet_only.title, hit.title);
    assert_eq!(snippet_only.snippet, hit.snippet);
    assert_eq!(snippet_only.match_type, hit.match_type);
    assert_eq!(snippet_only.line_number, hit.line_number);
    assert_eq!(snippet_only.source_path, hit.source_path);
    assert_eq!(snippet_only.agent, hit.agent);
    assert_eq!(snippet_only.workspace, hit.workspace);

    let with_content_mask = FieldMask::new(true, true, true, true);
    let full = ProgressiveLexicalHit::from_search_hit(&hit, with_content_mask);
    assert_eq!(full.content, hit.content);
}
9755
/// With no reader and no SQLite connection available, `progressive_phase_to_result`
/// must still produce a hit for a scored result by taking its fields from the
/// supplied lexical cache, and must use the score from the scored result
/// (0.91) rather than the cached hit's score (0.0).
///
/// NOTE(review): the cache entry is keyed by 42, which presumably corresponds
/// to the `42` segment of the `doc_id` — confirm against the doc-id parser.
#[test]
fn progressive_phase_reuses_lexical_cache_without_db_hydration() -> Result<()> {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // First flag false: the cached entry is built without full content.
    let field_mask = FieldMask::new(false, true, true, true);
    let lexical_hit = SearchHit {
        title: "lexical title".into(),
        snippet: "lexical snippet".into(),
        content: "full lexical body".into(),
        content_hash: stable_content_hash("full lexical body"),
        score: 0.0,
        source_path: "/tmp/session.jsonl".into(),
        agent: "codex".into(),
        workspace: "/tmp".into(),
        workspace_original: Some("/original".into()),
        created_at: Some(1_700_000_000_000),
        line_number: Some(7),
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };

    let cached_entry = ProgressiveLexicalHit::from_search_hit(&lexical_hit, field_mask);
    let mut lexical_cache = ProgressiveLexicalCache::default();
    lexical_cache.hits_by_message.insert(42, cached_entry);

    let hash_hex = "00".repeat(32);
    let scored = FsScoredResult {
        doc_id: format!("m|42|0|1|1|1|1|1700000000000|{hash_hex}"),
        score: 0.91,
        source: FsScoreSource::Lexical,
        index: None,
        fast_score: None,
        quality_score: None,
        lexical_score: Some(0.91),
        rerank_score: None,
        explanation: None,
        metadata: None,
    };
    let results = vec![scored];

    let phase_context = ProgressivePhaseContext {
        query: "merged title",
        filters: &SearchFilters::default(),
        field_mask,
        lexical_cache: Some(&lexical_cache),
        limit: 1,
        fetch_limit: 1,
    };
    let result = client.progressive_phase_to_result(&results, phase_context)?;

    assert_eq!(result.hits.len(), 1);
    let produced = &result.hits[0];
    assert_eq!(produced.title, lexical_hit.title);
    assert_eq!(produced.snippet, lexical_hit.snippet);
    assert!(
        produced.content.is_empty(),
        "masked lexical cache should still avoid carrying full content"
    );
    assert_eq!(produced.source_path, lexical_hit.source_path);
    assert_eq!(produced.score, 0.91);

    Ok(())
}
9837
/// Indexes a single "codex" conversation and checks that a search for "hello"
/// restricted to the "codex" agent returns that one hit, with the agent set
/// and the query term present in the snippet.
#[test]
fn search_returns_results_with_filters_and_pagination() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "hello rust world".into(),
        extra: serde_json::json!({}),
        snippets: vec![NormalizedSnippet {
            file_path: None,
            start_line: None,
            end_line: None,
            language: None,
            snippet_text: None,
        }],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("hello world convo".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: dir.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut agent_filter = SearchFilters::default();
    agent_filter.agents.insert("codex".into());

    let hits = client.search("hello", agent_filter, 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let only_hit = &hits[0];
    assert_eq!(only_hit.agent, "codex");
    assert!(only_hit.snippet.contains("hello"));
    Ok(())
}
9881
/// Indexes two conversations that both contain "needle" but differ in
/// workspace and timestamp, then searches with a workspace filter ("/ws/b")
/// and a created-at window (15..=25 as filter bounds). Only the second
/// conversation is expected back.
#[test]
fn search_honors_created_range_and_workspace() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // Workspace /ws/a at t=10: excluded by both filters.
    let outside = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("needle one".into()),
        workspace: Some(std::path::PathBuf::from("/ws/a")),
        source_path: dir.path().join("a.jsonl"),
        started_at: Some(10),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(10),
            content: "alpha needle".into(),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        }],
    };

    // Workspace /ws/b at t=20: the one expected to match.
    let inside = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("needle two".into()),
        workspace: Some(std::path::PathBuf::from("/ws/b")),
        source_path: dir.path().join("b.jsonl"),
        started_at: Some(20),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(20),
            content: "\nneedle second line".into(),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        }],
    };

    index.add_conversation(&outside)?;
    index.add_conversation(&inside)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut scoped = SearchFilters::default();
    scoped.workspaces.insert("/ws/b".into());
    scoped.created_from = Some(15);
    scoped.created_to = Some(25);

    let hits = client.search("needle", scoped, 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let only_hit = &hits[0];
    assert_eq!(only_hit.workspace, "/ws/b");
    assert!(only_hit.snippet.contains("second line"));
    Ok(())
}
9955
/// Indexes three distinct conversations that all match "pagination" and checks
/// that requesting limit 1 at offset 1 returns exactly one hit.
#[test]
fn pagination_skips_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    for i in 0..3 {
        // Each document gets unique content so the three are distinguishable.
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100 + i),
            content: format!("pagination needle document number {i}"),
            extra: serde_json::json!({}),
            snippets: vec![NormalizedSnippet {
                file_path: None,
                start_line: None,
                end_line: None,
                language: None,
                snippet_text: None,
            }],
            invocations: Vec::new(),
        };
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("doc-{i}")),
            workspace: Some(std::path::PathBuf::from("/ws/p")),
            source_path: dir.path().join(format!("{i}.jsonl")),
            started_at: Some(100 + i),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // limit = 1, offset = 1
    let second_page = client.search(
        "pagination",
        SearchFilters::default(),
        1,
        1,
        FieldMask::FULL,
    )?;

    assert_eq!(second_page.len(), 1);
    Ok(())
}
10003
/// A hyphenated query ("cma-es") must find a message that contains both
/// "CMA-ES" and "CMA ES", and the snippet must mention "cma"
/// (case-insensitively).
#[test]
fn search_matches_hyphenated_term() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("me".into()),
        created_at: Some(1_700_000_000_000),
        content: "Need CMA-ES strategy and CMA ES variants".into(),
        extra: serde_json::json!({}),
        snippets: vec![NormalizedSnippet {
            file_path: None,
            start_line: None,
            end_line: None,
            language: None,
            snippet_text: None,
        }],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("cma-es notes".into()),
        workspace: Some(std::path::PathBuf::from("/tmp/workspace")),
        source_path: dir.path().join("rollout-1.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");
    let hits = client.search("cma-es", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let snippet_lower = hits[0].snippet.to_lowercase();
    assert!(snippet_lower.contains("cma"));
    Ok(())
}
10043
/// Short prefixes of indexed words ("cal" for "calculate", "entr" for
/// "entropy") must each find the single indexed message.
#[test]
fn search_matches_prefix_edge_ngram() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1000),
        content: "please calculate the entropy".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("math logic".into()),
        workspace: Some(std::path::PathBuf::from("/ws/m")),
        source_path: dir.path().join("math.jsonl"),
        started_at: Some(1000),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Prefix of "calculate".
    let cal_hits = client.search("cal", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(cal_hits.len(), 1);
    assert!(cal_hits[0].content.contains("calculate"));

    // Prefix of "entropy".
    let entr_hits = client.search("entr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(entr_hits.len(), 1);

    Ok(())
}
10084
/// A snake_case identifier in the content ("my_variable_name") must be
/// findable both by a prefix of one of its parts ("vari") and by a query that
/// itself contains an underscore ("my_variable").
#[test]
fn search_matches_snake_case() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "check the my_variable_name please".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("code".into()),
        workspace: None,
        source_path: dir.path().join("c.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let part_prefix_hits =
        client.search("vari", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(part_prefix_hits.len(), 1);

    let underscore_hits = client.search(
        "my_variable",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(underscore_hits.len(), 1);

    Ok(())
}
10130
/// Queries that contain punctuation ("c++", "foo.bar") must still find the
/// message whose content contains those same tokens.
#[test]
fn search_matches_symbols_stripped() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "working with c++ and foo.bar today".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("symbols".into()),
        workspace: None,
        source_path: dir.path().join("s.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let plus_hits = client.search("c++", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(plus_hits.len(), 1);

    let dotted_hits =
        client.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(dotted_hits.len(), 1);

    Ok(())
}
10170
/// Checks that `SearchClient::search` tags its first hit with the `MatchType`
/// that corresponds to the wildcard shape of the query:
/// no wildcard → `Exact`, trailing `*` → `Prefix`, leading `*` → `Suffix`,
/// `*` on both sides → `Substring`.
///
/// Each query is first asserted to return at least one hit, so a regression
/// that yields no results fails with a message naming the query instead of an
/// opaque index-out-of-bounds panic.
///
/// # Errors
/// Propagates any error from creating the temp dir, building the index, or
/// running a search.
#[test]
fn search_sets_match_type_for_wildcards() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: dir.path().join("h.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1),
            content: "the request handler delegates".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // (query, match type expected on the first hit), in the original order.
    let cases = [
        ("handler", MatchType::Exact),
        ("hand*", MatchType::Prefix),
        ("*handler", MatchType::Suffix),
        ("*andle*", MatchType::Substring),
    ];

    for (query, expected) in cases {
        let hits = client.search(query, SearchFilters::default(), 10, 0, FieldMask::FULL)?;
        // Guard the index below so an empty result names the offending query.
        assert!(
            !hits.is_empty(),
            "query {query:?} returned no hits; expected a {expected:?} match"
        );
        assert_eq!(
            hits[0].match_type, expected,
            "unexpected match type for query {query:?}"
        );
    }

    Ok(())
}
10216
/// A bare inner fragment ("andle") passed to `search_with_fallback` must be
/// reported as having used the wildcard fallback, return the one indexed
/// message, and tag the hit as `MatchType::ImplicitWildcard`.
#[test]
fn search_with_fallback_marks_implicit_wildcard() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "the request handler delegates".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conv = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("handlers".into()),
        workspace: None,
        source_path: dir.path().join("h2.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conv)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // NOTE(review): the fifth argument (2) is presumably the sparse-result
    // threshold that triggers the fallback — confirm against the signature.
    let outcome = client.search_with_fallback(
        "andle",
        SearchFilters::default(),
        10,
        0,
        2,
        FieldMask::FULL,
    )?;

    assert!(outcome.wildcard_fallback);
    assert_eq!(outcome.hits.len(), 1);
    let only_hit = &outcome.hits[0];
    assert_eq!(only_hit.match_type, MatchType::ImplicitWildcard);

    Ok(())
}
10262
/// With only an (empty, schema-less) in-memory SQLite connection and no
/// reader, a wildcard query must come back as an empty result rather than an
/// error.
#[test]
fn sqlite_backend_skips_wildcard_queries() -> Result<()> {
    // No tables are created: any attempt to query SQLite would fail.
    let empty_db = Connection::open(":memory:")?;
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(empty_db))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let wildcard_hits =
        client.search("*handler", SearchFilters::default(), 5, 0, FieldMask::FULL)?;

    assert!(
        wildcard_hits.is_empty(),
        "wildcard should skip sqlite fallback, not error"
    );

    Ok(())
}
10292
/// Builds a minimal in-memory SQLite schema with one conversation whose
/// `workspace_id` is NULL (and whose FTS row has a NULL workspace), then
/// searches through a reader-less `SearchClient`. The hit must come back with
/// an empty workspace string, line number 1, and "local" source/origin.
#[test]
fn sqlite_backend_handles_null_workspace() -> Result<()> {
    let conn = Connection::open(":memory:")?;

    // Schema: lookup tables, conversations/messages, and the FTS5 table.
    let schema_sql = "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
            id INTEGER PRIMARY KEY,
            agent_id INTEGER,
            workspace_id INTEGER,
            source_id TEXT,
            origin_host TEXT,
            title TEXT,
            source_path TEXT
         );
         CREATE TABLE messages (
            id INTEGER PRIMARY KEY,
            conversation_id INTEGER,
            idx INTEGER,
            content TEXT,
            created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
            content,
            title,
            agent,
            workspace,
            source_path,
            created_at UNINDEXED,
            content='',
            tokenize='porter'
         );";
    conn.execute_batch(schema_sql)?;

    // Seed rows; note no row is inserted into `workspaces`.
    conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 't', '/tmp/session.jsonl')",
    )?;
    conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;

    // FTS row mirrors the message, with a NULL workspace column.
    let fts_insert_sql = "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)";
    conn.execute_compat(
        fts_insert_sql,
        params![
            1_i64,
            "auth token failure",
            "t",
            "codex",
            "/tmp/session.jsonl",
            42_i64
        ],
    )?;

    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let hits = client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    let only_hit = &hits[0];
    assert_eq!(only_hit.workspace, "");
    assert_eq!(only_hit.line_number, Some(1));
    assert_eq!(only_hit.source_id, "local");
    assert_eq!(only_hit.origin_kind, "local");
    Ok(())
}
10371
/// Regression test: the SQLite FTS5 backend must keep working against the
/// legacy `fts_messages` layout that stores an explicit `message_id` column.
///
/// The fixture seeds one conversation/message pair plus a matching FTS row
/// whose `rowid` (1) differs from its `message_id` (42), then checks that a
/// search for "auth" yields exactly that message with its title, source path,
/// workspace and content intact. The message sits at `idx` 4 and the hit is
/// expected to report `line_number` 5.
#[test]
fn sqlite_backend_supports_legacy_fts_message_id_schema() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             message_id UNINDEXED,
             tokenize='porter'
         );",
    )?;

    // Relational rows backing the single FTS entry, inserted in dependency order.
    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/legacy')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, 1, 'local', NULL, 'legacy title', '/tmp/legacy.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(42, 1, 4, 'legacy auth token failure', 99)",
    ] {
        db.execute(seed_sql)?;
    }

    // The FTS row points at message 42 through the legacy `message_id` column.
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at, message_id)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
        params![
            1_i64,
            "legacy auth token failure",
            "legacy title",
            "codex",
            "/legacy",
            "/tmp/legacy.jsonl",
            99_i64,
            42_i64
        ],
    )?;

    // No Tantivy reader: the client is built around the SQLite connection only.
    let sqlite = Mutex::new(Some(SendConnection(db)));
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let search_client = SearchClient {
        reader: None,
        sqlite,
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let results = search_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(results.len(), 1);

    let hit = &results[0];
    assert_eq!(hit.title, "legacy title");
    assert_eq!(hit.source_path, "/tmp/legacy.jsonl");
    assert_eq!(hit.workspace, "/legacy");
    assert_eq!(hit.line_number, Some(5));
    assert_eq!(hit.content, "legacy auth token failure");
    Ok(())
}
10458
/// A live Tantivy reader is authoritative: when it finds nothing, `search`
/// must not fall back to SQLite FTS5 to fill in results.
///
/// The fixture pairs an empty (but openable) Tantivy index with a SQLite
/// database holding one row that only the FTS5 path can match. Calling
/// `search_sqlite_fts5` directly proves the row is reachable; the regular
/// `search` entry point is then expected to come back empty.
#[test]
fn tantivy_reader_skips_sqlite_fallback_on_empty_lexical_results() -> Result<()> {
    // Empty, committed Tantivy index that a reader can still be opened on.
    let index_dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(index_dir.path())?;
    index.commit()?;
    let tantivy_reader =
        fs_cass_open_search_reader(index_dir.path(), ReloadPolicy::Manual).ok();
    assert!(
        tantivy_reader.is_some(),
        "test fixture should open a Tantivy reader even with an empty index"
    );

    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             content='',
             tokenize='porter'
         );",
    )?;

    // One conversation/message that exists only on the SQLite side.
    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/sqlite-only')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, 1, 'local', NULL, 'sqlite fallback only', '/tmp/sqlite-only.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at)
         VALUES(1, 1, 0, 'sqliteonlytoken overflow candidate', 42)",
    ] {
        db.execute(seed_sql)?;
    }
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            1_i64,
            "sqliteonlytoken overflow candidate",
            "sqlite fallback only",
            "codex",
            "/sqlite-only",
            "/tmp/sqlite-only.jsonl",
            42_i64
        ],
    )?;

    let sqlite = Mutex::new(Some(SendConnection(db)));
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let search_client = SearchClient {
        reader: tantivy_reader,
        sqlite,
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Sanity check: the FTS5 path on its own does find the seeded row.
    let direct_fts_hits = search_client.search_sqlite_fts5(
        Path::new(":memory:"),
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(
        direct_fts_hits.len(),
        1,
        "fixture should prove sqlite fallback would have produced a hit"
    );

    // The public entry point must trust the (empty) Tantivy answer.
    let search_hits = search_client.search(
        "sqliteonlytoken",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        search_hits.is_empty(),
        "a live Tantivy reader should prevent sqlite fallback from populating empty lexical results"
    );
    Ok(())
}
10571
/// Opening the search-side SQLite guard must be a read-only affair.
///
/// After a database is seeded through `FrankenStorage` and its
/// `fts_frankensqlite_rebuild_generation` meta row is deleted, acquiring
/// `SearchClient::sqlite_guard` is expected to:
/// * yield a connection whose `PRAGMA cache_size` equals
///   `-SEARCH_SQLITE_HYDRATION_CACHE_KIB`,
/// * leave the deleted meta row absent, and
/// * leave the number of `fts_messages` schema rows unchanged.
#[test]
fn sqlite_guard_does_not_repair_fts_when_generation_key_stale() -> Result<()> {
    const GENERATION_KEY: &str = "fts_frankensqlite_rebuild_generation";

    let scratch = TempDir::new()?;
    let db_path = scratch.path().join("stale-gen-fts.db");

    // Seed one conversation through the storage layer; the handle is dropped
    // at the end of this scope.
    {
        let storage = FrankenStorage::open(&db_path)?;
        let agent_id = storage.ensure_agent(&Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        })?;
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_050),
            content: "message that should remain queryable".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("stale-gen-fts".into()),
            title: Some("Stale FTS generation".into()),
            source_path: PathBuf::from("/tmp/stale-gen-fts.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: "local".into(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, None, &conversation)?;
    }

    let schema_rows_before = sqlite_master_name_count(&db_path, "fts_messages")
        .context("count schema rows before generation key deletion")?;

    // Remove the generation marker so the metadata looks stale.
    {
        let writer = FrankenConnection::open(db_path.to_string_lossy().into_owned())?;
        writer.execute_compat(
            "DELETE FROM meta WHERE key = ?1",
            &[ParamValue::from(GENERATION_KEY)],
        )?;
    }

    // The client only knows the path; the guard has to open the file itself.
    let search_client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: Some(db_path.clone()),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let guard = search_client
        .sqlite_guard()
        .context("open sqlite guard for stale generation fixture")?;
    assert!(guard.is_some(), "sqlite guard should open the db");
    let guarded_conn = guard
        .as_ref()
        .expect("sqlite guard should hold a connection");
    let no_params: [ParamValue; 0] = [];
    let cache_size: i64 =
        guarded_conn.query_row_map("PRAGMA cache_size;", &no_params, |row| row.get_typed(0))?;
    assert_eq!(
        cache_size, -SEARCH_SQLITE_HYDRATION_CACHE_KIB,
        "search hydration should not inherit the general storage cache profile"
    );
    drop(guard);

    // Inspect the file through an independent connection.
    let inspector = FrankenConnection::open(db_path.to_string_lossy().into_owned())?;
    let generation_after: Option<String> = inspector
        .query_row_map(
            "SELECT value FROM meta WHERE key = ?1",
            &[ParamValue::from(GENERATION_KEY)],
            |row| row.get_typed(0),
        )
        .optional()?;
    assert!(
        generation_after.is_none(),
        "search sqlite guard must not mutate FTS rebuild metadata"
    );

    let schema_rows_after = sqlite_master_name_count(&db_path, "fts_messages")
        .context("count schema rows after sqlite guard reopen")?;
    assert_eq!(
        schema_rows_after, schema_rows_before,
        "read-only reopen must leave FTS schema state unchanged"
    );

    Ok(())
}
10688
/// End-to-end check of the file-backed SQLite fallback for hyphenated ids.
///
/// Two conversations in different workspaces each contain the token `br-123`.
/// The test verifies, both before the seeding connection is closed and after
/// the client reopens the file through `sqlite_guard`, that the FTS table
/// holds both rows and that the query produced by `transpile_to_fts5("br-123")`
/// matches them. It then drives `SearchClient::search` with several spellings
/// of the id (plain, leading `OR`, dotted, dotted prefix, prefix wildcard) and
/// finally with a workspace filter that must narrow the result to one hit.
#[test]
fn sqlite_path_rusqlite_fallback_matches_hyphenated_ids_with_workspace_filter() -> Result<()> {
    let temp_dir = TempDir::new()?;
    let db_path = temp_dir.path().join("hyphenated-rusqlite-fallback.db");

    // Seed the database file; the storage handle is dropped at the end of
    // this scope so the client below has to reopen the file.
    {
        let storage = FrankenStorage::open(&db_path)?;
        storage.ensure_search_fallback_fts_consistency()?;
        let conn = storage.raw();
        conn.execute(
            "INSERT INTO agents(id, slug, name, kind, created_at, updated_at)
             VALUES(1, 'codex', 'Codex', 'codex', 1, 1)",
        )?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/ws/alpha')")?;
        conn.execute("INSERT INTO workspaces(id, path) VALUES(2, '/ws/beta')")?;
        conn.execute(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(1, 1, 1, 'local', NULL, 'alpha bead', '/tmp/alpha.jsonl')",
        )?;
        conn.execute(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
             VALUES(2, 1, 2, 'local', NULL, 'beta bead', '/tmp/beta.jsonl')",
        )?;
        conn.execute(
            "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
             VALUES(11, 1, 0, 'user', 'Need follow-up on br-123 root cause', 100)",
        )?;
        conn.execute(
            "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
             VALUES(12, 2, 0, 'user', 'Need follow-up on br-123 user report', 101)",
        )?;
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            &[
                ParamValue::from(11_i64),
                ParamValue::from("Need follow-up on br-123 root cause"),
                ParamValue::from("alpha bead"),
                ParamValue::from("codex"),
                ParamValue::from("/ws/alpha"),
                ParamValue::from("/tmp/alpha.jsonl"),
                ParamValue::from(100_i64),
            ],
        )?;
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            &[
                ParamValue::from(12_i64),
                ParamValue::from("Need follow-up on br-123 user report"),
                ParamValue::from("beta bead"),
                ParamValue::from("codex"),
                ParamValue::from("/ws/beta"),
                ParamValue::from("/tmp/beta.jsonl"),
                ParamValue::from(101_i64),
            ],
        )?;

        // Baseline on the seeding connection: both rows exist and both match
        // the transpiled query.
        let preclose_total_rows = conn.query("SELECT rowid FROM fts_messages")?;
        assert_eq!(
            preclose_total_rows.len(),
            2,
            "freshly seeded file-backed FTS should retain the inserted rows"
        );
        let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
        let preclose_rows = conn.query_with_params(
            "SELECT rowid FROM fts_messages WHERE fts_messages MATCH ?",
            &params_from_iter(vec![ParamValue::from(transpiled.as_str())]),
        )?;
        assert_eq!(
            preclose_rows.len(),
            2,
            "freshly seeded file-backed FTS should match the transpiled hyphenated query before reopen"
        );
    }

    // The client is given only the path, so every query goes through a
    // connection reopened from the file.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: Some(db_path),
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Same two checks as above, this time on the reopened connection.
    let guard = client.sqlite_guard()?;
    let conn = guard.as_ref().expect("sqlite guard should reopen file db");
    let reopened_total_rows = conn.query("SELECT rowid FROM fts_messages")?;
    assert_eq!(
        reopened_total_rows.len(),
        2,
        "reopened file-backed FTS should still contain the seeded rows"
    );
    let transpiled = transpile_to_fts5("br-123").expect("transpiled fallback query");
    let raw_rows = conn.query_with_params(
        "SELECT rowid FROM fts_messages WHERE fts_messages MATCH ?",
        &params_from_iter(vec![ParamValue::from(transpiled.as_str())]),
    )?;
    assert_eq!(
        raw_rows.len(),
        2,
        "reopened file-backed FTS should still match the transpiled hyphenated query"
    );
    // Release the guard before calling `search` below.
    drop(guard);

    let all_hits = client.search("br-123", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(all_hits.len(), 2);
    assert!(
        all_hits.iter().all(|hit| hit.content.contains("br-123")),
        "hyphenated bead IDs should survive the file-backed sqlite fallback path"
    );

    // A dangling leading operator must not change the result set.
    let leading_or_hits = client.search(
        "OR br-123",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(leading_or_hits.len(), 2);

    // Dotted and wildcard spellings of the id are expected to match both rows.
    let dotted_hits = client.search(
        "br-123.jsonl",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(dotted_hits.len(), 2);

    let dotted_prefix_hits = client.search(
        "br-123.json*",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(dotted_prefix_hits.len(), 2);

    let prefix_hits =
        client.search("br-12*", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(prefix_hits.len(), 2);

    // Restricting to one workspace must leave only that workspace's message.
    let filtered_hits = client.search(
        "br-123",
        SearchFilters {
            workspaces: HashSet::from_iter(["/ws/beta".to_string()]),
            ..SearchFilters::default()
        },
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(filtered_hits.len(), 1);
    assert_eq!(filtered_hits[0].workspace, "/ws/beta");
    assert_eq!(filtered_hits[0].source_path, "/tmp/beta.jsonl");
    assert!(filtered_hits[0].content.contains("br-123"));

    Ok(())
}
10858
/// The SQLite FTS5 backend must return hits ordered by relevance.
///
/// Two messages match "auth": one repeats the term three times ("best"), the
/// other mentions it once ("worse"). Both the direct FTS5 call and the public
/// `search` entry point must find both rows, and `search` must rank "best"
/// first with a strictly higher score.
#[test]
fn sqlite_backend_orders_hits_by_bm25_score() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             content='',
             tokenize='porter'
         );",
    )?;

    // Relational rows for the two competing conversations.
    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/ws')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'best', '/tmp/best.jsonl')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'worse', '/tmp/worse.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(7, 1, 0, 'auth auth auth failure', 42)",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(8, 2, 0, 'auth failure', 43)",
    ] {
        db.execute(seed_sql)?;
    }

    // Matching FTS rows: (rowid, content, title, source_path, created_at).
    for (rowid, content, title, source_path, created_at) in [
        (7_i64, "auth auth auth failure", "best", "/tmp/best.jsonl", 42_i64),
        (8_i64, "auth failure", "worse", "/tmp/worse.jsonl", 43_i64),
    ] {
        db.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            params![rowid, content, title, "codex", "/ws", source_path, created_at],
        )?;
    }

    let sqlite = Mutex::new(Some(SendConnection(db)));
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let search_client = SearchClient {
        reader: None,
        sqlite,
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The raw FTS5 path sees both rows.
    let direct_hits = search_client.search_sqlite_fts5(
        Path::new(":memory:"),
        "auth",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(direct_hits.len(), 2);

    // The public path sees both rows, best match first.
    let ranked = search_client.search("auth", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(ranked.len(), 2);
    let (top, runner_up) = (&ranked[0], &ranked[1]);
    assert_eq!(top.title, "best");
    assert_eq!(runner_up.title, "worse");
    assert!(top.score > runner_up.score);

    Ok(())
}
10964
/// The FTS5 fallback is split into a ranking query and a hydration query.
///
/// This test inspects the generated SQL text only: the ranking query must not
/// reference `fts_messages.content`, must carry `LIMIT ? OFFSET ?`, and must
/// bind exactly three parameters, while the hydration query must still select
/// `fts_messages.content`.
#[test]
fn sqlite_fts5_ranked_phase_defers_content_decode_until_after_limit() {
    let default_filters = SearchFilters::default();
    let (ranking_sql, ranking_params) =
        SearchClient::sqlite_fts5_rank_query("auth", &default_filters, 50, 0, false);

    let every_field = FieldMask::new(true, true, true, true);
    let hydration_sql = SearchClient::sqlite_fts5_hydrate_query(2, every_field, false);

    let rank_touches_content = ranking_sql.contains("fts_messages.content");
    assert!(
        !rank_touches_content,
        "rank query must not decode large content rows before LIMIT"
    );

    let hydration_selects_content = hydration_sql.contains("fts_messages.content");
    assert!(
        hydration_selects_content,
        "hydration query should still provide requested content"
    );

    let rank_is_paged = ranking_sql.contains("LIMIT ? OFFSET ?");
    assert!(
        rank_is_paged,
        "rank query must apply page bounds before hydration"
    );

    assert_eq!(
        ranking_params.len(),
        3,
        "fts query plus limit and offset params"
    );
}
10989
/// Hydration of large FTS5 result pages must be split into bounded chunks.
///
/// First shows that a single hydration query for
/// `SQLITE_MAX_VARIABLE_NUMBER + 1` rows contains more `?` placeholders than
/// the bind limit allows. Then checks that
/// `SearchClient::sqlite_fts5_hydrate_row_chunks` splits
/// `SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17` ranked rows into one full chunk plus
/// a 17-row remainder, each no larger than the bind limit.
#[test]
fn sqlite_fts5_hydration_chunks_stay_below_bind_variable_limit() {
    // One-shot query for one row more than the bind limit.
    let one_shot_sql = SearchClient::sqlite_fts5_hydrate_query(
        SQLITE_MAX_VARIABLE_NUMBER + 1,
        FieldMask::new(true, true, true, true),
        false,
    );
    let placeholder_count = one_shot_sql.matches('?').count();
    assert!(
        placeholder_count > SQLITE_MAX_VARIABLE_NUMBER,
        "the pre-fix one-shot hydration query would exceed frankensqlite's bind limit"
    );

    // Synthetic (rowid, score) pairs: one full chunk plus 17 extra rows.
    let row_total = SQLITE_FTS5_HYDRATE_PARAM_CHUNK + 17;
    let ranked_rows: Vec<(i64, f64)> = (0..row_total)
        .map(|n| (n as i64, n as f64))
        .collect();

    let chunk_sizes: Vec<usize> = SearchClient::sqlite_fts5_hydrate_row_chunks(&ranked_rows)
        .map(|chunk| chunk.len())
        .collect();

    assert_eq!(
        chunk_sizes,
        vec![SQLITE_FTS5_HYDRATE_PARAM_CHUNK, 17],
        "large fallback pages must hydrate in bounded chunks while preserving rank windows"
    );

    let all_within_limit = chunk_sizes
        .iter()
        .copied()
        .all(|len| len <= SQLITE_MAX_VARIABLE_NUMBER);
    assert!(
        all_within_limit,
        "every hydration chunk must fit under frankensqlite's bind-variable ceiling"
    );
}
11022
/// Fallback content hydration must pick the conversation belonging to the
/// requested source, not merely one sharing the same source path.
///
/// Two conversations share `/tmp/shared-fallback.jsonl` and both have a
/// message at `idx` 2. One has an empty `source_id` with `origin_host`
/// 'devbox', the other is 'local'. Hydrating the fallback key
/// `("devbox", path, 2)` must return the first conversation's content.
#[test]
fn tantivy_fallback_hydration_narrows_by_normalized_source_before_message_lookup() -> Result<()>
{
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             source_id TEXT,
             origin_host TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER NOT NULL,
             content TEXT NOT NULL,
             UNIQUE(conversation_id, idx)
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
    )?;

    // Two conversations colliding on source_path and message idx.
    for seed_sql in [
        "INSERT INTO conversations(id, source_id, origin_host, source_path)
         VALUES(1, '', 'devbox', '/tmp/shared-fallback.jsonl')",
        "INSERT INTO conversations(id, source_id, origin_host, source_path)
         VALUES(2, 'local', NULL, '/tmp/shared-fallback.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content)
         VALUES(10, 1, 2, 'remote fallback content')",
        "INSERT INTO messages(id, conversation_id, idx, content)
         VALUES(20, 2, 2, 'local content must not win')",
    ] {
        db.execute(seed_sql)?;
    }

    let sqlite = Mutex::new(Some(SendConnection(db)));
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let search_client = SearchClient {
        reader: None,
        sqlite,
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // (source, source_path, message idx) for the remote conversation.
    let remote_key = (
        "devbox".to_string(),
        "/tmp/shared-fallback.jsonl".to_string(),
        2,
    );
    let (_, fallback_contents) =
        search_client.hydrate_tantivy_hit_contents(&[], std::slice::from_ref(&remote_key))?;

    let resolved = fallback_contents.get(&remote_key).map(String::as_str);
    assert_eq!(resolved, Some("remote fallback content"));

    Ok(())
}
11092
/// `hydrate_message_content_by_conversation` must return content for exactly
/// the `(conversation_id, idx)` pairs requested that exist.
///
/// Conversation 1 has eight messages (idx 0..8) and conversation 2 has one.
/// Requesting three existing pairs plus the non-existent `(1, 99)` must yield
/// a map with exactly the three existing entries.
#[test]
fn exact_content_hydration_returns_only_requested_message_indices() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER NOT NULL,
             content TEXT NOT NULL,
             UNIQUE(conversation_id, idx)
         );",
    )?;

    // Eight rows for conversation 1, a single row for conversation 2.
    for idx in 0..8 {
        let insert_sql = format!(
            "INSERT INTO messages(conversation_id, idx, content)
             VALUES(1, {idx}, 'conversation one row {idx}')"
        );
        db.execute(&insert_sql)?;
    }
    db.execute(
        "INSERT INTO messages(conversation_id, idx, content)
         VALUES(2, 0, 'conversation two row 0')",
    )?;

    // The request is deliberately unordered and includes a missing index.
    let contents =
        hydrate_message_content_by_conversation(&db, &[(1, 6), (1, 2), (2, 0), (1, 99)])?;

    assert_eq!(contents.len(), 3);

    let row_1_2 = contents.get(&(1, 2)).map(String::as_str);
    assert_eq!(row_1_2, Some("conversation one row 2"));

    let row_1_6 = contents.get(&(1, 6)).map(String::as_str);
    assert_eq!(row_1_6, Some("conversation one row 6"));

    let row_2_0 = contents.get(&(2, 0)).map(String::as_str);
    assert_eq!(row_2_0, Some("conversation two row 0"));

    assert!(!contents.contains_key(&(1, 99)));

    Ok(())
}
11137
/// Hits served by the SQLite FTS5 backend must carry a snippet derived from
/// their content.
///
/// A single message is indexed and searched for "delta". The hit's snippet
/// must equal `snippet_from_content` applied to the hit's own content and
/// must contain the search term.
#[test]
fn sqlite_backend_generates_snippet_from_content() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             content='',
             tokenize='porter'
         );",
    )?;

    // One conversation with one message, mirrored into the FTS table.
    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/ws')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, 'local', NULL, 'snippet title', '/tmp/snippet.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'alpha beta gamma delta epsilon zeta eta theta', 42)",
    ] {
        db.execute(seed_sql)?;
    }
    db.execute_compat(
        "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
         VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
        params![
            1_i64,
            "alpha beta gamma delta epsilon zeta eta theta",
            "snippet title",
            "codex",
            "/ws",
            "/tmp/snippet.jsonl",
            42_i64
        ],
    )?;

    let sqlite = Mutex::new(Some(SendConnection(db)));
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let search_client = SearchClient {
        reader: None,
        sqlite,
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let results = search_client.search("delta", SearchFilters::default(), 5, 0, FieldMask::FULL)?;
    assert_eq!(results.len(), 1);

    // The snippet must be exactly what the content-based helper produces.
    let hit = &results[0];
    assert_eq!(hit.snippet, snippet_from_content(&hit.content));
    assert!(hit.snippet.contains("delta"));

    Ok(())
}
11218
/// `browse_by_date` on the SQLite backend must honour the source filter.
///
/// The fixture holds one conversation whose `source_id` is stored as
/// `' local '` (padded with spaces) and one from the 'laptop' source. Both
/// `SourceFilter::Local` and `SourceFilter::SourceId(" LOCAL ")` (padded,
/// upper-case) must return only the first conversation, reported with the
/// source id "local".
#[test]
fn sqlite_backend_respects_source_filter() -> Result<()> {
    let db = Connection::open(":memory:")?;
    db.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             content='',
             tokenize='porter'
         );",
    )?;

    // One local and one remote conversation with identical message text.
    for seed_sql in [
        "INSERT INTO sources(id, kind) VALUES('local', 'local')",
        "INSERT INTO sources(id, kind) VALUES('laptop', 'ssh')",
        "INSERT INTO agents(id, slug) VALUES(1, 'codex')",
        "INSERT INTO workspaces(id, path) VALUES(1, '/local')",
        "INSERT INTO workspaces(id, path) VALUES(2, '/remote')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, 1, ' local ', NULL, 'local title', '/tmp/local.jsonl')",
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 2, 'laptop', 'dev@laptop', 'remote title', '/tmp/remote.jsonl')",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)",
        "INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)",
    ] {
        db.execute(seed_sql)?;
    }

    // Matching FTS rows: (rowid, title, workspace, source_path, created_at).
    for (rowid, title, workspace, source_path, created_at) in [
        (1_i64, "local title", "/local", "/tmp/local.jsonl", 42_i64),
        (2_i64, "remote title", "/remote", "/tmp/remote.jsonl", 43_i64),
    ] {
        db.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            params![
                rowid,
                "auth token failure",
                title,
                "codex",
                workspace,
                source_path,
                created_at
            ],
        )?;
    }

    let sqlite = Mutex::new(Some(SendConnection(db)));
    let prefix_cache = Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP));
    let cache_namespace = format!("v{CACHE_KEY_VERSION}|schema:test");
    let search_client = SearchClient {
        reader: None,
        sqlite,
        sqlite_path: None,
        prefix_cache,
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace,
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Generic "local" filter.
    let local_only = SearchFilters {
        source_filter: SourceFilter::Local,
        ..SearchFilters::default()
    };
    let local_hits = search_client.browse_by_date(local_only, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(local_hits.len(), 1);
    assert_eq!(local_hits[0].source_id, "local");

    // Explicit source id given with padding and different case.
    let explicit_local = SearchFilters {
        source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
        ..SearchFilters::default()
    };
    let explicit_local_hits =
        search_client.browse_by_date(explicit_local, 5, 0, true, FieldMask::FULL)?;
    assert_eq!(explicit_local_hits.len(), 1);
    assert_eq!(explicit_local_hits[0].source_id, "local");
    assert_eq!(explicit_local_hits[0].origin_kind, "local");

    Ok(())
}
11337
11338 #[test]
11339 fn sqlite_backend_remote_source_filter_matches_blank_source_id_with_origin_host() -> Result<()>
11340 {
11341 let conn = Connection::open(":memory:")?;
11342 conn.execute_batch(
11343 "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11344 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
11345 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
11346 CREATE TABLE conversations (
11347 id INTEGER PRIMARY KEY,
11348 agent_id INTEGER,
11349 workspace_id INTEGER,
11350 source_id TEXT,
11351 origin_host TEXT,
11352 title TEXT,
11353 source_path TEXT
11354 );
11355 CREATE TABLE messages (
11356 id INTEGER PRIMARY KEY,
11357 conversation_id INTEGER,
11358 idx INTEGER,
11359 content TEXT,
11360 created_at INTEGER
11361 );
11362 CREATE VIRTUAL TABLE fts_messages USING fts5(
11363 content,
11364 title,
11365 agent,
11366 workspace,
11367 source_path,
11368 created_at UNINDEXED,
11369 content='',
11370 tokenize='porter'
11371 );",
11372 )?;
11373 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11374 conn.execute(
11375 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11376 VALUES(1, 1, NULL, ' ', 'dev@laptop', 'remote title', '/tmp/remote-filter.jsonl')",
11377 )?;
11378 conn.execute(
11379 "INSERT INTO messages(id, conversation_id, idx, content, created_at)
11380 VALUES(1, 1, 0, 'remote filter proof', 42)",
11381 )?;
11382 conn.execute_compat(
11383 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11384 VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
11385 params![
11386 1_i64,
11387 "remote filter proof",
11388 "remote title",
11389 "codex",
11390 "/tmp/remote-filter.jsonl",
11391 42_i64
11392 ],
11393 )?;
11394
11395 let client = SearchClient {
11396 reader: None,
11397 sqlite: Mutex::new(Some(SendConnection(conn))),
11398 sqlite_path: None,
11399 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11400 reload_on_search: true,
11401 last_reload: Mutex::new(None),
11402 last_generation: Mutex::new(None),
11403 reload_epoch: Arc::new(AtomicU64::new(0)),
11404 warm_tx: None,
11405 _warm_handle: None,
11406 metrics: Metrics::default(),
11407 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11408 semantic: Mutex::new(None),
11409 last_tantivy_total_count: Mutex::new(None),
11410 };
11411
11412 let remote_hits = client.search(
11413 "remote",
11414 SearchFilters {
11415 source_filter: SourceFilter::Remote,
11416 ..Default::default()
11417 },
11418 5,
11419 0,
11420 FieldMask::FULL,
11421 )?;
11422 assert_eq!(remote_hits.len(), 1);
11423 assert_eq!(remote_hits[0].source_id, "dev@laptop");
11424 assert_eq!(remote_hits[0].origin_kind, "remote");
11425 assert_eq!(remote_hits[0].origin_host.as_deref(), Some("dev@laptop"));
11426
11427 let source_hits = client.search(
11428 "remote",
11429 SearchFilters {
11430 source_filter: SourceFilter::SourceId("dev@laptop".into()),
11431 ..Default::default()
11432 },
11433 5,
11434 0,
11435 FieldMask::FULL,
11436 )?;
11437 assert_eq!(source_hits.len(), 1);
11438 assert_eq!(source_hits[0].source_id, "dev@laptop");
11439 assert_eq!(source_hits[0].origin_kind, "remote");
11440
11441 Ok(())
11442 }
11443
11444 #[test]
11445 fn sqlite_backend_workspace_filter_matches_null_workspace_as_empty_string() -> Result<()> {
11446 let conn = Connection::open(":memory:")?;
11447 conn.execute_batch(
11448 "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11449 CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
11450 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
11451 CREATE TABLE conversations (
11452 id INTEGER PRIMARY KEY,
11453 agent_id INTEGER,
11454 workspace_id INTEGER,
11455 source_id TEXT,
11456 origin_host TEXT,
11457 title TEXT,
11458 source_path TEXT
11459 );
11460 CREATE TABLE messages (
11461 id INTEGER PRIMARY KEY,
11462 conversation_id INTEGER,
11463 idx INTEGER,
11464 content TEXT,
11465 created_at INTEGER
11466 );
11467 CREATE VIRTUAL TABLE fts_messages USING fts5(
11468 content,
11469 title,
11470 agent,
11471 workspace,
11472 source_path,
11473 created_at UNINDEXED,
11474 content='',
11475 tokenize='porter'
11476 );",
11477 )?;
11478 conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
11479 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11480 conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/named')")?;
11481 conn.execute(
11483 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(1, 1, NULL, 'local', NULL, 'null workspace', '/tmp/null-workspace.jsonl')",
11484 )?;
11485 conn.execute(
11487 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path) VALUES(2, 1, 1, 'local', NULL, 'named workspace', '/tmp/named-workspace.jsonl')",
11488 )?;
11489 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(1, 1, 0, 'auth token failure', 42)")?;
11490 conn.execute("INSERT INTO messages(id, conversation_id, idx, content, created_at) VALUES(2, 2, 0, 'auth token failure', 43)")?;
11491 conn.execute_compat(
11492 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11493 VALUES(?1, ?2, ?3, ?4, NULL, ?5, ?6)",
11494 params![
11495 1_i64,
11496 "auth token failure",
11497 "null workspace",
11498 "codex",
11499 "/tmp/null-workspace.jsonl",
11500 42_i64
11501 ],
11502 )?;
11503 conn.execute_compat(
11504 "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at)
11505 VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
11506 params![
11507 2_i64,
11508 "auth token failure",
11509 "named workspace",
11510 "codex",
11511 "/named",
11512 "/tmp/named-workspace.jsonl",
11513 43_i64
11514 ],
11515 )?;
11516
11517 let client = SearchClient {
11518 reader: None,
11519 sqlite: Mutex::new(Some(SendConnection(conn))),
11520 sqlite_path: None,
11521 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11522 reload_on_search: true,
11523 last_reload: Mutex::new(None),
11524 last_generation: Mutex::new(None),
11525 reload_epoch: Arc::new(AtomicU64::new(0)),
11526 warm_tx: None,
11527 _warm_handle: None,
11528 metrics: Metrics::default(),
11529 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11530 semantic: Mutex::new(None),
11531 last_tantivy_total_count: Mutex::new(None),
11532 };
11533
11534 let hits = client.search(
11535 "auth",
11536 SearchFilters {
11537 workspaces: HashSet::from_iter([String::new()]),
11538 ..SearchFilters::default()
11539 },
11540 5,
11541 0,
11542 FieldMask::FULL,
11543 )?;
11544 assert_eq!(hits.len(), 1);
11545 assert_eq!(hits[0].workspace, "");
11546 assert_eq!(hits[0].source_path, "/tmp/null-workspace.jsonl");
11547
11548 Ok(())
11549 }
11550
11551 #[test]
11552 fn browse_by_date_treats_null_workspace_and_source_as_local() -> Result<()> {
11553 let conn = Connection::open(":memory:")?;
11554 conn.execute_batch(
11555 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11556 CREATE TABLE conversations (
11557 id INTEGER PRIMARY KEY,
11558 agent_id INTEGER NOT NULL,
11559 workspace_id INTEGER,
11560 source_id TEXT,
11561 origin_host TEXT,
11562 title TEXT,
11563 source_path TEXT NOT NULL
11564 );
11565 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11566 CREATE TABLE messages (
11567 id INTEGER PRIMARY KEY,
11568 conversation_id INTEGER NOT NULL,
11569 idx INTEGER,
11570 content TEXT NOT NULL,
11571 created_at INTEGER
11572 );
11573 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11574 )?;
11575 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11576 conn.execute(
11577 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11578 VALUES(1, 1, NULL, NULL, NULL, 'browse title', '/tmp/browse.jsonl')",
11579 )?;
11580 conn.execute(
11581 "INSERT INTO messages(id, conversation_id, idx, content, created_at)
11582 VALUES(1, 1, 0, 'browse auth token failure', 123)",
11583 )?;
11584
11585 let client = SearchClient {
11586 reader: None,
11587 sqlite: Mutex::new(Some(SendConnection(conn))),
11588 sqlite_path: None,
11589 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11590 reload_on_search: true,
11591 last_reload: Mutex::new(None),
11592 last_generation: Mutex::new(None),
11593 reload_epoch: Arc::new(AtomicU64::new(0)),
11594 warm_tx: None,
11595 _warm_handle: None,
11596 metrics: Metrics::default(),
11597 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11598 semantic: Mutex::new(None),
11599 last_tantivy_total_count: Mutex::new(None),
11600 };
11601
11602 let hits = client.browse_by_date(
11603 SearchFilters {
11604 workspaces: HashSet::from_iter([String::new()]),
11605 source_filter: SourceFilter::Local,
11606 ..SearchFilters::default()
11607 },
11608 5,
11609 0,
11610 true,
11611 FieldMask::FULL,
11612 )?;
11613 assert_eq!(hits.len(), 1);
11614 assert_eq!(hits[0].workspace, "");
11615 assert_eq!(hits[0].source_id, "local");
11616 assert_eq!(hits[0].origin_kind, "local");
11617
11618 Ok(())
11619 }
11620
11621 #[test]
11622 fn hydrate_semantic_hits_with_ids_snippet_only_uses_full_content_for_snippets_and_identity()
11623 -> Result<()> {
11624 let conn = Connection::open(":memory:")?;
11625 conn.execute_batch(
11626 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11627 CREATE TABLE conversations (
11628 id INTEGER PRIMARY KEY,
11629 agent_id INTEGER NOT NULL,
11630 workspace_id INTEGER,
11631 source_id TEXT,
11632 origin_host TEXT,
11633 title TEXT,
11634 source_path TEXT NOT NULL,
11635 started_at INTEGER
11636 );
11637 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11638 CREATE TABLE messages (
11639 id INTEGER PRIMARY KEY,
11640 conversation_id INTEGER NOT NULL,
11641 idx INTEGER,
11642 role TEXT,
11643 content TEXT NOT NULL,
11644 created_at INTEGER
11645 );
11646 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11647 )?;
11648 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11649 conn.execute(
11650 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11651 VALUES(1, 1, NULL, 'local', NULL, 'semantic title', '/tmp/semantic.jsonl', 100)",
11652 )?;
11653 let shared_prefix = "shared-prefix ".repeat(32);
11654 let first = format!("{shared_prefix}first unique semantic tail");
11655 let second = format!("{shared_prefix}second unique semantic tail");
11656 conn.execute_with_params(
11657 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11658 VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
11659 &[
11660 fsqlite_types::value::SqliteValue::Integer(1),
11661 fsqlite_types::value::SqliteValue::Integer(0),
11662 fsqlite_types::value::SqliteValue::Text(first.clone().into()),
11663 fsqlite_types::value::SqliteValue::Integer(101),
11664 ],
11665 )?;
11666 conn.execute_with_params(
11667 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11668 VALUES(?1, 1, ?2, 'assistant', ?3, ?4)",
11669 &[
11670 fsqlite_types::value::SqliteValue::Integer(2),
11671 fsqlite_types::value::SqliteValue::Integer(1),
11672 fsqlite_types::value::SqliteValue::Text(second.clone().into()),
11673 fsqlite_types::value::SqliteValue::Integer(102),
11674 ],
11675 )?;
11676
11677 let client = SearchClient {
11678 reader: None,
11679 sqlite: Mutex::new(Some(SendConnection(conn))),
11680 sqlite_path: None,
11681 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11682 reload_on_search: true,
11683 last_reload: Mutex::new(None),
11684 last_generation: Mutex::new(None),
11685 reload_epoch: Arc::new(AtomicU64::new(0)),
11686 warm_tx: None,
11687 _warm_handle: None,
11688 metrics: Metrics::default(),
11689 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11690 semantic: Mutex::new(None),
11691 last_tantivy_total_count: Mutex::new(None),
11692 };
11693
11694 let hits = client.hydrate_semantic_hits_with_ids(
11695 &[
11696 VectorSearchResult {
11697 message_id: 1,
11698 chunk_idx: 0,
11699 score: 0.9,
11700 },
11701 VectorSearchResult {
11702 message_id: 2,
11703 chunk_idx: 0,
11704 score: 0.8,
11705 },
11706 ],
11707 FieldMask::new(false, true, true, true),
11708 )?;
11709 assert_eq!(hits.len(), 2);
11710 assert!(hits.iter().all(|(_, hit)| hit.content.is_empty()));
11711 assert!(hits.iter().all(|(_, hit)| !hit.snippet.is_empty()));
11712 assert_ne!(hits[0].1.content_hash, hits[1].1.content_hash);
11713
11714 Ok(())
11715 }
11716
11717 #[test]
11718 fn hydrate_semantic_hits_with_ids_normalizes_trimmed_local_source_metadata() -> Result<()> {
11719 let conn = Connection::open(":memory:")?;
11720 conn.execute_batch(
11721 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11722 CREATE TABLE conversations (
11723 id INTEGER PRIMARY KEY,
11724 agent_id INTEGER NOT NULL,
11725 workspace_id INTEGER,
11726 source_id TEXT,
11727 origin_host TEXT,
11728 title TEXT,
11729 source_path TEXT NOT NULL,
11730 started_at INTEGER
11731 );
11732 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11733 CREATE TABLE messages (
11734 id INTEGER PRIMARY KEY,
11735 conversation_id INTEGER NOT NULL,
11736 idx INTEGER,
11737 role TEXT,
11738 content TEXT NOT NULL,
11739 created_at INTEGER
11740 );
11741 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11742 )?;
11743 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11744 conn.execute(
11745 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11746 VALUES(1, 1, NULL, ' local ', NULL, 'trimmed local semantic', '/tmp/trimmed-local-semantic.jsonl', 100)",
11747 )?;
11748 conn.execute_with_params(
11749 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11750 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
11751 &[
11752 fsqlite_types::value::SqliteValue::Integer(1),
11753 fsqlite_types::value::SqliteValue::Text("trimmed local semantic body".into()),
11754 ],
11755 )?;
11756
11757 let client = SearchClient {
11758 reader: None,
11759 sqlite: Mutex::new(Some(SendConnection(conn))),
11760 sqlite_path: None,
11761 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11762 reload_on_search: true,
11763 last_reload: Mutex::new(None),
11764 last_generation: Mutex::new(None),
11765 reload_epoch: Arc::new(AtomicU64::new(0)),
11766 warm_tx: None,
11767 _warm_handle: None,
11768 metrics: Metrics::default(),
11769 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11770 semantic: Mutex::new(None),
11771 last_tantivy_total_count: Mutex::new(None),
11772 };
11773
11774 let hits = client.hydrate_semantic_hits_with_ids(
11775 &[VectorSearchResult {
11776 message_id: 1,
11777 chunk_idx: 0,
11778 score: 0.9,
11779 }],
11780 FieldMask::new(false, true, true, true),
11781 )?;
11782 assert_eq!(hits.len(), 1);
11783 assert_eq!(hits[0].1.source_id, "local");
11784 assert_eq!(hits[0].1.origin_kind, "local");
11785
11786 Ok(())
11787 }
11788
11789 #[test]
11790 fn hydrate_semantic_hits_with_ids_preserves_remote_origin_without_source_row() -> Result<()> {
11791 let conn = Connection::open(":memory:")?;
11792 conn.execute_batch(
11793 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11794 CREATE TABLE conversations (
11795 id INTEGER PRIMARY KEY,
11796 agent_id INTEGER NOT NULL,
11797 workspace_id INTEGER,
11798 source_id TEXT,
11799 origin_host TEXT,
11800 title TEXT,
11801 source_path TEXT NOT NULL,
11802 started_at INTEGER
11803 );
11804 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
11805 CREATE TABLE messages (
11806 id INTEGER PRIMARY KEY,
11807 conversation_id INTEGER NOT NULL,
11808 idx INTEGER,
11809 role TEXT,
11810 content TEXT NOT NULL,
11811 created_at INTEGER
11812 );
11813 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
11814 )?;
11815 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11816 conn.execute(
11817 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
11818 VALUES(1, 1, NULL, 'laptop', 'dev@laptop', 'remote semantic', '/tmp/remote-semantic.jsonl', 100)",
11819 )?;
11820 conn.execute_with_params(
11821 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11822 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
11823 &[
11824 fsqlite_types::value::SqliteValue::Integer(1),
11825 fsqlite_types::value::SqliteValue::Text("remote semantic body".into()),
11826 ],
11827 )?;
11828
11829 let client = SearchClient {
11830 reader: None,
11831 sqlite: Mutex::new(Some(SendConnection(conn))),
11832 sqlite_path: None,
11833 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11834 reload_on_search: true,
11835 last_reload: Mutex::new(None),
11836 last_generation: Mutex::new(None),
11837 reload_epoch: Arc::new(AtomicU64::new(0)),
11838 warm_tx: None,
11839 _warm_handle: None,
11840 metrics: Metrics::default(),
11841 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11842 semantic: Mutex::new(None),
11843 last_tantivy_total_count: Mutex::new(None),
11844 };
11845
11846 let hits = client.hydrate_semantic_hits_with_ids(
11847 &[VectorSearchResult {
11848 message_id: 1,
11849 chunk_idx: 0,
11850 score: 0.9,
11851 }],
11852 FieldMask::new(false, true, true, true),
11853 )?;
11854 assert_eq!(hits.len(), 1);
11855 assert_eq!(hits[0].1.source_id, "laptop");
11856 assert_eq!(hits[0].1.origin_kind, "remote");
11857 assert_eq!(hits[0].1.origin_host.as_deref(), Some("dev@laptop"));
11858
11859 Ok(())
11860 }
11861
11862 #[test]
11863 fn resolve_semantic_doc_ids_for_hits_distinguishes_same_source_path_line_by_content_hash()
11864 -> Result<()> {
11865 let conn = Connection::open(":memory:")?;
11866 conn.execute_batch(
11867 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11868 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
11869 CREATE TABLE conversations (
11870 id INTEGER PRIMARY KEY,
11871 agent_id INTEGER NOT NULL,
11872 workspace_id INTEGER,
11873 source_id TEXT,
11874 origin_host TEXT,
11875 title TEXT,
11876 source_path TEXT NOT NULL
11877 );
11878 CREATE TABLE messages (
11879 id INTEGER PRIMARY KEY,
11880 conversation_id INTEGER NOT NULL,
11881 idx INTEGER,
11882 role TEXT,
11883 content TEXT NOT NULL,
11884 created_at INTEGER
11885 );",
11886 )?;
11887 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
11888 conn.execute(
11889 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11890 VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
11891 )?;
11892 conn.execute(
11893 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
11894 VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-shared.jsonl')",
11895 )?;
11896 let first = "same prefix first tail".to_string();
11897 let second = "same prefix second tail".to_string();
11898 conn.execute_with_params(
11899 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11900 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
11901 &[
11902 fsqlite_types::value::SqliteValue::Integer(11),
11903 fsqlite_types::value::SqliteValue::Integer(1),
11904 fsqlite_types::value::SqliteValue::Text(first.clone().into()),
11905 ],
11906 )?;
11907 conn.execute_with_params(
11908 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
11909 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
11910 &[
11911 fsqlite_types::value::SqliteValue::Integer(22),
11912 fsqlite_types::value::SqliteValue::Integer(2),
11913 fsqlite_types::value::SqliteValue::Text(second.clone().into()),
11914 ],
11915 )?;
11916
11917 let client = SearchClient {
11918 reader: None,
11919 sqlite: Mutex::new(Some(SendConnection(conn))),
11920 sqlite_path: None,
11921 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
11922 reload_on_search: true,
11923 last_reload: Mutex::new(None),
11924 last_generation: Mutex::new(None),
11925 reload_epoch: Arc::new(AtomicU64::new(0)),
11926 warm_tx: None,
11927 _warm_handle: None,
11928 metrics: Metrics::default(),
11929 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
11930 semantic: Mutex::new(None),
11931 last_tantivy_total_count: Mutex::new(None),
11932 };
11933
11934 let first_hit = SearchHit {
11935 title: "Shared Session".into(),
11936 snippet: String::new(),
11937 content: String::new(),
11938 content_hash: stable_hit_hash(
11939 &first,
11940 "/tmp/progressive-shared.jsonl",
11941 Some(1),
11942 Some(100),
11943 ),
11944 score: 0.0,
11945 source_path: "/tmp/progressive-shared.jsonl".into(),
11946 agent: "codex".into(),
11947 workspace: String::new(),
11948 workspace_original: None,
11949 created_at: Some(100),
11950 line_number: Some(1),
11951 match_type: MatchType::Exact,
11952 source_id: "local".into(),
11953 origin_kind: "local".into(),
11954 origin_host: None,
11955 conversation_id: None,
11956 };
11957 let second_hit = SearchHit {
11958 title: "Shared Session".into(),
11959 snippet: String::new(),
11960 content: String::new(),
11961 content_hash: stable_hit_hash(
11962 &second,
11963 "/tmp/progressive-shared.jsonl",
11964 Some(1),
11965 Some(100),
11966 ),
11967 score: 0.0,
11968 source_path: "/tmp/progressive-shared.jsonl".into(),
11969 agent: "codex".into(),
11970 workspace: String::new(),
11971 workspace_original: None,
11972 created_at: Some(100),
11973 line_number: Some(1),
11974 match_type: MatchType::Exact,
11975 source_id: "local".into(),
11976 origin_kind: "local".into(),
11977 origin_host: None,
11978 conversation_id: None,
11979 };
11980
11981 let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
11982 assert_eq!(resolved.len(), 2);
11983 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
11984 assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
11985 assert_ne!(
11986 resolved[0].as_ref().map(|hit| hit.doc_id.as_str()),
11987 resolved[1].as_ref().map(|hit| hit.doc_id.as_str())
11988 );
11989
11990 Ok(())
11991 }
11992
11993 #[test]
11994 fn hydrate_semantic_hits_with_ids_keeps_missing_title_empty() -> Result<()> {
11995 let conn = Connection::open(":memory:")?;
11996 conn.execute_batch(
11997 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
11998 CREATE TABLE conversations (
11999 id INTEGER PRIMARY KEY,
12000 agent_id INTEGER NOT NULL,
12001 workspace_id INTEGER,
12002 source_id TEXT,
12003 origin_host TEXT,
12004 title TEXT,
12005 source_path TEXT NOT NULL,
12006 started_at INTEGER
12007 );
12008 CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
12009 CREATE TABLE messages (
12010 id INTEGER PRIMARY KEY,
12011 conversation_id INTEGER NOT NULL,
12012 idx INTEGER,
12013 role TEXT,
12014 content TEXT NOT NULL,
12015 created_at INTEGER
12016 );
12017 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);",
12018 )?;
12019 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12020 conn.execute(
12021 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path, started_at)
12022 VALUES(1, 1, NULL, 'local', NULL, NULL, '/tmp/untitled-semantic.jsonl', 100)",
12023 )?;
12024 conn.execute_with_params(
12025 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12026 VALUES(?1, 1, 0, 'assistant', ?2, 101)",
12027 &[
12028 fsqlite_types::value::SqliteValue::Integer(1),
12029 fsqlite_types::value::SqliteValue::Text("untitled semantic body".into()),
12030 ],
12031 )?;
12032
12033 let client = SearchClient {
12034 reader: None,
12035 sqlite: Mutex::new(Some(SendConnection(conn))),
12036 sqlite_path: None,
12037 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12038 reload_on_search: true,
12039 last_reload: Mutex::new(None),
12040 last_generation: Mutex::new(None),
12041 reload_epoch: Arc::new(AtomicU64::new(0)),
12042 warm_tx: None,
12043 _warm_handle: None,
12044 metrics: Metrics::default(),
12045 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12046 semantic: Mutex::new(None),
12047 last_tantivy_total_count: Mutex::new(None),
12048 };
12049
12050 let hits = client.hydrate_semantic_hits_with_ids(
12051 &[VectorSearchResult {
12052 message_id: 1,
12053 chunk_idx: 0,
12054 score: 0.9,
12055 }],
12056 FieldMask::new(false, true, true, true),
12057 )?;
12058 assert_eq!(hits.len(), 1);
12059 assert_eq!(hits[0].1.title, "");
12060
12061 Ok(())
12062 }
12063
12064 #[test]
12065 fn resolve_semantic_doc_ids_for_hits_prefers_conversation_id_over_ambiguous_provenance()
12066 -> Result<()> {
12067 let conn = Connection::open(":memory:")?;
12068 conn.execute_batch(
12069 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12070 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12071 CREATE TABLE conversations (
12072 id INTEGER PRIMARY KEY,
12073 agent_id INTEGER NOT NULL,
12074 workspace_id INTEGER,
12075 source_id TEXT,
12076 origin_host TEXT,
12077 title TEXT,
12078 source_path TEXT NOT NULL
12079 );
12080 CREATE TABLE messages (
12081 id INTEGER PRIMARY KEY,
12082 conversation_id INTEGER NOT NULL,
12083 idx INTEGER,
12084 role TEXT,
12085 content TEXT NOT NULL,
12086 created_at INTEGER
12087 );",
12088 )?;
12089 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12090 conn.execute(
12091 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12092 VALUES(1, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
12093 )?;
12094 conn.execute(
12095 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12096 VALUES(2, 1, NULL, 'local', NULL, 'Shared Session', '/tmp/progressive-conversation-id.jsonl')",
12097 )?;
12098 let content = "same ambiguous content".to_string();
12099 conn.execute_with_params(
12100 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12101 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
12102 &[
12103 fsqlite_types::value::SqliteValue::Integer(11),
12104 fsqlite_types::value::SqliteValue::Integer(1),
12105 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12106 ],
12107 )?;
12108 conn.execute_with_params(
12109 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12110 VALUES(?1, ?2, 0, 'assistant', ?3, 100)",
12111 &[
12112 fsqlite_types::value::SqliteValue::Integer(22),
12113 fsqlite_types::value::SqliteValue::Integer(2),
12114 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12115 ],
12116 )?;
12117
12118 let client = SearchClient {
12119 reader: None,
12120 sqlite: Mutex::new(Some(SendConnection(conn))),
12121 sqlite_path: None,
12122 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12123 reload_on_search: true,
12124 last_reload: Mutex::new(None),
12125 last_generation: Mutex::new(None),
12126 reload_epoch: Arc::new(AtomicU64::new(0)),
12127 warm_tx: None,
12128 _warm_handle: None,
12129 metrics: Metrics::default(),
12130 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12131 semantic: Mutex::new(None),
12132 last_tantivy_total_count: Mutex::new(None),
12133 };
12134
12135 let first_hit = SearchHit {
12136 title: "Shared Session".into(),
12137 snippet: String::new(),
12138 content: String::new(),
12139 content_hash: stable_hit_hash(
12140 &content,
12141 "/tmp/progressive-conversation-id.jsonl",
12142 Some(1),
12143 Some(100),
12144 ),
12145 score: 0.0,
12146 source_path: "/tmp/progressive-conversation-id.jsonl".into(),
12147 agent: "codex".into(),
12148 workspace: String::new(),
12149 workspace_original: None,
12150 created_at: Some(100),
12151 line_number: Some(1),
12152 match_type: MatchType::Exact,
12153 source_id: "local".into(),
12154 origin_kind: "local".into(),
12155 origin_host: None,
12156 conversation_id: Some(1),
12157 };
12158 let second_hit = SearchHit {
12159 conversation_id: Some(2),
12160 ..first_hit.clone()
12161 };
12162
12163 let resolved = client.resolve_semantic_doc_ids_for_hits(&[first_hit, second_hit])?;
12164 assert_eq!(resolved.len(), 2);
12165 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
12166 assert_eq!(resolved[1].as_ref().map(|hit| hit.message_id), Some(22));
12167
12168 Ok(())
12169 }
12170
12171 #[test]
12172 fn resolve_semantic_doc_ids_for_hits_treats_null_source_as_local() -> Result<()> {
12173 let conn = Connection::open(":memory:")?;
12174 conn.execute_batch(
12175 "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
12176 CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
12177 CREATE TABLE conversations (
12178 id INTEGER PRIMARY KEY,
12179 agent_id INTEGER NOT NULL,
12180 workspace_id INTEGER,
12181 source_id TEXT,
12182 origin_host TEXT,
12183 title TEXT,
12184 source_path TEXT NOT NULL
12185 );
12186 CREATE TABLE messages (
12187 id INTEGER PRIMARY KEY,
12188 conversation_id INTEGER NOT NULL,
12189 idx INTEGER,
12190 role TEXT,
12191 content TEXT NOT NULL,
12192 created_at INTEGER
12193 );",
12194 )?;
12195 conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
12196 conn.execute(
12197 "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
12198 VALUES(1, 1, NULL, NULL, NULL, 'Legacy Local', '/tmp/legacy-local.jsonl')",
12199 )?;
12200 let content = "legacy local semantic message".to_string();
12201 conn.execute_with_params(
12202 "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
12203 VALUES(?1, 1, 0, 'assistant', ?2, 100)",
12204 &[
12205 fsqlite_types::value::SqliteValue::Integer(11),
12206 fsqlite_types::value::SqliteValue::Text(content.clone().into()),
12207 ],
12208 )?;
12209
12210 let client = SearchClient {
12211 reader: None,
12212 sqlite: Mutex::new(Some(SendConnection(conn))),
12213 sqlite_path: None,
12214 prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
12215 reload_on_search: true,
12216 last_reload: Mutex::new(None),
12217 last_generation: Mutex::new(None),
12218 reload_epoch: Arc::new(AtomicU64::new(0)),
12219 warm_tx: None,
12220 _warm_handle: None,
12221 metrics: Metrics::default(),
12222 cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
12223 semantic: Mutex::new(None),
12224 last_tantivy_total_count: Mutex::new(None),
12225 };
12226
12227 let hit = SearchHit {
12228 title: "Legacy Local".into(),
12229 snippet: String::new(),
12230 content: String::new(),
12231 content_hash: stable_hit_hash(&content, "/tmp/legacy-local.jsonl", Some(1), Some(100)),
12232 score: 0.0,
12233 source_path: "/tmp/legacy-local.jsonl".into(),
12234 agent: "codex".into(),
12235 workspace: String::new(),
12236 workspace_original: None,
12237 created_at: Some(100),
12238 line_number: Some(1),
12239 match_type: MatchType::Exact,
12240 source_id: "local".into(),
12241 origin_kind: "local".into(),
12242 origin_host: None,
12243 conversation_id: None,
12244 };
12245
12246 let resolved = client.resolve_semantic_doc_ids_for_hits(&[hit])?;
12247 assert_eq!(resolved.len(), 1);
12248 assert_eq!(resolved[0].as_ref().map(|hit| hit.message_id), Some(11));
12249
12250 Ok(())
12251 }
12252
/// Checks that a conversation whose stored `source_id` carries surrounding
/// whitespace (`' local '`) is still found for a hit whose `source_id` is
/// `"local"`.
///
/// The fixture is an in-memory database holding one agent, one conversation
/// and one message with id 11; the hit is expected to resolve to that id.
#[test]
fn resolve_semantic_doc_ids_for_hits_matches_trimmed_local_source_id() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    const SOURCE_PATH: &str = "/tmp/trimmed-local.jsonl";
    const SCHEMA_SQL: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    // Fixture: one agent, one conversation (padded source id), one message.
    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA_SQL)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, ' local ', NULL, 'Trimmed Local', '/tmp/trimmed-local.jsonl')",
    )?;
    let body = String::from("trimmed local semantic message");
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the SQLite connection; no lexical reader, no warm worker.
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The probe carries the already-trimmed source id and no conversation id.
    let probe = SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "codex".into(),
        source_path: SOURCE_PATH.into(),
        workspace: String::new(),
        workspace_original: None,
        title: "Trimmed Local".into(),
        snippet: String::new(),
        content: String::new(),
        content_hash: stable_hit_hash(&body, SOURCE_PATH, Some(1), Some(100)),
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[probe])?;
    assert_eq!(resolved.len(), 1);
    let message_id = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(message_id, Some(11));

    Ok(())
}
12334
/// Checks that a hit whose `source_id` is blank (`" "`) but whose
/// `origin_kind` is `"local"` still resolves against a conversation stored
/// with `source_id = 'local'`.
///
/// The fixture is an in-memory database holding one agent, one conversation
/// and one message with id 11; the hit is expected to resolve to that id.
#[test]
fn resolve_semantic_doc_ids_for_hits_normalizes_blank_local_source_id() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    const SOURCE_PATH: &str = "/tmp/blank-local.jsonl";
    const SCHEMA_SQL: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    // Fixture: one agent, one conversation with a clean 'local' source id, one message.
    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA_SQL)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'Blank Local', '/tmp/blank-local.jsonl')",
    )?;
    let body = String::from("blank local semantic message");
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the SQLite connection; no lexical reader, no warm worker.
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The probe's source id is whitespace only; the origin kind says "local".
    let probe = SearchHit {
        source_id: " ".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "codex".into(),
        source_path: SOURCE_PATH.into(),
        workspace: String::new(),
        workspace_original: None,
        title: "Blank Local".into(),
        snippet: String::new(),
        content: String::new(),
        content_hash: stable_hit_hash(&body, SOURCE_PATH, Some(1), Some(100)),
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[probe])?;
    assert_eq!(resolved.len(), 1);
    let message_id = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(message_id, Some(11));

    Ok(())
}
12416
/// Checks that a conversation stored with a blank `source_id` (`' '`) and an
/// `origin_host` of `'dev@laptop'` is found for a remote hit whose `source_id`
/// and `origin_host` are both `"dev@laptop"`.
///
/// The fixture is an in-memory database holding one agent, one conversation
/// and one message with id 11; the hit is expected to resolve to that id.
#[test]
fn resolve_semantic_doc_ids_for_hits_infers_remote_source_from_origin_host_when_source_id_blank()
-> Result<()> {
    use fsqlite_types::value::SqliteValue;

    const SOURCE_PATH: &str = "/tmp/legacy-remote.jsonl";
    const REMOTE_HOST: &str = "dev@laptop";
    const SCHEMA_SQL: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             role TEXT,
             content TEXT NOT NULL,
             created_at INTEGER
         );";

    // Fixture: the conversation row has a blank source id but a populated origin host.
    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA_SQL)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, ' ', 'dev@laptop', 'Legacy Remote', '/tmp/legacy-remote.jsonl')",
    )?;
    let body = String::from("legacy remote semantic message");
    let message_params = [
        SqliteValue::Integer(11),
        SqliteValue::Text(body.clone().into()),
    ];
    db.execute_with_params(
        "INSERT INTO messages(id, conversation_id, idx, role, content, created_at)
         VALUES(?1, 1, 0, 'assistant', ?2, 100)",
        &message_params,
    )?;

    // Client backed only by the SQLite connection; no lexical reader, no warm worker.
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The probe names the remote host both as its source id and as its origin host.
    let probe = SearchHit {
        source_id: REMOTE_HOST.into(),
        origin_kind: "remote".into(),
        origin_host: Some(REMOTE_HOST.into()),
        conversation_id: None,
        agent: "codex".into(),
        source_path: SOURCE_PATH.into(),
        workspace: String::new(),
        workspace_original: None,
        title: "Legacy Remote".into(),
        snippet: String::new(),
        content: String::new(),
        content_hash: stable_hit_hash(&body, SOURCE_PATH, Some(1), Some(100)),
        created_at: Some(100),
        line_number: Some(1),
        match_type: MatchType::Exact,
        score: 0.0,
    };

    let resolved = search_client.resolve_semantic_doc_ids_for_hits(&[probe])?;
    assert_eq!(resolved.len(), 1);
    let message_id = resolved[0].as_ref().map(|doc| doc.message_id);
    assert_eq!(message_id, Some(11));

    Ok(())
}
12499
/// Two messages in the same conversation share a long common prefix and differ
/// only in their tails. Browsing with a field mask that leaves `content` empty
/// but keeps `snippet` populated must still give the two hits different
/// `content_hash` values.
#[test]
fn browse_by_date_snippet_only_uses_full_content_for_hit_identity() -> Result<()> {
    use fsqlite_types::value::SqliteValue;

    const SCHEMA_SQL: &str = "CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER NOT NULL,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT NOT NULL
         );
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL);
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER NOT NULL,
             idx INTEGER,
             content TEXT NOT NULL,
             created_at INTEGER
         );
         CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);";

    let db = Connection::open(":memory:")?;
    db.execute_batch(SCHEMA_SQL)?;
    db.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    db.execute(
        "INSERT INTO conversations(id, agent_id, workspace_id, source_id, origin_host, title, source_path)
         VALUES(1, 1, NULL, 'local', NULL, 'browse title', '/tmp/browse-shared.jsonl')",
    )?;

    // Both bodies start with the same 48 repetitions of "shared-prefix ".
    let shared_prefix = "shared-prefix ".repeat(48);
    let first = format!("{shared_prefix}first browse-only tail");
    let second = format!("{shared_prefix}second browse-only tail");

    // (message id, idx, body, created_at)
    let message_rows = [(1, 0, &first, 101), (2, 1, &second, 102)];
    for (message_id, idx, body, created_at) in message_rows {
        db.execute_with_params(
            "INSERT INTO messages(id, conversation_id, idx, content, created_at)
             VALUES(?1, 1, ?2, ?3, ?4)",
            &[
                SqliteValue::Integer(message_id),
                SqliteValue::Integer(idx),
                SqliteValue::Text(body.clone().into()),
                SqliteValue::Integer(created_at),
            ],
        )?;
    }

    // Client backed only by the SQLite connection; no lexical reader, no warm worker.
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(db))),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    let field_mask = FieldMask::new(false, true, true, true);
    let browsed = search_client.browse_by_date(SearchFilters::default(), 10, 0, true, field_mask)?;

    assert_eq!(browsed.len(), 2);
    // No hit may carry content, and no hit may lack a snippet.
    assert!(!browsed.iter().any(|hit| !hit.content.is_empty()));
    assert!(!browsed.iter().any(|hit| hit.snippet.is_empty()));
    // Identity must differ even though the bodies share a long prefix.
    assert_ne!(browsed[0].content_hash, browsed[1].content_hash);

    Ok(())
}
12584
/// End-to-end check against a real on-disk index in a temporary directory:
/// index one conversation, search for the prefix `"app"`, confirm the result
/// is cached, then index a second conversation and confirm a later search for
/// `"apr"` returns the newly added content.
#[test]
fn cache_invalidates_on_new_data() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // First conversation: a single user message containing "apple banana".
    let conv1 = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("first".into()),
        workspace: None,
        source_path: dir.path().join("1.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(1),
            content: "apple banana".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv1)?;
    index.commit()?;

    // Open the client only after the first commit so the index exists on disk.
    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // The prefix "app" must match the single indexed message.
    let hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].content, "apple banana");

    {
        // The unfiltered search must have stored its result in the "global" shard
        // under the key the client derives for ("app", default filters).
        // Scoped so the cache lock is released before the next search.
        let cache = client.prefix_cache.lock().unwrap();
        let shard = cache.shard_opt("global").unwrap();
        assert!(shard.contains(&client.cache_key("app", &SearchFilters::default())));
    }

    // Second conversation: a single user message containing "apricot".
    let conv2 = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("second".into()),
        workspace: None,
        source_path: dir.path().join("2.jsonl"),
        started_at: Some(2),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(2),
            content: "apricot".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv2)?;
    index.commit()?;

    // NOTE(review): presumably waits out a reload debounce/interval so the
    // reader can observe the second commit — confirm what 350 ms is sized against.
    std::thread::sleep(std::time::Duration::from_millis(350));

    // Result intentionally ignored. NOTE(review): presumably issued to trigger
    // the reload/invalidation path before the assertion query — confirm.
    let _hits = client.search("app", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    // "apr" only matches the second conversation, so a hit proves new data is visible.
    let hits = client.search("apr", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].content, "apricot");

    Ok(())
}
12675
/// After one entry is cached, the first `track_generation` call (generation 1)
/// must leave the cache populated, while a call with a different generation
/// (2) must leave it with no shards.
#[test]
fn track_generation_clears_cache_on_change() {
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Takes the cache lock only for the duration of a single check.
    let cache_has_no_shards = || {
        let guard = search_client.prefix_cache.lock().unwrap();
        guard.shards.is_empty()
    };

    let cached_hits = vec![SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "a".into(),
        source_path: "p".into(),
        workspace: "w".into(),
        workspace_original: None,
        title: "hello world".into(),
        snippet: "hello".into(),
        content: "hello world".into(),
        content_hash: stable_content_hash("hello world"),
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        score: 1.0,
    }];

    search_client.put_cache("hello", &SearchFilters::default(), &cached_hits);
    assert!(!cache_has_no_shards());

    // First observed generation: the entry stays.
    search_client.track_generation(1);
    assert!(!cache_has_no_shards());

    // Generation changed: everything is dropped.
    search_client.track_generation(2);
    assert!(cache_has_no_shards());
}
12733
/// With a total cap of 2, caching three queries under three different agent
/// filters must leave the reported total cost within the cap.
///
/// NOTE(review): per the test name, each agent filter presumably lands in its
/// own shard, which is what makes this a cross-shard check — confirm.
#[test]
fn cache_total_cap_evicts_across_shards() {
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        // Total cap of two entries; byte cap passed as 0.
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The same single-hit payload is cached under every key; it is moved into
    // the vector because nothing else reads it afterwards.
    let hits = vec![SearchHit {
        title: "a".into(),
        snippet: "a".into(),
        content: "a".into(),
        content_hash: stable_content_hash("a"),
        score: 1.0,
        source_path: "p".into(),
        agent: "agent1".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    }];

    // One insert per (query, agent) pair: three entries against a cap of two.
    for (query, agent) in [("a", "agent1"), ("b", "agent2"), ("c", "agent3")] {
        let mut filters = SearchFilters::default();
        filters.agents.insert(agent.into());
        client.put_cache(query, &filters, &hits);
    }

    let stats = client.cache_stats();
    assert!(
        stats.total_cost <= stats.total_cap,
        "total cost {} exceeds total cap {}",
        stats.total_cost,
        stats.total_cap
    );
    assert_eq!(stats.total_cap, 2);
}
12787
/// Bumps each cache counter once, records a single 10 ms reload, and checks
/// that `cache_stats()` reports those values together with the configured
/// total cap, the `"lru"` policy label and zeroed prewarm counters. Also pins
/// the policy label of a default-constructed `CacheStats` to `"unknown"`.
#[test]
fn cache_stats_reflect_metrics() {
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Exactly one event of each kind.
    let metrics = &search_client.metrics;
    metrics.inc_cache_hits();
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.record_reload(Duration::from_millis(10));

    let snapshot = search_client.cache_stats();

    // Counters fed through the metrics handle.
    assert_eq!(snapshot.cache_hits, 1);
    assert_eq!(snapshot.cache_miss, 1);
    assert_eq!(snapshot.cache_shortfall, 1);
    assert_eq!(snapshot.reloads, 1);
    assert_eq!(snapshot.reload_ms_total, 10);

    // Values derived from the cache configuration.
    assert_eq!(snapshot.total_cap, *CACHE_TOTAL_CAP);
    assert_eq!(snapshot.eviction_policy, "lru");

    // Nothing was scheduled or skipped.
    assert_eq!(snapshot.prewarm_scheduled, 0);
    assert_eq!(snapshot.prewarm_skipped_pressure, 0);

    assert_eq!(CacheStats::default().eviction_policy, "unknown");
}
12824
/// Exercises `maybe_schedule_adaptive_query_prewarm` against a client whose
/// warm channel is observable from the test:
///
/// 1. with an empty cache, nothing is sent on the channel;
/// 2. once `"hel"` is cached, asking for exactly `"hel"` still sends nothing;
/// 3. asking for `"hello"` sends one job, and the cache's total cost is unchanged.
#[test]
fn adaptive_query_prewarm_schedules_only_after_hot_prefix_cache_entry() {
    // The receiver stays in the test so scheduled jobs can be inspected.
    let (tx, rx) = mpsc::unbounded();
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(10, 0)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: Some(tx),
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    // A workspace filter; the scheduled job is later expected to carry the
    // shard name "workspace:/tmp/cass-workspace".
    let mut filters = SearchFilters::default();
    filters.workspaces.insert("/tmp/cass-workspace".into());

    // Step 1: nothing cached yet, so no job may be queued.
    client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
    assert!(
        rx.try_recv().is_err(),
        "cold prefixes should not schedule adaptive prewarm"
    );

    // Cache a single hit under the "hel" key.
    let mut hit = projected_minimal_fields_search_hit("hello title", "p");
    hit.snippet = "hello".into();
    hit.content = "hello world".into();
    hit.content_hash = stable_content_hash(&hit.content);
    client.put_cache("hel", &filters, std::slice::from_ref(&hit));

    // Baseline used at the end to show that scheduling leaves the cache untouched.
    let total_cost_before = client.cache_stats().total_cost;
    // Step 2: the query equals the cached key, so no job may be queued.
    client.maybe_schedule_adaptive_query_prewarm("hel", &filters);
    assert!(
        rx.try_recv().is_err(),
        "an exact cached query should not schedule redundant prewarm"
    );
    // Step 3: a longer query that extends the cached "hel" key.
    client.maybe_schedule_adaptive_query_prewarm("hello", &filters);

    let job = rx
        .try_recv()
        .expect("hot prefix should schedule adaptive prewarm");
    // The job carries the requested query, the shard name and the filter fingerprint.
    assert_eq!(job.query, "hello");
    assert_eq!(job.shard_name, "workspace:/tmp/cass-workspace");
    assert_eq!(job.filters_fingerprint, filters_fingerprint(&filters));
    let stats = client.cache_stats();
    assert_eq!(stats.prewarm_scheduled, 1);
    assert_eq!(stats.prewarm_skipped_pressure, 0);
    assert_eq!(
        stats.total_cost, total_cost_before,
        "prewarm scheduling should not mutate result-cache contents"
    );
}
12881
/// Sets the cache byte cap to exactly the approximate size of one cached hit,
/// caches that hit, and then checks that a prewarm request for a longer query
/// is not queued but counted in `prewarm_skipped_pressure`. A query that does
/// not extend the cached key (`"zebra"`) must not bump that counter.
#[test]
fn adaptive_query_prewarm_skips_when_cache_byte_cap_is_under_pressure() {
    let mut hit = projected_minimal_fields_search_hit("hello title", "p");
    hit.snippet = "hello".into();
    hit.content = "hello world with enough content to consume the small byte budget".into();
    hit.content_hash = stable_content_hash(&hit.content);
    // The byte budget is sized to this one entry, so caching it uses the whole budget.
    let byte_cap = cached_hit_from(&hit).approx_bytes();

    // The receiver stays in the test so scheduled jobs can be inspected.
    let (tx, rx) = mpsc::unbounded();
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(10, byte_cap)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: Some(tx),
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let filters = SearchFilters::default();

    client.put_cache("hel", &filters, std::slice::from_ref(&hit));
    // "zebra" does not extend the cached "hel" key; it must not count as a pressure skip.
    client.maybe_schedule_adaptive_query_prewarm("zebra", &filters);
    assert_eq!(
        client.cache_stats().prewarm_skipped_pressure,
        0,
        "cold queries should not be counted as pressure-skipped prewarm jobs"
    );

    // "hello" extends the cached "hel" key, but the byte budget is exhausted.
    client.maybe_schedule_adaptive_query_prewarm("hello", &filters);

    assert!(
        rx.try_recv().is_err(),
        "prewarm should be disabled while cache byte pressure is high"
    );
    let stats = client.cache_stats();
    assert_eq!(stats.prewarm_scheduled, 0);
    assert_eq!(stats.prewarm_skipped_pressure, 1);
    // The cache itself must still respect its byte cap.
    assert!(stats.approx_bytes <= stats.byte_cap);
}
12928
/// Caches three queries in a cache whose total cap is 2 and checks that the
/// stats report at least one eviction, a total cost no larger than 2, and a
/// non-zero byte estimate.
#[test]
fn cache_eviction_count_tracks_evictions() {
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache: total cap of two entries, byte cap passed as 0
        prefix_cache: Mutex::new(CacheShards::new(2, 0)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    let payload = SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "a".into(),
        source_path: "p".into(),
        workspace: "w".into(),
        workspace_original: None,
        title: "test".into(),
        snippet: "snippet".into(),
        content: "content".into(),
        content_hash: stable_content_hash("content"),
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        score: 1.0,
    };

    // Three inserts against a cap of two.
    for query in ["query1", "query2", "query3"] {
        search_client.put_cache(
            query,
            &SearchFilters::default(),
            std::slice::from_ref(&payload),
        );
    }

    let snapshot = search_client.cache_stats();
    assert!(
        snapshot.eviction_count >= 1,
        "should have evicted at least 1 entry"
    );
    assert!(snapshot.total_cost <= 2, "should be at or below cap");
    assert!(snapshot.approx_bytes > 0, "should track bytes used");
}
12993
/// Pins `default_cache_byte_cap_for_available` at four points: unknown
/// memory, 2 GiB, 64 GiB and 256 GiB.
#[test]
fn default_cache_byte_cap_scales_with_available_memory() {
    const GIB: u64 = 1024 * 1024 * 1024;

    // Unknown available memory falls back to the fixed default.
    let unknown_memory = default_cache_byte_cap_for_available(None);
    assert_eq!(unknown_memory, DEFAULT_CACHE_BYTE_CAP_FALLBACK);

    let small_host = default_cache_byte_cap_for_available(Some(2 * GIB));
    assert_eq!(
        small_host, DEFAULT_CACHE_BYTE_CAP_FALLBACK,
        "small hosts keep a conservative cache byte budget"
    );

    let larger_host = default_cache_byte_cap_for_available(Some(64 * GIB));
    assert_eq!(
        larger_host,
        512 * 1024 * 1024,
        "larger hosts get a proportionally larger cache byte budget"
    );

    // The ceiling constant is converted to usize, saturating if it does not fit.
    let ceiling = usize::try_from(DEFAULT_CACHE_BYTE_CAP_CEILING).unwrap_or(usize::MAX);
    let swarm_host = default_cache_byte_cap_for_available(Some(256 * GIB));
    assert_eq!(
        swarm_host, ceiling,
        "large swarm hosts still have a bounded default cache budget"
    );
}
13018
/// For a host reporting 64 GiB available: an explicit `"0"` yields a byte cap
/// of 0, while an unparsable value and an absent value both yield the same
/// cap that `default_cache_byte_cap_for_available` computes.
#[test]
fn malformed_cache_byte_cap_env_uses_default_instead_of_disabling_guard() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let available = Some(64 * GIB);
    let default_cap = default_cache_byte_cap_for_available(available);

    let explicit_zero = cache_byte_cap_from_env_value(Some("0"), available);
    assert_eq!(explicit_zero, 0);

    let malformed = cache_byte_cap_from_env_value(Some("not-a-number"), available);
    assert_eq!(
        malformed, default_cap,
        "malformed env should keep the default memory guard active"
    );

    let unset = cache_byte_cap_from_env_value(None, available);
    assert_eq!(unset, default_cap);
}
13034
/// An absent or unrecognised policy value selects `Lru`; both the hyphenated
/// and the underscored spelling of S3-FIFO select `S3Fifo`.
#[test]
fn cache_eviction_policy_env_defaults_to_lru_and_accepts_s3_fifo() {
    let unset = cache_eviction_policy_from_env_value(None);
    assert_eq!(unset, CacheEvictionPolicy::Lru);

    let unrecognised = cache_eviction_policy_from_env_value(Some("not-a-policy"));
    assert_eq!(
        unrecognised,
        CacheEvictionPolicy::Lru,
        "malformed env keeps the current LRU behavior"
    );

    for spelling in ["s3-fifo", "s3_fifo"] {
        assert_eq!(
            cache_eviction_policy_from_env_value(Some(spelling)),
            CacheEvictionPolicy::S3Fifo
        );
    }
}
13055
/// Under the `S3Fifo` policy, the first `put` of a byte-heavy entry is not
/// stored (cost stays 0) but is counted as one ghost entry and one admission
/// reject; a second `put` with the same key is stored, clears the ghost
/// bookkeeping, and leaves the reject count at 1.
#[test]
fn s3_fifo_admission_rejects_one_off_byte_heavy_entries_then_admits_ghost_replay() {
    // Payload: "large" repeated 1000 times.
    let content = "large".repeat(1_000);
    let hit = SearchHit {
        title: "large".into(),
        snippet: "large".into(),
        content: content.clone(),
        content_hash: stable_content_hash(&content),
        score: 1.0,
        source_path: "large-path".into(),
        agent: "a".into(),
        workspace: "w".into(),
        workspace_original: None,
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
    };
    let cached = cached_hit_from(&hit);
    // The cap leaves 1 KiB of headroom, so the entry fits on its own.
    let byte_cap = cached.approx_bytes() + 1_024;
    // Precondition: the entry is bigger than byte_cap / denominator (rounded up).
    // NOTE(review): presumably this is the threshold above which the S3-FIFO
    // policy treats an entry as "large" — confirm against the admission code.
    assert!(
        cached.approx_bytes() > byte_cap.div_ceil(S3_FIFO_LARGE_ENTRY_FRACTION_DENOMINATOR)
    );

    let mut cache = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::S3Fifo);
    // The key is cloned for the first put so the same key can be replayed below.
    let key = Arc::<str>::from("large-query");

    // First sighting: rejected, remembered as a ghost.
    cache.put("global", key.clone(), vec![cached.clone()]);
    assert_eq!(
        cache.total_cost(),
        0,
        "first one-off large entry is not admitted"
    );
    assert_eq!(cache.ghost_entries(), 1);
    assert_eq!(cache.admission_rejects(), 1);

    // Second sighting of the same key: admitted.
    cache.put("global", key, vec![cached]);
    assert_eq!(
        cache.total_cost(),
        1,
        "ghost replay admits the repeated query"
    );
    // Ghost bookkeeping is cleared, and the reject counter did not grow.
    assert_eq!(cache.ghost_entries(), 0);
    assert!(cache.ghost_keys.is_empty());
    assert_eq!(cache.admission_rejects(), 1);
    assert!(cache.total_bytes() <= cache.byte_cap());
}
13106
/// Under the `Lru` policy, a single byte-heavy entry that fits within the
/// byte cap is stored on its first `put`, with no ghost entry and no
/// admission reject recorded.
#[test]
fn lru_policy_keeps_admitting_large_entries_under_existing_caps() {
    // Payload: "large" repeated 1000 times.
    let payload = "large".repeat(1_000);
    let large_hit = SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "a".into(),
        source_path: "large-path".into(),
        workspace: "w".into(),
        workspace_original: None,
        title: "large".into(),
        snippet: "large".into(),
        content: payload.clone(),
        content_hash: stable_content_hash(&payload),
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        score: 1.0,
    };

    // The byte cap leaves 1 KiB of headroom above the entry's own size.
    let entry = cached_hit_from(&large_hit);
    let byte_cap = entry.approx_bytes() + 1_024;
    let mut shards = CacheShards::new_with_policy(100, byte_cap, CacheEvictionPolicy::Lru);

    let key = Arc::<str>::from("large-query");
    shards.put("global", key, vec![entry]);

    assert_eq!(shards.total_cost(), 1);
    assert_eq!(shards.ghost_entries(), 0);
    assert_eq!(shards.admission_rejects(), 0);
    assert_eq!(shards.policy_label(), "lru");
}
13139
/// Caches three queries, each carrying a hit with 50 + 50 + 100 characters of
/// text, in a cache whose byte cap is 100, and checks that at least one
/// eviction is reported and that the stats echo the configured byte cap.
#[test]
fn cache_byte_cap_triggers_eviction() {
    let search_client = SearchClient {
        // storage handles
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        semantic: Mutex::new(None),
        // result cache: generous total cap, byte cap of 100
        prefix_cache: Mutex::new(CacheShards::new(1000, 100)),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        // reload bookkeeping
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        // background warming and metrics
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        last_tantivy_total_count: Mutex::new(None),
    };

    let body = "c".repeat(100);
    let bulky_hit = SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "a".into(),
        source_path: "p".into(),
        workspace: "w".into(),
        workspace_original: None,
        title: "a".repeat(50),
        snippet: "b".repeat(50),
        content: body.clone(),
        content_hash: stable_content_hash(&body),
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        score: 1.0,
    };

    for query in ["q1", "q2", "q3"] {
        search_client.put_cache(
            query,
            &SearchFilters::default(),
            std::slice::from_ref(&bulky_hit),
        );
    }

    let snapshot = search_client.cache_stats();
    assert!(
        snapshot.eviction_count >= 1,
        "byte cap should trigger evictions"
    );
    assert_eq!(snapshot.byte_cap, 100, "byte cap should be reported");
}
13194
/// Fills a 1 KiB byte-capped cache with two small entries in the `"small"`
/// shard and then one much larger entry in the `"large"` shard. Both small
/// entries must survive, the `"large"` shard must be absent or empty, and the
/// cache must end up within its byte cap.
#[test]
fn cache_byte_pressure_evicts_byte_heavy_shard_before_small_entries() {
    let tiny_hit = SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "a".into(),
        source_path: "small-path".into(),
        workspace: "w".into(),
        workspace_original: None,
        title: "small".into(),
        snippet: "small".into(),
        content: "small".into(),
        content_hash: stable_content_hash("small"),
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        score: 1.0,
    };

    // "large" repeated 2000 times.
    let bulky_body = "large".repeat(2_000);
    let bulky_hit = SearchHit {
        source_id: "local".into(),
        origin_kind: "local".into(),
        origin_host: None,
        conversation_id: None,
        agent: "b".into(),
        source_path: "large-path".into(),
        workspace: "w".into(),
        workspace_original: None,
        title: "large".into(),
        snippet: "large".into(),
        content: bulky_body.clone(),
        content_hash: stable_content_hash(&bulky_body),
        created_at: None,
        line_number: None,
        match_type: MatchType::Exact,
        score: 1.0,
    };

    let mut shards = CacheShards::new(100, 1_024);

    // Small entries first, then the oversized one.
    for key in ["small-1", "small-2"] {
        shards.put(
            "small",
            Arc::<str>::from(key),
            vec![cached_hit_from(&tiny_hit)],
        );
    }
    shards.put(
        "large",
        Arc::<str>::from("large-1"),
        vec![cached_hit_from(&bulky_hit)],
    );

    let small_len = shards.shard_opt("small").map(|shard| shard.len());
    assert_eq!(
        small_len,
        Some(2),
        "byte pressure should preserve the small shard"
    );

    let large_gone = shards.shard_opt("large").is_none_or(|shard| shard.is_empty());
    assert!(
        large_gone,
        "oversized shard should be evicted first under byte pressure"
    );

    assert!(shards.total_bytes() <= shards.byte_cap());
}
13263
13264 #[test]
13269 fn wildcard_pattern_parse_exact() {
13270 assert_eq!(
13272 FsCassWildcardPattern::parse("hello"),
13273 FsCassWildcardPattern::Exact("hello".into())
13274 );
13275 assert_eq!(
13276 FsCassWildcardPattern::parse("HELLO"),
13277 FsCassWildcardPattern::Exact("hello".into()) );
13279 assert_eq!(
13280 FsCassWildcardPattern::parse("FooBar123"),
13281 FsCassWildcardPattern::Exact("foobar123".into())
13282 );
13283 }
13284
13285 #[test]
13286 fn wildcard_pattern_parse_prefix() {
13287 assert_eq!(
13289 FsCassWildcardPattern::parse("foo*"),
13290 FsCassWildcardPattern::Prefix("foo".into())
13291 );
13292 assert_eq!(
13293 FsCassWildcardPattern::parse("CONFIG*"),
13294 FsCassWildcardPattern::Prefix("config".into())
13295 );
13296 assert_eq!(
13297 FsCassWildcardPattern::parse("test*"),
13298 FsCassWildcardPattern::Prefix("test".into())
13299 );
13300 }
13301
13302 #[test]
13303 fn wildcard_pattern_parse_suffix() {
13304 assert_eq!(
13306 FsCassWildcardPattern::parse("*foo"),
13307 FsCassWildcardPattern::Suffix("foo".into())
13308 );
13309 assert_eq!(
13310 FsCassWildcardPattern::parse("*Error"),
13311 FsCassWildcardPattern::Suffix("error".into())
13312 );
13313 assert_eq!(
13314 FsCassWildcardPattern::parse("*Handler"),
13315 FsCassWildcardPattern::Suffix("handler".into())
13316 );
13317 }
13318
13319 #[test]
13320 fn wildcard_pattern_parse_substring() {
13321 assert_eq!(
13323 FsCassWildcardPattern::parse("*foo*"),
13324 FsCassWildcardPattern::Substring("foo".into())
13325 );
13326 assert_eq!(
13327 FsCassWildcardPattern::parse("*CONFIG*"),
13328 FsCassWildcardPattern::Substring("config".into())
13329 );
13330 assert_eq!(
13331 FsCassWildcardPattern::parse("*test*"),
13332 FsCassWildcardPattern::Substring("test".into())
13333 );
13334 }
13335
13336 #[test]
13337 fn wildcard_pattern_parse_edge_cases() {
13338 assert_eq!(
13340 FsCassWildcardPattern::parse("*"),
13341 FsCassWildcardPattern::Exact(String::new())
13342 );
13343 assert_eq!(
13344 FsCassWildcardPattern::parse("**"),
13345 FsCassWildcardPattern::Exact(String::new())
13346 );
13347 assert_eq!(
13348 FsCassWildcardPattern::parse("***"),
13349 FsCassWildcardPattern::Exact(String::new())
13350 );
13351
13352 assert_eq!(
13354 FsCassWildcardPattern::parse("*a*"),
13355 FsCassWildcardPattern::Substring("a".into())
13356 );
13357 assert_eq!(
13358 FsCassWildcardPattern::parse("a*"),
13359 FsCassWildcardPattern::Prefix("a".into())
13360 );
13361 assert_eq!(
13362 FsCassWildcardPattern::parse("*a"),
13363 FsCassWildcardPattern::Suffix("a".into())
13364 );
13365
13366 assert_eq!(
13368 FsCassWildcardPattern::parse("***foo***"),
13369 FsCassWildcardPattern::Substring("foo".into())
13370 );
13371 }
13372
13373 #[test]
13374 fn wildcard_pattern_to_regex_suffix() {
13375 let pattern = FsCassWildcardPattern::Suffix("foo".into());
13376 assert_eq!(pattern.to_regex(), Some(".*foo$".into()));
13378 }
13379
13380 #[test]
13381 fn wildcard_pattern_to_regex_substring() {
13382 let pattern = FsCassWildcardPattern::Substring("bar".into());
13383 assert_eq!(pattern.to_regex(), Some(".*bar.*".into()));
13384 }
13385
13386 #[test]
13387 fn wildcard_pattern_to_regex_exact_prefix_none() {
13388 let exact = FsCassWildcardPattern::Exact("foo".into());
13390 assert_eq!(exact.to_regex(), None);
13391
13392 let prefix = FsCassWildcardPattern::Prefix("bar".into());
13393 assert_eq!(prefix.to_regex(), None);
13394 }
13395
13396 #[test]
13397 fn match_type_quality_factors() {
13398 assert_eq!(MatchType::Exact.quality_factor(), 1.0);
13400 assert_eq!(MatchType::Prefix.quality_factor(), 0.9);
13402 assert_eq!(MatchType::Suffix.quality_factor(), 0.8);
13404 assert_eq!(MatchType::Substring.quality_factor(), 0.7);
13406 assert_eq!(MatchType::ImplicitWildcard.quality_factor(), 0.6);
13408 }
13409
13410 #[test]
13411 fn dominant_match_type_single_terms() {
13412 assert_eq!(dominant_match_type("hello"), MatchType::Exact);
13414 assert_eq!(dominant_match_type("hello*"), MatchType::Prefix);
13415 assert_eq!(dominant_match_type("*hello"), MatchType::Suffix);
13416 assert_eq!(dominant_match_type("*hello*"), MatchType::Substring);
13417 }
13418
13419 #[test]
13420 fn dominant_match_type_multiple_terms() {
13421 assert_eq!(dominant_match_type("foo bar"), MatchType::Exact);
13423 assert_eq!(dominant_match_type("foo bar*"), MatchType::Prefix);
13424 assert_eq!(dominant_match_type("foo *bar"), MatchType::Suffix);
13425 assert_eq!(dominant_match_type("foo* *bar*"), MatchType::Substring);
13426 assert_eq!(dominant_match_type("foo *bar* baz"), MatchType::Substring);
13428 }
13429
13430 #[test]
13431 fn dominant_match_type_empty_query() {
13432 assert_eq!(dominant_match_type(""), MatchType::Exact);
13433 assert_eq!(dominant_match_type(" "), MatchType::Exact);
13434 }
13435
13436 #[test]
13437 fn wildcard_pattern_to_regex_escapes_special_chars() {
13438 assert_eq!(
13439 FsCassWildcardPattern::Suffix("foo.bar".into()).to_regex(),
13440 Some(".*foo\\.bar$".into())
13441 );
13442 assert_eq!(
13443 FsCassWildcardPattern::Substring("a+b*c?".into()).to_regex(),
13444 Some(".*a\\+b\\*c\\?.*".into())
13445 );
13446 }
13447
13448 #[test]
13449 fn wildcard_pattern_to_regex_escapes_complex_patterns() {
13450 assert_eq!(
13451 FsCassWildcardPattern::Suffix("test[0-9]+".into()).to_regex(),
13452 Some(".*test\\[0-9\\]\\+$".into())
13453 );
13454 assert_eq!(
13455 FsCassWildcardPattern::Substring("(a|b)".into()).to_regex(),
13456 Some(".*\\(a\\|b\\).*".into())
13457 );
13458 assert_eq!(
13459 FsCassWildcardPattern::Substring("end$".into()).to_regex(),
13460 Some(".*end\\$.*".into())
13461 );
13462 assert_eq!(
13463 FsCassWildcardPattern::Substring("^start".into()).to_regex(),
13464 Some(".*\\^start.*".into())
13465 );
13466 }
13467
13468 #[test]
13469 fn is_tool_invocation_noise_detects_noise() {
13470 assert!(!is_tool_invocation_noise("[Tool: Bash]"));
13472 assert!(!is_tool_invocation_noise("[Tool: Read]"));
13473
13474 assert!(is_tool_invocation_noise("[Tool:]"));
13476 assert!(is_tool_invocation_noise("[Tool: ]"));
13477
13478 assert!(!is_tool_invocation_noise("[Tool: Bash - Check status]"));
13480 assert!(!is_tool_invocation_noise(" [Tool: Grep - Search files] "));
13481
13482 assert!(is_tool_invocation_noise("[tool]"));
13484 assert!(is_tool_invocation_noise("tool: Bash"));
13485 }
13486
13487 #[test]
13488 fn is_tool_invocation_noise_allows_useful_content() {
13489 assert!(!is_tool_invocation_noise("[Tool: Read - src/main.rs]"));
13491 assert!(!is_tool_invocation_noise("[Tool: Bash - cargo test --lib]"));
13492 }
13493
13494 #[test]
13495 fn is_tool_invocation_noise_detects_tool_markers() {
13496 assert!(!is_tool_invocation_noise("[Tool: Bash]"));
13498 assert!(!is_tool_invocation_noise("[Tool: Read]"));
13499
13500 assert!(is_tool_invocation_noise("[Tool:]"));
13502
13503 assert!(!is_tool_invocation_noise("[Tool: Bash - Check status]"));
13505 assert!(!is_tool_invocation_noise(" [Tool: Write - description] "));
13506 }
13507
13508 #[test]
13509 fn deduplicate_hits_removes_exact_dupes() {
13510 let hits = vec![
13511 SearchHit {
13512 title: "title1".into(),
13513 snippet: "snip1".into(),
13514 content: "hello world".into(),
13515 content_hash: stable_content_hash("hello world"),
13516 score: 1.0,
13517 source_path: "a.jsonl".into(),
13518 agent: "agent".into(),
13519 workspace: "ws".into(),
13520 workspace_original: None,
13521 created_at: Some(100),
13522 line_number: None,
13523 match_type: MatchType::Exact,
13524 source_id: "local".into(),
13525 origin_kind: "local".into(),
13526 origin_host: None,
13527 conversation_id: None,
13528 },
13529 SearchHit {
13530 title: "title1".into(),
13531 snippet: "snip2".into(),
13532 content: "hello world".into(), content_hash: stable_content_hash("hello world"),
13534 score: 0.5, source_path: "a.jsonl".into(),
13536 agent: "agent".into(),
13537 workspace: "ws".into(),
13538 workspace_original: None,
13539 created_at: Some(100),
13540 line_number: None,
13541 match_type: MatchType::Exact,
13542 source_id: "local".into(), origin_kind: "local".into(),
13544 origin_host: None,
13545 conversation_id: None,
13546 },
13547 ];
13548
13549 let deduped = deduplicate_hits(hits);
13550 assert_eq!(deduped.len(), 1);
13551 assert_eq!(deduped[0].score, 1.0); assert_eq!(deduped[0].title, "title1");
13553 }
13554
13555 #[test]
13556 fn deduplicate_hits_keeps_higher_score() {
13557 let hits = vec![
13558 SearchHit {
13559 title: "title1".into(),
13560 snippet: "snip1".into(),
13561 content: "hello world".into(),
13562 content_hash: stable_content_hash("hello world"),
13563 score: 0.3, source_path: "a.jsonl".into(),
13565 agent: "agent".into(),
13566 workspace: "ws".into(),
13567 workspace_original: None,
13568 created_at: Some(100),
13569 line_number: None,
13570 match_type: MatchType::Exact,
13571 source_id: "local".into(),
13572 origin_kind: "local".into(),
13573 origin_host: None,
13574 conversation_id: None,
13575 },
13576 SearchHit {
13577 title: "title1".into(),
13578 snippet: "snip2".into(),
13579 content: "hello world".into(),
13580 content_hash: stable_content_hash("hello world"),
13581 score: 0.9, source_path: "a.jsonl".into(),
13583 agent: "agent".into(),
13584 workspace: "ws".into(),
13585 workspace_original: None,
13586 created_at: Some(100),
13587 line_number: None,
13588 match_type: MatchType::Exact,
13589 source_id: "local".into(),
13590 origin_kind: "local".into(),
13591 origin_host: None,
13592 conversation_id: None,
13593 },
13594 ];
13595
13596 let deduped = deduplicate_hits(hits);
13597 assert_eq!(deduped.len(), 1);
13598 assert_eq!(deduped[0].score, 0.9); assert_eq!(deduped[0].title, "title1");
13600 }
13601
13602 #[test]
13603 fn deduplicate_hits_keeps_repeated_same_content_at_different_lines() {
13604 let first = SearchHit {
13605 title: "Shared Session".into(),
13606 snippet: String::new(),
13607 content: "repeat me".into(),
13608 content_hash: stable_content_hash("repeat me"),
13609 score: 10.0,
13610 source_path: "/shared/session.jsonl".into(),
13611 agent: "codex".into(),
13612 workspace: "/ws".into(),
13613 workspace_original: None,
13614 created_at: Some(100),
13615 line_number: Some(1),
13616 match_type: MatchType::Exact,
13617 source_id: "local".into(),
13618 origin_kind: "local".into(),
13619 origin_host: None,
13620 conversation_id: None,
13621 };
13622 let mut second = first.clone();
13623 second.line_number = Some(2);
13624 second.created_at = Some(200);
13625 second.score = 9.0;
13626
13627 let deduped = deduplicate_hits(vec![first, second]);
13628 assert_eq!(deduped.len(), 2);
13629 }
13630
13631 #[test]
13632 fn deduplicate_hits_keeps_distinct_conversation_ids_with_same_title_path_and_content() {
13633 let mut first = make_test_hit("same", 1.0);
13634 first.title = "Shared Session".into();
13635 first.source_path = "/shared/session.jsonl".into();
13636 first.content = "identical body".into();
13637 first.content_hash = stable_content_hash("identical body");
13638 first.conversation_id = Some(1);
13639
13640 let mut second = first.clone();
13641 second.conversation_id = Some(2);
13642 second.score = 0.9;
13643
13644 let deduped = deduplicate_hits(vec![first, second]);
13645 assert_eq!(deduped.len(), 2);
13646 assert!(deduped.iter().any(|hit| hit.conversation_id == Some(1)));
13647 assert!(deduped.iter().any(|hit| hit.conversation_id == Some(2)));
13648 }
13649
13650 #[test]
13651 fn deduplicate_hits_coalesces_same_conversation_id_despite_title_drift() {
13652 let mut first = make_test_hit("same", 1.0);
13653 first.title = "Morning Session".into();
13654 first.source_path = "/shared/session.jsonl".into();
13655 first.content = "identical body".into();
13656 first.content_hash = stable_content_hash("identical body");
13657 first.conversation_id = Some(7);
13658
13659 let mut second = first.clone();
13660 second.title = "Evening Session".into();
13661 second.score = 0.9;
13662
13663 let deduped = deduplicate_hits(vec![first, second]);
13664 assert_eq!(deduped.len(), 1);
13665 assert_eq!(deduped[0].conversation_id, Some(7));
13666 }
13667
13668 #[test]
13669 fn deduplicate_hits_keeps_distinct_titles_with_same_source_path_and_content() {
13670 let hits = vec![
13671 SearchHit {
13672 title: "Morning Session".into(),
13673 snippet: "snip1".into(),
13674 content: "hello world".into(),
13675 content_hash: stable_content_hash("hello world"),
13676 score: 0.9,
13677 source_path: "shared.jsonl".into(),
13678 agent: "agent".into(),
13679 workspace: "ws".into(),
13680 workspace_original: None,
13681 created_at: None,
13682 line_number: Some(1),
13683 match_type: MatchType::Exact,
13684 source_id: "local".into(),
13685 origin_kind: "local".into(),
13686 origin_host: None,
13687 conversation_id: None,
13688 },
13689 SearchHit {
13690 title: "Evening Session".into(),
13691 snippet: "snip2".into(),
13692 content: "hello world".into(),
13693 content_hash: stable_content_hash("hello world"),
13694 score: 0.8,
13695 source_path: "shared.jsonl".into(),
13696 agent: "agent".into(),
13697 workspace: "ws".into(),
13698 workspace_original: None,
13699 created_at: None,
13700 line_number: Some(1),
13701 match_type: MatchType::Exact,
13702 source_id: "local".into(),
13703 origin_kind: "local".into(),
13704 origin_host: None,
13705 conversation_id: None,
13706 },
13707 ];
13708
13709 let deduped = deduplicate_hits(hits);
13710 assert_eq!(deduped.len(), 2);
13711 assert!(deduped.iter().any(|hit| hit.title == "Morning Session"));
13712 assert!(deduped.iter().any(|hit| hit.title == "Evening Session"));
13713 }
13714
    /// Feeds two hits with the same title, path, and source into
    /// `deduplicate_hits` and expects a single survivor.
    ///
    /// NOTE(review): going by the test name, the two `content` values are meant
    /// to differ only in whitespace. As shown here both literals (and both
    /// `stable_content_hash` inputs) read identically — confirm the fixture
    /// still contains the extra whitespace, otherwise this test only repeats
    /// the exact-duplicate case.
    #[test]
    fn deduplicate_hits_normalizes_whitespace() {
        let hits = vec![
            SearchHit {
                title: "title1".into(),
                snippet: "snip1".into(),
                content: "hello world".into(),
                content_hash: stable_content_hash("hello world"),
                score: 1.0,
                source_path: "a.jsonl".into(),
                agent: "agent".into(),
                workspace: "ws".into(),
                workspace_original: None,
                created_at: Some(100),
                line_number: None,
                match_type: MatchType::Exact,
                source_id: "local".into(),
                origin_kind: "local".into(),
                origin_host: None,
                conversation_id: None,
            },
            // Second hit: different snippet and a lower score than the first.
            SearchHit {
                title: "title1".into(),
                snippet: "snip2".into(),
                content: "hello world".into(),
                content_hash: stable_content_hash("hello world"),
                score: 0.5,
                source_path: "a.jsonl".into(),
                agent: "agent".into(),
                workspace: "ws".into(),
                workspace_original: None,
                created_at: Some(100),
                line_number: None,
                match_type: MatchType::Exact,
                source_id: "local".into(),
                origin_kind: "local".into(),
                origin_host: None,
                conversation_id: None,
            },
        ];

        let deduped = deduplicate_hits(hits);
        // Both hits must be treated as the same result.
        assert_eq!(deduped.len(), 1);
    }
13759
13760 #[test]
13761 fn deduplicate_hits_normalizes_blank_local_source_id() {
13762 let hits = vec![
13763 SearchHit {
13764 title: "title1".into(),
13765 snippet: "snip1".into(),
13766 content: "hello world".into(),
13767 content_hash: stable_content_hash("hello world"),
13768 score: 1.0,
13769 source_path: "a.jsonl".into(),
13770 agent: "agent".into(),
13771 workspace: "ws".into(),
13772 workspace_original: None,
13773 created_at: Some(100),
13774 line_number: None,
13775 match_type: MatchType::Exact,
13776 source_id: "local".into(),
13777 origin_kind: "local".into(),
13778 origin_host: None,
13779 conversation_id: None,
13780 },
13781 SearchHit {
13782 title: "title1".into(),
13783 snippet: "snip2".into(),
13784 content: "hello world".into(),
13785 content_hash: stable_content_hash("hello world"),
13786 score: 0.5,
13787 source_path: "a.jsonl".into(),
13788 agent: "agent".into(),
13789 workspace: "ws".into(),
13790 workspace_original: None,
13791 created_at: Some(100),
13792 line_number: None,
13793 match_type: MatchType::Exact,
13794 source_id: " ".into(),
13795 origin_kind: "local".into(),
13796 origin_host: None,
13797 conversation_id: None,
13798 },
13799 ];
13800
13801 let deduped = deduplicate_hits(hits);
13802 assert_eq!(deduped.len(), 1);
13803 assert_eq!(deduped[0].source_id, "local");
13804 }
13805
13806 #[test]
13807 fn deduplicate_hits_filters_tool_noise() {
13808 let hits = vec![
13809 SearchHit {
13810 title: "title1".into(),
13811 snippet: "snip1".into(),
13812 content: "[Tool:]".into(), content_hash: stable_content_hash("[Tool:]"),
13814 score: 1.0,
13815 source_path: "a.jsonl".into(),
13816 agent: "agent".into(),
13817 workspace: "ws".into(),
13818 workspace_original: None,
13819 created_at: Some(100),
13820 line_number: None,
13821 match_type: MatchType::Exact,
13822 source_id: "local".into(),
13823 origin_kind: "local".into(),
13824 origin_host: None,
13825 conversation_id: None,
13826 },
13827 SearchHit {
13828 title: "title2".into(),
13829 snippet: "snip2".into(),
13830 content: "This is real content about testing".into(),
13831 content_hash: stable_content_hash("This is real content about testing"),
13832 score: 0.5,
13833 source_path: "b.jsonl".into(),
13834 agent: "agent".into(),
13835 workspace: "ws".into(),
13836 workspace_original: None,
13837 created_at: Some(200),
13838 line_number: None,
13839 match_type: MatchType::Exact,
13840 source_id: "local".into(),
13841 origin_kind: "local".into(),
13842 origin_host: None,
13843 conversation_id: None,
13844 },
13845 ];
13846
13847 let deduped = deduplicate_hits(hits);
13848 assert_eq!(deduped.len(), 1);
13849 assert!(deduped[0].content.contains("real content"));
13850 }
13851
13852 #[test]
13853 fn deduplicate_hits_filters_acknowledgement_noise() {
13854 let hits = vec![
13855 SearchHit {
13856 title: "ack".into(),
13857 snippet: "ack".into(),
13858 content: "Acknowledged.".into(),
13859 content_hash: stable_content_hash("Acknowledged."),
13860 score: 1.0,
13861 source_path: "ack.jsonl".into(),
13862 agent: "agent".into(),
13863 workspace: "ws".into(),
13864 workspace_original: None,
13865 created_at: Some(100),
13866 line_number: None,
13867 match_type: MatchType::Exact,
13868 source_id: "local".into(),
13869 origin_kind: "local".into(),
13870 origin_host: None,
13871 conversation_id: None,
13872 },
13873 SearchHit {
13874 title: "real".into(),
13875 snippet: "real".into(),
13876 content: "Authentication refresh logic changed".into(),
13877 content_hash: stable_content_hash("Authentication refresh logic changed"),
13878 score: 0.5,
13879 source_path: "real.jsonl".into(),
13880 agent: "agent".into(),
13881 workspace: "ws".into(),
13882 workspace_original: None,
13883 created_at: Some(200),
13884 line_number: None,
13885 match_type: MatchType::Exact,
13886 source_id: "local".into(),
13887 origin_kind: "local".into(),
13888 origin_host: None,
13889 conversation_id: None,
13890 },
13891 ];
13892
13893 let deduped = deduplicate_hits_with_query(hits, "authentication");
13894 assert_eq!(deduped.len(), 1);
13895 assert_eq!(deduped[0].title, "real");
13896 }
13897
13898 #[test]
13899 fn deduplicate_hits_hides_system_prompts_unless_query_requests_them() {
13900 let prompt_hit = SearchHit {
13901 title: "prompt".into(),
13902 snippet: "prompt".into(),
13903 content:
13904 "# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly."
13905 .into(),
13906 content_hash: stable_content_hash(
13907 "# AGENTS.md instructions for /repo\n\nYou are a coding assistant. Follow the instructions exactly.",
13908 ),
13909 score: 1.0,
13910 source_path: "prompt.jsonl".into(),
13911 agent: "agent".into(),
13912 workspace: "ws".into(),
13913 workspace_original: None,
13914 created_at: Some(100),
13915 line_number: None,
13916 match_type: MatchType::Exact,
13917 source_id: "local".into(),
13918 origin_kind: "local".into(),
13919 origin_host: None,
13920 conversation_id: None,
13921 };
13922
13923 assert!(
13924 deduplicate_hits_with_query(vec![prompt_hit.clone()], "coding assistant").is_empty()
13925 );
13926
13927 let kept = deduplicate_hits_with_query(vec![prompt_hit], "AGENTS.md instructions");
13928 assert_eq!(kept.len(), 1);
13929 assert_eq!(kept[0].title, "prompt");
13930 }
13931
13932 #[test]
13933 fn deduplicate_hits_preserves_unique_content() {
13934 let hits = vec![
13935 SearchHit {
13936 title: "title1".into(),
13937 snippet: "snip1".into(),
13938 content: "first message".into(),
13939 content_hash: stable_content_hash("first message"),
13940 score: 1.0,
13941 source_path: "a.jsonl".into(),
13942 agent: "agent".into(),
13943 workspace: "ws".into(),
13944 workspace_original: None,
13945 created_at: Some(100),
13946 line_number: None,
13947 match_type: MatchType::Exact,
13948 source_id: "local".into(),
13949 origin_kind: "local".into(),
13950 origin_host: None,
13951 conversation_id: None,
13952 },
13953 SearchHit {
13954 title: "title2".into(),
13955 snippet: "snip2".into(),
13956 content: "second message".into(),
13957 content_hash: stable_content_hash("second message"),
13958 score: 0.8,
13959 source_path: "b.jsonl".into(),
13960 agent: "agent".into(),
13961 workspace: "ws".into(),
13962 workspace_original: None,
13963 created_at: Some(200),
13964 line_number: None,
13965 match_type: MatchType::Exact,
13966 source_id: "local".into(),
13967 origin_kind: "local".into(),
13968 origin_host: None,
13969 conversation_id: None,
13970 },
13971 SearchHit {
13972 title: "title3".into(),
13973 snippet: "snip3".into(),
13974 content: "third message".into(),
13975 content_hash: stable_content_hash("third message"),
13976 score: 0.6,
13977 source_path: "c.jsonl".into(),
13978 agent: "agent".into(),
13979 workspace: "ws".into(),
13980 workspace_original: None,
13981 created_at: Some(300),
13982 line_number: None,
13983 match_type: MatchType::Exact,
13984 source_id: "local".into(),
13985 origin_kind: "local".into(),
13986 origin_host: None,
13987 conversation_id: None,
13988 },
13989 ];
13990
13991 let deduped = deduplicate_hits(hits);
13992 assert_eq!(deduped.len(), 3); }
13994
13995 #[test]
13998 fn deduplicate_hits_respects_source_boundaries() {
13999 let hits = vec![
14000 SearchHit {
14001 title: "local title".into(),
14002 snippet: "snip".into(),
14003 content: "hello world".into(),
14004 content_hash: stable_content_hash("hello world"),
14005 score: 1.0,
14006 source_path: "a.jsonl".into(),
14007 agent: "agent".into(),
14008 workspace: "ws".into(),
14009 workspace_original: None,
14010 created_at: Some(100),
14011 line_number: None,
14012 match_type: MatchType::Exact,
14013 source_id: "local".into(),
14014 origin_kind: "local".into(),
14015 origin_host: None,
14016 conversation_id: None,
14017 },
14018 SearchHit {
14019 title: "remote title".into(),
14020 snippet: "snip".into(),
14021 content: "hello world".into(), content_hash: stable_content_hash("hello world"),
14023 score: 0.9,
14024 source_path: "b.jsonl".into(),
14025 agent: "agent".into(),
14026 workspace: "ws".into(),
14027 workspace_original: None,
14028 created_at: Some(200),
14029 line_number: None,
14030 match_type: MatchType::Exact,
14031 source_id: "work-laptop".into(), origin_kind: "ssh".into(),
14033 origin_host: Some("work-laptop.local".into()),
14034 conversation_id: None,
14035 },
14036 ];
14037
14038 let deduped = deduplicate_hits(hits);
14039 assert_eq!(
14040 deduped.len(),
14041 2,
14042 "same content from different sources should not dedupe"
14043 );
14044 assert!(deduped.iter().any(|h| h.source_id == "local"));
14045 assert!(deduped.iter().any(|h| h.source_id == "work-laptop"));
14046 }
14047
14048 #[test]
14049 fn wildcard_fallback_sparse_check_uses_effective_limit() {
14050 assert!(
14051 !should_try_wildcard_fallback(1, 1, 0, 3),
14052 "a filled one-result page is not sparse for fallback purposes"
14053 );
14054 assert!(
14055 !should_try_wildcard_fallback(2, 2, 0, 3),
14056 "a filled two-result page is not sparse for fallback purposes"
14057 );
14058 assert!(
14059 should_try_wildcard_fallback(0, 1, 0, 3),
14060 "zero hits should still trigger fallback even for tiny pages"
14061 );
14062 assert!(
14063 should_try_wildcard_fallback(1, 2, 0, 3),
14064 "a partially filled page should still trigger fallback"
14065 );
14066 assert!(
14067 !should_try_wildcard_fallback(0, 5, 10, 3),
14068 "pagination should not trigger wildcard fallback"
14069 );
14070 assert!(
14071 should_try_wildcard_fallback(1, 0, 0, 3),
14072 "limit zero preserves the legacy sparse-threshold semantics"
14073 );
14074 }
14075
14076 #[test]
14077 fn snippet_preview_fast_path_requires_snippet_only_match() {
14078 let snippet_only = FieldMask::new(false, true, false, false);
14079 let snippet = snippet_from_preview_without_full_content(
14080 snippet_only,
14081 "migration checks the database constraint before writing",
14082 "database",
14083 )
14084 .expect("preview should satisfy a snippet-only request when it contains the query");
14085 assert!(snippet.contains("**database**"));
14086
14087 assert!(
14088 snippet_from_preview_without_full_content(
14089 FieldMask::FULL,
14090 "migration checks the database constraint before writing",
14091 "database",
14092 )
14093 .is_none(),
14094 "full-content requests must keep the sqlite hydration path"
14095 );
14096 assert!(
14097 snippet_from_preview_without_full_content(
14098 snippet_only,
14099 "migration checks constraints before writing",
14100 "database",
14101 )
14102 .is_none(),
14103 "snippet-only requests hydrate when the preview cannot show the match"
14104 );
14105 }
14106
14107 #[test]
14108 fn search_with_fallback_returns_exact_when_sufficient() -> Result<()> {
14109 let dir = TempDir::new()?;
14110 let mut index = TantivyIndex::open_or_create(dir.path())?;
14111
14112 for i in 0..5 {
14114 let conv = NormalizedConversation {
14115 agent_slug: "codex".into(),
14116 external_id: None,
14117 title: Some(format!("doc-{i}")),
14118 workspace: Some(std::path::PathBuf::from("/ws")),
14119 source_path: dir.path().join(format!("{i}.jsonl")),
14120 started_at: Some(100 + i),
14121 ended_at: None,
14122 metadata: serde_json::json!({}),
14123 messages: vec![NormalizedMessage {
14124 idx: 0,
14125 role: "user".into(),
14126 author: None,
14127 created_at: Some(100 + i),
14128 content: format!("apple fruit number {i} is delicious and healthy"),
14130 extra: serde_json::json!({}),
14131 snippets: vec![],
14132 invocations: Vec::new(),
14133 }],
14134 };
14135 index.add_conversation(&conv)?;
14136 }
14137 index.commit()?;
14138
14139 let client = SearchClient::open(dir.path(), None)?.expect("index present");
14140
14141 let result = client.search_with_fallback(
14143 "apple",
14144 SearchFilters::default(),
14145 10,
14146 0,
14147 3, FieldMask::FULL,
14149 )?;
14150
14151 assert!(!result.wildcard_fallback);
14152 assert!(result.hits.len() >= 3); Ok(())
14155 }
14156
14157 #[test]
14158 fn search_with_fallback_triggers_on_sparse_results() -> Result<()> {
14159 let dir = TempDir::new()?;
14160 let mut index = TantivyIndex::open_or_create(dir.path())?;
14161
14162 let conv = NormalizedConversation {
14164 agent_slug: "codex".into(),
14165 external_id: None,
14166 title: Some("substring test".into()),
14167 workspace: Some(std::path::PathBuf::from("/ws")),
14168 source_path: dir.path().join("test.jsonl"),
14169 started_at: Some(100),
14170 ended_at: None,
14171 metadata: serde_json::json!({}),
14172 messages: vec![NormalizedMessage {
14173 idx: 0,
14174 role: "user".into(),
14175 author: None,
14176 created_at: Some(100),
14177 content: "configuration management system".into(),
14178 extra: serde_json::json!({}),
14179 snippets: vec![],
14180 invocations: Vec::new(),
14181 }],
14182 };
14183 index.add_conversation(&conv)?;
14184 index.commit()?;
14185
14186 let client = SearchClient::open(dir.path(), None)?.expect("index present");
14187
14188 let result = client.search_with_fallback(
14190 "config",
14191 SearchFilters::default(),
14192 10,
14193 0,
14194 5, FieldMask::FULL,
14196 )?;
14197
14198 assert!(!result.hits.is_empty());
14201
14202 Ok(())
14203 }
14204
14205 #[test]
14206 fn search_with_fallback_skips_when_query_has_wildcards() -> Result<()> {
14207 let dir = TempDir::new()?;
14208 let mut index = TantivyIndex::open_or_create(dir.path())?;
14209
14210 let conv = NormalizedConversation {
14211 agent_slug: "codex".into(),
14212 external_id: None,
14213 title: Some("test".into()),
14214 workspace: None,
14215 source_path: dir.path().join("test.jsonl"),
14216 started_at: Some(100),
14217 ended_at: None,
14218 metadata: serde_json::json!({}),
14219 messages: vec![NormalizedMessage {
14220 idx: 0,
14221 role: "user".into(),
14222 author: None,
14223 created_at: Some(100),
14224 content: "testing data".into(),
14225 extra: serde_json::json!({}),
14226 snippets: vec![],
14227 invocations: Vec::new(),
14228 }],
14229 };
14230 index.add_conversation(&conv)?;
14231 index.commit()?;
14232
14233 let client = SearchClient::open(dir.path(), None)?.expect("index present");
14234
14235 let result = client.search_with_fallback(
14237 "*test*",
14238 SearchFilters::default(),
14239 10,
14240 0,
14241 10, FieldMask::FULL,
14243 )?;
14244
14245 assert!(!result.wildcard_fallback); Ok(())
14247 }
14248
14249 #[test]
14250 fn search_with_fallback_prefers_wildcards_when_they_add_hits() -> Result<()> {
14251 let dir = TempDir::new()?;
14252 let mut index = TantivyIndex::open_or_create(dir.path())?;
14253
14254 for (i, body) in [
14257 "alphabet soup for coders",
14258 "mapping the alphabet city blocks",
14259 ]
14260 .iter()
14261 .enumerate()
14262 {
14263 let conv = NormalizedConversation {
14264 agent_slug: "codex".into(),
14265 external_id: None,
14266 title: Some(format!("alpha-{i}")),
14267 workspace: Some(std::path::PathBuf::from("/ws")),
14268 source_path: dir.path().join(format!("alpha-{i}.jsonl")),
14269 started_at: Some(100 + i as i64),
14270 ended_at: None,
14271 metadata: serde_json::json!({}),
14272 messages: vec![NormalizedMessage {
14273 idx: 0,
14274 role: "user".into(),
14275 author: None,
14276 created_at: Some(100 + i as i64),
14277 content: body.to_string(),
14278 extra: serde_json::json!({}),
14279 snippets: vec![],
14280 invocations: Vec::new(),
14281 }],
14282 };
14283 index.add_conversation(&conv)?;
14284 }
14285 index.commit()?;
14286
14287 let client = SearchClient::open(dir.path(), None)?.expect("index present");
14288
14289 let result = client.search_with_fallback(
14290 "bet",
14291 SearchFilters::default(),
14292 10,
14293 0,
14294 2,
14295 FieldMask::FULL,
14296 )?;
14297
14298 assert!(
14299 result.wildcard_fallback,
14300 "should switch to wildcard fallback when it yields more hits"
14301 );
14302 assert_eq!(
14303 result.hits.len(),
14304 2,
14305 "fallback should surface all alphabet docs"
14306 );
14307 assert!(
14308 result
14309 .hits
14310 .iter()
14311 .all(|h| h.match_type == MatchType::ImplicitWildcard)
14312 );
14313 assert!(result.hits.iter().all(|h| h.content.contains("alphabet")));
14314
14315 Ok(())
14316 }
14317
/// Checks the automatic wildcard fallback against a one-document index:
/// the long token `zzzzzzunlikelyterm` yields no hits and no automatic
/// fallback (a manual wildcard suggestion is still offered), while the short
/// token `pple` does fall back and returns one implicit-wildcard hit.
#[test]
fn automatic_wildcard_fallback_skips_long_zero_hit_token() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    // The only indexed document.
    let fruit_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let fruit_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: tmp.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![fruit_message],
    };
    writer.add_conversation(&fruit_conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    // Long token: empty result, fallback flag stays off.
    let long_outcome = searcher.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;
    assert!(long_outcome.hits.is_empty());
    assert!(!long_outcome.wildcard_fallback);
    let offers_manual_wildcard = long_outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_manual_wildcard,
        "manual wildcard suggestion should remain available"
    );

    // Short token: the fallback kicks in and finds the single document.
    let short_outcome = searcher.search_with_fallback(
        "pple",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;
    assert!(short_outcome.wildcard_fallback);
    assert_eq!(short_outcome.hits.len(), 1);
    assert_eq!(short_outcome.hits[0].match_type, MatchType::ImplicitWildcard);

    Ok(())
}
14380
/// With both a Tantivy index and a SQLite database path configured, a
/// zero-hit search must still offer the manual wildcard suggestion, must not
/// emit alternate-agent suggestions, and must leave the client's SQLite
/// handle unopened both before and after the search.
#[test]
fn nohit_suggestions_do_not_lazy_open_sqlite_when_tantivy_is_present() -> Result<()> {
    let tmp = TempDir::new()?;
    let index_path = tmp.path().join("index");
    let db_path = tmp.path().join("cass.db");

    // Create the database file on disk, then release it immediately.
    let storage = FrankenStorage::open(&db_path)?;
    storage.close()?;

    let mut writer = TantivyIndex::open_or_create(&index_path)?;
    let fruit_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "apple pear banana".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let fruit_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("fruit".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: tmp.path().join("fruit.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![fruit_message],
    };
    writer.add_conversation(&fruit_conversation)?;
    writer.commit()?;

    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("index present");

    // True only when the lock is acquired and holds no SQLite handle.
    let sqlite_is_closed = || {
        client
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };

    assert!(sqlite_is_closed(), "sqlite should start closed");

    let outcome = client.search_with_fallback(
        "zzzzzzunlikelyterm",
        SearchFilters::default(),
        10,
        0,
        1,
        FieldMask::FULL,
    )?;

    assert!(outcome.hits.is_empty());

    let offers_manual_wildcard = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(
        offers_manual_wildcard,
        "manual wildcard suggestion should remain available"
    );

    let no_alternate_agents = outcome
        .suggestions
        .iter()
        .all(|s| !matches!(s.kind, SuggestionKind::AlternateAgent));
    assert!(
        no_alternate_agents,
        "alternate-agent suggestions should not force a SQLite open"
    );

    assert!(
        sqlite_is_closed(),
        "sqlite should stay closed after Tantivy no-hit suggestions"
    );

    Ok(())
}
14459
/// A client with neither a Tantivy reader nor a SQLite handle returns no
/// hits for `ghost`, leaves the fallback flag unset, and proposes the
/// wildcard query `*ghost*`.
#[test]
fn search_with_fallback_emits_wildcard_suggestion_on_zero_hits() -> Result<()> {
    // Hand-built client without any backing store.
    let backendless = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let outcome = backendless.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        0,
        3,
        FieldMask::FULL,
    )?;

    assert!(
        outcome.hits.is_empty(),
        "no index/db means no hits should be returned"
    );
    assert!(
        !outcome.wildcard_fallback,
        "with zero baseline and fallback hits, we should keep baseline and mark fallback=false"
    );

    let suggested_wildcard = outcome
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
        .expect("should suggest adding wildcards")
        .suggested_query
        .as_deref();
    assert_eq!(suggested_wildcard, Some("*ghost*"));

    Ok(())
}
14506
/// Searching a populated index with a whitespace-only query must not report
/// a wildcard fallback.
#[test]
fn search_with_fallback_skips_empty_query() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "testing data".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: tmp.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    // The query consists of whitespace only.
    let outcome =
        searcher.search_with_fallback(" ", SearchFilters::default(), 10, 0, 10, FieldMask::FULL)?;

    assert!(!outcome.wildcard_fallback);
    Ok(())
}
14550
/// A search issued with a non-zero offset must not flag a wildcard fallback,
/// yet it still carries the `*ghost*` wildcard suggestion.
#[test]
fn search_with_fallback_skips_for_nonzero_offset() -> Result<()> {
    // Hand-built client without any backing store.
    let backendless = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // The fourth argument (10) is the non-zero offset under test.
    let outcome = backendless.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        10,
        3,
        FieldMask::FULL,
    )?;

    assert!(
        !outcome.wildcard_fallback,
        "fallback should not run on paginated searches"
    );

    let suggested_wildcard = outcome
        .suggestions
        .iter()
        .find(|s| matches!(s.kind, SuggestionKind::WildcardQuery))
        .expect("wildcard suggestion present")
        .suggested_query
        .as_deref();
    assert_eq!(suggested_wildcard, Some("*ghost*"));

    Ok(())
}
14594
/// For the query `claud` filtered to agent `codex` on a backend-less client,
/// the suggestion list is capped at three entries, each entry's shortcut
/// equals its 1-based position, and the list contains a wildcard, a
/// remove-filter and a spelling-fix suggestion.
#[test]
fn generate_suggestions_limits_and_sets_shortcuts() -> Result<()> {
    let backendless = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "vtest|schema:none".into(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let mut codex_only = SearchFilters::default();
    codex_only.agents.insert("codex".into());

    let outcome =
        backendless.search_with_fallback("claud", codex_only, 5, 0, 3, FieldMask::FULL)?;

    assert_eq!(
        outcome.suggestions.len(),
        3,
        "should truncate to 3 suggestions"
    );

    // Pair every suggestion with its expected 1-based shortcut number.
    for (expected_shortcut, suggestion) in (1u8..).zip(outcome.suggestions.iter()) {
        assert_eq!(
            suggestion.shortcut,
            Some(expected_shortcut),
            "shortcut should match position (1-based)"
        );
    }

    let has_wildcard = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::WildcardQuery));
    assert!(has_wildcard, "should suggest wildcard search");

    let has_remove_filter = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::RemoveFilter));
    assert!(has_remove_filter, "should suggest removing agent filter");

    let has_spelling_fix = outcome
        .suggestions
        .iter()
        .any(|s| matches!(s.kind, SuggestionKind::SpellingFix));
    assert!(
        has_spelling_fix,
        "should suggest spelling fix for nearby agent name"
    );

    Ok(())
}
14659
/// Seeds the SQLite store with one conversation each for `claude_code` and
/// `codex`, then checks that a zero-hit search for `ghost` produces
/// alternate-agent suggestions naming both agents.
#[test]
fn generate_suggestions_includes_recent_alternate_agents() -> Result<()> {
    let tmp = TempDir::new()?;
    let db_path = tmp.path().join("cass.db");
    let storage = FrankenStorage::open(&db_path)?;
    let workspace_id = storage.ensure_workspace(tmp.path(), None)?;
    let base_ts = 1_700_000_010_000_i64;

    for (idx, slug) in ["claude_code", "codex"].iter().copied().enumerate() {
        // Each agent gets its own timestamp, offset by its position.
        let stamp = base_ts + idx as i64;

        let agent_id = storage.ensure_agent(&Agent {
            id: None,
            slug: slug.to_string(),
            name: slug.to_string(),
            version: None,
            kind: AgentKind::Cli,
        })?;

        let sole_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(stamp),
            content: format!("content from {slug}"),
            extra_json: json!({}),
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: slug.to_string(),
            workspace: Some(tmp.path().to_path_buf()),
            external_id: Some(format!("alt-agent-{idx}")),
            title: Some(format!("alternate agent {idx}")),
            source_path: tmp.path().join(format!("{slug}.jsonl")),
            started_at: Some(stamp),
            ended_at: Some(stamp),
            approx_tokens: Some(8),
            metadata_json: json!({}),
            messages: vec![sole_message],
            source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
            origin_host: None,
        };
        storage.insert_conversation_tree(agent_id, Some(workspace_id), &conversation)?;
    }
    drop(storage);

    let client = SearchClient::open(tmp.path(), Some(&db_path))?.expect("db-backed client");
    let outcome = client.search_with_fallback(
        "ghost",
        SearchFilters::default(),
        5,
        0,
        3,
        FieldMask::FULL,
    )?;

    // Gather every agent named by an alternate-agent suggestion.
    let mut alternate_agents: HashSet<String> = HashSet::new();
    for suggestion in outcome.suggestions.iter() {
        if !matches!(suggestion.kind, SuggestionKind::AlternateAgent) {
            continue;
        }
        if let Some(filters) = suggestion.suggested_filters.as_ref() {
            alternate_agents.extend(filters.agents.iter().cloned());
        }
    }

    assert!(
        alternate_agents.contains("claude_code"),
        "should suggest claude_code from normalized conversations schema"
    );
    assert!(
        alternate_agents.contains("codex"),
        "should suggest codex from normalized conversations schema"
    );

    Ok(())
}
14734
/// Queries made only of word characters and `*` wildcards pass through the
/// sanitizer unchanged.
#[test]
fn sanitize_query_preserves_wildcards() {
    for unchanged in ["*foo*", "foo*", "*bar", "*config*"] {
        assert_eq!(
            fs_cass_sanitize_query(unchanged),
            unchanged,
            "input: {unchanged:?}"
        );
    }
}
14743
/// Pins the sanitizer's output for inputs containing `.`, `+`, `-` and `_`:
/// per the expectations below, hyphens are kept while the other characters
/// are not.
#[test]
fn sanitize_query_strips_other_special_chars() {
    let cases = [
        ("foo.bar", "foo bar"),
        ("c++", "c "),
        ("foo-bar", "foo-bar"),
        ("test_case", "test case"),
    ];
    for (raw, expected) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), expected, "input: {raw:?}");
    }
}
14752
/// Pins the sanitizer's output when wildcards appear next to other special
/// characters.
#[test]
fn sanitize_query_combined() {
    let cases = [
        ("*foo.bar*", "*foo bar*"),
        ("test-*", "test-*"),
        ("*c++*", "*c *"),
    ];
    for (raw, expected) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), expected, "input: {raw:?}");
    }
}
14760
/// Three whitespace-separated words parse into three `Term` tokens, in order.
#[test]
fn parse_boolean_query_simple_terms() {
    let parsed = fs_cass_parse_boolean_query("foo bar baz");
    let expected = [
        FsCassQueryToken::Term("foo".to_string()),
        FsCassQueryToken::Term("bar".to_string()),
        FsCassQueryToken::Term("baz".to_string()),
    ];

    assert_eq!(parsed.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&parsed[pos], wanted);
    }
}
14770
/// Both the `AND` keyword and the `&&` symbol parse into an `And` token
/// between the two terms.
#[test]
fn parse_boolean_query_and_operator() {
    let keyword_form = fs_cass_parse_boolean_query("foo AND bar");
    let expected = [
        FsCassQueryToken::Term("foo".to_string()),
        FsCassQueryToken::And,
        FsCassQueryToken::Term("bar".to_string()),
    ];
    assert_eq!(keyword_form.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&keyword_form[pos], wanted);
    }

    // Symbolic spelling: only the length and the operator slot are checked.
    let symbol_form = fs_cass_parse_boolean_query("foo && bar");
    assert_eq!(symbol_form.len(), 3);
    assert_eq!(symbol_form[1], FsCassQueryToken::And);
}
14784
/// Both the `OR` keyword and the `||` symbol parse into an `Or` token
/// between the two terms.
#[test]
fn parse_boolean_query_or_operator() {
    let keyword_form = fs_cass_parse_boolean_query("foo OR bar");
    let expected = [
        FsCassQueryToken::Term("foo".to_string()),
        FsCassQueryToken::Or,
        FsCassQueryToken::Term("bar".to_string()),
    ];
    assert_eq!(keyword_form.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&keyword_form[pos], wanted);
    }

    // Symbolic spelling: only the length and the operator slot are checked.
    let symbol_form = fs_cass_parse_boolean_query("foo || bar");
    assert_eq!(symbol_form.len(), 3);
    assert_eq!(symbol_form[1], FsCassQueryToken::Or);
}
14798
/// The `NOT` keyword parses into a `Not` token between the two terms.
#[test]
fn parse_boolean_query_not_operator() {
    let parsed = fs_cass_parse_boolean_query("foo NOT bar");
    let expected = [
        FsCassQueryToken::Term("foo".to_string()),
        FsCassQueryToken::Not,
        FsCassQueryToken::Term("bar".to_string()),
    ];

    assert_eq!(parsed.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&parsed[pos], wanted);
    }
}
14807
/// A double-quoted segment parses into a single `Phrase` token (quotes
/// removed), with the surrounding words as plain terms.
#[test]
fn parse_boolean_query_quoted_phrase() {
    let parsed = fs_cass_parse_boolean_query(r#"foo "exact phrase" bar"#);
    let expected = [
        FsCassQueryToken::Term("foo".to_string()),
        FsCassQueryToken::Phrase("exact phrase".to_string()),
        FsCassQueryToken::Term("bar".to_string()),
    ];

    assert_eq!(parsed.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&parsed[pos], wanted);
    }
}
14819
/// A query mixing `OR`, `NOT` and a quoted phrase parses into five tokens in
/// source order.
#[test]
fn parse_boolean_query_complex() {
    let parsed = fs_cass_parse_boolean_query(r#"error OR warning NOT "false positive""#);
    let expected = [
        FsCassQueryToken::Term("error".to_string()),
        FsCassQueryToken::Or,
        FsCassQueryToken::Term("warning".to_string()),
        FsCassQueryToken::Not,
        FsCassQueryToken::Phrase("false positive".to_string()),
    ];

    assert_eq!(parsed.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&parsed[pos], wanted);
    }
}
14833
/// Plain multi-word queries are not flagged as boolean; keyword operators,
/// symbolic operators and quoted phrases all are.
#[test]
fn has_boolean_operators_detection() {
    let cases = [
        ("foo bar", false),
        ("foo AND bar", true),
        ("foo OR bar", true),
        ("foo NOT bar", true),
        (r#""exact phrase""#, true),
        ("foo && bar", true),
        ("foo || bar", true),
    ];
    for (query, expected) in cases {
        assert_eq!(
            fs_cass_has_boolean_operators(query),
            expected,
            "query: {query:?}"
        );
    }
}
14844
/// Lower-case `and` / `or` / `not` are recognised as operators: the
/// seven-token result has them at the odd positions 1, 3 and 5.
#[test]
fn parse_boolean_query_case_insensitive_operators() {
    let parsed = fs_cass_parse_boolean_query("foo and bar or baz not qux");
    assert_eq!(parsed.len(), 7);

    let operator_slots = [
        (1, FsCassQueryToken::And),
        (3, FsCassQueryToken::Or),
        (5, FsCassQueryToken::Not),
    ];
    for (pos, wanted) in operator_slots {
        assert_eq!(parsed[pos], wanted);
    }
}
14854
/// Wildcard characters stay inside their `Term` tokens when combined with an
/// `OR` operator.
#[test]
fn parse_boolean_query_with_wildcards() {
    let parsed = fs_cass_parse_boolean_query("*config* OR env*");
    let expected = [
        FsCassQueryToken::Term("*config*".to_string()),
        FsCassQueryToken::Or,
        FsCassQueryToken::Term("env*".to_string()),
    ];

    assert_eq!(parsed.len(), expected.len());
    for (pos, wanted) in expected.iter().enumerate() {
        assert_eq!(&parsed[pos], wanted);
    }
}
14863
/// Stores the same two-message conversation in SQLite and in the Tantivy
/// index, then checks three things about the returned `content`:
///
/// 1. an unbounded search for `needle` returns the long message's text,
///    including the part after 70 repetitions of `padding `;
/// 2. the same search with a preview content limit of 200 returns content
///    that starts with the padding and omits the needle sentence;
/// 3. a search for `shortneedle` on a fresh client returns the short message
///    verbatim while that client's SQLite handle stays unopened.
#[test]
fn tantivy_search_hydrates_long_content_when_content_field_is_not_stored() -> Result<()> {
    let tmp = TempDir::new()?;
    let db_path = tmp.path().join("cass.db");
    let index_path = tmp.path().join("search-index");

    let user_ts = 1_700_000_123_000_i64;
    let assistant_ts = 1_700_000_124_000_i64;

    let long_content = format!(
        "{}needle appears past the preview boundary for hydration proof",
        "padding ".repeat(70)
    );
    let short_content = "shortneedle fits entirely inside the stored preview".to_string();

    // --- SQLite side -------------------------------------------------------
    let storage = FrankenStorage::open(&db_path)?;
    let workspace_id = storage.ensure_workspace(tmp.path(), None)?;
    let agent_id = storage.ensure_agent(&Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    })?;

    let stored_user_message = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: Some("user".into()),
        created_at: Some(user_ts),
        content: long_content.clone(),
        extra_json: json!({}),
        snippets: Vec::new(),
    };
    let stored_agent_message = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: Some("assistant".into()),
        created_at: Some(assistant_ts),
        content: short_content.clone(),
        extra_json: json!({}),
        snippets: Vec::new(),
    };
    let stored_conversation = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(tmp.path().to_path_buf()),
        external_id: Some("hydrate-long-content".into()),
        title: Some("hydrated lexical doc".into()),
        source_path: tmp.path().join("hydrate.jsonl"),
        started_at: Some(user_ts),
        ended_at: Some(user_ts),
        approx_tokens: Some(32),
        metadata_json: json!({}),
        messages: vec![stored_user_message, stored_agent_message],
        source_id: crate::sources::provenance::LOCAL_SOURCE_ID.to_string(),
        origin_host: None,
    };
    storage.insert_conversation_tree(agent_id, Some(workspace_id), &stored_conversation)?;
    storage.close()?;

    // --- Tantivy side ------------------------------------------------------
    let mut writer = TantivyIndex::open_or_create(&index_path)?;
    let indexed_user_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("user".into()),
        created_at: Some(user_ts),
        content: long_content.clone(),
        extra: json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let indexed_agent_message = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("assistant".into()),
        created_at: Some(assistant_ts),
        content: short_content.clone(),
        extra: json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let indexed_conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: Some("hydrate-long-content".into()),
        title: Some("hydrated lexical doc".into()),
        workspace: Some(tmp.path().to_path_buf()),
        source_path: tmp.path().join("hydrate.jsonl"),
        started_at: Some(user_ts),
        ended_at: Some(user_ts),
        metadata: json!({}),
        messages: vec![indexed_user_message, indexed_agent_message],
    };
    writer.add_conversation(&indexed_conversation)?;
    writer.commit()?;

    // --- 1. Unbounded search returns the full long content -----------------
    let client = SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");
    let full_hits = client.search("needle", SearchFilters::default(), 5, 0, FieldMask::FULL)?;

    assert_eq!(full_hits.len(), 1, "expected one lexical hit");
    let full_hit = &full_hits[0];
    assert_eq!(full_hit.title, "hydrated lexical doc");
    assert!(
        full_hit
            .content
            .contains("needle appears past the preview boundary"),
        "lexical hit should hydrate full content from sqlite when Tantivy content is not stored"
    );
    assert!(
        full_hit.snippet.to_lowercase().contains("needle"),
        "snippet should still be rendered from hydrated content"
    );

    // --- 2. Bounded search is served from the preview prefix ---------------
    let bounded_mask = FieldMask::FULL.with_preview_content_limit(Some(200));
    let bounded_hits = client.search("needle", SearchFilters::default(), 5, 0, bounded_mask)?;

    assert_eq!(bounded_hits.len(), 1, "expected one lexical hit");
    let bounded_hit = &bounded_hits[0];
    assert!(
        bounded_hit.content.starts_with("padding padding"),
        "bounded content may be served from the stored preview prefix"
    );
    assert!(
        !bounded_hit
            .content
            .contains("needle appears past the preview boundary"),
        "bounded preview content should not hydrate the full sqlite row"
    );

    // --- 3. Short content never needs SQLite --------------------------------
    let short_client =
        SearchClient::open(&index_path, Some(&db_path))?.expect("db-backed client");

    // True only when the lock is acquired and holds no SQLite handle.
    let short_sqlite_closed = || {
        short_client
            .sqlite
            .lock()
            .map(|guard| guard.is_none())
            .unwrap_or(false)
    };

    assert!(
        short_sqlite_closed(),
        "sqlite should start closed for short preview hit"
    );

    let short_hits = short_client.search(
        "shortneedle",
        SearchFilters::default(),
        5,
        0,
        FieldMask::FULL,
    )?;

    assert_eq!(short_hits.len(), 1, "expected one short lexical hit");
    assert_eq!(
        short_hits[0].content, short_content,
        "untruncated stored preview is exact full content"
    );
    assert!(
        short_sqlite_closed(),
        "short full-content hit should not lazy-open sqlite"
    );

    Ok(())
}
15035
/// Indexes one `codex` and one `claude` conversation that both contain
/// `findme`, then checks that a search filtered to agent `codex` returns only
/// `codex` hits, both on the first call and on an identical repeat call.
///
/// The repeat call must return the same number of hits as the first one;
/// without that check an empty repeat result would pass the per-hit loop
/// vacuously.
#[test]
fn filter_fidelity_agent_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (agent, title, file name, timestamp, message body)
    let fixtures = [
        ("codex", "alpha doc", "a.jsonl", 100, "hello world findme alpha"),
        ("claude", "beta doc", "b.jsonl", 200, "hello world findme beta"),
    ];
    for (agent, title, file_name, ts, body) in fixtures {
        let conv = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters::default();
    filters.agents.insert("codex".into());

    let hits = client.search("findme", filters.clone(), 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        assert_eq!(
            hit.agent, "codex",
            "Agent filter violated: got agent '{}' instead of 'codex'",
            hit.agent
        );
    }
    assert!(!hits.is_empty(), "Should have found results");

    // Identical repeat query; the original test labels this the cached path.
    let cached_hits = client.search("findme", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        cached_hits.len(),
        hits.len(),
        "Cached search returned a different number of hits"
    );
    for hit in &cached_hits {
        assert_eq!(hit.agent, "codex", "Cached search violated agent filter");
    }

    Ok(())
}
15114
/// Indexes two otherwise identical conversations in `/workspace/alpha` and
/// `/workspace/beta`, then checks that a search filtered to
/// `/workspace/beta` returns only hits from that workspace, both on the first
/// call and on an identical repeat call.
///
/// The repeat call must return the same number of hits as the first one;
/// without that check an empty repeat result would pass the per-hit loop
/// vacuously.
#[test]
fn filter_fidelity_workspace_filter_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, workspace, file name, timestamp)
    let fixtures = [
        ("ws_a doc", "/workspace/alpha", "a.jsonl", 100),
        ("ws_b doc", "/workspace/beta", "b.jsonl", 200),
    ];
    for (title, workspace, file_name, ts) in fixtures {
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: "workspace test needle".into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let mut filters = SearchFilters::default();
    filters.workspaces.insert("/workspace/beta".into());

    let hits = client.search("needle", filters.clone(), 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        assert_eq!(
            hit.workspace, "/workspace/beta",
            "Workspace filter violated: got '{}' instead of '/workspace/beta'",
            hit.workspace
        );
    }
    assert!(!hits.is_empty(), "Should have found results");

    // Identical repeat query; the original test labels this the cached path.
    let cached_hits = client.search("needle", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        cached_hits.len(),
        hits.len(),
        "Cached search returned a different number of hits"
    );
    for hit in &cached_hits {
        assert_eq!(
            hit.workspace, "/workspace/beta",
            "Cached search violated workspace filter"
        );
    }

    Ok(())
}
15196
/// Indexes three conversations timestamped 100, 500 and 900, then checks that
/// a search restricted to `created_from = 400`, `created_to = 600` returns
/// exactly one hit whose timestamp (when present) lies in `[400, 600]`, both
/// on the first call and on an identical repeat call.
///
/// The repeat call must return the same number of hits as the first one;
/// without that check an empty repeat result would pass the per-hit loop
/// vacuously.
#[test]
fn filter_fidelity_date_range_respected() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (label used for title and file stem, timestamp)
    let fixtures = [("early", 100), ("middle", 500), ("late", 900)];
    for (label, ts) in fixtures {
        let conv = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(label.into()),
            workspace: None,
            source_path: dir.path().join(format!("{label}.jsonl")),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: "date range test".into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conv)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let filters = SearchFilters {
        created_from: Some(400),
        created_to: Some(600),
        ..Default::default()
    };

    let hits = client.search("range", filters.clone(), 10, 0, FieldMask::FULL)?;

    for hit in &hits {
        // Hits without a timestamp are not range-checked here.
        if let Some(ts) = hit.created_at {
            assert!(
                (400..=600).contains(&ts),
                "Date range filter violated: got ts={ts} outside [400, 600]"
            );
        }
    }
    assert_eq!(hits.len(), 1, "Should find exactly 1 doc in range");

    // Identical repeat query; the original test labels this the cached path.
    let cached_hits = client.search("range", filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        cached_hits.len(),
        hits.len(),
        "Cached search returned a different number of hits"
    );
    for hit in &cached_hits {
        if let Some(ts) = hit.created_at {
            assert!(
                (400..=600).contains(&ts),
                "Cached search violated date range filter"
            );
        }
    }

    Ok(())
}
15307
/// Indexes four conversations that differ in agent, workspace and timestamp,
/// then applies an agent + workspace + date-range filter that only the second
/// fixture satisfies. Exactly one hit is expected, on the first call and on
/// the identical repeat call.
#[test]
fn filter_fidelity_combined_filters_respected() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    // (agent, workspace, timestamp); only the second row passes every filter.
    let fixtures = [
        ("codex", "/ws/prod", 100),
        ("claude", "/ws/prod", 500),
        ("claude", "/ws/dev", 500),
        ("claude", "/ws/prod", 900),
    ];

    for (i, (agent, ws, ts)) in fixtures.iter().enumerate() {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(*ts),
            content: "hello world combotest query".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: (*agent).into(),
            external_id: None,
            title: Some(format!("combo-{i}")),
            workspace: Some(std::path::PathBuf::from(*ws)),
            source_path: tmp.path().join(format!("{i}.jsonl")),
            started_at: Some(*ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        writer.add_conversation(&conversation)?;
    }
    writer.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");

    let mut combined = SearchFilters {
        created_from: Some(400),
        created_to: Some(600),
        ..Default::default()
    };
    combined.agents.insert("claude".into());
    combined.workspaces.insert("/ws/prod".into());

    let first_pass = searcher.search("combotest", combined.clone(), 10, 0, FieldMask::FULL)?;

    assert_eq!(
        first_pass.len(),
        1,
        "Combined filter should match exactly 1 doc"
    );

    for hit in first_pass.iter() {
        assert_eq!(hit.agent, "claude", "Agent filter violated");
        assert_eq!(hit.workspace, "/ws/prod", "Workspace filter violated");
        if let Some(ts) = hit.created_at {
            assert!((400..=600).contains(&ts), "Date filter violated: ts={ts}");
        }
    }

    // Identical repeat query must report the same count.
    let second_pass = searcher.search("combotest", combined, 10, 0, FieldMask::FULL)?;
    assert_eq!(second_pass.len(), 1, "Cached result count mismatch");

    Ok(())
}
15375
/// A conversation whose origin metadata carries a padded, upper-case
/// `source_id` (`" LOCAL "`) and kind `local` must come back from a lexical
/// search with `source_id == "local"` and `origin_kind == "local"`.
#[test]
fn lexical_hits_normalize_trimmed_local_source_metadata() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut writer = TantivyIndex::open_or_create(tmp.path())?;

    // Origin block with a source id that needs normalising.
    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " LOCAL ",
                "kind": "local"
            }
        }
    });
    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "trimmed local lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("trimmed local doc".into()),
        workspace: None,
        source_path: tmp.path().join("trimmed-local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![message],
    };
    writer.add_conversation(&conversation)?;
    writer.commit()?;

    let searcher = SearchClient::open(tmp.path(), None)?.expect("index present");
    let found = searcher.search("trimmed", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(found.len(), 1);
    assert_eq!(found[0].source_id, "local");
    assert_eq!(found[0].origin_kind, "local");

    Ok(())
}
15420
#[test]
fn lexical_hits_normalize_remote_origin_kind_without_source_id() -> Result<()> {
    // The origin carries a blank source_id, kind "ssh" and a host; the hit is
    // expected to use the host as source_id and report kind "remote".
    let dir = TempDir::new()?;
    let index_root = dir.path();
    let mut index = TantivyIndex::open_or_create(index_root)?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " ",
                "kind": "ssh",
                "host": "dev@laptop"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "remote lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("remote lexical doc".into()),
        workspace: None,
        source_path: index_root.join("remote-lexical.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(index_root, None)?.expect("index present");
    let results = client.search("remote", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(results.len(), 1);
    let hit = &results[0];
    assert_eq!(hit.source_id, "dev@laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
15467
#[test]
fn lexical_hits_infer_remote_origin_from_host_without_kind() -> Result<()> {
    // The origin has a blank source_id and a host but no "kind" key at all;
    // the hit is still expected to come back as "remote" keyed by the host.
    let dir = TempDir::new()?;
    let index_root = dir.path();
    let mut index = TantivyIndex::open_or_create(index_root)?;

    let origin_metadata = serde_json::json!({
        "cass": {
            "origin": {
                "source_id": " ",
                "host": "dev@laptop"
            }
        }
    });
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "legacy remote lexical".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("legacy host-only lexical doc".into()),
        workspace: None,
        source_path: index_root.join("legacy-host-only-lexical.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: origin_metadata,
        messages: vec![only_message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(index_root, None)?.expect("index present");
    let results = client.search("legacy", SearchFilters::default(), 10, 0, FieldMask::FULL)?;

    assert_eq!(results.len(), 1);
    let hit = &results[0];
    assert_eq!(hit.source_id, "dev@laptop");
    assert_eq!(hit.origin_kind, "remote");
    assert_eq!(hit.origin_host.as_deref(), Some("dev@laptop"));

    Ok(())
}
15513
#[test]
fn filter_fidelity_source_filter_respected() -> Result<()> {
    // Index one conversation with empty metadata, then query it through both
    // `SourceFilter::Local` and a padded, upper-cased `SourceFilter::SourceId`.
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    let conv_local = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("local doc".into()),
        workspace: None,
        source_path: dir.path().join("local.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: "source filter test local".into(),
            extra: serde_json::json!({}),
            snippets: vec![],
            invocations: Vec::new(),
        }],
    };
    index.add_conversation(&conv_local)?;
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    let filters = SearchFilters {
        source_filter: SourceFilter::Local,
        ..Default::default()
    };

    // `filters` is not needed afterwards, so it is moved instead of cloned.
    let hits = client.search("source", filters, 10, 0, FieldMask::FULL)?;

    // Check non-emptiness first so the per-hit loop below cannot pass vacuously.
    assert!(!hits.is_empty(), "Should have found local results");
    for hit in &hits {
        assert_eq!(
            hit.source_id, "local",
            "Source filter violated: got source_id '{}' instead of 'local'",
            hit.source_id
        );
    }

    let filters_id = SearchFilters {
        source_filter: SourceFilter::SourceId(" LOCAL ".to_string()),
        ..Default::default()
    };

    let hits_id = client.search("source", filters_id, 10, 0, FieldMask::FULL)?;
    assert!(
        !hits_id.is_empty(),
        "Should have found results for source_id=local"
    );
    for hit in &hits_id {
        assert_eq!(
            hit.source_id, "local",
            "SourceId filter violated: got '{}' instead of 'local'",
            hit.source_id
        );
    }

    Ok(())
}
15587
#[test]
fn filter_fidelity_cache_key_isolation() {
    // Only `cache_key` is exercised, so the client is assembled by hand with
    // no reader and no SQLite connection.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Three filter sets that differ from one another.
    let unfiltered = SearchFilters::default();

    let mut codex_only = SearchFilters::default();
    codex_only.agents.insert("codex".into());

    let mut workspace_only = SearchFilters::default();
    workspace_only.workspaces.insert("/ws".into());

    let key_empty = client.cache_key("test", &unfiltered);
    let key_agent = client.cache_key("test", &codex_only);
    let key_ws = client.cache_key("test", &workspace_only);

    // Every pair of distinct filter sets must map to distinct keys.
    assert_ne!(
        key_empty, key_agent,
        "Empty vs agent filter keys should differ"
    );
    assert_ne!(
        key_empty, key_ws,
        "Empty vs workspace filter keys should differ"
    );
    assert_ne!(
        key_agent, key_ws,
        "Agent vs workspace filter keys should differ"
    );

    // A second, independently built but equal filter set must reuse the key.
    let mut codex_only_again = SearchFilters::default();
    codex_only_again.agents.insert("codex".into());
    let key_agent_again = client.cache_key("test", &codex_only_again);
    assert_eq!(
        key_agent, key_agent_again,
        "Same filter should produce same key"
    );
}
15639
#[test]
fn sanitize_query_preserves_unicode_alphanumeric() {
    // Each of these inputs must come back from the sanitizer unchanged.
    for text in ["こんにちは", "café", "日本語123"] {
        assert_eq!(fs_cass_sanitize_query(text), text);
    }
}
15654
#[test]
fn sanitize_query_handles_multiple_consecutive_special_chars() {
    // (raw query, expected sanitized form)
    let cases = [("foo---bar", "foo---bar"), ("a!@#$%^&()b", "a b")];
    for (raw, sanitized) in cases {
        assert_eq!(fs_cass_sanitize_query(raw), sanitized);
    }
}
15661
#[test]
fn wildcard_pattern_empty_after_trim_returns_exact_empty() {
    // Patterns made only of asterisks must parse to an exact match on "".
    for stars in ["*", "**", "***"] {
        assert_eq!(
            FsCassWildcardPattern::parse(stars),
            FsCassWildcardPattern::Exact(String::new())
        );
    }
}
15679
#[test]
fn wildcard_pattern_to_regex_generation() {
    // Exact and prefix patterns yield no regex.
    let exact = FsCassWildcardPattern::Exact("foo".into());
    assert_eq!(exact.to_regex(), None);

    let prefix = FsCassWildcardPattern::Prefix("foo".into());
    assert_eq!(prefix.to_regex(), None);

    // Suffix and substring patterns yield the regexes below.
    let suffix = FsCassWildcardPattern::Suffix("foo".into());
    assert_eq!(suffix.to_regex(), Some(".*foo$".into()));

    let substring = FsCassWildcardPattern::Substring("foo".into());
    assert_eq!(substring.to_regex(), Some(".*foo.*".into()));
}
15696
#[test]
fn parse_boolean_query_prefix_minus_not() {
    // A leading '-' on its own term becomes Not followed by the term.
    assert_eq!(
        fs_cass_parse_boolean_query("-world"),
        vec![
            FsCassQueryToken::Not,
            FsCassQueryToken::Term("world".into()),
        ]
    );

    // The same holds when the negated term follows a regular term.
    assert_eq!(
        fs_cass_parse_boolean_query("hello -world"),
        vec![
            FsCassQueryToken::Term("hello".into()),
            FsCassQueryToken::Not,
            FsCassQueryToken::Term("world".into()),
        ]
    );
}
15718
#[test]
fn parse_boolean_query_empty_quoted_phrase_ignored() {
    // A lone empty phrase produces no tokens at all.
    let lone_empty_phrase = parse_boolean_query("\"\"");
    assert!(lone_empty_phrase.is_empty());

    // An empty phrase between two terms leaves just the two terms.
    let surrounded = parse_boolean_query("foo \"\" bar");
    let only_terms: QueryTokenList = vec![
        QueryToken::Term("foo".into()),
        QueryToken::Term("bar".into()),
    ];
    assert_eq!(surrounded, only_terms);
}
15731
#[test]
fn parse_boolean_query_unclosed_quote() {
    // Without a closing quote, the rest of the input is parsed as one phrase.
    let parsed = parse_boolean_query("\"hello world");
    let single_phrase: QueryTokenList = vec![QueryToken::Phrase("hello world".into())];
    assert_eq!(parsed, single_phrase);
}
15739
#[test]
fn transpile_to_fts5_rejects_leading_unary_not_queries() {
    // Both spellings of a leading negation must be refused.
    for query in ["NOT foo", "-foo"] {
        assert_eq!(transpile_to_fts5(query), None);
    }
}
15745
#[test]
fn transpile_to_fts5_rejects_or_not_forms_it_cannot_represent() {
    // Queries mixing OR with NOT in these shapes must be refused.
    for query in ["foo OR NOT bar", "foo NOT bar OR baz"] {
        assert_eq!(transpile_to_fts5(query), None);
    }
}
15751
#[test]
fn transpile_to_fts5_ignores_leading_or() {
    // (input query, expected FTS5 expression)
    let cases = [("OR test", "test"), ("OR foo-bar", "(foo AND bar)")];
    for (query, fts5) in cases {
        assert_eq!(transpile_to_fts5(query).as_deref(), Some(fts5));
    }
}
15760
#[test]
fn transpile_to_fts5_splits_hyphenated_subterms_for_sqlite_fts() {
    // (input query, expected FTS5 expression); the second case keeps the
    // trailing '*' on the last sub-term.
    let cases = [
        ("br-123.jsonl", "(br AND 123 AND jsonl)"),
        ("br-123.json*", "(br AND 123 AND json*)"),
    ];
    for (query, fts5) in cases {
        assert_eq!(transpile_to_fts5(query).as_deref(), Some(fts5));
    }
}
15772
#[test]
fn transpile_to_fts5_preserves_supported_binary_not() {
    // (input query, expected FTS5 expression)
    let cases = [
        ("foo NOT bar", "foo NOT bar"),
        ("foo NOT bar-baz", "foo NOT (bar AND baz)"),
    ];
    for (query, fts5) in cases {
        assert_eq!(transpile_to_fts5(query).as_deref(), Some(fts5));
    }
}
15784
#[test]
fn search_sqlite_fts5_returns_empty_when_sqlite_is_unavailable() {
    // The client has neither a SQLite connection nor a SQLite path.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: false,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: "fts5-disabled".to_string(),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let missing_db = Path::new("/nonexistent");
    let outcome = client.search_sqlite_fts5(
        missing_db,
        "test query",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    );

    // The call must succeed, and succeed with nothing in it.
    assert!(outcome.is_ok(), "disabled FTS5 path should stay non-fatal");
    let rows = outcome.unwrap();
    assert!(
        rows.is_empty(),
        "unavailable SQLite fallback should keep returning an empty result set"
    );
}
15819
#[test]
fn search_sqlite_fts5_rank_and_hydrate_split_preserves_limit_prefix_invariant() -> Result<()> {
    // Build an in-memory database with the tables the FTS5 path reads.
    let conn = Connection::open(":memory:")?;
    conn.execute_batch(
        "CREATE TABLE sources (id TEXT PRIMARY KEY, kind TEXT);
         CREATE TABLE agents (id INTEGER PRIMARY KEY, slug TEXT NOT NULL UNIQUE);
         CREATE TABLE workspaces (id INTEGER PRIMARY KEY, path TEXT NOT NULL UNIQUE);
         CREATE TABLE conversations (
             id INTEGER PRIMARY KEY,
             agent_id INTEGER,
             workspace_id INTEGER,
             source_id TEXT,
             origin_host TEXT,
             title TEXT,
             source_path TEXT
         );
         CREATE TABLE messages (
             id INTEGER PRIMARY KEY,
             conversation_id INTEGER,
             idx INTEGER,
             content TEXT,
             created_at INTEGER
         );
         CREATE VIRTUAL TABLE fts_messages USING fts5(
             content,
             title,
             agent,
             workspace,
             source_path,
             created_at UNINDEXED,
             message_id UNINDEXED,
             tokenize='porter'
         );",
    )?;
    conn.execute("INSERT INTO sources(id, kind) VALUES('local', 'local')")?;
    conn.execute("INSERT INTO agents(id, slug) VALUES(1, 'codex')")?;
    conn.execute("INSERT INTO workspaces(id, path) VALUES(1, '/tmp/k0e5p')")?;

    // Six conversations with one message each. Message number `ordinal`
    // (0-based) repeats the probe word `ordinal + 1` times, so every row has
    // a different term frequency.
    for ordinal in 0..6_i64 {
        let conv_id = ordinal + 1;
        let msg_id = conv_id * 10;
        let title = format!("k0e5p-{}", ordinal);
        let source_path = format!("/tmp/k0e5p/{}.jsonl", ordinal);
        let created_at = 1_700_000_000_i64 + ordinal;
        let content = "rankprobe ".repeat(conv_id as usize);

        conn.execute_compat(
            "INSERT INTO conversations(id, agent_id, workspace_id, source_id, \
             origin_host, title, source_path) \
             VALUES(?1, 1, 1, 'local', NULL, ?2, ?3)",
            params![conv_id, title.as_str(), source_path.as_str()],
        )?;
        conn.execute_compat(
            "INSERT INTO messages(id, conversation_id, idx, content, created_at) \
             VALUES(?1, ?2, ?3, ?4, ?5)",
            params![msg_id, conv_id, ordinal, content.as_str(), created_at],
        )?;
        conn.execute_compat(
            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, \
             source_path, created_at, message_id) \
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
            params![
                msg_id,
                content.as_str(),
                title.as_str(),
                "codex",
                "/tmp/k0e5p",
                source_path.as_str(),
                created_at,
                msg_id,
            ],
        )?;
    }

    // Hand-built client that owns the populated connection and has no reader.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(Some(SendConnection(conn))),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: false,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:k0e5p"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    // Identity of a hit for ordering comparisons: (source path, line number).
    fn hit_keys(hits: &[SearchHit]) -> Vec<(String, Option<usize>)> {
        let mut keys = Vec::with_capacity(hits.len());
        for hit in hits {
            keys.push((hit.source_path.clone(), hit.line_number));
        }
        keys
    }

    // Same query every time; only the limit varies.
    let probe = |limit| {
        client.search_sqlite_fts5(
            Path::new(":memory:"),
            "rankprobe",
            SearchFilters::default(),
            limit,
            0,
            FieldMask::FULL,
        )
    };

    let large_hits = probe(6)?;
    assert_eq!(
        large_hits.len(),
        6,
        "limit=N must return all N candidates when the corpus has exactly N matches"
    );

    let small_hits = probe(3)?;
    assert_eq!(small_hits.len(), 3, "limit=3 must return exactly 3 hits");

    // The smaller page must be exactly the head of the larger page.
    let large_keys = hit_keys(&large_hits);
    let small_keys = hit_keys(&small_hits);
    let large_prefix = &large_keys[..3];
    assert_eq!(
        small_keys,
        large_prefix,
        "limit=3 hit keys MUST be the first 3 of limit=6 hit keys (rank+hydrate \
         split must not re-order or re-filter); small={small_keys:?} \
         large_prefix={:?}",
        large_prefix
    );

    // Hydrated fields must agree position by position as well.
    for (idx, (small, large)) in small_hits.iter().zip(&large_hits).enumerate() {
        assert_eq!(
            small.content, large.content,
            "hit[{idx}] content must agree across limit=3 and limit=6: \
             small={:?} large={:?}",
            small.content, large.content
        );
        assert_eq!(
            small.title, large.title,
            "hit[{idx}] title must agree across limit=3 and limit=6"
        );
    }

    let zero_hits = probe(0)?;
    assert!(
        zero_hits.is_empty(),
        "limit=0 must return zero hits even though the rank phase has candidates; \
         got {} hits",
        zero_hits.len()
    );

    Ok(())
}
16029
#[test]
fn levenshtein_distance_identical_strings() {
    // A string is at distance zero from itself, including the empty string.
    for text in ["hello", ""] {
        assert_eq!(levenshtein_distance(text, text), 0);
    }
}
16037
#[test]
fn levenshtein_distance_insertions() {
    // (from, to, expected distance) where `to` only adds characters.
    let cases = [("", "abc", 3), ("cat", "cats", 1)];
    for (from, to, distance) in cases {
        assert_eq!(levenshtein_distance(from, to), distance);
    }
}
16043
#[test]
fn levenshtein_distance_deletions() {
    // (from, to, expected distance) where `to` only drops characters.
    let cases = [("abc", "", 3), ("cats", "cat", 1)];
    for (from, to, distance) in cases {
        assert_eq!(levenshtein_distance(from, to), distance);
    }
}
16049
#[test]
fn levenshtein_distance_substitutions() {
    // (from, to, expected distance) where exactly one character is replaced.
    let cases = [("cat", "bat", 1), ("kitten", "sitten", 1)];
    for (from, to, distance) in cases {
        assert_eq!(levenshtein_distance(from, to), distance);
    }
}
16055
#[test]
fn levenshtein_distance_mixed_operations() {
    // (from, to, expected distance) for pairs needing several edit kinds.
    let cases = [("kitten", "sitting", 3), ("saturday", "sunday", 3)];
    for (from, to, distance) in cases {
        assert_eq!(levenshtein_distance(from, to), distance);
    }
}
16061
#[test]
fn is_tool_invocation_noise_allows_real_content() {
    // The last sample starts with a tool marker but carries a long
    // explanation after it; none of the three may be classified as noise.
    let long_content = "[Tool: Read] Now here is a lot of useful content that explains the implementation details and provides context for the changes being made to the codebase.";
    let real_messages = [
        "This is a normal message",
        "Let me use the Tool feature to accomplish this task. Here is the implementation...",
        long_content,
    ];
    for message in real_messages {
        assert!(!is_tool_invocation_noise(message));
    }
}
16074
#[test]
fn is_tool_invocation_noise_handles_short_tool_markers() {
    // Short, marker-only messages must be classified as noise.
    for marker in ["[tool: x]", "tool: bash"] {
        assert!(is_tool_invocation_noise(marker));
    }
}
16080
#[test]
fn search_boolean_and_filters_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body). Both bodies contain
    // "alpha"; only one contains "beta" and only the other contains "delta".
    let fixtures = [
        ("doc1", "1.jsonl", 1, "alpha beta gamma"),
        ("doc2", "2.jsonl", 2, "alpha delta"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // "alpha AND beta" must select only the first document.
    let with_beta = client.search(
        "alpha AND beta",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(with_beta.len(), 1);
    assert!(with_beta[0].content.contains("gamma"));

    // "alpha AND delta" must select only the second document.
    let with_delta = client.search(
        "alpha AND delta",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(with_delta.len(), 1);
    assert!(with_delta[0].content.contains("delta"));

    Ok(())
}
16159
#[test]
fn search_boolean_or_expands_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body). Each body holds exactly
    // one of the two words used in the OR query.
    let fixtures = [
        ("doc1", "1.jsonl", 1, "unique xyzzy term"),
        ("doc2", "2.jsonl", 2, "unique plugh term"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Either word is enough, so both documents must be returned.
    let either_word = client.search(
        "xyzzy OR plugh",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(either_word.len(), 2);

    Ok(())
}
16223
#[test]
fn search_boolean_not_excludes_results() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body). Both bodies contain
    // "nottest"; only the second contains "exclude".
    let fixtures = [
        ("doc1", "1.jsonl", 1, "nottest keep this"),
        ("doc2", "2.jsonl", 2, "nottest exclude this"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Keyword form of the negation.
    let keyword_not = client.search(
        "nottest NOT exclude",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(keyword_not.len(), 1);
    assert!(
        !keyword_not[0].content.contains("exclude"),
        "NOT exclude should filter out doc with 'exclude'"
    );

    // Prefix-minus form of the negation.
    let minus_not = client.search(
        "nottest -exclude",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(minus_not.len(), 1);
    assert!(
        !minus_not[0].content.contains("exclude"),
        "Prefix -exclude should filter out doc with 'exclude'"
    );

    Ok(())
}
16306
#[test]
fn search_phrase_query_matches_exact_sequence() -> Result<()> {
    let dir = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(dir.path())?;

    // (title, file name, timestamp, message body). The two bodies hold the
    // same words, with "quick" and "brown" in opposite order.
    let fixtures = [
        ("doc1", "1.jsonl", 1, "the quick brown fox"),
        ("doc2", "2.jsonl", 2, "the brown quick fox"),
    ];
    for (title, file_name, ts, body) in fixtures {
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(title.into()),
            workspace: None,
            source_path: dir.path().join(file_name),
            started_at: Some(ts),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![NormalizedMessage {
                idx: 0,
                role: "user".into(),
                author: None,
                created_at: Some(ts),
                content: body.into(),
                extra: serde_json::json!({}),
                snippets: vec![],
                invocations: Vec::new(),
            }],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(dir.path(), None)?.expect("index present");

    // Unquoted: word order is irrelevant, so both documents match.
    let unquoted = client.search(
        "quick brown",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(unquoted.len(), 2);

    // Quoted: only the document with the exact word sequence matches.
    let quoted = client.search(
        "\"quick brown\"",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(quoted.len(), 1);
    assert!(quoted[0].content.contains("quick brown"));

    Ok(())
}
16381
#[test]
fn search_dot_punctuation_splits_terms_but_hyphens_preserve_compound_semantics() -> Result<()> {
    let dir = TempDir::new()?;
    let index_root = dir.path();
    let mut index = TantivyIndex::open_or_create(index_root)?;

    // One document whose body is three space-separated words.
    let only_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(1),
        content: "foo bar baz".into(),
        extra: serde_json::json!({}),
        snippets: vec![],
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("doc".into()),
        workspace: None,
        source_path: index_root.join("3.jsonl"),
        started_at: Some(1),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![only_message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(index_root, None)?.expect("index present");

    // The dotted query finds the document.
    let dotted = client.search("foo.bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(dotted.len(), 1);

    // The hyphenated query does not.
    let hyphenated = client.search("foo-bar", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(hyphenated.len(), 0);

    Ok(())
}
16420
#[test]
fn explanation_classifies_simple_query() {
    // A single bare word must be classified as a simple, low-cost query
    // served by the edge-ngram strategy, with one parsed term.
    let exp = QueryExplanation::analyze("hello", &SearchFilters::default());
    assert_eq!(exp.query_type, QueryType::Simple);
    assert_eq!(exp.index_strategy, IndexStrategy::EdgeNgram);
    assert_eq!(exp.estimated_cost, QueryCost::Low);
    // `assert_eq!` reports the actual term count on failure.
    assert_eq!(exp.parsed.terms.len(), 1);
    assert_eq!(exp.parsed.terms[0].text, "hello");
    assert!(!exp.parsed.terms[0].subterms.is_empty());
    assert_eq!(exp.parsed.terms[0].subterms[0].pattern, "exact");
}
16436
#[test]
fn explanation_classifies_wildcard_query() {
    // A term wrapped in asterisks must be reported as a high-cost wildcard
    // query using the regex-scan strategy, with a matching warning.
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("*handler*", &no_filters);

    assert_eq!(exp.query_type, QueryType::Wildcard);
    assert_eq!(exp.index_strategy, IndexStrategy::RegexScan);
    assert_eq!(exp.estimated_cost, QueryCost::High);

    let first_term = &exp.parsed.terms[0];
    assert!(!first_term.subterms.is_empty());
    assert!(first_term.subterms[0].pattern.contains("substring"));

    let mentions_regex_scan = exp.warnings.iter().any(|w| w.contains("regex scan"));
    assert!(mentions_regex_scan);
}
16451
#[test]
fn explanation_classifies_boolean_query() {
    // An explicit AND must yield a boolean query that records the operator.
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("foo AND bar", &no_filters);

    assert_eq!(exp.query_type, QueryType::Boolean);
    assert_eq!(exp.index_strategy, IndexStrategy::BooleanCombination);

    let and_operator = "AND".to_string();
    assert!(exp.parsed.operators.contains(&and_operator));
}
16459
#[test]
fn explanation_classifies_phrase_query() {
    // A quoted query must be classified as a phrase and keep its inner text.
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("\"exact phrase\"", &no_filters);

    assert_eq!(exp.query_type, QueryType::Phrase);

    let phrase = "exact phrase".to_string();
    assert!(exp.parsed.phrases.contains(&phrase));
}
16466
#[test]
fn explanation_handles_filtered_query() {
    // With one agent filter set, the explanation must be typed as filtered
    // and mention the agent in both the summary and the warnings.
    let mut agent_filter = SearchFilters::default();
    agent_filter.agents.insert("codex".to_string());

    let exp = QueryExplanation::analyze("test", &agent_filter);
    assert_eq!(exp.query_type, QueryType::Filtered);

    let summary = &exp.filters_summary;
    assert_eq!(summary.agent_count, 1);
    let description = summary.description.as_ref().unwrap();
    assert!(description.contains("1 agent"));

    let names_agent = exp.warnings.iter().any(|w| w.contains("codex"));
    assert!(names_agent);
}
16484
#[test]
fn explanation_handles_empty_query() {
    // An empty query string must map to the full-scan strategy at high cost
    // and carry an "Empty query" warning.
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("", &no_filters);

    assert_eq!(exp.query_type, QueryType::Empty);
    assert_eq!(exp.index_strategy, IndexStrategy::FullScan);
    assert_eq!(exp.estimated_cost, QueryCost::High);

    let warns_empty = exp.warnings.iter().any(|w| w.contains("Empty query"));
    assert!(warns_empty);
}
16493
#[test]
fn explanation_warns_short_terms() {
    // A one-character query must trigger the short-term warning.
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze("a", &no_filters);

    let warns_short = exp.warnings.iter().any(|w| w.contains("Very short term"));
    assert!(warns_short);
}
16499
#[test]
fn explanation_with_wildcard_fallback() {
    // Enabling the fallback must set the flag and add a warning about it.
    let no_filters = SearchFilters::default();
    let base = QueryExplanation::analyze("test", &no_filters);
    let exp = base.with_wildcard_fallback(true);

    assert!(exp.wildcard_applied);
    let warns_fallback = exp.warnings.iter().any(|w| w.contains("Wildcard fallback"));
    assert!(warns_fallback);
}
16508
#[test]
fn explanation_complex_query_has_higher_cost() {
    // A query mixing AND, OR, NOT and a phrase must not be rated low cost.
    let complex_query = "foo AND bar OR baz NOT qux AND \"phrase here\"";
    let no_filters = SearchFilters::default();
    let exp = QueryExplanation::analyze(complex_query, &no_filters);

    assert_eq!(exp.query_type, QueryType::Boolean);
    assert!(matches!(
        exp.estimated_cost,
        QueryCost::Medium | QueryCost::High
    ));
}
16522
#[test]
fn explanation_preserves_original_query() {
    // The raw input is kept verbatim, while the sanitized form keeps the
    // word but drops the exclamation mark.
    let raw_query = "Hello World!";
    let exp = QueryExplanation::analyze(raw_query, &SearchFilters::default());

    assert_eq!(exp.original_query, "Hello World!");

    let sanitized = &exp.sanitized_query;
    assert!(sanitized.contains("Hello"));
    assert!(!sanitized.contains("!"));
}
16532
/// `NOT` must be recorded as an operator and the term after it flagged as negated.
#[test]
fn explanation_detects_not_operator() {
    let explanation = QueryExplanation::analyze("foo NOT bar", &SearchFilters::default());
    let parsed = &explanation.parsed;

    assert!(parsed.operators.contains(&"NOT".to_string()));

    let bar_is_negated = parsed
        .terms
        .iter()
        .any(|term| term.negated && term.text == "bar");
    assert!(bar_is_negated);
}
16545
/// Two bare terms are parsed as two terms joined by an implicit AND.
#[test]
fn explanation_implicit_and() {
    let explanation = QueryExplanation::analyze("foo bar", &SearchFilters::default());
    let parsed = &explanation.parsed;

    assert!(parsed.implicit_and);
    assert_eq!(parsed.terms.len(), 2);
}
16552
/// The explanation serializes to JSON with string-valued summary fields and an array of
/// parsed terms.
#[test]
fn explanation_serializes_to_json() {
    let explanation = QueryExplanation::analyze("test query", &SearchFilters::default());
    let json = serde_json::to_value(&explanation).expect("should serialize");

    // Top-level summary fields are expected to be rendered as JSON strings.
    let is_string = |key: &str| json[key].is_string();
    assert!(is_string("original_query"));
    assert!(is_string("query_type"));
    assert!(is_string("index_strategy"));
    assert!(is_string("estimated_cost"));

    assert!(json["parsed"]["terms"].is_array());
}
16563
/// Agent, workspace, and created-at range filters combine conjunctively: of four indexed
/// conversations only the codex + `/ws/alpha` one inside the time window may be returned.
#[test]
fn search_multi_filter_agent_workspace_time() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    // (agent, workspace, timestamp, message content)
    let fixtures = [
        ("codex", "/ws/alpha", 100, "needle alpha codex"),
        ("claude", "/ws/alpha", 200, "needle alpha claude"),
        ("codex", "/ws/beta", 150, "needle beta codex"),
        ("codex", "/ws/alpha", 300, "needle alpha codex late"),
    ];

    for (n, &(agent, workspace, timestamp, content)) in fixtures.iter().enumerate() {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: content.into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("conv-{n}")),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: tmp.path().join(format!("{n}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    let mut combined = SearchFilters::default();
    combined.agents.insert("codex".into());
    combined.workspaces.insert("/ws/alpha".into());
    combined.created_from = Some(50);
    combined.created_to = Some(250);

    let hits = client.search("needle", combined, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        hits.len(),
        1,
        "Should match only one conv (codex + alpha + ts=100)"
    );

    let only_hit = &hits[0];
    assert_eq!(only_hit.agent, "codex");
    assert_eq!(only_hit.workspace, "/ws/alpha");
    assert!(only_hit.content.contains("alpha codex"));
    assert!(!only_hit.content.contains("late"));

    Ok(())
}
16629
/// Several agents in the filter act as a union: only conversations from the listed
/// agents are returned.
#[test]
fn search_multi_agent_filter() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    for agent in ["codex", "claude", "cline", "gemini"] {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: format!("needle from {agent}"),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(format!("{agent}-conv")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: tmp.path().join(format!("{agent}.jsonl")),
            started_at: Some(100),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    let mut two_agents = SearchFilters::default();
    two_agents.agents.insert("codex".into());
    two_agents.agents.insert("claude".into());

    let hits = client.search("needle", two_agents, 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 2);

    let returned_agents: Vec<_> = hits.iter().map(|hit| hit.agent.as_str()).collect();
    for wanted in ["codex", "claude"] {
        assert!(returned_agents.contains(&wanted));
    }
    for excluded in ["cline", "gemini"] {
        assert!(!returned_agents.contains(&excluded));
    }

    Ok(())
}
16678
/// Metric counters start at zero and each `inc_*` call bumps exactly its own counter.
#[test]
fn cache_metrics_incremented_on_operations() {
    // Reader-less client: only the metrics field is exercised here.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };
    let metrics = &client.metrics;

    let (hit_count, miss_count, shortfall_count, reload_count, _) = metrics.snapshot_all();
    assert_eq!(hit_count, 0);
    assert_eq!(miss_count, 0);
    assert_eq!(shortfall_count, 0);
    assert_eq!(reload_count, 0);

    // Two hits, and one each of miss / shortfall / reload.
    metrics.inc_cache_hits();
    metrics.inc_cache_hits();
    metrics.inc_cache_miss();
    metrics.inc_cache_shortfall();
    metrics.inc_reload();

    let (hit_count, miss_count, shortfall_count, reload_count, _) = metrics.snapshot_all();
    assert_eq!(
        (hit_count, miss_count, shortfall_count, reload_count),
        (2, 1, 1, 1)
    );
}
16719
/// `shard_name` is stable for equal filters, differs between unfiltered and agent-filtered
/// requests, and renders a lone workspace filter as `workspace:<path>`.
#[test]
fn cache_shard_name_deterministic() {
    // Reader-less client: shard naming is the only behavior exercised here.
    let client = SearchClient {
        reader: None,
        sqlite: Mutex::new(None),
        sqlite_path: None,
        prefix_cache: Mutex::new(CacheShards::new(*CACHE_TOTAL_CAP, *CACHE_BYTE_CAP)),
        reload_on_search: true,
        last_reload: Mutex::new(None),
        last_generation: Mutex::new(None),
        reload_epoch: Arc::new(AtomicU64::new(0)),
        warm_tx: None,
        _warm_handle: None,
        metrics: Metrics::default(),
        cache_namespace: format!("v{CACHE_KEY_VERSION}|schema:test"),
        semantic: Mutex::new(None),
        last_tantivy_total_count: Mutex::new(None),
    };

    let unfiltered = SearchFilters::default();
    let mut codex_only = SearchFilters::default();
    codex_only.agents.insert("codex".into());
    let mut workspace_only = SearchFilters::default();
    workspace_only
        .workspaces
        .insert("/tmp/cass-workspace".into());

    let unfiltered_first = client.shard_name(&unfiltered);
    let unfiltered_again = client.shard_name(&unfiltered);
    assert_eq!(
        unfiltered_first, unfiltered_again,
        "Same filters should produce same shard name"
    );

    let codex_shard = client.shard_name(&codex_only);
    assert_ne!(
        unfiltered_first, codex_shard,
        "Different filters should produce different shard names"
    );

    assert_eq!(codex_shard, client.shard_name(&codex_only));
    assert_eq!(
        client.shard_name(&workspace_only),
        "workspace:/tmp/cass-workspace"
    );
}
16768
/// `search_with_fallback` must keep honoring the agent filter: with a codex and a claude
/// conversation both containing the term, every returned hit belongs to codex.
#[test]
fn wildcard_fallback_respects_filter_constraints() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    // (agent, title, workspace, file name, message content)
    let fixtures = [
        (
            "codex",
            "match",
            "/target",
            "match.jsonl",
            "unique specific term here",
        ),
        (
            "claude",
            "other",
            "/other",
            "other.jsonl",
            "unique specific also here",
        ),
    ];

    for (agent, title, workspace, file_name, content) in fixtures {
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(100),
            content: content.into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: agent.into(),
            external_id: None,
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from(workspace)),
            source_path: tmp.path().join(file_name),
            started_at: Some(100),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    let mut codex_only = SearchFilters::default();
    codex_only.agents.insert("codex".into());

    let outcome =
        client.search_with_fallback("unique", codex_only.clone(), 10, 0, 100, FieldMask::FULL)?;
    let all_codex = outcome.hits.iter().all(|hit| hit.agent == "codex");
    assert!(all_codex);

    Ok(())
}
16838
/// Searching for the prefix "auth" through `search_with_fallback` must find a document
/// whose content only contains longer words starting with that prefix.
#[test]
fn wildcard_fallback_short_query_triggers_prefix() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "authentication authorization oauth".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "codex".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: None,
        source_path: tmp.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    let no_filters = SearchFilters::default();
    let outcome = client.search_with_fallback("auth", no_filters, 10, 0, 100, FieldMask::FULL)?;
    assert!(
        !outcome.hits.is_empty(),
        "Short prefix should match via prefix search"
    );
    assert!(outcome.hits[0].content.contains("auth"));

    Ok(())
}
16886
/// Indexes one realistic three-message conversation (user, assistant with a snippet,
/// user follow-up) and checks that text from each message is searchable.
#[test]
fn search_real_fixture_multiple_messages() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    let question = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: Some("developer".into()),
        created_at: Some(1700000000000),
        content: "Help me implement JWT authentication for my Express API".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let answer = NormalizedMessage {
        idx: 1,
        role: "assistant".into(),
        author: Some("claude".into()),
        created_at: Some(1700000010000),
        content: "I'll help you implement JWT authentication. First, let's install the required packages.".into(),
        extra: serde_json::json!({}),
        snippets: vec![NormalizedSnippet {
            file_path: Some("package.json".into()),
            start_line: Some(1),
            end_line: Some(5),
            language: Some("json".into()),
            snippet_text: Some(r#"{"dependencies":{"jsonwebtoken":"^9.0.0"}}"#.into()),
        }],
        invocations: Vec::new(),
    };
    let follow_up = NormalizedMessage {
        idx: 2,
        role: "user".into(),
        author: Some("developer".into()),
        created_at: Some(1700000030000),
        content: "Can you also add refresh token support?".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };

    let conversation = NormalizedConversation {
        agent_slug: "claude_code".into(),
        external_id: Some("conv-123".into()),
        title: Some("Implementing authentication".into()),
        workspace: Some(std::path::PathBuf::from("/home/user/project")),
        source_path: tmp.path().join("session-1.jsonl"),
        started_at: Some(1700000000000),
        ended_at: Some(1700000060000),
        metadata: serde_json::json!({
            "model": "claude-3-sonnet",
            "tokens": 1500
        }),
        messages: vec![question, answer, follow_up],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    // Terms from the first user message (also present in the assistant reply).
    let jwt_hits = client.search(
        "JWT authentication",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert!(!jwt_hits.is_empty(), "Should find JWT authentication");
    assert!(jwt_hits.iter().any(|hit| hit.agent == "claude_code"));
    let snippet_mentions_query = jwt_hits
        .iter()
        .any(|hit| hit.snippet.contains("JWT") || hit.snippet.contains("authentication"));
    assert!(snippet_mentions_query);

    // Terms that only occur in the assistant reply.
    let package_hits = client.search(
        "required packages",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert!(
        !package_hits.is_empty(),
        "Should find 'required packages' in assistant response"
    );

    // Terms that only occur in the follow-up user message.
    let refresh_hits = client.search(
        "refresh token",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert!(!refresh_hits.is_empty(), "Should find refresh token");
    assert!(refresh_hits.iter().any(|hit| hit.content.contains("refresh")));

    Ok(())
}
16994
/// Two conversations with identical message text must still yield at least one hit from
/// `search_with_fallback`; the exact hit count is intentionally not asserted here.
#[test]
fn search_deduplication_with_similar_content() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    for n in 0..2 {
        let timestamp = 100 + n;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: "implement the sorting algorithm".into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: None,
            title: Some(format!("similar-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: tmp.path().join(format!("similar-{n}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");
    let no_filters = SearchFilters::default();
    let outcome =
        client.search_with_fallback("sorting algorithm", no_filters, 10, 0, 100, FieldMask::FULL)?;

    assert!(!outcome.hits.is_empty());

    Ok(())
}
17043
/// A `session_paths` filter restricts hits to the listed source paths: with three
/// sessions indexed and two requested, exactly those two come back.
#[test]
fn search_session_paths_filter() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    let session_files = [
        tmp.path().join("session-a.jsonl"),
        tmp.path().join("session-b.jsonl"),
        tmp.path().join("session-c.jsonl"),
    ];

    for (n, session_file) in session_files.iter().enumerate() {
        let timestamp = 100 + n as i64;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: format!("needle content for session {n}"),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let conversation = NormalizedConversation {
            agent_slug: "claude".into(),
            external_id: None,
            title: Some(format!("session-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: session_file.clone(),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&conversation)?;
    }
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    // Baseline without any filter.
    let unfiltered_hits =
        client.search("needle", SearchFilters::default(), 10, 0, FieldMask::FULL)?;
    assert_eq!(unfiltered_hits.len(), 3, "Should find all 3 sessions");

    // Request sessions A and C only.
    let mut only_a_and_c = SearchFilters::default();
    for wanted in [&session_files[0], &session_files[2]] {
        only_a_and_c
            .session_paths
            .insert(wanted.to_string_lossy().to_string());
    }

    let filtered_hits = client.search("needle", only_a_and_c, 10, 0, FieldMask::FULL)?;
    assert_eq!(
        filtered_hits.len(),
        2,
        "Should find only 2 sessions (A and C)"
    );

    let returned_paths: HashSet<&str> = filtered_hits
        .iter()
        .map(|hit| hit.source_path.as_str())
        .collect();
    assert!(returned_paths.contains(session_files[0].to_string_lossy().as_ref()));
    assert!(returned_paths.contains(session_files[2].to_string_lossy().as_ref()));
    assert!(!returned_paths.contains(session_files[1].to_string_lossy().as_ref()));

    Ok(())
}
17119
/// With a page size of one and four distractor documents that repeat the search term,
/// a `session_paths` filter must still return the single requested session.
#[test]
fn lexical_session_paths_filter_retries_past_initial_page() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;
    let wanted_path = tmp.path().join("requested-session.jsonl");

    // Distractors repeat the term three times each.
    for n in 0_i64..4 {
        let timestamp = 100 + n;
        let message = NormalizedMessage {
            idx: 0,
            role: "user".into(),
            author: None,
            created_at: Some(timestamp),
            content: "needle needle needle high ranking distractor".into(),
            extra: serde_json::json!({}),
            snippets: Vec::new(),
            invocations: Vec::new(),
        };
        let distractor = NormalizedConversation {
            agent_slug: "claude".into(),
            external_id: None,
            title: Some(format!("distractor-{n}")),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: tmp.path().join(format!("distractor-{n}.jsonl")),
            started_at: Some(timestamp),
            ended_at: None,
            metadata: serde_json::json!({}),
            messages: vec![message],
        };
        index.add_conversation(&distractor)?;
    }

    let wanted_message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(200),
        content: "needle requested session should survive post-filter paging".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let wanted = NormalizedConversation {
        agent_slug: "claude".into(),
        external_id: None,
        title: Some("requested".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: wanted_path.clone(),
        started_at: Some(200),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![wanted_message],
    };
    index.add_conversation(&wanted)?;
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");
    let mut only_wanted = SearchFilters::default();
    only_wanted
        .session_paths
        .insert(wanted_path.to_string_lossy().to_string());

    let hits = client.search("needle", only_wanted, 1, 0, FieldMask::FULL)?;

    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].source_path, wanted_path.to_string_lossy());

    Ok(())
}
17186
/// Default filters carry an empty `session_paths` set, which must not exclude anything.
#[test]
fn search_session_paths_empty_filter_returns_all() -> Result<()> {
    let tmp = TempDir::new()?;
    let mut index = TantivyIndex::open_or_create(tmp.path())?;

    let message = NormalizedMessage {
        idx: 0,
        role: "user".into(),
        author: None,
        created_at: Some(100),
        content: "needle content".into(),
        extra: serde_json::json!({}),
        snippets: Vec::new(),
        invocations: Vec::new(),
    };
    let conversation = NormalizedConversation {
        agent_slug: "claude".into(),
        external_id: None,
        title: Some("test".into()),
        workspace: Some(std::path::PathBuf::from("/ws")),
        source_path: tmp.path().join("test.jsonl"),
        started_at: Some(100),
        ended_at: None,
        metadata: serde_json::json!({}),
        messages: vec![message],
    };
    index.add_conversation(&conversation)?;
    index.commit()?;

    let client = SearchClient::open(tmp.path(), None)?.expect("index present");

    let default_filters = SearchFilters::default();
    assert!(default_filters.session_paths.is_empty());

    let hits = client.search("needle", default_filters, 10, 0, FieldMask::FULL)?;
    assert_eq!(hits.len(), 1);

    Ok(())
}
17227
/// Publishes two independently built shards as one federated index and checks that a
/// client opened on it sees all four messages and returns them in a repeatable order.
#[test]
fn search_client_reads_federated_lexical_bundle_as_one_corpus() -> Result<()> {
    let root = TempDir::new()?;
    let shard_a_dir = root.path().join("shard-a");
    let shard_b_dir = root.path().join("shard-b");
    let published_dir = root.path().join("published");

    let mut index_a = TantivyIndex::open_or_create(&shard_a_dir)?;
    let mut index_b = TantivyIndex::open_or_create(&shard_b_dir)?;

    // Each conversation has a user and an assistant message sharing the query terms.
    let build_conversation =
        |external_id: &str, title: &str, source_path: &str, tag: &str| NormalizedConversation {
            agent_slug: "codex".into(),
            external_id: Some(external_id.into()),
            title: Some(title.into()),
            workspace: Some(std::path::PathBuf::from("/ws")),
            source_path: std::path::PathBuf::from(source_path),
            started_at: Some(1_700_000_100_000),
            ended_at: Some(1_700_000_100_100),
            metadata: json!({}),
            messages: vec![
                NormalizedMessage {
                    idx: 0,
                    role: "user".into(),
                    author: None,
                    created_at: Some(1_700_000_100_010),
                    content: format!("shared federated needle {tag} user"),
                    extra: json!({}),
                    snippets: Vec::new(),
                    invocations: Vec::new(),
                },
                NormalizedMessage {
                    idx: 1,
                    role: "assistant".into(),
                    author: None,
                    created_at: Some(1_700_000_100_020),
                    content: format!("shared federated needle {tag} assistant"),
                    extra: json!({}),
                    snippets: Vec::new(),
                    invocations: Vec::new(),
                },
            ],
        };

    let conversation_a = build_conversation(
        "fed-query-a",
        "Fed Query A",
        "/tmp/fed-query-a.jsonl",
        "alpha",
    );
    let conversation_b = build_conversation(
        "fed-query-b",
        "Fed Query B",
        "/tmp/fed-query-b.jsonl",
        "beta",
    );

    index_a.add_conversation(&conversation_a)?;
    index_b.add_conversation(&conversation_b)?;
    index_a.commit()?;
    index_b.commit()?;
    // Release both shard handles before the shard directories are published.
    drop(index_a);
    drop(index_b);

    crate::search::tantivy::publish_federated_searchable_index_directories(
        &published_dir,
        &[&shard_a_dir, &shard_b_dir],
    )?;

    let client = SearchClient::open(&published_dir, None)?.expect("federated index present");
    assert!(client.has_tantivy());
    assert_eq!(client.total_docs(), 4);

    let first_hits = client.search(
        "shared federated needle",
        SearchFilters::default(),
        10,
        0,
        FieldMask::FULL,
    )?;
    assert_eq!(first_hits.len(), 4);

    // Path, line, content and bit-exact score together identify the ordering.
    let baseline_order: Vec<_> = first_hits
        .iter()
        .map(|hit| {
            (
                hit.source_path.clone(),
                hit.line_number,
                hit.content.clone(),
                hit.score.to_bits(),
            )
        })
        .collect();

    let seen_paths: std::collections::HashSet<_> = first_hits
        .iter()
        .map(|hit| hit.source_path.as_str())
        .collect();
    assert!(seen_paths.contains("/tmp/fed-query-a.jsonl"));
    assert!(seen_paths.contains("/tmp/fed-query-b.jsonl"));

    for attempt in 0..3 {
        let again = client.search(
            "shared federated needle",
            SearchFilters::default(),
            10,
            0,
            FieldMask::FULL,
        )?;
        let repeated_order: Vec<_> = again
            .iter()
            .map(|hit| {
                (
                    hit.source_path.clone(),
                    hit.line_number,
                    hit.content.clone(),
                    hit.score.to_bits(),
                )
            })
            .collect();
        assert_eq!(
            repeated_order, baseline_order,
            "federated lexical query order drifted on repeated attempt {attempt}"
        );
    }

    Ok(())
}
17354
/// A semantic search limited to one hit and filtered to the third fixture path must
/// still return that path, and must not report ANN stats.
#[test]
fn semantic_search_session_paths_filter_retries_past_initial_candidates() -> Result<()> {
    let fixture = build_semantic_test_fixture()?;
    let wanted_path = &fixture.source_paths[2];

    let mut only_wanted = SearchFilters::default();
    only_wanted.session_paths.insert(wanted_path.clone());

    let (hits, ann_stats) = fixture.client.search_semantic(
        "semantic fixture query",
        only_wanted,
        1,
        0,
        FieldMask::FULL,
        false,
    )?;

    assert!(
        ann_stats.is_none(),
        "exact search should not emit ANN stats"
    );
    assert_eq!(
        hits.len(),
        1,
        "filtered semantic search should still return a hit"
    );
    assert_eq!(
        hits[0].source_path, fixture.source_paths[2],
        "semantic search should keep searching until it finds the requested session path"
    );

    Ok(())
}
17388
/// With two session paths allowed, limit 1 and offset 1, the returned hit must be the
/// second allowed path — i.e. the offset is applied to the filtered results.
#[test]
fn semantic_search_offsets_after_session_paths_filtering() -> Result<()> {
    let fixture = build_semantic_test_fixture()?;

    let mut two_paths = SearchFilters::default();
    for allowed in [&fixture.source_paths[1], &fixture.source_paths[2]] {
        two_paths.session_paths.insert(allowed.clone());
    }

    let (hits, _ann_stats) = fixture.client.search_semantic(
        "semantic fixture query",
        two_paths,
        1,
        1,
        FieldMask::FULL,
        false,
    )?;

    assert_eq!(
        hits.len(),
        1,
        "second filtered page should still return one hit"
    );
    assert_eq!(
        hits[0].source_path, fixture.source_paths[2],
        "offset must apply after semantic deduplication and session path filtering"
    );

    Ok(())
}
17421
/// A semantic search over the sharded fixture returns the first three fixture paths in
/// fixture order and reports no ANN stats.
#[test]
fn semantic_search_merges_sharded_vector_indexes() -> Result<()> {
    let fixture = build_sharded_semantic_test_fixture()?;
    let no_filters = SearchFilters::default();

    let (hits, ann_stats) = fixture.client.search_semantic(
        "semantic fixture query",
        no_filters,
        3,
        0,
        FieldMask::FULL,
        false,
    )?;

    assert!(
        ann_stats.is_none(),
        "sharded exact search should not emit ANN stats"
    );
    assert_eq!(hits.len(), 3);
    for (rank, hit) in hits.iter().enumerate() {
        assert_eq!(hit.source_path, fixture.source_paths[rank]);
    }

    Ok(())
}
17445
/// Feeds three scored results (best first) into `progressive_phase_to_result` with
/// `limit: 1` / `fetch_limit: 3` and a filter that only admits the third document; the
/// single surviving hit must be that third document.
#[test]
fn progressive_phase_overfetches_before_session_paths_filtering() -> Result<()> {
    let fixture = build_semantic_test_fixture()?;

    let mut only_third = SearchFilters::default();
    only_third
        .session_paths
        .insert(fixture.source_paths[2].clone());

    // Descending scores: the only document passing the filter is ranked last.
    let best = FsScoredResult {
        doc_id: fixture.doc_ids[0].clone(),
        score: 1.0,
        source: FsScoreSource::SemanticFast,
        index: None,
        fast_score: Some(1.0),
        quality_score: None,
        lexical_score: None,
        rerank_score: None,
        explanation: None,
        metadata: None,
    };
    let middle = FsScoredResult {
        doc_id: fixture.doc_ids[1].clone(),
        score: 0.9,
        source: FsScoreSource::SemanticFast,
        index: None,
        fast_score: Some(0.9),
        quality_score: None,
        lexical_score: None,
        rerank_score: None,
        explanation: None,
        metadata: None,
    };
    let worst = FsScoredResult {
        doc_id: fixture.doc_ids[2].clone(),
        score: 0.8,
        source: FsScoreSource::SemanticFast,
        index: None,
        fast_score: Some(0.8),
        quality_score: None,
        lexical_score: None,
        rerank_score: None,
        explanation: None,
        metadata: None,
    };
    let scored = vec![best, middle, worst];

    let context = ProgressivePhaseContext {
        query: "session path filter",
        filters: &only_third,
        field_mask: FieldMask::FULL,
        lexical_cache: None,
        limit: 1,
        fetch_limit: 3,
    };
    let phase = fixture
        .client
        .progressive_phase_to_result(&scored, context)?;

    assert_eq!(
        phase.hits.len(),
        1,
        "progressive phase should retain enough overfetched hits to satisfy post-search session path filtering"
    );
    assert_eq!(
        phase.hits[0].source_path, fixture.source_paths[2],
        "progressive phase should page after session path filtering"
    );

    Ok(())
}
17517
/// Zero placeholders render as the empty string.
#[test]
fn sql_placeholders_empty() {
    let rendered = sql_placeholders(0);
    assert_eq!(rendered, "");
}
17526
/// One placeholder renders as a lone `?` with no separator.
#[test]
fn sql_placeholders_single() {
    let rendered = sql_placeholders(1);
    assert_eq!(rendered, "?");
}
17531
/// Multiple placeholders are comma-separated with no trailing comma.
#[test]
fn sql_placeholders_multiple() {
    for (count, expected) in [(3, "?,?,?"), (5, "?,?,?,?,?")] {
        assert_eq!(sql_placeholders(count), expected);
    }
}
17537
/// The rendered string has length `2 * count - 1` and a capacity of at least that length.
#[test]
fn sql_placeholders_capacity_efficient() {
    for (count, expected_len) in [(3, 5), (10, 19)] {
        let rendered = sql_placeholders(count);
        assert_eq!(rendered.len(), expected_len);
        assert!(rendered.capacity() >= expected_len);
    }
}
17550
/// For 100 placeholders the output holds exactly 100 `?` and 99 `,` (199 bytes total).
#[test]
fn sql_placeholders_large_count() {
    let rendered = sql_placeholders(100);

    assert_eq!(rendered.len(), 199);
    assert_eq!(rendered.matches('?').count(), 100);
    assert_eq!(rendered.matches(',').count(), 99);
}
17559
/// A path-like query gets more lexical than semantic candidates, with a lexical budget
/// of at least 25.
#[test]
fn hybrid_budget_identifier_biases_lexical() {
    let budget = hybrid_candidate_budget("src/main.rs", 20, 20, 5, 10_000);
    let lexical = budget.lexical_candidates;
    let semantic = budget.semantic_candidates;

    assert!(
        lexical > semantic,
        "identifier queries should allocate more lexical than semantic fanout"
    );
    assert!(lexical >= 25);
}
17569
/// A natural-language question gets more semantic than lexical candidates.
#[test]
fn hybrid_budget_natural_language_biases_semantic() {
    let question = "how do we fix authentication middleware latency";
    let budget = hybrid_candidate_budget(question, 20, 20, 5, 10_000);

    assert!(
        budget.semantic_candidates > budget.lexical_candidates,
        "natural language queries should allocate more semantic than lexical fanout"
    );
}
17584
/// With limit 0 over a two-million-document corpus, lexical fanout stays within both
/// `no_limit_result_cap()` and `NO_LIMIT_RESULT_MAX`, semantic fanout stays within
/// `HYBRID_NO_LIMIT_SEMANTIC_CAP`, and semantic never exceeds lexical.
#[test]
fn hybrid_budget_no_limit_caps_both_lexical_and_semantic() {
    let corpus_size = 2_000_000;
    let budget =
        hybrid_candidate_budget("authentication middleware", 0, corpus_size, 0, corpus_size);
    let cap = no_limit_result_cap();

    let lexical = budget.lexical_candidates;
    let semantic = budget.semantic_candidates;

    assert!(
        lexical <= cap,
        "lexical fanout must respect no_limit_result_cap() = {cap}; got {}",
        lexical
    );
    assert!(
        lexical <= NO_LIMIT_RESULT_MAX,
        "lexical fanout must respect the absolute NO_LIMIT_RESULT_MAX; got {}",
        lexical
    );
    assert!(semantic <= HYBRID_NO_LIMIT_SEMANTIC_CAP);
    assert!(
        semantic <= lexical,
        "semantic ({}) must not exceed lexical ({}) fanout",
        semantic,
        lexical
    );
}
17622
/// A huge explicit override is clamped into `[NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX]`.
#[test]
fn compute_no_limit_result_cap_clamps_explicit_over_ceiling_env_override() {
    let oversized_override = Some("999999999999".to_string());
    let cap = compute_no_limit_result_cap_from(oversized_override, None, None);

    assert!(
        cap <= NO_LIMIT_RESULT_MAX,
        "explicit override must still clamp to ceiling; got {cap} > {NO_LIMIT_RESULT_MAX}"
    );
    assert!(cap >= NO_LIMIT_RESULT_MIN);
}
17637
/// An explicit override of `1` is raised to exactly `NO_LIMIT_RESULT_MIN`.
#[test]
fn compute_no_limit_result_cap_clamps_tiny_explicit_override_up_to_floor() {
    let tiny_override = Some(String::from("1"));
    let cap = compute_no_limit_result_cap_from(tiny_override, None, None);
    assert_eq!(cap, NO_LIMIT_RESULT_MIN);
}
17644
/// With no overrides and 128 GiB as the third argument, the cap stays within bounds and
/// is more than ten times the floor.
#[test]
fn compute_no_limit_result_cap_uses_meminfo_when_no_env_override() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let cap = compute_no_limit_result_cap_from(None, None, Some(128u64 * GIB));

    assert!(cap >= NO_LIMIT_RESULT_MIN, "cap {cap} below floor");
    assert!(cap <= NO_LIMIT_RESULT_MAX, "cap {cap} above ceiling");
    assert!(cap > NO_LIMIT_RESULT_MIN * 10);
}
17656
/// With every input absent the cap must still fall inside
/// `[NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX]`.
#[test]
fn compute_no_limit_result_cap_falls_back_to_floor_when_meminfo_unavailable() {
    let (first, second, third) = (None, None, None);
    let cap = compute_no_limit_result_cap_from(first, second, third);
    assert!(cap >= NO_LIMIT_RESULT_MIN);
    assert!(cap <= NO_LIMIT_RESULT_MAX);
}
17666
/// A 4 GiB byte budget passed as the second argument must determine the cap
/// even when the third argument reports 1 TiB.
#[test]
fn compute_no_limit_result_cap_bytes_env_takes_priority_over_meminfo() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let budget_bytes = 4 * GIB;
    let huge_meminfo = Some(1024 * GIB);

    let cap = compute_no_limit_result_cap_from(None, Some(budget_bytes.to_string()), huge_meminfo);

    // Expected value: the byte budget divided by the per-hit size, clamped.
    let expected_hits = (budget_bytes / AVG_HIT_BYTES) as usize;
    let expected = expected_hits.clamp(NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX);
    assert_eq!(cap, expected, "bytes env must win over meminfo");
}
17683
/// Pins three outcomes of `no_limit_budget_bytes`: a valid explicit value is
/// returned as-is, an explicit `"0"` with a 1 TiB second argument yields the
/// ceiling, and no inputs at all yields the floor.
#[test]
fn no_limit_budget_bytes_preserves_fallback_priority() {
    const GIB: u64 = 1024 * 1024 * 1024;
    let four_gib = 4 * GIB;
    let huge_meminfo = Some(1024 * GIB);

    let from_explicit = no_limit_budget_bytes(Some(four_gib.to_string()), huge_meminfo);
    assert_eq!(from_explicit, four_gib);

    let from_zero_override = no_limit_budget_bytes(Some("0".to_string()), huge_meminfo);
    assert_eq!(from_zero_override, NO_LIMIT_BYTES_CEILING);

    let from_nothing = no_limit_budget_bytes(None, None);
    assert_eq!(from_nothing, NO_LIMIT_BYTES_FLOOR);
}
17699
/// Empty, non-numeric, zero, and negative override strings must all leave the
/// cap inside `[NO_LIMIT_RESULT_MIN, NO_LIMIT_RESULT_MAX]`.
#[test]
fn compute_no_limit_result_cap_ignores_malformed_env() {
    let malformed = ["", "abc", "0", "-1"];
    for bad in malformed {
        // The same bad value is supplied for both string arguments.
        let raw = String::from(bad);
        let cap = compute_no_limit_result_cap_from(Some(raw.clone()), Some(raw), None);
        assert!(cap >= NO_LIMIT_RESULT_MIN, "bad={bad:?} cap={cap}");
        assert!(cap <= NO_LIMIT_RESULT_MAX, "bad={bad:?} cap={cap}");
    }
}
17713
17714 fn make_test_hit(id: &str, score: f32) -> SearchHit {
17719 SearchHit {
17720 title: id.to_string(),
17721 snippet: String::new(),
17722 content: id.to_string(),
17723 content_hash: stable_content_hash(id),
17724 score,
17725 source_path: format!("/path/{}.jsonl", id),
17726 agent: "test".to_string(),
17727 workspace: "/workspace".to_string(),
17728 workspace_original: None,
17729 created_at: Some(1_700_000_000_000),
17730 line_number: Some(1),
17731 match_type: MatchType::Exact,
17732 source_id: "local".to_string(),
17733 origin_kind: "local".to_string(),
17734 origin_host: None,
17735 conversation_id: None,
17736 }
17737 }
17738
/// Documents ranked first and second in both lists must keep those positions
/// after fusion, and the union of both lists (four hits) must be returned.
#[test]
fn test_rrf_fusion_ordering() {
    let lexical: Vec<SearchHit> = [("A", 10.0_f32), ("B", 8.0), ("C", 6.0)]
        .into_iter()
        .map(|(id, score)| make_test_hit(id, score))
        .collect();
    let semantic: Vec<SearchHit> = [("A", 0.9_f32), ("B", 0.7), ("D", 0.5)]
        .into_iter()
        .map(|(id, score)| make_test_hit(id, score))
        .collect();

    let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);

    assert_eq!(fused.len(), 4);
    let first = &fused[0];
    let second = &fused[1];
    assert_eq!(first.title, "A");
    assert_eq!(second.title, "B");
}
17762
/// When the two lists share no hits, every hit from both must appear in the
/// fused output.
#[test]
fn test_rrf_handles_disjoint_sets() {
    let lexical = vec![make_test_hit("A", 10.0), make_test_hit("B", 8.0)];
    let semantic = vec![make_test_hit("C", 0.9), make_test_hit("D", 0.7)];

    let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
    assert_eq!(fused.len(), 4);

    let titles: Vec<&str> = fused.iter().map(|hit| hit.title.as_str()).collect();
    for expected in ["A", "B", "C", "D"] {
        assert!(titles.contains(&expected));
    }
}
17779
/// Fusing the same equal-score inputs three times must give the same length
/// and the same title order every time.
#[test]
fn test_rrf_tie_breaking_deterministic() {
    let lexical = vec![
        make_test_hit("X", 5.0),
        make_test_hit("Y", 5.0),
        make_test_hit("Z", 5.0),
    ];
    let semantic: Vec<SearchHit> = Vec::new();

    let fuse = || rrf_fuse_hits(&lexical, &semantic, "", 10, 0);
    let (fused1, fused2, fused3) = (fuse(), fuse(), fuse());

    assert_eq!(fused1.len(), fused2.len());
    assert_eq!(fused2.len(), fused3.len());

    // Lengths are equal at this point, so zipping visits every position.
    let triples = fused1.iter().zip(fused2.iter()).zip(fused3.iter());
    for (i, ((a, b), c)) in triples.enumerate() {
        assert_eq!(a.title, b.title, "Mismatch at index {}", i);
        assert_eq!(b.title, c.title, "Mismatch at index {}", i);
    }
}
17804
/// A document ranked second in both lists must outrank documents that are
/// ranked first in only one list.
#[test]
fn test_rrf_both_lists_bonus() {
    let lexical = vec![make_test_hit("solo_lex", 10.0), make_test_hit("both", 5.0)];
    let semantic = vec![make_test_hit("solo_sem", 0.9), make_test_hit("both", 0.5)];

    let fused = rrf_fuse_hits(&lexical, &semantic, "", 10, 0);

    let winner = &fused[0];
    assert_eq!(
        winner.title, "both",
        "Doc in both lists should rank first"
    );
}
17828
/// Checks result counts for three lexical-only hits: limit 2 gives two hits,
/// offset 1 with limit 10 gives two hits, and limit 0 gives none.
#[test]
fn test_rrf_respects_limit_and_offset() {
    let lexical = vec![
        make_test_hit("A", 10.0),
        make_test_hit("B", 8.0),
        make_test_hit("C", 6.0),
    ];
    let semantic: Vec<SearchHit> = Vec::new();
    let fuse = |limit, offset| rrf_fuse_hits(&lexical, &semantic, "", limit, offset);

    let fused = fuse(2, 0);
    assert_eq!(fused.len(), 2);

    let fused_offset = fuse(10, 1);
    assert_eq!(fused_offset.len(), 2);

    let fused_empty = fuse(0, 0);
    assert!(fused_empty.is_empty());
}
17850
/// Two empty lists fuse to nothing; one empty list plus a single hit fuses to
/// that hit, whichever side it is on.
#[test]
fn test_rrf_empty_inputs() {
    let none: Vec<SearchHit> = Vec::new();
    let single = vec![make_test_hit("A", 10.0)];

    assert!(rrf_fuse_hits(&none, &none, "", 10, 0).is_empty());

    // First the hit on the semantic side, then on the lexical side.
    for (lexical, semantic) in [(&none, &single), (&single, &none)] {
        let fused = rrf_fuse_hits(lexical, semantic, "", 10, 0);
        assert_eq!(fused.len(), 1);
        assert_eq!(fused[0].title, "A");
    }
}
17869
/// Two hits that differ only in score and both carry an empty title must be
/// merged into a single fused hit.
#[test]
fn test_rrf_coalesces_empty_title_hits_across_search_modes() {
    let body = "same untitled body";
    let lexical = SearchHit {
        title: String::new(),
        source_path: "/shared/untitled.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        ..make_test_hit("shared", 10.0)
    };
    let semantic = SearchHit {
        score: 0.9,
        ..lexical.clone()
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].title, "");
}
17885
/// A hit whose `source_id` is blank whitespace must merge with an otherwise
/// identical hit whose `source_id` is `"local"`, and the fused hit reports
/// `"local"`.
#[test]
fn test_rrf_coalesces_blank_local_source_id_hits_across_search_modes() {
    let body = "same local body";
    let lexical = SearchHit {
        source_path: "/shared/local.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        source_id: "local".into(),
        origin_kind: "local".into(),
        ..make_test_hit("shared-local", 10.0)
    };
    let semantic = SearchHit {
        source_id: " ".into(),
        origin_kind: "local".into(),
        score: 0.9,
        ..lexical.clone()
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].source_id, "local");
}
17904
/// Identical content at two different line numbers (and timestamps) must stay
/// as two separate fused hits, in line order 1 then 2.
#[test]
fn test_rrf_keeps_repeated_same_content_at_different_lines() {
    let body = "repeat me";
    let first = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        line_number: Some(1),
        created_at: Some(100),
        ..make_test_hit("same", 10.0)
    };
    let second = SearchHit {
        line_number: Some(2),
        created_at: Some(200),
        score: 0.9,
        ..first.clone()
    };

    let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
    assert_eq!(fused.len(), 2);
    assert_eq!(fused[0].line_number, Some(1));
    assert_eq!(fused[1].line_number, Some(2));
}
17925
/// The same message seen once without a conversation id and once with id 42
/// must merge into one hit that carries `Some(42)`.
#[test]
fn test_rrf_coalesces_present_and_missing_conversation_id_for_same_message() {
    let body = "identical body";
    let lexical = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        created_at: Some(100),
        line_number: Some(1),
        conversation_id: None,
        ..make_test_hit("same", 10.0)
    };
    let semantic = SearchHit {
        conversation_id: Some(42),
        score: 0.9,
        ..lexical.clone()
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].conversation_id, Some(42));
}
17945
/// Same as the present/missing conversation-id case, but the semantic hit also
/// has a blank `source_id`; the two hits must still merge and keep `Some(42)`.
#[test]
fn test_rrf_coalesces_present_and_missing_conversation_id_despite_blank_local_source_id() {
    let body = "identical body";
    let lexical = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        created_at: Some(100),
        line_number: Some(1),
        conversation_id: None,
        source_id: "local".into(),
        origin_kind: "local".into(),
        ..make_test_hit("same", 10.0)
    };
    let semantic = SearchHit {
        conversation_id: Some(42),
        source_id: " ".into(),
        origin_kind: "local".into(),
        score: 0.9,
        ..lexical.clone()
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].conversation_id, Some(42));
}
17969
/// Hits that share path and content but carry different conversation ids must
/// both survive fusion.
#[test]
fn test_rrf_keeps_distinct_conversation_ids_for_shared_path_and_content() {
    let body = "identical body";
    let first = SearchHit {
        title: "Shared Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        conversation_id: Some(1),
        ..make_test_hit("same", 10.0)
    };
    let second = SearchHit {
        conversation_id: Some(2),
        score: 0.9,
        ..first.clone()
    };

    let fused = rrf_fuse_hits(&[first], &[second], "", 10, 0);
    assert_eq!(fused.len(), 2);
    let has_conversation = |id| fused.iter().any(|hit| hit.conversation_id == Some(id));
    assert!(has_conversation(1));
    assert!(has_conversation(2));
}
17988
/// Hits with the same conversation id must merge even when their titles
/// differ.
#[test]
fn test_rrf_coalesces_same_conversation_id_despite_title_drift() {
    let body = "identical body";
    let lexical = SearchHit {
        title: "Morning Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        conversation_id: Some(9),
        ..make_test_hit("same", 10.0)
    };
    let semantic = SearchHit {
        title: "Evening Session".into(),
        score: 0.9,
        ..lexical.clone()
    };

    let fused = rrf_fuse_hits(&[lexical], &[semantic], "", 10, 0);
    assert_eq!(fused.len(), 1);
    assert_eq!(fused[0].conversation_id, Some(9));
}
18006
/// Without a conversation id or timestamp, hits that differ only by title
/// must remain two separate fused hits.
#[test]
fn test_rrf_keeps_distinct_titles_for_shared_path_and_content() {
    let body = "identical body";
    let morning = SearchHit {
        title: "Morning Session".into(),
        source_path: "/shared/session.jsonl".into(),
        content: body.into(),
        content_hash: stable_content_hash(body),
        created_at: None,
        ..make_test_hit("same", 10.0)
    };
    let evening = SearchHit {
        title: "Evening Session".into(),
        score: 0.9,
        ..morning.clone()
    };

    let fused = rrf_fuse_hits(&[morning], &[evening], "", 10, 0);
    assert_eq!(fused.len(), 2);
    for expected_title in ["Morning Session", "Evening Session"] {
        assert!(fused.iter().any(|hit| hit.title == expected_title));
    }
}
18025
/// Fusing two disjoint 50-hit lists with limit 20 must return exactly 20 hits
/// with no repeated title.
#[test]
fn test_rrf_candidate_depth() {
    let mut lexical = Vec::with_capacity(50);
    let mut semantic = Vec::with_capacity(50);
    for i in 0..50 {
        // Scores decrease with rank in both lists.
        lexical.push(make_test_hit(&format!("L{}", i), 100.0 - i as f32));
        semantic.push(make_test_hit(&format!("S{}", i), 1.0 - 0.01 * i as f32));
    }

    let fused = rrf_fuse_hits(&lexical, &semantic, "", 20, 0);
    assert_eq!(fused.len(), 20);

    let mut unique_titles = std::collections::HashSet::new();
    for hit in fused.iter() {
        let newly_seen = unique_titles.insert(&hit.title);
        assert!(newly_seen, "Duplicate hit: {}", hit.title);
    }
}
18047
/// Short queries of one to four words (including an `AND`) must yield the
/// expected number of tokens.
#[test]
fn query_token_list_parses_small_queries() {
    let cases = [
        ("hello", 1),
        ("hello world", 2),
        ("hello AND world", 3),
        ("hello world foo bar", 4),
    ];

    cases.into_iter().for_each(|(query, expected_len)| {
        let token_count = parse_boolean_query(query).len();
        assert_eq!(token_count, expected_len, "{query}");
    });
}
18066
/// A nine-word query must yield nine tokens.
#[test]
fn query_token_list_parses_large_queries() {
    let nine_words = "a b c d e f g h i";
    let token_count = parse_boolean_query(nine_words).len();
    assert_eq!(token_count, 9);
}
18072
/// A quoted phrase followed by a word yields two tokens, the first being a
/// `Phrase` holding the unquoted text.
#[test]
fn query_token_list_handles_quoted_phrases() {
    let tokens = parse_boolean_query("\"hello world\" test");
    assert_eq!(tokens.len(), 2);

    let first_is_expected_phrase = match &tokens[0] {
        QueryToken::Phrase(phrase) => phrase == "hello world",
        _ => false,
    };
    assert!(first_is_expected_phrase, "Expected Phrase token");
}
18084
/// `AND` and `OR` between terms must be tokenized as operators at positions
/// 1 and 3 of a five-token list.
#[test]
fn query_token_list_handles_operators() {
    let parsed = parse_boolean_query("foo AND bar OR baz");
    assert_eq!(parsed.len(), 5);

    let (and_slot, or_slot) = (1, 3);
    assert_eq!(parsed[and_slot], QueryToken::And);
    assert_eq!(parsed[or_slot], QueryToken::Or);
}
18092
/// An empty query string parses to an empty token list.
#[test]
fn query_token_list_empty_query() {
    let parsed = parse_boolean_query("");
    let nothing_parsed = parsed.is_empty();
    assert!(nothing_parsed);
}
18098
/// Iterating the token list must visit the `Term` tokens in input order.
#[test]
fn query_token_list_iteration_works() {
    let tokens = parse_boolean_query("a b c");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["a", "b", "c"]);
}
18111
/// A leading emoji must not survive sanitization; the result is compared
/// against the exact expected string below.
#[test]
fn unicode_emoji_treated_as_separator() {
    let input = "🚀 launch";
    let expected = " launch";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, expected, "Emoji should become space");
}
18127
/// An emoji wedged between two words is replaced by a space, splitting them.
#[test]
fn unicode_emoji_splits_terms() {
    let input = "hot🔥code";
    let expected = "hot code";
    let sanitized = sanitize_query(input);
    assert_eq!(sanitized, expected, "Emoji between words splits them");
}
18134
/// A query made only of emoji must sanitize to nothing but whitespace.
#[test]
fn unicode_multiple_emoji_become_spaces() {
    let sanitized = sanitize_query("🚀🔥💻");
    let visible = sanitized.trim();
    assert_eq!(visible, "", "All-emoji query sanitizes to whitespace");
}
18144
/// Parsing a query with emoji on both ends must not panic and must still
/// produce a term containing `launch` or `code`.
#[test]
fn unicode_emoji_query_parses_without_panic() {
    let tokens = parse_boolean_query("🚀 launch code 🔥");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.clone());
        }
    }

    let has_real_word = terms
        .iter()
        .any(|term| term.contains("launch") || term.contains("code"));
    assert!(has_real_word);
}
18162
/// `QueryTermsLower` must expose the lowercased word `launch` for a query that
/// starts with an emoji.
#[test]
fn unicode_emoji_query_terms_lower() {
    let lowered = QueryTermsLower::from_query("🚀 LAUNCH");
    let found_launch = lowered.tokens().any(|token| token == "launch");
    assert!(found_launch, "Should extract 'launch' from emoji query");
}
18173
/// Chinese text, with or without an internal space, passes through
/// `sanitize_query` unchanged.
#[test]
fn unicode_cjk_chinese_preserved() {
    for text in ["测试代码", "测试 代码"] {
        assert_eq!(sanitize_query(text), text);
    }
}
18181
/// Katakana and mixed hiragana/kanji text pass through `sanitize_query`
/// unchanged.
#[test]
fn unicode_cjk_japanese_preserved() {
    for text in ["テスト", "こんにちは世界"] {
        assert_eq!(sanitize_query(text), text);
    }
}
18188
/// Hangul text passes through `sanitize_query` unchanged.
#[test]
fn unicode_cjk_korean_preserved() {
    for text in ["테스트", "안녕하세요"] {
        assert_eq!(sanitize_query(text), text);
    }
}
18194
/// Space-separated CJK and Latin words each become their own `Term`, in order.
#[test]
fn unicode_cjk_parsed_as_terms() {
    let tokens = parse_boolean_query("测试 代码 search");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["测试", "代码", "search"]);
}
18207
/// `QueryTermsLower` yields the two CJK words of the query as tokens, in order.
#[test]
fn unicode_cjk_query_terms_lower() {
    let lowered = QueryTermsLower::from_query("测试 代码");
    let tokens: Vec<&str> = lowered.tokens().collect();
    let expected = vec!["测试", "代码"];
    assert_eq!(tokens, expected);
}
18214
/// Hebrew text passes through `sanitize_query` unchanged.
#[test]
fn unicode_hebrew_preserved() {
    let sanitized = sanitize_query("שלום עולם");
    assert_eq!(sanitized, "שלום עולם");
}
18221
/// Arabic text passes through `sanitize_query` unchanged.
#[test]
fn unicode_arabic_preserved() {
    let sanitized = sanitize_query("مرحبا");
    assert_eq!(sanitized, "مرحبا");
}
18226
/// Two Hebrew words parse to two `Term` tokens in input order.
#[test]
fn unicode_hebrew_parsed_as_terms() {
    let tokens = parse_boolean_query("שלום עולם");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["שלום", "עולם"]);
}
18239
/// `QueryTermsLower` yields the two Arabic words of the query as tokens, in
/// order.
#[test]
fn unicode_arabic_query_terms_lower() {
    let lowered = QueryTermsLower::from_query("مرحبا بالعالم");
    let tokens: Vec<&str> = lowered.tokens().collect();
    let expected = vec!["مرحبا", "بالعالم"];
    assert_eq!(tokens, expected);
}
18247
/// A query mixing Latin, CJK, and Cyrillic words is returned unchanged by
/// `sanitize_query`.
#[test]
fn unicode_mixed_scripts_preserved() {
    let input = "Hello 世界 мир";
    assert_eq!(sanitize_query(input), "Hello 世界 мир");
}
18255
/// Latin, CJK, and Cyrillic words each parse to their own `Term`, keeping
/// their original case and order.
#[test]
fn unicode_mixed_scripts_parsed() {
    let tokens = parse_boolean_query("Hello 世界 мир");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["Hello", "世界", "мир"]);
}
18268
/// An emoji between a Latin word and a CJK word is dropped while both words
/// are kept; the result is compared against the exact expected string below.
#[test]
fn unicode_mixed_scripts_with_emoji() {
    let input = "Hello 🌍 世界";
    let expected = "Hello 世界";
    assert_eq!(sanitize_query(input), expected);
}
18275
/// `QueryTermsLower` lowercases the Latin and Cyrillic words and leaves the
/// Arabic word as written.
#[test]
fn unicode_latin_cyrillic_arabic_query() {
    let lowered = QueryTermsLower::from_query("Hello Мир مرحبا");
    let tokens: Vec<&str> = lowered.tokens().collect();
    let expected = vec!["hello", "мир", "مرحبا"];
    assert_eq!(tokens, expected);
}
18282
/// U+200D (zero-width joiner) between two words is replaced by a space.
#[test]
fn unicode_zero_width_joiner_removed() {
    let input = "test\u{200D}query";
    assert_eq!(sanitize_query(input), "test query");
}
18291
/// U+200C (zero-width non-joiner) between two words is replaced by a space.
#[test]
fn unicode_zero_width_non_joiner_removed() {
    let input = "test\u{200C}query";
    assert_eq!(sanitize_query(input), "test query");
}
18298
/// U+200B (zero-width space) between two words is replaced by a space.
#[test]
fn unicode_zero_width_space_removed() {
    let input = "test\u{200B}query";
    assert_eq!(sanitize_query(input), "test query");
}
18305
/// A leading U+FEFF (byte-order mark) is replaced by a space.
#[test]
fn unicode_bom_removed() {
    let input = "\u{FEFF}test";
    assert_eq!(sanitize_query(input), " test");
}
18312
/// A word containing an accented letter is returned unchanged.
#[test]
fn unicode_precomposed_accent_preserved() {
    let input = "café";
    assert_eq!(sanitize_query(input), "café");
}
18321
/// `e` followed by U+0301 (combining acute) must sanitize to the single
/// precomposed U+00E9. Note that, despite the test's name, the assertion
/// expects composition rather than a separator.
#[test]
fn unicode_combining_accent_becomes_separator() {
    let decomposed = "cafe\u{0301}";
    let composed = "caf\u{00e9}";
    assert_eq!(sanitize_query(decomposed), composed);
}
18331
/// The precomposed and the decomposed spelling of the same word must sanitize
/// to the same string.
#[test]
fn unicode_nfc_and_nfd_produce_same_sanitized_query() {
    let san_nfc = sanitize_query("caf\u{00E9}");
    let san_nfd = sanitize_query("cafe\u{0301}");

    // Each form is checked on its own first, then against the other.
    assert_eq!(san_nfc, "café");
    assert_eq!(san_nfd, "café");
    assert_eq!(san_nfc, san_nfd);
}
18349
/// Text with stacked combining marks must sanitize without panicking and still
/// contain the base letters `t` and `s`.
#[test]
fn unicode_combining_marks_do_not_panic() {
    let stacked_marks = "t\u{0301}\u{0302}\u{0303}e\u{0304}\u{0305}st";
    let sanitized = sanitize_query(stacked_marks);
    for base_letter in ['t', 's'] {
        assert!(sanitized.contains(base_letter));
    }
}
18359
/// Code points U+1D400..U+1D402 must pass through `sanitize_query` unchanged.
#[test]
fn unicode_mathematical_bold_letters_preserved() {
    let input = "\u{1D400}\u{1D401}\u{1D402}";
    let result = sanitize_query(input);
    assert_eq!(result, input, "Mathematical bold letters are alphanumeric");
}
18372
/// Code point U+20000 (outside the Basic Multilingual Plane) must pass through
/// `sanitize_query` unchanged.
#[test]
fn unicode_supplementary_ideograph_preserved() {
    let input = "\u{20000}";
    let result = sanitize_query(input);
    assert_eq!(result, input, "Supplementary CJK ideographs are alphanumeric");
}
18383
/// U+1F600 between two words is replaced by a space.
#[test]
fn unicode_supplementary_emoji_removed() {
    let with_emoji = "test\u{1F600}query";
    assert_eq!(sanitize_query(with_emoji), "test query");
}
18391
/// Alternating Latin and Hebrew words must parse into four `Term` tokens, one
/// per word.
#[test]
fn unicode_bidi_mixed_ltr_rtl_no_panic() {
    let tokens = parse_boolean_query("hello שלום world עולם");

    let mut terms = Vec::new();
    for token in tokens.iter() {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms.len(), 4);
    for expected in ["hello", "שלום", "world", "עולם"] {
        assert!(terms.contains(&expected));
    }
}
18411
/// U+202D and U+202C embedded between words are each replaced by a space.
#[test]
fn unicode_bidi_override_chars_removed() {
    let with_overrides = "test\u{202D}content\u{202C}end";
    assert_eq!(sanitize_query(with_overrides), "test content end");
}
18420
/// U+200F (right-to-left mark) between two words is replaced by a space.
#[test]
fn unicode_bidi_rtl_mark_removed() {
    let with_mark = "test\u{200F}content";
    assert_eq!(sanitize_query(with_mark), "test content");
}
18428
/// `QueryExplanation::analyze` on two CJK words must report two parsed terms,
/// each with non-empty text.
#[test]
fn unicode_full_pipeline_cjk_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("测试 代码", &filters);

    let terms = &explanation.parsed.terms;
    assert_eq!(terms.len(), 2);
    // Exactly two terms exist here, so this covers indices 0 and 1.
    for term in terms.iter() {
        assert!(!term.text.is_empty());
    }
}
18438
/// An `AND` between words of different scripts must be reported among the
/// parsed operators.
#[test]
fn unicode_full_pipeline_mixed_script_boolean() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("Hello AND 世界 OR مرحبا", &filters);

    let saw_and = explanation.parsed.operators.iter().any(|op| op == "AND");
    assert!(
        saw_and,
        "AND operator should be recognized in mixed-script query"
    );
}
18449
/// An all-emoji query must yield either no parsed terms or only terms with no
/// subterms.
#[test]
fn unicode_full_pipeline_emoji_query_type() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("🚀🔥💻", &filters);

    let terms = &explanation.parsed.terms;
    let nothing_meaningful =
        terms.is_empty() || terms.iter().all(|term| term.subterms.is_empty());
    assert!(
        nothing_meaningful,
        "All-emoji query should produce no meaningful terms"
    );
}
18465
/// A quoted CJK string must show up in the parsed phrases.
#[test]
fn unicode_full_pipeline_phrase_with_cjk() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("\"测试代码\"", &filters);

    let found_phrase = !explanation.parsed.phrases.is_empty();
    assert!(found_phrase, "CJK phrase should be recognized");
}
18474
/// A CJK word wrapped in `*` must yield at least one parsed term, and the
/// first term must have a subterm whose pattern contains `*` or equals
/// `exact`.
#[test]
fn unicode_full_pipeline_wildcard_with_unicode() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("*测试*", &filters);

    let terms = &explanation.parsed.terms;
    assert!(!terms.is_empty(), "Wildcard with CJK should produce terms");

    if let Some(first_term) = terms.first() {
        let has_expected_pattern = first_term
            .subterms
            .iter()
            .any(|sub| sub.pattern.contains("*") || sub.pattern == "exact");
        assert!(
            has_expected_pattern,
            "CJK wildcard should produce wildcard or exact pattern"
        );
    }
}
18492
/// `query_lower` holds the lowercased query: `ß` is left as-is and ASCII
/// capitals are folded.
#[test]
fn unicode_query_terms_lower_case_folding() {
    let cases = [("STRAßE", "straße"), ("HELLO", "hello")];
    for (input, expected) in cases {
        let lowered = QueryTermsLower::from_query(input);
        assert_eq!(lowered.query_lower, expected);
    }
}
18504
/// `normalize_term_parts` splits two space-separated CJK words into two parts.
#[test]
fn unicode_normalize_term_parts_cjk() {
    let expected = vec!["测试", "代码"];
    assert_eq!(normalize_term_parts("测试 代码"), expected);
}
18510
/// Words glued to emoji must still come out of `normalize_term_parts` as clean
/// parts.
#[test]
fn unicode_normalize_term_parts_strips_emoji() {
    let parts = normalize_term_parts("🚀launch🔥code");
    for expected in ["launch", "code"] {
        assert!(parts.iter().any(|part| part == expected));
    }
}
18518
/// An opening quote with no closing quote must still produce a `Phrase` token
/// containing the quoted text.
#[test]
fn special_char_unbalanced_quote_no_panic() {
    let tokens = parse_boolean_query("\"hello world");
    let has_hello_phrase = tokens.iter().any(|token| match token {
        QueryToken::Phrase(phrase) => phrase.contains("hello"),
        _ => false,
    });
    assert!(
        has_hello_phrase,
        "Unbalanced quote should still produce a phrase: {tokens:?}"
    );
}
18533
/// A word followed by a stray quote must still parse as the `Term` `test`.
#[test]
fn special_char_unbalanced_trailing_quote() {
    let tokens = parse_boolean_query("test\"");
    let has_test_term = tokens.iter().any(|token| match token {
        QueryToken::Term(word) => word == "test",
        _ => false,
    });
    assert!(
        has_test_term,
        "Text before trailing quote should parse as term: {tokens:?}"
    );
}
18544
/// A query with two opening quotes and no closing quote must still produce at
/// least one token.
#[test]
fn special_char_multiple_unbalanced_quotes() {
    let tokens = parse_boolean_query("\"foo \"bar");
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Should parse despite odd quotes: {tokens:?}"
    );
}
18553
/// An empty quoted pair before a word must not prevent the word from parsing
/// as a `Term`.
#[test]
fn special_char_empty_quotes() {
    let tokens = parse_boolean_query("\"\" test");
    let has_test_term = tokens.iter().any(|token| match token {
        QueryToken::Term(word) => word == "test",
        _ => false,
    });
    assert!(
        has_test_term,
        "Empty quotes should be skipped: {tokens:?}"
    );
}
18564
/// `sanitize_query` keeps double-quote characters, even when unbalanced.
#[test]
fn special_char_unbalanced_via_sanitize() {
    let sanitized = sanitize_query("\"hello world");
    let kept_quote = sanitized.contains('"');
    assert!(kept_quote, "Quotes preserved by sanitize_query");
}
18573
/// For backslash-escaped quotes, `sanitize_query` keeps the quotes and removes
/// the backslashes.
#[test]
fn special_char_backslash_quote_sanitize() {
    let sanitized = sanitize_query("\\\"test\\\"");
    let kept_quote = sanitized.contains('"');
    let kept_backslash = sanitized.contains('\\');
    assert!(kept_quote);
    assert!(!kept_backslash, "Backslash should be stripped");
}
18582
/// Backslash-escaped quotes around a word must parse to a non-empty token
/// list.
#[test]
fn special_char_backslash_quote_parse() {
    let tokens = parse_boolean_query("\\\"test\\\"");
    let produced_tokens = !tokens.is_empty();
    assert!(produced_tokens, "Should parse without panic: {tokens:?}");
}
18588
/// A quoted phrase containing backslash-escaped inner quotes must parse to a
/// non-empty token list.
#[test]
fn special_char_inner_escaped_quotes() {
    let tokens = parse_boolean_query("\"test \\\"inner\\\" test\"");
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Nested escaped quotes should not panic: {tokens:?}"
    );
}
18597
/// The colon and backslashes of a Windows path are replaced so that only the
/// path components remain, separated by spaces.
#[test]
fn special_char_windows_path_sanitize() {
    let windows_path = "C:\\Users\\test";
    assert_eq!(sanitize_query(windows_path), "C Users test");
}
18605
/// The host and share names of a UNC path survive sanitization as separate
/// words.
#[test]
fn special_char_unc_path_sanitize() {
    let sanitized = sanitize_query("\\\\server\\share");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    for expected in ["server", "share"] {
        assert!(words.contains(&expected));
    }
}
18613
/// `normalize_term_parts` breaks a Windows file path into its drive letter,
/// directories, file stem, and extension.
#[test]
fn special_char_windows_path_terms() {
    let parts = normalize_term_parts("C:\\Users\\test\\file.rs");
    for expected in ["C", "Users", "test", "file", "rs"] {
        assert!(parts.iter().any(|part| part == expected));
    }
}
18623
/// In a `.*` sequence the dot becomes a space while the `*` is kept.
#[test]
fn special_char_regex_dot_star() {
    let regex_like = "foo.*bar";
    assert_eq!(sanitize_query(regex_like), "foo *bar");
}
18631
/// For a regex character class, `sanitize_query` keeps `a-z` as one word,
/// while `normalize_term_parts` further splits it into `a` and `z`.
#[test]
fn special_char_regex_char_class() {
    let regex_like = "[a-z]+";

    let sanitized = sanitize_query(regex_like);
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["a-z"]);

    assert_eq!(normalize_term_parts(regex_like), vec!["a", "z"]);
}
18639
/// The `^` and `$` anchors are removed, leaving only the word once trimmed.
#[test]
fn special_char_regex_anchors() {
    let sanitized = sanitize_query("^start$");
    let trimmed = sanitized.trim();
    assert_eq!(trimmed, "start");
}
18645
/// Parentheses and the pipe of a regex alternation are removed, leaving the
/// two alternatives as separate words.
#[test]
fn special_char_regex_pipe_groups() {
    let sanitized = sanitize_query("(foo|bar)");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["foo", "bar"]);
}
18652
/// A classic `'OR 1=1--` payload keeps its words but loses the quote and the
/// equals sign.
#[test]
fn special_char_sql_injection_or() {
    let sanitized = sanitize_query("'OR 1=1--");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for kept in ["OR", "1"] {
        assert!(words.contains(&kept));
    }
    for removed in ['\'', '='] {
        assert!(!sanitized.contains(removed));
    }
}
18664
/// A `DROP TABLE` payload keeps its words but loses every semicolon.
#[test]
fn special_char_sql_injection_drop() {
    let sanitized = sanitize_query("; DROP TABLE users;--");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for kept in ["DROP", "TABLE", "users"] {
        assert!(words.contains(&kept));
    }
    assert!(!sanitized.contains(';'));
}
18674
/// A `UNION SELECT` payload keeps all of its words, including the standalone
/// `*`.
#[test]
fn special_char_sql_injection_union() {
    let sanitized = sanitize_query("' UNION SELECT * FROM passwords --");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for kept in ["UNION", "SELECT", "*", "FROM", "passwords"] {
        assert!(words.contains(&kept));
    }
}
18685
/// A leading `OR` in a SQL-looking query is tokenized as the `Or` operator.
#[test]
fn special_char_sql_parse_as_literal() {
    let tokens = parse_boolean_query("OR 1=1");
    let has_or_operator = tokens.iter().any(|token| matches!(token, QueryToken::Or));
    assert!(
        has_or_operator,
        "OR should be parsed as Or operator: {tokens:?}"
    );
}
18694
/// Shell command-substitution syntax is stripped, leaving only the command
/// word.
#[test]
fn special_char_shell_subshell() {
    let sanitized = sanitize_query("$(cmd)");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["cmd"]);
}
18703
/// Backticks are stripped, leaving only the command word.
#[test]
fn special_char_shell_backticks() {
    let sanitized = sanitize_query("`cmd`");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["cmd"]);
}
18710
/// For `| rm -rf /`, `sanitize_query` keeps `rm` and `-rf` but drops the pipe
/// and slash; `normalize_term_parts` additionally drops the hyphen.
#[test]
fn special_char_shell_pipe_rm() {
    let payload = "| rm -rf /";
    let sanitized = sanitize_query(payload);
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for kept in ["rm", "-rf"] {
        assert!(words.contains(&kept));
    }
    assert_eq!(normalize_term_parts(payload), vec!["rm", "rf"]);
    for removed in ['|', '/'] {
        assert!(!sanitized.contains(removed));
    }
}
18721
/// A semicolon-chained shell command keeps its words and loses the semicolons.
#[test]
fn special_char_shell_semicolon_chain() {
    let sanitized = sanitize_query("test; echo pwned; cat /etc/passwd");
    let words: Vec<&str> = sanitized.split_whitespace().collect();

    for kept in ["test", "echo", "pwned"] {
        assert!(words.contains(&kept));
    }
    assert!(!sanitized.contains(';'));
}
18731
/// A NUL byte between two words acts as a separator.
#[test]
fn special_char_null_byte_mid_string() {
    let sanitized = sanitize_query("test\x00hidden");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["test", "hidden"]);
}
18740
/// Leading NUL bytes leave only the word once the result is trimmed.
#[test]
fn special_char_null_byte_leading() {
    let sanitized = sanitize_query("\x00\x00attack");
    let trimmed = sanitized.trim();
    assert_eq!(trimmed, "attack");
}
18746
/// Trailing NUL bytes leave only the word once the result is trimmed.
#[test]
fn special_char_null_byte_trailing() {
    let sanitized = sanitize_query("query\x00\x00\x00");
    let trimmed = sanitized.trim();
    assert_eq!(trimmed, "query");
}
18752
/// A query containing a NUL byte must still parse to a non-empty token list.
#[test]
fn special_char_null_byte_parse() {
    let tokens = parse_boolean_query("test\x00hidden");
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Null bytes should not prevent parsing: {tokens:?}"
    );
}
18761
/// A newline between two words leaves two whitespace-separated words.
#[test]
fn special_char_control_newline() {
    let sanitized = sanitize_query("line1\nline2");
    let words: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(words, vec!["line1", "line2"]);
}
18770
/// Tabs and CRLF sequences separate words after sanitizing.
#[test]
fn special_char_control_tab_cr() {
    let cleaned = sanitize_query("tab\there\r\nend");
    let words = cleaned.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["tab", "here", "end"]);
}
18777
/// Tabs and newlines delimit terms for the boolean parser.
#[test]
fn special_char_control_parse_whitespace() {
    let tokens = parse_boolean_query("hello\tworld\ntest");

    // Keep only plain terms, in the order the parser produced them.
    let mut terms: Vec<&str> = Vec::new();
    for token in &tokens {
        if let QueryToken::Term(text) = token {
            terms.push(text.as_str());
        }
    }

    assert_eq!(terms, vec!["hello", "world", "test"]);
}
18790
/// BEL and an ANSI escape sequence are removed; the surrounding text survives.
#[test]
fn special_char_control_bell_escape() {
    let cleaned = sanitize_query("test\x07\x1b[31mred");
    let words: Vec<&str> = cleaned.split_whitespace().collect();
    for expected in ["test", "31mred"] {
        assert!(words.contains(&expected));
    }
}
18798
/// Named HTML entities lose their `&` and `;` delimiters, leaving the entity
/// names and the tag name as separate plain words.
#[test]
fn special_char_html_entity_lt() {
    // The input must be the entity-encoded form: a literal `<script>` contains
    // no `lt`/`gt` text, so it could never produce the words asserted below.
    let sanitized = sanitize_query("&lt;script&gt;");
    let parts: Vec<&str> = sanitized.split_whitespace().collect();
    assert_eq!(parts, vec!["lt", "script", "gt"]);
}
18807
/// Numeric (hex) HTML entities lose their `&#` and `;` delimiters, leaving the
/// hex payloads and the tag name as separate plain words.
#[test]
fn special_char_html_numeric_entity() {
    // The input must be the entity-encoded form: a literal `<script>` contains
    // no `x3C`/`x3E` text, so it could never produce the words asserted below.
    let sanitized = sanitize_query("&#x3C;script&#x3E;");
    let parts: Vec<&str> = sanitized.split_whitespace().collect();
    assert!(parts.contains(&"x3C"));
    assert!(parts.contains(&"script"));
    assert!(parts.contains(&"x3E"));
}
18816
/// A script-tag payload still exposes its tag name, call name and argument as words.
#[test]
fn special_char_html_tags_stripped() {
    let cleaned = sanitize_query("<script>alert('xss')</script>");
    let words: Vec<&str> = cleaned.split_whitespace().collect();
    for expected in ["script", "alert", "xss"] {
        assert!(words.contains(&expected));
    }
}
18825
/// An `<img>` tag with attributes exposes its tag and attribute names as words.
#[test]
fn special_char_html_attribute() {
    let cleaned = sanitize_query("<img src=\"evil.js\" onerror=\"alert(1)\">");
    let words: Vec<&str> = cleaned.split_whitespace().collect();
    for expected in ["img", "src", "onerror"] {
        assert!(words.contains(&expected));
    }
}
18834
/// Percent signs are removed; the hex digits stay attached to the following text.
#[test]
fn special_char_url_percent_encoding() {
    let cleaned = sanitize_query("%20space%2Fslash");
    let words = cleaned.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["20space", "2Fslash"]);
}
18843
/// A percent-encoded NUL (`%00`) is not decoded: only the `%` is removed.
#[test]
fn special_char_url_null_byte_encoded() {
    let cleaned = sanitize_query("test%00hidden");
    let words = cleaned.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["test", "00hidden"]);
}
18850
/// URL query-string punctuation (`?`, `=`, `&`) separates the words.
#[test]
fn special_char_url_full_query_string() {
    let cleaned = sanitize_query("search?q=hello&lang=en");
    let words = cleaned.split_whitespace().collect::<Vec<&str>>();
    assert_eq!(words, vec!["search", "q", "hello", "lang", "en"]);
}
18857
/// Explaining a SQL-injection style query yields at least one term or phrase.
#[test]
fn special_char_explain_sql_injection() {
    let explanation = QueryExplanation::analyze("'OR 1=1--", &SearchFilters::default());
    let parsed = &explanation.parsed;

    let nothing_parsed = parsed.terms.is_empty() && parsed.phrases.is_empty();
    assert!(
        !nothing_parsed,
        "SQL injection should produce parseable terms"
    );
}
18869
/// Explaining a `$(...)` command-substitution query yields at least one term.
#[test]
fn special_char_explain_shell_injection() {
    let explanation = QueryExplanation::analyze("$(rm -rf /)", &SearchFilters::default());
    let parsed_terms = &explanation.parsed.terms;
    assert!(
        !parsed_terms.is_empty(),
        "Shell injection should produce parseable terms"
    );
}
18879
/// Explaining a script-tag XSS payload yields at least one term.
#[test]
fn special_char_explain_html_xss() {
    let payload = "<script>alert('xss')</script>";
    let default_filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze(payload, &default_filters);

    let has_terms = !explanation.parsed.terms.is_empty();
    assert!(has_terms, "XSS payload should produce parseable terms");
}
18889
/// Every token `QueryTermsLower` extracts from an injection string is purely alphanumeric.
#[test]
fn special_char_terms_lower_injection() {
    let lowered = QueryTermsLower::from_query("'; DROP TABLE--");
    for token in lowered.tokens() {
        let only_alphanumeric = token.chars().all(char::is_alphanumeric);
        assert!(
            only_alphanumeric,
            "Token should only contain alphanumeric characters: {token}"
        );
    }
}
18901
/// `QueryTermsLower` yields both words found on either side of a NUL byte.
#[test]
fn special_char_terms_lower_null_bytes() {
    let lowered = QueryTermsLower::from_query("test\x00hidden");
    let words: Vec<&str> = lowered.tokens().collect();
    for expected in ["test", "hidden"] {
        assert!(words.contains(&expected));
    }
}
18909
/// `AND` and `NOT` are still recognized when injection text sits between them.
#[test]
fn special_char_boolean_with_injection() {
    let tokens = parse_boolean_query("search AND 'OR 1=1-- NOT drop");

    let saw_and = tokens.iter().any(|t| matches!(t, QueryToken::And));
    assert!(
        saw_and,
        "Boolean AND should still be recognized: {tokens:?}"
    );

    let saw_not = tokens.iter().any(|t| matches!(t, QueryToken::Not));
    assert!(
        saw_not,
        "Boolean NOT should still be recognized: {tokens:?}"
    );
}
18922
/// A 100,000-byte query is sanitized and parsed in under one second each.
#[test]
fn stress_query_100k_chars_completes_quickly() {
    use std::time::{Duration, Instant};

    let budget = Duration::from_secs(1);
    let long_query = "a ".repeat(50000);
    assert_eq!(long_query.len(), 100000);

    // Time the two stages separately so a regression points at the right one.
    let sanitize_clock = Instant::now();
    let sanitized = sanitize_query(&long_query);
    let elapsed_sanitize = sanitize_clock.elapsed();

    let parse_clock = Instant::now();
    let tokens = parse_boolean_query(&sanitized);
    let elapsed_parse = parse_clock.elapsed();

    assert!(
        elapsed_sanitize < budget,
        "sanitize_query with 100k chars took {:?} (>1s)",
        elapsed_sanitize
    );
    assert!(
        elapsed_parse < budget,
        "parse_boolean_query with 100k chars took {:?} (>1s)",
        elapsed_parse
    );
    assert!(!tokens.is_empty(), "100k char query should produce tokens");
}
18954
/// 1000 distinct terms are sanitized and parsed within a second, keeping at least 900 terms.
#[test]
fn stress_query_1000_terms() {
    use std::time::{Duration, Instant};

    let query = (0..1000)
        .map(|i| format!("word{}", i))
        .collect::<Vec<String>>()
        .join(" ");

    let clock = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "1000 terms query took {:?} (>1s)",
        elapsed
    );

    let mut term_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert!(
        term_count >= 900,
        "Expected ~1000 terms, got {} terms",
        term_count
    );
}
18982
/// 1000 repetitions of one word are all preserved by the parser and by `QueryTermsLower`.
#[test]
fn stress_query_1000_identical_terms() {
    use std::time::{Duration, Instant};

    let query = "test ".repeat(1000);

    let clock = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "1000 identical terms query took {:?} (>1s)",
        elapsed
    );

    let mut parsed_term_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Term(_)) {
            parsed_term_count += 1;
        }
    }
    assert_eq!(parsed_term_count, 1000, "Parser should produce 1000 terms");

    let lowered = QueryTermsLower::from_query(&query);
    let tokens_lower: Vec<&str> = lowered.tokens().collect();
    assert_eq!(
        tokens_lower.len(),
        1000,
        "All 1000 identical terms should be preserved"
    );
    let any_other_word = tokens_lower.iter().any(|t| *t != "test");
    assert!(!any_other_word, "All tokens should be 'test'");
}
19019
/// A single 10,000-character word parses quickly into exactly one full-length term.
#[test]
fn stress_query_10k_char_single_term() {
    use std::time::{Duration, Instant};

    let long_term = "a".repeat(10000);

    let clock = Instant::now();
    let sanitized = sanitize_query(&long_term);
    let tokens = parse_boolean_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "10k char single term took {:?} (>1s)",
        elapsed
    );
    assert_eq!(tokens.len(), 1, "Should produce exactly one token");

    let is_full_length_term = if let QueryToken::Term(text) = &tokens[0] {
        text.len() == 10000
    } else {
        false
    };
    assert!(is_full_length_term, "Expected Term token");
}
19041
/// 100 levels of parentheses around one word sanitize and parse within 100ms to one term.
#[test]
fn stress_deeply_nested_parentheses() {
    use std::time::{Duration, Instant};

    let depth = 100;
    let query = format!("{}test{}", "(".repeat(depth), ")".repeat(depth));

    let clock = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_millis(100),
        "Deeply nested parens took {:?} (>100ms)",
        elapsed
    );

    let mut term_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert_eq!(term_count, 1, "Should have 1 term after sanitizing parens");
}
19067
/// 101 terms joined by `AND` parse within a second into 100 operators and 101 terms.
#[test]
fn stress_many_boolean_operators() {
    use std::time::{Duration, Instant};

    let query = (0..101)
        .map(|i| format!("term{}", i))
        .collect::<Vec<String>>()
        .join(" AND ");

    let clock = Instant::now();
    let tokens = parse_boolean_query(&query);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "100+ boolean ops took {:?} (>1s)",
        elapsed
    );

    // Tally both token kinds in a single pass.
    let (mut and_count, mut term_count) = (0usize, 0usize);
    for token in &tokens {
        match token {
            QueryToken::And => and_count += 1,
            QueryToken::Term(_) => term_count += 1,
            _ => {}
        }
    }

    assert_eq!(and_count, 100, "Should have 100 AND operators");
    assert_eq!(term_count, 101, "Should have 101 terms");
}
19096
/// 101 terms joined by `OR` parse within a second into 100 `OR` operators.
#[test]
fn stress_many_or_operators() {
    use std::time::{Duration, Instant};

    let query = (0..101)
        .map(|i| format!("opt{}", i))
        .collect::<Vec<String>>()
        .join(" OR ");

    let clock = Instant::now();
    let tokens = parse_boolean_query(&query);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "100+ OR ops took {:?} (>1s)",
        elapsed
    );

    let mut or_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Or) {
            or_count += 1;
        }
    }
    assert_eq!(or_count, 100, "Should have 100 OR operators");
}
19119
/// A long run of interleaved `AND`/`OR`/`NOT` operators parses within a second.
#[test]
fn stress_mixed_boolean_operators() {
    use std::time::{Duration, Instant};

    let query = "a AND b OR c NOT d AND e OR f NOT g ".repeat(50);

    let clock = Instant::now();
    let tokens = parse_boolean_query(&query);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "Mixed boolean ops took {:?} (>1s)",
        elapsed
    );
    let produced_tokens = !tokens.is_empty();
    assert!(
        produced_tokens,
        "Complex boolean query should produce tokens"
    );
}
19139
/// Sanitizing a 100k-character word never grows it, and it stays a single token.
#[test]
fn stress_memory_bounds_large_query() {
    let large_query = "x".repeat(100000);
    let input_len = large_query.len();

    let sanitized = sanitize_query(&large_query);
    let tokens = parse_boolean_query(&sanitized);

    let output_len = sanitized.len();
    assert!(
        output_len <= input_len,
        "Sanitized output should not exceed input size"
    );

    assert_eq!(tokens.len(), 1);

    let lowered = QueryTermsLower::from_query(&large_query);
    assert_eq!(
        lowered.tokens().count(),
        1,
        "Should be 1 token of 100k chars"
    );
}
19164
/// 100 threads each sanitize, parse and lower-case their own query; all must yield tokens.
#[test]
fn stress_concurrent_queries() {
    let mut workers = Vec::with_capacity(100);
    for i in 0..100 {
        let query = format!("concurrent_query_{} test search", i);
        workers.push(std::thread::spawn(move || {
            let sanitized = sanitize_query(&query);
            let parsed_len = parse_boolean_query(&sanitized).len();
            let lowered_len = QueryTermsLower::from_query(&query).tokens().count();
            (parsed_len, lowered_len)
        }));
    }

    // Join in spawn order so the index in a failure message names the query.
    for (i, worker) in workers.into_iter().enumerate() {
        let (token_len, qt_len) = worker.join().expect("Thread panicked");
        assert!(token_len > 0, "Query {} should produce tokens", i);
        assert!(qt_len > 0, "Query {} QueryTermsLower should have tokens", i);
    }
}
19191
/// 50 quoted phrases joined by `AND` parse within a second into 50 phrase tokens.
#[test]
fn stress_many_quoted_phrases() {
    use std::time::{Duration, Instant};

    let query = (0..50)
        .map(|i| format!("\"phrase number {}\"", i))
        .collect::<Vec<String>>()
        .join(" AND ");

    let clock = Instant::now();
    let tokens = parse_boolean_query(&query);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "50 quoted phrases took {:?} (>1s)",
        elapsed
    );

    let mut phrase_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Phrase(_)) {
            phrase_count += 1;
        }
    }
    assert_eq!(phrase_count, 50, "Should have 50 phrases");
}
19216
/// 100 words, alternately quoted and bare, parse into 50 phrases and 50 terms.
#[test]
fn stress_alternating_quotes() {
    use std::time::{Duration, Instant};

    // Even indices are quoted, odd indices are bare words.
    let query = (0..100)
        .map(|i| match i % 2 {
            0 => format!("\"word{}\"", i),
            _ => format!("word{}", i),
        })
        .collect::<Vec<String>>()
        .join(" ");

    let clock = Instant::now();
    let tokens = parse_boolean_query(&query);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "100 alternating quotes took {:?} (>1s)",
        elapsed
    );

    let (mut phrase_count, mut term_count) = (0usize, 0usize);
    for token in &tokens {
        match token {
            QueryToken::Phrase(_) => phrase_count += 1,
            QueryToken::Term(_) => term_count += 1,
            _ => {}
        }
    }

    assert_eq!(phrase_count, 50, "Should have 50 phrases");
    assert_eq!(term_count, 50, "Should have 50 terms");
}
19253
/// 100 wildcard patterns (cycling through seven shapes) sanitize and parse within a second.
#[test]
fn stress_many_wildcards() {
    use std::time::{Duration, Instant};

    let patterns = ["pre*", "*suf", "*sub*", "a*b", "test*", "*ing", "*tion*"];
    let query = (0..100usize)
        .map(|i| patterns[i % patterns.len()])
        .collect::<Vec<&str>>()
        .join(" ");

    let clock = Instant::now();
    let sanitized = sanitize_query(&query);
    let tokens = parse_boolean_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "100 wildcards took {:?} (>1s)",
        elapsed
    );
    assert!(!tokens.is_empty());
}
19278
/// Explaining a 100-term query finishes within two seconds and reports parsed terms.
#[test]
fn stress_query_explanation_large_query() {
    use std::time::{Duration, Instant};

    let query = (0..100)
        .map(|i| format!("term{}", i))
        .collect::<Vec<String>>()
        .join(" ");
    let filters = SearchFilters::default();

    let clock = Instant::now();
    let explanation = QueryExplanation::analyze(&query, &filters);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(2),
        "QueryExplanation for 100 terms took {:?} (>2s)",
        elapsed
    );
    let has_terms = !explanation.parsed.terms.is_empty();
    assert!(has_terms, "Should parse terms successfully");
}
19300
/// One quoted phrase of 500 words parses within a second into a single phrase token.
#[test]
fn stress_very_long_single_quoted_phrase() {
    use std::time::{Duration, Instant};

    let inner = (0..500)
        .map(|i| format!("word{}", i))
        .collect::<Vec<String>>()
        .join(" ");
    let phrase = format!("\"{}\"", inner);

    let clock = Instant::now();
    let tokens = parse_boolean_query(&phrase);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "500-word phrase took {:?} (>1s)",
        elapsed
    );

    let mut phrase_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Phrase(_)) {
            phrase_count += 1;
        }
    }
    assert_eq!(phrase_count, 1, "Should have exactly 1 phrase");
}
19323
/// 100 `-term` prefixed words parse within a second into 100 `NOT` operators.
#[test]
fn stress_not_prefix_many() {
    use std::time::{Duration, Instant};

    let query = (0..100)
        .map(|i| format!("-term{}", i))
        .collect::<Vec<String>>()
        .join(" ");

    let clock = Instant::now();
    let tokens = parse_boolean_query(&query);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "100 NOT prefixes took {:?} (>1s)",
        elapsed
    );

    let mut not_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Not) {
            not_count += 1;
        }
    }
    assert_eq!(not_count, 100, "Should have 100 NOT operators");
}
19346
/// A large unbroken run of CJK text sanitizes and tokenizes within a second, non-empty.
#[test]
fn stress_unicode_large_cjk_query() {
    use std::time::{Duration, Instant};

    let cjk_chars = "中文日本語한국어".repeat(1000);

    let clock = Instant::now();
    let sanitized = sanitize_query(&cjk_chars);
    let lowered = QueryTermsLower::from_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "Large CJK query took {:?} (>1s)",
        elapsed
    );
    let has_tokens = !lowered.is_empty();
    assert!(has_tokens, "CJK query should produce tokens");
}
19364
/// A query made only of emoji and spaces is processed quickly and yields no tokens.
#[test]
fn stress_unicode_many_emoji() {
    use std::time::{Duration, Instant};

    let emoji_query = "🚀 🔍 📝 💻 🎯 ".repeat(500);

    let clock = Instant::now();
    let sanitized = sanitize_query(&emoji_query);
    let tokens = parse_boolean_query(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "Emoji query took {:?} (>1s)",
        elapsed
    );
    let token_free = tokens.is_empty();
    assert!(token_free, "Emoji-only query should produce no tokens");
}
19386
/// Runs the sanitizer, the boolean parser and `QueryTermsLower` over a large
/// blob that mixes code, SQL, prose with CJK characters, a quoted error
/// message and a URL, and checks that all of it finishes within two seconds.
#[test]
fn stress_mixed_content_large() {
    // One multi-line sample, repeated 100 times to build the large input.
    let mixed = r#"
function test() { return x + y; }
SELECT * FROM users WHERE id = 1;
The quick brown fox 狐狸 jumps over lazy dog
Error: "undefined is not a function" at line 42
https://example.com/path?query=value&other=123
"#
    .repeat(100);

    // The timer covers all three processing steps together.
    let start = std::time::Instant::now();
    let sanitized = sanitize_query(&mixed);
    let tokens = parse_boolean_query(&sanitized);
    // Note: the lower-cased terms are built from the raw input, not the sanitized text.
    let qt = QueryTermsLower::from_query(&mixed);
    let elapsed = start.elapsed();

    assert!(
        elapsed < std::time::Duration::from_secs(2),
        "Mixed content query took {:?} (>2s)",
        elapsed
    );
    // Both the parser and the lower-cased term list must find something.
    assert!(!tokens.is_empty());
    assert!(!qt.is_empty());
}
19413
/// Emoji embedded inside a word: the raw parser keeps the word as one token,
/// while the sanitizer replaces the emoji with spaces.
#[test]
fn unicode_emoji_mixed_with_alphanumeric() {
    // Unsanitized input has no whitespace, so the parser sees a single token.
    let tokens = parse_boolean_query("rocket🚀launch");
    assert_eq!(tokens.len(), 1);

    // A single emoji is replaced by a single space.
    let sanitized = sanitize_query("rocket🚀launch");
    assert_eq!(sanitized, "rocket launch");

    // Two adjacent emoji are two replaced characters. The gap is built
    // explicitly so the expected width cannot be lost to whitespace
    // normalization of this source file.
    // NOTE(review): assumes one space per replaced character, consistent with
    // the untrimmed " search" / "complete " expectations in sibling tests —
    // confirm against `sanitize_query`.
    let sanitized2 = sanitize_query("test🔥🎯code");
    assert_eq!(sanitized2, format!("test{}code", " ".repeat(2)));
}
19433
/// Emoji glued to a word do not hide the terms or the boolean operators around them.
#[test]
fn unicode_emoji_with_boolean_operators() {
    let and_tokens = parse_boolean_query("🚀code AND test");
    let mut term_count = 0usize;
    for token in &and_tokens {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert!(term_count >= 1, "Should have at least one term");

    let or_tokens = parse_boolean_query("deploy OR 🎯target");
    let mut has_or = false;
    for token in &or_tokens {
        if matches!(token, QueryToken::Or) {
            has_or = true;
            break;
        }
    }
    assert!(has_or, "Should detect OR operator");
}
19450
/// An emoji at either edge of a word becomes an untrimmed space; emoji-only input trims to nothing.
#[test]
fn unicode_emoji_at_word_boundaries() {
    // (input, exact sanitized output) — the edge space is intentionally kept.
    let edge_cases = [("🔍search", " search"), ("complete✅", "complete ")];
    for (input, expected) in edge_cases {
        assert_eq!(sanitize_query(input), expected);
    }

    let emoji_only = sanitize_query("🎉🎊🎁");
    let trimmed = emoji_only.trim();
    assert!(
        trimmed.is_empty(),
        "Emoji-only should be empty after trimming"
    );
}
19468
/// Arabic text passes through the sanitizer unchanged and still parses into tokens.
#[test]
fn unicode_arabic_text_preserved() {
    let arabic = "مرحبا بالعالم";

    let cleaned = sanitize_query(arabic);
    assert_eq!(
        cleaned, arabic,
        "Arabic alphanumeric chars should be preserved"
    );

    let parsed = parse_boolean_query(arabic);
    let has_tokens = !parsed.is_empty();
    assert!(has_tokens, "Arabic query should produce tokens");
}
19484
/// Hebrew text passes through the sanitizer unchanged and still parses into tokens.
#[test]
fn unicode_hebrew_text_preserved() {
    let hebrew = "שלום עולם";

    let cleaned = sanitize_query(hebrew);
    assert_eq!(
        cleaned, hebrew,
        "Hebrew alphanumeric chars should be preserved"
    );

    let parsed = parse_boolean_query(hebrew);
    let has_tokens = !parsed.is_empty();
    assert!(has_tokens, "Hebrew query should produce tokens");
}
19498
/// A query mixing Latin and Arabic words is preserved verbatim and parses into three terms.
#[test]
fn unicode_mixed_rtl_and_ltr() {
    let mixed = "hello مرحبا world";

    let cleaned = sanitize_query(mixed);
    assert_eq!(cleaned, mixed, "Mixed RTL/LTR should be preserved");

    let mut term_count = 0usize;
    for token in &parse_boolean_query(mixed) {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert_eq!(term_count, 3, "Should have 3 terms");
}
19513
/// `AND` and `NOT` are recognized between Hebrew and Arabic operands.
#[test]
fn unicode_rtl_with_boolean_operators() {
    let hebrew_tokens = parse_boolean_query("שלום AND עולם");
    let has_and = hebrew_tokens
        .iter()
        .any(|token| matches!(token, QueryToken::And));
    assert!(has_and, "Should detect AND operator in Hebrew query");

    let arabic_tokens = parse_boolean_query("مرحبا NOT بالعالم");
    let has_not = arabic_tokens
        .iter()
        .any(|token| matches!(token, QueryToken::Not));
    assert!(has_not, "Should detect NOT operator in Arabic query");
}
19528
/// Each backslash in a path-like query is replaced by a space.
#[test]
fn special_chars_backslash_stripped() {
    let cleaned = sanitize_query(r"path\to\file");
    assert_eq!(cleaned, "path to file");
}
19538
/// Double quotes survive sanitizing even when the input precedes them with backslashes.
#[test]
fn special_chars_escaped_quotes_handling() {
    let cleaned = sanitize_query(r#"say \"hello\""#);
    let keeps_quotes = cleaned.contains('"');
    assert!(keeps_quotes, "Quotes should be preserved");
}
19547
/// A Windows path is broken into its components: the drive colon and every
/// backslash are replaced by spaces.
#[test]
fn special_chars_windows_paths() {
    let path = r"C:\Users\test\Documents";
    let sanitized = sanitize_query(path);

    // `C:\` contributes two replaced characters (`:` and `\`), so the drive
    // letter is followed by a two-space gap. The gap is built explicitly so
    // its width cannot be lost to whitespace normalization of this file.
    // NOTE(review): assumes one space per replaced character, consistent with
    // the untrimmed " search" / "complete " expectations in sibling tests —
    // confirm against `sanitize_query`.
    let expected = format!("C{}Users test Documents", " ".repeat(2));
    assert_eq!(sanitized, expected);
}
19555
/// A five-operand chain yields exactly two `AND`s, one `OR` and one `NOT`.
#[test]
fn boolean_deeply_nested_operators() {
    let tokens = parse_boolean_query("a AND b OR c NOT d AND e");

    let and_count = tokens
        .iter()
        .filter(|token| matches!(token, QueryToken::And))
        .count();
    let or_count = tokens
        .iter()
        .filter(|token| matches!(token, QueryToken::Or))
        .count();
    let not_count = tokens
        .iter()
        .filter(|token| matches!(token, QueryToken::Not))
        .count();

    assert_eq!(and_count, 2, "Should have 2 AND operators");
    assert_eq!(or_count, 1, "Should have 1 OR operator");
    assert_eq!(not_count, 1, "Should have 1 NOT operator");
}
19580
/// A doubled `AND` does not swallow the terms on either side of it.
#[test]
fn boolean_consecutive_operators_degenerate() {
    let tokens = parse_boolean_query("foo AND AND bar");

    let mut term_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert!(
        term_count >= 2,
        "Should have at least 2 terms (foo and bar)"
    );
}
19595
/// A query that begins with `AND` or `OR` still emits that operator token.
#[test]
fn boolean_operator_at_start() {
    let leading_and = parse_boolean_query("AND foo");
    assert!(
        leading_and.iter().any(|t| matches!(t, QueryToken::And)),
        "Leading AND should be detected"
    );

    let leading_or = parse_boolean_query("OR test");
    assert!(
        leading_or.iter().any(|t| matches!(t, QueryToken::Or)),
        "Leading OR should be detected"
    );
}
19607
/// A query that ends with `AND` still emits that operator token.
#[test]
fn boolean_operator_at_end() {
    let trailing_and = parse_boolean_query("foo AND");
    assert!(
        trailing_and.iter().any(|t| matches!(t, QueryToken::And)),
        "Trailing AND should be detected"
    );
}
19615
/// A digits-only query is a single term and is left untouched by the sanitizer.
#[test]
fn numeric_query_digits_only() {
    let digits = "12345";

    let tokens = parse_boolean_query(digits);
    assert_eq!(tokens.len(), 1);
    assert_eq!(
        tokens.first(),
        Some(&QueryToken::Term(String::from("12345")))
    );

    assert_eq!(sanitize_query(digits), "12345");
}
19628
/// Numbers mixed with words parse into at least three terms.
#[test]
fn numeric_query_with_text() {
    let tokens = parse_boolean_query("error 404 not found");

    let mut term_count = 0usize;
    for token in &tokens {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert!(term_count >= 3, "Should have at least 3 terms");
}
19640
/// The dots in a version number are replaced by spaces.
#[test]
fn numeric_versions_with_dots() {
    let cleaned = sanitize_query("version 1.2.3");
    assert_eq!(cleaned, "version 1 2 3");
}
19647
/// Tab characters split a query into separate terms.
#[test]
fn whitespace_tabs_treated_as_separators() {
    let mut term_count = 0usize;
    for token in &parse_boolean_query("foo\tbar\tbaz") {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert_eq!(term_count, 3, "Tabs should separate terms");
}
19659
/// Newline characters split a query into separate terms.
#[test]
fn whitespace_newlines_treated_as_separators() {
    let mut term_count = 0usize;
    for token in &parse_boolean_query("foo\nbar\nbaz") {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert_eq!(term_count, 3, "Newlines should separate terms");
}
19669
/// Runs of mixed spaces, tabs and newlines split a query into four terms.
#[test]
fn whitespace_mixed_types() {
    let mut term_count = 0usize;
    for token in &parse_boolean_query("a \t b \n c d") {
        if matches!(token, QueryToken::Term(_)) {
            term_count += 1;
        }
    }
    assert_eq!(term_count, 4, "Mixed whitespace should separate properly");
}
19679
/// An unsanitized 10,000-character word parses within a second into one full-length term.
#[test]
fn stress_very_long_single_term() {
    use std::time::{Duration, Instant};

    let long_term = "a".repeat(10_000);

    let clock = Instant::now();
    let tokens = parse_boolean_query(&long_term);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "10K char term took {:?} (>1s)",
        elapsed
    );
    assert_eq!(tokens.len(), 1);

    let is_full_length_term =
        matches!(tokens.iter().next(), Some(QueryToken::Term(t)) if t.len() == 10_000);
    assert!(
        is_full_length_term,
        "Expected 10K Term token, got {tokens:?}"
    );
}
19702
/// A 6,000-character word ending in `*` sanitizes and parses quickly as a prefix pattern.
#[test]
fn stress_very_long_term_with_wildcard() {
    use std::time::{Duration, Instant};

    let mut long_pattern = "prefix".repeat(1000);
    long_pattern.push('*');

    let clock = Instant::now();
    let sanitized = sanitize_query(&long_pattern);
    let pattern = WildcardPattern::parse(&sanitized);
    let elapsed = clock.elapsed();

    assert!(
        elapsed < Duration::from_secs(1),
        "Long wildcard pattern took {:?} (>1s)",
        elapsed
    );
    let is_prefix = matches!(pattern, WildcardPattern::Prefix(_));
    assert!(is_prefix, "Should parse as prefix pattern");
}
19723
/// The empty string is classified as `QueryType::Empty`.
#[test]
fn query_explanation_empty_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("", &filters);
    assert_eq!(explanation.query_type, QueryType::Empty);
}
19731
/// The default `SearchMode` is `Hybrid`.
#[test]
fn search_mode_default_is_hybrid_preferred() {
    let default_mode = SearchMode::default();
    assert_eq!(default_mode, SearchMode::Hybrid);
}
19736
/// A query made only of whitespace is classified as `QueryType::Empty`.
#[test]
fn query_explanation_whitespace_only_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze(" \t\n ", &filters);
    assert_eq!(explanation.query_type, QueryType::Empty);
}
19742
/// A query mixing Japanese and Latin text produces parsed terms.
#[test]
fn query_explanation_unicode_query() {
    let filters = SearchFilters::default();
    let explanation = QueryExplanation::analyze("日本語 search", &filters);
    let has_terms = !explanation.parsed.terms.is_empty();
    assert!(has_terms);
}
19749
/// Accented uppercase letters are lower-cased along with ASCII ones.
#[test]
fn query_terms_lower_unicode_normalization() {
    let lowered = QueryTermsLower::from_query("CAFÉ RÉSUMÉ");
    let query_lower = &lowered.query_lower;
    assert_eq!(*query_lower, "café résumé");
}
19758
/// Latin words on either side of Japanese text are lower-cased in `query_lower`.
#[test]
fn query_terms_lower_mixed_case_unicode() {
    let lowered = QueryTermsLower::from_query("Hello日本語World");
    for expected in ["hello", "world"] {
        assert!(lowered.query_lower.contains(expected));
    }
}
19767
/// Digits embedded in a word are kept while the letters are lower-cased.
#[test]
fn query_terms_lower_preserves_numbers() {
    let lowered = QueryTermsLower::from_query("ABC123XYZ");
    let query_lower = &lowered.query_lower;
    assert_eq!(*query_lower, "abc123xyz");
}
19773
/// An asterisk in the middle of a word parses as a `Complex` pattern.
#[test]
fn wildcard_pattern_internal_asterisk() {
    let is_complex = matches!(WildcardPattern::parse("f*o"), WildcardPattern::Complex(_));
    assert!(is_complex, "Internal asterisk should be Complex");
}
19785
/// Several asterisks inside a word still parse as a `Complex` pattern.
#[test]
fn wildcard_pattern_multiple_internal_asterisks() {
    let is_complex = matches!(
        WildcardPattern::parse("a*b*c"),
        WildcardPattern::Complex(_)
    );
    assert!(is_complex, "Multiple internal asterisks should be Complex");
}
19795
/// When a regex is produced for `*foo.bar*`, the literal dot must appear escaped.
#[test]
fn wildcard_pattern_regex_escapes_special_chars() {
    let maybe_regex = WildcardPattern::parse("*foo.bar*").to_regex();

    // NOTE(review): nothing is asserted when `to_regex` returns `None`, so this
    // test passes vacuously in that case — confirm that is intended.
    if let Some(regex) = maybe_regex {
        let dot_is_escaped = regex.contains("\\.");
        assert!(dot_is_escaped, "Dot should be escaped in regex: {}", regex);
    }
}
19808
/// When a regex is produced for `f*o*o`, it contains `.*` for the internal wildcards.
#[test]
fn wildcard_pattern_complex_regex_generation() {
    let maybe_regex = WildcardPattern::parse("f*o*o").to_regex();

    // NOTE(review): nothing is asserted when `to_regex` returns `None`, so this
    // test passes vacuously in that case — confirm that is intended.
    if let Some(regex) = maybe_regex {
        let has_dot_star = regex.contains(".*");
        assert!(
            has_dot_star,
            "Should have .* for internal wildcards: {}",
            regex
        );
    }
}
19821
/// Table-driven check of `transpile_to_fts5`: each row pairs a query with the
/// exact FTS5 string it must produce, or `None` when no translation is expected.
#[test]
fn test_transpile_to_fts5() {
    let cases: [(&str, Option<&str>); 17] = [
        // Bare terms are joined with AND.
        ("foo bar", Some("foo AND bar")),
        // Explicit operators, including a leading OR and a leading NOT.
        ("foo AND bar", Some("foo AND bar")),
        ("foo OR bar", Some("(foo OR bar)")),
        ("OR foo", Some("foo")),
        ("NOT foo", None),
        // OR groups are parenthesized next to AND.
        ("A AND B OR C", Some("A AND (B OR C)")),
        ("A OR B AND C", Some("(A OR B) AND C")),
        ("A OR B OR C", Some("(A OR B OR C)")),
        // Quoted phrases pass through unchanged.
        ("\"foo bar\"", Some("\"foo bar\"")),
        // Only trailing wildcards are translated.
        ("foo*", Some("foo*")),
        ("*foo", None),
        ("f*o", None),
        // Punctuated terms are split into AND-ed parts.
        ("foo-bar", Some("(foo AND bar)")),
        ("foo-bar*", Some("(foo AND bar*)")),
        ("br-123.jsonl", Some("(br AND 123 AND jsonl)")),
        ("br-123.json*", Some("(br AND 123 AND json*)")),
        // A leading NOT combined with OR is rejected.
        ("NOT A OR B", None),
    ];

    for (query, expected) in cases {
        assert_eq!(transpile_to_fts5(query), expected.map(str::to_string));
    }
}
19896
/// A pipe-delimited semantic doc id parses back into each of its numeric fields.
#[test]
fn semantic_doc_id_roundtrip_from_query() {
    let doc_id = format!(
        "m|42|2|3|7|11|1|1700000000000|{hash_hex}",
        hash_hex = "00".repeat(32)
    );

    let fields = parse_semantic_doc_id(&doc_id).expect("roundtrip parse");

    assert_eq!(fields.message_id, 42);
    assert_eq!(fields.chunk_idx, 2);
    assert_eq!(fields.agent_id, 3);
    assert_eq!(fields.workspace_id, 7);
    assert_eq!(fields.source_id, 11);
    assert_eq!(fields.role, 1);
    assert_eq!(fields.created_at_ms, 1_700_000_000_000);
}
19910
/// A fully populated `SemanticFilter` accepts a matching doc id and rejects ids
/// with a wrong agent, an out-of-range timestamp, or an unparseable format.
#[test]
fn semantic_filter_applies_all_constraints() {
    use frankensearch::core::filter::SearchFilter;

    let filter = SemanticFilter {
        agents: Some(HashSet::from([3])),
        workspaces: Some(HashSet::from([7])),
        sources: Some(HashSet::from([11])),
        roles: Some(HashSet::from([1])),
        created_from: Some(1_700_000_000_000),
        created_to: Some(1_700_000_000_100),
    };

    // (doc id, whether the filter should accept it)
    let cases = [
        ("m|42|2|3|7|11|1|1700000000001", true),
        ("m|42|2|99|7|11|1|1700000000001", false),
        ("m|42|2|3|7|11|1|1699999999999", false),
        ("not-a-doc-id", false),
    ];
    for (doc_id, should_match) in cases {
        assert_eq!(filter.matches(doc_id, None), should_match);
    }
}
19929
/// End-to-end check: writes a two-record vector index to a temp directory,
/// reopens it, and runs a top-k search restricted to agent `1`. Only the
/// record whose doc id carries agent `1` may come back.
#[test]
fn fs_semantic_index_runs_filtered_search() -> Result<()> {
    let scratch = TempDir::new()?;
    let fsvi_path = crate::search::vector_index::vector_index_path(scratch.path(), "embed-fast");
    if let Some(dir) = fsvi_path.parent() {
        std::fs::create_dir_all(dir)?;
    }

    // Two doc ids that differ in message, agent, workspace and source segments.
    let hash_a = "00".repeat(32);
    let hash_b = "11".repeat(32);
    let doc_a = format!("m|101|0|1|10|100|1|1700000000001|{hash_a}");
    let doc_b = format!("m|202|0|2|20|200|1|1700000000002|{hash_b}");

    // Only the agent constraint is active; every other dimension is open.
    let agent_only = SemanticFilter {
        agents: Some(HashSet::from([1])),
        workspaces: None,
        sources: None,
        roles: None,
        created_from: None,
        created_to: None,
    };

    // Write phase. The `2` is presumably the embedding dimension, matching
    // the two-element vectors written below.
    let mut index_writer = VectorIndex::create_with_revision(
        &fsvi_path,
        "embed-fast",
        "rev-1",
        2,
        frankensearch::index::Quantization::F16,
    )
    .map_err(|err| anyhow!("create fsvi index failed: {err}"))?;
    index_writer
        .write_record(&doc_a, &[1.0, 0.0])
        .map_err(|err| anyhow!("write_record failed: {err}"))?;
    index_writer
        .write_record(&doc_b, &[0.0, 1.0])
        .map_err(|err| anyhow!("write_record failed: {err}"))?;
    index_writer
        .finish()
        .map_err(|err| anyhow!("finish fsvi index failed: {err}"))?;

    // Read phase: reopen from disk and query with the vector stored for `doc_a`.
    let reopened =
        VectorIndex::open(&fsvi_path).map_err(|err| anyhow!("open fsvi failed: {err}"))?;
    let search_filter =
        semantic_filter_as_search_filter(&agent_only).expect("expected active filter");
    let hits = reopened
        .search_top_k(&[1.0, 0.0], 5, Some(search_filter))
        .map_err(|err| anyhow!("frankensearch search failed: {err}"))?;

    assert_eq!(hits.len(), 1);
    let survivor = parse_semantic_doc_id(&hits[0].doc_id).expect("parse bridged doc_id");
    assert_eq!(survivor.message_id, 101);
    assert_eq!(survivor.agent_id, 1);
    Ok(())
}
19981
/// A hit whose `content` and `snippet` are both empty must not be classified
/// as noise by `hit_is_noise`, whether the query is non-empty or empty.
#[test]
fn hit_is_noise_returns_false_when_content_and_snippet_both_empty() {
    let projection_only = SearchHit {
        // Text-bearing fields are all left empty on purpose.
        title: String::new(),
        snippet: String::new(),
        content: String::new(),
        content_hash: 0,
        // Identity and ranking.
        conversation_id: Some(1),
        score: 1.0,
        match_type: MatchType::Exact,
        line_number: Some(1),
        created_at: Some(1_700_000_000_000),
        // Provenance.
        source_path: String::from("/tmp/session.jsonl"),
        agent: String::from("codex"),
        workspace: String::new(),
        workspace_original: None,
        source_id: String::from("local"),
        origin_kind: String::from("local"),
        origin_host: None,
    };

    let cases = [
        (
            "anything",
            "hit with empty content AND snippet (projection-only) must NOT be classified as noise",
        ),
        (
            "",
            "noise classifier must not treat an empty-query projection-only hit as noise",
        ),
    ];
    for &(query, reason) in &cases {
        assert!(!hit_is_noise(&projection_only, query), "{}", reason);
    }
}
20026
/// Counterpart to the empty-content case: when `content` is the bare
/// acknowledgement `"ok"`, `hit_is_noise` must still report the hit as noise.
#[test]
fn hit_is_noise_still_drops_tool_acknowledgement_when_content_present() {
    let tool_ack = SearchHit {
        // The only non-empty text field is the acknowledgement itself.
        content: String::from("ok"),
        title: String::new(),
        snippet: String::new(),
        content_hash: 0,
        // Identity and ranking.
        conversation_id: Some(1),
        score: 1.0,
        match_type: MatchType::Exact,
        line_number: Some(1),
        created_at: Some(1_700_000_000_000),
        // Provenance.
        source_path: String::from("/tmp/session.jsonl"),
        agent: String::from("codex"),
        workspace: String::new(),
        workspace_original: None,
        source_id: String::from("local"),
        origin_kind: String::from("local"),
        origin_host: None,
    };

    let classified_as_noise = hit_is_noise(&tool_ack, "");
    assert!(
        classified_as_noise,
        "bare tool-ack 'ok' with content present should still be dropped as noise"
    );
}
20057}