1use std::collections::HashSet;
13use std::path::{Path, PathBuf};
14use std::sync::Arc;
15
16use crate::search::embedder::Embedder;
17use crate::search::fastembed_embedder::FastEmbedder;
18use crate::search::hash_embedder::HashEmbedder;
19use crate::search::model_download::{
20 ModelAcquisitionPolicy, ModelCacheState, ModelManifest, classify_model_cache,
21 classify_model_cache_metadata,
22};
23use crate::search::policy::{CliSemanticOverrides, SemanticPolicy};
24use crate::search::semantic_manifest::{
25 SemanticShardManifest, SemanticShardRecord, TierKind, semantic_shard_artifact_path_is_safe,
26};
27use crate::search::vector_index::{
28 ROLE_ASSISTANT, ROLE_USER, SemanticFilterMaps, VectorIndex, vector_index_path,
29};
30use crate::storage::sqlite::FrankenStorage;
31
/// Availability of semantic search, as surfaced to callers and status displays.
///
/// Every variant maps to a short badge via [`SemanticAvailability::status_label`]
/// and to a one-line description via [`SemanticAvailability::summary`].
#[derive(Debug, Clone)]
pub enum SemanticAvailability {
    /// Semantic search is usable with the named embedder.
    Ready { embedder_id: String },

    /// No model is installed.
    NotInstalled,

    /// Consent is required before the model can be downloaded.
    NeedsConsent,

    /// The model is being downloaded.
    Downloading {
        /// Completed percentage.
        progress_pct: u8,
        /// Bytes downloaded so far.
        bytes_downloaded: u64,
        /// Total bytes of the download.
        total_bytes: u64,
    },

    /// The model checksum is being verified.
    Verifying,

    /// The vector index is being built.
    IndexBuilding {
        /// Embedder the index is built for.
        embedder_id: String,
        /// Completed percentage, when one is known.
        progress_pct: Option<u8>,
        /// Items indexed so far.
        items_indexed: u64,
        /// Total items to index.
        total_items: u64,
    },

    /// The hash-based fallback embedder is in use.
    HashFallback,

    /// Semantic search is disabled, with a human-readable reason.
    Disabled { reason: String },

    /// Model files are missing from the model directory.
    ModelMissing {
        /// Directory the model was looked up in.
        model_dir: PathBuf,
        /// Names of the files that were not found.
        missing_files: Vec<String>,
    },

    /// The vector index file does not exist at the expected path.
    IndexMissing { index_path: PathBuf },

    /// The database could not be opened.
    DatabaseUnavailable { db_path: PathBuf, error: String },

    /// Loading failed; `context` describes what went wrong.
    LoadFailed { context: String },

    /// The installed model revision differs from the latest one.
    UpdateAvailable {
        /// Embedder the update applies to.
        embedder_id: String,
        /// Revision currently installed.
        current_revision: String,
        /// Revision that is available.
        latest_revision: String,
    },
}

impl SemanticAvailability {
    /// Returns `true` for [`Self::Ready`].
    pub fn is_ready(&self) -> bool {
        matches!(self, Self::Ready { .. })
    }

    /// Returns `true` for [`Self::UpdateAvailable`].
    pub fn has_update(&self) -> bool {
        matches!(self, Self::UpdateAvailable { .. })
    }

    /// Returns `true` for [`Self::IndexBuilding`].
    pub fn is_building(&self) -> bool {
        matches!(self, Self::IndexBuilding { .. })
    }

    /// Returns `true` for [`Self::Downloading`].
    pub fn is_downloading(&self) -> bool {
        matches!(self, Self::Downloading { .. })
    }

    /// Returns `true` for [`Self::NeedsConsent`].
    pub fn needs_consent(&self) -> bool {
        matches!(self, Self::NeedsConsent)
    }

    /// Returns `true` for [`Self::HashFallback`].
    pub fn is_hash_fallback(&self) -> bool {
        matches!(self, Self::HashFallback)
    }

    /// Returns `true` for [`Self::Disabled`].
    pub fn is_disabled(&self) -> bool {
        matches!(self, Self::Disabled { .. })
    }

    /// Returns `true` when the model is absent: [`Self::NotInstalled`] or
    /// [`Self::ModelMissing`].
    pub fn is_not_installed(&self) -> bool {
        matches!(self, Self::NotInstalled | Self::ModelMissing { .. })
    }

    /// Returns `true` for the failure variants [`Self::LoadFailed`] and
    /// [`Self::DatabaseUnavailable`].
    pub fn is_error(&self) -> bool {
        matches!(
            self,
            Self::LoadFailed { .. } | Self::DatabaseUnavailable { .. }
        )
    }

    /// Returns `true` when a semantic query can run: [`Self::Ready`] or
    /// [`Self::HashFallback`].
    pub fn can_search(&self) -> bool {
        self.is_ready() || self.is_hash_fallback()
    }

    /// Returns `(progress_pct, bytes_downloaded, total_bytes)` while downloading,
    /// `None` in every other state.
    pub fn download_progress(&self) -> Option<(u8, u64, u64)> {
        if let Self::Downloading {
            progress_pct,
            bytes_downloaded,
            total_bytes,
        } = self
        {
            Some((*progress_pct, *bytes_downloaded, *total_bytes))
        } else {
            None
        }
    }

    /// Returns `(progress_pct, items_indexed, total_items)` while the index is
    /// being built, `None` in every other state.
    pub fn index_progress(&self) -> Option<(Option<u8>, u64, u64)> {
        if let Self::IndexBuilding {
            progress_pct,
            items_indexed,
            total_items,
            ..
        } = self
        {
            Some((*progress_pct, *items_indexed, *total_items))
        } else {
            None
        }
    }

    /// Short fixed badge for the state.
    ///
    /// The match is exhaustive on purpose, so adding a variant forces a label
    /// decision at compile time.
    pub fn status_label(&self) -> &'static str {
        match self {
            Self::Ready { .. } => "SEM",
            Self::HashFallback => "SEM*",
            // Both states share the same badge.
            Self::NotInstalled | Self::NeedsConsent => "LEX",
            Self::Downloading { .. } => "DL...",
            Self::Verifying => "VFY...",
            Self::IndexBuilding { .. } => "IDX...",
            Self::Disabled { .. } => "OFF",
            Self::ModelMissing { .. } => "NOMODEL",
            Self::IndexMissing { .. } => "NOIDX",
            Self::DatabaseUnavailable { .. } => "NODB",
            Self::LoadFailed { .. } => "ERR",
            Self::UpdateAvailable { .. } => "UPD",
        }
    }

    /// One-line human-readable description of the state.
    pub fn summary(&self) -> String {
        // Divisor used to express byte counts in the "MB" figures below.
        const BYTES_PER_UNIT: f64 = 1_048_576.0;

        match self {
            Self::Ready { embedder_id } => format!("semantic ready ({embedder_id})"),
            Self::NotInstalled => String::from("model not installed"),
            Self::NeedsConsent => String::from("consent required for model download"),
            Self::Downloading {
                progress_pct,
                bytes_downloaded,
                total_bytes,
            } => {
                let mb_done = *bytes_downloaded as f64 / BYTES_PER_UNIT;
                let mb_total = *total_bytes as f64 / BYTES_PER_UNIT;
                format!("downloading model: {progress_pct}% ({mb_done:.1}/{mb_total:.1} MB)")
            }
            Self::Verifying => String::from("verifying model checksum"),
            Self::IndexBuilding {
                progress_pct,
                items_indexed,
                total_items,
                ..
            } => match progress_pct {
                Some(pct) => format!("building index: {pct}% ({items_indexed}/{total_items})"),
                None => format!("building index: {items_indexed}/{total_items}"),
            },
            Self::HashFallback => String::from("using hash-based fallback"),
            Self::Disabled { reason } => format!("semantic disabled: {reason}"),
            Self::ModelMissing { model_dir, .. } => {
                format!("model missing at {}", model_dir.display())
            }
            Self::IndexMissing { index_path } => {
                format!("vector index missing at {}", index_path.display())
            }
            // Only the error text is shown; the database path is not part of the summary.
            Self::DatabaseUnavailable { error, .. } => format!("db unavailable ({error})"),
            Self::LoadFailed { context } => format!("semantic load failed ({context})"),
            Self::UpdateAvailable {
                current_revision,
                latest_revision,
                ..
            } => format!("update available: {current_revision} -> {latest_revision}"),
        }
    }
}
272
/// Loaded resources handed to callers once semantic search is usable: the
/// embedder together with the vector index (or shard indexes) it searches.
pub struct SemanticContext {
    /// Embedder paired with the loaded indexes.
    pub embedder: Arc<dyn Embedder>,
    /// Primary vector index. The loaders in this file put the first shard here
    /// when a complete shard generation was loaded, otherwise the monolithic
    /// index file.
    pub index: VectorIndex,
    /// Shards after the first one; empty when the monolithic index is used.
    pub additional_indexes: Vec<VectorIndex>,
    /// Filter maps built from the storage database.
    pub filter_maps: SemanticFilterMaps,
    /// Role codes attached to the context. The loaders in this file always set
    /// `Some({ROLE_USER, ROLE_ASSISTANT})`.
    /// NOTE(review): presumably restricts search to these roles — confirm with
    /// the query code.
    pub roles: Option<HashSet<u8>>,
}
280
/// Result of attempting to load semantic search: the resulting availability
/// state plus, when loading succeeded, the context to search with.
pub struct SemanticSetup {
    /// State the load attempt ended in.
    pub availability: SemanticAvailability,
    /// Loaded context. The loaders in this file set it to `None` on every
    /// early-return path and to `Some` only alongside `Ready` / `HashFallback`.
    pub context: Option<SemanticContext>,
}
285
286fn semantic_sidecar_path(data_dir: &Path, recorded_path: &str) -> Option<PathBuf> {
287 semantic_shard_artifact_path_is_safe(recorded_path).then(|| data_dir.join(recorded_path))
288}
289
290fn matching_complete_shard_records(
291 data_dir: &Path,
292 tier: TierKind,
293 embedder_id: &str,
294 db_fingerprint: &str,
295) -> Result<Option<Vec<SemanticShardRecord>>, String> {
296 let manifest = match SemanticShardManifest::load(data_dir) {
297 Ok(Some(manifest)) => manifest,
298 Ok(None) => return Ok(None),
299 Err(err) => return Err(format!("semantic shard manifest: {err}")),
300 };
301 let summary = manifest.summary(tier, embedder_id, db_fingerprint);
302 if !summary.complete {
303 return Ok(None);
304 }
305
306 let mut records = manifest
307 .shards
308 .into_iter()
309 .filter(|shard| shard.matches_generation(tier, embedder_id, db_fingerprint))
310 .collect::<Vec<_>>();
311 records.sort_by_key(|shard| shard.shard_index);
312 if records.len() != usize::try_from(summary.shard_count).unwrap_or(usize::MAX) {
313 return Ok(None);
314 }
315
316 let Some(first) = records.first() else {
317 return Ok(None);
318 };
319 for (expected_index, shard) in records.iter().enumerate() {
320 if shard.shard_index != u32::try_from(expected_index).unwrap_or(u32::MAX)
321 || !shard.ready
322 || !shard.mmap_ready
323 || shard.model_revision != first.model_revision
324 || shard.schema_version != crate::search::policy::SEMANTIC_SCHEMA_VERSION
325 || shard.chunking_version != crate::search::policy::CHUNKING_STRATEGY_VERSION
326 || shard.dimension == 0
327 || shard.dimension != first.dimension
328 || shard.total_conversations != first.total_conversations
329 {
330 return Ok(None);
331 }
332 let Some(path) = semantic_sidecar_path(data_dir, &shard.index_path) else {
333 return Ok(None);
334 };
335 if !path.is_file() {
336 return Ok(None);
337 }
338 }
339
340 Ok(Some(records))
341}
342
343fn load_complete_shard_indexes(
344 data_dir: &Path,
345 embedder_id: &str,
346 db_fingerprint: &str,
347) -> Result<Option<Vec<VectorIndex>>, String> {
348 for tier in [TierKind::Quality, TierKind::Fast] {
349 let Some(records) =
350 matching_complete_shard_records(data_dir, tier, embedder_id, db_fingerprint)?
351 else {
352 continue;
353 };
354
355 let mut indexes = Vec::with_capacity(records.len());
356 for shard in records {
357 let Some(path) = semantic_sidecar_path(data_dir, &shard.index_path) else {
358 return Ok(None);
359 };
360 let index = VectorIndex::open(&path)
361 .map_err(|err| format!("semantic shard vector index {}: {err}", path.display()))?;
362 if index.embedder_id() != embedder_id || index.dimension() != shard.dimension {
363 return Err(format!(
364 "semantic shard vector index {} metadata mismatch",
365 path.display()
366 ));
367 }
368 indexes.push(index);
369 }
370 if !indexes.is_empty() {
371 tracing::info!(
372 tier = tier.as_str(),
373 embedder = embedder_id,
374 shard_count = indexes.len(),
375 "loaded complete semantic shard generation"
376 );
377 return Ok(Some(indexes));
378 }
379 }
380
381 Ok(None)
382}
383
384fn complete_shard_generation_candidate_exists(data_dir: &Path, embedder_id: &str) -> bool {
385 let manifest = match SemanticShardManifest::load(data_dir) {
386 Ok(Some(manifest)) => manifest,
387 Ok(None) => return false,
388 Err(err) => {
389 tracing::debug!(
390 error = %err,
391 embedder = embedder_id,
392 "semantic shard candidate probe could not load manifest"
393 );
394 return false;
395 }
396 };
397
398 let mut candidates = std::collections::HashSet::new();
399 for shard in manifest
400 .shards
401 .iter()
402 .filter(|shard| shard.embedder_id == embedder_id)
403 {
404 candidates.insert((shard.tier, shard.db_fingerprint.as_str()));
405 }
406
407 candidates
408 .into_iter()
409 .any(|(tier, db_fingerprint)| manifest.summary(tier, embedder_id, db_fingerprint).complete)
410}
411
412fn load_complete_shard_indexes_for_current_db(
413 data_dir: &Path,
414 db_path: &Path,
415 embedder_id: &str,
416 context_label: &'static str,
417) -> Option<Vec<VectorIndex>> {
418 let db_fingerprint = match crate::indexer::lexical_storage_fingerprint_for_db(db_path) {
419 Ok(fingerprint) => fingerprint,
420 Err(err) => {
421 tracing::debug!(
422 error = %err,
423 embedder = embedder_id,
424 context = context_label,
425 "semantic shard context unavailable: failed to fingerprint current DB"
426 );
427 return None;
428 }
429 };
430
431 match load_complete_shard_indexes(data_dir, embedder_id, &db_fingerprint) {
432 Ok(indexes) => indexes,
433 Err(err) => {
434 tracing::debug!(
435 error = %err,
436 embedder = embedder_id,
437 context = context_label,
438 "semantic shard context unavailable"
439 );
440 None
441 }
442 }
443}
444
/// Loads the semantic context for the embedder named by the active semantic
/// policy.
///
/// Delegates to [`load_semantic_context_for_embedder`], so model revision
/// mismatches are reported as `UpdateAvailable`.
pub fn load_semantic_context(data_dir: &Path, db_path: &Path) -> SemanticSetup {
    load_semantic_context_for_embedder(data_dir, db_path, active_policy_embedder_name())
}
452
/// Loads the semantic context for an explicitly named embedder.
///
/// Calls `load_semantic_context_inner` with update checking enabled, so an
/// incompatible cached model revision is reported as `UpdateAvailable` instead
/// of being loaded.
pub fn load_semantic_context_for_embedder(
    data_dir: &Path,
    db_path: &Path,
    embedder_name: &str,
) -> SemanticSetup {
    // `true` = check_for_updates.
    load_semantic_context_inner(data_dir, db_path, true, embedder_name)
}
460
/// Probes semantic availability for the embedder named by the active semantic
/// policy, without building a [`SemanticContext`].
///
/// Delegates to [`probe_semantic_availability_for_embedder`].
pub(crate) fn probe_semantic_availability(data_dir: &Path) -> SemanticAvailability {
    probe_semantic_availability_for_embedder(data_dir, active_policy_embedder_name())
}
467
468pub(crate) fn probe_semantic_availability_for_embedder(
469 data_dir: &Path,
470 embedder_name: &str,
471) -> SemanticAvailability {
472 let canonical_name = FastEmbedder::canonical_name(embedder_name).unwrap_or("minilm");
473 let Some(config) = FastEmbedder::config_for(canonical_name) else {
474 return SemanticAvailability::LoadFailed {
475 context: format!("unknown semantic embedder: {embedder_name}"),
476 };
477 };
478 let Some(model_dir) = FastEmbedder::runtime_model_dir_for(data_dir, canonical_name) else {
479 return SemanticAvailability::LoadFailed {
480 context: format!("no model directory mapping for semantic embedder: {embedder_name}"),
481 };
482 };
483 let manifest =
484 ModelManifest::for_embedder(canonical_name).unwrap_or_else(ModelManifest::minilm_v2);
485 let semantic_policy = SemanticPolicy::resolve(&CliSemanticOverrides::default());
486 let acquisition_policy = ModelAcquisitionPolicy::from_semantic_policy(&semantic_policy);
487 let cache_report = classify_model_cache_metadata(&model_dir, &manifest, &acquisition_policy);
488
489 if let Some(availability) =
490 semantic_availability_from_cache_state(&model_dir, &cache_report.state, true)
491 {
492 return availability;
493 }
494
495 let index_path = vector_index_path(data_dir, &config.embedder_id);
496 if !index_path.is_file() {
497 return SemanticAvailability::IndexMissing { index_path };
498 }
499
500 SemanticAvailability::Ready {
501 embedder_id: config.embedder_id,
502 }
503}
504
505pub(crate) fn probe_hash_semantic_availability(data_dir: &Path) -> SemanticAvailability {
507 let embedder = HashEmbedder::default();
508 let index_path = vector_index_path(data_dir, embedder.id());
509 if !index_path.is_file() {
510 SemanticAvailability::IndexMissing { index_path }
511 } else {
512 SemanticAvailability::HashFallback
513 }
514}
515
516pub fn load_hash_semantic_context(data_dir: &Path, db_path: &Path) -> SemanticSetup {
518 let embedder = HashEmbedder::default();
519 let index_path = vector_index_path(data_dir, embedder.id());
520 let monolithic_present = index_path.is_file();
521 let shard_indexes = if monolithic_present
522 || complete_shard_generation_candidate_exists(data_dir, embedder.id())
523 {
524 load_complete_shard_indexes_for_current_db(
525 data_dir,
526 db_path,
527 embedder.id(),
528 "hash semantic",
529 )
530 } else {
531 None
532 };
533 if !monolithic_present && shard_indexes.is_none() {
534 return SemanticSetup {
535 availability: SemanticAvailability::IndexMissing { index_path },
536 context: None,
537 };
538 }
539
540 let storage = match FrankenStorage::open_readonly(db_path) {
541 Ok(storage) => storage,
542 Err(err) => {
543 return SemanticSetup {
544 availability: SemanticAvailability::DatabaseUnavailable {
545 db_path: db_path.to_path_buf(),
546 error: err.to_string(),
547 },
548 context: None,
549 };
550 }
551 };
552
553 let filter_maps = match SemanticFilterMaps::from_storage(&storage) {
554 Ok(maps) => maps,
555 Err(err) => {
556 return SemanticSetup {
557 availability: SemanticAvailability::LoadFailed {
558 context: format!("filter maps: {err}"),
559 },
560 context: None,
561 };
562 }
563 };
564
565 let (index, additional_indexes) = if let Some(mut indexes) = shard_indexes {
566 let index = indexes.remove(0);
567 (index, indexes)
568 } else {
569 match VectorIndex::open(&index_path) {
570 Ok(index) => (index, Vec::new()),
571 Err(err) => {
572 return SemanticSetup {
573 availability: SemanticAvailability::LoadFailed {
574 context: format!("vector index: {err}"),
575 },
576 context: None,
577 };
578 }
579 }
580 };
581
582 let roles = Some(HashSet::from([ROLE_USER, ROLE_ASSISTANT]));
583 let embedder = Arc::new(embedder) as Arc<dyn Embedder>;
584
585 SemanticSetup {
586 availability: SemanticAvailability::HashFallback,
587 context: Some(SemanticContext {
588 embedder,
589 index,
590 additional_indexes,
591 filter_maps,
592 roles,
593 }),
594 }
595}
596
/// Loads the semantic context for the active policy embedder with update
/// checking disabled.
///
/// With the flag off, an incompatible cached model revision does not produce
/// `UpdateAvailable`; loading proceeds with the cached model instead.
pub fn load_semantic_context_no_version_check(data_dir: &Path, db_path: &Path) -> SemanticSetup {
    // `false` = check_for_updates.
    load_semantic_context_inner(data_dir, db_path, false, active_policy_embedder_name())
}
604
605fn load_semantic_context_inner(
606 data_dir: &Path,
607 db_path: &Path,
608 check_for_updates: bool,
609 embedder_name: &str,
610) -> SemanticSetup {
611 let canonical_name = FastEmbedder::canonical_name(embedder_name).unwrap_or("minilm");
612 let Some(config) = FastEmbedder::config_for(canonical_name) else {
613 return SemanticSetup {
614 availability: SemanticAvailability::LoadFailed {
615 context: format!("unknown semantic embedder: {embedder_name}"),
616 },
617 context: None,
618 };
619 };
620 let Some(model_dir) = FastEmbedder::runtime_model_dir_for(data_dir, canonical_name) else {
621 return SemanticSetup {
622 availability: SemanticAvailability::LoadFailed {
623 context: format!(
624 "no model directory mapping for semantic embedder: {embedder_name}"
625 ),
626 },
627 context: None,
628 };
629 };
630 let manifest =
631 ModelManifest::for_embedder(canonical_name).unwrap_or_else(ModelManifest::minilm_v2);
632 let semantic_policy = SemanticPolicy::resolve(&CliSemanticOverrides::default());
633 let acquisition_policy = ModelAcquisitionPolicy::from_semantic_policy(&semantic_policy);
634 let cache_report = classify_model_cache(&model_dir, &manifest, &acquisition_policy);
635
636 if let Some(availability) =
637 semantic_availability_from_cache_state(&model_dir, &cache_report.state, check_for_updates)
638 {
639 return SemanticSetup {
640 availability,
641 context: None,
642 };
643 }
644
645 let index_path = vector_index_path(data_dir, &config.embedder_id);
646 let monolithic_present = index_path.is_file();
647 let shard_indexes = if monolithic_present
648 || complete_shard_generation_candidate_exists(data_dir, &config.embedder_id)
649 {
650 load_complete_shard_indexes_for_current_db(
651 data_dir,
652 db_path,
653 &config.embedder_id,
654 "semantic",
655 )
656 } else {
657 None
658 };
659 if !monolithic_present && shard_indexes.is_none() {
660 return SemanticSetup {
661 availability: SemanticAvailability::IndexMissing { index_path },
662 context: None,
663 };
664 }
665
666 let storage = match FrankenStorage::open_readonly(db_path) {
667 Ok(storage) => storage,
668 Err(err) => {
669 return SemanticSetup {
670 availability: SemanticAvailability::DatabaseUnavailable {
671 db_path: db_path.to_path_buf(),
672 error: err.to_string(),
673 },
674 context: None,
675 };
676 }
677 };
678
679 let filter_maps = match SemanticFilterMaps::from_storage(&storage) {
680 Ok(maps) => maps,
681 Err(err) => {
682 return SemanticSetup {
683 availability: SemanticAvailability::LoadFailed {
684 context: format!("filter maps: {err}"),
685 },
686 context: None,
687 };
688 }
689 };
690
691 let (index, additional_indexes) = if let Some(mut indexes) = shard_indexes {
692 let index = indexes.remove(0);
693 (index, indexes)
694 } else {
695 match VectorIndex::open(&index_path) {
696 Ok(index) => (index, Vec::new()),
697 Err(err) => {
698 return SemanticSetup {
699 availability: SemanticAvailability::LoadFailed {
700 context: format!("vector index: {err}"),
701 },
702 context: None,
703 };
704 }
705 }
706 };
707
708 let embedder = match FastEmbedder::load_by_name(data_dir, canonical_name) {
709 Ok(embedder) => Arc::new(embedder) as Arc<dyn Embedder>,
710 Err(err) => {
711 return SemanticSetup {
712 availability: SemanticAvailability::LoadFailed {
713 context: format!("model load: {err}"),
714 },
715 context: None,
716 };
717 }
718 };
719
720 let roles = Some(HashSet::from([ROLE_USER, ROLE_ASSISTANT]));
721
722 SemanticSetup {
723 availability: SemanticAvailability::Ready {
724 embedder_id: embedder.id().to_string(),
725 },
726 context: Some(SemanticContext {
727 embedder,
728 index,
729 additional_indexes,
730 filter_maps,
731 roles,
732 }),
733 }
734}
735
736fn active_policy_embedder_name() -> &'static str {
737 let semantic_policy = SemanticPolicy::resolve(&CliSemanticOverrides::default());
738 FastEmbedder::canonical_name(&semantic_policy.quality_tier_embedder).unwrap_or("minilm")
739}
740
741fn semantic_availability_from_cache_state(
742 model_dir: &Path,
743 state: &ModelCacheState,
744 check_for_updates: bool,
745) -> Option<SemanticAvailability> {
746 match state {
747 ModelCacheState::Acquired { .. }
748 | ModelCacheState::PreseededLocal { .. }
749 | ModelCacheState::MirrorSourced { .. } => None,
750 ModelCacheState::IncompatibleVersion {
751 current_revision,
752 expected_revision,
753 } if check_for_updates => Some(SemanticAvailability::UpdateAvailable {
754 embedder_id: FastEmbedder::embedder_id_static().to_string(),
755 current_revision: current_revision.clone(),
756 latest_revision: expected_revision.clone(),
757 }),
758 ModelCacheState::IncompatibleVersion { .. } => None,
759 ModelCacheState::NotAcquired {
760 missing_files,
761 needs_consent,
762 } => {
763 if *needs_consent {
764 Some(SemanticAvailability::NeedsConsent)
765 } else {
766 Some(SemanticAvailability::ModelMissing {
767 model_dir: model_dir.to_path_buf(),
768 missing_files: missing_files.clone(),
769 })
770 }
771 }
772 ModelCacheState::Acquiring {
773 bytes_present,
774 total_bytes,
775 ..
776 } => {
777 let progress_pct = if *total_bytes == 0 {
778 0
779 } else {
780 ((*bytes_present as f64 / *total_bytes as f64) * 100.0).min(100.0) as u8
781 };
782 Some(SemanticAvailability::Downloading {
783 progress_pct,
784 bytes_downloaded: *bytes_present,
785 total_bytes: *total_bytes,
786 })
787 }
788 ModelCacheState::ChecksumMismatch {
789 file,
790 expected,
791 actual,
792 } => Some(SemanticAvailability::LoadFailed {
793 context: format!(
794 "model checksum mismatch for {file}: expected {expected}, got {actual}"
795 ),
796 }),
797 ModelCacheState::DisabledByPolicy { reason } => Some(SemanticAvailability::Disabled {
798 reason: reason.clone(),
799 }),
800 ModelCacheState::BudgetBlocked {
801 required_bytes,
802 max_bytes,
803 } => Some(SemanticAvailability::Disabled {
804 reason: format!(
805 "semantic model requires {required_bytes} bytes but policy allows {max_bytes}"
806 ),
807 }),
808 ModelCacheState::QuarantinedCorrupt {
809 marker_path,
810 reason,
811 } => Some(SemanticAvailability::LoadFailed {
812 context: format!(
813 "model cache quarantined at {}: {reason}",
814 marker_path.display()
815 ),
816 }),
817 ModelCacheState::OfflineBlocked { missing_files } => Some(SemanticAvailability::Disabled {
818 reason: format!(
819 "offline and semantic model is not acquired: missing {}",
820 missing_files.join(", ")
821 ),
822 }),
823 }
824}
825
826pub fn needs_index_rebuild(data_dir: &Path) -> bool {
834 let index_path = vector_index_path(data_dir, FastEmbedder::embedder_id_static());
835
836 if !index_path.is_file() {
837 return false;
839 }
840
841 match VectorIndex::open(&index_path) {
843 Ok(index) => {
844 let expected_id = FastEmbedder::embedder_id_static();
847 index.embedder_id() != expected_id
848 }
849 Err(_) => {
850 true
852 }
853 }
854}
855
856pub fn delete_vector_index_for_rebuild(data_dir: &Path) -> std::io::Result<bool> {
867 let index_path = vector_index_path(data_dir, FastEmbedder::embedder_id_static());
868
869 if index_path.is_file() {
870 std::fs::remove_file(&index_path)?;
871 Ok(true)
872 } else {
873 Ok(false)
874 }
875}
876
/// Returns the default model directory under `data_dir`, as defined by
/// `FastEmbedder::default_model_dir`.
pub fn default_model_dir(data_dir: &Path) -> PathBuf {
    FastEmbedder::default_model_dir(data_dir)
}
881
/// Returns the default model manifest, which is `ModelManifest::minilm_v2()`.
pub fn default_model_manifest() -> ModelManifest {
    ModelManifest::minilm_v2()
}
886
// Unit tests for the availability state helpers, sidecar path validation, the
// shard manifest probe, and hash-context loading with sharded indexes.
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::tempdir;

    // One table row: the state, its expected status label, and a predicate that
    // must hold for that state.
    type AvailabilityTuiCase = (
        SemanticAvailability,
        &'static str,
        fn(&SemanticAvailability) -> bool,
    );

    // `Ready` is searchable, has no update, and shows the "SEM" label.
    #[test]
    fn test_semantic_availability_ready() {
        let ready = SemanticAvailability::Ready {
            embedder_id: "test-123".into(),
        };
        assert!(ready.summary().contains("semantic ready"));
        assert!(ready.is_ready());
        assert!(!ready.has_update());
        assert!(ready.can_search());
        assert_eq!(ready.status_label(), "SEM");
    }

    // A plain relative shard path is joined onto the data dir; absolute paths,
    // parent-directory traversal and a leading `./` are all rejected.
    #[test]
    fn semantic_sidecar_path_rejects_paths_outside_data_dir() {
        let tmp = tempdir().unwrap();
        let safe = semantic_sidecar_path(tmp.path(), "vector_index/shards/hash/shard-0.fsvi")
            .expect("safe relative shard path");
        assert_eq!(
            safe,
            tmp.path().join("vector_index/shards/hash/shard-0.fsvi")
        );

        for unsafe_path in [
            // Absolute path.
            tmp.path()
                .join("outside.fsvi")
                .to_string_lossy()
                .to_string(),
            "../outside.fsvi".to_string(),
            "vector_index/../outside.fsvi".to_string(),
            "./vector_index/shards/hash/shard-0.fsvi".to_string(),
        ] {
            assert!(
                semantic_sidecar_path(tmp.path(), &unsafe_path).is_none(),
                "unsafe semantic sidecar path should be rejected: {unsafe_path}"
            );
        }
    }

    // `UpdateAvailable` is not ready, reports an update, and shows "UPD".
    #[test]
    fn test_semantic_availability_update() {
        let update = SemanticAvailability::UpdateAvailable {
            embedder_id: "test".into(),
            current_revision: "v1".into(),
            latest_revision: "v2".into(),
        };
        assert!(update.summary().contains("update available"));
        assert!(!update.is_ready());
        assert!(update.has_update());
        assert_eq!(update.status_label(), "UPD");
    }

    // `IndexBuilding` exposes its progress tuple and includes the percentage in
    // the summary when one is known.
    #[test]
    fn test_semantic_availability_index_building() {
        let building = SemanticAvailability::IndexBuilding {
            embedder_id: "test".into(),
            progress_pct: Some(45),
            items_indexed: 100,
            total_items: 200,
        };
        assert!(building.summary().contains("building index"));
        assert!(building.summary().contains("45%"));
        assert!(building.is_building());
        assert_eq!(building.status_label(), "IDX...");

        let (pct, done, total) = building.index_progress().unwrap();
        assert_eq!(pct, Some(45));
        assert_eq!(done, 100);
        assert_eq!(total, 200);
    }

    // `Downloading` exposes its progress tuple and percentage.
    #[test]
    fn test_semantic_availability_downloading() {
        let downloading = SemanticAvailability::Downloading {
            progress_pct: 50,
            bytes_downloaded: 10_000_000,
            total_bytes: 20_000_000,
        };
        assert!(downloading.is_downloading());
        assert!(downloading.summary().contains("downloading"));
        assert!(downloading.summary().contains("50%"));
        assert_eq!(downloading.status_label(), "DL...");

        let (pct, bytes, total) = downloading.download_progress().unwrap();
        assert_eq!(pct, 50);
        assert_eq!(bytes, 10_000_000);
        assert_eq!(total, 20_000_000);
    }

    // Table-driven check of the label and a state-specific predicate for the
    // remaining non-error states.
    #[test]
    fn test_semantic_availability_tui_states() {
        let cases: &[AvailabilityTuiCase] = &[
            (
                SemanticAvailability::NotInstalled,
                "LEX",
                SemanticAvailability::is_not_installed,
            ),
            (
                SemanticAvailability::NeedsConsent,
                "LEX",
                SemanticAvailability::needs_consent,
            ),
            (SemanticAvailability::Verifying, "VFY...", |state| {
                state.summary().contains("verifying")
            }),
            (SemanticAvailability::HashFallback, "SEM*", |state| {
                state.is_hash_fallback() && state.can_search()
            }),
            (
                SemanticAvailability::Disabled {
                    reason: "offline mode".into(),
                },
                "OFF",
                |state| state.is_disabled() && state.summary().contains("offline"),
            ),
        ];

        for (state, expected_label, predicate) in cases {
            assert_eq!(state.status_label(), *expected_label, "{state:?}");
            assert!(predicate(state), "{state:?}");
        }
    }

    // `LoadFailed` and `DatabaseUnavailable` both count as errors.
    #[test]
    fn test_semantic_availability_error_states() {
        let load_failed = SemanticAvailability::LoadFailed {
            context: "test error".into(),
        };
        assert!(load_failed.is_error());
        assert_eq!(load_failed.status_label(), "ERR");

        let db_unavail = SemanticAvailability::DatabaseUnavailable {
            db_path: PathBuf::from("/test"),
            error: "locked".into(),
        };
        assert!(db_unavail.is_error());
        assert_eq!(db_unavail.status_label(), "NODB");
    }

    // With no index file on disk there is nothing to rebuild.
    #[test]
    fn test_needs_index_rebuild_no_index() {
        let tmp = tempdir().unwrap();
        assert!(!needs_index_rebuild(tmp.path()));
    }

    // Deleting a non-existent index succeeds and reports that nothing was removed.
    #[test]
    fn test_delete_vector_index_no_file() {
        let tmp = tempdir().unwrap();
        let result = delete_vector_index_for_rebuild(tmp.path());
        assert!(result.is_ok());
        assert!(!result.unwrap());
    }

    // Fixture: writes a hash-embedder vector index at `path` holding
    // `record_count` records named `doc-{idx}`, all with the same vector (1.0 in
    // the first component, 0.0 elsewhere). Parent directories are created first.
    fn write_hash_vector_index(path: &Path, record_count: usize) {
        let embedder = HashEmbedder::default();
        if let Some(parent) = path.parent() {
            std::fs::create_dir_all(parent).expect("create vector index parent");
        }
        let mut writer = VectorIndex::create_with_revision(
            path,
            embedder.id(),
            "hash",
            embedder.dimension(),
            frankensearch::index::Quantization::F16,
        )
        .expect("create hash vector index");
        let mut vector = vec![0.0_f32; embedder.dimension()];
        vector[0] = 1.0;
        for idx in 0..record_count {
            writer
                .write_record(&format!("doc-{idx}"), &vector)
                .expect("write hash vector record");
        }
        writer.finish().expect("finish hash vector index");
    }

    // Fixture: builds a ready, mmap-ready manifest record for shard
    // `shard_index` of `shard_count`, using the current schema and chunking
    // versions. No file is created on disk for the recorded index path.
    fn semantic_shard_record(
        tier: TierKind,
        embedder_id: &str,
        db_fingerprint: &str,
        shard_index: u32,
        shard_count: u32,
    ) -> SemanticShardRecord {
        SemanticShardRecord {
            tier,
            embedder_id: embedder_id.to_string(),
            model_revision: "test-revision".to_string(),
            schema_version: crate::search::policy::SEMANTIC_SCHEMA_VERSION,
            chunking_version: crate::search::policy::CHUNKING_STRATEGY_VERSION,
            dimension: 384,
            shard_index,
            shard_count,
            doc_count: 1,
            total_conversations: 1,
            db_fingerprint: db_fingerprint.to_string(),
            index_path: format!("vector_index/shards/{embedder_id}/shard-{shard_index}.fsvi"),
            quantization: "f16".to_string(),
            mmap_ready: true,
            ann_index_path: None,
            ann_size_bytes: 0,
            ann_ready: false,
            size_bytes: 128,
            started_at_ms: 1_733_100_000_000,
            completed_at_ms: 1_733_100_000_000 + i64::from(shard_index),
            ready: true,
        }
    }

    // No manifest on disk: the probe reports no candidate.
    #[test]
    fn shard_candidate_probe_is_false_without_manifest() {
        let tmp = tempdir().unwrap();
        assert!(
            !complete_shard_generation_candidate_exists(tmp.path(), "fnv1a-384"),
            "missing shard manifest must not trigger a current-DB fingerprint"
        );
    }

    // A manifest file that is not valid JSON is treated as "no candidate"
    // rather than as an error.
    #[test]
    fn shard_candidate_probe_is_false_for_unreadable_manifest() {
        let tmp = tempdir().unwrap();
        let path = SemanticShardManifest::path(tmp.path());
        std::fs::create_dir_all(path.parent().expect("manifest parent"))
            .expect("create shard manifest dir");
        std::fs::write(&path, b"not json").expect("write invalid shard manifest");

        assert!(
            !complete_shard_generation_candidate_exists(tmp.path(), "fnv1a-384"),
            "corrupt shard metadata must not trigger a query-time current-DB fingerprint"
        );
    }

    // A complete generation for a different embedder and a 1-of-2 partial
    // generation for the requested embedder must both be ignored.
    #[test]
    fn shard_candidate_probe_ignores_other_or_incomplete_generations() {
        let tmp = tempdir().unwrap();
        let mut manifest = SemanticShardManifest {
            shards: vec![
                semantic_shard_record(TierKind::Fast, "other-384", "fp-other", 0, 1),
                semantic_shard_record(TierKind::Fast, "fnv1a-384", "fp-partial", 0, 2),
            ],
            ..Default::default()
        };
        manifest.save(tmp.path()).expect("save shard manifest");

        assert!(
            !complete_shard_generation_candidate_exists(tmp.path(), "fnv1a-384"),
            "incomplete or unrelated shard generations must not trigger a current-DB fingerprint"
        );
    }

    // Both shards of a 2-shard generation are recorded for the requested
    // embedder, so the probe reports a candidate.
    #[test]
    fn shard_candidate_probe_detects_complete_generation_for_embedder() {
        let tmp = tempdir().unwrap();
        let mut manifest = SemanticShardManifest {
            shards: vec![
                semantic_shard_record(TierKind::Fast, "fnv1a-384", "fp-current", 0, 2),
                semantic_shard_record(TierKind::Fast, "fnv1a-384", "fp-current", 1, 2),
            ],
            ..Default::default()
        };
        manifest.save(tmp.path()).expect("save shard manifest");

        assert!(
            complete_shard_generation_candidate_exists(tmp.path(), "fnv1a-384"),
            "complete candidate generations should allow the current-DB fingerprint check"
        );
    }

    // End-to-end: with both a monolithic hash index and a complete 2-shard
    // generation matching the current DB fingerprint on disk, the loader must
    // pick the shards (one primary + one additional index, two records total).
    #[test]
    fn load_hash_context_prefers_current_complete_shards_over_monolithic_file() {
        let tmp = tempdir().unwrap();
        let db_path = tmp.path().join("cass.db");
        // Create the database, then close it before fingerprinting.
        let storage = FrankenStorage::open(&db_path).expect("create cass db");
        drop(storage);
        let db_fingerprint = crate::indexer::lexical_storage_fingerprint_for_db(&db_path)
            .expect("fingerprint cass db");

        // Monolithic index with a single record.
        let embedder = HashEmbedder::default();
        write_hash_vector_index(&vector_index_path(tmp.path(), embedder.id()), 1);

        // Two shard files with one record each, plus their manifest records.
        let mut records = Vec::new();
        for shard_index in 0..2_u32 {
            let relative_path = format!("vector_index/shards/hash/shard-{shard_index}.fsvi");
            let shard_path = tmp.path().join(&relative_path);
            write_hash_vector_index(&shard_path, 1);
            records.push(SemanticShardRecord {
                tier: TierKind::Fast,
                embedder_id: embedder.id().to_string(),
                model_revision: "hash".to_string(),
                schema_version: crate::search::policy::SEMANTIC_SCHEMA_VERSION,
                chunking_version: crate::search::policy::CHUNKING_STRATEGY_VERSION,
                dimension: embedder.dimension(),
                shard_index,
                shard_count: 2,
                doc_count: 1,
                total_conversations: 1,
                db_fingerprint: db_fingerprint.clone(),
                index_path: relative_path,
                quantization: "f16".to_string(),
                mmap_ready: true,
                ann_index_path: None,
                ann_size_bytes: 0,
                ann_ready: false,
                size_bytes: std::fs::metadata(&shard_path)
                    .expect("stat hash shard")
                    .len(),
                started_at_ms: 1_733_100_000_000,
                completed_at_ms: 1_733_100_000_000 + i64::from(shard_index),
                ready: true,
            });
        }
        let mut manifest = SemanticShardManifest {
            shards: records,
            ..Default::default()
        };
        manifest.save(tmp.path()).expect("save shard manifest");

        let setup = load_hash_semantic_context(tmp.path(), &db_path);
        assert!(
            matches!(setup.availability, SemanticAvailability::HashFallback),
            "hash semantic availability should remain ready: {:?}",
            setup.availability
        );
        let context = setup
            .context
            .expect("complete current shards should load a semantic context");
        assert_eq!(
            context.additional_indexes.len(),
            1,
            "complete current shards must not be shadowed by an older monolithic vector file"
        );
        // Two records in total proves both shards were loaded, not the
        // single-record monolithic file.
        let loaded_records = context.index.record_count()
            + context
                .additional_indexes
                .iter()
                .map(VectorIndex::record_count)
                .sum::<usize>();
        assert_eq!(loaded_records, 2);
    }
}