use crate::{ChunkAndIndexWorker, EmbeddingWorker, ExtractionWorker, run_pending};
use orbok_cache::CacheService;
use orbok_core::{
FileStatus, HiddenFilePolicy, IndexMode, JobType, PersistenceMode, SourceType, SymlinkPolicy,
};
use orbok_db::Catalog;
use orbok_db::repo::{
EmbeddingRepository, FileRepository, IndexJobRepository, NewFile, NewSource, ObservedMetadata,
SourceRepository,
};
use orbok_search::{HybridSearchService, SearchMode, rrf_fuse};
use orbok_models::MockEmbeddingModel;
use rusqlite::params;
use std::fs;
fn setup(root: &std::path::Path) -> (Catalog, CacheService) {
let catalog = Catalog::open(root.join("catalog.sqlite3")).unwrap();
let cache = CacheService::new(root);
(catalog, cache)
}
fn seed_and_run(catalog: &Catalog, cache: &CacheService, root: &std::path::Path,
name: &str, content: &str) -> orbok_core::FileId {
let path = root.join(name);
fs::write(&path, content).unwrap();
let canonical = fs::canonicalize(&path).unwrap().to_string_lossy().to_string();
let root_canon = fs::canonicalize(root).unwrap().to_string_lossy().to_string();
let src = SourceRepository::new(catalog).insert(NewSource {
source_type: SourceType::File,
persistence_mode: PersistenceMode::Persistent,
display_name: Some(name.into()),
original_path: canonical.clone(),
canonical_path: root_canon,
index_mode: IndexMode::Balanced,
include_patterns: vec![],
exclude_patterns: vec![],
hidden_file_policy: HiddenFilePolicy::Exclude,
symlink_policy: SymlinkPolicy::Ignore,
max_file_size_bytes: None,
}).unwrap();
let file = FileRepository::new(catalog).insert(NewFile {
source_id: src.source_id.clone(),
original_path: canonical.clone(),
canonical_path: canonical.clone(),
display_path: name.into(),
extension: Some("md".into()),
metadata: ObservedMetadata {
file_size_bytes: content.len() as u64,
modified_at: Some("2026-01-01T00:00:00Z".into()),
platform_file_key: None,
content_hash: Some("abc".into()),
},
status: FileStatus::Discovered,
}).unwrap();
IndexJobRepository::new(catalog)
.enqueue(JobType::Extract, Some(&src.source_id), Some(&file.file_id))
.unwrap();
let extract = ExtractionWorker::new(catalog, cache);
let chunk = ChunkAndIndexWorker::new(catalog, cache);
run_pending(catalog, &extract, &chunk, None, 50).unwrap();
file.file_id
}
fn seed_mock_model(catalog: &Catalog) -> orbok_core::ModelId {
let model_id = orbok_core::ModelId::from_string("mock_mock-v1".to_string());
let now = "2026-01-01T00:00:00Z";
catalog.lock().execute(
"INSERT OR IGNORE INTO models (model_id, role, model_name, model_version, \
dimension, status, created_at, updated_at) VALUES (?1,'embedding','mock','v1',8,'available',?2,?2)",
params![model_id.as_str(), now],
).unwrap();
model_id
}
#[test]
fn embedding_worker_generates_and_stores_vectors() {
let dir = tempfile::tempdir().unwrap();
let (catalog, cache) = setup(dir.path());
let file_id = seed_and_run(&catalog, &cache, dir.path(), "doc.md",
"# Guide\n\nThis is important documentation.\n");
seed_mock_model(&catalog);
let embed = EmbeddingWorker::with_mock(&catalog, &cache);
embed.run(&file_id).unwrap();
let count = EmbeddingRepository::new(&catalog)
.count_active("mock_mock-v1").unwrap();
assert!(count > 0, "embeddings must be stored after worker run");
}
#[test]
fn vector_search_returns_nearest_candidate() {
let dir = tempfile::tempdir().unwrap();
let (catalog, cache) = setup(dir.path());
seed_and_run(&catalog, &cache, dir.path(), "a.md", "token expiry policy\n");
seed_and_run(&catalog, &cache, dir.path(), "b.md", "gardening tips and tricks\n");
seed_mock_model(&catalog);
let embed = EmbeddingWorker::with_mock(&catalog, &cache);
for fid in [
FileRepository::new(&catalog).get_by_path_str("a.md").unwrap(),
FileRepository::new(&catalog).get_by_path_str("b.md").unwrap(),
].into_iter().flatten() {
embed.run(&fid.file_id).unwrap();
}
let model = MockEmbeddingModel;
let service = HybridSearchService::with_model(&catalog, &model, "mock_mock-v1");
let results = service.search("token expiry", SearchMode::Conceptual, 5).unwrap();
assert!(!results.is_empty(), "vector search should return results");
assert!(results[0].badges.iter().any(|b| *b == orbok_search::MatchBadge::Semantic));
}
#[test]
fn rrf_fusion_combines_keyword_and_vector() {
use orbok_core::{ChunkId, FileId};
use orbok_models::VectorCandidate;
use orbok_search::KeywordCandidate;
let chunk_a = ChunkId::from_string("c_a".to_string());
let chunk_b = ChunkId::from_string("c_b".to_string());
let chunk_c = ChunkId::from_string("c_c".to_string());
let file = FileId::from_string("f_1".to_string());
let kw = vec![
KeywordCandidate { chunk_id: chunk_a.clone(), file_id: file.clone(), rank: 1, score: -1.0 },
KeywordCandidate { chunk_id: chunk_c.clone(), file_id: file.clone(), rank: 2, score: -2.0 },
];
let vc = vec![
VectorCandidate { chunk_id: chunk_b.clone(), file_id: file.clone(), rank: 1, score: 0.9 },
VectorCandidate { chunk_id: chunk_a.clone(), file_id: file.clone(), rank: 2, score: 0.7 },
];
let fused = rrf_fuse(&kw, &vc, 10);
assert_eq!(fused.len(), 3);
assert_eq!(fused[0].chunk_id.as_str(), "c_a");
assert!(fused[0].keyword_rank.is_some() && fused[0].vector_rank.is_some());
}
#[test]
fn keyword_only_mode_works_without_model() {
let dir = tempfile::tempdir().unwrap();
let (catalog, cache) = setup(dir.path());
seed_and_run(&catalog, &cache, dir.path(), "notes.md", "refresh token expiry\n");
let service = HybridSearchService::keyword_only(&catalog);
assert!(!service.is_hybrid());
let results = service.search("refresh", SearchMode::Auto, 10).unwrap();
assert!(!results.is_empty());
}
#[test]
fn model_change_marks_embeddings_stale() {
let dir = tempfile::tempdir().unwrap();
let (catalog, cache) = setup(dir.path());
let file_id = seed_and_run(&catalog, &cache, dir.path(), "doc.md", "some content\n");
seed_mock_model(&catalog);
EmbeddingWorker::with_mock(&catalog, &cache).run(&file_id).unwrap();
assert!(EmbeddingRepository::new(&catalog).count_active("mock_mock-v1").unwrap() > 0);
EmbeddingRepository::new(&catalog).mark_stale_for_model("mock_mock-v1").unwrap();
assert_eq!(EmbeddingRepository::new(&catalog).count_active("mock_mock-v1").unwrap(), 0);
}
#[test]
fn stale_chunks_excluded_from_vector_search() {
let dir = tempfile::tempdir().unwrap();
let (catalog, cache) = setup(dir.path());
let file_id = seed_and_run(&catalog, &cache, dir.path(), "doc.md", "secret information\n");
seed_mock_model(&catalog);
EmbeddingWorker::with_mock(&catalog, &cache).run(&file_id).unwrap();
catalog.lock().execute(
"UPDATE chunks SET chunk_status='stale'", []).unwrap();
let count = EmbeddingRepository::new(&catalog).count_active("mock_mock-v1").unwrap();
assert_eq!(count, 0, "stale chunks must not appear as active embeddings");
}
#[test]
fn deleting_embeddings_does_not_delete_catalog() {
let dir = tempfile::tempdir().unwrap();
let (catalog, cache) = setup(dir.path());
let file_id = seed_and_run(&catalog, &cache, dir.path(), "doc.md", "preserved source\n");
seed_mock_model(&catalog);
EmbeddingWorker::with_mock(&catalog, &cache).run(&file_id).unwrap();
catalog.lock().execute("DELETE FROM embeddings", []).unwrap();
assert!(FileRepository::new(&catalog).get_by_id(&file_id).unwrap().is_some());
}