use crate::{ChunkAndIndexWorker, EmbeddingWorker, ExtractionWorker, run_pending};
use orbok_cache::CacheService;
use orbok_core::{
FileStatus, HiddenFilePolicy, IndexMode, JobType, PersistenceMode, SourceType, SymlinkPolicy,
};
use orbok_db::Catalog;
use orbok_db::repo::{
EmbeddingRepository, FileRepository, IndexJobRepository, NewFile, NewSource, ObservedMetadata,
SourceRepository,
};
use orbok_models::MockEmbeddingModel;
use orbok_search::{HybridSearchService, SearchMode, rrf_fuse};
use rusqlite::params;
use std::fs;
/// Opens the catalog stored at `root/catalog.sqlite3` and builds a cache
/// service from `root`, returning both.
///
/// Panics if the catalog cannot be opened.
fn setup(root: &std::path::Path) -> (Catalog, CacheService) {
    let db_path = root.join("catalog.sqlite3");
    (Catalog::open(db_path).unwrap(), CacheService::new(root))
}
/// Writes `content` to `root/name`, inserts a source row and a file row for
/// it, enqueues an `Extract` job for that file, and then calls `run_pending`
/// with the extraction and chunk-and-index workers.
///
/// Returns the id of the inserted file. Every fallible step is unwrapped, so
/// any failure panics (this is a test helper).
fn seed_and_run(
    catalog: &Catalog,
    cache: &CacheService,
    root: &std::path::Path,
    name: &str,
    content: &str,
) -> orbok_core::FileId {
    // Canonicalize a path and render it as an owned (lossy) string.
    let canonical_string = |p: &std::path::Path| -> String {
        fs::canonicalize(p).unwrap().to_string_lossy().into_owned()
    };

    let target = root.join(name);
    fs::write(&target, content).unwrap();
    let file_path = canonical_string(&target);
    let root_path = canonical_string(root);

    // The source records the file's canonical path as its original path and
    // the canonical root directory as its canonical path.
    let new_source = NewSource {
        source_type: SourceType::File,
        persistence_mode: PersistenceMode::Persistent,
        display_name: Some(name.into()),
        original_path: file_path.clone(),
        canonical_path: root_path,
        index_mode: IndexMode::Balanced,
        include_patterns: vec![],
        exclude_patterns: vec![],
        hidden_file_policy: HiddenFilePolicy::Exclude,
        symlink_policy: SymlinkPolicy::Ignore,
        max_file_size_bytes: None,
    };
    let source = SourceRepository::new(catalog).insert(new_source).unwrap();

    // Fixed placeholder metadata; only the size reflects the real content.
    let observed = ObservedMetadata {
        file_size_bytes: content.len() as u64,
        modified_at: Some("2026-01-01T00:00:00Z".into()),
        platform_file_key: None,
        content_hash: Some("abc".into()),
    };
    let new_file = NewFile {
        source_id: source.source_id.clone(),
        original_path: file_path.clone(),
        canonical_path: file_path,
        display_path: name.into(),
        extension: Some("md".into()),
        metadata: observed,
        status: FileStatus::Discovered,
    };
    let inserted = FileRepository::new(catalog).insert(new_file).unwrap();

    IndexJobRepository::new(catalog)
        .enqueue(
            JobType::Extract,
            Some(&source.source_id),
            Some(&inserted.file_id),
        )
        .unwrap();

    let extraction = ExtractionWorker::new(catalog, cache);
    let chunking = ChunkAndIndexWorker::new(catalog, cache);
    run_pending(catalog, &extraction, &chunking, None, 50).unwrap();

    inserted.file_id
}
/// Inserts a `models` row for the mock embedding model (`INSERT OR IGNORE`)
/// and returns the model id `mock_mock-v1`.
///
/// Panics if the statement fails.
fn seed_mock_model(catalog: &Catalog) -> orbok_core::ModelId {
    // Bound as ?2, which the statement uses for both created_at and updated_at.
    const TIMESTAMP: &str = "2026-01-01T00:00:00Z";
    const INSERT_SQL: &str =
        "INSERT OR IGNORE INTO models (model_id, role, model_name, model_version, \
         dimension, status, created_at, updated_at) VALUES (?1,'embedding','mock','v1',8,'available',?2,?2)";

    let id = orbok_core::ModelId::from_string(String::from("mock_mock-v1"));
    catalog
        .lock()
        .execute(INSERT_SQL, params![id.as_str(), TIMESTAMP])
        .unwrap();
    id
}
/// After the mock embedding worker runs on a seeded file, the embedding
/// repository must report at least one active embedding for `mock_mock-v1`.
#[test]
fn embedding_worker_generates_and_stores_vectors() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let (catalog, cache) = setup(root);

    let seeded = seed_and_run(
        &catalog,
        &cache,
        root,
        "doc.md",
        "# Guide\n\nThis is important documentation.\n",
    );
    seed_mock_model(&catalog);

    let worker = EmbeddingWorker::with_mock(&catalog, &cache);
    worker.run(&seeded).unwrap();

    let repo = EmbeddingRepository::new(&catalog);
    let stored = repo.count_active("mock_mock-v1").unwrap();
    assert!(stored > 0, "embeddings must be stored after worker run");
}
/// Embeds two seeded files with the mock worker, then checks that a
/// `Conceptual` search for "token expiry" returns at least one result and
/// that the top result carries the `Semantic` badge.
#[test]
fn vector_search_returns_nearest_candidate() {
    let dir = tempfile::tempdir().unwrap();
    let (catalog, cache) = setup(dir.path());

    // Keep the ids returned by `seed_and_run` instead of re-resolving the
    // files by path: a lookup that comes back empty would otherwise be
    // skipped silently and the file would never be embedded.
    let seeded = [
        seed_and_run(
            &catalog,
            &cache,
            dir.path(),
            "a.md",
            "token expiry policy\n",
        ),
        seed_and_run(
            &catalog,
            &cache,
            dir.path(),
            "b.md",
            "gardening tips and tricks\n",
        ),
    ];
    seed_mock_model(&catalog);

    let embed = EmbeddingWorker::with_mock(&catalog, &cache);
    for file_id in &seeded {
        embed.run(file_id).unwrap();
    }

    let model = MockEmbeddingModel;
    let service = HybridSearchService::with_model(&catalog, &model, "mock_mock-v1");
    let results = service
        .search("token expiry", SearchMode::Conceptual, 5)
        .unwrap();

    assert!(!results.is_empty(), "vector search should return results");
    assert!(
        results[0]
            .badges
            .iter()
            .any(|b| *b == orbok_search::MatchBadge::Semantic),
        "top result should carry the Semantic badge"
    );
}
/// Fuses two keyword candidates (`c_a`, `c_c`) with two vector candidates
/// (`c_b`, `c_a`) and checks that three fused entries come back, with `c_a`
/// first and carrying both a keyword rank and a vector rank.
#[test]
fn rrf_fusion_combines_keyword_and_vector() {
    use orbok_core::{ChunkId, FileId};
    use orbok_models::VectorCandidate;
    use orbok_search::KeywordCandidate;

    let file = FileId::from_string("f_1".to_string());
    let chunk_a = ChunkId::from_string("c_a".to_string());
    let chunk_b = ChunkId::from_string("c_b".to_string());
    let chunk_c = ChunkId::from_string("c_c".to_string());

    // Small builders so each candidate list reads as (chunk, rank, score).
    let keyword_hit = |chunk: &ChunkId, rank, score| KeywordCandidate {
        chunk_id: chunk.clone(),
        file_id: file.clone(),
        rank,
        score,
    };
    let vector_hit = |chunk: &ChunkId, rank, score| VectorCandidate {
        chunk_id: chunk.clone(),
        file_id: file.clone(),
        rank,
        score,
    };

    let keyword_hits = vec![
        keyword_hit(&chunk_a, 1, -1.0),
        keyword_hit(&chunk_c, 2, -2.0),
    ];
    let vector_hits = vec![
        vector_hit(&chunk_b, 1, 0.9),
        vector_hit(&chunk_a, 2, 0.7),
    ];

    let fused = rrf_fuse(&keyword_hits, &vector_hits, 10);

    assert_eq!(fused.len(), 3);
    let top = &fused[0];
    assert_eq!(top.chunk_id.as_str(), "c_a");
    assert!(top.keyword_rank.is_some() && top.vector_rank.is_some());
}
/// A keyword-only search service must report itself as non-hybrid and still
/// return results for an `Auto` search when no model has been seeded.
#[test]
fn keyword_only_mode_works_without_model() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let (catalog, cache) = setup(root);
    seed_and_run(&catalog, &cache, root, "notes.md", "refresh token expiry\n");

    let keyword_service = HybridSearchService::keyword_only(&catalog);
    assert!(!keyword_service.is_hybrid());

    let hits = keyword_service
        .search("refresh", SearchMode::Auto, 10)
        .unwrap();
    assert!(!hits.is_empty());
}
/// After `mark_stale_for_model` is called for the mock model, its active
/// embedding count must drop from a positive value to zero.
#[test]
fn model_change_marks_embeddings_stale() {
    const MODEL: &str = "mock_mock-v1";

    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let (catalog, cache) = setup(root);
    let seeded = seed_and_run(&catalog, &cache, root, "doc.md", "some content\n");
    seed_mock_model(&catalog);

    let worker = EmbeddingWorker::with_mock(&catalog, &cache);
    worker.run(&seeded).unwrap();

    // Re-query through a fresh repository handle each time, as the checks
    // before and after the stale-marking are independent reads.
    let active_count = || {
        EmbeddingRepository::new(&catalog)
            .count_active(MODEL)
            .unwrap()
    };

    assert!(active_count() > 0);

    EmbeddingRepository::new(&catalog)
        .mark_stale_for_model(MODEL)
        .unwrap();

    assert_eq!(active_count(), 0);
}
/// Once every chunk is marked `stale`, the embedding repository must report
/// zero active embeddings for the mock model.
///
/// Note that the check goes through `count_active`, not through a search
/// call; it verifies the active-embedding count only.
#[test]
fn stale_chunks_excluded_from_vector_search() {
    let dir = tempfile::tempdir().unwrap();
    let (catalog, cache) = setup(dir.path());
    let file_id = seed_and_run(
        &catalog,
        &cache,
        dir.path(),
        "doc.md",
        "secret information\n",
    );
    seed_mock_model(&catalog);
    EmbeddingWorker::with_mock(&catalog, &cache)
        .run(&file_id)
        .unwrap();

    // Precondition: without it the final zero-count assertion would also
    // pass if the worker had never stored any embeddings at all.
    let before = EmbeddingRepository::new(&catalog)
        .count_active("mock_mock-v1")
        .unwrap();
    assert!(
        before > 0,
        "embeddings must exist before chunks are marked stale"
    );

    catalog
        .lock()
        .execute("UPDATE chunks SET chunk_status='stale'", [])
        .unwrap();

    let count = EmbeddingRepository::new(&catalog)
        .count_active("mock_mock-v1")
        .unwrap();
    assert_eq!(
        count, 0,
        "stale chunks must not appear as active embeddings"
    );
}
/// Deleting every row from `embeddings` must leave the seeded file's row
/// retrievable by id.
#[test]
fn deleting_embeddings_does_not_delete_catalog() {
    let tmp = tempfile::tempdir().unwrap();
    let root = tmp.path();
    let (catalog, cache) = setup(root);
    let seeded = seed_and_run(&catalog, &cache, root, "doc.md", "preserved source\n");
    seed_mock_model(&catalog);

    let worker = EmbeddingWorker::with_mock(&catalog, &cache);
    worker.run(&seeded).unwrap();

    catalog
        .lock()
        .execute("DELETE FROM embeddings", [])
        .unwrap();

    let surviving = FileRepository::new(&catalog).get_by_id(&seeded).unwrap();
    assert!(surviving.is_some());
}