Skip to main content

orbok_workers/
embedding.rs

//! Embedding worker (RFC-008 §14): reads chunk normalized text from the
//! extraction cache, embeds it in batches, and stores vectors in the
//! catalog. Chunk text is consumed and not logged (NFR-014).
4
5use orbok_cache::{CacheService, EngineOptions, OrbokCacheNamespace};
6use orbok_core::{FileId, ModelId, OrbokError, OrbokResult};
7use orbok_db::Catalog;
8use orbok_db::repo::{
9    ChunkRepository, EmbeddingRepository, FileRepository, NewEmbedding, SourceRepository,
10};
11use orbok_extract::ExtractOutput;
12use orbok_fs::{GuardedSource, PathGuard};
13use orbok_models::{EmbeddingModel, MockEmbeddingModel};
14use std::path::Path;
15
16/// Embedding worker for one file.
17pub struct EmbeddingWorker<'a> {
18    catalog: &'a Catalog,
19    cache: &'a CacheService,
20    model: Box<dyn EmbeddingModel>,
21    model_id: ModelId,
22}
23
24impl<'a> EmbeddingWorker<'a> {
25    /// Use the mock model (tests, or when no real model is installed).
26    pub fn with_mock(catalog: &'a Catalog, cache: &'a CacheService) -> Self {
27        Self {
28            catalog,
29            cache,
30            model: Box::new(MockEmbeddingModel),
31            model_id: ModelId::from_string("mock_mock-v1".to_string()),
32        }
33    }
34
35    /// Use a specific embedding model (real or mock).
36    /// Supply a stable `model_id` string for registry lookup
37    /// (e.g. `"mock_mock-v1"` or `"embedding_multilingual-e5-small-v1"`).
38    pub fn with_model(
39        catalog: &'a Catalog,
40        cache: &'a CacheService,
41        model: Box<dyn EmbeddingModel>,
42        model_id: ModelId,
43    ) -> Self {
44        Self {
45            catalog,
46            cache,
47            model,
48            model_id,
49        }
50    }
51
52    /// Embed all active chunks of a file and persist vectors.
53    pub fn run(&self, file_id: &FileId) -> OrbokResult<()> {
54        let files = FileRepository::new(self.catalog);
55        let record = files.get_by_id(file_id)?.ok_or(OrbokError::FileNotFound)?;
56        let sources = SourceRepository::new(self.catalog);
57        let source = sources
58            .get(&record.source_id)?
59            .ok_or(OrbokError::SourceNotFound)?;
60
61        // Re-use the extraction cache to get chunk texts (contentless FTS
62        // stores no text; cache is the source for embedding text, Appendix A §9.3).
63        let guard = PathGuard::new(vec![GuardedSource::from_record(&source)]);
64        let validated = guard.validate(Path::new(&record.canonical_path))?;
65        let engine = self.cache.engine::<ExtractOutput>(
66            self.catalog,
67            &OrbokCacheNamespace::ExtractSegments,
68            EngineOptions::default(),
69        )?;
70        let Some(extract_output) = CacheService::get_fresh(&engine, &validated)? else {
71            return Ok(()); // No extraction cache yet — skip (will retry later).
72        };
73
74        // Get active chunks for this file.
75        let chunks = ChunkRepository::new(self.catalog).list_for_file(file_id)?;
76        if chunks.is_empty() {
77            return Ok(());
78        }
79
80        // Build chunk texts: combine heading + normalized text from extraction
81        // segments aligned to the chunk line range. For now, use the full
82        // document text for the parent chunk and per-section text for children.
83        let all_text: String = extract_output
84            .segments
85            .iter()
86            .map(|s| s.text.as_str())
87            .collect::<Vec<_>>()
88            .join("\n");
89        let texts: Vec<String> = chunks
90            .iter()
91            .map(|chunk| {
92                if let Some(heading) = &chunk.heading_path {
93                    format!("{heading}\n{all_text}")
94                } else {
95                    all_text.clone()
96                }
97            })
98            .collect();
99
100        let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
101        let vectors = self.model.embed_batch(&text_refs)?;
102
103        let embeddings = EmbeddingRepository::new(self.catalog);
104        for (chunk, vector) in chunks.iter().zip(vectors.into_iter()) {
105            embeddings.upsert(&NewEmbedding {
106                chunk_id: chunk.chunk_id.clone(),
107                model_id: self.model_id.clone(),
108                dimension: self.model.dimension(),
109                vector,
110            })?;
111        }
112        Ok(())
113    }
114
115    pub fn model_id(&self) -> &ModelId {
116        &self.model_id
117    }
118}