Skip to main content

orbok_workers/
embedding.rs

1//! Embedding worker (RFC-008 §14): reads chunk normalized text from the
2//! extraction cache, embeds it in batches, and stores vectors in the
3//! catalog. Chunk text is consumed and not logged (NFR-014).
4
5use orbok_cache::{CacheService, EngineOptions, OrbokCacheNamespace};
6use orbok_core::{FileId, ModelId, OrbokError, OrbokResult};
7use orbok_db::Catalog;
8use orbok_db::repo::{
9    ChunkRepository, EmbeddingRepository, FileRepository, NewEmbedding, SourceRepository,
10};
11use orbok_extract::ExtractOutput;
12use orbok_fs::{GuardedSource, PathGuard};
13use orbok_models::{EmbeddingModel, MockEmbeddingModel};
14use std::path::Path;
15
16/// Embedding worker for one file.
17pub struct EmbeddingWorker<'a> {
18    catalog: &'a Catalog,
19    cache: &'a CacheService,
20    model: Box<dyn EmbeddingModel>,
21    model_id: ModelId,
22}
23
24impl<'a> EmbeddingWorker<'a> {
25    /// Use the mock model (tests, or when no real model is installed).
26    pub fn with_mock(catalog: &'a Catalog, cache: &'a CacheService) -> Self {
27        Self {
28            catalog,
29            cache,
30            model: Box::new(MockEmbeddingModel),
31            model_id: ModelId::from_string("mock_mock-v1".to_string()),
32        }
33    }
34
35    /// Embed all active chunks of a file and persist vectors.
36    pub fn run(&self, file_id: &FileId) -> OrbokResult<()> {
37        let files = FileRepository::new(self.catalog);
38        let record = files
39            .get_by_id(file_id)?
40            .ok_or(OrbokError::FileNotFound)?;
41        let sources = SourceRepository::new(self.catalog);
42        let source = sources
43            .get(&record.source_id)?
44            .ok_or(OrbokError::SourceNotFound)?;
45
46        // Re-use the extraction cache to get chunk texts (contentless FTS
47        // stores no text; cache is the source for embedding text, Appendix A §9.3).
48        let guard = PathGuard::new(vec![GuardedSource::from_record(&source)]);
49        let validated = guard.validate(Path::new(&record.canonical_path))?;
50        let engine = self.cache.engine::<ExtractOutput>(
51            self.catalog,
52            &OrbokCacheNamespace::ExtractSegments,
53            EngineOptions::default(),
54        )?;
55        let Some(extract_output) = CacheService::get_fresh(&engine, &validated)? else {
56            return Ok(()); // No extraction cache yet — skip (will retry later).
57        };
58
59        // Get active chunks for this file.
60        let chunks = ChunkRepository::new(self.catalog).list_for_file(file_id)?;
61        if chunks.is_empty() {
62            return Ok(());
63        }
64
65        // Build chunk texts: combine heading + normalized text from extraction
66        // segments aligned to the chunk line range. For now, use the full
67        // document text for the parent chunk and per-section text for children.
68        let all_text: String = extract_output
69            .segments
70            .iter()
71            .map(|s| s.text.as_str())
72            .collect::<Vec<_>>()
73            .join("\n");
74        let texts: Vec<String> = chunks
75            .iter()
76            .map(|chunk| {
77                if let Some(heading) = &chunk.heading_path {
78                    format!("{heading}\n{all_text}")
79                } else {
80                    all_text.clone()
81                }
82            })
83            .collect();
84
85        let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
86        let vectors = self.model.embed_batch(&text_refs)?;
87
88        let embeddings = EmbeddingRepository::new(self.catalog);
89        for (chunk, vector) in chunks.iter().zip(vectors.into_iter()) {
90            embeddings.upsert(&NewEmbedding {
91                chunk_id: chunk.chunk_id.clone(),
92                model_id: self.model_id.clone(),
93                dimension: self.model.dimension(),
94                vector,
95            })?;
96        }
97        Ok(())
98    }
99
100    pub fn model_id(&self) -> &ModelId {
101        &self.model_id
102    }
103}