orbok_workers/
embedding.rs1use orbok_cache::{CacheService, EngineOptions, OrbokCacheNamespace};
6use orbok_core::{FileId, ModelId, OrbokError, OrbokResult};
7use orbok_db::Catalog;
8use orbok_db::repo::{
9 ChunkRepository, EmbeddingRepository, FileRepository, NewEmbedding, SourceRepository,
10};
11use orbok_extract::ExtractOutput;
12use orbok_fs::{GuardedSource, PathGuard};
13use orbok_models::{EmbeddingModel, MockEmbeddingModel};
14use std::path::Path;
15
16pub struct EmbeddingWorker<'a> {
18 catalog: &'a Catalog,
19 cache: &'a CacheService,
20 model: Box<dyn EmbeddingModel>,
21 model_id: ModelId,
22}
23
24impl<'a> EmbeddingWorker<'a> {
25 pub fn with_mock(catalog: &'a Catalog, cache: &'a CacheService) -> Self {
27 Self {
28 catalog,
29 cache,
30 model: Box::new(MockEmbeddingModel),
31 model_id: ModelId::from_string("mock_mock-v1".to_string()),
32 }
33 }
34
35 pub fn with_model(
39 catalog: &'a Catalog,
40 cache: &'a CacheService,
41 model: Box<dyn EmbeddingModel>,
42 model_id: ModelId,
43 ) -> Self {
44 Self {
45 catalog,
46 cache,
47 model,
48 model_id,
49 }
50 }
51
52 pub fn run(&self, file_id: &FileId) -> OrbokResult<()> {
54 let files = FileRepository::new(self.catalog);
55 let record = files.get_by_id(file_id)?.ok_or(OrbokError::FileNotFound)?;
56 let sources = SourceRepository::new(self.catalog);
57 let source = sources
58 .get(&record.source_id)?
59 .ok_or(OrbokError::SourceNotFound)?;
60
61 let guard = PathGuard::new(vec![GuardedSource::from_record(&source)]);
64 let validated = guard.validate(Path::new(&record.canonical_path))?;
65 let engine = self.cache.engine::<ExtractOutput>(
66 self.catalog,
67 &OrbokCacheNamespace::ExtractSegments,
68 EngineOptions::default(),
69 )?;
70 let Some(extract_output) = CacheService::get_fresh(&engine, &validated)? else {
71 return Ok(()); };
73
74 let chunks = ChunkRepository::new(self.catalog).list_for_file(file_id)?;
76 if chunks.is_empty() {
77 return Ok(());
78 }
79
80 let all_text: String = extract_output
84 .segments
85 .iter()
86 .map(|s| s.text.as_str())
87 .collect::<Vec<_>>()
88 .join("\n");
89 let texts: Vec<String> = chunks
90 .iter()
91 .map(|chunk| {
92 if let Some(heading) = &chunk.heading_path {
93 format!("{heading}\n{all_text}")
94 } else {
95 all_text.clone()
96 }
97 })
98 .collect();
99
100 let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
101 let vectors = self.model.embed_batch(&text_refs)?;
102
103 let embeddings = EmbeddingRepository::new(self.catalog);
104 for (chunk, vector) in chunks.iter().zip(vectors.into_iter()) {
105 embeddings.upsert(&NewEmbedding {
106 chunk_id: chunk.chunk_id.clone(),
107 model_id: self.model_id.clone(),
108 dimension: self.model.dimension(),
109 vector,
110 })?;
111 }
112 Ok(())
113 }
114
115 pub fn model_id(&self) -> &ModelId {
116 &self.model_id
117 }
118}