orbok_workers/
embedding.rs1use orbok_cache::{CacheService, EngineOptions, OrbokCacheNamespace};
6use orbok_core::{FileId, ModelId, OrbokError, OrbokResult};
7use orbok_db::Catalog;
8use orbok_db::repo::{
9 ChunkRepository, EmbeddingRepository, FileRepository, NewEmbedding, SourceRepository,
10};
11use orbok_extract::ExtractOutput;
12use orbok_fs::{GuardedSource, PathGuard};
13use orbok_models::{EmbeddingModel, MockEmbeddingModel};
14use std::path::Path;
15
16pub struct EmbeddingWorker<'a> {
18 catalog: &'a Catalog,
19 cache: &'a CacheService,
20 model: Box<dyn EmbeddingModel>,
21 model_id: ModelId,
22}
23
24impl<'a> EmbeddingWorker<'a> {
25 pub fn with_mock(catalog: &'a Catalog, cache: &'a CacheService) -> Self {
27 Self {
28 catalog,
29 cache,
30 model: Box::new(MockEmbeddingModel),
31 model_id: ModelId::from_string("mock_mock-v1".to_string()),
32 }
33 }
34
35 pub fn run(&self, file_id: &FileId) -> OrbokResult<()> {
37 let files = FileRepository::new(self.catalog);
38 let record = files
39 .get_by_id(file_id)?
40 .ok_or(OrbokError::FileNotFound)?;
41 let sources = SourceRepository::new(self.catalog);
42 let source = sources
43 .get(&record.source_id)?
44 .ok_or(OrbokError::SourceNotFound)?;
45
46 let guard = PathGuard::new(vec![GuardedSource::from_record(&source)]);
49 let validated = guard.validate(Path::new(&record.canonical_path))?;
50 let engine = self.cache.engine::<ExtractOutput>(
51 self.catalog,
52 &OrbokCacheNamespace::ExtractSegments,
53 EngineOptions::default(),
54 )?;
55 let Some(extract_output) = CacheService::get_fresh(&engine, &validated)? else {
56 return Ok(()); };
58
59 let chunks = ChunkRepository::new(self.catalog).list_for_file(file_id)?;
61 if chunks.is_empty() {
62 return Ok(());
63 }
64
65 let all_text: String = extract_output
69 .segments
70 .iter()
71 .map(|s| s.text.as_str())
72 .collect::<Vec<_>>()
73 .join("\n");
74 let texts: Vec<String> = chunks
75 .iter()
76 .map(|chunk| {
77 if let Some(heading) = &chunk.heading_path {
78 format!("{heading}\n{all_text}")
79 } else {
80 all_text.clone()
81 }
82 })
83 .collect();
84
85 let text_refs: Vec<&str> = texts.iter().map(|s| s.as_str()).collect();
86 let vectors = self.model.embed_batch(&text_refs)?;
87
88 let embeddings = EmbeddingRepository::new(self.catalog);
89 for (chunk, vector) in chunks.iter().zip(vectors.into_iter()) {
90 embeddings.upsert(&NewEmbedding {
91 chunk_id: chunk.chunk_id.clone(),
92 model_id: self.model_id.clone(),
93 dimension: self.model.dimension(),
94 vector,
95 })?;
96 }
97 Ok(())
98 }
99
100 pub fn model_id(&self) -> &ModelId {
101 &self.model_id
102 }
103}