orbok_workers/
chunk_and_index.rs1use crate::chunk_adapter::to_chunk_specs;
6use orbok_cache::{CacheService, EngineOptions, OrbokCacheNamespace};
7use orbok_core::{ErrorCategory, ExtractionId, FileId, OrbokError, OrbokResult};
8use orbok_db::Catalog;
9use orbok_db::repo::{ChunkRepository, FileRepository, SourceRepository};
10use orbok_extract::{ExtractOutput, chunk};
11use orbok_fs::{GuardedSource, PathGuard};
12use rusqlite::params;
13use std::path::Path;
14
15pub struct ChunkAndIndexWorker<'a> {
17 catalog: &'a Catalog,
18 cache: &'a CacheService,
19}
20
21impl<'a> ChunkAndIndexWorker<'a> {
22 pub fn new(catalog: &'a Catalog, cache: &'a CacheService) -> Self {
23 Self { catalog, cache }
24 }
25
26 pub fn run(&self, file_id: &FileId) -> OrbokResult<()> {
28 let files = FileRepository::new(self.catalog);
29 let record = files.get_by_id(file_id)?.ok_or(OrbokError::FileNotFound)?;
30 let sources = SourceRepository::new(self.catalog);
31 let source = sources
32 .get(&record.source_id)?
33 .ok_or(OrbokError::SourceNotFound)?;
34
35 let guard = PathGuard::new(vec![GuardedSource::from_record(&source)]);
36 let validated = guard.validate(Path::new(&record.canonical_path))?;
37
38 let engine = self.cache.engine::<ExtractOutput>(
39 self.catalog,
40 &OrbokCacheNamespace::ExtractSegments,
41 EngineOptions::default(),
42 )?;
43 let output = CacheService::get_fresh(&engine, &validated)?.ok_or_else(|| {
44 OrbokError::Extraction {
45 category: ErrorCategory::ParserError,
46 message: "extraction cache miss: run extraction first".into(),
47 }
48 })?;
49
50 let extraction_id = self.latest_extraction_id(file_id)?;
52
53 let file_name = Path::new(&record.display_path)
54 .file_name()
55 .map(|n| n.to_string_lossy().into_owned())
56 .unwrap_or_else(|| record.display_path.clone());
57
58 let raw = chunk(&output, &file_name);
59 let specs = to_chunk_specs(raw);
60 if specs.is_empty() || (specs.len() == 1 && specs[0].normalized_text.is_empty()) {
61 return Ok(());
62 }
63
64 ChunkRepository::new(self.catalog).insert_bundle(file_id, &extraction_id, &specs)?;
65 Ok(())
66 }
67
68 fn latest_extraction_id(&self, file_id: &FileId) -> OrbokResult<ExtractionId> {
69 let conn = self.catalog.lock();
70 let id: String = conn
71 .query_row(
72 "SELECT extraction_id FROM extraction_records \
73 WHERE file_id = ?1 AND status = 'succeeded' \
74 ORDER BY completed_at DESC LIMIT 1",
75 params![file_id.as_str()],
76 |row| row.get(0),
77 )
78 .map_err(|e| OrbokError::Database(format!("no extraction record: {e}")))?;
79 Ok(ExtractionId::from_string(id))
80 }
81}