Skip to main content

orbok_workers/
storage.rs

//! Storage accounting (RFC-011 §9): measures actual orbok storage
//! consumption and updates the `storage_accounting` table.
//!
//! Measurements are approximate on purpose — exact byte-level
//! accounting per row is expensive; page-level and aggregate-query
//! measurements are fast and accurate enough for the Storage view.
7
8use orbok_core::{OrbokResult, StorageCategory};
9use orbok_db::Catalog;
10use orbok_db::repo::StorageAccountingRepository;
11use std::path::Path;
12
13/// Compute and persist storage accounting for the Storage view
14/// (RFC-011 §9 "approximate by default").
15///
16/// Called by the worker pipeline after each indexing run and by the
17/// Storage view's "refresh" action.
18pub fn update_storage_accounting(
19    catalog: &Catalog,
20    cache_db_path: &Path,
21) -> OrbokResult<Vec<(StorageCategory, u64, u64)>> {
22    let storage = StorageAccountingRepository::new(catalog);
23    let mut rows = Vec::new();
24
25    macro_rules! measure {
26        ($cat:expr, $size:expr, $count:expr) => {{
27            storage.upsert($cat, $size, $count)?;
28            rows.push(($cat, $size, $count));
29        }};
30    }
31
32    let conn = catalog.lock();
33
34    // Persistent catalog: approximate as the file size of catalog DB.
35    // If in-memory (:memory:), report 0.
36    let catalog_path = catalog.path();
37    let catalog_bytes = if catalog_path.to_str() == Some(":memory:") {
38        // Use page_count × page_size as proxy for in-memory databases.
39        let pages: i64 = conn
40            .query_row("PRAGMA page_count", [], |r| r.get(0))
41            .unwrap_or(0);
42        let page_size: i64 = conn
43            .query_row("PRAGMA page_size", [], |r| r.get(0))
44            .unwrap_or(4096);
45        (pages * page_size) as u64
46    } else {
47        std::fs::metadata(catalog_path).map(|m| m.len()).unwrap_or(0)
48    };
49    // Source count for "items"
50    let source_count: i64 = conn
51        .query_row("SELECT COUNT(*) FROM sources WHERE status != 'removed'", [], |r| r.get(0))
52        .unwrap_or(0);
53    drop(conn); // release before re-acquiring below
54    measure!(StorageCategory::PersistentCatalog, catalog_bytes, source_count as u64);
55
56    // Keyword index: row count from keyword_index_records.
57    let conn = catalog.lock();
58    let kw_count: i64 = conn
59        .query_row("SELECT COUNT(*) FROM keyword_index_records WHERE status='active'", [], |r| r.get(0))
60        .unwrap_or(0);
61    // Approximate size: 256 bytes per token record (FTS overhead).
62    let kw_bytes = kw_count as u64 * 256;
63    drop(conn);
64    measure!(StorageCategory::KeywordIndex, kw_bytes, kw_count as u64);
65
66    // Vector index: actual BLOB sizes.
67    let conn = catalog.lock();
68    let (emb_count, emb_bytes): (i64, i64) = conn
69        .query_row(
70            "SELECT COUNT(*), COALESCE(SUM(LENGTH(vector_blob)), 0) FROM embeddings WHERE status='active'",
71            [],
72            |r| Ok((r.get(0)?, r.get(1)?)),
73        )
74        .unwrap_or((0, 0));
75    drop(conn);
76    measure!(StorageCategory::VectorIndex, emb_bytes as u64, emb_count as u64);
77
78    // Snippet cache: stored size_bytes column.
79    let conn = catalog.lock();
80    let (snip_count, snip_bytes): (i64, i64) = conn
81        .query_row(
82            "SELECT COUNT(*), COALESCE(SUM(size_bytes), 0) FROM snippet_cache",
83            [],
84            |r| Ok((r.get(0)?, r.get(1)?)),
85        )
86        .unwrap_or((0, 0));
87    drop(conn);
88    measure!(StorageCategory::SnippetCache, snip_bytes as u64, snip_count as u64);
89
90    // Search cache: row count (size unknown; estimate 512 bytes each).
91    let conn = catalog.lock();
92    let sr_count: i64 = conn
93        .query_row("SELECT COUNT(*) FROM search_result_cache", [], |r| r.get(0))
94        .unwrap_or(0);
95    drop(conn);
96    measure!(StorageCategory::SearchCache, sr_count as u64 * 512, sr_count as u64);
97
98    // Temporary extraction: localcache DB file size.
99    let cache_bytes = std::fs::metadata(cache_db_path)
100        .map(|m| m.len())
101        .unwrap_or(0);
102    let conn = catalog.lock();
103    let extract_count: i64 = conn
104        .query_row("SELECT COUNT(*) FROM extraction_records WHERE status='succeeded'", [], |r| r.get(0))
105        .unwrap_or(0);
106    drop(conn);
107    measure!(StorageCategory::TemporaryExtraction, cache_bytes, extract_count as u64);
108
109    // Logs: app_events row estimate.
110    let conn = catalog.lock();
111    let evt_count: i64 = conn
112        .query_row("SELECT COUNT(*) FROM app_events", [], |r| r.get(0))
113        .unwrap_or(0);
114    drop(conn);
115    measure!(StorageCategory::Logs, evt_count as u64 * 256, evt_count as u64);
116
117    // Model files: not tracked in v0.4 (full workflow lands in M12).
118    measure!(StorageCategory::ModelFiles, 0, 0);
119
120    Ok(rows)
121}