Skip to main content

orbok_workers/
storage.rs

1//! Storage accounting (RFC-011 §9): measures actual orbok storage
2//! consumption and updates the `storage_accounting` table.
3//!
4//! Measurements are approximate on purpose — exact byte-level
5//! accounting per row is expensive; page-level and aggregate-query
6//! measurements are fast and accurate enough for the Storage view.
7
8use orbok_core::{OrbokError, OrbokResult, StorageCategory, now_iso8601};
9use orbok_db::Catalog;
10use orbok_db::repo::StorageAccountingRepository;
11use rusqlite::params;
12use std::path::Path;
13
14/// Compute and persist storage accounting for the Storage view
15/// (RFC-011 §9 "approximate by default").
16///
17/// Called by the worker pipeline after each indexing run and by the
18/// Storage view's "refresh" action.
19pub fn update_storage_accounting(
20    catalog: &Catalog,
21    cache_db_path: &Path,
22) -> OrbokResult<Vec<(StorageCategory, u64, u64)>> {
23    let storage = StorageAccountingRepository::new(catalog);
24    let mut rows = Vec::new();
25
26    macro_rules! measure {
27        ($cat:expr, $size:expr, $count:expr) => {{
28            storage.upsert($cat, $size, $count)?;
29            rows.push(($cat, $size, $count));
30        }};
31    }
32
33    let conn = catalog.lock();
34
35    // Persistent catalog: approximate as the file size of catalog DB.
36    // If in-memory (:memory:), report 0.
37    let catalog_path = catalog.path();
38    let catalog_bytes = if catalog_path.to_str() == Some(":memory:") {
39        // Use page_count × page_size as proxy for in-memory databases.
40        let pages: i64 = conn
41            .query_row("PRAGMA page_count", [], |r| r.get(0))
42            .unwrap_or(0);
43        let page_size: i64 = conn
44            .query_row("PRAGMA page_size", [], |r| r.get(0))
45            .unwrap_or(4096);
46        (pages * page_size) as u64
47    } else {
48        std::fs::metadata(catalog_path).map(|m| m.len()).unwrap_or(0)
49    };
50    // Source count for "items"
51    let source_count: i64 = conn
52        .query_row("SELECT COUNT(*) FROM sources WHERE status != 'removed'", [], |r| r.get(0))
53        .unwrap_or(0);
54    drop(conn); // release before re-acquiring below
55    measure!(StorageCategory::PersistentCatalog, catalog_bytes, source_count as u64);
56
57    // Keyword index: row count from keyword_index_records.
58    let conn = catalog.lock();
59    let kw_count: i64 = conn
60        .query_row("SELECT COUNT(*) FROM keyword_index_records WHERE status='active'", [], |r| r.get(0))
61        .unwrap_or(0);
62    // Approximate size: 256 bytes per token record (FTS overhead).
63    let kw_bytes = kw_count as u64 * 256;
64    drop(conn);
65    measure!(StorageCategory::KeywordIndex, kw_bytes, kw_count as u64);
66
67    // Vector index: actual BLOB sizes.
68    let conn = catalog.lock();
69    let (emb_count, emb_bytes): (i64, i64) = conn
70        .query_row(
71            "SELECT COUNT(*), COALESCE(SUM(LENGTH(vector_blob)), 0) FROM embeddings WHERE status='active'",
72            [],
73            |r| Ok((r.get(0)?, r.get(1)?)),
74        )
75        .unwrap_or((0, 0));
76    drop(conn);
77    measure!(StorageCategory::VectorIndex, emb_bytes as u64, emb_count as u64);
78
79    // Snippet cache: stored size_bytes column.
80    let conn = catalog.lock();
81    let (snip_count, snip_bytes): (i64, i64) = conn
82        .query_row(
83            "SELECT COUNT(*), COALESCE(SUM(size_bytes), 0) FROM snippet_cache",
84            [],
85            |r| Ok((r.get(0)?, r.get(1)?)),
86        )
87        .unwrap_or((0, 0));
88    drop(conn);
89    measure!(StorageCategory::SnippetCache, snip_bytes as u64, snip_count as u64);
90
91    // Search cache: row count (size unknown; estimate 512 bytes each).
92    let conn = catalog.lock();
93    let sr_count: i64 = conn
94        .query_row("SELECT COUNT(*) FROM search_result_cache", [], |r| r.get(0))
95        .unwrap_or(0);
96    drop(conn);
97    measure!(StorageCategory::SearchCache, sr_count as u64 * 512, sr_count as u64);
98
99    // Temporary extraction: localcache DB file size.
100    let cache_bytes = std::fs::metadata(cache_db_path)
101        .map(|m| m.len())
102        .unwrap_or(0);
103    let conn = catalog.lock();
104    let extract_count: i64 = conn
105        .query_row("SELECT COUNT(*) FROM extraction_records WHERE status='succeeded'", [], |r| r.get(0))
106        .unwrap_or(0);
107    drop(conn);
108    measure!(StorageCategory::TemporaryExtraction, cache_bytes, extract_count as u64);
109
110    // Logs: app_events row estimate.
111    let conn = catalog.lock();
112    let evt_count: i64 = conn
113        .query_row("SELECT COUNT(*) FROM app_events", [], |r| r.get(0))
114        .unwrap_or(0);
115    drop(conn);
116    measure!(StorageCategory::Logs, evt_count as u64 * 256, evt_count as u64);
117
118    // Model files: not tracked in v0.4 (full workflow lands in M12).
119    measure!(StorageCategory::ModelFiles, 0, 0);
120
121    Ok(rows)
122}