Skip to main content

coding_agent_search/storage/
sqlite.rs

1//! `SQLite` backend: schema, pragmas, and migrations.
2
3use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
4use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
5use anyhow::{Context, Result, anyhow, bail};
6use frankensqlite::{
7    Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
8    compat::{
9        ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
10        OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
11        Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
12        open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
13    },
14    migrate::MigrationRunner,
15};
16use serde::{Deserialize, Serialize};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::io::{BufRead, BufReader, Write};
22use std::process::{Command, Stdio};
23use std::sync::{
24    Arc,
25    atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
26};
27
/// Frankensqlite parameter list builder.
///
/// Expands to a `&[ParamValue]` slice.  With no arguments it yields an empty
/// slice; otherwise every argument is converted with `ParamValue::from`, so
/// any type with a matching `From` impl can be passed.  A trailing comma is
/// accepted.
macro_rules! fparams {
    // No parameters: an empty, explicitly typed slice.
    () => {
        &[] as &[ParamValue]
    };
    // One or more expressions, each converted via `ParamValue::from`.
    ($($val:expr),+ $(,)?) => {
        &[$(ParamValue::from($val)),+] as &[ParamValue]
    };
}
37use std::path::{Path, PathBuf};
38use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
39use thiserror::Error;
40use tracing::info;
41
/// How long `LazyFrankenDb::get` waits for the doctor mutation lock before
/// giving up on opening the database.
const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
/// Maximum number of bytes read from the doctor lock file when looking for
/// its `pid=` metadata entry.
const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
44
45// -------------------------------------------------------------------------
46// Lazy FrankenSQLite Connection (bd-1ueu)
47// -------------------------------------------------------------------------
48// Defers opening the database until first use, cutting startup cost for
49// commands that may not need the DB at all.  Thread-safe via parking_lot
50// Mutex; logs the reason and duration of the open on first access.
51
/// Error from lazy database initialization.
#[derive(Debug, Error)]
pub enum LazyDbError {
    /// The database path did not exist when the first access was attempted.
    #[error("Database not found at {0}")]
    NotFound(PathBuf),
    /// The path existed but the connection could not be opened.  Also used
    /// when acquiring the doctor mutation guard fails or the open times out,
    /// in which case `source` is a `FrankenError::Internal` carrying the text.
    #[error("Failed to open FrankenSQLite database at {path}: {source}")]
    FrankenOpenFailed {
        /// Database path that was being opened.
        path: PathBuf,
        /// Underlying frankensqlite error.
        source: frankensqlite::FrankenError,
    },
}
63
64// -------------------------------------------------------------------------
65// LazyFrankenDb — lazy wrapper around FrankenConnection
66// -------------------------------------------------------------------------
67
/// Wrapper around `FrankenConnection` that implements `Send`.
///
/// `FrankenConnection` is `!Send` because it uses `Rc` internally.
/// However, the `Rc` values are entirely self-contained within the Connection
/// and are not shared externally.  When wrapped in a `Mutex`,
/// exclusive access is guaranteed, making cross-thread transfer safe.
///
/// Tuple fields, in order: the connection, the index-writer checkpoint page
/// count, and the index-writer busy timeout in milliseconds.  The two numeric
/// fields hold their `UNSET_*` sentinels unless set through
/// `new_with_index_writer_state`.
pub struct SendFrankenConnection(FrankenConnection, i64, u64);

// SAFETY: Rc fields inside FrankenConnection are not cloned or shared externally.
// The Mutex<Option<SendFrankenConnection>> ensures exclusive access.
unsafe impl Send for SendFrankenConnection {}
79
80impl SendFrankenConnection {
81    pub(crate) fn new(conn: FrankenConnection) -> Self {
82        Self(
83            conn,
84            UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
85            UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
86        )
87    }
88
89    pub(crate) fn new_with_index_writer_state(
90        conn: FrankenConnection,
91        checkpoint_pages: i64,
92        busy_timeout_ms: u64,
93    ) -> Self {
94        Self(conn, checkpoint_pages, busy_timeout_ms)
95    }
96
97    pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
98        (self.0, self.1, self.2)
99    }
100}
101
102impl std::ops::Deref for SendFrankenConnection {
103    type Target = FrankenConnection;
104    fn deref(&self) -> &FrankenConnection {
105        &self.0
106    }
107}
108
/// Lazy-opening wrapper for `FrankenConnection` (frankensqlite).
///
/// Constructing a `LazyFrankenDb` is cheap (no I/O).  The underlying
/// `FrankenConnection` is opened on the first call to [`get`].
/// Subsequent calls return the cached connection.
pub struct LazyFrankenDb {
    /// Location of the database file.
    path: PathBuf,
    /// Cached connection; `None` until the first successful open.
    conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
118
/// RAII guard that dereferences to the inner `FrankenConnection`.
///
/// Wraps the `parking_lot::MutexGuard` over the cached connection slot, so
/// the `LazyFrankenDb` mutex stays locked for as long as the guard is alive.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);
121
122impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_tuple("LazyFrankenDbGuard")
125            .field(&self.0.is_some())
126            .finish()
127    }
128}
129
130impl std::ops::Deref for LazyFrankenDbGuard<'_> {
131    type Target = FrankenConnection;
132    fn deref(&self) -> &FrankenConnection {
133        self.0
134            .as_ref()
135            .expect("LazyFrankenDb connection must be initialized before access")
136    }
137}
138
139impl LazyFrankenDb {
140    /// Create a lazy handle pointing at `path`.  No I/O is performed.
141    pub fn new(path: PathBuf) -> Self {
142        Self {
143            path,
144            conn: parking_lot::Mutex::new(None),
145        }
146    }
147
148    /// Resolve path from optional CLI overrides.
149    ///
150    /// Uses `data_dir / agent_search.db` as fallback.
151    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
152        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
153        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
154        Self::new(path)
155    }
156
157    /// Get the connection, opening the database on first access.
158    ///
159    /// `reason` is logged alongside the open duration so callers can
160    /// identify which command triggered the open.
161    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
162        let mut guard = self.conn.lock();
163        if guard.is_none() {
164            if !self.path.exists() {
165                return Err(LazyDbError::NotFound(self.path.clone()));
166            }
167            let start = Instant::now();
168            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
169                &self.path,
170                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
171            )
172            .map_err(|err| LazyDbError::FrankenOpenFailed {
173                path: self.path.clone(),
174                source: frankensqlite::FrankenError::Internal(err.to_string()),
175            })?;
176            let conn =
177                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
178                    LazyDbError::FrankenOpenFailed {
179                        path: self.path.clone(),
180                        source: e,
181                    }
182                })?;
183            let elapsed_ms = start.elapsed().as_millis();
184            info!(
185                path = %self.path.display(),
186                elapsed_ms = elapsed_ms,
187                reason = reason,
188                "lazily opened FrankenSQLite database"
189            );
190            *guard = Some(SendFrankenConnection::new(conn));
191        }
192        Ok(LazyFrankenDbGuard(guard))
193    }
194
195    /// Get the connection with a timeout, opening the database on first access.
196    ///
197    /// Like [`get`] but spawns the open in a background thread and waits up to
198    /// `timeout` for it to complete. Returns `LazyDbError::FrankenOpenFailed`
199    /// with a descriptive message if the timeout elapses. Fix for #128.
200    pub fn get_with_timeout(
201        &self,
202        reason: &str,
203        timeout: Duration,
204    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
205        let mut guard = self.conn.lock();
206        if guard.is_none() {
207            if !self.path.exists() {
208                return Err(LazyDbError::NotFound(self.path.clone()));
209            }
210            let start = Instant::now();
211            let path_owned = self.path.to_string_lossy().into_owned();
212            let path_for_guard = self.path.clone();
213            let (tx, rx) = std::sync::mpsc::channel();
214            std::thread::spawn(move || {
215                let _doctor_guard =
216                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
217                        Ok(guard) => guard,
218                        Err(err) => {
219                            let _ = tx
220                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
221                            return;
222                        }
223                    };
224                let _ =
225                    tx.send(FrankenConnection::open(path_owned).map(SendFrankenConnection::new));
226            });
227            let conn = rx
228                .recv_timeout(timeout)
229                .map_err(|_| LazyDbError::FrankenOpenFailed {
230                    path: self.path.clone(),
231                    source: frankensqlite::FrankenError::Internal(format!(
232                        "database open timed out after {}s (possible corruption or lock contention)",
233                        timeout.as_secs()
234                    )),
235                })?
236                .map_err(|e| LazyDbError::FrankenOpenFailed {
237                    path: self.path.clone(),
238                    source: e,
239                })?;
240            let elapsed_ms = start.elapsed().as_millis();
241            info!(
242                path = %self.path.display(),
243                elapsed_ms = elapsed_ms,
244                reason = reason,
245                "lazily opened FrankenSQLite database (with timeout)"
246            );
247            *guard = Some(conn);
248        }
249        Ok(LazyFrankenDbGuard(guard))
250    }
251
252    /// Path to the database file (even if not yet opened).
253    pub fn path(&self) -> &Path {
254        &self.path
255    }
256
257    /// Whether the connection has been opened.
258    pub fn is_open(&self) -> bool {
259        self.conn.lock().is_some()
260    }
261}
262
/// Rolling state consumed by `next_franken_retry_jitter_ms`; advanced on every draw.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
/// Number of live `DoctorMutationDbOpenBypassGuard`s; non-zero disables the
/// doctor mutation lock check on database open.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
/// Gate for the message-lookup counters below; they only advance while this is `true`.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Count of exact-index probes recorded while tracing was enabled.
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
/// Count of bounded lookup queries recorded while tracing was enabled.
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Count of full-scan queries recorded while tracing was enabled.
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Total rows reported by bounded and full-scan lookups while tracing was enabled.
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
270
/// Point-in-time copy of the process-wide message-lookup trace counters.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
    /// Exact-index probes recorded so far.
    pub exact_idx_probes: u64,
    /// Bounded lookup queries recorded so far.
    pub bounded_lookup_queries: u64,
    /// Full-scan queries recorded so far.
    pub full_scan_queries: u64,
    /// Rows reported by bounded and full-scan lookups so far.
    pub rows_materialized: u64,
}
278
279impl MessageLookupTraceCounters {
280    pub(crate) fn saturating_sub(self, before: Self) -> Self {
281        Self {
282            exact_idx_probes: self
283                .exact_idx_probes
284                .saturating_sub(before.exact_idx_probes),
285            bounded_lookup_queries: self
286                .bounded_lookup_queries
287                .saturating_sub(before.bounded_lookup_queries),
288            full_scan_queries: self
289                .full_scan_queries
290                .saturating_sub(before.full_scan_queries),
291            rows_materialized: self
292                .rows_materialized
293                .saturating_sub(before.rows_materialized),
294        }
295    }
296
297    pub(crate) fn lookups_against_global(self) -> u64 {
298        self.exact_idx_probes.saturating_add(self.rows_materialized)
299    }
300}
301
/// Turn message-lookup tracing on or off and return the previous setting.
///
/// Only the gate is changed; the counters themselves are not reset here.
pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
    MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed)
}
305
306pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
307    MessageLookupTraceCounters {
308        exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
309        bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
310        full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
311        rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
312    }
313}
314
315fn record_message_lookup_exact_idx_probe() {
316    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
317        MESSAGE_LOOKUP_EXACT_IDX_PROBES.fetch_add(1, Ordering::Relaxed);
318    }
319}
320
321fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
322    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
323        MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
324        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
325    }
326}
327
328fn record_message_lookup_full_scan_query(rows: usize) {
329    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
330        MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
331        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
332    }
333}
334
/// Token returned by `enter_doctor_mutation_db_open_bypass`.
///
/// While at least one of these is alive, `DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH`
/// is non-zero and database opens skip the doctor mutation lock.
pub(crate) struct DoctorMutationDbOpenBypassGuard;

impl Drop for DoctorMutationDbOpenBypassGuard {
    fn drop(&mut self) {
        // Undo the increment performed when the bypass was entered.
        DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
    }
}
342
343pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
344    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
345    DoctorMutationDbOpenBypassGuard
346}
347
348fn doctor_mutation_db_open_bypass_active() -> bool {
349    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst) > 0
350}
351
352fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
353    let mut value = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
354    value ^= value >> 30;
355    value = value.wrapping_mul(0xbf58_476d_1ce4_e5b9);
356    value ^= value >> 27;
357    value = value.wrapping_mul(0x94d0_49bb_1331_11eb);
358    value ^= value >> 31;
359    value % max_inclusive.saturating_add(1)
360}
361
362/// Sleep with jittered exponential backoff to avoid lock-step retry storms
363/// when many threads hit the same transient SQLite/frankensqlite contention.
364pub(crate) fn sleep_with_franken_retry_backoff(
365    backoff: &mut Duration,
366    remaining: Duration,
367    max_backoff: Duration,
368) {
369    let capped = (*backoff).min(remaining);
370    let extra_budget = remaining.saturating_sub(capped).min(capped);
371    let extra_ms = extra_budget.as_millis().min(u128::from(u64::MAX)) as u64;
372    let sleep_for = if extra_ms == 0 {
373        capped
374    } else {
375        capped
376            .saturating_add(Duration::from_millis(next_franken_retry_jitter_ms(
377                extra_ms,
378            )))
379            .min(remaining)
380    };
381    std::thread::sleep(sleep_for);
382    *backoff = backoff.saturating_mul(2).min(max_backoff);
383}
384
385struct DoctorMutationDbOpenGuard(Option<fs::File>);
386
387impl Drop for DoctorMutationDbOpenGuard {
388    fn drop(&mut self) {
389        if let Some(file) = self.0.as_ref() {
390            let _ = fs2::FileExt::unlock(file);
391        }
392    }
393}
394
/// Locate the doctor repair lock file that guards `db_path`.
///
/// Only databases whose file name is exactly `agent_search.db` are guarded;
/// for those the lock lives at `<parent>/doctor/locks/doctor-repair.lock`.
/// Returns `None` for any other file name, or when the path has no parent.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let is_guarded_db = matches!(
        db_path.file_name().and_then(std::ffi::OsStr::to_str),
        Some("agent_search.db")
    );
    if !is_guarded_db {
        return None;
    }

    let mut lock_path = db_path.parent()?.to_path_buf();
    for segment in ["doctor", "locks", "doctor-repair.lock"] {
        lock_path.push(segment);
    }
    Some(lock_path)
}
408
/// Whether lock-file metadata names this process as the lock owner.
///
/// `raw` is scanned line by line for `key=value` entries; the result is `true`
/// when any `pid` entry (whitespace around key and value ignored) parses as a
/// `u32` equal to `std::process::id()`.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let own_pid = std::process::id();
    raw.lines()
        .filter_map(|entry| entry.split_once('='))
        .filter(|(field, _)| field.trim() == "pid")
        .any(|(_, recorded)| recorded.trim().parse::<u32>() == Ok(own_pid))
}
421
422fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
423    use std::io::Read as _;
424
425    let Ok(mut file) = file.try_clone() else {
426        return false;
427    };
428    let mut raw = String::new();
429    let _ = std::io::Read::take(&mut file, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ)
430        .read_to_string(&mut raw);
431    doctor_lock_metadata_pid_is_current_process(&raw)
432}
433
434fn acquire_doctor_mutation_db_open_guard(
435    db_path: &Path,
436    timeout: Duration,
437) -> Result<DoctorMutationDbOpenGuard> {
438    let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
439        return Ok(DoctorMutationDbOpenGuard(None));
440    };
441    if doctor_mutation_db_open_bypass_active() {
442        return Ok(DoctorMutationDbOpenGuard(None));
443    }
444
445    if let Some(parent) = lock_path.parent() {
446        fs::create_dir_all(parent).with_context(|| {
447            format!(
448                "creating doctor mutation lock directory {} before opening {}",
449                parent.display(),
450                db_path.display()
451            )
452        })?;
453    }
454
455    let deadline = Instant::now() + timeout;
456    let mut backoff = Duration::from_millis(4);
457    loop {
458        let file = fs::OpenOptions::new()
459            .create(true)
460            .truncate(false)
461            .read(true)
462            .write(true)
463            .open(&lock_path)
464            .with_context(|| {
465                format!(
466                    "opening doctor mutation lock {} before opening {}",
467                    lock_path.display(),
468                    db_path.display()
469                )
470            })?;
471
472        if doctor_lock_file_pid_is_current_process(&file) {
473            return Ok(DoctorMutationDbOpenGuard(None));
474        }
475
476        match fs2::FileExt::try_lock_shared(&file) {
477            Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
478            Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
479                let now = Instant::now();
480                if now >= deadline {
481                    return Err(anyhow!(
482                        "doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
483                        lock_path.display(),
484                        db_path.display(),
485                        timeout.as_millis()
486                    ));
487                }
488                let remaining = deadline.saturating_duration_since(now);
489                sleep_with_franken_retry_backoff(
490                    &mut backoff,
491                    remaining,
492                    Duration::from_millis(128),
493                );
494            }
495            Err(err) => {
496                return Err(anyhow!(
497                    "failed to acquire shared doctor mutation lock {} before opening {}: {}",
498                    lock_path.display(),
499                    db_path.display(),
500                    err
501                ));
502            }
503        }
504    }
505}
506
507pub(crate) fn open_franken_storage_with_timeout(
508    path: &Path,
509    timeout: Duration,
510) -> Result<FrankenStorage> {
511    if !path.exists() {
512        return Err(anyhow!("Database not found at {}", path.display()));
513    }
514
515    let deadline = Instant::now() + timeout;
516    let mut backoff = Duration::from_millis(4);
517    loop {
518        match FrankenStorage::open(path) {
519            Ok(storage) => return Ok(storage),
520            Err(err) if retryable_franken_anyhow(&err) => {
521                let now = Instant::now();
522                if now >= deadline {
523                    return Err(err);
524                }
525                let remaining = deadline.saturating_duration_since(now);
526                sleep_with_franken_retry_backoff(
527                    &mut backoff,
528                    remaining,
529                    Duration::from_millis(128),
530                );
531            }
532            Err(err) => return Err(err),
533        }
534    }
535}
536
/// Open storage at `path` only if it is already at `CURRENT_SCHEMA_VERSION`.
///
/// Returns `Ok(None)` when the file does not exist, or when the
/// `schema_version` row in `meta` is missing, unreadable, unparsable, or
/// different from `CURRENT_SCHEMA_VERSION`.  On a version match the storage
/// is transitioned, repaired, and configured before being returned.
///
/// # Errors
///
/// Propagates failures from opening the raw connection and from the
/// transition / repair / config steps that run after a version match.
pub(crate) fn open_current_schema_storage_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<Option<FrankenStorage>> {
    if !path.exists() {
        return Ok(None);
    }

    let mut storage = FrankenStorage::new(
        open_franken_raw_connection_with_timeout(path, timeout)?,
        path.to_path_buf(),
    );
    storage.apply_open_stage_busy_timeout();

    // Every failure along this chain collapses to `None`, which is treated
    // the same as a version mismatch below.
    let version = storage
        .raw()
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .ok()
        .and_then(|rows| rows.first().cloned())
        .and_then(|row| row.get_typed::<String>(0).ok())
        .and_then(|raw| raw.parse::<i64>().ok());

    if version != Some(CURRENT_SCHEMA_VERSION) {
        // Not the current schema: close without checkpointing, falling back
        // to a best-effort close if that fails, and report "no storage".
        if let Err(close_err) = storage.close_without_checkpoint_in_place() {
            tracing::debug!(
                error = %close_err,
                db_path = %path.display(),
                "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
            );
            storage.close_best_effort_in_place();
        }
        return Ok(None);
    }

    transition_from_meta_version(&storage.conn)?;
    storage.repair_missing_current_schema_objects()?;
    storage.apply_config()?;
    Ok(Some(storage))
}
576
577pub(crate) fn open_franken_readonly_storage_with_timeout(
578    path: &Path,
579    timeout: Duration,
580) -> Result<FrankenStorage> {
581    if !path.exists() {
582        return Err(anyhow!("Database not found at {}", path.display()));
583    }
584
585    let deadline = Instant::now() + timeout;
586    let mut backoff = Duration::from_millis(4);
587    loop {
588        match FrankenStorage::open_readonly(path) {
589            Ok(storage) => return Ok(storage),
590            Err(err) if retryable_franken_anyhow(&err) => {
591                let now = Instant::now();
592                if now >= deadline {
593                    return Err(err);
594                }
595                let remaining = deadline.saturating_duration_since(now);
596                sleep_with_franken_retry_backoff(
597                    &mut backoff,
598                    remaining,
599                    Duration::from_millis(128),
600                );
601            }
602            Err(err) => return Err(err),
603        }
604    }
605}
606
607pub(crate) fn open_franken_raw_connection_with_timeout(
608    path: &Path,
609    timeout: Duration,
610) -> Result<FrankenConnection> {
611    if !path.exists() {
612        return Err(anyhow!("Database not found at {}", path.display()));
613    }
614
615    let path_str = path.to_string_lossy().to_string();
616    let deadline = Instant::now() + timeout;
617    let mut backoff = Duration::from_millis(4);
618    loop {
619        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
620        match FrankenConnection::open(&path_str)
621            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
622        {
623            Ok(conn) => return Ok(conn),
624            Err(err) if retryable_franken_anyhow(&err) => {
625                let now = Instant::now();
626                if now >= deadline {
627                    return Err(err);
628                }
629                let remaining = deadline.saturating_duration_since(now);
630                sleep_with_franken_retry_backoff(
631                    &mut backoff,
632                    remaining,
633                    Duration::from_millis(128),
634                );
635            }
636            Err(err) => return Err(err),
637        }
638    }
639}
640
641pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
642    path: &Path,
643    timeout: Duration,
644) -> Result<FrankenConnection> {
645    if !path.exists() {
646        return Err(anyhow!("Database not found at {}", path.display()));
647    }
648
649    let path_str = path.to_string_lossy().to_string();
650    let deadline = Instant::now() + timeout;
651    let mut backoff = Duration::from_millis(4);
652    loop {
653        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
654        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
655            .with_context(|| {
656                format!(
657                    "opening raw frankensqlite db readonly at {}",
658                    path.display()
659                )
660            }) {
661            Ok(conn) => return Ok(conn),
662            Err(err) if retryable_franken_anyhow(&err) => {
663                let now = Instant::now();
664                if now >= deadline {
665                    return Err(err);
666                }
667                let remaining = deadline.saturating_duration_since(now);
668                sleep_with_franken_retry_backoff(
669                    &mut backoff,
670                    remaining,
671                    Duration::from_millis(128),
672                );
673            }
674            Err(err) => return Err(err),
675        }
676    }
677}
678
679pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
680    matches!(
681        err,
682        frankensqlite::FrankenError::Busy
683            | frankensqlite::FrankenError::BusyRecovery
684            | frankensqlite::FrankenError::BusySnapshot { .. }
685            | frankensqlite::FrankenError::DatabaseLocked { .. }
686            | frankensqlite::FrankenError::LockFailed { .. }
687            | frankensqlite::FrankenError::WriteConflict { .. }
688            | frankensqlite::FrankenError::SerializationFailure { .. }
689    ) || retryable_storage_error_message(&err.to_string())
690}
691
/// Whether an error message reads like transient storage contention.
///
/// Matching is a case-insensitive (ASCII) substring search for any of the
/// marker phrases below.
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const TRANSIENT_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];

    let normalized = message.to_ascii_lowercase();
    TRANSIENT_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
701
702pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
703    err.chain().any(|cause| {
704        cause
705            .downcast_ref::<frankensqlite::FrankenError>()
706            .is_some_and(retryable_franken_error)
707            || retryable_storage_error_message(&cause.to_string())
708    })
709}
710
711impl Drop for LazyFrankenDb {
712    fn drop(&mut self) {
713        let Some(mut conn) = self.conn.get_mut().take() else {
714            return;
715        };
716        conn.0.close_best_effort_in_place();
717    }
718}
719
720// -------------------------------------------------------------------------
721// FrankenSQLite Connection Manager (bead 3rlf8)
722// -------------------------------------------------------------------------
723// Multi-connection management: reader pool + concurrent writer connections.
724// Replaces the LazyFrankenDb single-connection bottleneck for high-throughput
725// scenarios (indexer parallel writes, concurrent TUI reads + indexer writes).
726
/// Tuning knobs for the [`FrankenConnectionManager`].
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of pre-opened reader connections (default: 4).
    pub reader_count: usize,
    /// Maximum concurrent writer connections (default: available parallelism).
    pub max_writers: usize,
}

impl Default for ConnectionManagerConfig {
    /// Four readers, and one writer slot per unit of available parallelism
    /// (falling back to 4 when parallelism cannot be queried).
    fn default() -> Self {
        const FALLBACK_PARALLELISM: usize = 4;

        let max_writers = match std::thread::available_parallelism() {
            Ok(parallelism) => parallelism.get(),
            Err(_) => FALLBACK_PARALLELISM,
        };
        Self {
            reader_count: 4,
            max_writers,
        }
    }
}
747
/// Multi-connection manager for frankensqlite.
///
/// Provides:
/// - A pool of pre-opened reader connections (round-robin, Mutex-protected)
/// - Controlled creation of writer connections with token-based limits
/// - RAII guards that auto-rollback uncommitted transactions on drop
///
/// Thread-safe: reader connections are wrapped in Mutex (FrankenConnection is !Sync).
/// Writer connections are created per-request (each thread gets its own).
pub struct FrankenConnectionManager {
    /// Database file every reader and writer connection is opened against.
    db_path: PathBuf,
    /// Pre-opened reader connections, each behind its own mutex.
    readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
    /// Monotonic counter used to pick the next reader round-robin.
    reader_idx: std::sync::atomic::AtomicUsize,
    /// Token-based writer limit: channel pre-filled with `max_writers` tokens.
    /// `recv()` = acquire slot, `send()` = release slot.
    writer_tokens: (
        crossbeam_channel::Sender<()>,
        crossbeam_channel::Receiver<()>,
    ),
    /// Effective configuration (after clamping both counts to at least 1).
    config: ConnectionManagerConfig,
}

// SAFETY: FrankenConnectionManager is Send+Sync because:
// - readers wrapped in Mutex<SendFrankenConnection> (exclusive access)
// - writer_tokens uses crossbeam (Send+Sync)
// - db_path is PathBuf (Send+Sync)
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
776
777impl FrankenConnectionManager {
778    /// Create a new connection manager.
779    ///
780    /// Opens `config.reader_count` reader connections immediately.
781    /// Writer connections are created on demand (up to `config.max_writers`).
782    pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
783        let db_path = db_path.into();
784        let path_str = db_path.to_string_lossy().to_string();
785
786        let reader_count = config.reader_count.max(1);
787        let mut readers = Vec::with_capacity(reader_count);
788        for _ in 0..reader_count {
789            let conn = FrankenConnection::open(&path_str)
790                .with_context(|| format!("opening reader connection at {}", db_path.display()))?;
791            // Apply read-tuned config (no migration, no write PRAGMAs)
792            let _ = conn.execute("PRAGMA busy_timeout = 5000;"); // match writer config
793            let _ = conn.execute("PRAGMA cache_size = -16384;"); // 16MB reader cache
794            readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
795        }
796
797        let max_writers = config.max_writers.max(1);
798
799        // Pre-fill bounded channel with tokens (acts as counting semaphore).
800        // A zero-capacity channel with no initial tokens would make the first
801        // writer acquisition block forever.
802        let (tx, rx) = crossbeam_channel::bounded(max_writers);
803        for _ in 0..max_writers {
804            tx.send(())
805                .map_err(|_| anyhow!("writer token channel closed during initialization"))?;
806        }
807
808        Ok(Self {
809            db_path,
810            readers,
811            reader_idx: std::sync::atomic::AtomicUsize::new(0),
812            writer_tokens: (tx, rx),
813            config: ConnectionManagerConfig {
814                reader_count,
815                max_writers,
816            },
817        })
818    }
819
820    /// Get a reader connection (round-robin from the pool).
821    ///
822    /// Returns a mutex guard wrapping the connection. The guard prevents
823    /// concurrent access to the same connection (FrankenConnection is !Sync).
824    pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
825        let idx = self
826            .reader_idx
827            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
828        self.readers[idx % self.readers.len()].lock()
829    }
830
831    /// Acquire a writer connection.
832    ///
833    /// Opens a new frankensqlite connection with full config (no migration).
834    /// Blocks if `max_writers` connections are already in use.
835    /// The returned [`WriterGuard`] auto-rolls back on drop.
836    pub fn writer(&self) -> Result<WriterGuard<'_>> {
837        self.writer_tokens
838            .1
839            .recv()
840            .map_err(|_| anyhow!("writer token channel closed"))?;
841        let path_str = self.db_path.to_string_lossy().to_string();
842        let conn = match FrankenConnection::open(&path_str) {
843            Ok(c) => c,
844            Err(e) => {
845                let _ = self.writer_tokens.0.send(());
846                return Err(anyhow::Error::from(e).context(format!(
847                    "opening writer connection at {}",
848                    self.db_path.display()
849                )));
850            }
851        };
852        let storage = FrankenStorage::new(conn, self.db_path.clone());
853        if let Err(e) = storage.apply_config() {
854            let _ = self.writer_tokens.0.send(());
855            return Err(e);
856        }
857        Ok(WriterGuard {
858            storage,
859            mgr: self,
860            committed: false,
861        })
862    }
863
864    /// Acquire a concurrent writer connection (BEGIN CONCURRENT via MVCC).
865    ///
866    /// Similar to [`writer`] but tuned for the parallel indexer write pool.
867    /// Uses reduced cache size and is designed for short-lived batch inserts.
868    pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
869        self.writer_tokens
870            .1
871            .recv()
872            .map_err(|_| anyhow!("writer token channel closed"))?;
873        let path_str = self.db_path.to_string_lossy().to_string();
874        let conn = match FrankenConnection::open(&path_str) {
875            Ok(c) => c,
876            Err(e) => {
877                let _ = self.writer_tokens.0.send(());
878                return Err(anyhow::Error::from(e).context(format!(
879                    "opening concurrent writer at {}",
880                    self.db_path.display()
881                )));
882            }
883        };
884        let storage = FrankenStorage::new(conn, self.db_path.clone());
885        if let Err(e) = storage.apply_config() {
886            let _ = self.writer_tokens.0.send(());
887            return Err(e);
888        }
889        // Reduced cache for concurrent writers (they're short-lived)
890        let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
891        Ok(WriterGuard {
892            storage,
893            mgr: self,
894            committed: false,
895        })
896    }
897
898    /// Database path managed by this pool.
899    pub fn db_path(&self) -> &Path {
900        &self.db_path
901    }
902
903    /// Number of reader connections in the pool.
904    pub fn reader_count(&self) -> usize {
905        self.readers.len()
906    }
907
908    /// Maximum concurrent writers allowed.
909    pub fn max_writers(&self) -> usize {
910        self.config.max_writers
911    }
912}
913
914impl Drop for FrankenConnectionManager {
915    fn drop(&mut self) {
916        for reader in &mut self.readers {
917            reader.get_mut().0.close_best_effort_in_place();
918        }
919    }
920}
921
/// RAII guard for a writer connection.
///
/// Provides access to a [`FrankenStorage`] for write operations.
/// Releases the writer semaphore slot when dropped.
pub struct WriterGuard<'a> {
    /// Storage wrapping the connection that was opened for this writer.
    storage: FrankenStorage,
    /// Manager whose writer-token channel receives a token back on drop.
    mgr: &'a FrankenConnectionManager,
    /// Set by `mark_committed`; while `false`, dropping the guard issues a
    /// best-effort `ROLLBACK` before closing the connection.
    committed: bool,
}
931
932impl<'a> WriterGuard<'a> {
933    /// Access the underlying storage for read/write operations.
934    pub fn storage(&self) -> &FrankenStorage {
935        &self.storage
936    }
937
938    /// Mark this writer as successfully committed.
939    ///
940    /// Call after your transaction's `commit()` succeeds. Prevents the drop
941    /// guard from attempting a rollback.
942    pub fn mark_committed(&mut self) {
943        self.committed = true;
944    }
945}
946
947impl Drop for WriterGuard<'_> {
948    fn drop(&mut self) {
949        if !self.committed {
950            // Best-effort rollback — connection may already be in autocommit
951            let _ = self.storage.raw().execute("ROLLBACK;");
952        }
953        self.storage.close_best_effort_in_place();
954        // Release writer token
955        let _ = self.mgr.writer_tokens.0.send(());
956    }
957}
958
959// -------------------------------------------------------------------------
960// Binary Metadata Serialization (Opt 3.1)
961// -------------------------------------------------------------------------
962// MessagePack provides 50-70% storage reduction vs JSON and faster parsing.
963// New rows use binary columns; existing JSON is read on fallback.
964
965/// Serialize a JSON value to MessagePack bytes.
966/// Returns None for null/empty values to save storage.
967fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
968    if value.is_null() || value.as_object().is_some_and(|o| o.is_empty()) {
969        return None;
970    }
971    rmp_serde::to_vec(value).ok()
972}
973
974/// Deserialize MessagePack bytes to a JSON value.
975/// Returns default Value::Object({}) on error or empty input.
976fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
977    if bytes.is_empty() {
978        return serde_json::Value::Object(serde_json::Map::new());
979    }
980    rmp_serde::from_slice(bytes).unwrap_or_else(|e| {
981        tracing::debug!(
982            error = %e,
983            bytes_len = bytes.len(),
984            "Failed to deserialize metadata - returning empty object"
985        );
986        serde_json::Value::Object(serde_json::Map::new())
987    })
988}
989
990/// Read metadata from a frankensqlite Row, preferring binary (msgpack) over JSON.
991fn franken_read_metadata_compat(
992    row: &FrankenRow,
993    json_idx: usize,
994    bin_idx: usize,
995) -> serde_json::Value {
996    // Try binary column first (new format)
997    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
998        && !bytes.is_empty()
999    {
1000        return deserialize_msgpack_to_json(&bytes);
1001    }
1002
1003    // Fall back to JSON column (old format or migration in progress)
1004    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1005        return serde_json::from_str(&json_str)
1006            .unwrap_or_else(|_| serde_json::Value::Object(serde_json::Map::new()));
1007    }
1008
1009    serde_json::Value::Object(serde_json::Map::new())
1010}
1011
1012fn franken_read_message_extra_compat(
1013    row: &FrankenRow,
1014    json_idx: usize,
1015    bin_idx: usize,
1016) -> serde_json::Value {
1017    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1018        && !bytes.is_empty()
1019    {
1020        return deserialize_msgpack_to_json(&bytes);
1021    }
1022
1023    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1024        return serde_json::from_str(&json_str).unwrap_or(serde_json::Value::Null);
1025    }
1026
1027    serde_json::Value::Null
1028}
1029
1030// -------------------------------------------------------------------------
1031// Migration Error Types (P1.5)
1032// -------------------------------------------------------------------------
1033
/// Error type for schema migration operations.
///
/// `frankensqlite::FrankenError` and `std::io::Error` convert automatically
/// through the `#[from]` attributes below, so `?` works on both inside
/// functions returning this type.
#[derive(Debug, Error)]
pub enum MigrationError {
    /// The schema requires a full rebuild. The database has been backed up.
    #[error("Rebuild required: {reason}")]
    RebuildRequired {
        /// Human-readable explanation; this is the text shown by `Display`.
        reason: String,
        /// Location of the backup, if one was produced.
        /// NOTE(review): presumably the path returned by `create_backup` —
        /// confirm at the construction sites.
        backup_path: Option<std::path::PathBuf>,
    },

    /// A database error occurred during migration.
    #[error("Database error: {0}")]
    Database(#[from] frankensqlite::FrankenError),

    /// An I/O error occurred during backup.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Other migration error.
    #[error("{0}")]
    Other(String),
}
1056
1057impl From<anyhow::Error> for MigrationError {
1058    fn from(e: anyhow::Error) -> Self {
1059        MigrationError::Other(e.to_string())
1060    }
1061}
1062
/// Maximum number of backup files to retain.
const MAX_BACKUPS: usize = 3;
/// Pragma run on the read-only backup connection before `VACUUM INTO`,
/// setting the SQLite busy timeout to 30000 ms.
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
1066
/// Files that contain user-authored state and must NEVER be deleted during rebuild.
const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];

/// Report whether `path` names a user-authored file that a rebuild must keep.
///
/// Only the final path component is inspected, and it must match one of
/// `USER_DATA_FILES` exactly. Paths without a final component, or whose name
/// is not valid UTF-8, are never treated as user data.
pub fn is_user_data_file(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|raw| raw.to_str()) else {
        return false;
    };
    USER_DATA_FILES
        .iter()
        .any(|protected| *protected == file_name)
}
1077
/// SQL to register the FTS5 virtual table on a frankensqlite connection.
///
/// FrankenSQLite skips virtual-table entries (rootpage=0) when loading
/// `sqlite_master` from a stock-SQLite database.  Executing this CREATE
/// triggers the legacy FTS5 fallback path and materialises the table so
/// subsequent FTS queries work.
pub const FTS5_REGISTER_SQL: &str = "\
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
        content, title, agent, workspace, source_path, \
        created_at UNINDEXED, \
        content='', tokenize='porter'\
    )";

/// Meta-table key for the FTS rebuild generation marker.
/// NOTE(review): presumably written by `record_fts_franken_rebuild_generation`
/// — confirm against that method.
const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
/// Meta-table key for an archive fingerprint tied to the FTS rebuild.
/// NOTE(review): how the fingerprint is computed is not visible here — confirm.
const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
/// Current FTS rebuild generation number.
/// NOTE(review): presumably the value stored under
/// `FTS_FRANKEN_REBUILD_META_KEY` — confirm at the use sites.
const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
/// Meta-table key for the archive fingerprint used by the daily-stats health
/// check. NOTE(review): semantics inferred from the name — confirm.
const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
/// Meta-table key for the daily-stats health generation marker.
const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
/// Current daily-stats health generation number.
/// NOTE(review): presumably the value stored under
/// `DAILY_STATS_HEALTH_GENERATION_META_KEY` — confirm at the use sites.
const DAILY_STATS_HEALTH_GENERATION: i64 = 1;

/// SQL to clear all rows from the contentless `fts_messages` table.
///
/// Contentless FTS5 tables reject ordinary `DELETE FROM ...` statements.
pub const FTS5_DELETE_ALL_SQL: &str =
    "INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";
1103
/// Test helper: make sure `db_path` has a fresh, populated FTS schema.
///
/// Despite the legacy `_via_rusqlite` name this goes through
/// `FrankenStorage::rebuild_fts_via_frankensqlite`, so the table is also
/// repopulated — "fresh FTS" historically meant "schema exists and is
/// consistent with the message rows". The inserted-row count is discarded.
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    let open_failure = || {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_failure)?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
1119
/// Test helper: rebuild the FTS table at `db_path` and record the rebuild
/// generation afterwards.
///
/// Returns the value reported by `rebuild_fts_via_frankensqlite`.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let open_failure = || {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_failure)?;

    let inserted_rows = storage.rebuild_fts_via_frankensqlite()?;
    // Only stamp the generation once the rebuild itself has succeeded.
    storage.record_fts_franken_rebuild_generation()?;
    Ok(inserted_rows)
}
1132
1133pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
1134    // Delegates to the FrankenStorage-native path. The function name retains
1135    // the `_via_rusqlite` suffix only for backwards compatibility with the
1136    // few test-site callers; all operations now run through frankensqlite.
1137    let storage = FrankenStorage::open(db_path).with_context(|| {
1138        format!(
1139            "opening frankensqlite db at {} for FTS consistency check",
1140            db_path.display()
1141        )
1142    })?;
1143    storage.ensure_search_fallback_fts_consistency()
1144}
1145
1146/// Create a uniquely named backup of the database file.
1147///
1148/// Returns the path to the backup file, or None if the source doesn't exist.
1149pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
1150    if !bundle_path_exists(db_path)? {
1151        return Ok(None);
1152    }
1153
1154    if !copyable_bundle_file_exists(db_path)? {
1155        return Ok(None);
1156    }
1157    let _ = copyable_bundle_sidecar_sources(db_path)?;
1158
1159    let backup_path = unique_backup_path(db_path);
1160    let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
1161
1162    // Try to use SQLite's VACUUM INTO command first, which safely handles WAL files
1163    // and produces a clean, minimized backup.
1164    match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
1165        Ok(()) => {
1166            fs::rename(&vacuum_stage_path, &backup_path)?;
1167        }
1168        Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
1169            tracing::warn!(
1170                db_path = %db_path.display(),
1171                error = %err,
1172                "create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
1173            );
1174            return Err(MigrationError::Database(err));
1175        }
1176        Err(err) => {
1177            tracing::warn!(
1178                db_path = %db_path.display(),
1179                error = %err,
1180                "create_backup: VACUUM INTO failed; falling back to raw evidence copy"
1181            );
1182        }
1183    }
1184
1185    if backup_path.exists() {
1186        sync_file_if_exists(&backup_path)?;
1187        if let Some(parent) = backup_path.parent() {
1188            sync_parent_directory(parent)?;
1189        }
1190        return Ok(Some(backup_path));
1191    }
1192
1193    // Fallback to a raw evidence copy if VACUUM INTO failed (e.g., older SQLite
1194    // or corruption). Keep this on the same symlink-safe bundle path as
1195    // historical seeding so a malformed archive root cannot make us copy an
1196    // arbitrary symlink target or publish a partial sidecar backup.
1197    copy_database_bundle(db_path, &backup_path)?;
1198
1199    Ok(Some(backup_path))
1200}
1201
1202fn vacuum_into_backup_stage(
1203    db_path: &Path,
1204    stage_path: &Path,
1205) -> std::result::Result<(), frankensqlite::FrankenError> {
1206    let mut conn = open_franken_with_flags(
1207        &db_path.to_string_lossy(),
1208        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
1209    )?;
1210    let result = (|| {
1211        conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
1212        let path_str = stage_path.to_string_lossy();
1213        conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
1214        Ok(())
1215    })();
1216    if let Err(close_err) = conn.close_in_place() {
1217        tracing::warn!(
1218            error = %close_err,
1219            db_path = %db_path.display(),
1220            "create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
1221        );
1222        conn.close_best_effort_in_place();
1223    }
1224    result
1225}
1226
/// Decide whether a failed `VACUUM INTO` must be surfaced for a retry rather
/// than papered over with a raw bundle copy.
///
/// Delegates to `retryable_franken_error`; `create_backup` returns the error
/// to its caller when this is `true`.
fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
    retryable_franken_error(err)
}
1230
/// Which members of a database bundle were moved by a bundle move.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// `true` when at least one member of the bundle was moved.
    pub fn moved_any(&self) -> bool {
        let Self { database, wal, shm } = *self;
        [database, wal, shm].contains(&true)
    }
}
1243
/// Build the path of a database sidecar by appending `suffix` (for example
/// `"-wal"` or `"-shm"`) to the full text of `path`.
///
/// The suffix is appended to the raw OS string, not to a lossy UTF-8
/// rendering. Paths containing non-UTF-8 bytes therefore keep those bytes
/// instead of having them replaced by U+FFFD, which would point at a
/// different (non-existent) file.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut joined = path.as_os_str().to_os_string();
    joined.push(suffix);
    PathBuf::from(joined)
}
1247
1248/// Move a database file and its WAL/SHM sidecars to a new basename.
1249///
1250/// This is used for non-destructive quarantine of a corrupted bundle before a
1251/// rebuild. If the main database file is already missing but orphaned sidecars
1252/// remain, those sidecars are still moved so a fresh database can be created
1253/// without inheriting stale WAL state.
1254pub(crate) fn move_database_bundle(
1255    source_root: &Path,
1256    destination_root: &Path,
1257) -> std::io::Result<DatabaseBundleMoveResult> {
1258    let mut moved = DatabaseBundleMoveResult::default();
1259    if let Some(parent) = destination_root.parent() {
1260        fs::create_dir_all(parent)?;
1261        sync_parent_directory(parent)?;
1262    }
1263
1264    if bundle_path_exists(source_root)? {
1265        fs::rename(source_root, destination_root)?;
1266        moved.database = true;
1267    }
1268
1269    let wal_source = database_sidecar_path(source_root, "-wal");
1270    if bundle_path_exists(&wal_source)? {
1271        fs::rename(&wal_source, database_sidecar_path(destination_root, "-wal"))?;
1272        moved.wal = true;
1273    }
1274
1275    let shm_source = database_sidecar_path(source_root, "-shm");
1276    if bundle_path_exists(&shm_source)? {
1277        fs::rename(&shm_source, database_sidecar_path(destination_root, "-shm"))?;
1278        moved.shm = true;
1279    }
1280
1281    if moved.moved_any() {
1282        if let Some(parent) = source_root.parent() {
1283            sync_parent_directory(parent)?;
1284        }
1285        if let Some(parent) = destination_root.parent() {
1286            sync_parent_directory(parent)?;
1287        }
1288    }
1289
1290    Ok(moved)
1291}
1292
/// Check whether anything exists at `path`, without following symlinks.
///
/// A dangling symlink counts as existing. `NotFound` maps to `Ok(false)`;
/// every other metadata error is returned to the caller.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    let Err(err) = fs::symlink_metadata(path) else {
        return Ok(true);
    };
    if err.kind() == std::io::ErrorKind::NotFound {
        Ok(false)
    } else {
        Err(err)
    }
}
1300
1301fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
1302    if let Some(parent) = destination_root.parent() {
1303        fs::create_dir_all(parent).with_context(|| {
1304            format!(
1305                "creating destination directory for database bundle copy: {}",
1306                parent.display()
1307            )
1308        })?;
1309        sync_parent_directory(parent)
1310            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1311    }
1312
1313    if !copyable_bundle_file_exists(source_root)? {
1314        bail!(
1315            "database bundle root is missing before copy: {}",
1316            source_root.display()
1317        );
1318    }
1319
1320    let sidecars = copyable_bundle_sidecar_sources(source_root)?;
1321
1322    fs::copy(source_root, destination_root).with_context(|| {
1323        format!(
1324            "copying database bundle {} -> {}",
1325            source_root.display(),
1326            destination_root.display()
1327        )
1328    })?;
1329    sync_file_if_exists(destination_root).with_context(|| {
1330        format!(
1331            "syncing copied database bundle {}",
1332            destination_root.display()
1333        )
1334    })?;
1335
1336    for (source_sidecar, suffix) in sidecars {
1337        let destination_sidecar = database_sidecar_path(destination_root, suffix);
1338        fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
1339            format!(
1340                "copying database bundle sidecar {} -> {}",
1341                source_sidecar.display(),
1342                destination_sidecar.display()
1343            )
1344        })?;
1345        sync_file_if_exists(&destination_sidecar).with_context(|| {
1346            format!(
1347                "syncing copied database bundle sidecar {}",
1348                destination_sidecar.display()
1349            )
1350        })?;
1351    }
1352
1353    if let Some(parent) = destination_root.parent() {
1354        sync_parent_directory(parent)
1355            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1356    }
1357
1358    Ok(())
1359}
1360
1361fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
1362    let mut sidecars = Vec::new();
1363    for suffix in ["-wal", "-shm"] {
1364        let source_sidecar = database_sidecar_path(source_root, suffix);
1365        if copyable_bundle_file_exists(&source_sidecar)? {
1366            sidecars.push((source_sidecar, suffix));
1367        }
1368    }
1369    Ok(sidecars)
1370}
1371
1372fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
1373    match fs::symlink_metadata(path) {
1374        Ok(metadata) => {
1375            let file_type = metadata.file_type();
1376            if file_type.is_symlink() {
1377                bail!(
1378                    "refusing to copy database bundle symlink: {}",
1379                    path.display()
1380                );
1381            }
1382            if !file_type.is_file() {
1383                bail!(
1384                    "refusing to copy non-file database bundle path: {}",
1385                    path.display()
1386                );
1387            }
1388            Ok(true)
1389        }
1390        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1391        Err(err) => Err(err).with_context(|| {
1392            format!(
1393                "checking database bundle path before copy: {}",
1394                path.display()
1395            )
1396        }),
1397    }
1398}
1399
1400/// Helper to safely remove a database file and its potential WAL/SHM sidecars.
1401pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
1402    let mut removed_any = false;
1403
1404    match fs::remove_file(path) {
1405        Ok(()) => removed_any = true,
1406        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1407        Err(err) => return Err(err),
1408    }
1409
1410    // Best-effort removal of sidecar files (ignore errors if they don't exist)
1411    for suffix in ["-wal", "-shm"] {
1412        match fs::remove_file(database_sidecar_path(path, suffix)) {
1413            Ok(()) => removed_any = true,
1414            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1415            Err(err) => return Err(err),
1416        }
1417    }
1418
1419    if removed_any && let Some(parent) = path.parent() {
1420        sync_parent_directory(parent)?;
1421    }
1422
1423    Ok(())
1424}
1425
/// Flush directory metadata (creations, renames, removals) for `path` to disk.
///
/// `Path::parent()` returns `Some("")` for a bare relative name such as
/// `agent_search.db`. Opening `""` fails with `NotFound`, so an empty path is
/// treated as the current directory (`.`), which is what it denotes.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let dir = if path.as_os_str().is_empty() {
        Path::new(".")
    } else {
        path
    };
    fs::File::open(dir)?.sync_all()
}

/// Windows variant: a no-op that always succeeds.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
1435
/// Flush `path` to disk when it exists; succeed without doing anything when
/// it does not.
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    fs::File::open(path)?.sync_all()
}
1442
1443/// Remove old backup files, keeping only the most recent `keep_count`.
1444pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
1445    let parent = match db_path.parent() {
1446        Some(p) => p,
1447        None => return Ok(()),
1448    };
1449
1450    let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
1451
1452    let prefix = format!("{}.backup.", db_name);
1453
1454    // Collect backup files matching the pattern
1455    let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
1456
1457    if let Ok(entries) = fs::read_dir(parent) {
1458        for entry in entries.flatten() {
1459            let path = entry.path();
1460            if let Some(name) = path.file_name().and_then(|n| n.to_str())
1461                && is_backup_root_name(name, &prefix)
1462                && let Ok(meta) = fs::metadata(&path)
1463                && meta.is_file()
1464                && let Ok(mtime) = meta.modified()
1465            {
1466                backups.push((path, mtime));
1467            }
1468        }
1469    }
1470
1471    // Sort by modification time, newest first
1472    backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
1473
1474    // Delete oldest backups beyond keep_count
1475    for (path, _) in backups.into_iter().skip(keep_count) {
1476        let _ = fs::remove_file(&path);
1477
1478        // Also try to cleanup potential sidecars from fs::copy fallback
1479        let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
1480        let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
1481    }
1482
1483    Ok(())
1484}
1485
/// A historical database bundle (backup, quarantined copy, snapshot, ...)
/// found near the canonical database, with the facts gathered about it
/// during discovery.
#[derive(Debug, Clone)]
pub(crate) struct HistoricalDatabaseBundle {
    /// Path of the bundle's main database file.
    root_path: PathBuf,
    /// Size in bytes as computed by `bundle_total_bytes` (main file plus
    /// `-wal` / `-shm` sidecars).
    total_bytes: u64,
    /// Modification time of the main file from `file_mtime_ms`, in
    /// milliseconds since the Unix epoch (0 when unavailable).
    modified_at_ms: i64,
    /// Result of `historical_bundle_supports_direct_readonly` for the root.
    supports_direct_readonly: bool,
    /// Probe results for this bundle.
    /// NOTE(review): how and when the probe is filled is not visible here.
    probe: HistoricalBundleProbe,
}
1494
/// Lightweight facts probed from a historical bundle.
///
/// NOTE(review): field meanings below are inferred from names and types; the
/// probing code is outside this view — confirm before relying on them.
#[derive(Debug, Clone, Copy, Default)]
struct HistoricalBundleProbe {
    /// Schema version, when it could be determined.
    schema_version: Option<i64>,
    /// Count of FTS-related schema rows, when it could be determined.
    fts_schema_rows: Option<i64>,
    /// Whether an FTS query succeeded against the bundle (presumably).
    fts_queryable: bool,
    /// Largest message id found; `Default` yields 0.
    max_message_id: i64,
}
1502
/// Test-only snapshot of database health indicators.
///
/// NOTE(review): field meanings are inferred from names; the code producing
/// this struct is outside this view.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SqliteDatabaseHealthProbe {
    /// Schema version, when it could be read.
    pub schema_version: Option<i64>,
    /// Whether the quick integrity check passed (presumably
    /// `PRAGMA quick_check` — confirm).
    pub quick_check_ok: bool,
    /// Count of FTS-related schema rows.
    pub fts_schema_rows: i64,
    /// Whether the FTS table could be queried.
    pub fts_queryable: bool,
    /// Number of message rows.
    pub message_count: i64,
    /// Largest message id.
    pub max_message_id: i64,
}
1514
/// Outcome of an FTS consistency check, as returned by
/// `ensure_fts_consistency_via_rusqlite`.
///
/// NOTE(review): the exact counting semantics of the row fields are defined
/// by `ensure_search_fallback_fts_consistency`, which is outside this view.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FtsConsistencyRepair {
    /// Nothing had to be repaired.
    AlreadyHealthy {
        /// Row count observed by the check.
        rows: usize,
    },
    /// Missing rows were added without a full rebuild.
    IncrementalCatchUp {
        /// Rows inserted by the catch-up.
        inserted_rows: usize,
        /// Row count after the catch-up (presumably — confirm).
        total_rows: usize,
    },
    /// The FTS table was rebuilt.
    Rebuilt {
        /// Rows inserted by the rebuild.
        inserted_rows: usize,
    },
}
1528
/// Running totals for a historical salvage pass.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct HistoricalSalvageOutcome {
    /// Bundles that were looked at.
    pub bundles_considered: usize,
    /// Bundles from which data was imported.
    pub bundles_imported: usize,
    /// Conversations imported.
    pub conversations_imported: usize,
    /// Messages imported.
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Add every counter of `other` onto `self`, field by field.
    pub(crate) fn accumulate(&mut self, other: Self) {
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;

        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
1545
/// An open connection to a historical bundle plus how it was obtained.
#[derive(Debug)]
struct HistoricalReadConnection {
    /// The open connection. Declared before `_tempdir`, so it is dropped
    /// first (struct fields drop in declaration order).
    conn: FrankenConnection,
    /// Static label naming how the connection was opened.
    /// NOTE(review): the set of labels is defined where this is constructed.
    method: &'static str,
    /// Temporary directory kept alive for the connection's lifetime, if any.
    /// NOTE(review): presumably holds a staged copy of the bundle that `conn`
    /// reads from — confirm at the construction site.
    _tempdir: Option<tempfile::TempDir>,
}
1552
/// DDL creating the six core tables (`sources`, `agents`, `workspaces`,
/// `conversations`, `messages`, `snippets`) with primary keys only — no
/// foreign keys, `NOT NULL`, or uniqueness constraints.
///
/// NOTE(review): presumably applied to a scratch database that receives rows
/// recovered from a damaged historical bundle, where constraints could
/// reject salvageable rows — confirm at the use site.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
    id TEXT PRIMARY KEY,
    kind TEXT,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE agents (
    id INTEGER PRIMARY KEY,
    slug TEXT,
    name TEXT,
    version TEXT,
    kind TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT,
    display_name TEXT
);
CREATE TABLE conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER,
    workspace_id INTEGER,
    source_id TEXT,
    external_id TEXT,
    title TEXT,
    source_path TEXT,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
CREATE TABLE messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER,
    idx INTEGER,
    role TEXT,
    author TEXT,
    created_at INTEGER,
    content TEXT,
    extra_json TEXT,
    extra_bin BLOB
);
CREATE TABLE snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);
";
/// Format version of the historical salvage ledger.
/// NOTE(review): the ledger itself is defined outside this view.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
/// Format version of persisted salvage progress records.
/// NOTE(review): presumably the value stored in
/// `HistoricalBundleProgress::progress_version` — confirm.
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
/// Five minutes, in milliseconds.
/// NOTE(review): judging by the name, the allowed start-time difference when
/// merging conversations that share a source path — confirm at the use site.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
1630
/// Serializable checkpoint describing how far the import of one historical
/// bundle has progressed.
///
/// NOTE(review): field meanings are inferred from names and types; the code
/// that writes and reads these records is outside this view.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HistoricalBundleProgress {
    /// Format version of this record.
    progress_version: u32,
    /// Path of the bundle, as a string.
    path: String,
    /// Bundle size in bytes.
    bytes: u64,
    /// Bundle modification time, milliseconds since the Unix epoch.
    modified_at_ms: i64,
    /// How the bundle was read.
    method: String,
    /// Row id of the last source row that was fully processed.
    last_completed_source_row_id: i64,
    /// Conversations imported so far.
    conversations_imported: usize,
    /// Messages imported so far.
    messages_imported: usize,
    /// When this record was last updated, milliseconds since the Unix epoch.
    updated_at_ms: i64,
}
1643
/// One conversation queued for a historical import batch.
///
/// NOTE(review): which database each id refers to (source bundle vs. target)
/// is not visible here — confirm where entries are built.
#[derive(Debug, Clone)]
struct HistoricalBatchEntry {
    /// Row id of the conversation in the source bundle (by name).
    source_row_id: i64,
    /// Agent id to associate with the conversation.
    agent_id: i64,
    /// Workspace id, when the conversation has one.
    workspace_id: Option<i64>,
    /// The conversation payload itself.
    conversation: Conversation,
}
1651
/// Counters reported for one historical import batch; `Default` is all zeros.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct HistoricalBatchImportTotals {
    /// Source rows inserted by the batch.
    inserted_source_rows: usize,
    /// Messages inserted by the batch.
    inserted_messages: usize,
}
1657
1658fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
1659    let mut roots = Vec::new();
1660    let Some(parent) = db_path.parent() else {
1661        return roots;
1662    };
1663    let db_name = db_path
1664        .file_name()
1665        .and_then(|n| n.to_str())
1666        .unwrap_or("agent_search.db");
1667    let db_stem = db_path
1668        .file_stem()
1669        .and_then(|n| n.to_str())
1670        .unwrap_or("agent_search");
1671
1672    let mut push_root = |path: PathBuf| {
1673        if path == db_path {
1674            return;
1675        }
1676        if !roots.iter().any(|existing| existing == &path) {
1677            roots.push(path);
1678        }
1679    };
1680
1681    if let Ok(entries) = fs::read_dir(parent) {
1682        for entry in entries.flatten() {
1683            let path = entry.path();
1684            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1685                continue;
1686            };
1687            if has_db_sidecar_suffix(name) {
1688                continue;
1689            }
1690            if name.starts_with(&format!("{db_name}.backup."))
1691                || name.starts_with(&format!("{db_stem}.corrupt."))
1692            {
1693                push_root(path);
1694            }
1695        }
1696    }
1697
1698    let backups_dir = parent.join("backups");
1699    if let Ok(entries) = fs::read_dir(backups_dir) {
1700        for entry in entries.flatten() {
1701            let path = entry.path();
1702            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1703                continue;
1704            };
1705            if has_db_sidecar_suffix(name) {
1706                continue;
1707            }
1708            if name.starts_with(&format!("{db_name}.")) && name.ends_with(".bak") {
1709                push_root(path);
1710            }
1711        }
1712    }
1713
1714    push_named_database_children(&mut roots, db_path, &parent.join("repair-lab"), db_name);
1715    push_named_database_children(&mut roots, db_path, &parent.join("snapshots"), db_name);
1716
1717    roots
1718}
1719
/// Appends `<dir>/<child>/<db_name>` to `roots` for every entry `<child>` of
/// `dir` where that file exists.
///
/// The canonical database path and paths already present in `roots` are
/// skipped. A `dir` that cannot be read contributes nothing.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(children) = fs::read_dir(dir) else {
        return;
    };
    for child in children.flatten() {
        let db_file = child.path().join(db_name);
        let is_canonical = db_file == canonical_db_path;
        if is_canonical || !db_file.exists() || roots.contains(&db_file) {
            continue;
        }
        roots.push(db_file);
    }
}
1738
/// Returns the last-modified time of `path` in milliseconds since the Unix
/// epoch.
///
/// Returns `0` when the metadata cannot be read, the modification time is not
/// available, or the timestamp predates the epoch. A millisecond count that
/// does not fit in `i64` saturates at `i64::MAX` instead of wrapping through a
/// lossy `as` cast.
fn file_mtime_ms(path: &Path) -> i64 {
    let Ok(modified) = fs::metadata(path).and_then(|meta| meta.modified()) else {
        return 0;
    };
    modified.duration_since(UNIX_EPOCH).map_or(0, |elapsed| {
        i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX)
    })
}
1747
1748fn bundle_total_bytes(root_path: &Path) -> u64 {
1749    let mut total = fs::metadata(root_path).map(|meta| meta.len()).unwrap_or(0);
1750    for suffix in ["-wal", "-shm"] {
1751        let sidecar = database_sidecar_path(root_path, suffix);
1752        total = total.saturating_add(fs::metadata(sidecar).map(|meta| meta.len()).unwrap_or(0));
1753    }
1754    total
1755}
1756
1757pub(crate) fn discover_historical_database_bundles(
1758    db_path: &Path,
1759) -> Vec<HistoricalDatabaseBundle> {
1760    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
1761        .into_iter()
1762        .filter(|root| root.exists())
1763        .map(|root_path| {
1764            let modified_at_ms = file_mtime_ms(&root_path);
1765            let total_bytes = bundle_total_bytes(&root_path);
1766            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
1767            let probe = probe_historical_bundle(&root_path, supports_direct_readonly);
1768            HistoricalDatabaseBundle {
1769                modified_at_ms,
1770                total_bytes,
1771                supports_direct_readonly,
1772                root_path,
1773                probe,
1774            }
1775        })
1776        .filter(|bundle| bundle.total_bytes > 0)
1777        .collect();
1778
1779    fn bundle_priority(path: &Path) -> i32 {
1780        let path_str = path.to_string_lossy();
1781        if path_str.contains("/repair-lab/replay-") {
1782            return 5;
1783        }
1784        if path_str.contains("/repair-lab/") {
1785            return 4;
1786        }
1787        if path_str.contains("/snapshots/") {
1788            return 3;
1789        }
1790        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
1791            return 0;
1792        }
1793        1
1794    }
1795
1796    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
1797        // Classify FTS health. The probe only sets `fts_queryable = true`
1798        // when `fts_schema_rows == Some(1)` (see
1799        // `historical_bundle_fts_queryable_via_frankensqlite`), so we have
1800        // two legitimate "clean" shapes for a bundle:
1801        //
1802        //   * `fts_schema_rows == Some(1) && fts_queryable` — a pre-V14
1803        //     bundle where the FTS virtual table was eagerly created by
1804        //     migration and is queryable right now.
1805        //
1806        //   * `fts_schema_rows == Some(0) && schema_version == Some(V14+)` —
1807        //     a modern bundle where `MIGRATION_V14` dropped fts_messages on
1808        //     purpose and cass recreates it lazily via
1809        //     `ensure_search_fallback_fts_consistency` on the first open.
1810        //     Gating on `schema_version == CURRENT_SCHEMA_VERSION` is critical
1811        //     so an incomplete pre-V14 bundle with 0 fts rows is not promoted
1812        //     alongside real lazy-V14+ bundles. A `None` schema_version
1813        //     (schema marker unreadable) is excluded for the same reason.
1814        //
1815        // Everything else — `Some(1)` without queryability, `Some(n)` for
1816        // n >= 2 (duplicated CREATE VIRTUAL TABLE rows from a broken legacy
1817        // rebuild), `None` entirely, or `Some(0)` on a non-current schema —
1818        // is not "fts clean".
1819        let fts_clean = match bundle.probe.fts_schema_rows {
1820            Some(1) => bundle.probe.fts_queryable,
1821            Some(0) => bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION),
1822            _ => false,
1823        };
1824
1825        let clean_schema14_fts =
1826            bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION) && fts_clean;
1827        if clean_schema14_fts {
1828            return 5;
1829        }
1830
1831        if fts_clean {
1832            return 4;
1833        }
1834
1835        if bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION)
1836            && bundle.supports_direct_readonly
1837        {
1838            return 3;
1839        }
1840
1841        if bundle.supports_direct_readonly {
1842            return 2;
1843        }
1844
1845        1
1846    }
1847
1848    bundles.sort_by(|left, right| {
1849        bundle_health_rank(right)
1850            .cmp(&bundle_health_rank(left))
1851            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
1852            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
1853            .then_with(|| {
1854                right
1855                    .supports_direct_readonly
1856                    .cmp(&left.supports_direct_readonly)
1857            })
1858            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
1859            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
1860            .then_with(|| right.root_path.cmp(&left.root_path))
1861    });
1862    bundles
1863}
1864
1865fn probe_historical_bundle(
1866    root_path: &Path,
1867    supports_direct_readonly: bool,
1868) -> HistoricalBundleProbe {
1869    if !supports_direct_readonly {
1870        return HistoricalBundleProbe::default();
1871    }
1872
1873    let Ok(conn) = open_historical_bundle_readonly(root_path) else {
1874        return HistoricalBundleProbe::default();
1875    };
1876
1877    let schema_version = read_meta_schema_version(&conn).ok().flatten();
1878    let fts_schema_rows: Option<i64> = conn
1879        .query_row_map(
1880            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
1881            fparams![],
1882            |row| row.get_typed(0),
1883        )
1884        .ok();
1885    let fts_queryable =
1886        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
1887    let max_message_id: i64 = conn
1888        .query_row_map(
1889            "SELECT COALESCE(MAX(id), 0) FROM messages",
1890            fparams![],
1891            |row| row.get_typed(0),
1892        )
1893        .unwrap_or(0);
1894
1895    HistoricalBundleProbe {
1896        schema_version,
1897        fts_schema_rows,
1898        fts_queryable,
1899        max_message_id,
1900    }
1901}
1902
1903fn historical_bundle_fts_queryable_via_frankensqlite(
1904    root_path: &Path,
1905    fts_schema_rows: Option<i64>,
1906) -> bool {
1907    matches!(fts_schema_rows, Some(1))
1908        && FrankenStorage::open_readonly(root_path)
1909            .map(|storage| {
1910                storage
1911                    .raw()
1912                    .query("SELECT rowid FROM fts_messages LIMIT 1")
1913                    .is_ok()
1914            })
1915            .unwrap_or(false)
1916}
1917
1918fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
1919    open_historical_bundle_readonly(root_path)
1920        .and_then(|conn| historical_bundle_has_queryable_core_tables(&conn))
1921        .is_ok()
1922}
1923
1924fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
1925    let found: Option<i64> = conn
1926        .query_row_map(
1927            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
1928            fparams![table],
1929            |row| row.get_typed(0),
1930        )
1931        .optional()
1932        .with_context(|| format!("checking for historical table {table}"))?;
1933    Ok(found.is_some())
1934}
1935
1936fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
1937    if !historical_table_exists(conn, table)? {
1938        return Err(anyhow!(
1939            "historical database missing required table {table}"
1940        ));
1941    }
1942
1943    let sql = format!("SELECT rowid FROM {table} LIMIT 1");
1944    let _: Option<i64> = conn
1945        .query_row_map(&sql, fparams![], |row| row.get_typed(0))
1946        .optional()
1947        .with_context(|| format!("probing rows from historical table {table}"))?;
1948    Ok(())
1949}
1950
1951fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
1952    probe_historical_table_reads(conn, "conversations")?;
1953    probe_historical_table_reads(conn, "messages")?;
1954    Ok(())
1955}
1956
1957fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
1958    let path_str = root_path.to_string_lossy();
1959    let flags = FrankenOpenFlags::SQLITE_OPEN_READ_ONLY;
1960    let conn = open_franken_with_flags(&path_str, flags)
1961        .with_context(|| format!("opening historical database {}", root_path.display()))?;
1962    Ok(conn)
1963}
1964
/// Returns `true` when `line` is an `INSERT` statement targeting one of the
/// core tables that salvage re-imports.
///
/// Accepted shapes are `INSERT INTO` and `INSERT OR IGNORE INTO` followed by
/// the table name wrapped in a matching pair of single or double quotes. The
/// match is anchored at the start of the line and is case-sensitive.
///
/// This runs once per line of dump output, so it strips prefixes in place
/// rather than formatting candidate strings for every table on every call.
fn is_recoverable_insert_line(line: &str) -> bool {
    const RECOVERABLE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    let Some(target) = line
        .strip_prefix("INSERT INTO ")
        .or_else(|| line.strip_prefix("INSERT OR IGNORE INTO "))
    else {
        return false;
    };

    let mut rest = target.chars();
    let quote = match rest.next() {
        Some(quote @ ('\'' | '"')) => quote,
        _ => return false,
    };
    let after_quote = rest.as_str();

    // The closing quote must match the opening one and directly follow the
    // table name, so `'messages_fts'` is not mistaken for `'messages'`.
    RECOVERABLE_TABLES.iter().any(|table| {
        after_quote
            .strip_prefix(table)
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
1982
1983fn recover_historical_bundle_via_sqlite3(
1984    bundle: &HistoricalDatabaseBundle,
1985) -> Result<HistoricalReadConnection> {
1986    let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
1987    let recovered_db = tempdir.path().join("historical-recovered.db");
1988    let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
1989        .with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
1990    temp_conn
1991        .execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
1992        .with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
1993    drop(temp_conn);
1994
1995    let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
1996    let mut recover = Command::new("sqlite3")
1997        .arg(&bundle_uri)
1998        .arg(".recover")
1999        .stdout(Stdio::piped())
2000        .spawn()
2001        .with_context(|| {
2002            format!(
2003                "launching sqlite3 .recover for historical bundle {}",
2004                bundle.root_path.display()
2005            )
2006        })?;
2007    let recover_stdout = recover
2008        .stdout
2009        .take()
2010        .context("capturing sqlite3 .recover stdout")?;
2011
2012    let mut importer = Command::new("sqlite3")
2013        .arg(&recovered_db)
2014        .stdin(Stdio::piped())
2015        .spawn()
2016        .with_context(|| {
2017            format!(
2018                "launching sqlite3 importer for recovered bundle {}",
2019                recovered_db.display()
2020            )
2021        })?;
2022
2023    {
2024        let importer_stdin = importer
2025            .stdin
2026            .as_mut()
2027            .context("opening sqlite3 importer stdin")?;
2028        importer_stdin
2029            .write_all(b"BEGIN;\n")
2030            .context("starting recovery import transaction")?;
2031
2032        let reader = BufReader::new(recover_stdout);
2033        for line in reader.lines() {
2034            let line = line.context("reading sqlite3 .recover output")?;
2035            if is_recoverable_insert_line(&line) {
2036                importer_stdin
2037                    .write_all(line.as_bytes())
2038                    .context("writing recovered INSERT")?;
2039                importer_stdin
2040                    .write_all(b"\n")
2041                    .context("writing recovered INSERT newline")?;
2042            }
2043        }
2044
2045        importer_stdin
2046            .write_all(b"COMMIT;\n")
2047            .context("committing recovery import transaction")?;
2048    }
2049
2050    let recover_status = recover
2051        .wait()
2052        .context("waiting for sqlite3 .recover process")?;
2053    if !recover_status.success() {
2054        anyhow::bail!(
2055            "sqlite3 .recover exited with status {} for {}",
2056            recover_status,
2057            bundle.root_path.display()
2058        );
2059    }
2060
2061    let importer_status = importer
2062        .wait()
2063        .context("waiting for sqlite3 recovery importer")?;
2064    if !importer_status.success() {
2065        anyhow::bail!(
2066            "sqlite3 recovery importer exited with status {} for {}",
2067            importer_status,
2068            recovered_db.display()
2069        );
2070    }
2071
2072    let conn = open_historical_bundle_readonly(&recovered_db)?;
2073    historical_bundle_has_queryable_core_tables(&conn)?;
2074    Ok(HistoricalReadConnection {
2075        conn,
2076        method: "sqlite3-recover",
2077        _tempdir: Some(tempdir),
2078    })
2079}
2080
2081fn open_historical_bundle_for_salvage(
2082    bundle: &HistoricalDatabaseBundle,
2083) -> Result<HistoricalReadConnection> {
2084    match open_historical_bundle_readonly(&bundle.root_path) {
2085        Ok(conn) => {
2086            if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
2087                return Ok(HistoricalReadConnection {
2088                    conn,
2089                    method: "direct-readonly",
2090                    _tempdir: None,
2091                });
2092            }
2093        }
2094        Err(err) => {
2095            tracing::warn!(
2096                path = %bundle.root_path.display(),
2097                error = %err,
2098                "historical bundle direct open failed; falling back to sqlite3 .recover"
2099            );
2100        }
2101    }
2102
2103    recover_historical_bundle_via_sqlite3(bundle)
2104}
2105
2106fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
2107    let conversations: i64 =
2108        conn.query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
2109            row.get_typed(0)
2110        })?;
2111    let messages: i64 = conn.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
2112        row.get_typed(0)
2113    })?;
2114    Ok((
2115        usize::try_from(conversations.max(0)).unwrap_or(usize::MAX),
2116        usize::try_from(messages.max(0)).unwrap_or(usize::MAX),
2117    ))
2118}
2119
2120fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
2121    conn.execute(
2122        "DELETE FROM meta
2123         WHERE key LIKE 'historical_bundle_salvaged:%'
2124            OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')",
2125    )?;
2126    Ok(())
2127}
2128
2129fn record_historical_bundle_import(
2130    conn: &FrankenConnection,
2131    bundle: &HistoricalDatabaseBundle,
2132    method: &str,
2133    conversations_imported: usize,
2134    messages_imported: usize,
2135) -> Result<()> {
2136    let key = FrankenStorage::historical_bundle_meta_key(bundle);
2137    let value = serde_json::json!({
2138        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
2139        "path": bundle.root_path.display().to_string(),
2140        "bytes": bundle.total_bytes,
2141        "modified_at_ms": bundle.modified_at_ms,
2142        "method": method,
2143        "conversations_imported": conversations_imported,
2144        "messages_imported": messages_imported,
2145        "recorded_at_ms": FrankenStorage::now_millis(),
2146    });
2147    let value_str = serde_json::to_string(&value)?;
2148    conn.execute_compat(
2149        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
2150        fparams![key, value_str],
2151    )?;
2152    Ok(())
2153}
2154
2155fn finalize_seeded_canonical_bundle_via_rusqlite(
2156    canonical_db_path: &Path,
2157    bundle: &HistoricalDatabaseBundle,
2158    conversations_imported: usize,
2159    messages_imported: usize,
2160) -> Result<()> {
2161    let _fts_repair =
2162        ensure_fts_consistency_via_rusqlite(canonical_db_path).with_context(|| {
2163            format!(
2164                "repairing staged canonical FTS consistency before finalization: {}",
2165                canonical_db_path.display()
2166            )
2167        })?;
2168
2169    let path_str = canonical_db_path.to_string_lossy();
2170    let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
2171        format!(
2172            "opening seeded canonical database for post-seed finalization: {}",
2173            canonical_db_path.display()
2174        )
2175    })?;
2176    conn.execute("PRAGMA busy_timeout = 30000;")
2177        .with_context(|| {
2178            format!(
2179                "configuring busy timeout for seeded canonical database {}",
2180                canonical_db_path.display()
2181            )
2182        })?;
2183    let schema_version = read_meta_schema_version(&conn)?;
2184
2185    if let Some(version) = schema_version
2186        && version < CURRENT_SCHEMA_VERSION
2187        && version != 13
2188    {
2189        anyhow::bail!(
2190            "seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
2191        );
2192    }
2193
2194    clear_seeded_runtime_meta(&conn)?;
2195
2196    conn.execute_compat(
2197        "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
2198        fparams![CURRENT_SCHEMA_VERSION.to_string()],
2199    )?;
2200
2201    conn.execute_compat(
2202        "INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
2203        fparams![CURRENT_SCHEMA_VERSION],
2204    )?;
2205    record_historical_bundle_import(
2206        &conn,
2207        bundle,
2208        "baseline-bulk-sql-copy",
2209        conversations_imported,
2210        messages_imported,
2211    )?;
2212    Ok(())
2213}
2214
2215fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
2216    let version: Option<String> = conn
2217        .query_row_map(
2218            "SELECT value FROM meta WHERE key = 'schema_version'",
2219            fparams![],
2220            |row| row.get_typed(0),
2221        )
2222        .optional()?;
2223    Ok(version.and_then(|raw| raw.parse::<i64>().ok()))
2224}
2225
/// Counts the `sqlite_master` rows named `fts_messages` (test-only helper).
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    const COUNT_FTS_ROWS_SQL: &str =
        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'";

    let counted = conn.query_row_map(COUNT_FTS_ROWS_SQL, fparams![], |row| row.get_typed(0));
    counted.context("counting sqlite_master rows for fts_messages via frankensqlite")
}
2235
/// Returns `true` when a single-row query against `fts_messages` executes
/// successfully (test-only helper).
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    let outcome = conn.query("SELECT rowid FROM fts_messages LIMIT 1");
    outcome.is_ok()
}
2240
/// Probes the database at `db_path` and reports its health (test-only).
///
/// Gathers the stored schema version, the `PRAGMA quick_check(1)` verdict,
/// the `fts_messages` schema row count and FTS queryability. Message
/// statistics are queried only when the quick check reports `ok`; otherwise
/// they are reported as zero.
///
/// # Errors
/// Fails when the database cannot be opened or configured, or when any probe
/// query other than the FTS limit probe fails.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let conn = FrankenConnection::open(db_path.to_string_lossy().as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;
    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");
    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    let fts_queryable = fts_schema_rows == 1 && franken_fts_limit_probe(&conn);

    // Message statistics are only read from a database that passed the check.
    let (message_count, max_message_id) = if quick_check_ok {
        let counted: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let highest_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (counted, highest_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
2302
/// A historical bundle copied into a temporary staging directory, ready to be
/// finalized and promoted to the canonical database path.
struct StagedHistoricalSeed {
    /// Staging directory, created inside the canonical database's parent
    /// directory by `stage_historical_bundle_for_seed`. It also hosts the
    /// pre-seed backup of the canonical database during promotion.
    tempdir: tempfile::TempDir,
    /// Path of the staged database copy inside `tempdir`.
    db_path: PathBuf,
}
2307
2308fn stage_historical_bundle_for_seed(
2309    canonical_db_path: &Path,
2310    bundle: &HistoricalDatabaseBundle,
2311) -> Result<StagedHistoricalSeed> {
2312    let canonical_parent = canonical_db_path.parent().unwrap_or_else(|| Path::new("."));
2313    fs::create_dir_all(canonical_parent).with_context(|| {
2314        format!(
2315            "creating canonical database directory before bulk historical seed import: {}",
2316            canonical_parent.display()
2317        )
2318    })?;
2319    let tempdir = tempfile::TempDir::new_in(canonical_parent)
2320        .context("creating temporary baseline seed directory")?;
2321    let staged_seed_db = tempdir.path().join("baseline-seed-output.db");
2322    copy_database_bundle(&bundle.root_path, &staged_seed_db)?;
2323
2324    Ok(StagedHistoricalSeed {
2325        tempdir,
2326        db_path: staged_seed_db,
2327    })
2328}
2329
2330fn promote_staged_historical_seed(
2331    canonical_db_path: &Path,
2332    staged_seed: &StagedHistoricalSeed,
2333) -> Result<()> {
2334    let canonical_backup = staged_seed
2335        .tempdir
2336        .path()
2337        .join("pre-seed-canonical-backup.db");
2338    let had_canonical = canonical_db_path.exists()
2339        || database_sidecar_path(canonical_db_path, "-wal").exists()
2340        || database_sidecar_path(canonical_db_path, "-shm").exists();
2341
2342    if had_canonical {
2343        move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
2344            format!(
2345                "backing up canonical database before promoting staged historical seed import: {}",
2346                canonical_db_path.display()
2347            )
2348        })?;
2349    }
2350
2351    if let Err(err) =
2352        move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
2353            format!(
2354                "promoting staged historical seed database bundle {} into canonical path {}",
2355                staged_seed.db_path.display(),
2356                canonical_db_path.display()
2357            )
2358        })
2359    {
2360        if had_canonical {
2361            let _ = move_database_bundle(&canonical_backup, canonical_db_path);
2362        }
2363        return Err(err);
2364    }
2365
2366    Ok(())
2367}
2368
2369pub(crate) fn seed_canonical_from_best_historical_bundle(
2370    canonical_db_path: &Path,
2371) -> Result<Option<HistoricalSalvageOutcome>> {
2372    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
2373    let mut last_seed_error: Option<anyhow::Error> = None;
2374    for bundle in ordered_bundles
2375        .into_iter()
2376        .filter(|bundle| bundle.supports_direct_readonly)
2377    {
2378        if let Some(version) = bundle.probe.schema_version
2379            && version < 13
2380        {
2381            let err = anyhow!(
2382                "historical bundle {} schema_version {version} is too old for baseline import",
2383                bundle.root_path.display()
2384            );
2385            tracing::warn!(
2386                path = %bundle.root_path.display(),
2387                schema_version = version,
2388                "historical bundle is too old for baseline seed import"
2389            );
2390            last_seed_error = Some(err);
2391            continue;
2392        }
2393
2394        let source = open_historical_bundle_for_salvage(&bundle).with_context(|| {
2395            format!(
2396                "opening historical seed bundle {} for baseline import",
2397                bundle.root_path.display()
2398            )
2399        })?;
2400        let (conversations_imported, messages_imported) = historical_bundle_counts(&source.conn)?;
2401
2402        let staged_seed = match stage_historical_bundle_for_seed(canonical_db_path, &bundle) {
2403            Ok(staged_seed) => staged_seed,
2404            Err(err) => {
2405                tracing::warn!(
2406                    path = %bundle.root_path.display(),
2407                    error = %err,
2408                    "bulk baseline seed staging from historical bundle failed; trying next candidate"
2409                );
2410                last_seed_error = Some(err);
2411                continue;
2412            }
2413        };
2414
2415        if let Err(err) = finalize_seeded_canonical_bundle_via_rusqlite(
2416            &staged_seed.db_path,
2417            &bundle,
2418            conversations_imported,
2419            messages_imported,
2420        ) {
2421            tracing::warn!(
2422                path = %bundle.root_path.display(),
2423                error = %err,
2424                "finalizing staged historical seed import failed; trying next candidate"
2425            );
2426            last_seed_error = Some(err);
2427            continue;
2428        }
2429
2430        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
2431            tracing::warn!(
2432                path = %bundle.root_path.display(),
2433                error = %err,
2434                "promoting staged historical seed import failed; trying next candidate"
2435            );
2436            last_seed_error = Some(err);
2437            continue;
2438        }
2439
2440        tracing::info!(
2441            path = %bundle.root_path.display(),
2442            conversations_imported,
2443            messages_imported,
2444            "seeded empty canonical database from largest healthy historical bundle"
2445        );
2446
2447        return Ok(Some(HistoricalSalvageOutcome {
2448            bundles_considered: 0,
2449            bundles_imported: 1,
2450            conversations_imported,
2451            messages_imported,
2452        }));
2453    }
2454    if let Some(err) = last_seed_error {
2455        return Err(err);
2456    }
2457    Ok(None)
2458}
2459
2460fn parse_json_column(value: Option<String>) -> serde_json::Value {
2461    value
2462        .and_then(|raw| serde_json::from_str(&raw).ok())
2463        .unwrap_or(serde_json::Value::Null)
2464}
2465
/// Object key under which `wrap_historical_raw_json` stores unparsed JSON
/// text; `historical_raw_json` recognizes a single-entry object with this key.
const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";
2467
2468fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
2469    serde_json::json!({ HISTORICAL_RAW_JSON_SENTINEL_KEY: raw })
2470}
2471
2472fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
2473    match value {
2474        serde_json::Value::Object(map) if map.len() == 1 => map
2475            .get(HISTORICAL_RAW_JSON_SENTINEL_KEY)
2476            .and_then(serde_json::Value::as_str),
2477        _ => None,
2478    }
2479}
2480
2481fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
2482    match value {
2483        Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
2484        Some(raw) => wrap_historical_raw_json(raw),
2485        None => serde_json::Value::Null,
2486    }
2487}
2488
/// Returns `true` when the `CASS_DEBUG_HISTORICAL_SALVAGE` environment
/// variable is set, regardless of its value.
fn historical_salvage_debug_enabled() -> bool {
    const DEBUG_FLAG: &str = "CASS_DEBUG_HISTORICAL_SALVAGE";

    let flag = std::env::var_os(DEBUG_FLAG);
    flag.is_some()
}
2492
/// Batch limits for historical imports, as produced by
/// `historical_import_batch_limits`.
///
/// NOTE(review): the code that enforces these limits is outside this excerpt;
/// the per-field meanings are inferred from the field names and the
/// `CASS_HISTORICAL_IMPORT_BATCH_*` override variables — confirm there.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
    /// Conversation limit (presumably per batch).
    conversations: usize,
    /// Message limit (presumably per batch).
    messages: usize,
    /// Payload size limit (presumably per batch; see
    /// `message_payload_size_hint` for a likely measure — TODO confirm).
    payload_chars: usize,
}
2499
2500fn env_positive_usize(key: &str) -> Option<usize> {
2501    dotenvy::var(key)
2502        .ok()
2503        .and_then(|value| value.parse::<usize>().ok())
2504        .filter(|value| *value > 0)
2505}
2506
2507fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
2508    let cpu_count = std::thread::available_parallelism()
2509        .map(std::num::NonZeroUsize::get)
2510        .unwrap_or(1);
2511
2512    let default_limits = if cpu_count >= 32 {
2513        HistoricalImportBatchLimits {
2514            conversations: 128,
2515            messages: 16_384,
2516            payload_chars: 12_000_000,
2517        }
2518    } else {
2519        HistoricalImportBatchLimits {
2520            conversations: 32,
2521            messages: 4_096,
2522            payload_chars: 3_000_000,
2523        }
2524    };
2525
2526    HistoricalImportBatchLimits {
2527        conversations: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
2528            .unwrap_or(default_limits.conversations),
2529        messages: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES")
2530            .unwrap_or(default_limits.messages),
2531        payload_chars: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS")
2532            .unwrap_or(default_limits.payload_chars),
2533    }
2534}
2535
2536fn json_value_size_hint(value: &serde_json::Value) -> usize {
2537    if let Some(raw) = historical_raw_json(value) {
2538        return raw.len();
2539    }
2540    match value {
2541        serde_json::Value::Null => 0,
2542        other => serde_json::to_string(other)
2543            .map(|raw| raw.len())
2544            .unwrap_or(0),
2545    }
2546}
2547
2548fn message_payload_size_hint(message: &Message) -> usize {
2549    message
2550        .content
2551        .len()
2552        .saturating_add(json_value_size_hint(&message.extra_json))
2553}
2554
/// Returns `true` when `name` starts with `prefix` and is not a `-wal` or
/// `-shm` sidecar.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    let is_wal_or_shm = ["-wal", "-shm"]
        .iter()
        .any(|suffix| name.ends_with(suffix));
    name.starts_with(prefix) && !is_wal_or_shm
}
2558
// Reports whether `name` ends in a suffix that marks a sqlite sidecar file,
// i.e. a file that must never be re-opened as a database root.
//
// Covers the standard -wal/-shm pair as well as frankensqlite's Windows
// advisory-lock sidecars (-lock-shared/-lock-reserved/-lock-pending). This is
// used by the directory enumeration in `historical_bundle_root_paths`. It is
// deliberately NOT used by `is_backup_root_name`: the existing backup-rotation
// cleanup must keep sweeping up any pre-existing orphan lock sidecars.
fn has_db_sidecar_suffix(name: &str) -> bool {
    const SIDECAR_SUFFIXES: [&str; 5] = [
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ];

    for suffix in SIDECAR_SUFFIXES.iter() {
        if name.ends_with(suffix) {
            return true;
        }
    }
    false
}
2575
/// Public schema version constant for external checks.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest stored schema version that `check_schema_compatibility` reports as
/// migratable in place; positive versions below this are reported as needing a
/// full rebuild.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
2579
/// Result of checking schema compatibility.
///
/// Produced by `check_schema_compatibility`, which opens the database
/// read-only and never modifies it.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
    /// Schema is up to date, no migration needed.
    Compatible,
    /// Schema needs migration but can be done incrementally.
    ///
    /// Also reported for a database that contains no tables at all.
    NeedsMigration,
    /// Schema is incompatible and needs a full rebuild (with reason).
    NeedsRebuild(String),
}
2590
2591fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
2592    // Only on-disk corruption classes justify destructive rebuild.
2593    // Locking, open, and generic I/O failures are often transient and must
2594    // surface as errors rather than deleting the database under the caller.
2595    matches!(
2596        err,
2597        frankensqlite::FrankenError::DatabaseCorrupt { .. }
2598            | frankensqlite::FrankenError::WalCorrupt { .. }
2599            | frankensqlite::FrankenError::NotADatabase { .. }
2600            | frankensqlite::FrankenError::ShortRead { .. }
2601    )
2602}
2603
/// Builds a sibling path of `path` named
/// `<file_name>.backup.<pid>.<nanos since epoch>.<nonce>`.
///
/// The nonce is a process-wide counter, so repeated calls within one process
/// yield distinct names even when the clock value repeats. A missing or
/// non-UTF-8 file name falls back to `db`, and a clock before the Unix epoch
/// falls back to a timestamp of `0`.
fn unique_backup_path(path: &Path) -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};

    static NEXT_NONCE: AtomicU64 = AtomicU64::new(0);

    let original_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "db",
    };
    let nanos_since_epoch = match SystemTime::now().duration_since(UNIX_EPOCH) {
        Ok(elapsed) => elapsed.as_nanos(),
        Err(_) => 0,
    };
    let nonce = NEXT_NONCE.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();

    path.with_file_name(format!(
        "{original_name}.backup.{pid}.{nanos_since_epoch}.{nonce}"
    ))
}
2621
/// Derives the hidden staging path used while a backup is being vacuumed:
/// a sibling of `backup_path` named `.<file_name>.vacuum-in-progress`.
///
/// A missing or non-UTF-8 file name falls back to `db.backup`.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let base_name = backup_path.file_name().and_then(|name| name.to_str());
    let staged_name = format!(".{}.vacuum-in-progress", base_name.unwrap_or("db.backup"));
    backup_path.with_file_name(staged_name)
}
2629
2630/// Check schema compatibility without modifying the database.
2631///
2632/// Opens the database read-only and checks the schema version.
2633fn check_schema_compatibility(
2634    path: &Path,
2635) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
2636    let mut conn = open_franken_with_flags(
2637        &path.to_string_lossy(),
2638        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
2639    )?;
2640
2641    let result = (|| {
2642        // Check if meta table exists
2643        let meta_exists: i32 = conn.query_row_map(
2644            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
2645            fparams![],
2646            |row| row.get_typed(0),
2647        )?;
2648
2649        if meta_exists == 0 {
2650            // No meta table - could be empty or very old schema, needs rebuild
2651            // But first check if there are any tables at all
2652            let table_count: i32 = conn.query_row_map(
2653                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
2654                fparams![],
2655                |row| row.get_typed(0),
2656            )?;
2657
2658            if table_count == 0 {
2659                // Empty database, will be initialized fresh
2660                return Ok(SchemaCheck::NeedsMigration);
2661            }
2662
2663            // Has tables but no meta - very old or corrupted
2664            return Ok(SchemaCheck::NeedsRebuild(
2665                "Database missing schema version metadata".to_string(),
2666            ));
2667        }
2668
2669        // Get the schema version
2670        let version: Option<i64> = conn
2671            .query_row_map(
2672                "SELECT value FROM meta WHERE key = 'schema_version'",
2673                fparams![],
2674                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
2675            )
2676            .ok()
2677            .flatten();
2678
2679        match version {
2680            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
2681            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
2682                Ok(SchemaCheck::NeedsMigration)
2683            }
2684            Some(v) if v > 0 && v < MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION => {
2685                Ok(SchemaCheck::NeedsRebuild(format!(
2686                    "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
2687                    v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
2688                )))
2689            }
2690            Some(v) => {
2691                // v > SCHEMA_VERSION - database is from a newer version
2692                Ok(SchemaCheck::NeedsRebuild(format!(
2693                    "Schema version {} is newer than supported version {}",
2694                    v, SCHEMA_VERSION
2695                )))
2696            }
2697            None => Ok(SchemaCheck::NeedsRebuild(
2698                "Schema version not found or invalid".to_string(),
2699            )),
2700        }
2701    })();
2702
2703    if let Err(close_err) = conn.close_in_place() {
2704        tracing::warn!(
2705            error = %close_err,
2706            db_path = %path.display(),
2707            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
2708        );
2709        conn.close_best_effort_in_place();
2710    }
2711
2712    result
2713}
2714
/// Module-internal alias for [`CURRENT_SCHEMA_VERSION`].
const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
2716
/// Test-only V1 baseline schema: creates the `meta`, `agents`, `workspaces`,
/// `conversations`, `messages`, `snippets`, `tags`, and `conversation_tags`
/// tables plus the initial conversation and message indexes.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    UNIQUE(agent_id, external_id)
);

CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
    ON conversations(agent_id, started_at DESC);

CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
    ON messages(conversation_id, idx);

";
2796
/// Test-only V2 migration: creates the `fts_messages` FTS5 virtual table
/// (porter tokenizer) and backfills it from the existing messages joined with
/// their conversation, agent, and optional workspace.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
2823
/// Test-only V3 migration: drops and recreates the `fts_messages` FTS5 table,
/// then backfills it with the same projection used by `MIGRATION_V2`.
#[cfg(test)]
// NOTE(review): `dead_code` is presumably allowed because no test currently
// references this constant — confirm before removing the attribute.
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
2852
/// Test-only V4 migration: adds the `sources` table and seeds the default
/// `local` source row.
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,           -- source_id (e.g., 'local', 'work-laptop')
    kind TEXT NOT NULL,            -- 'local', 'ssh', etc.
    host_label TEXT,               -- display label
    machine_id TEXT,               -- optional stable machine id
    platform TEXT,                 -- 'macos', 'linux', 'windows'
    config_json TEXT,              -- JSON blob for extra config (SSH params, path rewrites)
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
2871
/// Test-only V5 migration: rebuilds `conversations` with the `source_id` and
/// `origin_host` provenance columns and a
/// `UNIQUE(source_id, agent_id, external_id)` constraint, copying existing
/// rows with `source_id = 'local'`, then recreates the conversation indexes.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table

-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    UNIQUE(source_id, agent_id, external_id)
);

-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
                               source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
       source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;

-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;

-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
2909
/// Test-only V6 migration: adds an index on `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
2915
/// Test-only V7 migration: adds the `metadata_bin` BLOB column to
/// `conversations` and the `extra_bin` BLOB column to `messages`.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
2923
/// Test-only V8 migration: creates the `daily_stats` rollup table keyed by
/// `(day_id, agent_slug, source_id)` and its two lookup indexes.
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans

CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,              -- Days since 2020-01-01 (Unix epoch + offset)
    agent_slug TEXT NOT NULL,             -- 'all' for totals, or specific agent slug
    source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
2943
/// Test-only V9 migration: creates the `embedding_jobs` table and a partial
/// unique index that permits at most one `pending`/`running` job per
/// `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
2966
/// Test-only V10 migration for token analytics: creates the `token_usage`
/// ledger, the `token_daily_stats` rollup, and the `model_pricing` lookup
/// (seeded with pricing rows), then adds token summary columns to
/// `conversations`.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',

    -- Timing
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,

    -- Model identification
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,

    -- Token counts (nullable — not all agents provide all fields)
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,

    -- Cost estimation
    estimated_cost_usd REAL,

    -- Message context
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,

    -- Data quality
    data_source TEXT NOT NULL DEFAULT 'api',

    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',

    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,

    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,

    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,

    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,

    session_count INTEGER NOT NULL DEFAULT 0,

    last_updated INTEGER NOT NULL,

    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
3090
/// V14 migration: drops the `fts_messages` table; this script does not
/// recreate it (see the SQL comments for the rationale).
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
3100
/// Creates the `conversation_tail_state` table if it is missing. The table
/// definition matches the one in `MIGRATION_V18`, without that migration's
/// backfill from `conversations`.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
";
3111
/// V16 migration: drops the `idx_messages_conv_idx` index.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
3118
/// V17 migration: drops the `idx_messages_created` index.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
3126
/// V18 migration: creates `conversation_tail_state` and backfills it from every
/// `conversations` row that has at least one non-NULL tail column
/// (`ended_at`, `last_message_idx`, `last_message_created_at`).
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_tail_state (
    conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
   OR last_message_idx IS NOT NULL
   OR last_message_created_at IS NOT NULL;
";
3149
/// V19 migration: creates `conversation_external_lookup` and backfills it for
/// every conversation with a non-NULL `external_id`. The lookup key is the
/// length-prefixed concatenation
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
3168
/// V20 migration: creates `conversation_external_tail_lookup`, which pairs the
/// same length-prefixed lookup key built in `MIGRATION_V19` with the tail
/// columns, and backfills it from `conversations` left-joined with
/// `conversation_tail_state` (tail columns are NULL when no tail row exists).
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    ts.ended_at,
    ts.last_message_idx,
    ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
3200
/// Row from the embedding_jobs table.
///
/// Field names mirror the `embedding_jobs` columns declared in `MIGRATION_V9`.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
    /// Row id of the job.
    pub id: i64,
    /// Database path the job is associated with.
    pub db_path: String,
    /// Identifier of the embedding model.
    pub model_id: String,
    /// Job status text; the schema's partial unique index names `pending` and
    /// `running` as the active states.
    pub status: String,
    /// Total number of documents in the job.
    pub total_docs: i64,
    /// Number of documents completed so far.
    pub completed_docs: i64,
    /// Error text, if one was recorded.
    pub error_message: Option<String>,
    /// Creation time, stored as text (the schema defaults it to `datetime('now')`).
    pub created_at: String,
    /// Start time as text, if set.
    pub started_at: Option<String>,
    /// Completion time as text, if set.
    pub completed_at: Option<String>,
}
3215
/// Lightweight conversation projection used while rebuilding the lexical index.
///
/// This intentionally omits `metadata_json` / `metadata_bin` and other bulky
/// fields because Tantivy only needs the stable envelope plus provenance
/// identifiers. Reading full metadata here can force frankensqlite to traverse
/// large overflow chains before the first lexical checkpoint is committed.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
    /// Conversation row id, when known.
    pub id: Option<i64>,
    /// Slug of the agent that owns the conversation.
    pub agent_slug: String,
    /// Workspace path, if the conversation has one.
    pub workspace: Option<PathBuf>,
    /// External identifier of the conversation, if any.
    pub external_id: Option<String>,
    /// Conversation title, if any.
    pub title: Option<String>,
    /// Path of the source the conversation came from.
    pub source_path: PathBuf,
    // NOTE(review): `started_at` / `ended_at` are presumably epoch timestamps
    // in the same unit as the `conversations` columns — confirm.
    pub started_at: Option<i64>,
    pub ended_at: Option<i64>,
    /// Provenance source identifier.
    pub source_id: String,
    /// Originating host, if recorded.
    pub origin_host: Option<String>,
}
3235
/// Lightweight per-conversation footprint used to pre-plan lexical rebuild
/// shard boundaries without re-reading full message bodies in the hot path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
    /// Conversation the footprint describes.
    pub conversation_id: i64,
    /// Number of messages attributed to the conversation.
    pub message_count: usize,
    /// Message payload size in bytes; this is an estimate when the row is built
    /// by `lexical_rebuild_conversation_footprint_from_count`.
    pub message_bytes: usize,
}
3244
/// Per-message byte estimate (4 KiB) that
/// `lexical_rebuild_conversation_footprint_from_count` multiplies by the
/// message count.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Maximum number of conversations that may lack tail metadata while
/// `lexical_rebuild_tail_metadata_coverage_is_sufficient` still reports the
/// coverage as sufficient.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;
3247
3248fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
3249    total_conversations: usize,
3250    covered_conversations: usize,
3251) -> bool {
3252    total_conversations == 0
3253        || total_conversations.saturating_sub(covered_conversations.min(total_conversations))
3254            <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
3255}
3256
/// Converts a zero-based last-message index into a message count (`idx + 1`).
///
/// Returns `None` when the index is absent, negative, or when the resulting
/// count does not fit in `u64` / `usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|idx| u64::try_from(idx).ok())
        .and_then(|idx| idx.checked_add(1))
        .and_then(|count| usize::try_from(count).ok())
}
3262
3263fn lexical_rebuild_conversation_footprint_from_count(
3264    conversation_id: i64,
3265    message_count: usize,
3266) -> LexicalRebuildConversationFootprintRow {
3267    LexicalRebuildConversationFootprintRow {
3268        conversation_id,
3269        message_count,
3270        message_bytes: message_count
3271            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE),
3272    }
3273}
3274
/// Lightweight message projection used by the streaming lexical rebuild path.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
    /// Id of the conversation the message belongs to.
    pub conversation_id: i64,
    /// Message row id.
    pub id: i64,
    /// Position of the message within its conversation.
    pub idx: i64,
    /// Message role, as text.
    pub role: String,
    /// Message author, if recorded.
    pub author: Option<String>,
    /// Creation timestamp, if recorded.
    pub created_at: Option<i64>,
    /// Message body text.
    pub content: String,
}
3286
/// Even lighter message projection used only by the grouped lexical rebuild
/// stream hot path. It keeps just the per-message fields the rebuild consumes
/// and tracks the final message id at conversation scope instead.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
    /// Position of the message within its conversation.
    pub idx: i64,
    /// Whether the message has the tool role; stored as a flag instead of the
    /// role string.
    pub is_tool_role: bool,
    /// Creation timestamp, if recorded.
    pub created_at: Option<i64>,
    /// Message body text.
    pub content: String,
}
3297
/// Grouped message rows for one conversation, stored in a `SmallVec` with
/// inline capacity for 32 rows.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;

/// Compatibility alias retained while call sites finish converging on `FrankenStorage`.
pub type SqliteStorage = FrankenStorage;
3302
/// Primary frankensqlite-backed storage backend.
///
/// Bundles one `FrankenConnection` with its database path and a set of
/// in-memory caches. The four `ensured_*` caches are held behind `Arc` so they
/// can be supplied from outside via `new_with_shared_caches`.
pub struct FrankenStorage {
    /// Underlying frankensqlite connection.
    conn: FrankenConnection,
    /// Database path associated with `conn`.
    db_path: PathBuf,
    // NOTE(review): presumably set once an ephemeral-writer preflight check has
    // succeeded, so it is not repeated — confirm.
    ephemeral_writer_preflight_verified: AtomicBool,
    // NOTE(review): presumably holds `UNSET_INDEX_WRITER_CHECKPOINT_PAGES` until
    // an index-writer checkpoint override is configured — confirm.
    index_writer_checkpoint_pages: AtomicI64,
    // NOTE(review): presumably holds `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS` until
    // an index-writer busy timeout is configured — confirm.
    index_writer_busy_timeout_ms: AtomicU64,
    /// Slot for a reusable ephemeral writer connection; see `CachedEphemeralWriter`.
    cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
    /// Cache from agent identity to an `i64` value (presumably the agent row id).
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    /// Cache from workspace identity to an `i64` value (presumably the workspace row id).
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    /// Set of conversation source keys already recorded as ensured.
    ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
    /// Set of daily-stats keys already recorded as ensured.
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
    // NOTE(review): presumably one of the `FTS_MESSAGES_PRESENT_*` states — confirm.
    fts_messages_present_cache: AtomicI8,
}
3317
/// Keep ordinary storage commits from tripping over frequent auto-checkpoints
/// while still bounding WAL growth. Bulk index paths may override this through
/// their explicit checkpoint policy.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
// NOTE(review): presumably the "no override configured" sentinel for
// `FrankenStorage::index_writer_checkpoint_pages` — confirm.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
// NOTE(review): presumably the "no timeout configured" sentinel for
// `FrankenStorage::index_writer_busy_timeout_ms` — confirm.
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
// Tri-state encoding (unknown / absent / present) sized for an `AtomicI8`;
// presumably stored in `FrankenStorage::fts_messages_present_cache` — confirm.
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
3327
/// State of the ephemeral writer slot held by `FrankenStorage`.
enum CachedEphemeralWriter {
    /// No writer connection has been stored yet.
    Uninitialized,
    /// A writer connection is stored and available (boxed to keep the enum small).
    Cached(Box<SendFrankenConnection>),
    // NOTE(review): presumably marks that the cached writer is currently
    // checked out by a caller — confirm.
    InUse,
}
3333
3334#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3335struct EnsuredAgentKey {
3336    slug: String,
3337    name: String,
3338    version: Option<String>,
3339    kind: String,
3340}
3341
3342impl EnsuredAgentKey {
3343    fn from_agent(agent: &Agent) -> Self {
3344        Self {
3345            slug: agent.slug.clone(),
3346            name: agent.name.clone(),
3347            version: agent.version.clone(),
3348            kind: agent_kind_str(agent.kind.clone()),
3349        }
3350    }
3351}
3352
/// Hashable identity of a workspace, used as the key of the ensured-workspaces
/// cache.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}

impl EnsuredWorkspaceKey {
    /// Builds a key from an owned path and an optional borrowed display name,
    /// which is copied into an owned `String`.
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(String::from);
        Self { path, display_name }
    }
}
3367
3368#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3369struct EnsuredConversationSourceKey {
3370    id: String,
3371    kind: SourceKind,
3372    host_label: Option<String>,
3373}
3374
3375impl EnsuredConversationSourceKey {
3376    fn from_source(source: &Source) -> Self {
3377        Self {
3378            id: source.id.clone(),
3379            kind: source.kind,
3380            host_label: source.host_label.clone(),
3381        }
3382    }
3383}
3384
/// Hashable `(day_id, agent_slug, source_id)` triple, used as the element type
/// of the ensured-daily-stats cache.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}

impl EnsuredDailyStatsKey {
    /// Builds a key, copying the borrowed slug and source id into owned strings.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);
        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
3401
/// PRAGMA spellings tried, in order, to switch `autocommit_retain` off: the
/// `fsqlite.`-namespaced form first, then the bare form.
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
3406
3407fn disable_autocommit_retain<E>(
3408    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
3409) -> Result<&'static str>
3410where
3411    E: std::fmt::Display,
3412{
3413    let mut failures = Vec::new();
3414    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
3415        match execute(pragma) {
3416            Ok(()) => return Ok(pragma),
3417            Err(err) => {
3418                let error = err.to_string();
3419                tracing::debug!(
3420                    %pragma,
3421                    error = %error,
3422                    "autocommit_retain PRAGMA variant not supported"
3423                );
3424                failures.push(format!("{pragma}: {error}"));
3425            }
3426        }
3427    }
3428
3429    Err(anyhow!(
3430        "failed to disable autocommit_retain on frankensqlite connection; \
3431         refusing to keep a long-lived MVCC connection that may accumulate \
3432         unbounded write snapshots. Upgrade frankensqlite to a version that \
3433         supports one of these PRAGMAs or use a short-lived connection path. \
3434         attempts: {}",
3435        failures.join("; ")
3436    ))
3437}
3438
3439impl FrankenStorage {
3440    fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
3441        Self::new_with_shared_caches(
3442            conn,
3443            db_path,
3444            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3445            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3446            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3447            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3448        )
3449    }
3450
3451    fn new_with_shared_caches(
3452        conn: FrankenConnection,
3453        db_path: PathBuf,
3454        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3455        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3456        ensured_conversation_sources: Arc<
3457            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3458        >,
3459        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3460    ) -> Self {
3461        Self {
3462            conn,
3463            db_path,
3464            ephemeral_writer_preflight_verified: AtomicBool::new(false),
3465            index_writer_checkpoint_pages: AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES),
3466            index_writer_busy_timeout_ms: AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS),
3467            cached_ephemeral_writer: parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized),
3468            ensured_agents,
3469            ensured_workspaces,
3470            ensured_conversation_sources,
3471            ensured_daily_stats_keys,
3472            fts_messages_present_cache: AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN),
3473        }
3474    }
3475
3476    fn apply_open_stage_busy_timeout(&self) {
3477        if let Err(err) = self.conn.execute("PRAGMA busy_timeout = 5000;") {
3478            tracing::debug!(
3479                error = %err,
3480                "failed to apply open-stage busy_timeout before migrations"
3481            );
3482        }
3483    }
3484
3485    /// Open a frankensqlite connection, run migrations, and apply config.
3486    ///
3487    /// This initializes canonical schema state only. Derived fallback search
3488    /// structures like the in-database `fts_messages` table are repaired
3489    /// separately so ordinary opens never block on heavyweight maintenance.
3490    pub fn open(path: &Path) -> Result<Self> {
3491        if let Some(parent) = path.parent() {
3492            fs::create_dir_all(parent)
3493                .with_context(|| format!("creating db directory {}", parent.display()))?;
3494        }
3495
3496        let path_str = path.to_string_lossy().to_string();
3497        let _doctor_guard =
3498            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3499        let conn = FrankenConnection::open(&path_str)
3500            .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
3501        let storage = Self::new(conn, path.to_path_buf());
3502        storage.apply_open_stage_busy_timeout();
3503        storage.run_migrations()?;
3504        storage.repair_missing_current_schema_objects()?;
3505        storage.apply_config()?;
3506        Ok(storage)
3507    }
3508
3509    /// Open a writer connection that skips migration (assumes DB already migrated).
3510    ///
3511    /// Used by the BEGIN CONCURRENT parallel writer pool: each writer needs its
3512    /// own connection with config applied, but migrations have already been run
3513    /// by the primary connection.
3514    pub fn open_writer(path: &Path) -> Result<Self> {
3515        Self::open_writer_with_shared_caches(
3516            path,
3517            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3518            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3519            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3520            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3521        )
3522    }
3523
3524    fn open_writer_with_shared_caches(
3525        path: &Path,
3526        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3527        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3528        ensured_conversation_sources: Arc<
3529            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3530        >,
3531        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3532    ) -> Result<Self> {
3533        let path_str = path.to_string_lossy().to_string();
3534        let _doctor_guard =
3535            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3536        let conn = FrankenConnection::open(&path_str)
3537            .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
3538        let storage = Self::new_with_shared_caches(
3539            conn,
3540            path.to_path_buf(),
3541            ensured_agents,
3542            ensured_workspaces,
3543            ensured_conversation_sources,
3544            ensured_daily_stats_keys,
3545        );
3546        storage.apply_config()?;
3547        Ok(storage)
3548    }
3549
3550    pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
3551        let mut cached = self.cached_ephemeral_writer.lock();
3552        match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
3553            CachedEphemeralWriter::Cached(conn) => {
3554                let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
3555                let writer = Self::new_with_shared_caches(
3556                    conn,
3557                    self.db_path.clone(),
3558                    Arc::clone(&self.ensured_agents),
3559                    Arc::clone(&self.ensured_workspaces),
3560                    Arc::clone(&self.ensured_conversation_sources),
3561                    Arc::clone(&self.ensured_daily_stats_keys),
3562                );
3563                writer
3564                    .index_writer_checkpoint_pages
3565                    .store(checkpoint_pages, Ordering::Relaxed);
3566                writer
3567                    .index_writer_busy_timeout_ms
3568                    .store(busy_timeout_ms, Ordering::Relaxed);
3569                Ok((writer, true))
3570            }
3571            CachedEphemeralWriter::Uninitialized => {
3572                drop(cached);
3573                match Self::open_writer_with_shared_caches(
3574                    &self.db_path,
3575                    Arc::clone(&self.ensured_agents),
3576                    Arc::clone(&self.ensured_workspaces),
3577                    Arc::clone(&self.ensured_conversation_sources),
3578                    Arc::clone(&self.ensured_daily_stats_keys),
3579                ) {
3580                    Ok(writer) => Ok((writer, true)),
3581                    Err(err) => {
3582                        let mut cached = self.cached_ephemeral_writer.lock();
3583                        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3584                            *cached = CachedEphemeralWriter::Uninitialized;
3585                        }
3586                        Err(err)
3587                    }
3588                }
3589            }
3590            CachedEphemeralWriter::InUse => {
3591                *cached = CachedEphemeralWriter::InUse;
3592                drop(cached);
3593                Ok((
3594                    Self::open_writer_with_shared_caches(
3595                        &self.db_path,
3596                        Arc::clone(&self.ensured_agents),
3597                        Arc::clone(&self.ensured_workspaces),
3598                        Arc::clone(&self.ensured_conversation_sources),
3599                        Arc::clone(&self.ensured_daily_stats_keys),
3600                    )?,
3601                    false,
3602                ))
3603            }
3604        }
3605    }
3606
3607    pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
3608        let checkpoint_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
3609        let busy_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
3610        let conn = writer.into_raw();
3611        let mut cached = self.cached_ephemeral_writer.lock();
3612        debug_assert!(
3613            matches!(&*cached, CachedEphemeralWriter::InUse),
3614            "cached ephemeral writer state should be in-use when releasing"
3615        );
3616        *cached = CachedEphemeralWriter::Cached(Box::new(
3617            SendFrankenConnection::new_with_index_writer_state(
3618                conn,
3619                checkpoint_pages,
3620                busy_timeout_ms,
3621            ),
3622        ));
3623    }
3624
3625    pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
3626        writer.close_best_effort_in_place();
3627        let mut cached = self.cached_ephemeral_writer.lock();
3628        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3629            *cached = CachedEphemeralWriter::Uninitialized;
3630        }
3631    }
3632
3633    fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
3634        self.ensured_agents.lock().get(key).copied()
3635    }
3636
3637    fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
3638        self.ensured_agents.lock().insert(key, id);
3639    }
3640
3641    fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
3642        self.ensured_workspaces.lock().get(key).copied()
3643    }
3644
3645    fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
3646        self.ensured_workspaces.lock().insert(key, id);
3647    }
3648
3649    fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
3650        self.ensured_conversation_sources.lock().contains(key)
3651    }
3652
3653    fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
3654        self.ensured_conversation_sources.lock().insert(key);
3655    }
3656
3657    fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
3658        self.ensured_daily_stats_keys.lock().contains(key)
3659    }
3660
3661    fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
3662        let ensured = self.ensured_daily_stats_keys.lock();
3663        keys.iter().all(|key| ensured.contains(key))
3664    }
3665
3666    fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
3667        self.ensured_daily_stats_keys.lock().insert(key);
3668    }
3669
3670    fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
3671        match self.fts_messages_present_cache.load(Ordering::Acquire) {
3672            FTS_MESSAGES_PRESENT_PRESENT => return true,
3673            FTS_MESSAGES_PRESENT_ABSENT => return false,
3674            _ => {}
3675        }
3676
3677        let present = tx
3678            .query_row_map(
3679                "SELECT COUNT(*) FROM sqlite_master
3680                 WHERE name = 'fts_messages'
3681                   AND rootpage > 0",
3682                fparams![],
3683                |row| row.get_typed::<i64>(0),
3684            )
3685            .map(|count| count > 0)
3686            .unwrap_or_else(|err| {
3687                tracing::debug!(
3688                    error = %err,
3689                    "failed to probe fts_messages presence; skipping db-resident FTS maintenance"
3690                );
3691                false
3692            });
3693        self.set_fts_messages_present_cache(present);
3694        present
3695    }
3696
3697    fn set_fts_messages_present_cache(&self, present: bool) {
3698        self.fts_messages_present_cache.store(
3699            if present {
3700                FTS_MESSAGES_PRESENT_PRESENT
3701            } else {
3702                FTS_MESSAGES_PRESENT_ABSENT
3703            },
3704            Ordering::Release,
3705        );
3706    }
3707
3708    fn invalidate_fts_messages_present_cache(&self) {
3709        self.fts_messages_present_cache
3710            .store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
3711    }
3712
3713    fn invalidate_conversation_source_cache(&self, source_id: &str) {
3714        self.ensured_conversation_sources
3715            .lock()
3716            .retain(|key| key.id != source_id);
3717    }
3718
3719    fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
3720        let cached = self.cached_ephemeral_writer.get_mut();
3721        if let CachedEphemeralWriter::Cached(conn) =
3722            std::mem::replace(cached, CachedEphemeralWriter::Uninitialized)
3723        {
3724            let mut conn = conn;
3725            conn.0.close_best_effort_in_place();
3726        }
3727    }
3728
3729    fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
3730        let cached = self.cached_ephemeral_writer.get_mut();
3731        match std::mem::replace(cached, CachedEphemeralWriter::Uninitialized) {
3732            CachedEphemeralWriter::Cached(mut conn) => conn
3733                .0
3734                .close_without_checkpoint_in_place()
3735                .with_context(|| "closing cached frankensqlite writer without final checkpoint"),
3736            CachedEphemeralWriter::Uninitialized | CachedEphemeralWriter::InUse => Ok(()),
3737        }
3738    }
3739
3740    /// Open in read-only mode using frankensqlite compat flags.
3741    pub fn open_readonly(path: &Path) -> Result<Self> {
3742        Self::open_readonly_with_doctor_lock_timeout(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)
3743    }
3744
3745    /// Open in read-only mode with an explicit doctor mutation-lock timeout.
3746    ///
3747    /// This is primarily useful for probes that need to prove a reader would
3748    /// not enter the archive while `cass doctor --fix` owns the repair lock.
3749    pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
3750        let path_str = path.to_string_lossy().to_string();
3751        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
3752        let conn = open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
3753            .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
3754        let storage = Self::new(conn, path.to_path_buf());
3755        storage.apply_readonly_config()?;
3756        Ok(storage)
3757    }
3758
3759    pub fn close(self) -> Result<()> {
3760        let mut this = self;
3761        this.close_cached_ephemeral_writer_best_effort_in_place();
3762        this.conn
3763            .close()
3764            .with_context(|| "closing frankensqlite connection")
3765    }
3766
3767    pub fn close_without_checkpoint(self) -> Result<()> {
3768        let mut this = self;
3769        this.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3770        this.conn
3771            .close_without_checkpoint()
3772            .with_context(|| "closing frankensqlite connection without final checkpoint")
3773    }
3774
    /// Close the parked ephemeral writer (if any) and then the primary
    /// connection, both on a best-effort basis, without consuming the handle.
    pub fn close_best_effort_in_place(&mut self) {
        self.close_cached_ephemeral_writer_best_effort_in_place();
        self.conn.close_best_effort_in_place();
    }
3779
3780    pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
3781        self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3782        self.conn
3783            .close_without_checkpoint_in_place()
3784            .with_context(|| "closing frankensqlite connection without final checkpoint")
3785    }
3786
3787    /// Access the raw frankensqlite connection.
3788    pub fn raw(&self) -> &FrankenConnection {
3789        &self.conn
3790    }
3791
3792    /// Consume the storage wrapper and return the underlying frankensqlite
3793    /// connection after migrations/repair have already been applied.
3794    pub fn into_raw(self) -> FrankenConnection {
3795        let mut this = self;
3796        this.close_cached_ephemeral_writer_best_effort_in_place();
3797        this.conn
3798    }
3799
3800    /// Apply connection PRAGMAs for parity with SqliteStorage's `apply_pragmas()`.
3801    ///
3802    /// Frankensqlite supports all PRAGMAs cass uses (journal_mode, synchronous,
3803    /// cache_size, foreign_keys, busy_timeout). Its default journal_mode is already
3804    /// WAL and default synchronous is NORMAL, matching cass's requirements.
3805    ///
3806    pub fn apply_config(&self) -> Result<()> {
3807        // journal_mode: frankensqlite defaults to WAL, same as cass.
3808        // synchronous: frankensqlite defaults to NORMAL, same as cass.
3809        // Both are set explicitly for clarity.
3810        self.conn
3811            .execute("PRAGMA journal_mode = WAL;")
3812            .with_context(|| "setting journal_mode")?;
3813        self.conn
3814            .execute("PRAGMA synchronous = NORMAL;")
3815            .with_context(|| "setting synchronous")?;
3816
3817        // cache_size: 64MB (negative value = KiB).
3818        self.conn
3819            .execute("PRAGMA cache_size = -65536;")
3820            .with_context(|| "setting cache_size")?;
3821
3822        // foreign_keys: enable constraint enforcement.
3823        self.conn
3824            .execute("PRAGMA foreign_keys = ON;")
3825            .with_context(|| "setting foreign_keys")?;
3826
3827        // busy_timeout: 5 seconds (in milliseconds).
3828        self.conn
3829            .execute("PRAGMA busy_timeout = 5000;")
3830            .with_context(|| "setting busy_timeout")?;
3831
3832        // temp_store = MEMORY and mmap_size are C SQLite performance knobs.
3833        // In frankensqlite's architecture (in-memory MVCC engine with pager
3834        // backend), temp_store is always memory-resident and mmap_size does not
3835        // apply. Skipped intentionally — these are no-ops or errors.
3836
3837        // wal_autocheckpoint: use a bounded cadence that avoids checkpointing
3838        // inside common append batches without deferring checkpoints forever.
3839        let checkpoint_pragma =
3840            format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
3841        let _ = self.conn.execute(&checkpoint_pragma);
3842        self.index_writer_checkpoint_pages
3843            .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);
3844        // Explicitly enable concurrent writer mode for BEGIN/transaction paths.
3845        // Try both namespace variants for compatibility across fsqlite builds.
3846        let _ = self.conn.execute("PRAGMA fsqlite.concurrent_mode = ON;");
3847        let _ = self.conn.execute("PRAGMA concurrent_mode = ON;");
3848        // Frankensqlite retained autocommit currently mis-serves same-connection
3849        // read-after-write queries on cass's storage paths; keep it off here
3850        // until the upstream visibility bug is fixed.
3851        //
3852        // CASS #163 item 3: If neither PRAGMA variant succeeds, the MVCC engine
3853        // will accumulate write snapshots for the lifetime of the connection,
3854        // causing unbounded memory growth on long-lived watch-mode handles.
3855        // Log at warn level so the failure is visible instead of silently
3856        // swallowed, and set a flag for callers that need to periodically
3857        // recycle the connection.
3858        let autocommit_pragma =
3859            disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ()))?;
3860        tracing::debug!(
3861            pragma = autocommit_pragma,
3862            "disabled frankensqlite autocommit_retain for storage connection"
3863        );
3864
3865        Ok(())
3866    }
3867
3868    fn apply_readonly_config(&self) -> Result<()> {
3869        self.conn
3870            .execute("PRAGMA query_only = 1;")
3871            .with_context(|| "setting query_only")?;
3872        self.conn
3873            .execute("PRAGMA busy_timeout = 5000;")
3874            .with_context(|| "setting busy_timeout")?;
3875        self.conn
3876            .execute("PRAGMA cache_size = -65536;")
3877            .with_context(|| "setting cache_size")?;
3878        self.conn
3879            .execute("PRAGMA foreign_keys = ON;")
3880            .with_context(|| "setting foreign_keys")?;
3881        Ok(())
3882    }
3883
3884    /// Run all schema migrations, handling transition from meta table versioning.
3885    ///
3886    /// The existing `SqliteStorage` tracks schema version in a `meta` table entry.
3887    /// The new `MigrationRunner` uses a `_schema_migrations` table. This method:
3888    /// 1. Transitions existing databases from meta table → `_schema_migrations`
3889    /// 2. Runs pending migrations via `MigrationRunner`
3890    /// 3. Syncs `meta.schema_version` for backward compatibility
3891    ///
3892    /// # Fresh vs existing databases
3893    ///
3894    /// Fresh databases use a single combined migration (`MIGRATION_FRESH_SCHEMA`)
3895    /// that creates the complete V13 schema directly. This avoids the incremental
3896    /// V5 migration which uses `DROP TABLE` — an operation that triggers a known
3897    /// frankensqlite autoindex limitation.
3898    ///
3899    /// Existing databases (transitioned from SqliteStorage) are typically at
3900    /// V13 or newer already; additive post-V13 migrations are applied normally.
3901    pub fn run_migrations(&self) -> Result<()> {
3902        transition_from_meta_version(&self.conn)?;
3903
3904        let base_result = build_cass_migrations_before_tail_cache()
3905            .run(&self.conn)
3906            .with_context(|| "running base schema migrations")?;
3907
3908        let mut applied = base_result.applied;
3909        if apply_conversation_tail_state_cache_migration(&self.conn)
3910            .with_context(|| "running conversation tail-state cache migration")?
3911        {
3912            applied.push(15);
3913        }
3914
3915        let post_result = build_cass_migrations_after_tail_cache()
3916            .run(&self.conn)
3917            .with_context(|| "running post-tail-cache schema migrations")?;
3918        applied.extend(post_result.applied);
3919
3920        let current = self.schema_version()?;
3921        if !applied.is_empty() {
3922            info!(
3923                applied = ?applied,
3924                current,
3925                was_fresh = base_result.was_fresh,
3926                "frankensqlite schema migrations applied"
3927            );
3928        }
3929
3930        // Keep meta.schema_version in sync for backward compatibility.
3931        self.sync_meta_schema_version(current)?;
3932
3933        Ok(())
3934    }
3935
3936    /// Some historical canonical rebuild paths produced databases whose
3937    /// version markers claim the current schema while post-V10 analytics
3938    /// tables were never materialized. Detect that drift and backfill the
3939    /// idempotent table/index set from the combined schema migration.
3940    fn repair_missing_current_schema_objects(&self) -> Result<()> {
3941        let mut missing_tables = Vec::new();
3942        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
3943            if let Err(err) = self.conn.query(probe_sql) {
3944                if error_indicates_missing_table(&err) {
3945                    missing_tables.push(table_name);
3946                    continue;
3947                }
3948                return Err(err).with_context(|| {
3949                    format!("probing required schema table {table_name} for completeness")
3950                });
3951            }
3952        }
3953
3954        if !missing_tables.is_empty() {
3955            info!(
3956                missing_tables = ?missing_tables,
3957                "repairing missing current-schema tables on an already-versioned cass database"
3958            );
3959
3960            for batch in current_schema_repair_batches_for_missing_tables(&missing_tables)? {
3961                self.conn
3962                    .execute_batch(batch.sql)
3963                    .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
3964            }
3965
3966            for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
3967                if !missing_tables.contains(&table_name) {
3968                    continue;
3969                }
3970                self.conn
3971                    .query(probe_sql)
3972                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
3973            }
3974        }
3975        self.repair_missing_conversation_token_columns()?;
3976        Ok(())
3977    }
3978
3979    fn repair_missing_conversation_token_columns(&self) -> Result<()> {
3980        let columns = franken_table_column_names(&self.conn, "conversations")
3981            .with_context(|| "inspecting conversations columns for token-summary repair")?;
3982        let mut missing_columns = Vec::new();
3983        for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
3984            if columns.contains(column_name) {
3985                continue;
3986            }
3987            let sql = format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
3988            self.conn.execute(&sql).with_context(|| {
3989                format!("adding missing conversations.{column_name} token-summary column")
3990            })?;
3991            missing_columns.push(column_name);
3992        }
3993        if !missing_columns.is_empty() {
3994            tracing::warn!(
3995                target: "cass::schema_repair",
3996                db_path = %self.db_path.display(),
3997                missing_columns = ?missing_columns,
3998                "cass#222: repaired missing conversations token-summary columns"
3999            );
4000        }
4001        Ok(())
4002    }
4003
4004    /// Detect and remove orphan rows whose FK parent has gone missing.
4005    ///
4006    /// A `Connection` dropped mid-transaction (the `drop_close` warning emitted
4007    /// by frankensqlite's `Drop` impl) can leave child rows persisted without a
4008    /// matching parent — `messages` referencing a `conversation_id` that does
4009    /// not exist, `message_metrics`/`token_usage`/`snippets` referencing a
4010    /// `message_id` that does not exist, etc. With `PRAGMA foreign_keys = ON`,
4011    /// every subsequent indexer pass then trips `FOREIGN KEY constraint failed`
4012    /// on the next write, the session never gets marked indexed, and the
4013    /// pending backlog grows without bound (issue #202).
4014    ///
4015    /// This pass runs at indexer startup as defense in depth: it scans each
4016    /// child table for rows whose parent row has gone missing and removes them
4017    /// in bounded committed chunks, breaking the failure cycle even when the
4018    /// underlying transaction-discipline bug has not been fully root-caused.
4019    /// The pass is idempotent (a clean database is a no-op), and emits a
4020    /// `WARN` after successful cleanup so the upstream `drop_close` condition
4021    /// stays visible.
4022    pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
4023        let mut report = OrphanFkCleanupReport::default();
4024        let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
4025            Ok(ids) => ids,
4026            Err(err) if error_indicates_missing_table(&err) => {
4027                tracing::debug!(
4028                    target: "cass::fk_repair",
4029                    child_table = "messages",
4030                    error = %err,
4031                    "skipping orphan-message probe (table or column unavailable)"
4032                );
4033                Vec::new()
4034            }
4035            Err(err) => return Err(err),
4036        };
4037        if !orphan_message_ids.is_empty() {
4038            report.record("messages", orphan_message_ids.len() as i64);
4039        }
4040
4041        if !orphan_message_ids.is_empty() {
4042            delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
4043                .context("deleting orphan message rows and dependent children")?;
4044        }
4045
4046        for entry in ORPHAN_DIRECT_CHILD_TABLES {
4047            loop {
4048                let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
4049                    Ok(ids) => ids,
4050                    Err(err)
4051                        if error_indicates_missing_table(&err)
4052                            || error_indicates_missing_column(&err) =>
4053                    {
4054                        // Tolerant probe: a missing child/parent table or FK
4055                        // column on older schemas means there is nothing to
4056                        // clean up for this table.
4057                        tracing::debug!(
4058                            target: "cass::fk_repair",
4059                            child_table = entry.child_table,
4060                            error = %err,
4061                            "skipping orphan probe (table or column unavailable)"
4062                        );
4063                        break;
4064                    }
4065                    Err(err) => {
4066                        return Err(err).with_context(|| {
4067                            format!("probing orphan rows in {}", entry.child_table)
4068                        });
4069                    }
4070                };
4071                if ids.is_empty() {
4072                    break;
4073                }
4074
4075                let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
4076                    .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
4077                if deleted == 0 {
4078                    break;
4079                }
4080                report.record(
4081                    entry.child_table,
4082                    i64::try_from(deleted).unwrap_or(i64::MAX),
4083                );
4084            }
4085        }
4086
4087        if report.total == 0 {
4088            return Ok(report);
4089        }
4090
4091        // WARN only fires after a successful commit so the message accurately
4092        // reflects what actually happened on disk. db_path is included so logs
4093        // from concurrent indexers against different databases stay
4094        // disambiguated.
4095        tracing::warn!(
4096            target: "cass::fk_repair",
4097            db_path = %self.db_path.display(),
4098            total_orphans = report.total,
4099            per_table = ?report.per_table,
4100            "cass#202: removed orphan rows left behind by interrupted index transactions"
4101        );
4102
4103        Ok(report)
4104    }
4105
4106    /// Return the current schema version from `_schema_migrations`.
4107    pub fn schema_version(&self) -> Result<i64> {
4108        let rows = self
4109            .conn
4110            .query("SELECT MAX(version) FROM _schema_migrations;")
4111            .with_context(|| "reading schema version from _schema_migrations")?;
4112
4113        if let Some(row) = rows.first()
4114            && let Ok(v) = row.get_typed::<Option<i64>>(0)
4115        {
4116            return Ok(v.unwrap_or(0));
4117        }
4118        Ok(0)
4119    }
4120
4121    /// Keep `meta.schema_version` in sync for backward compatibility with `SqliteStorage`.
4122    fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
4123        // The meta table is created by V1 migration. If it doesn't exist yet,
4124        // there's nothing to sync.
4125        if self.conn.query("SELECT key FROM meta LIMIT 1;").is_err() {
4126            return Ok(());
4127        }
4128
4129        // Only write if the version needs updating to avoid write lock contention
4130        if let Ok(rows) = self
4131            .conn
4132            .query("SELECT value FROM meta WHERE key = 'schema_version';")
4133            && let Some(row) = rows.first()
4134            && let Ok(val) = row.get_typed::<String>(0)
4135            && val == version.to_string()
4136        {
4137            return Ok(()); // Already up to date
4138        }
4139
4140        self.conn
4141            .execute_compat(
4142                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
4143                &[ParamValue::from(version.to_string())],
4144            )
4145            .with_context(|| "syncing meta schema_version")?;
4146
4147        Ok(())
4148    }
4149
4150    /// Resolve the database file path for this connection.
4151    pub fn database_path(&self) -> Result<PathBuf> {
4152        Ok(self.db_path.clone())
4153    }
4154
4155    pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
4156        self.ephemeral_writer_preflight_verified
4157            .load(Ordering::Relaxed)
4158    }
4159
4160    pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
4161        self.ephemeral_writer_preflight_verified
4162            .store(true, Ordering::Relaxed);
4163    }
4164
4165    pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
4166        let pages = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4167        (pages != UNSET_INDEX_WRITER_CHECKPOINT_PAGES).then_some(pages)
4168    }
4169
4170    pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
4171        self.index_writer_checkpoint_pages
4172            .store(pages, Ordering::Relaxed);
4173    }
4174
4175    pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
4176        let timeout_ms = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4177        (timeout_ms != UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS).then_some(timeout_ms)
4178    }
4179
4180    pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
4181        self.index_writer_busy_timeout_ms
4182            .store(timeout_ms, Ordering::Relaxed);
4183    }
4184
4185    /// Open database with migration, backing up if schema is incompatible.
4186    pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
4187        if let Some(parent) = path.parent() {
4188            fs::create_dir_all(parent)?;
4189        }
4190
4191        if path.exists() {
4192            let check_result = check_schema_compatibility(path);
4193            match check_result {
4194                Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => {
4195                    // Continue with normal open
4196                }
4197                Ok(SchemaCheck::NeedsRebuild(reason)) => {
4198                    let backup_path = create_backup(path)?;
4199                    cleanup_old_backups(path, MAX_BACKUPS)?;
4200                    remove_database_files(path)?;
4201                    return Err(MigrationError::RebuildRequired {
4202                        reason,
4203                        backup_path,
4204                    });
4205                }
4206                Err(err) if schema_check_error_requires_rebuild(&err) => {
4207                    let backup_path = create_backup(path)?;
4208                    cleanup_old_backups(path, MAX_BACKUPS)?;
4209                    remove_database_files(path)?;
4210                    return Err(MigrationError::RebuildRequired {
4211                        reason: format!("Database appears corrupted: {err}"),
4212                        backup_path,
4213                    });
4214                }
4215                Err(err) => return Err(MigrationError::Database(err)),
4216            }
4217        }
4218
4219        let storage = Self::open(path).map_err(|e| MigrationError::Other(e.to_string()))?;
4220        Ok(storage)
4221    }
4222}
4223
4224// -------------------------------------------------------------------------
4225// Frankensqlite migration helpers
4226// -------------------------------------------------------------------------
4227
4228/// Build the `MigrationRunner` for the frankensqlite migration path.
4229///
4230/// Uses a single combined migration (version 13) that creates the complete
4231/// final schema in one step. This avoids the V5 `DROP TABLE conversations`
4232/// operation which triggers a known frankensqlite limitation: autoindex entries
4233/// in sqlite_master are not properly cleaned up during DROP TABLE, causing
4234/// "sqlite_master entry not found" errors.
4235///
4236/// For existing databases transitioned from SqliteStorage, the transition
4237/// function backfills `_schema_migrations`; post-V13 additive migrations then
4238/// run normally.
4239fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
4240    MigrationRunner::new()
4241        .add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA)
4242        .add(14, "fts_contentless", MIGRATION_V14)
4243}
4244
4245fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
4246    MigrationRunner::new()
4247        .add(16, "drop_redundant_message_conv_idx", MIGRATION_V16)
4248        .add(17, "drop_message_created_idx", MIGRATION_V17)
4249        .add(18, "conversation_tail_state_hot_table", MIGRATION_V18)
4250        .add(19, "conversation_external_lookup", MIGRATION_V19)
4251        .add(20, "conversation_external_tail_lookup", MIGRATION_V20)
4252}
4253
4254fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
4255    let rows = conn
4256        .query_with_params(
4257            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
4258            &[SqliteValue::from(version)],
4259        )
4260        .with_context(|| format!("checking schema migration version {version}"))?;
4261    Ok(!rows.is_empty())
4262}
4263
4264fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
4265    conn.execute("BEGIN IMMEDIATE;")
4266        .with_context(|| "starting v15 conversation tail-state migration transaction")?;
4267
4268    let result = (|| -> Result<bool> {
4269        if schema_migration_is_applied(conn, 15)? {
4270            conn.execute("COMMIT;")
4271                .with_context(|| "committing already-applied v15 migration transaction")?;
4272            return Ok(false);
4273        }
4274
4275        let started = Instant::now();
4276        let conversation_columns = franken_table_column_names(conn, "conversations")
4277            .with_context(|| "inspecting conversations columns before v15 migration")?;
4278        if !conversation_columns.contains("last_message_idx") {
4279            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;")
4280                .with_context(|| "adding v15 conversations.last_message_idx column")?;
4281        }
4282        if !conversation_columns.contains("last_message_created_at") {
4283            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;")
4284                .with_context(|| "adding v15 conversations.last_message_created_at column")?;
4285        }
4286        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
4287            .with_context(|| "applying v15 conversation tail-state table schema")?;
4288        conn.execute_compat(
4289            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
4290            fparams![15_i64, "conversation_tail_state_cache"],
4291        )
4292        .with_context(|| "recording v15 conversation tail-state migration")?;
4293        conn.execute("COMMIT;")
4294            .with_context(|| "committing v15 conversation tail-state migration")?;
4295        info!(
4296            elapsed_ms = started.elapsed().as_millis(),
4297            "applied v15 conversation tail-state cache migration"
4298        );
4299        Ok(true)
4300    })();
4301
4302    if result.is_err() {
4303        let _ = conn.execute("ROLLBACK;");
4304    }
4305
4306    result
4307}
4308
4309fn franken_table_column_names(
4310    conn: &FrankenConnection,
4311    table_name: &str,
4312) -> Result<HashSet<String>> {
4313    if !table_name
4314        .chars()
4315        .all(|c| c.is_ascii_alphanumeric() || c == '_')
4316    {
4317        return Err(anyhow!(
4318            "unsafe table name for PRAGMA table_info: {table_name}"
4319        ));
4320    }
4321
4322    conn.query_map_collect(
4323        &format!("PRAGMA table_info({table_name})"),
4324        fparams![],
4325        |row: &FrankenRow| row.get_typed::<String>(1),
4326    )
4327    .with_context(|| format!("reading PRAGMA table_info({table_name})"))
4328    .map(|columns| columns.into_iter().collect())
4329}
4330
/// Combined schema for fresh databases, registered as migration version 13
/// (`full_schema_v13`).
///
/// Creates the complete schema in a single migration, avoiding the
/// incremental V5 `DROP TABLE conversations` which triggers a frankensqlite
/// autoindex limitation. All columns from V1-V13 are included in their
/// respective CREATE TABLE statements; `conversations` additionally carries
/// the V15 columns `last_message_idx` and `last_message_created_at` so that
/// fresh databases do not need `ALTER TABLE` on it.
///
/// Every statement is guarded (`IF NOT EXISTS` / `INSERT OR IGNORE`). The
/// batch also seeds the `'local'` row in `sources` and the default
/// `model_pricing` rows.
///
/// Table creation order respects foreign key references: agents, workspaces
/// and sources come before conversations, which precedes messages, which in
/// turn precedes snippets, token_usage and message_metrics; tags precedes
/// conversation_tags.
const MIGRATION_FRESH_SCHEMA: &str = r"
-- Core tables (V1)
CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

-- Sources (V4)
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);

-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    -- V15 columns are included in the fresh schema so fresh DB creation does
    -- not need ALTER TABLE on conversations. That ALTER path can duplicate
    -- provenance autoindex state in frankensqlite when the named unique
    -- provenance index already exists.
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

-- Named unique index avoids autoindex issues if table is ever recreated
CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
    ON conversations(source_id, agent_id, external_id);

-- Messages: V1 base + V7 extra_bin
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    extra_bin BLOB,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

-- Daily stats (V8)
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

-- Embedding jobs (V9)
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');

-- Token usage ledger (V10)
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

-- Token daily stats (V10)
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

-- Model pricing (V10)
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Message metrics: V11 base + V12 model dimensions
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

-- Hourly rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

-- Daily rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

-- Model daily rollups (V12)
CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

-- All indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
4700
/// A named bundle of schema-repair SQL with the tables it is associated with.
///
/// All fields are `'static`, so the type is cheap to copy.
/// NOTE(review): the code that consumes these batches is outside this view;
/// the field notes below are inferred from names and should be confirmed.
#[derive(Clone, Copy)]
struct SchemaRepairBatch {
    // Label for the batch (presumably used in logs and error context).
    name: &'static str,
    // Table names tied to this batch (presumably the tables whose absence
    // triggers it; verify against the caller).
    tables: &'static [&'static str],
    // SQL text of the batch.
    sql: &'static str,
}
4707
/// SQL batch that creates the `sources` table when it is missing and seeds
/// the `'local'` source row.
///
/// Both statements are guarded (`IF NOT EXISTS`, `INSERT OR IGNORE`), so they
/// leave an existing table or an existing `'local'` row untouched. The table
/// definition matches the `sources` table in `MIGRATION_FRESH_SCHEMA`.
const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
4723
/// SQL batch that creates the `daily_stats` table and its two indexes
/// (`idx_daily_stats_agent`, `idx_daily_stats_source`) when they are missing.
///
/// No rows are inserted. The definitions match the `daily_stats` table and
/// indexes in `MIGRATION_FRESH_SCHEMA`.
const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
4739
/// SQL batch that creates `conversation_external_lookup` when missing and
/// (re)fills it from `conversations`.
///
/// One row is written per conversation with a non-NULL `external_id`, using
/// `INSERT OR REPLACE`, so existing keys are overwritten. The lookup key is
/// the length-prefixed concatenation
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
4755
/// SQL batch that creates `conversation_tail_state` and
/// `conversation_external_tail_lookup` when missing, then (re)fills the
/// lookup table from `conversations`.
///
/// One row is written per conversation with a non-NULL `external_id`, using
/// `INSERT OR REPLACE`. The lookup key is the length-prefixed concatenation
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
/// Tail columns come from a `LEFT JOIN` on `conversation_tail_state`, so a
/// conversation without a tail-state row gets NULL `ended_at`,
/// `last_message_idx` and `last_message_created_at`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    ts.ended_at,
    ts.last_message_idx,
    ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
4791
/// SQL batch that creates the `embedding_jobs` table and its partial unique
/// index `idx_embedding_jobs_active` when they are missing.
///
/// The index covers `(db_path, model_id)` only for rows whose `status` is
/// `'pending'` or `'running'`. The definitions match the `embedding_jobs`
/// table and index in `MIGRATION_FRESH_SCHEMA`.
const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
4810
/// Repair DDL for the token-analytics tables: `token_usage`,
/// `token_daily_stats`, and `model_pricing`, plus their secondary indexes.
///
/// All `CREATE` statements use `IF NOT EXISTS`. The trailing seed insert uses
/// `INSERT OR IGNORE`, so pricing rows whose `(model_pattern, effective_date)`
/// primary key is already present are left as they are. The `model_pattern`
/// values contain `%` wildcards (presumably matched with `LIKE` by the pricing
/// lookup — confirm against the query that reads this table).
const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
";
4896
/// Repair DDL for the per-message metrics table and its rollup tables:
/// `message_metrics`, `usage_hourly`, `usage_daily`, and `usage_models_daily`,
/// followed by the secondary indexes on all four.
///
/// All statements use `IF NOT EXISTS`, so the batch can be replayed safely.
/// `usage_hourly` and `usage_daily` carry the two `plan_*_total` columns;
/// `usage_models_daily` does not, and is additionally keyed by
/// `model_family` / `model_tier`.
const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
5022
/// Registry of schema repair batches, consulted by
/// `current_schema_repair_batches_for_missing_tables`.
///
/// Each entry names the tables its `sql` is responsible for. A batch is
/// selected as soon as any one of its `tables` is reported missing, and the
/// selected batches are returned in the order they appear here.
const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
    SchemaRepairBatch {
        name: "sources",
        tables: &["sources"],
        sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
    },
    SchemaRepairBatch {
        name: "daily_stats",
        tables: &["daily_stats"],
        sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
    },
    SchemaRepairBatch {
        name: "conversation_external_lookup",
        tables: &["conversation_external_lookup"],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
    },
    // One batch covers both the tail-state table and the lookup built on it.
    SchemaRepairBatch {
        name: "conversation_external_tail_lookup",
        tables: &[
            "conversation_tail_state",
            "conversation_external_tail_lookup",
        ],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
    },
    SchemaRepairBatch {
        name: "embedding_jobs",
        tables: &["embedding_jobs"],
        sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
    },
    SchemaRepairBatch {
        name: "token_analytics",
        tables: &["token_usage", "token_daily_stats", "model_pricing"],
        sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
    },
    SchemaRepairBatch {
        name: "message_rollups",
        tables: &[
            "message_metrics",
            "usage_hourly",
            "usage_daily",
            "usage_models_daily",
        ],
        sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
    },
];
5068
5069fn current_schema_repair_batches_for_missing_tables(
5070    missing_tables: &[&'static str],
5071) -> Result<Vec<&'static SchemaRepairBatch>> {
5072    let missing_set: HashSet<&'static str> = missing_tables.iter().copied().collect();
5073    let mut selected_batches = Vec::new();
5074    let mut covered_tables = HashSet::new();
5075
5076    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
5077        if !batch
5078            .tables
5079            .iter()
5080            .any(|table_name| missing_set.contains(table_name))
5081        {
5082            continue;
5083        }
5084        selected_batches.push(batch);
5085        covered_tables.extend(batch.tables.iter().copied());
5086    }
5087
5088    for &table_name in missing_tables {
5089        if !covered_tables.contains(table_name) {
5090            return Err(anyhow!(
5091                "no current-schema repair batch registered for missing table {table_name}"
5092            ));
5093        }
5094    }
5095
5096    Ok(selected_batches)
5097}
5098
/// Migration name lookup for backfilling `_schema_migrations` during transition.
///
/// Each entry is `(version, name)`. `transition_from_meta_version` walks this
/// array front to back and stops at the first version greater than the
/// database's recorded version, so entries must stay sorted by ascending
/// version.
const MIGRATION_NAMES: [(i64, &str); 20] = [
    (1, "core_tables"),
    (2, "fts_messages"),
    (3, "fts_messages_rebuild"),
    (4, "sources"),
    (5, "provenance_columns"),
    (6, "source_path_index"),
    (7, "msgpack_columns"),
    (8, "daily_stats"),
    (9, "embedding_jobs"),
    (10, "token_analytics"),
    (11, "message_metrics"),
    (12, "model_dimensions"),
    (13, "plan_token_rollups"),
    (14, "fts_contentless"),
    (15, "conversation_tail_state_cache"),
    (16, "drop_redundant_message_conv_idx"),
    (17, "drop_message_created_idx"),
    (18, "conversation_tail_state_hot_table"),
    (19, "conversation_external_lookup"),
    (20, "conversation_external_tail_lookup"),
];
5122
5123/// Transitions an existing database from `meta` table schema versioning to the
5124/// `_schema_migrations` table used by `MigrationRunner`.
5125///
5126/// The existing `SqliteStorage` tracks schema version as a string value in
5127/// `meta WHERE key = 'schema_version'`. The bead spec references
5128/// `PRAGMA user_version`, but the actual cass code uses the `meta` table.
5129/// This function handles the real code path.
5130///
5131/// Behavior:
5132/// - If `_schema_migrations` already exists → skip (already transitioned)
5133/// - If `meta` table has `schema_version > 0` → create `_schema_migrations`
5134///   and backfill entries for versions `1..=current_version`
5135/// - If `meta` table missing or `schema_version = 0` with no tables → fresh DB,
5136///   let `MigrationRunner` handle it
5137/// - If `schema_version = 0` but tables exist → corrupted state, log warning
5138fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
5139    // Avoid sqlite_master enumeration here. Databases with FTS virtual tables
5140    // can trigger frankensqlite parse-recovery on sqlite_master reads, which is
5141    // enough to break the transition on otherwise-healthy legacy cass DBs.
5142    if conn
5143        .query("SELECT version FROM \"_schema_migrations\";")
5144        .is_ok()
5145    {
5146        return Ok(());
5147    }
5148
5149    // Check if the meta table exists.
5150    if conn.query("SELECT key FROM meta;").is_err() {
5151        // No meta table → fresh database, let MigrationRunner handle it.
5152        return Ok(());
5153    }
5154
5155    // Read the current schema version from the meta table.
5156    let rows = conn
5157        .query("SELECT value FROM meta WHERE key = 'schema_version';")
5158        .with_context(|| "reading schema_version from meta")?;
5159
5160    let current_version: i64 = rows
5161        .first()
5162        .and_then(|row| row.get_typed::<String>(0).ok())
5163        .and_then(|s| s.parse().ok())
5164        .unwrap_or(0);
5165
5166    if current_version == 0 {
5167        // Check if tables actually exist (corrupted state: tables present but version=0).
5168        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
5169            // Truly fresh DB (meta table exists but empty/reset). Let MigrationRunner handle it.
5170            return Ok(());
5171        }
5172
5173        // Tables exist but version=0: corrupted state. Log and skip transition;
5174        // MigrationRunner will fail on "table already exists" and surface the error.
5175        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
5176        return Ok(());
5177    }
5178
5179    // Create _schema_migrations and backfill entries for all applied versions.
5180    info!(
5181        current_version,
5182        "transitioning schema tracking from meta table to _schema_migrations"
5183    );
5184
5185    conn.execute(
5186        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
5187            version INTEGER PRIMARY KEY, \
5188            name TEXT NOT NULL, \
5189            applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
5190        );",
5191    )
5192    .with_context(|| "creating _schema_migrations table for transition")?;
5193
5194    for &(version, name) in &MIGRATION_NAMES {
5195        if version > current_version {
5196            break;
5197        }
5198        conn.execute_compat(
5199            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
5200            &[ParamValue::from(version), ParamValue::from(name)],
5201        )
5202        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
5203    }
5204
5205    info!(
5206        current_version,
5207        "schema version transition complete: backfilled entries for versions 1..={current_version}"
5208    );
5209
5210    Ok(())
5211}
5212
/// `(table name, probe query)` pairs for tables the current schema requires.
///
/// Each probe is a `SELECT … LIMIT 1` against the named table. Every table
/// listed here also appears in some entry of `CURRENT_SCHEMA_REPAIR_BATCHES`.
/// NOTE(review): the consumer is outside this excerpt; presumably a probe that
/// fails with a "no such table" error marks that table as missing — confirm.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
    ("sources", "SELECT id FROM sources LIMIT 1;"),
    ("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
    (
        "conversation_external_lookup",
        "SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
    ),
    (
        "conversation_tail_state",
        "SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
    ),
    (
        "conversation_external_tail_lookup",
        "SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
    ),
    ("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
    ("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
    (
        "token_daily_stats",
        "SELECT day_id FROM token_daily_stats LIMIT 1;",
    ),
    (
        "model_pricing",
        "SELECT model_pattern FROM model_pricing LIMIT 1;",
    ),
    (
        "message_metrics",
        "SELECT message_id FROM message_metrics LIMIT 1;",
    ),
    ("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
    ("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
    (
        "usage_models_daily",
        "SELECT day_id FROM usage_models_daily LIMIT 1;",
    ),
];
5249
/// `(column name, SQL type)` pairs for the token-summary columns that are
/// required to exist.
///
/// NOTE(review): judging by the constant's name these live on the
/// `conversations` table and are presumably added when absent — the code that
/// consumes this list is outside this excerpt; confirm there.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
    ("total_input_tokens", "INTEGER"),
    ("total_output_tokens", "INTEGER"),
    ("total_cache_read_tokens", "INTEGER"),
    ("total_cache_creation_tokens", "INTEGER"),
    ("grand_total_tokens", "INTEGER"),
    ("estimated_cost_usd", "REAL"),
    ("primary_model", "TEXT"),
    ("api_call_count", "INTEGER"),
    ("tool_call_count", "INTEGER"),
    ("user_message_count", "INTEGER"),
    ("assistant_message_count", "INTEGER"),
];
5263
/// Reports whether the rendered error text mentions a missing table.
///
/// The error is formatted via `Display`, lowercased (ASCII only), and searched
/// for the substring `"no such table"`.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("no such table")
}
5269
/// Reports whether the rendered error text mentions a missing column.
///
/// The error is formatted via `Display`, lowercased (ASCII only), and searched
/// for the substring `"no such column"`.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    let mut rendered = err.to_string();
    rendered.make_ascii_lowercase();
    rendered.contains("no such column")
}
5275
/// Batch size shared by the orphan-FK cleanup helpers: the slice chunk length
/// used when deleting ids, and the `LIMIT` bound to the direct-orphan id page
/// queries.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
5277
5278fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
5279    let min_conversation_id = conn
5280        .query_map_collect(
5281            "SELECT conversation_id
5282             FROM messages
5283             ORDER BY conversation_id ASC
5284             LIMIT 1",
5285            fparams![],
5286            |row| row.get_typed(0),
5287        )
5288        .context("finding minimum message conversation id for orphan FK cleanup")?
5289        .into_iter()
5290        .next();
5291    let Some(min_conversation_id) = min_conversation_id else {
5292        return Ok(Vec::new());
5293    };
5294    let max_conversation_id: i64 = conn
5295        .query_row_map(
5296            "SELECT conversation_id
5297             FROM messages
5298             ORDER BY conversation_id DESC
5299             LIMIT 1",
5300            fparams![],
5301            |row| row.get_typed(0),
5302        )
5303        .context("finding maximum message conversation id for orphan FK cleanup")?;
5304
5305    let parent_conversation_ids: Vec<i64> = conn
5306        .query_map_collect(
5307            "SELECT id
5308             FROM conversations
5309             WHERE id BETWEEN ?1 AND ?2
5310             ORDER BY id",
5311            fparams![min_conversation_id, max_conversation_id],
5312            |row| row.get_typed(0),
5313        )
5314        .context("listing parent conversation ids for orphan FK cleanup")?;
5315
5316    let mut message_ids = Vec::new();
5317    let mut gap_start = min_conversation_id;
5318    for parent_id in parent_conversation_ids {
5319        if parent_id < gap_start {
5320            continue;
5321        }
5322        if parent_id > max_conversation_id {
5323            break;
5324        }
5325        if gap_start < parent_id {
5326            collect_message_ids_for_conversation_gap(
5327                conn,
5328                gap_start,
5329                parent_id.saturating_sub(1),
5330                &mut message_ids,
5331            )?;
5332        }
5333        if parent_id == i64::MAX {
5334            return Ok(message_ids);
5335        }
5336        gap_start = parent_id + 1;
5337    }
5338    if gap_start <= max_conversation_id {
5339        collect_message_ids_for_conversation_gap(
5340            conn,
5341            gap_start,
5342            max_conversation_id,
5343            &mut message_ids,
5344        )?;
5345    }
5346
5347    Ok(message_ids)
5348}
5349
5350fn collect_message_ids_for_conversation_gap(
5351    conn: &FrankenConnection,
5352    gap_start: i64,
5353    gap_end: i64,
5354    message_ids: &mut Vec<i64>,
5355) -> Result<()> {
5356    let (sql, params) = if gap_start == gap_end {
5357        (
5358            "SELECT id FROM messages WHERE conversation_id = ?1",
5359            vec![SqliteValue::from(gap_start)],
5360        )
5361    } else {
5362        (
5363            "SELECT id FROM messages WHERE conversation_id BETWEEN ?1 AND ?2",
5364            vec![SqliteValue::from(gap_start), SqliteValue::from(gap_end)],
5365        )
5366    };
5367    let rows = conn.query_with_params(sql, &params).with_context(|| {
5368        format!("listing orphan message ids for conversation-id gap {gap_start}..={gap_end}")
5369    })?;
5370    message_ids.reserve(rows.len());
5371    for row in rows {
5372        message_ids.push(row.get_typed(0)?);
5373    }
5374    Ok(())
5375}
5376
5377fn delete_rows_by_i64_chunks(
5378    tx: &FrankenTransaction<'_>,
5379    delete_sql: &'static str,
5380    ids: &[i64],
5381) -> Result<usize> {
5382    let mut deleted = 0;
5383    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5384        for id in chunk {
5385            deleted += tx.execute_with_params(delete_sql, &[SqliteValue::from(*id)])?;
5386        }
5387    }
5388    Ok(deleted)
5389}
5390
5391fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5392    let mut deleted = 0usize;
5393    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5394        deleted = deleted.saturating_add(delete_orphan_message_id_chunk(conn, chunk)?);
5395    }
5396    Ok(deleted)
5397}
5398
5399fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5400    if ids.is_empty() {
5401        return Ok(0);
5402    }
5403
5404    match delete_orphan_message_id_chunk_once(conn, ids) {
5405        Ok(deleted) => Ok(deleted),
5406        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5407            let split_at = ids.len() / 2;
5408            tracing::warn!(
5409                target: "cass::fk_repair",
5410                rows = ids.len(),
5411                left = split_at,
5412                right = ids.len().saturating_sub(split_at),
5413                error = %err,
5414                "orphan-message cleanup ran out of memory; retrying as smaller batches"
5415            );
5416            let left = delete_orphan_message_id_chunk(conn, &ids[..split_at])?;
5417            let right = delete_orphan_message_id_chunk(conn, &ids[split_at..])?;
5418            Ok(left.saturating_add(right))
5419        }
5420        Err(err) => Err(err),
5421    }
5422}
5423
5424fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5425    let mut tx = conn.transaction()?;
5426    let mut deleted = 0usize;
5427    for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
5428        match delete_rows_by_i64_chunks(&tx, entry.delete_sql, ids) {
5429            Ok(count) => {
5430                deleted = deleted.saturating_add(count);
5431            }
5432            Err(err) if error_indicates_missing_table(&err) => {
5433                tracing::debug!(
5434                    target: "cass::fk_repair",
5435                    child_table = entry.child_table,
5436                    error = %err,
5437                    "skipping orphan-message dependent cleanup (table unavailable)"
5438                );
5439            }
5440            Err(err) => {
5441                return Err(err).with_context(|| {
5442                    format!(
5443                        "deleting rows from {} that depend on orphan messages",
5444                        entry.child_table
5445                    )
5446                });
5447            }
5448        }
5449    }
5450    deleted = deleted.saturating_add(
5451        delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id = ?1", ids)
5452            .context("deleting orphan rows from messages")?,
5453    );
5454    tx.commit()?;
5455    Ok(deleted)
5456}
5457
5458fn collect_direct_orphan_id_page(
5459    conn: &FrankenConnection,
5460    entry: &'static OrphanFkTable,
5461) -> Result<Vec<i64>> {
5462    Ok(conn.query_map_collect(
5463        entry.orphan_id_page_sql,
5464        fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
5465        |row| row.get_typed(0),
5466    )?)
5467}
5468
5469fn delete_direct_orphan_ids_bisecting_oom(
5470    conn: &FrankenConnection,
5471    entry: &'static OrphanFkTable,
5472    ids: &[i64],
5473) -> Result<usize> {
5474    let mut deleted = 0usize;
5475    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5476        deleted = deleted.saturating_add(delete_direct_orphan_id_chunk(conn, entry, chunk)?);
5477    }
5478    Ok(deleted)
5479}
5480
5481fn delete_direct_orphan_id_chunk(
5482    conn: &FrankenConnection,
5483    entry: &'static OrphanFkTable,
5484    ids: &[i64],
5485) -> Result<usize> {
5486    if ids.is_empty() {
5487        return Ok(0);
5488    }
5489
5490    match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
5491        Ok(deleted) => Ok(deleted),
5492        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5493            let split_at = ids.len() / 2;
5494            tracing::warn!(
5495                target: "cass::fk_repair",
5496                child_table = entry.child_table,
5497                rows = ids.len(),
5498                left = split_at,
5499                right = ids.len().saturating_sub(split_at),
5500                error = %err,
5501                "direct orphan cleanup ran out of memory; retrying as smaller batches"
5502            );
5503            let left = delete_direct_orphan_id_chunk(conn, entry, &ids[..split_at])?;
5504            let right = delete_direct_orphan_id_chunk(conn, entry, &ids[split_at..])?;
5505            Ok(left.saturating_add(right))
5506        }
5507        Err(err) => Err(err),
5508    }
5509}
5510
5511fn delete_direct_orphan_id_chunk_once(
5512    conn: &FrankenConnection,
5513    entry: &'static OrphanFkTable,
5514    ids: &[i64],
5515) -> Result<usize> {
5516    let mut tx = conn.transaction()?;
5517    let deleted = delete_rows_by_i64_chunk_bulk(&tx, entry.delete_many_sql_prefix, ids)?;
5518    tx.commit()?;
5519    Ok(deleted)
5520}
5521
5522fn delete_rows_by_i64_chunk_bulk(
5523    tx: &FrankenTransaction<'_>,
5524    delete_many_sql_prefix: &'static str,
5525    ids: &[i64],
5526) -> Result<usize> {
5527    if ids.is_empty() {
5528        return Ok(0);
5529    }
5530
5531    let placeholders = (1..=ids.len())
5532        .map(|idx| format!("?{idx}"))
5533        .collect::<Vec<_>>()
5534        .join(", ");
5535    let sql = format!("{delete_many_sql_prefix} ({placeholders})");
5536    let params = ids
5537        .iter()
5538        .map(|id| SqliteValue::from(*id))
5539        .collect::<Vec<_>>();
5540    Ok(tx.execute_with_params(&sql, &params)?)
5541}
5542
/// Tables whose FK parent rows can go missing when an index transaction is
/// dropped mid-flight. The select and delete SQL strings are intentionally
/// static (no dynamic table names) so they can be audited at a glance and so
/// they cannot be subverted by injected identifiers. The select statement
/// yields the integer FK key used by the matching chunked delete.
struct OrphanFkTable {
    /// Name of the child table; used as a log field and report key.
    child_table: &'static str,
    /// Query returning one page of orphan FK values; takes the page size as
    /// its single `?1` parameter.
    orphan_id_page_sql: &'static str,
    /// `DELETE … WHERE <fk column> IN` prefix; the parenthesized placeholder
    /// list is appended by `delete_rows_by_i64_chunk_bulk`.
    delete_many_sql_prefix: &'static str,
}
5553
/// Child tables that are checked directly for rows whose parent is gone.
///
/// The first three entries look for `message_id` values absent from
/// `messages`; `conversation_tags` looks for `conversation_id` values absent
/// from `conversations`. Each page query is ordered by the FK column and
/// limited by `?1`, and the delete prefix targets the same column.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
    OrphanFkTable {
        child_table: "message_metrics",
        orphan_id_page_sql: "SELECT message_id FROM message_metrics \
                             WHERE message_id NOT IN (SELECT id FROM messages) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "token_usage",
        orphan_id_page_sql: "SELECT message_id FROM token_usage \
                             WHERE message_id NOT IN (SELECT id FROM messages) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "snippets",
        orphan_id_page_sql: "SELECT message_id FROM snippets \
                             WHERE message_id NOT IN (SELECT id FROM messages) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "conversation_tags",
        orphan_id_page_sql: "SELECT conversation_id FROM conversation_tags \
                             WHERE conversation_id NOT IN (SELECT id FROM conversations) \
                             ORDER BY conversation_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
    },
];
5588
/// A table holding rows keyed by `message_id` that must be removed before an
/// orphan message itself is deleted.
struct OrphanMessageDependentTable {
    /// Name of the dependent table; used in log fields and error context.
    child_table: &'static str,
    /// Static single-row delete statement taking the message id as `?1`.
    delete_sql: &'static str,
}
5593
/// Dependent tables purged, in this order, by
/// `delete_orphan_message_id_chunk_once` before it deletes the orphan rows
/// from `messages`.
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
    OrphanMessageDependentTable {
        child_table: "message_metrics",
        delete_sql: "DELETE FROM message_metrics WHERE message_id = ?1",
    },
    OrphanMessageDependentTable {
        child_table: "token_usage",
        delete_sql: "DELETE FROM token_usage WHERE message_id = ?1",
    },
    OrphanMessageDependentTable {
        child_table: "snippets",
        delete_sql: "DELETE FROM snippets WHERE message_id = ?1",
    },
];
5608
/// Tally of orphan rows found and removed by `cleanup_orphan_fk_rows`.
///
/// Counts for orphan messages are taken from the probe phase; counts for
/// direct child tables are taken from the bounded page deletes. The cleanup is
/// meant to run as a single indexer-startup pass while the index run lock is
/// held, so with no concurrent writers these figures equal the primary orphan
/// roots that were identified and removed. Rows that merely hang off an orphan
/// message (`message_metrics` / `token_usage` / `snippets`) disappear as a
/// consequence of removing that root and are *not* added to `total` or
/// `per_table`.
#[derive(Clone, Debug, Default)]
pub(crate) struct OrphanFkCleanupReport {
    /// Saturating sum of every count passed to `record`.
    pub total: i64,
    /// One `(table, count)` entry per table, in first-recorded order.
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the entry for `child_table` (creating it on first use)
    /// and to the running `total`. Both additions saturate instead of
    /// overflowing.
    fn record(&mut self, child_table: &'static str, count: i64) {
        let slot = self
            .per_table
            .iter()
            .position(|(table, _)| *table == child_table);
        match slot {
            Some(index) => {
                let tally = &mut self.per_table[index].1;
                *tally = tally.saturating_add(count);
            }
            None => self.per_table.push((child_table, count)),
        }
        self.total = self.total.saturating_add(count);
    }
}
5639
/// Outcome of inserting a conversation together with its messages.
// NOTE(review): the field notes below are inferred from names only; the code
// that fills this struct is not visible here — confirm against the insert path.
pub struct InsertOutcome {
    /// Id of the conversation the insert resolved to.
    pub conversation_id: i64,
    /// Presumably `true` when a new conversation row was created — TODO confirm.
    pub conversation_inserted: bool,
    /// Presumably the `idx` values of the messages that were newly inserted — TODO confirm.
    pub inserted_indices: Vec<i64>,
}
5645
/// Test-only counters and timings for the message-insert step. The duration
/// fields are reported by `InsertConversationTreePerfProfile::log_summary` as
/// the `msg_*_ms` values.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
    // NOTE(review): the three counters are not read in this part of the file;
    // meanings are inferred from their names.
    single_row_calls: usize,
    batch_calls: usize,
    batch_rows: usize,
    /// Reported as `msg_payload_ms`.
    payload_duration: Duration,
    /// Reported as `msg_sql_ms`.
    sql_build_duration: Duration,
    /// Reported as `msg_param_ms`.
    param_build_duration: Duration,
    /// Reported as `msg_execute_ms`.
    execute_duration: Duration,
    /// Reported as `msg_rowid_ms`.
    rowid_duration: Duration,
}
5658
/// Test-only stage timing profile, printed by `log_summary` as one
/// `CASS_INSERT_TREE_STAGE_PROFILE` line.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
    /// Number of profiled calls; used as the divisor for the `avg_*` values.
    invocations: usize,
    /// Reported as `messages`.
    messages: usize,
    /// Reported as `inserted_messages`.
    inserted_messages: usize,
    /// Wall time for the whole operation; anything not covered by the stage
    /// durations below is reported as `residual_ms`.
    total_duration: Duration,
    // Per-stage durations. `log_summary` sums all of them except
    // `message_insert_breakdown` (itself a breakdown of
    // `message_insert_duration`) to compute the accounted time.
    source_duration: Duration,
    tx_open_duration: Duration,
    existing_lookup_duration: Duration,
    existing_idx_lookup_duration: Duration,
    existing_replay_lookup_duration: Duration,
    dedupe_filter_duration: Duration,
    conversation_row_duration: Duration,
    message_insert_duration: Duration,
    /// Finer-grained timings reported as the `msg_*_ms` values.
    message_insert_breakdown: MessageInsertSubstageProfile,
    snippet_insert_duration: Duration,
    fts_entry_duration: Duration,
    fts_flush_duration: Duration,
    analytics_duration: Duration,
    commit_duration: Duration,
}
5681
#[cfg(test)]
impl InsertConversationTreePerfProfile {
    /// Expresses `duration` as fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        duration.as_secs_f64() * 1000.0
    }

    /// Writes a single `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr with the
    /// totals for every stage, the unaccounted residual, and per-call averages.
    fn log_summary(&self, label: &str) {
        let breakdown = &self.message_insert_breakdown;

        // Averages divide by at least 1 so an empty profile does not divide by zero.
        let divisor = self.invocations.max(1) as f64;
        let per_call_ms = |duration: Duration| Self::millis(duration) / divisor;

        // Time attributed to a named stage; the rest of `total_duration` is residual.
        let accounted = self.source_duration
            + self.tx_open_duration
            + self.existing_lookup_duration
            + self.existing_idx_lookup_duration
            + self.existing_replay_lookup_duration
            + self.dedupe_filter_duration
            + self.conversation_row_duration
            + self.message_insert_duration
            + self.snippet_insert_duration
            + self.fts_entry_duration
            + self.fts_flush_duration
            + self.analytics_duration
            + self.commit_duration;
        let residual = self.total_duration.saturating_sub(accounted);

        eprintln!(
            concat!(
                "CASS_INSERT_TREE_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
                "existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
                "conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
                "fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
                "msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
                "residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
                "avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            Self::millis(self.total_duration),
            Self::millis(self.source_duration),
            Self::millis(self.tx_open_duration),
            Self::millis(self.existing_lookup_duration),
            Self::millis(self.existing_idx_lookup_duration),
            Self::millis(self.existing_replay_lookup_duration),
            Self::millis(self.dedupe_filter_duration),
            Self::millis(self.conversation_row_duration),
            Self::millis(self.message_insert_duration),
            Self::millis(self.snippet_insert_duration),
            Self::millis(self.fts_entry_duration),
            Self::millis(self.fts_flush_duration),
            Self::millis(self.analytics_duration),
            Self::millis(self.commit_duration),
            Self::millis(breakdown.payload_duration),
            Self::millis(breakdown.sql_build_duration),
            Self::millis(breakdown.param_build_duration),
            Self::millis(breakdown.execute_duration),
            Self::millis(breakdown.rowid_duration),
            Self::millis(residual),
            per_call_ms(self.total_duration),
            per_call_ms(self.message_insert_duration),
            per_call_ms(breakdown.execute_duration),
            per_call_ms(breakdown.payload_duration),
            per_call_ms(self.snippet_insert_duration),
            per_call_ms(self.fts_entry_duration),
            per_call_ms(self.commit_duration),
        );
    }
}
5750
/// Identity used to match an incoming conversation against existing ones.
/// `conversation_merge_key` builds the `External` form when the conversation
/// has an external id and the `SourcePath` form otherwise.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
    /// Keyed by the external id within a source/agent pair.
    External {
        source_id: String,
        agent_id: i64,
        external_id: String,
    },
    /// Keyed by the source file path within a source/agent pair.
    SourcePath {
        source_id: String,
        agent_id: i64,
        source_path: String,
        /// Effective start time: the conversation's own `started_at`, or the
        /// earliest message timestamp when that is absent.
        started_at: Option<i64>,
    },
}
5765
/// Builds the string key used to look up a conversation by external id.
///
/// Layout: `<source chars>:<source_id>:<agent_id>:<external chars>:<external_id>`.
/// Each string is preceded by its length in Unicode scalar values, so inputs
/// that themselves contain `:` cannot produce the same key as a different
/// (source, external id) split.
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_len = source_id.chars().count();
    let external_len = external_id.chars().count();
    format!("{source_len}:{source_id}:{agent_id}:{external_len}:{external_id}")
}
5773
5774fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
5775    conv.external_id
5776        .as_deref()
5777        .map(|external_id| conversation_external_lookup_key(&conv.source_id, agent_id, external_id))
5778}
5779
/// Identity of a message for exact-match comparison: position plus everything
/// in `MessageReplayFingerprint`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageMergeFingerprint {
    /// Message index within its conversation.
    idx: i64,
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    /// BLAKE3 digest of the message content bytes.
    content_hash: [u8; 32],
}
5788
/// Position-independent identity of a message: the same fields as
/// `MessageMergeFingerprint` without `idx`, so the same message seen at a
/// different index compares equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageReplayFingerprint {
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    /// BLAKE3 digest of the message content bytes.
    content_hash: [u8; 32],
}
5796
/// Overlap statistics produced by `conversation_merge_evidence` when two
/// conversations are judged mergeable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
    /// Number of merge fingerprints (index included) present on both sides.
    exact_overlap: usize,
    /// Number of replay fingerprints (index ignored) present on both sides.
    replay_overlap: usize,
    /// Size of the smaller of the two replay-fingerprint sets.
    smaller_replay_set: usize,
    /// Whether both start times are known and within
    /// `SOURCE_PATH_MERGE_START_TOLERANCE_MS` of each other.
    started_close: bool,
    /// Absolute start-time difference; `i64::MAX` when either start is unknown.
    start_distance_ms: i64,
}
5805
/// Messages from an incoming conversation that still need to be stored for an
/// already-existing conversation, plus bookkeeping gathered while selecting them.
struct ExistingConversationNewMessages<'a> {
    /// Borrowed messages selected for insertion, in incoming order.
    messages: Vec<&'a Message>,
    /// Sum of `content.len()` (bytes) over `messages`.
    new_chars: i64,
    /// How many incoming messages reuse an existing `idx` with a different
    /// fingerprint.
    idx_collision_count: usize,
    /// `idx` of the first such collision, if any.
    first_collision_idx: Option<i64>,
}
5812
/// Tail summary for a stored conversation. A value exists only when both the
/// last index and the last timestamp are known.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
    /// Highest stored message index.
    last_message_idx: i64,
    /// Latest stored message timestamp.
    last_message_created_at: i64,
    /// Stored end timestamp of the conversation, if any.
    ended_at: Option<i64>,
}
5819
/// An existing conversation's row id paired with its tail summary.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationWithTail {
    /// Conversation row id.
    id: i64,
    /// `None` when no tail state could be determined for the conversation.
    tail_state: Option<ExistingConversationTailState>,
}
5825
5826fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
5827    conv.started_at
5828        .or_else(|| conv.messages.iter().filter_map(|msg| msg.created_at).min())
5829}
5830
5831fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
5832    (
5833        conv.messages.iter().map(|msg| msg.idx).max(),
5834        conv.messages.iter().filter_map(|msg| msg.created_at).max(),
5835    )
5836}
5837
5838fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
5839    (
5840        messages.iter().map(|msg| msg.idx).max(),
5841        messages.iter().filter_map(|msg| msg.created_at).max(),
5842    )
5843}
5844
5845fn role_from_str(role: &str) -> MessageRole {
5846    match role {
5847        "user" => MessageRole::User,
5848        "agent" | "assistant" => MessageRole::Agent,
5849        "tool" => MessageRole::Tool,
5850        "system" => MessageRole::System,
5851        other => MessageRole::Other(other.to_string()),
5852    }
5853}
5854
5855fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
5856    MessageMergeFingerprint {
5857        idx: msg.idx,
5858        created_at: msg.created_at,
5859        role: msg.role.clone(),
5860        author: msg.author.clone(),
5861        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
5862    }
5863}
5864
5865fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
5866    MessageReplayFingerprint {
5867        created_at: msg.created_at,
5868        role: msg.role.clone(),
5869        author: msg.author.clone(),
5870        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
5871    }
5872}
5873
5874fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
5875    conv.messages
5876        .iter()
5877        .map(message_merge_fingerprint)
5878        .collect()
5879}
5880
5881fn conversation_message_replay_fingerprints(
5882    conv: &Conversation,
5883) -> HashSet<MessageReplayFingerprint> {
5884    conv.messages
5885        .iter()
5886        .map(message_replay_fingerprint)
5887        .collect()
5888}
5889
5890fn replay_fingerprint_from_merge(
5891    fingerprint: &MessageMergeFingerprint,
5892) -> MessageReplayFingerprint {
5893    MessageReplayFingerprint {
5894        created_at: fingerprint.created_at,
5895        role: fingerprint.role.clone(),
5896        author: fingerprint.author.clone(),
5897        content_hash: fingerprint.content_hash,
5898    }
5899}
5900
5901fn replay_fingerprints_from_merge_set(
5902    fingerprints: &HashSet<MessageMergeFingerprint>,
5903) -> HashSet<MessageReplayFingerprint> {
5904    fingerprints
5905        .iter()
5906        .map(replay_fingerprint_from_merge)
5907        .collect()
5908}
5909
5910fn collect_new_messages_for_existing_conversation<'a>(
5911    conversation_id: i64,
5912    conv: &'a Conversation,
5913    existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
5914    existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
5915    replay_skip_log: &'static str,
5916) -> ExistingConversationNewMessages<'a> {
5917    let mut idx_collision_count = 0usize;
5918    let mut first_collision_idx: Option<i64> = None;
5919    let mut new_chars: i64 = 0;
5920    let mut messages = Vec::new();
5921
5922    for msg in &conv.messages {
5923        let incoming_fingerprint = message_merge_fingerprint(msg);
5924        if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
5925            if existing_fingerprint != &incoming_fingerprint {
5926                idx_collision_count = idx_collision_count.saturating_add(1);
5927                first_collision_idx.get_or_insert(msg.idx);
5928            }
5929            continue;
5930        }
5931
5932        let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
5933        if existing_replay_fingerprints.contains(&incoming_replay) {
5934            tracing::debug!(
5935                conversation_id,
5936                idx = msg.idx,
5937                source_path = %conv.source_path.display(),
5938                "{replay_skip_log}"
5939            );
5940            continue;
5941        }
5942
5943        existing_messages.insert(msg.idx, incoming_fingerprint);
5944        existing_replay_fingerprints.insert(incoming_replay);
5945        new_chars += msg.content.len() as i64;
5946        messages.push(msg);
5947    }
5948
5949    ExistingConversationNewMessages {
5950        messages,
5951        new_chars,
5952        idx_collision_count,
5953        first_collision_idx,
5954    }
5955}
5956
5957fn franken_existing_conversation_append_tail_state(
5958    tx: &FrankenTransaction<'_>,
5959    conversation_id: i64,
5960) -> Result<Option<ExistingConversationTailState>> {
5961    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
5962        .query_row_map(
5963            "SELECT last_message_idx, last_message_created_at, ended_at
5964             FROM conversation_tail_state
5965             WHERE conversation_id = ?1",
5966            fparams![conversation_id],
5967            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
5968        )
5969        .optional()?;
5970    if let Some(cached) = cached {
5971        let (_, _, cached_ended_at) = cached;
5972        if let Some(tail_state) =
5973            existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
5974        {
5975            return Ok(Some(tail_state));
5976        }
5977    }
5978
5979    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
5980        "SELECT last_message_idx, last_message_created_at, ended_at
5981         FROM conversations
5982         WHERE id = ?1",
5983        fparams![conversation_id],
5984        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
5985    )?;
5986    let (_, _, cached_ended_at) = legacy_cached;
5987    if let Some(tail_state) = existing_conversation_tail_state_from_cached(
5988        legacy_cached.0,
5989        legacy_cached.1,
5990        cached_ended_at,
5991    ) {
5992        franken_insert_conversation_tail_state(
5993            tx,
5994            conversation_id,
5995            cached_ended_at,
5996            Some(tail_state.last_message_idx),
5997            Some(tail_state.last_message_created_at),
5998        )?;
5999        return Ok(Some(tail_state));
6000    }
6001
6002    let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
6003        "SELECT MAX(idx), MAX(created_at)
6004         FROM messages
6005         WHERE conversation_id = ?1",
6006        fparams![conversation_id],
6007        |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
6008    )?;
6009    if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
6010        franken_update_conversation_tail_state(
6011            tx,
6012            conversation_id,
6013            None,
6014            Some(last_message_idx),
6015            Some(last_message_created_at),
6016        )?;
6017        return Ok(Some(ExistingConversationTailState {
6018            last_message_idx,
6019            last_message_created_at,
6020            ended_at: cached_ended_at,
6021        }));
6022    }
6023    Ok(None)
6024}
6025
6026fn existing_conversation_tail_state_from_cached(
6027    last_message_idx: Option<i64>,
6028    last_message_created_at: Option<i64>,
6029    ended_at: Option<i64>,
6030) -> Option<ExistingConversationTailState> {
6031    let (last_message_idx, last_message_created_at) =
6032        last_message_idx.zip(last_message_created_at)?;
6033    Some(ExistingConversationTailState {
6034        last_message_idx,
6035        last_message_created_at,
6036        ended_at,
6037    })
6038}
6039
6040fn franken_find_existing_conversation_with_tail_by_key(
6041    tx: &FrankenTransaction<'_>,
6042    key: &PendingConversationKey,
6043    conv: Option<&Conversation>,
6044) -> Result<Option<ExistingConversationWithTail>> {
6045    if let PendingConversationKey::External {
6046        source_id,
6047        agent_id,
6048        external_id,
6049    } = key
6050    {
6051        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
6052        if let Some(existing) = franken_find_external_conversation_tail_lookup(tx, &lookup_key)? {
6053            return Ok(Some(existing));
6054        }
6055        return Ok(None);
6056    }
6057
6058    let Some(id) = franken_find_existing_conversation_by_key(tx, key, conv)? else {
6059        return Ok(None);
6060    };
6061    let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
6062    Ok(Some(ExistingConversationWithTail { id, tail_state }))
6063}
6064
6065fn franken_insert_conversation_tail_state(
6066    tx: &FrankenTransaction<'_>,
6067    conversation_id: i64,
6068    ended_at: Option<i64>,
6069    last_message_idx: Option<i64>,
6070    last_message_created_at: Option<i64>,
6071) -> Result<()> {
6072    if ended_at.is_none() && last_message_idx.is_none() && last_message_created_at.is_none() {
6073        return Ok(());
6074    }
6075    tx.execute_compat(
6076        "INSERT OR REPLACE INTO conversation_tail_state (
6077             conversation_id, ended_at, last_message_idx, last_message_created_at
6078         ) VALUES (?1, ?2, ?3, ?4)",
6079        fparams![
6080            conversation_id,
6081            ended_at,
6082            last_message_idx,
6083            last_message_created_at
6084        ],
6085    )?;
6086    Ok(())
6087}
6088
6089fn franken_update_conversation_tail_columns(
6090    tx: &FrankenTransaction<'_>,
6091    conversation_id: i64,
6092    ended_at_candidate: Option<i64>,
6093    last_message_idx_candidate: Option<i64>,
6094    last_message_created_at_candidate: Option<i64>,
6095) -> Result<()> {
6096    if ended_at_candidate.is_none()
6097        && last_message_idx_candidate.is_none()
6098        && last_message_created_at_candidate.is_none()
6099    {
6100        return Ok(());
6101    }
6102
6103    tx.execute_compat(
6104        "UPDATE conversations
6105         SET ended_at = CASE
6106                 WHEN ?1 IS NULL THEN ended_at
6107                 WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
6108                 ELSE ended_at
6109             END,
6110             last_message_idx = CASE
6111                 WHEN ?2 IS NULL THEN last_message_idx
6112                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6113                 ELSE last_message_idx
6114             END,
6115             last_message_created_at = CASE
6116                 WHEN ?3 IS NULL THEN last_message_created_at
6117                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6118                 ELSE last_message_created_at
6119             END
6120         WHERE id = ?4",
6121        fparams![
6122            ended_at_candidate,
6123            last_message_idx_candidate,
6124            last_message_created_at_candidate,
6125            conversation_id
6126        ],
6127    )?;
6128    Ok(())
6129}
6130
6131fn franken_tail_state_insert_ended_at(
6132    tx: &FrankenTransaction<'_>,
6133    conversation_id: i64,
6134    candidate: Option<i64>,
6135) -> Result<Option<i64>> {
6136    let canonical: Option<i64> = tx
6137        .query_row_map(
6138            "SELECT ended_at FROM conversations WHERE id = ?1",
6139            fparams![conversation_id],
6140            |row| row.get_typed(0),
6141        )
6142        .optional()?
6143        .flatten();
6144    Ok(canonical.max(candidate))
6145}
6146
6147fn franken_update_conversation_tail_state(
6148    tx: &FrankenTransaction<'_>,
6149    conversation_id: i64,
6150    ended_at_candidate: Option<i64>,
6151    last_message_idx_candidate: Option<i64>,
6152    last_message_created_at_candidate: Option<i64>,
6153) -> Result<()> {
6154    if ended_at_candidate.is_none()
6155        && last_message_idx_candidate.is_none()
6156        && last_message_created_at_candidate.is_none()
6157    {
6158        return Ok(());
6159    }
6160
6161    let changed = tx.execute_compat(
6162        "UPDATE conversation_tail_state
6163         SET ended_at = CASE
6164                 WHEN ?1 IS NULL THEN ended_at
6165                 ELSE MAX(IFNULL(ended_at, 0), ?1)
6166             END,
6167             last_message_idx = CASE
6168                 WHEN ?2 IS NULL THEN last_message_idx
6169                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6170                 ELSE last_message_idx
6171             END,
6172             last_message_created_at = CASE
6173                 WHEN ?3 IS NULL THEN last_message_created_at
6174                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6175                 ELSE last_message_created_at
6176             END
6177         WHERE conversation_id = ?4",
6178        fparams![
6179            ended_at_candidate,
6180            last_message_idx_candidate,
6181            last_message_created_at_candidate,
6182            conversation_id
6183        ],
6184    )?;
6185    if changed == 0 {
6186        let insert_ended_at =
6187            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
6188        franken_insert_conversation_tail_state(
6189            tx,
6190            conversation_id,
6191            insert_ended_at,
6192            last_message_idx_candidate,
6193            last_message_created_at_candidate,
6194        )?;
6195    }
6196    franken_update_conversation_tail_columns(
6197        tx,
6198        conversation_id,
6199        ended_at_candidate,
6200        last_message_idx_candidate,
6201        last_message_created_at_candidate,
6202    )?;
6203    Ok(())
6204}
6205
6206fn franken_set_conversation_tail_state_after_append(
6207    tx: &FrankenTransaction<'_>,
6208    conversation_id: i64,
6209    ended_at: i64,
6210    last_message_idx: i64,
6211    last_message_created_at: i64,
6212) -> Result<()> {
6213    let changed = tx.execute_compat(
6214        "UPDATE conversation_tail_state
6215         SET ended_at = ?1,
6216             last_message_idx = ?2,
6217             last_message_created_at = ?3
6218         WHERE conversation_id = ?4",
6219        fparams![
6220            ended_at,
6221            last_message_idx,
6222            last_message_created_at,
6223            conversation_id
6224        ],
6225    )?;
6226    if changed == 0 {
6227        let insert_ended_at =
6228            franken_tail_state_insert_ended_at(tx, conversation_id, Some(ended_at))?;
6229        franken_insert_conversation_tail_state(
6230            tx,
6231            conversation_id,
6232            insert_ended_at,
6233            Some(last_message_idx),
6234            Some(last_message_created_at),
6235        )?;
6236    }
6237    franken_update_conversation_tail_columns(
6238        tx,
6239        conversation_id,
6240        Some(ended_at),
6241        Some(last_message_idx),
6242        Some(last_message_created_at),
6243    )?;
6244    Ok(())
6245}
6246
6247fn collect_append_only_tail_messages<'a>(
6248    conv: &'a Conversation,
6249    existing_max_idx: i64,
6250    existing_max_created_at: i64,
6251) -> Option<ExistingConversationNewMessages<'a>> {
6252    if conv.messages.is_empty() {
6253        return Some(ExistingConversationNewMessages {
6254            messages: Vec::new(),
6255            new_chars: 0,
6256            idx_collision_count: 0,
6257            first_collision_idx: None,
6258        });
6259    }
6260
6261    let mut split_idx = None;
6262    let mut prev_idx = None;
6263    for (pos, msg) in conv.messages.iter().enumerate() {
6264        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6265            return None;
6266        }
6267        prev_idx = Some(msg.idx);
6268        if split_idx.is_none() && msg.idx > existing_max_idx {
6269            split_idx = Some(pos);
6270        }
6271    }
6272    let split_idx = split_idx?;
6273
6274    let mut seen_tail_idx = HashSet::new();
6275    let mut seen_tail_replay = HashSet::new();
6276    let mut new_chars = 0i64;
6277    let mut messages = Vec::new();
6278    for msg in &conv.messages[split_idx..] {
6279        let created_at = msg.created_at?;
6280        if created_at <= existing_max_created_at {
6281            return None;
6282        }
6283
6284        if !seen_tail_idx.insert(msg.idx) {
6285            return None;
6286        }
6287
6288        let replay_fingerprint = message_replay_fingerprint(msg);
6289        if !seen_tail_replay.insert(replay_fingerprint) {
6290            return None;
6291        }
6292
6293        new_chars += msg.content.len() as i64;
6294        messages.push(msg);
6295    }
6296
6297    Some(ExistingConversationNewMessages {
6298        messages,
6299        new_chars,
6300        idx_collision_count: 0,
6301        first_collision_idx: None,
6302    })
6303}
6304
/// Absolute difference between two optional timestamps.
///
/// Returns `i64::MAX` when either side is `None`, and also when the true
/// difference does not fit in an `i64`.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    let (Some(a), Some(b)) = (left, right) else {
        return i64::MAX;
    };
    // `abs_diff` cannot overflow; values above `i64::MAX` are clamped.
    i64::try_from(a.abs_diff(b)).unwrap_or(i64::MAX)
}
6314
6315fn conversation_merge_evidence(
6316    incoming_exact: &HashSet<MessageMergeFingerprint>,
6317    incoming_replay: &HashSet<MessageReplayFingerprint>,
6318    existing_exact: &HashSet<MessageMergeFingerprint>,
6319    existing_replay: &HashSet<MessageReplayFingerprint>,
6320    incoming_started_at: Option<i64>,
6321    existing_started_at: Option<i64>,
6322) -> Option<ConversationMergeEvidence> {
6323    let exact_overlap = incoming_exact.intersection(existing_exact).count();
6324    let replay_overlap = incoming_replay.intersection(existing_replay).count();
6325    if exact_overlap == 0 && replay_overlap == 0 {
6326        return None;
6327    }
6328
6329    let smaller_replay_set = incoming_replay.len().min(existing_replay.len());
6330    let started_close = timestamps_within_tolerance(
6331        incoming_started_at,
6332        existing_started_at,
6333        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
6334    );
6335    let full_replay_subset_match = smaller_replay_set >= 2 && replay_overlap == smaller_replay_set;
6336
6337    let merge_allowed = if started_close {
6338        exact_overlap >= 1 || replay_overlap >= 2
6339    } else {
6340        exact_overlap >= 2 || full_replay_subset_match
6341    };
6342
6343    merge_allowed.then_some(ConversationMergeEvidence {
6344        exact_overlap,
6345        replay_overlap,
6346        smaller_replay_set,
6347        started_close,
6348        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
6349    })
6350}
6351
/// Returns `true` when both timestamps are present and differ by at most
/// `tolerance_ms`. A missing timestamp or a negative tolerance always yields
/// `false`.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    let (Some(a), Some(b)) = (left, right) else {
        return false;
    };
    match u64::try_from(tolerance_ms) {
        Ok(limit) => a.abs_diff(b) <= limit,
        // A negative tolerance can never be satisfied by a non-negative distance.
        Err(_) => false,
    }
}
6360
6361fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
6362    if let Some(external_id) = conv.external_id.clone() {
6363        PendingConversationKey::External {
6364            source_id: conv.source_id.clone(),
6365            agent_id,
6366            external_id,
6367        }
6368    } else {
6369        PendingConversationKey::SourcePath {
6370            source_id: conv.source_id.clone(),
6371            agent_id,
6372            source_path: path_to_string(&conv.source_path),
6373            started_at: conversation_effective_started_at(conv),
6374        }
6375    }
6376}
6377
/// Message data needed for semantic embedding generation.
// NOTE(review): the code that fills this struct is outside this part of the
// file; the field notes below are inferred from names and types — confirm.
pub struct MessageForEmbedding {
    /// Id of the message.
    pub message_id: i64,
    /// Message timestamp, when known.
    pub created_at: Option<i64>,
    pub agent_id: i64,
    pub workspace_id: Option<i64>,
    /// Presumably a 32-bit hash of the message's source id — TODO confirm.
    pub source_id_hash: u32,
    /// Role as a string (not the `MessageRole` enum).
    pub role: String,
    /// Message text.
    pub content: String,
}
6388
6389// =========================================================================
6390// FrankenStorage CRUD operations
6391// =========================================================================
6392
6393impl FrankenStorage {
6394    /// Ensure an agent exists in the database, returning its ID.
6395    pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
6396        let cache_key = EnsuredAgentKey::from_agent(agent);
6397        if let Some(id) = self.cached_agent_id(&cache_key) {
6398            return Ok(id);
6399        }
6400
6401        let now = Self::now_millis();
6402        self.conn.execute_compat(
6403            "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
6404             VALUES(?1, ?2, ?3, ?4, ?5, ?6)
6405             ON CONFLICT(slug) DO UPDATE SET
6406                 name = excluded.name,
6407                 version = excluded.version,
6408                 kind = excluded.kind,
6409                 updated_at = excluded.updated_at
6410             WHERE NOT (
6411                 agents.name IS excluded.name
6412                 AND agents.version IS excluded.version
6413                 AND agents.kind IS excluded.kind
6414             )",
6415            fparams![
6416                agent.slug.as_str(),
6417                agent.name.as_str(),
6418                agent.version.as_deref(),
6419                cache_key.kind.as_str(),
6420                now,
6421                now
6422            ],
6423        )?;
6424
6425        let id = self
6426            .conn
6427            .query_row_map(
6428                "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
6429                fparams![agent.slug.as_str()],
6430                |row| row.get_typed(0),
6431            )
6432            .with_context(|| format!("fetching agent id for {}", agent.slug))?;
6433        self.mark_agent_ensured(cache_key, id);
6434        Ok(id)
6435    }
6436
6437    /// Ensure a workspace exists in the database, returning its ID.
6438    pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
6439        let path_str = path.to_string_lossy().to_string();
6440        let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
6441        if let Some(id) = self.cached_workspace_id(&cache_key) {
6442            return Ok(id);
6443        }
6444
6445        if let Some(display_name) = display_name {
6446            self.conn.execute_compat(
6447                "INSERT INTO workspaces(path, display_name)
6448                 VALUES(?1, ?2)
6449                 ON CONFLICT(path) DO UPDATE SET
6450                     display_name = excluded.display_name
6451                 WHERE NOT (workspaces.display_name IS excluded.display_name)",
6452                fparams![path_str.as_str(), display_name],
6453            )?;
6454        } else {
6455            self.conn.execute_compat(
6456                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
6457                fparams![path_str.as_str()],
6458            )?;
6459        }
6460
6461        let id = self
6462            .conn
6463            .query_row_map(
6464                "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
6465                fparams![path_str.as_str()],
6466                |row| row.get_typed(0),
6467            )
6468            .with_context(|| format!("fetching workspace id for {path_str}"))?;
6469        self.mark_workspace_ensured(cache_key, id);
6470        Ok(id)
6471    }
6472
6473    /// Get current time as milliseconds since epoch.
6474    pub fn now_millis() -> i64 {
6475        SystemTime::now()
6476            .duration_since(UNIX_EPOCH)
6477            .map(|d| i64::try_from(d.as_millis()).unwrap_or(i64::MAX))
6478            .unwrap_or(0)
6479    }
6480
6481    /// Convert a millisecond timestamp to a day ID (days since 2020-01-01).
6482    pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
6483        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6484        let secs = timestamp_ms.div_euclid(1000);
6485        (secs - EPOCH_2020_SECS).div_euclid(86400)
6486    }
6487
6488    /// Convert a millisecond timestamp to an hour ID (hours since 2020-01-01 00:00 UTC).
6489    pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
6490        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6491        let secs = timestamp_ms.div_euclid(1000);
6492        (secs - EPOCH_2020_SECS).div_euclid(3600)
6493    }
6494
6495    /// Convert a day ID back to milliseconds (start of day).
6496    pub fn millis_from_day_id(day_id: i64) -> i64 {
6497        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6498        (EPOCH_2020_SECS + day_id * 86400) * 1000
6499    }
6500
6501    /// Convert an hour ID back to milliseconds (start of hour).
6502    pub fn millis_from_hour_id(hour_id: i64) -> i64 {
6503        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6504        (EPOCH_2020_SECS + hour_id * 3600) * 1000
6505    }
6506
6507    /// Get the timestamp of the last successful scan.
6508    pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
6509        let result: Result<String, _> = self.conn.query_row_map(
6510            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
6511            fparams![],
6512            |row| row.get_typed(0),
6513        );
6514        match result.optional() {
6515            Ok(Some(s)) => Ok(s.parse().ok()),
6516            Ok(None) => Ok(None),
6517            Err(e) => Err(e.into()),
6518        }
6519    }
6520
6521    /// Set the timestamp of the last successful scan (milliseconds since epoch).
6522    pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
6523        self.conn.execute_compat(
6524            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
6525            fparams![ts.to_string()],
6526        )?;
6527        Ok(())
6528    }
6529
6530    /// Get the timestamp of the last successful index completion.
6531    pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
6532        let result: Result<String, _> = self.conn.query_row_map(
6533            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
6534            fparams![],
6535            |row| row.get_typed(0),
6536        );
6537        match result.optional() {
6538            Ok(Some(s)) => Ok(s.parse().ok()),
6539            Ok(None) => Ok(None),
6540            Err(e) => Err(e.into()),
6541        }
6542    }
6543
6544    /// Set the timestamp of the last successful index completion (milliseconds since epoch).
6545    pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
6546        self.conn.execute_compat(
6547            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
6548            fparams![ts.to_string()],
6549        )?;
6550        Ok(())
6551    }
6552
6553    /// List all registered agents.
6554    pub fn list_agents(&self) -> Result<Vec<Agent>> {
6555        self.conn
6556            .query_map_collect(
6557                "SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
6558                fparams![],
6559                |row| {
6560                    let kind: String = row.get_typed(4)?;
6561                    Ok(Agent {
6562                        id: Some(row.get_typed(0)?),
6563                        slug: row.get_typed(1)?,
6564                        name: row.get_typed(2)?,
6565                        version: row.get_typed(3)?,
6566                        kind: match kind.as_str() {
6567                            "cli" => AgentKind::Cli,
6568                            "vscode" => AgentKind::VsCode,
6569                            _ => AgentKind::Hybrid,
6570                        },
6571                    })
6572                },
6573            )
6574            .with_context(|| "listing agents")
6575    }
6576
6577    /// Count all archived conversations.
6578    pub fn total_conversation_count(&self) -> Result<usize> {
6579        let count: i64 =
6580            self.conn
6581                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6582                    row.get_typed(0)
6583                })?;
6584        Ok(count.max(0) as usize)
6585    }
6586
6587    /// Count all archived messages.
6588    pub fn total_message_count(&self) -> Result<usize> {
6589        let count: i64 =
6590            self.conn
6591                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
6592                    row.get_typed(0)
6593                })?;
6594        Ok(count.max(0) as usize)
6595    }
6596
6597    /// Remove all archived conversations/messages for one agent slug.
6598    ///
6599    /// This only affects cass's local archive database. Source session files on
6600    /// disk are untouched.
6601    pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
6602        let normalized = agent_slug.trim().to_ascii_lowercase();
6603        if normalized.is_empty() {
6604            return Err(anyhow!("agent slug cannot be empty"));
6605        }
6606
6607        let Some(agent_id) = self
6608            .conn
6609            .query_row_map(
6610                "SELECT id FROM agents WHERE slug = ?1",
6611                fparams![normalized.as_str()],
6612                |row| row.get_typed::<i64>(0),
6613            )
6614            .optional()?
6615        else {
6616            return Ok(AgentArchivePurgeResult::default());
6617        };
6618
6619        let conversations_deleted: i64 = self.conn.query_row_map(
6620            "SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
6621            fparams![agent_id],
6622            |row| row.get_typed(0),
6623        )?;
6624        if conversations_deleted == 0 {
6625            return Ok(AgentArchivePurgeResult::default());
6626        }
6627
6628        let messages_deleted: i64 = self.conn.query_row_map(
6629            "SELECT COUNT(*)
6630             FROM messages
6631             WHERE conversation_id IN (
6632                 SELECT id FROM conversations WHERE agent_id = ?1
6633             )",
6634            fparams![agent_id],
6635            |row| row.get_typed(0),
6636        )?;
6637
6638        let mut tx = self.conn.transaction()?;
6639        tx.execute_compat(
6640            "DELETE FROM conversation_external_lookup
6641             WHERE conversation_id IN (
6642                 SELECT id FROM conversations WHERE agent_id = ?1
6643             )",
6644            fparams![agent_id],
6645        )?;
6646        tx.execute_compat(
6647            "DELETE FROM conversation_external_tail_lookup
6648             WHERE conversation_id IN (
6649                 SELECT id FROM conversations WHERE agent_id = ?1
6650             )",
6651            fparams![agent_id],
6652        )?;
6653        tx.execute_compat(
6654            "DELETE FROM conversations WHERE agent_id = ?1",
6655            fparams![agent_id],
6656        )?;
6657        tx.execute_compat(
6658            "DELETE FROM agents
6659             WHERE id = ?1
6660               AND NOT EXISTS (
6661                   SELECT 1 FROM conversations WHERE agent_id = ?1
6662               )",
6663            fparams![agent_id],
6664        )?;
6665        tx.commit()?;
6666
6667        Ok(AgentArchivePurgeResult {
6668            conversations_deleted: conversations_deleted.max(0) as usize,
6669            messages_deleted: messages_deleted.max(0) as usize,
6670        })
6671    }
6672
6673    /// List all registered workspaces.
6674    pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
6675        self.conn
6676            .query_map_collect(
6677                "SELECT id, path, display_name FROM workspaces ORDER BY path",
6678                fparams![],
6679                |row| {
6680                    let path_str: String = row.get_typed(1)?;
6681                    Ok(crate::model::types::Workspace {
6682                        id: Some(row.get_typed(0)?),
6683                        path: Path::new(&path_str).to_path_buf(),
6684                        display_name: row.get_typed(2)?,
6685                    })
6686                },
6687            )
6688            .with_context(|| "listing workspaces")
6689    }
6690
6691    /// List conversations with pagination.
6692    pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
6693        // Avoid the multi-table JOIN with LIMIT/OFFSET that triggers
6694        // frankensqlite's materialization fallback (see c38edcd9, 860acb12).
6695        // Use correlated subqueries for the tiny agents (~20 rows) and
6696        // workspaces (~30 rows) lookup tables and degrade NULL agent_id to
6697        // the same 'unknown' sentinel that 8a0c547c established for the
6698        // lexical rebuild path.
6699        self.conn
6700            .query_map_collect(
6701                r"SELECT c.id,
6702                         COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
6703                         (SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
6704                         c.external_id, c.title, c.source_path,
6705                         c.started_at,
6706                         COALESCE(
6707                             (SELECT ts.ended_at
6708                              FROM conversation_tail_state ts
6709                              WHERE ts.conversation_id = c.id),
6710                             c.ended_at
6711                         ),
6712                         c.approx_tokens, c.metadata_json,
6713                         c.source_id, c.origin_host, c.metadata_bin
6714                FROM conversations c
6715                ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
6716                LIMIT ?1 OFFSET ?2",
6717                fparams![limit, offset],
6718                |row| {
6719                    let workspace_path: Option<String> = row.get_typed(2)?;
6720                    let source_path: String = row.get_typed(5)?;
6721                    let raw_source_id: Option<String> = row.get_typed(10)?;
6722                    let raw_origin_host: Option<String> = row.get_typed(11)?;
6723                    let (source_id, _, origin_host) = normalized_storage_source_parts(
6724                        raw_source_id.as_deref(),
6725                        None,
6726                        raw_origin_host.as_deref(),
6727                    );
6728                    Ok(Conversation {
6729                        id: Some(row.get_typed(0)?),
6730                        agent_slug: row.get_typed(1)?,
6731                        workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
6732                        external_id: row.get_typed(3)?,
6733                        title: row.get_typed(4)?,
6734                        source_path: Path::new(&source_path).to_path_buf(),
6735                        started_at: row.get_typed(6)?,
6736                        ended_at: row.get_typed(7)?,
6737                        approx_tokens: row.get_typed(8)?,
6738                        metadata_json: franken_read_metadata_compat(row, 9, 12),
6739                        messages: Vec::new(),
6740                        source_id,
6741                        origin_host,
6742                    })
6743                },
6744            )
6745            .with_context(|| "listing conversations")
6746    }
6747
6748    /// Build lookup maps for agents and workspaces to avoid JOINs in
6749    /// paged conversation queries.  Both tables are tiny (tens of rows)
6750    /// so this is effectively free.
6751    pub fn build_lexical_rebuild_lookups(
6752        &self,
6753    ) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
6754        let agents: HashMap<i64, String> = self
6755            .conn
6756            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
6757                Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
6758            })
6759            .with_context(|| "loading agent lookup for lexical rebuild")?
6760            .into_iter()
6761            .collect();
6762        let workspaces: HashMap<i64, PathBuf> = self
6763            .conn
6764            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
6765                let path_str: String = row.get_typed(1)?;
6766                Ok((row.get_typed::<i64>(0)?, PathBuf::from(path_str)))
6767            })
6768            .with_context(|| "loading workspace lookup for lexical rebuild")?
6769            .into_iter()
6770            .collect();
6771        Ok((agents, workspaces))
6772    }
6773
6774    /// List per-conversation message footprints in primary-key order.
6775    ///
6776    /// This deliberately avoids rebuild-path JOINs. Instead we merge ordered
6777    /// single-table reads over `conversations` and the narrow
6778    /// `conversation_tail_state` cache in Rust, then use `last_message_idx + 1`
6779    /// as a planning estimate.
6780    ///
6781    /// The planner only needs a sizing heuristic; exact message and byte
6782    /// accounting is performed later by the rebuild packet pipeline as it reads
6783    /// message content for indexing. Rows missing both tail-cache sources fall
6784    /// back to `MAX(messages.idx) + 1`, which preserves legacy upgraded
6785    /// databases without treating populated conversations as empty.
6786    pub fn list_conversation_footprints_for_lexical_rebuild(
6787        &self,
6788    ) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
6789        let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
6790            "SELECT conversation_id, last_message_idx
6791             FROM conversation_tail_state
6792             ORDER BY conversation_id ASC",
6793            fparams![],
6794            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
6795        ) {
6796            Ok(rows) => rows,
6797            Err(err) if error_indicates_missing_table(&err) => Vec::new(),
6798            Err(err) => {
6799                return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
6800            }
6801        };
6802        let tail_state_by_conversation: HashMap<i64, Option<i64>> =
6803            tail_state_rows.into_iter().collect();
6804
6805        let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
6806            "SELECT id, last_message_idx
6807             FROM conversations
6808             ORDER BY id ASC",
6809            fparams![],
6810            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
6811        ) {
6812            Ok(rows) => rows,
6813            Err(err) if error_indicates_missing_column(&err) => self
6814                .conn
6815                .query_map_collect(
6816                    "SELECT id
6817                     FROM conversations
6818                     ORDER BY id ASC",
6819                    fparams![],
6820                    |row| Ok((row.get_typed::<i64>(0)?, None)),
6821                )
6822                .with_context(|| {
6823                    "listing lexical rebuild conversation ids after missing tail column fallback"
6824                })?,
6825            Err(err) => {
6826                return Err(err)
6827                    .with_context(|| "listing lexical rebuild conversation footprint estimates");
6828            }
6829        };
6830
6831        let mut footprints = Vec::with_capacity(rows.len());
6832        let mut missing_tail_positions = HashMap::new();
6833        for (conversation_id, conversation_last_message_idx) in rows {
6834            let last_message_idx = tail_state_by_conversation
6835                .get(&conversation_id)
6836                .copied()
6837                .flatten()
6838                .or(conversation_last_message_idx);
6839            let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
6840            else {
6841                missing_tail_positions.insert(conversation_id, footprints.len());
6842                footprints.push(LexicalRebuildConversationFootprintRow {
6843                    conversation_id,
6844                    message_count: 0,
6845                    message_bytes: 0,
6846                });
6847                continue;
6848            };
6849            footprints.push(lexical_rebuild_conversation_footprint_from_count(
6850                conversation_id,
6851                message_count,
6852            ));
6853        }
6854
6855        let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
6856        if !missing_tail_positions.is_empty() {
6857            self.fill_missing_lexical_rebuild_footprint_tails(
6858                &mut footprints,
6859                &missing_tail_positions,
6860            )?;
6861        }
6862        if !every_footprint_was_missing_tail {
6863            self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
6864        }
6865
6866        Ok(footprints)
6867    }
6868
6869    pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
6870        let total_conversations: i64 = self
6871            .conn
6872            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6873                row.get_typed(0)
6874            })
6875            .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
6876        let total_conversations = usize::try_from(total_conversations.max(0)).unwrap_or(usize::MAX);
6877        if total_conversations == 0 {
6878            return Ok(true);
6879        }
6880
6881        let conversation_columns = franken_table_column_names(&self.conn, "conversations")?;
6882        let conversations_have_tail_column = conversation_columns.contains("last_message_idx");
6883        let tail_state_has_tail_column =
6884            match franken_table_column_names(&self.conn, "conversation_tail_state") {
6885                Ok(columns) => columns.contains("last_message_idx"),
6886                Err(err) if error_indicates_missing_table(&err) => false,
6887                Err(err) => {
6888                    return Err(err)
6889                        .with_context(|| "reading lexical rebuild tail-state metadata columns");
6890                }
6891            };
6892        if !conversations_have_tail_column && !tail_state_has_tail_column {
6893            return Ok(false);
6894        }
6895
6896        let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
6897            (true, true) => {
6898                "SELECT COUNT(*)
6899                 FROM conversations c
6900                 LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
6901                 WHERE c.last_message_idx IS NOT NULL
6902                    OR ts.last_message_idx IS NOT NULL"
6903            }
6904            (true, false) => {
6905                "SELECT COUNT(*)
6906                 FROM conversations
6907                 WHERE last_message_idx IS NOT NULL"
6908            }
6909            (false, true) => {
6910                "SELECT COUNT(*)
6911                 FROM conversations c
6912                 WHERE EXISTS (
6913                     SELECT 1
6914                     FROM conversation_tail_state ts
6915                     WHERE ts.conversation_id = c.id
6916                       AND ts.last_message_idx IS NOT NULL
6917                 )"
6918            }
6919            (false, false) => unreachable!("checked before covered_sql selection"),
6920        };
6921        let covered_conversations: i64 = self
6922            .conn
6923            .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
6924            .with_context(
6925                || "counting conversations covered by lexical rebuild tail footprint metadata",
6926            )?;
6927        let covered_conversations =
6928            usize::try_from(covered_conversations.max(0)).unwrap_or(usize::MAX);
6929
6930        Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
6931            total_conversations,
6932            covered_conversations,
6933        ))
6934    }
6935
6936    fn raise_lexical_rebuild_footprints_to_exact_message_counts(
6937        &self,
6938        footprints: &mut [LexicalRebuildConversationFootprintRow],
6939    ) -> Result<()> {
6940        if footprints.is_empty() {
6941            return Ok(());
6942        }
6943
6944        let positions_by_conversation: HashMap<i64, usize> = footprints
6945            .iter()
6946            .enumerate()
6947            .map(|(position, footprint)| (footprint.conversation_id, position))
6948            .collect();
6949        self.conn
6950            .query_with_params_for_each(
6951                "SELECT conversation_id, COUNT(*) AS message_count
6952                 FROM messages
6953                 GROUP BY conversation_id
6954                 ORDER BY conversation_id ASC",
6955                &[] as &[SqliteValue],
6956                |row| {
6957                    let conversation_id: i64 = row.get_typed(0)?;
6958                    let exact_count: i64 = row.get_typed(1)?;
6959                    let Some(position) = positions_by_conversation.get(&conversation_id) else {
6960                        return Ok(());
6961                    };
6962                    let exact_count = usize::try_from(exact_count.max(0)).unwrap_or(usize::MAX);
6963                    let footprint = &mut footprints[*position];
6964                    if exact_count > footprint.message_count {
6965                        footprint.message_count = exact_count;
6966                        footprint.message_bytes =
6967                            footprint.message_bytes.max(exact_count.saturating_mul(
6968                                LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
6969                            ));
6970                    }
6971                    Ok(())
6972                },
6973            )
6974            .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
6975        Ok(())
6976    }
6977
6978    fn fill_missing_lexical_rebuild_footprint_tails(
6979        &self,
6980        footprints: &mut [LexicalRebuildConversationFootprintRow],
6981        missing_tail_positions: &HashMap<i64, usize>,
6982    ) -> Result<()> {
6983        if missing_tail_positions.len() <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT {
6984            for (conversation_id, position) in missing_tail_positions {
6985                let last_message_idx: Option<i64> = self
6986                    .conn
6987                    .query_row_map(
6988                        "SELECT MAX(idx) FROM messages WHERE conversation_id = ?1",
6989                        fparams![*conversation_id],
6990                        |row| row.get_typed(0),
6991                    )
6992                    .with_context(|| {
6993                        format!(
6994                            "looking up missing lexical rebuild tail estimate for conversation {conversation_id}"
6995                        )
6996                    })?;
6997                if let Some(message_count) =
6998                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
6999                {
7000                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7001                        *conversation_id,
7002                        message_count,
7003                    );
7004                }
7005            }
7006            return Ok(());
7007        }
7008
7009        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7010            footprints,
7011            missing_tail_positions,
7012            "SELECT conversation_id, MAX(idx) AS last_message_idx
7013             FROM messages INDEXED BY idx_messages_conv_idx
7014             GROUP BY conversation_id
7015             ORDER BY conversation_id ASC",
7016        )
7017        .or_else(|err| {
7018            if err
7019                .to_string()
7020                .contains("no such index: idx_messages_conv_idx")
7021            {
7022                return self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7023                    footprints,
7024                    missing_tail_positions,
7025                    "SELECT conversation_id, MAX(idx) AS last_message_idx
7026                     FROM messages
7027                     GROUP BY conversation_id
7028                     ORDER BY conversation_id ASC",
7029                );
7030            }
7031            Err(err)
7032        })
7033        .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
7034
7035        Ok(())
7036    }
7037
7038    fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7039        &self,
7040        footprints: &mut [LexicalRebuildConversationFootprintRow],
7041        missing_tail_positions: &HashMap<i64, usize>,
7042        sql: &str,
7043    ) -> Result<()> {
7044        self.conn
7045            .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
7046                let conversation_id: i64 = row.get_typed(0)?;
7047                let last_message_idx: Option<i64> = row.get_typed(1)?;
7048                let Some(position) = missing_tail_positions.get(&conversation_id) else {
7049                    return Ok(());
7050                };
7051                if let Some(message_count) =
7052                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7053                {
7054                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7055                        conversation_id,
7056                        message_count,
7057                    );
7058                }
7059                Ok(())
7060            })
7061            .with_context(|| "grouping lexical rebuild missing tail estimates")
7062    }
7063
7064    /// List conversation ids in the stable order used by lexical rebuilds.
7065    pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
7066        self.conn
7067            .query_map_collect(
7068                "SELECT id FROM conversations ORDER BY id ASC",
7069                fparams![],
7070                |row| row.get_typed(0),
7071            )
7072            .with_context(|| "listing conversation ids for lexical rebuild")
7073    }
7074    /// Legacy OFFSET-based traversal for one-time checkpoint migration only.
7075    ///
7076    /// New code must use `list_conversations_for_lexical_rebuild_after_id`
7077    /// for keyset pagination.
7078    pub fn list_conversations_for_lexical_rebuild_by_offset(
7079        &self,
7080        limit: i64,
7081        offset: i64,
7082        agent_slugs: &HashMap<i64, String>,
7083        workspace_paths: &HashMap<i64, PathBuf>,
7084    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7085        // Single-table query avoids the 3-table JOIN that triggers
7086        // frankensqlite's full-materialization fallback path.
7087        self.conn
7088            .query_map_collect(
7089                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7090                       started_at,
7091                       COALESCE(
7092                           (SELECT ts.ended_at
7093                            FROM conversation_tail_state ts
7094                            WHERE ts.conversation_id = conversations.id),
7095                           ended_at
7096                       ),
7097                       source_id, origin_host
7098                FROM conversations
7099                ORDER BY id ASC
7100                LIMIT ?1 OFFSET ?2",
7101                fparams![limit, offset],
7102                |row| {
7103                    let agent_id: Option<i64> = row.get_typed(1)?;
7104                    let workspace_id: Option<i64> = row.get_typed(2)?;
7105                    let source_path: String = row.get_typed(5)?;
7106                    let raw_source_id: Option<String> = row.get_typed(8)?;
7107                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7108                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7109                        raw_source_id.as_deref(),
7110                        None,
7111                        raw_origin_host.as_deref(),
7112                    );
7113                    Ok(LexicalRebuildConversationRow {
7114                        id: Some(row.get_typed(0)?),
7115                        agent_slug: agent_id
7116                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7117                            .unwrap_or_else(|| "unknown".to_string()),
7118                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7119                        external_id: row.get_typed(3)?,
7120                        title: row.get_typed(4)?,
7121                        source_path: Path::new(&source_path).to_path_buf(),
7122                        started_at: row.get_typed(6)?,
7123                        ended_at: row.get_typed(7)?,
7124                        source_id,
7125                        origin_host,
7126                    })
7127                },
7128            )
7129            .with_context(|| "listing conversations for lexical rebuild")
7130    }
7131
7132    /// List lexical rebuild conversations strictly after the given primary key.
7133    ///
7134    /// Keyset pagination keeps later rebuild pages as cheap as earlier ones,
7135    /// avoiding the ever-growing `OFFSET` scan cost during large rebuilds.
7136    pub fn list_conversations_for_lexical_rebuild_after_id(
7137        &self,
7138        limit: i64,
7139        after_conversation_id: i64,
7140        agent_slugs: &HashMap<i64, String>,
7141        workspace_paths: &HashMap<i64, PathBuf>,
7142    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7143        self.conn
7144            .query_map_collect(
7145                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7146                       started_at,
7147                       COALESCE(
7148                           (SELECT ts.ended_at
7149                            FROM conversation_tail_state ts
7150                            WHERE ts.conversation_id = conversations.id),
7151                           ended_at
7152                       ),
7153                       source_id, origin_host
7154                FROM conversations
7155                WHERE id > ?2
7156                ORDER BY id ASC
7157                LIMIT ?1",
7158                fparams![limit, after_conversation_id],
7159                |row| {
7160                    let agent_id: Option<i64> = row.get_typed(1)?;
7161                    let workspace_id: Option<i64> = row.get_typed(2)?;
7162                    let source_path: String = row.get_typed(5)?;
7163                    let raw_source_id: Option<String> = row.get_typed(8)?;
7164                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7165                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7166                        raw_source_id.as_deref(),
7167                        None,
7168                        raw_origin_host.as_deref(),
7169                    );
7170                    Ok(LexicalRebuildConversationRow {
7171                        id: Some(row.get_typed(0)?),
7172                        agent_slug: agent_id
7173                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7174                            .unwrap_or_else(|| "unknown".to_string()),
7175                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7176                        external_id: row.get_typed(3)?,
7177                        title: row.get_typed(4)?,
7178                        source_path: Path::new(&source_path).to_path_buf(),
7179                        started_at: row.get_typed(6)?,
7180                        ended_at: row.get_typed(7)?,
7181                        source_id,
7182                        origin_host,
7183                    })
7184                },
7185            )
7186            .with_context(|| {
7187                format!(
7188                    "listing conversations for lexical rebuild after id {after_conversation_id}"
7189                )
7190            })
7191    }
7192
7193    /// List lexical rebuild conversations inside an `(after_id, through_id]`
7194    /// primary-key window.
7195    ///
7196    /// This lets the rebuild producer respect planned shard boundaries without
7197    /// falling back to client-side trimming or multi-table joins.
7198    pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
7199        &self,
7200        limit: i64,
7201        after_conversation_id: i64,
7202        through_conversation_id: i64,
7203        agent_slugs: &HashMap<i64, String>,
7204        workspace_paths: &HashMap<i64, PathBuf>,
7205    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7206        if through_conversation_id <= after_conversation_id {
7207            return Ok(Vec::new());
7208        }
7209        self.conn
7210            .query_map_collect(
7211                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7212                       started_at,
7213                       COALESCE(
7214                           (SELECT ts.ended_at
7215                            FROM conversation_tail_state ts
7216                            WHERE ts.conversation_id = conversations.id),
7217                           ended_at
7218                       ),
7219                       source_id, origin_host
7220                FROM conversations
7221                WHERE id > ?2 AND id <= ?3
7222                ORDER BY id ASC
7223                LIMIT ?1",
7224                fparams![limit, after_conversation_id, through_conversation_id],
7225                |row| {
7226                    let agent_id: Option<i64> = row.get_typed(1)?;
7227                    let workspace_id: Option<i64> = row.get_typed(2)?;
7228                    let source_path: String = row.get_typed(5)?;
7229                    let raw_source_id: Option<String> = row.get_typed(8)?;
7230                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7231                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7232                        raw_source_id.as_deref(),
7233                        None,
7234                        raw_origin_host.as_deref(),
7235                    );
7236                    Ok(LexicalRebuildConversationRow {
7237                        id: Some(row.get_typed(0)?),
7238                        agent_slug: agent_id
7239                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7240                            .unwrap_or_else(|| "unknown".to_string()),
7241                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7242                        external_id: row.get_typed(3)?,
7243                        title: row.get_typed(4)?,
7244                        source_path: Path::new(&source_path).to_path_buf(),
7245                        started_at: row.get_typed(6)?,
7246                        ended_at: row.get_typed(7)?,
7247                        source_id,
7248                        origin_host,
7249                    })
7250                },
7251            )
7252            .with_context(|| {
7253                format!(
7254                    "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
7255                )
7256            })
7257    }
7258
7259    /// Fetch messages for a conversation.
7260    pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
7261        let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7262             FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7263             WHERE conversation_id = ?1 ORDER BY idx";
7264        let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7265             FROM messages \
7266             WHERE conversation_id = ?1 ORDER BY idx";
7267
7268        self.conn
7269            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7270                let role: String = row.get_typed(2)?;
7271                Ok(Message {
7272                    id: Some(row.get_typed(0)?),
7273                    idx: row.get_typed(1)?,
7274                    role: match role.as_str() {
7275                        "user" => MessageRole::User,
7276                        "agent" | "assistant" => MessageRole::Agent,
7277                        "tool" => MessageRole::Tool,
7278                        "system" => MessageRole::System,
7279                        other => MessageRole::Other(other.to_string()),
7280                    },
7281                    author: row.get_typed(3)?,
7282                    created_at: row.get_typed(4)?,
7283                    content: row.get_typed(5)?,
7284                    extra_json: franken_read_message_extra_compat(row, 6, 7),
7285                    snippets: Vec::new(),
7286                })
7287            })
7288            .or_else(|err| {
7289                if err
7290                    .to_string()
7291                    .contains("no such index: sqlite_autoindex_messages_1")
7292                {
7293                    return self.conn.query_map_collect(
7294                        fallback_sql,
7295                        fparams![conversation_id],
7296                        |row| {
7297                            let role: String = row.get_typed(2)?;
7298                            Ok(Message {
7299                                id: Some(row.get_typed(0)?),
7300                                idx: row.get_typed(1)?,
7301                                role: match role.as_str() {
7302                                    "user" => MessageRole::User,
7303                                    "agent" | "assistant" => MessageRole::Agent,
7304                                    "tool" => MessageRole::Tool,
7305                                    "system" => MessageRole::System,
7306                                    other => MessageRole::Other(other.to_string()),
7307                                },
7308                                author: row.get_typed(3)?,
7309                                created_at: row.get_typed(4)?,
7310                                content: row.get_typed(5)?,
7311                                extra_json: franken_read_message_extra_compat(row, 6, 7),
7312                                snippets: Vec::new(),
7313                            })
7314                        },
7315                    );
7316                }
7317                Err(err)
7318            })
7319            .with_context(|| format!("fetching messages for conversation {conversation_id}"))
7320    }
7321
7322    /// Fetch messages for lexical index rebuilds without deserializing extra metadata.
7323    ///
7324    /// Tantivy only needs message text and core envelope fields, so avoiding
7325    /// `extra_json` here prevents rebuilds from rehydrating enormous historical
7326    /// payloads that are irrelevant to lexical search.
7327    pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
7328        let hinted_sql = "SELECT id, idx, role, author, created_at, content \
7329                 FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7330                 WHERE conversation_id = ?1 ORDER BY idx";
7331        let fallback_sql = "SELECT id, idx, role, author, created_at, content \
7332                 FROM messages \
7333                 WHERE conversation_id = ?1 ORDER BY idx";
7334
7335        self.conn
7336            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7337                let role: String = row.get_typed(2)?;
7338                Ok(Message {
7339                    id: Some(row.get_typed(0)?),
7340                    idx: row.get_typed(1)?,
7341                    role: match role.as_str() {
7342                        "user" => MessageRole::User,
7343                        "agent" | "assistant" => MessageRole::Agent,
7344                        "tool" => MessageRole::Tool,
7345                        "system" => MessageRole::System,
7346                        other => MessageRole::Other(other.to_string()),
7347                    },
7348                    author: row.get_typed(3)?,
7349                    created_at: row.get_typed(4)?,
7350                    content: row.get_typed(5)?,
7351                    extra_json: serde_json::Value::Null,
7352                    snippets: Vec::new(),
7353                })
7354            })
7355            .or_else(|err| {
7356                if err
7357                    .to_string()
7358                    .contains("no such index: sqlite_autoindex_messages_1")
7359                {
7360                    return self.conn.query_map_collect(
7361                        fallback_sql,
7362                        fparams![conversation_id],
7363                        |row| {
7364                            let role: String = row.get_typed(2)?;
7365                            Ok(Message {
7366                                id: Some(row.get_typed(0)?),
7367                                idx: row.get_typed(1)?,
7368                                role: match role.as_str() {
7369                                    "user" => MessageRole::User,
7370                                    "agent" | "assistant" => MessageRole::Agent,
7371                                    "tool" => MessageRole::Tool,
7372                                    "system" => MessageRole::System,
7373                                    other => MessageRole::Other(other.to_string()),
7374                                },
7375                                author: row.get_typed(3)?,
7376                                created_at: row.get_typed(4)?,
7377                                content: row.get_typed(5)?,
7378                                extra_json: serde_json::Value::Null,
7379                                snippets: Vec::new(),
7380                            })
7381                        },
7382                    );
7383                }
7384                Err(err)
7385            })
7386            .with_context(|| {
7387                format!("fetching messages for lexical rebuild of conversation {conversation_id}")
7388            })
7389    }
7390
7391    /// Fetch messages for multiple conversations during lexical rebuilds.
7392    ///
7393    /// This preserves the lightweight lexical-rebuild projection while avoiding
7394    /// one round-trip per conversation when rebuilding large canonical indexes.
7395    pub fn fetch_messages_for_lexical_rebuild_batch(
7396        &self,
7397        conversation_ids: &[i64],
7398        max_messages: Option<usize>,
7399        max_content_bytes: Option<usize>,
7400    ) -> Result<HashMap<i64, Vec<Message>>> {
7401        if conversation_ids.is_empty() {
7402            return Ok(HashMap::new());
7403        }
7404
7405        let mut grouped: HashMap<i64, Vec<Message>> =
7406            HashMap::with_capacity(conversation_ids.len());
7407        let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
7408        let mut total_messages = 0usize;
7409        let mut total_content_bytes = 0usize;
7410
7411        // The apparent single-query shape (`WHERE conversation_id IN (...) ORDER BY ...`)
7412        // is a bad frankensqlite plan for large live databases: it can
7413        // materialize far more of `messages` than the requested conversations.
7414        // Reuse the hinted per-conversation primary-key lookup instead.
7415        for conversation_id in conversation_ids {
7416            if !fetched_conversation_ids.insert(*conversation_id) {
7417                continue;
7418            }
7419
7420            let messages = self
7421                .fetch_messages_for_lexical_rebuild(*conversation_id)
7422                .with_context(|| {
7423                    format!("fetching lexical rebuild messages for conversation {conversation_id}")
7424                })?;
7425            total_messages = total_messages.saturating_add(messages.len());
7426            if let Some(limit) = max_messages
7427                && total_messages > limit
7428            {
7429                return Err(anyhow!(
7430                    "lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
7431                    conversation_ids.len()
7432                ));
7433            }
7434
7435            let message_bytes = messages
7436                .iter()
7437                .map(|message| message.content.len())
7438                .sum::<usize>();
7439            total_content_bytes = total_content_bytes.saturating_add(message_bytes);
7440            if let Some(limit) = max_content_bytes
7441                && total_content_bytes > limit
7442            {
7443                return Err(anyhow!(
7444                    "lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
7445                    conversation_ids.len()
7446                ));
7447            }
7448
7449            if !messages.is_empty() {
7450                grouped.insert(*conversation_id, messages);
7451            }
7452        }
7453
7454        Ok(grouped)
7455    }
7456
7457    /// Stream lexical rebuild message rows in `(conversation_id, idx)` order
7458    /// without materializing the full result set.
7459    pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
7460        &self,
7461        start_conversation_id: i64,
7462        end_conversation_id: i64,
7463        mut f: F,
7464    ) -> Result<()>
7465    where
7466        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7467    {
7468        if end_conversation_id < start_conversation_id {
7469            return Ok(());
7470        }
7471
7472        let conversation_ids: Vec<i64> = self
7473            .conn
7474            .query_map_collect(
7475                "SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
7476                fparams![start_conversation_id, end_conversation_id],
7477                |row| row.get_typed(0),
7478            )
7479            .with_context(|| "listing conversation ids for streamed lexical rebuild")?;
7480
7481        for conversation_id in conversation_ids {
7482            let messages = self
7483                .fetch_messages_for_lexical_rebuild(conversation_id)
7484                .with_context(|| {
7485                    format!("streaming lexical rebuild messages for conversation {conversation_id}")
7486                })?;
7487
7488            for message in messages {
7489                let message_id = message.id.ok_or_else(|| {
7490                    anyhow!(
7491                        "lexical rebuild message missing id for conversation {conversation_id} idx {}",
7492                        message.idx
7493                    )
7494                })?;
7495                f(LexicalRebuildMessageRow {
7496                    conversation_id,
7497                    id: message_id,
7498                    idx: message.idx,
7499                    role: role_str(&message.role),
7500                    author: message.author,
7501                    created_at: message.created_at,
7502                    content: message.content,
7503                })?;
7504            }
7505        }
7506
7507        Ok(())
7508    }
7509
7510    /// Stream grouped lexical rebuild message rows in `(conversation_id, idx)`
7511    /// order by reusing the canonical per-message stream and coalescing rows
7512    /// per conversation.
7513    pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
7514        &self,
7515        start_conversation_id: i64,
7516        end_conversation_id: i64,
7517        mut f: F,
7518    ) -> Result<()>
7519    where
7520        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7521    {
7522        if end_conversation_id < start_conversation_id {
7523            return Ok(());
7524        }
7525
7526        let mut current_conversation_id: Option<i64> = None;
7527        let mut current_messages: LexicalRebuildGroupedMessageRows = SmallVec::new();
7528        let mut current_last_message_id = 0i64;
7529        let mut flush_current = |current_conversation_id: &mut Option<i64>,
7530                                 current_messages: &mut LexicalRebuildGroupedMessageRows,
7531                                 current_last_message_id: &mut i64|
7532         -> Result<()> {
7533            let Some(conversation_id) = current_conversation_id.take() else {
7534                return Ok(());
7535            };
7536            let messages = std::mem::take(current_messages);
7537            let last_message_id = std::mem::take(current_last_message_id);
7538            f(conversation_id, messages, last_message_id)
7539        };
7540
7541        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7542            start_conversation_id,
7543            end_conversation_id,
7544            |row| {
7545                if current_conversation_id != Some(row.conversation_id) {
7546                    flush_current(
7547                        &mut current_conversation_id,
7548                        &mut current_messages,
7549                        &mut current_last_message_id,
7550                    )?;
7551                    current_conversation_id = Some(row.conversation_id);
7552                }
7553                current_last_message_id = row.id;
7554                current_messages.push(LexicalRebuildGroupedMessageRow {
7555                    idx: row.idx,
7556                    is_tool_role: row.role == "tool",
7557                    created_at: row.created_at,
7558                    content: row.content,
7559                });
7560                Ok(())
7561            },
7562        )
7563        .with_context(|| "streaming grouped lexical rebuild messages")?;
7564
7565        flush_current(
7566            &mut current_conversation_id,
7567            &mut current_messages,
7568            &mut current_last_message_id,
7569        )
7570        .with_context(|| "flushing grouped lexical rebuild messages")
7571    }
7572
7573    /// Stream grouped lexical rebuild message rows from a starting conversation
7574    /// id to the end of the table.
7575    pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
7576        &self,
7577        start_conversation_id: i64,
7578        f: F,
7579    ) -> Result<()>
7580    where
7581        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7582    {
7583        self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
7584            start_conversation_id,
7585            i64::MAX,
7586            f,
7587        )
7588    }
7589
7590    /// Stream lexical rebuild message rows from a starting conversation id to
7591    /// the end of the table.
7592    pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
7593        &self,
7594        start_conversation_id: i64,
7595        f: F,
7596    ) -> Result<()>
7597    where
7598        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7599    {
7600        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7601            start_conversation_id,
7602            i64::MAX,
7603            f,
7604        )
7605    }
7606
7607    /// Get a source by ID.
7608    pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
7609        let result = self.conn.query_row_map(
7610            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
7611            fparams![id],
7612            |row| {
7613                let kind_str: String = row.get_typed(1)?;
7614                let config_json_str: Option<String> = row.get_typed(5)?;
7615                Ok(Source {
7616                    id: row.get_typed(0)?,
7617                    kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7618                    host_label: row.get_typed(2)?,
7619                    machine_id: row.get_typed(3)?,
7620                    platform: row.get_typed(4)?,
7621                    config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7622                    created_at: row.get_typed(6)?,
7623                    updated_at: row.get_typed(7)?,
7624                })
7625            },
7626        );
7627        Ok(result.optional()?)
7628    }
7629
7630    /// List all sources.
7631    pub fn list_sources(&self) -> Result<Vec<Source>> {
7632        self.conn
7633            .query_map_collect(
7634                "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
7635                fparams![],
7636                |row| {
7637                    let kind_str: String = row.get_typed(1)?;
7638                    let config_json_str: Option<String> = row.get_typed(5)?;
7639                    Ok(Source {
7640                        id: row.get_typed(0)?,
7641                        kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7642                        host_label: row.get_typed(2)?,
7643                        machine_id: row.get_typed(3)?,
7644                        platform: row.get_typed(4)?,
7645                        config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7646                        created_at: row.get_typed(6)?,
7647                        updated_at: row.get_typed(7)?,
7648                    })
7649                },
7650            )
7651            .with_context(|| "listing sources")
7652    }
7653
7654    /// Get IDs of all non-local sources.
7655    pub fn get_source_ids(&self) -> Result<Vec<String>> {
7656        self.conn
7657            .query_map_collect(
7658                "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
7659                fparams![],
7660                |row| row.get_typed(0),
7661            )
7662            .with_context(|| "listing source ids")
7663    }
7664
7665    /// Create or update a source.
7666    pub fn upsert_source(&self, source: &Source) -> Result<()> {
7667        self.invalidate_conversation_source_cache(source.id.as_str());
7668        let now = Self::now_millis();
7669        let kind_str = source.kind.to_string();
7670        let config_json_str = source
7671            .config_json
7672            .as_ref()
7673            .map(serde_json::to_string)
7674            .transpose()?;
7675
7676        // Re-indexing commonly reuses the same normalized source metadata
7677        // across many conversations. Skip the write entirely when the row is
7678        // already identical so we avoid needless WAL churn and timestamp bumps.
7679        self.conn.execute_compat(
7680            "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
7681             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
7682             ON CONFLICT(id) DO UPDATE SET
7683                 kind = excluded.kind,
7684                 host_label = excluded.host_label,
7685                 machine_id = excluded.machine_id,
7686                 platform = excluded.platform,
7687                 config_json = excluded.config_json,
7688                 updated_at = excluded.updated_at
7689             WHERE NOT (
7690                 sources.kind IS excluded.kind
7691                 AND sources.host_label IS excluded.host_label
7692                 AND sources.machine_id IS excluded.machine_id
7693                 AND sources.platform IS excluded.platform
7694                 AND sources.config_json IS excluded.config_json
7695             )",
7696            fparams![
7697                source.id.as_str(),
7698                kind_str.as_str(),
7699                source.host_label.as_deref(),
7700                source.machine_id.as_deref(),
7701                source.platform.as_deref(),
7702                config_json_str.as_deref(),
7703                source.created_at.unwrap_or(now),
7704                now
7705            ],
7706        )?;
7707        Ok(())
7708    }
7709
7710    fn historical_bundle_key_hash(
7711        version: u32,
7712        bundle: &HistoricalDatabaseBundle,
7713        include_bundle_stats: bool,
7714    ) -> String {
7715        let signature = if include_bundle_stats {
7716            format!(
7717                "{}:{}:{}:{}",
7718                version,
7719                bundle.root_path.display(),
7720                bundle.total_bytes,
7721                bundle.modified_at_ms
7722            )
7723        } else {
7724            format!("{}:{}", version, bundle.root_path.display())
7725        };
7726        blake3::hash(signature.as_bytes()).to_hex().to_string()
7727    }
7728
7729    fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7730        format!(
7731            "historical_bundle_salvaged:{}",
7732            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false)
7733        )
7734    }
7735
7736    fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7737        let signature = format!(
7738            "{}:{}:{}:{}",
7739            HISTORICAL_SALVAGE_LEDGER_VERSION,
7740            bundle.root_path.display(),
7741            bundle.total_bytes,
7742            bundle.modified_at_ms
7743        );
7744        format!(
7745            "historical_bundle_salvaged:{}",
7746            blake3::hash(signature.as_bytes()).to_hex()
7747        )
7748    }
7749
7750    fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
7751        format!(
7752            "historical_bundle_progress:{}",
7753            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false)
7754        )
7755    }
7756
7757    fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
7758        let signature = format!(
7759            "{}:{}:{}:{}",
7760            HISTORICAL_SALVAGE_PROGRESS_VERSION,
7761            bundle.root_path.display(),
7762            bundle.total_bytes,
7763            bundle.modified_at_ms
7764        );
7765        format!(
7766            "historical_bundle_progress:{}",
7767            blake3::hash(signature.as_bytes()).to_hex()
7768        )
7769    }
7770
7771    fn historical_bundle_already_imported(
7772        &self,
7773        bundle: &HistoricalDatabaseBundle,
7774    ) -> Result<bool> {
7775        for key in [
7776            Self::historical_bundle_meta_key(bundle),
7777            Self::historical_bundle_legacy_meta_key(bundle),
7778        ] {
7779            let existing: Option<String> = self
7780                .conn
7781                .query_row_map(
7782                    "SELECT value FROM meta WHERE key = ?1",
7783                    fparams![key.as_str()],
7784                    |row| row.get_typed(0),
7785                )
7786                .optional()?;
7787            if existing.is_some() {
7788                return Ok(true);
7789            }
7790        }
7791        Ok(false)
7792    }
7793
7794    pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
7795        for bundle in discover_historical_database_bundles(canonical_db_path) {
7796            if !self.historical_bundle_already_imported(&bundle)? {
7797                return Ok(true);
7798            }
7799        }
7800        Ok(false)
7801    }
7802
7803    fn load_historical_bundle_progress(
7804        &self,
7805        bundle: &HistoricalDatabaseBundle,
7806    ) -> Result<Option<HistoricalBundleProgress>> {
7807        for key in [
7808            Self::historical_bundle_progress_key(bundle),
7809            Self::historical_bundle_legacy_progress_key(bundle),
7810        ] {
7811            let raw: Option<String> = self
7812                .conn
7813                .query_row_map(
7814                    "SELECT value FROM meta WHERE key = ?1",
7815                    fparams![key.as_str()],
7816                    |row| row.get_typed(0),
7817                )
7818                .optional()?;
7819            let Some(raw) = raw else {
7820                continue;
7821            };
7822            let parsed: HistoricalBundleProgress =
7823                serde_json::from_str(&raw).with_context(|| {
7824                    format!(
7825                        "parsing historical salvage progress checkpoint for {}",
7826                        bundle.root_path.display()
7827                    )
7828                })?;
7829            if parsed.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
7830                return Ok(Some(parsed));
7831            }
7832        }
7833        Ok(None)
7834    }
7835
7836    fn record_historical_bundle_progress(
7837        &self,
7838        bundle: &HistoricalDatabaseBundle,
7839        method: &str,
7840        last_completed_source_row_id: i64,
7841        conversations_imported: usize,
7842        messages_imported: usize,
7843    ) -> Result<()> {
7844        let key = Self::historical_bundle_progress_key(bundle);
7845        let value = HistoricalBundleProgress {
7846            progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
7847            path: bundle.root_path.display().to_string(),
7848            bytes: bundle.total_bytes,
7849            modified_at_ms: bundle.modified_at_ms,
7850            method: method.to_string(),
7851            last_completed_source_row_id,
7852            conversations_imported,
7853            messages_imported,
7854            updated_at_ms: Self::now_millis(),
7855        };
7856        let value_str = serde_json::to_string(&value)?;
7857        self.conn.execute_compat(
7858            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7859            fparams![key.as_str(), value_str.as_str()],
7860        )?;
7861        Ok(())
7862    }
7863
7864    fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
7865        for key in [
7866            Self::historical_bundle_progress_key(bundle),
7867            Self::historical_bundle_legacy_progress_key(bundle),
7868        ] {
7869            self.conn
7870                .execute_compat("DELETE FROM meta WHERE key = ?1", fparams![key.as_str()])?;
7871        }
7872        Ok(())
7873    }
7874
7875    fn record_historical_bundle_import(
7876        &self,
7877        bundle: &HistoricalDatabaseBundle,
7878        method: &str,
7879        conversations_imported: usize,
7880        messages_imported: usize,
7881    ) -> Result<()> {
7882        let key = Self::historical_bundle_meta_key(bundle);
7883        let value = serde_json::json!({
7884            "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
7885            "path": bundle.root_path.display().to_string(),
7886            "bytes": bundle.total_bytes,
7887            "modified_at_ms": bundle.modified_at_ms,
7888            "method": method,
7889            "conversations_imported": conversations_imported,
7890            "messages_imported": messages_imported,
7891            "recorded_at_ms": Self::now_millis(),
7892        });
7893        let value_str = serde_json::to_string(&value)?;
7894        self.conn.execute_compat(
7895            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7896            fparams![key.as_str(), value_str.as_str()],
7897        )?;
7898        Ok(())
7899    }
7900
7901    fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
7902        const RETRYABLE_PATTERNS: &[&str] = &[
7903            "out of memory",
7904            "string or blob too big",
7905            "too many sql variables",
7906        ];
7907        err.chain().any(|cause| {
7908            let rendered = cause.to_string().to_ascii_lowercase();
7909            RETRYABLE_PATTERNS
7910                .iter()
7911                .any(|pattern| rendered.contains(pattern))
7912        })
7913    }
7914
7915    fn split_historical_batch_entry_messages(
7916        entry: &HistoricalBatchEntry,
7917    ) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
7918        if entry.conversation.messages.len() < 2 {
7919            return None;
7920        }
7921        let split_at = entry.conversation.messages.len() / 2;
7922        if split_at == 0 || split_at >= entry.conversation.messages.len() {
7923            return None;
7924        }
7925
7926        let mut left = entry.clone();
7927        left.conversation.messages = entry.conversation.messages[..split_at].to_vec();
7928
7929        let mut right = entry.clone();
7930        right.conversation.messages = entry.conversation.messages[split_at..].to_vec();
7931
7932        Some((left, right))
7933    }
7934
7935    fn import_historical_batch_with_retry<F>(
7936        entries: &[HistoricalBatchEntry],
7937        insert_batch: &mut F,
7938    ) -> Result<HistoricalBatchImportTotals>
7939    where
7940        F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
7941    {
7942        match insert_batch(entries) {
7943            Ok(totals) => Ok(totals),
7944            Err(err) if Self::historical_import_error_is_split_retryable(&err) => {
7945                if entries.len() > 1 {
7946                    let mid = entries.len() / 2;
7947                    tracing::warn!(
7948                        batch_entries = entries.len(),
7949                        split_left = mid,
7950                        split_right = entries.len() - mid,
7951                        error = %err,
7952                        "historical salvage batch failed; retrying in smaller sub-batches"
7953                    );
7954                    let left =
7955                        Self::import_historical_batch_with_retry(&entries[..mid], insert_batch)?;
7956                    let right =
7957                        Self::import_historical_batch_with_retry(&entries[mid..], insert_batch)?;
7958                    return Ok(HistoricalBatchImportTotals {
7959                        inserted_source_rows: left.inserted_source_rows
7960                            + right.inserted_source_rows,
7961                        inserted_messages: left.inserted_messages + right.inserted_messages,
7962                    });
7963                }
7964
7965                if let Some(entry) = entries.first()
7966                    && let Some((left, right)) = Self::split_historical_batch_entry_messages(entry)
7967                {
7968                    tracing::warn!(
7969                        source_row_id = entry.source_row_id,
7970                        message_count = entry.conversation.messages.len(),
7971                        error = %err,
7972                        "historical salvage conversation failed; retrying in smaller message slices"
7973                    );
7974                    let left_totals = Self::import_historical_batch_with_retry(
7975                        std::slice::from_ref(&left),
7976                        insert_batch,
7977                    )?;
7978                    let right_totals = Self::import_historical_batch_with_retry(
7979                        std::slice::from_ref(&right),
7980                        insert_batch,
7981                    )?;
7982                    return Ok(HistoricalBatchImportTotals {
7983                        inserted_source_rows: usize::from(
7984                            left_totals.inserted_source_rows > 0
7985                                || right_totals.inserted_source_rows > 0,
7986                        ),
7987                        inserted_messages: left_totals
7988                            .inserted_messages
7989                            .saturating_add(right_totals.inserted_messages),
7990                    });
7991                }
7992
7993                Err(err)
7994            }
7995            Err(err) => Err(err),
7996        }
7997    }
7998
7999    fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
8000        let sources: Vec<Source> = match source_conn.query_map_collect(
8001            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
8002             FROM sources",
8003            fparams![],
8004            |row| {
8005                let raw_source_id: String = row.get_typed(0)?;
8006                let kind_str: String = row.get_typed(1)?;
8007                let raw_host_label: Option<String> = row.get_typed(2)?;
8008                let config_json_raw: Option<String> = row.get_typed(5)?;
8009                let (source_id, source_kind, host_label) = normalized_storage_source_parts(
8010                    Some(raw_source_id.as_str()),
8011                    Some(kind_str.as_str()),
8012                    raw_host_label.as_deref(),
8013                );
8014                Ok(Source {
8015                    id: source_id,
8016                    kind: source_kind,
8017                    host_label,
8018                    machine_id: row.get_typed(3)?,
8019                    platform: row.get_typed(4)?,
8020                    config_json: config_json_raw.and_then(|raw| serde_json::from_str(&raw).ok()),
8021                    created_at: row.get_typed(6)?,
8022                    updated_at: row.get_typed(7)?,
8023                })
8024            },
8025        ) {
8026            Ok(rows) => rows,
8027            Err(err) => {
8028                tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
8029                return Ok(());
8030            }
8031        };
8032
8033        for source in sources {
8034            self.upsert_source(&source)?;
8035        }
8036        Ok(())
8037    }
8038
    /// Imports conversations and their messages from a historical database
    /// connection into this storage, batch by batch.
    ///
    /// Rows are read from `source_conn` in ascending `conversations.id` order.
    /// When a progress checkpoint exists for `bundle` with a positive
    /// `last_completed_source_row_id`, only rows with a larger id are read.
    /// Whenever a checkpoint exists, the running totals start from its counters.
    ///
    /// Conversations are accumulated into a pending batch that is flushed when
    /// the configured conversation, message, or payload-size limits are reached.
    /// Each flush goes through `import_historical_batch_with_retry` and then
    /// records a new checkpoint via `record_historical_bundle_progress`.
    /// Conversations that have no message rows are skipped.
    ///
    /// Returns `(conversations_imported, messages_imported)`, including any
    /// counts carried over from a resumed checkpoint.
    ///
    /// # Errors
    ///
    /// Propagates failures from querying `source_conn`, from upserting
    /// sources, agents, or workspaces, from inserting a batch, and from writing
    /// the checkpoint.
    fn import_historical_conversations(
        &self,
        bundle: &HistoricalDatabaseBundle,
        salvage_method: &str,
        source_conn: &FrankenConnection,
    ) -> Result<(usize, usize)> {
        let batch_limits = historical_import_batch_limits();
        let cache_enabled = IndexingCache::is_enabled();
        let mut indexing_cache = IndexingCache::new();
        // Ids of the sources already present in this storage; consulted below so
        // a placeholder is upserted at most once per unknown source id.
        let mut known_sources: HashSet<String> = self
            .list_sources()?
            .into_iter()
            .map(|source| source.id)
            .collect();
        let resume_progress = self.load_historical_bundle_progress(bundle)?;
        // A non-positive row id is treated as "no resume point".
        let resume_after_row_id = resume_progress
            .as_ref()
            .map(|progress| progress.last_completed_source_row_id)
            .filter(|row_id| *row_id > 0);

        tracing::info!(
            target: "cass::historical_salvage",
            batch_conversations = batch_limits.conversations,
            batch_messages = batch_limits.messages,
            batch_payload_chars = batch_limits.payload_chars,
            cache_enabled,
            resume_after_row_id,
            "configured historical salvage batch limits"
        );

        if let Some(progress) = &resume_progress {
            tracing::info!(
                target: "cass::historical_salvage",
                path = %bundle.root_path.display(),
                resume_after_row_id = progress.last_completed_source_row_id,
                prior_conversations_imported = progress.conversations_imported,
                prior_messages_imported = progress.messages_imported,
                "resuming historical salvage bundle from durable checkpoint"
            );
        }

        // LEFT JOIN + COALESCE on agents so legacy source databases with NULL
        // agent_id (the V1 schema did not require NOT NULL) still have their
        // conversations imported, degrading to 'unknown' slug like the other
        // rebuild paths.  Using INNER JOIN here would silently drop those
        // conversations during historical salvage, which is data loss.
        //
        // The two statements differ only in the `WHERE c.id > ?1` resume filter.
        let conv_sql = if resume_after_row_id.is_some() {
            "SELECT
                c.id,
                COALESCE(a.slug, 'unknown'),
                w.path,
                c.external_id,
                c.title,
                c.source_path,
                c.started_at,
                c.ended_at,
                c.approx_tokens,
                c.metadata_json,
                c.source_id,
                c.origin_host
             FROM conversations c
             LEFT JOIN agents a ON c.agent_id = a.id
             LEFT JOIN workspaces w ON c.workspace_id = w.id
             WHERE c.id > ?1
             ORDER BY c.id"
        } else {
            "SELECT
                c.id,
                COALESCE(a.slug, 'unknown'),
                w.path,
                c.external_id,
                c.title,
                c.source_path,
                c.started_at,
                c.ended_at,
                c.approx_tokens,
                c.metadata_json,
                c.source_id,
                c.origin_host
             FROM conversations c
             LEFT JOIN agents a ON c.agent_id = a.id
             LEFT JOIN workspaces w ON c.workspace_id = w.id
             ORDER BY c.id"
        };
        // Bind the resume row id only when the filtered statement was chosen.
        let conv_params: &[ParamValue] =
            if let Some(last_completed_source_row_id) = resume_after_row_id {
                &[ParamValue::from(last_completed_source_row_id)]
            } else {
                &[]
            };

        // All matching conversation rows are collected up front; the tuple
        // fields follow the SELECT column order above. Messages are fetched per
        // conversation inside the loop further down.
        #[allow(clippy::type_complexity)]
        let conv_rows: Vec<(
            i64,
            String,
            Option<String>,
            Option<String>,
            Option<String>,
            String,
            Option<i64>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<String>,
            Option<String>,
        )> = source_conn
            .query_map_collect(conv_sql, conv_params, |row| {
                Ok((
                    row.get_typed::<i64>(0)?,
                    row.get_typed::<String>(1)?,
                    row.get_typed::<Option<String>>(2)?,
                    row.get_typed::<Option<String>>(3)?,
                    row.get_typed::<Option<String>>(4)?,
                    row.get_typed::<String>(5)?,
                    row.get_typed::<Option<i64>>(6)?,
                    row.get_typed::<Option<i64>>(7)?,
                    row.get_typed::<Option<i64>>(8)?,
                    row.get_typed::<Option<String>>(9)?,
                    row.get_typed::<Option<String>>(10)?,
                    row.get_typed::<Option<String>>(11)?,
                ))
            })
            .context("querying historical conversations")?;

        let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
             FROM messages
             WHERE conversation_id = ?1
             ORDER BY idx";

        // Running totals are seeded from the checkpoint so that the counters
        // written to later checkpoints cover the whole bundle, not just this run.
        let mut imported_conversations = resume_progress
            .as_ref()
            .map(|progress| progress.conversations_imported)
            .unwrap_or(0);
        let mut imported_messages = resume_progress
            .as_ref()
            .map(|progress| progress.messages_imported)
            .unwrap_or(0);
        // State of the batch currently being accumulated.
        let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
        let mut pending_batch_messages = 0usize;
        let mut pending_batch_chars = 0usize;
        let mut pending_batch_first_row_id: Option<i64> = None;
        let mut pending_batch_last_row_id: Option<i64> = None;

        // Inserts the pending batch, updates the running totals, writes a
        // checkpoint at the batch's last source row id, and resets the
        // accumulators. All mutable state is passed in explicitly; `bundle` and
        // `salvage_method` are captured. An empty batch is a no-op.
        let flush_batch = |storage: &FrankenStorage,
                           batch: &mut Vec<HistoricalBatchEntry>,
                           pending_messages: &mut usize,
                           pending_chars: &mut usize,
                           first_row_id: &mut Option<i64>,
                           last_row_id: &mut Option<i64>,
                           imported_conversations: &mut usize,
                           imported_messages: &mut usize|
         -> Result<()> {
            if batch.is_empty() {
                return Ok(());
            }

            let batch_first_row_id = *first_row_id;
            let batch_last_row_id = *last_row_id;
            if historical_salvage_debug_enabled() {
                eprintln!(
                    "[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
                    batch_first_row_id,
                    batch_last_row_id,
                    batch.len(),
                    *pending_messages,
                    *pending_chars
                );
            }
            tracing::info!(
                target: "cass::historical_salvage",
                batch_conversations = batch.len(),
                batch_messages = *pending_messages,
                batch_payload_chars = *pending_chars,
                first_source_row_id = batch_first_row_id,
                last_source_row_id = batch_last_row_id,
                "flushing historical salvage batch"
            );

            // Inserts a slice of entries and tallies the result. It is handed to
            // the retry helper, which may call it again with smaller slices.
            let mut insert_batch =
                |entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
                    let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
                        .iter()
                        .map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
                        .collect();
                    let outcomes = storage
                        .insert_conversations_batched(&borrowed_batch)
                        .with_context(|| {
                            let first_source_row_id =
                                entries.first().map(|entry| entry.source_row_id);
                            let last_source_row_id =
                                entries.last().map(|entry| entry.source_row_id);
                            format!(
                                "inserting historical salvage batch source rows {:?}..{:?}",
                                first_source_row_id, last_source_row_id
                            )
                        })?;
                    let mut totals = HistoricalBatchImportTotals::default();
                    for outcome in outcomes {
                        // An outcome with no inserted message indices adds
                        // nothing to either counter.
                        if !outcome.inserted_indices.is_empty() {
                            totals.inserted_source_rows += 1;
                            totals.inserted_messages += outcome.inserted_indices.len();
                        }
                    }
                    Ok(totals)
                };
            let totals =
                Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
            *imported_conversations =
                (*imported_conversations).saturating_add(totals.inserted_source_rows);
            *imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
            // Checkpoint only after the batch insert has returned successfully.
            if let Some(last_completed_row_id) = batch_last_row_id {
                storage.record_historical_bundle_progress(
                    bundle,
                    salvage_method,
                    last_completed_row_id,
                    *imported_conversations,
                    *imported_messages,
                )?;
            }
            tracing::info!(
                target: "cass::historical_salvage",
                batch_conversations = batch.len(),
                batch_messages = *pending_messages,
                imported_conversations = *imported_conversations,
                imported_messages = *imported_messages,
                first_source_row_id = batch_first_row_id,
                last_source_row_id = batch_last_row_id,
                "historical salvage batch committed"
            );
            if historical_salvage_debug_enabled() {
                eprintln!(
                    "[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
                    batch_first_row_id,
                    batch_last_row_id,
                    *imported_conversations,
                    *imported_messages
                );
            }
            // Reset the accumulators for the next batch.
            batch.clear();
            *pending_messages = 0;
            *pending_chars = 0;
            *first_row_id = None;
            *last_row_id = None;
            Ok(())
        };

        for (
            conversation_row_id,
            agent_slug,
            workspace_path,
            external_id,
            title,
            source_path,
            started_at,
            ended_at,
            approx_tokens,
            metadata_json_raw,
            raw_source_id,
            raw_origin_host,
        ) in conv_rows
        {
            let source_id = crate::search::tantivy::normalized_index_source_id(
                raw_source_id.as_deref(),
                None,
                raw_origin_host.as_deref(),
            );
            let origin_host =
                crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());

            // Load this conversation's messages in `idx` order. Role strings
            // other than the known ones are preserved via `MessageRole::Other`.
            let messages: Vec<Message> = source_conn
                .query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
                    let role: String = msg_row.get_typed(1)?;
                    Ok(Message {
                        id: None,
                        idx: msg_row.get_typed(0)?,
                        role: match role.as_str() {
                            "user" => MessageRole::User,
                            "agent" | "assistant" => MessageRole::Agent,
                            "tool" => MessageRole::Tool,
                            "system" => MessageRole::System,
                            other => MessageRole::Other(other.to_string()),
                        },
                        author: msg_row.get_typed(2)?,
                        created_at: msg_row.get_typed(3)?,
                        content: msg_row.get_typed(4)?,
                        extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
                        snippets: Vec::new(),
                    })
                })
                .context("collecting historical message rows")?;

            // Conversations without any message rows are skipped entirely.
            if messages.is_empty() {
                continue;
            }

            let conversation_message_count = messages.len();
            let conversation_chars = messages
                .iter()
                .map(message_payload_size_hint)
                .sum::<usize>();

            let conversation = Conversation {
                id: None,
                agent_slug: agent_slug.clone(),
                workspace: workspace_path.map(PathBuf::from),
                external_id,
                title,
                source_path: PathBuf::from(source_path),
                started_at,
                ended_at,
                approx_tokens,
                metadata_json: parse_json_column(metadata_json_raw),
                messages,
                source_id,
                origin_host,
            };

            // The conversation references a source id not yet registered here:
            // upsert a placeholder (the local source, or an SSH-kind source
            // labelled with the origin host) and remember the id.
            if !known_sources.contains(&conversation.source_id) {
                let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
                    Source::local()
                } else {
                    Source {
                        id: conversation.source_id.clone(),
                        kind: SourceKind::Ssh,
                        host_label: conversation.origin_host.clone(),
                        machine_id: None,
                        platform: None,
                        config_json: None,
                        created_at: None,
                        updated_at: None,
                    }
                };
                self.upsert_source(&placeholder)?;
                known_sources.insert(conversation.source_id.clone());
            }

            // Resolve the agent and workspace ids, going through the indexing
            // cache when it is enabled.
            let agent = Agent {
                id: None,
                slug: agent_slug.clone(),
                name: agent_slug,
                version: None,
                kind: AgentKind::Cli,
            };
            let agent_id = if cache_enabled {
                indexing_cache.get_or_insert_agent(self, &agent)?
            } else {
                self.ensure_agent(&agent)?
            };
            let workspace_id = if let Some(workspace) = &conversation.workspace {
                if cache_enabled {
                    Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
                } else {
                    Some(self.ensure_workspace(workspace, None)?)
                }
            } else {
                None
            };

            // Flush first when adding this conversation would push a non-empty
            // batch past the message or payload limit, or when the batch is
            // already at the conversation limit.
            let exceeds_pending_limits = !pending_batch.is_empty()
                && (pending_batch.len() >= batch_limits.conversations
                    || pending_batch_messages.saturating_add(conversation_message_count)
                        > batch_limits.messages
                    || pending_batch_chars.saturating_add(conversation_chars)
                        > batch_limits.payload_chars);
            if exceeds_pending_limits {
                flush_batch(
                    self,
                    &mut pending_batch,
                    &mut pending_batch_messages,
                    &mut pending_batch_chars,
                    &mut pending_batch_first_row_id,
                    &mut pending_batch_last_row_id,
                    &mut imported_conversations,
                    &mut imported_messages,
                )?;
            }

            if pending_batch_first_row_id.is_none() {
                pending_batch_first_row_id = Some(conversation_row_id);
            }
            pending_batch_last_row_id = Some(conversation_row_id);
            pending_batch_messages =
                pending_batch_messages.saturating_add(conversation_message_count);
            pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
            pending_batch.push(HistoricalBatchEntry {
                source_row_id: conversation_row_id,
                agent_id,
                workspace_id,
                conversation,
            });

            // Flush as soon as any limit has been reached.
            if pending_batch.len() >= batch_limits.conversations
                || pending_batch_messages >= batch_limits.messages
                || pending_batch_chars >= batch_limits.payload_chars
            {
                flush_batch(
                    self,
                    &mut pending_batch,
                    &mut pending_batch_messages,
                    &mut pending_batch_chars,
                    &mut pending_batch_first_row_id,
                    &mut pending_batch_last_row_id,
                    &mut imported_conversations,
                    &mut imported_messages,
                )?;
            }
        }

        // Flush whatever remains after the last row.
        flush_batch(
            self,
            &mut pending_batch,
            &mut pending_batch_messages,
            &mut pending_batch_chars,
            &mut pending_batch_first_row_id,
            &mut pending_batch_last_row_id,
            &mut imported_conversations,
            &mut imported_messages,
        )?;

        if cache_enabled {
            let (hits, misses, hit_rate) = indexing_cache.stats();
            tracing::info!(
                target: "cass::historical_salvage",
                hits,
                misses,
                hit_rate = format!("{:.1}%", hit_rate * 100.0),
                agents = indexing_cache.agent_count(),
                workspaces = indexing_cache.workspace_count(),
                sources = known_sources.len(),
                "historical salvage cache stats"
            );
        }

        Ok((imported_conversations, imported_messages))
    }
8474
8475    pub fn salvage_historical_databases(
8476        &self,
8477        canonical_db_path: &Path,
8478    ) -> Result<HistoricalSalvageOutcome> {
8479        let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
8480        let mut outcome = HistoricalSalvageOutcome {
8481            bundles_considered: ordered_bundles.len(),
8482            ..HistoricalSalvageOutcome::default()
8483        };
8484
8485        for bundle in ordered_bundles {
8486            if self.historical_bundle_already_imported(&bundle)? {
8487                self.clear_historical_bundle_progress(&bundle)?;
8488                continue;
8489            }
8490
8491            let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
8492                format!(
8493                    "opening historical bundle {} for salvage",
8494                    bundle.root_path.display()
8495                )
8496            }) {
8497                Ok(source) => source,
8498                Err(err) => {
8499                    tracing::warn!(
8500                        path = %bundle.root_path.display(),
8501                        error = %err,
8502                        "skipping unreadable historical cass database bundle during salvage"
8503                    );
8504                    self.clear_historical_bundle_progress(&bundle)?;
8505                    continue;
8506                }
8507            };
8508
8509            // #247 (coding_agent_session_search-r8pcy): if a per-bundle progress
8510            // checkpoint already covers the backup's entire conversation row-id
8511            // space, the bundle was effectively fully imported but the daemon was
8512            // killed (e.g. OOM) before the completion ledger marker landed.
8513            // Re-scanning it is a pure O(n) no-op — every batch commits
8514            // imported=0 while taking 5-12 min. Detect it via the high-water
8515            // checkpoint, write the ledger marker, drop the checkpoint, and skip.
8516            if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
8517                let backup_max_conversation_id: i64 = source
8518                    .conn
8519                    .query_row_map(
8520                        "SELECT COALESCE(MAX(id), 0) FROM conversations",
8521                        fparams![],
8522                        |row| row.get_typed(0),
8523                    )
8524                    .unwrap_or(0);
8525                if backup_max_conversation_id > 0
8526                    && progress.last_completed_source_row_id >= backup_max_conversation_id
8527                {
8528                    self.record_historical_bundle_import(
8529                        &bundle,
8530                        source.method,
8531                        progress.conversations_imported,
8532                        progress.messages_imported,
8533                    )?;
8534                    self.clear_historical_bundle_progress(&bundle)?;
8535                    tracing::info!(
8536                        path = %bundle.root_path.display(),
8537                        last_completed_source_row_id = progress.last_completed_source_row_id,
8538                        backup_max_conversation_id,
8539                        conversations_imported = progress.conversations_imported,
8540                        messages_imported = progress.messages_imported,
8541                        "historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
8542                    );
8543                    continue;
8544                }
8545            }
8546
8547            self.import_historical_sources(&source.conn)?;
8548            let (imported_conversations, imported_messages) =
8549                self.import_historical_conversations(&bundle, source.method, &source.conn)?;
8550            self.record_historical_bundle_import(
8551                &bundle,
8552                source.method,
8553                imported_conversations,
8554                imported_messages,
8555            )?;
8556            self.clear_historical_bundle_progress(&bundle)?;
8557
8558            outcome.bundles_imported += 1;
8559            outcome.conversations_imported += imported_conversations;
8560            outcome.messages_imported += imported_messages;
8561
8562            tracing::info!(
8563                path = %bundle.root_path.display(),
8564                bytes = bundle.total_bytes,
8565                method = source.method,
8566                imported_conversations,
8567                imported_messages,
8568                "salvaged historical cass database bundle"
8569            );
8570        }
8571
8572        Ok(outcome)
8573    }
8574
8575    /// Delete a source by ID. Returns true if a row was deleted.
8576    pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
8577        if id == LOCAL_SOURCE_ID {
8578            anyhow::bail!("cannot delete the local source");
8579        }
8580        let count = self
8581            .conn
8582            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
8583        if count > 0 {
8584            self.invalidate_conversation_source_cache(id);
8585        }
8586        Ok(count > 0)
8587    }
8588
8589    /// Insert a conversation tree (conversation + messages + snippets + FTS).
8590    pub fn insert_conversation_tree(
8591        &self,
8592        agent_id: i64,
8593        workspace_id: Option<i64>,
8594        conv: &Conversation,
8595    ) -> Result<InsertOutcome> {
8596        let normalized_conv = normalized_conversation_for_storage(conv);
8597        let conv = normalized_conv.as_ref();
8598        self.ensure_source_for_conversation(conv)?;
8599        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
8600        let defer_analytics_updates = defer_analytics_updates_enabled();
8601        let conversation_key = conversation_merge_key(agent_id, conv);
8602        let mut tx = self.conn.transaction()?;
8603        let existing = franken_find_existing_conversation_with_tail_by_key(
8604            &tx,
8605            &conversation_key,
8606            Some(conv),
8607        )?;
8608        if let Some(existing) = existing {
8609            let outcome = self.franken_append_messages_with_tail_in_tx(
8610                &tx,
8611                agent_id,
8612                existing.id,
8613                conv,
8614                existing.tail_state,
8615                defer_lexical_updates,
8616                defer_analytics_updates,
8617            )?;
8618            tx.commit()?;
8619            return Ok(outcome);
8620        }
8621
8622        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
8623            &tx,
8624            agent_id,
8625            workspace_id,
8626            conv,
8627            &conversation_key,
8628        )? {
8629            ConversationInsertStatus::Inserted(conv_id) => conv_id,
8630            ConversationInsertStatus::Existing(existing_id) => {
8631                let ExistingMessageLookup {
8632                    by_idx: mut existing_messages,
8633                    replay: mut existing_replay_fingerprints,
8634                } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
8635                let ExistingConversationNewMessages {
8636                    messages: new_messages,
8637                    new_chars,
8638                    idx_collision_count,
8639                    first_collision_idx,
8640                } = collect_new_messages_for_existing_conversation(
8641                    existing_id,
8642                    conv,
8643                    &mut existing_messages,
8644                    &mut existing_replay_fingerprints,
8645                    "skipping replay-equivalent recovered message with shifted idx",
8646                );
8647                let (inserted_last_idx, inserted_last_created_at) =
8648                    borrowed_messages_tail_state(&new_messages);
8649                let mut inserted_indices = Vec::new();
8650                let mut fts_entries = Vec::new();
8651                let mut fts_pending_chars = 0usize;
8652                let mut _fts_inserted_total = 0usize;
8653                let inserted_message_ids =
8654                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
8655                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8656                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8657                    if !defer_lexical_updates {
8658                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8659                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8660                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8661                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8662                        {
8663                            flush_pending_fts_entries(
8664                                self,
8665                                &tx,
8666                                &mut fts_entries,
8667                                &mut fts_pending_chars,
8668                                &mut _fts_inserted_total,
8669                            )?;
8670                        }
8671                    }
8672                    inserted_indices.push(msg.idx);
8673                }
8674
8675                if idx_collision_count > 0 {
8676                    tracing::warn!(
8677                        conversation_id = existing_id,
8678                        collision_count = idx_collision_count,
8679                        first_idx = first_collision_idx,
8680                        source_path = %conv.source_path.display(),
8681                        "message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
8682                    );
8683                }
8684
8685                if !defer_lexical_updates {
8686                    flush_pending_fts_entries(
8687                        self,
8688                        &tx,
8689                        &mut fts_entries,
8690                        &mut fts_pending_chars,
8691                        &mut _fts_inserted_total,
8692                    )?;
8693                }
8694
8695                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
8696                franken_update_conversation_tail_state(
8697                    &tx,
8698                    existing_id,
8699                    conv_last_ts,
8700                    inserted_last_idx,
8701                    inserted_last_created_at,
8702                )?;
8703                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
8704                {
8705                    franken_update_external_conversation_tail_lookup_key(
8706                        &tx,
8707                        &lookup_key,
8708                        conv_last_ts,
8709                        inserted_last_idx,
8710                        inserted_last_created_at,
8711                    )?;
8712                }
8713
8714                if !defer_analytics_updates && !inserted_indices.is_empty() {
8715                    franken_update_daily_stats_in_tx(
8716                        self,
8717                        &tx,
8718                        &conv.agent_slug,
8719                        &conv.source_id,
8720                        conversation_effective_started_at(conv),
8721                        StatsDelta {
8722                            session_count_delta: 0,
8723                            message_count_delta: inserted_indices.len() as i64,
8724                            total_chars_delta: new_chars,
8725                        },
8726                    )?;
8727                }
8728
8729                tx.commit()?;
8730                return Ok(InsertOutcome {
8731                    conversation_id: existing_id,
8732                    conversation_inserted: false,
8733                    inserted_indices,
8734                });
8735            }
8736        };
8737        let mut fts_entries = Vec::new();
8738        let mut fts_pending_chars = 0usize;
8739        let mut _fts_inserted_total = 0usize;
8740        let mut total_chars: i64 = 0;
8741        let mut inserted_indices = Vec::new();
8742        let mut pending_messages = HashMap::new();
8743        let mut pending_replay_fingerprints = HashSet::new();
8744        let mut idx_collision_count = 0usize;
8745        let mut first_collision_idx: Option<i64> = None;
8746        let mut new_messages = Vec::new();
8747        for msg in &conv.messages {
8748            let incoming_fingerprint = message_merge_fingerprint(msg);
8749            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
8750                if existing_fingerprint != &incoming_fingerprint {
8751                    idx_collision_count = idx_collision_count.saturating_add(1);
8752                    first_collision_idx.get_or_insert(msg.idx);
8753                }
8754                continue;
8755            }
8756            let incoming_replay = message_replay_fingerprint(msg);
8757            if pending_replay_fingerprints.contains(&incoming_replay) {
8758                tracing::debug!(
8759                    conversation_id = conv_id,
8760                    idx = msg.idx,
8761                    source_path = %conv.source_path.display(),
8762                    "skipping replay-equivalent duplicate message within new conversation insert"
8763                );
8764                continue;
8765            }
8766            pending_messages.insert(msg.idx, incoming_fingerprint);
8767            pending_replay_fingerprints.insert(incoming_replay);
8768            new_messages.push(msg);
8769        }
8770        let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
8771        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8772            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8773            if !defer_lexical_updates {
8774                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8775                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8776                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8777                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8778                {
8779                    flush_pending_fts_entries(
8780                        self,
8781                        &tx,
8782                        &mut fts_entries,
8783                        &mut fts_pending_chars,
8784                        &mut _fts_inserted_total,
8785                    )?;
8786                }
8787            }
8788            total_chars += msg.content.len() as i64;
8789            inserted_indices.push(msg.idx);
8790        }
8791        if idx_collision_count > 0 {
8792            tracing::warn!(
8793                conversation_id = conv_id,
8794                collision_count = idx_collision_count,
8795                first_idx = first_collision_idx,
8796                source_path = %conv.source_path.display(),
8797                "message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
8798            );
8799        }
8800        if !defer_lexical_updates {
8801            flush_pending_fts_entries(
8802                self,
8803                &tx,
8804                &mut fts_entries,
8805                &mut fts_pending_chars,
8806                &mut _fts_inserted_total,
8807            )?;
8808        }
8809
8810        if !defer_analytics_updates {
8811            franken_update_daily_stats_in_tx(
8812                self,
8813                &tx,
8814                &conv.agent_slug,
8815                &conv.source_id,
8816                conversation_effective_started_at(conv),
8817                StatsDelta {
8818                    session_count_delta: 1,
8819                    message_count_delta: inserted_indices.len() as i64,
8820                    total_chars_delta: total_chars,
8821                },
8822            )?;
8823        }
8824
8825        tx.commit()?;
8826        Ok(InsertOutcome {
8827            conversation_id: conv_id,
8828            conversation_inserted: true,
8829            inserted_indices,
8830        })
8831    }
8832
/// Test-only, instrumented variant of the new-conversation insert path.
///
/// Performs the same steps as the "new conversation" branch of
/// `insert_conversation_tree` while adding the elapsed wall-clock time of each
/// phase (source ensure, transaction open, existing lookup, conversation row,
/// message insert, snippet insert, FTS entry build, FTS flush, analytics,
/// commit, total) to the matching field of `profile`. On success it also bumps
/// `profile.invocations`, `profile.messages`, and `profile.inserted_messages`.
///
/// # Errors
/// Returns an error when the conversation already exists (either at the
/// up-front lookup or when the insert resolves to an existing row), and
/// propagates any storage helper failure.
#[cfg(test)]
fn insert_conversation_tree_with_profile(
    &self,
    agent_id: i64,
    workspace_id: Option<i64>,
    conv: &Conversation,
    profile: &mut InsertConversationTreePerfProfile,
) -> Result<InsertOutcome> {
    let overall_timer = Instant::now();
    let normalized = normalized_conversation_for_storage(conv);
    let conv = normalized.as_ref();

    let timer = Instant::now();
    self.ensure_source_for_conversation(conv)?;
    profile.source_duration += timer.elapsed();

    let skip_lexical = defer_storage_lexical_updates_enabled();
    let skip_analytics = defer_analytics_updates_enabled();
    let merge_key = conversation_merge_key(agent_id, conv);

    let timer = Instant::now();
    let mut tx = self.conn.transaction()?;
    profile.tx_open_duration += timer.elapsed();

    let timer = Instant::now();
    let prior = franken_find_existing_conversation_by_key(&tx, &merge_key, Some(conv))?;
    profile.existing_lookup_duration += timer.elapsed();
    // This helper only measures the fresh-insert path.
    if let Some(existing_id) = prior {
        return Err(anyhow!(
            "profile helper expects new conversation path, found existing id {existing_id}"
        ));
    }

    let row_timer = Instant::now();
    let insert_status = franken_insert_conversation_or_get_existing_after_miss(
        &tx,
        agent_id,
        workspace_id,
        conv,
        &merge_key,
    )?;
    let conv_id = match insert_status {
        ConversationInsertStatus::Inserted(new_id) => new_id,
        ConversationInsertStatus::Existing(existing_id) => {
            return Err(anyhow!(
                "profile helper expected inserted conversation row, reused existing id {existing_id}"
            ));
        }
    };
    profile.conversation_row_duration += row_timer.elapsed();

    // De-duplicate incoming messages against each other (not timed).
    let mut seen_by_idx = HashMap::new();
    let mut seen_replay = HashSet::new();
    let mut idx_collision_count = 0usize;
    let mut first_collision_idx: Option<i64> = None;
    let mut accepted = Vec::new();
    for message in &conv.messages {
        let merge_fp = message_merge_fingerprint(message);
        match seen_by_idx.get(&message.idx) {
            // Same idx, different content: count it and keep the first variant.
            Some(prior_fp) if prior_fp != &merge_fp => {
                idx_collision_count = idx_collision_count.saturating_add(1);
                first_collision_idx.get_or_insert(message.idx);
                continue;
            }
            Some(_) => continue,
            None => {}
        }

        // `insert` returns false when an equal fingerprint is already present.
        if !seen_replay.insert(message_replay_fingerprint(message)) {
            tracing::debug!(
                conversation_id = conv_id,
                idx = message.idx,
                source_path = %conv.source_path.display(),
                "skipping replay-equivalent duplicate message within profiled new conversation insert"
            );
            continue;
        }

        seen_by_idx.insert(message.idx, merge_fp);
        accepted.push(message);
    }

    let timer = Instant::now();
    let row_ids = franken_batch_insert_new_messages_with_profile(
        &tx,
        conv_id,
        &accepted,
        &mut profile.message_insert_breakdown,
    )?;
    profile.message_insert_duration += timer.elapsed();

    let mut fts_batch = Vec::new();
    let mut fts_batch_chars = 0usize;
    let mut fts_written_total = 0usize;
    let mut total_chars: i64 = 0;
    let mut inserted_indices = Vec::new();
    for (row_id, message) in row_ids.into_iter().zip(accepted) {
        let timer = Instant::now();
        franken_insert_snippets(&tx, row_id, &message.snippets)?;
        profile.snippet_insert_duration += timer.elapsed();

        if !skip_lexical {
            let timer = Instant::now();
            fts_batch.push(FtsEntry::from_message(row_id, message, conv));
            fts_batch_chars = fts_batch_chars.saturating_add(message.content.len());
            profile.fts_entry_duration += timer.elapsed();

            let batch_full = fts_batch.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                || fts_batch_chars >= FTS_ENTRY_BATCH_MAX_CHARS;
            if batch_full {
                let timer = Instant::now();
                flush_pending_fts_entries(
                    self,
                    &tx,
                    &mut fts_batch,
                    &mut fts_batch_chars,
                    &mut fts_written_total,
                )?;
                profile.fts_flush_duration += timer.elapsed();
            }
        }

        total_chars += message.content.len() as i64;
        inserted_indices.push(message.idx);
    }

    if idx_collision_count > 0 {
        tracing::warn!(
            conversation_id = conv_id,
            collision_count = idx_collision_count,
            first_idx = first_collision_idx,
            source_path = %conv.source_path.display(),
            "message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
        );
    }

    // Write out whatever is still buffered below the batch thresholds.
    if !skip_lexical {
        let timer = Instant::now();
        flush_pending_fts_entries(
            self,
            &tx,
            &mut fts_batch,
            &mut fts_batch_chars,
            &mut fts_written_total,
        )?;
        profile.fts_flush_duration += timer.elapsed();
    }

    if !skip_analytics {
        let timer = Instant::now();
        franken_update_daily_stats_in_tx(
            self,
            &tx,
            &conv.agent_slug,
            &conv.source_id,
            conversation_effective_started_at(conv),
            StatsDelta {
                session_count_delta: 1,
                message_count_delta: inserted_indices.len() as i64,
                total_chars_delta: total_chars,
            },
        )?;
        profile.analytics_duration += timer.elapsed();
    }

    let timer = Instant::now();
    tx.commit()?;
    profile.commit_duration += timer.elapsed();

    profile.invocations += 1;
    profile.messages += conv.messages.len();
    profile.inserted_messages += inserted_indices.len();
    profile.total_duration += overall_timer.elapsed();

    Ok(InsertOutcome {
        conversation_id: conv_id,
        conversation_inserted: true,
        inserted_indices,
    })
}
9012
/// Test-only, instrumented variant of the append-to-existing-conversation path.
///
/// Looks up the conversation by merge key (it must already exist), works out
/// which incoming messages are new, writes them with their snippets and
/// (unless deferred) FTS rows, refreshes the stored tail state, and commits.
/// The elapsed wall-clock time of each phase is added to the matching field
/// of `profile`; on success `profile.invocations`, `profile.messages`, and
/// `profile.inserted_messages` are bumped too.
///
/// New messages come from `collect_append_only_tail_messages` when the stored
/// tail state yields a plan; otherwise the stored messages are looked up and
/// filtered with `collect_new_messages_for_existing_conversation`.
///
/// `_workspace_id` is not consulted by this method.
///
/// # Errors
/// Returns an error when no existing conversation matches the merge key, and
/// propagates any storage helper failure.
#[cfg(test)]
fn append_existing_conversation_with_profile(
    &self,
    agent_id: i64,
    _workspace_id: Option<i64>,
    conv: &Conversation,
    profile: &mut InsertConversationTreePerfProfile,
) -> Result<InsertOutcome> {
    let overall_timer = Instant::now();
    let normalized = normalized_conversation_for_storage(conv);
    let conv = normalized.as_ref();

    let timer = Instant::now();
    self.ensure_source_for_conversation(conv)?;
    profile.source_duration += timer.elapsed();

    let skip_lexical = defer_storage_lexical_updates_enabled();
    let skip_analytics = defer_analytics_updates_enabled();
    let conversation_key = conversation_merge_key(agent_id, conv);

    let timer = Instant::now();
    let mut tx = self.conn.transaction()?;
    profile.tx_open_duration += timer.elapsed();

    let timer = Instant::now();
    let lookup = franken_find_existing_conversation_with_tail_by_key(
        &tx,
        &conversation_key,
        Some(conv),
    )?;
    profile.existing_lookup_duration += timer.elapsed();
    let Some(target) = lookup else {
        return Err(anyhow!(
            "append profile helper expects existing conversation for {conversation_key:?}"
        ));
    };
    let existing_id = target.id;

    // Try to derive the new messages from the stored tail state alone.
    let timer = Instant::now();
    let stored_tail = target.tail_state;
    let stored_ended_at = stored_tail.and_then(|tail| tail.ended_at);
    let tail_plan = stored_tail.as_ref().and_then(|tail| {
        collect_append_only_tail_messages(
            conv,
            tail.last_message_idx,
            tail.last_message_created_at,
        )
    });
    let used_append_tail_plan = tail_plan.is_some();
    profile.existing_idx_lookup_duration += timer.elapsed();

    let timer = Instant::now();
    let ExistingConversationNewMessages {
        messages: fresh_messages,
        new_chars,
        idx_collision_count,
        first_collision_idx,
    } = match tail_plan {
        Some(plan) => plan,
        // No tail plan: compare against what is already stored.
        None => {
            let ExistingMessageLookup {
                by_idx: mut known_by_idx,
                replay: mut known_replay,
            } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
            collect_new_messages_for_existing_conversation(
                existing_id,
                conv,
                &mut known_by_idx,
                &mut known_replay,
                "skipping replay-equivalent profiled append message with shifted idx",
            )
        }
    };
    profile.dedupe_filter_duration += timer.elapsed();

    // Capture the tail before `fresh_messages` is consumed by the loop below.
    let (tail_idx, tail_created_at) = borrowed_messages_tail_state(&fresh_messages);

    let timer = Instant::now();
    let row_ids = franken_append_insert_new_messages_with_profile(
        &tx,
        existing_id,
        &fresh_messages,
        &mut profile.message_insert_breakdown,
    )?;
    profile.message_insert_duration += timer.elapsed();

    let mut inserted_indices = Vec::new();
    let mut fts_batch = Vec::new();
    let mut fts_batch_chars = 0usize;
    let mut fts_written_total = 0usize;
    for (row_id, message) in row_ids.into_iter().zip(fresh_messages) {
        let timer = Instant::now();
        franken_insert_snippets(&tx, row_id, &message.snippets)?;
        profile.snippet_insert_duration += timer.elapsed();

        if !skip_lexical {
            let timer = Instant::now();
            fts_batch.push(FtsEntry::from_message(row_id, message, conv));
            fts_batch_chars = fts_batch_chars.saturating_add(message.content.len());
            profile.fts_entry_duration += timer.elapsed();

            let batch_full = fts_batch.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                || fts_batch_chars >= FTS_ENTRY_BATCH_MAX_CHARS;
            if batch_full {
                let timer = Instant::now();
                flush_pending_fts_entries(
                    self,
                    &tx,
                    &mut fts_batch,
                    &mut fts_batch_chars,
                    &mut fts_written_total,
                )?;
                profile.fts_flush_duration += timer.elapsed();
            }
        }

        inserted_indices.push(message.idx);
    }

    if idx_collision_count > 0 {
        tracing::warn!(
            conversation_id = existing_id,
            collision_count = idx_collision_count,
            first_idx = first_collision_idx,
            source_path = %conv.source_path.display(),
            "message idx collisions encountered while profiling append merge; retaining canonical message variants"
        );
    }

    // Write out whatever is still buffered below the batch thresholds.
    if !skip_lexical {
        let timer = Instant::now();
        flush_pending_fts_entries(
            self,
            &tx,
            &mut fts_batch,
            &mut fts_batch_chars,
            &mut fts_written_total,
        )?;
        profile.fts_flush_duration += timer.elapsed();
    }

    let row_timer = Instant::now();
    // True only when the tail state was set directly from the last appended message.
    let exact_append_tail_set = if used_append_tail_plan {
        match (tail_idx, tail_created_at) {
            (Some(last_idx), Some(last_created_at)) => {
                if stored_ended_at.is_none_or(|ended_at| ended_at <= last_created_at) {
                    franken_set_conversation_tail_state_after_append(
                        &tx,
                        existing_id,
                        last_created_at,
                        last_idx,
                        last_created_at,
                    )?;
                    true
                } else {
                    franken_update_conversation_tail_state(
                        &tx,
                        existing_id,
                        Some(last_created_at),
                        tail_idx,
                        tail_created_at,
                    )?;
                    false
                }
            }
            // The plan produced no message with both idx and timestamp: leave the row alone.
            _ => false,
        }
    } else {
        let newest_ts = conv
            .messages
            .iter()
            .filter_map(|message| message.created_at)
            .max();
        franken_update_conversation_tail_state(
            &tx,
            existing_id,
            newest_ts,
            tail_idx,
            tail_created_at,
        )?;
        false
    };
    franken_update_external_conversation_tail_after_append(
        &tx,
        agent_id,
        conv,
        used_append_tail_plan,
        exact_append_tail_set,
        tail_idx,
        tail_created_at,
    )?;
    profile.conversation_row_duration += row_timer.elapsed();

    let stats_due = !(skip_analytics || inserted_indices.is_empty());
    if stats_due {
        let timer = Instant::now();
        franken_update_daily_stats_in_tx(
            self,
            &tx,
            &conv.agent_slug,
            &conv.source_id,
            conversation_effective_started_at(conv),
            StatsDelta {
                session_count_delta: 0,
                message_count_delta: inserted_indices.len() as i64,
                total_chars_delta: new_chars,
            },
        )?;
        profile.analytics_duration += timer.elapsed();
    }

    let timer = Instant::now();
    tx.commit()?;
    profile.commit_duration += timer.elapsed();

    profile.invocations += 1;
    profile.messages += conv.messages.len();
    profile.inserted_messages += inserted_indices.len();
    profile.total_duration += overall_timer.elapsed();

    Ok(InsertOutcome {
        conversation_id: existing_id,
        conversation_inserted: false,
        inserted_indices,
    })
}
9228
9229    /// Append new messages to an existing conversation within an active transaction.
9230    #[allow(clippy::too_many_arguments)]
9231    fn franken_append_messages_with_tail_in_tx(
9232        &self,
9233        tx: &FrankenTransaction<'_>,
9234        agent_id: i64,
9235        conversation_id: i64,
9236        conv: &Conversation,
9237        append_tail_state: Option<ExistingConversationTailState>,
9238        defer_lexical_updates: bool,
9239        defer_analytics_updates: bool,
9240    ) -> Result<InsertOutcome> {
9241        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
9242        let append_plan = append_tail_state.as_ref().and_then(|state| {
9243            collect_append_only_tail_messages(
9244                conv,
9245                state.last_message_idx,
9246                state.last_message_created_at,
9247            )
9248        });
9249        let used_append_tail_plan = append_plan.is_some();
9250        let ExistingConversationNewMessages {
9251            messages: new_messages,
9252            new_chars,
9253            idx_collision_count,
9254            first_collision_idx,
9255        } = if let Some(append_plan) = append_plan {
9256            append_plan
9257        } else {
9258            let ExistingMessageLookup {
9259                by_idx: mut existing_messages,
9260                replay: mut existing_replay_fingerprints,
9261            } = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
9262            collect_new_messages_for_existing_conversation(
9263                conversation_id,
9264                conv,
9265                &mut existing_messages,
9266                &mut existing_replay_fingerprints,
9267                "skipping replay-equivalent recovered message with shifted idx",
9268            )
9269        };
9270
9271        let mut inserted_indices = Vec::new();
9272        let mut fts_entries = Vec::new();
9273        let mut fts_pending_chars = 0usize;
9274        let mut _fts_inserted_total = 0usize;
9275        let (inserted_last_idx, inserted_last_created_at) =
9276            borrowed_messages_tail_state(&new_messages);
9277        let inserted_message_ids =
9278            franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
9279        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9280            franken_insert_snippets(tx, msg_id, &msg.snippets)?;
9281            if !defer_lexical_updates {
9282                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9283                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9284                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9285                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9286                {
9287                    flush_pending_fts_entries(
9288                        self,
9289                        tx,
9290                        &mut fts_entries,
9291                        &mut fts_pending_chars,
9292                        &mut _fts_inserted_total,
9293                    )?;
9294                }
9295            }
9296            inserted_indices.push(msg.idx);
9297        }
9298
9299        if idx_collision_count > 0 {
9300            tracing::warn!(
9301                conversation_id,
9302                collision_count = idx_collision_count,
9303                first_idx = first_collision_idx,
9304                source_path = %conv.source_path.display(),
9305                "message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
9306            );
9307        }
9308
9309        if !defer_lexical_updates {
9310            flush_pending_fts_entries(
9311                self,
9312                tx,
9313                &mut fts_entries,
9314                &mut fts_pending_chars,
9315                &mut _fts_inserted_total,
9316            )?;
9317        }
9318
9319        let mut exact_append_tail_set = false;
9320        if used_append_tail_plan {
9321            if let (Some(last_message_idx), Some(last_message_created_at)) =
9322                (inserted_last_idx, inserted_last_created_at)
9323            {
9324                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
9325                    franken_set_conversation_tail_state_after_append(
9326                        tx,
9327                        conversation_id,
9328                        last_message_created_at,
9329                        last_message_idx,
9330                        last_message_created_at,
9331                    )?;
9332                    exact_append_tail_set = true;
9333                } else {
9334                    franken_update_conversation_tail_state(
9335                        tx,
9336                        conversation_id,
9337                        Some(last_message_created_at),
9338                        inserted_last_idx,
9339                        inserted_last_created_at,
9340                    )?;
9341                }
9342            }
9343        } else {
9344            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
9345            franken_update_conversation_tail_state(
9346                tx,
9347                conversation_id,
9348                conv_last_ts,
9349                inserted_last_idx,
9350                inserted_last_created_at,
9351            )?;
9352        }
9353        franken_update_external_conversation_tail_after_append(
9354            tx,
9355            agent_id,
9356            conv,
9357            used_append_tail_plan,
9358            exact_append_tail_set,
9359            inserted_last_idx,
9360            inserted_last_created_at,
9361        )?;
9362
9363        if !defer_analytics_updates && !inserted_indices.is_empty() {
9364            let message_count = inserted_indices.len() as i64;
9365            franken_update_daily_stats_in_tx(
9366                self,
9367                tx,
9368                &conv.agent_slug,
9369                &conv.source_id,
9370                conversation_effective_started_at(conv),
9371                StatsDelta {
9372                    session_count_delta: 0,
9373                    message_count_delta: message_count,
9374                    total_chars_delta: new_chars,
9375                },
9376            )?;
9377        }
9378
9379        Ok(InsertOutcome {
9380            conversation_id,
9381            conversation_inserted: false,
9382            inserted_indices,
9383        })
9384    }
9385
9386    /// Rebuild the FTS5 index from scratch (chunked to avoid OOM on large databases, #110).
9387    pub fn rebuild_fts(&self) -> Result<()> {
9388        self.rebuild_fts_via_frankensqlite().map(|_| ())
9389    }
9390
9391    /// Best-effort repair for the derived SQLite FTS fallback index.
9392    ///
9393    /// The canonical archive and Tantivy index remain authoritative, so callers
9394    /// should invoke this from maintenance paths rather than ordinary opens.
9395    pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
9396        self.ensure_fts_consistency_via_frankensqlite()
9397    }
9398
9399    pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
9400        &self,
9401        archive_fingerprint: &str,
9402    ) -> Result<bool> {
9403        Ok(
9404            self.read_fts_franken_rebuild_generation()? == Some(FTS_FRANKEN_REBUILD_GENERATION)
9405                && self
9406                    .read_fts_franken_rebuild_archive_fingerprint()?
9407                    .as_deref()
9408                    == Some(archive_fingerprint),
9409        )
9410    }
9411
9412    pub(crate) fn record_search_fallback_fts_archive_fingerprint(
9413        &self,
9414        archive_fingerprint: &str,
9415    ) -> Result<()> {
9416        self.conn
9417            .execute_compat(
9418                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9419                fparams![
9420                    FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY,
9421                    archive_fingerprint.to_string()
9422                ],
9423            )
9424            .with_context(|| "recording frankensqlite FTS archive fingerprint")?;
9425        Ok(())
9426    }
9427
9428    pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
9429        &self,
9430        archive_fingerprint: &str,
9431    ) -> Result<bool> {
9432        Ok(
9433            self.read_daily_stats_health_generation()? == Some(DAILY_STATS_HEALTH_GENERATION)
9434                && self.read_daily_stats_archive_fingerprint()?.as_deref()
9435                    == Some(archive_fingerprint),
9436        )
9437    }
9438
9439    pub(crate) fn record_daily_stats_archive_fingerprint(
9440        &self,
9441        archive_fingerprint: &str,
9442    ) -> Result<()> {
9443        self.conn
9444            .execute_compat(
9445                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9446                fparams![
9447                    DAILY_STATS_HEALTH_GENERATION_META_KEY,
9448                    DAILY_STATS_HEALTH_GENERATION.to_string()
9449                ],
9450            )
9451            .with_context(|| "recording daily_stats health generation")?;
9452        self.conn
9453            .execute_compat(
9454                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9455                fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
9456            )
9457            .with_context(|| "recording daily_stats archive fingerprint")?;
9458        Ok(())
9459    }
9460
9461    fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
9462        let value: Option<String> = self
9463            .conn
9464            .query_row_map(
9465                "SELECT value FROM meta WHERE key = ?1",
9466                fparams![FTS_FRANKEN_REBUILD_META_KEY],
9467                |row| row.get_typed(0),
9468            )
9469            .optional()?;
9470        Ok(value.and_then(|v| v.parse::<i64>().ok()))
9471    }
9472
9473    fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
9474        Ok(self
9475            .conn
9476            .query_row_map(
9477                "SELECT value FROM meta WHERE key = ?1",
9478                fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
9479                |row| row.get_typed(0),
9480            )
9481            .optional()?)
9482    }
9483
9484    fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
9485        let value: Option<String> = self
9486            .conn
9487            .query_row_map(
9488                "SELECT value FROM meta WHERE key = ?1",
9489                fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
9490                |row| row.get_typed(0),
9491            )
9492            .optional()?;
9493        Ok(value.and_then(|value| value.parse::<i64>().ok()))
9494    }
9495
9496    fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
9497        Ok(self
9498            .conn
9499            .query_row_map(
9500                "SELECT value FROM meta WHERE key = ?1",
9501                fparams![DAILY_STATS_HEALTH_META_KEY],
9502                |row| row.get_typed(0),
9503            )
9504            .optional()?)
9505    }
9506
9507    fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
9508        self.conn
9509            .execute_compat(
9510                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9511                fparams![
9512                    FTS_FRANKEN_REBUILD_META_KEY,
9513                    FTS_FRANKEN_REBUILD_GENERATION.to_string()
9514                ],
9515            )
9516            .with_context(|| "recording frankensqlite FTS rebuild generation")?;
9517        Ok(())
9518    }
9519
9520    fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
9521        if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
9522            // Before triggering an expensive full rebuild, probe whether
9523            // fts_messages is already populated and consistent.  On large
9524            // databases the rebuild can take hours and OOM — skip it when
9525            // the only thing missing is the generation marker (#184).
9526            let fts_already_healthy = (|| -> Result<bool> {
9527                let fts_exists: i64 = self.conn.query_row_map(
9528                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9529                    fparams![],
9530                    |row| row.get_typed(0),
9531                )?;
9532                if fts_exists != 1 {
9533                    return Ok(false);
9534                }
9535                let total: i64 = self.conn.query_row_map(
9536                    "SELECT COUNT(*) FROM messages",
9537                    fparams![],
9538                    |row| row.get_typed(0),
9539                )?;
9540                if total == 0 {
9541                    return Ok(false);
9542                }
9543                let indexed: i64 = self.conn.query_row_map(
9544                    "SELECT COUNT(*) FROM fts_messages",
9545                    fparams![],
9546                    |row| row.get_typed(0),
9547                )?;
9548                // Consider healthy if >=90% of messages are indexed
9549                Ok(indexed > 0 && indexed * 100 >= total * 90)
9550            })()
9551            .unwrap_or(false);
9552
9553            if fts_already_healthy {
9554                tracing::info!(
9555                    target: "cass::fts_rebuild",
9556                    "FTS already populated and consistent; setting generation marker without rebuild"
9557                );
9558                self.record_fts_franken_rebuild_generation()?;
9559                self.set_fts_messages_present_cache(true);
9560            } else {
9561                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9562                self.record_fts_franken_rebuild_generation()?;
9563                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9564            }
9565        }
9566
9567        let inspection = (|| -> Result<(i64, bool)> {
9568            let fts_schema_rows = self.conn.query_row_map(
9569                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9570                fparams![],
9571                |row| row.get_typed::<i64>(0),
9572            )?;
9573            let fts_queryable = fts_schema_rows == 1
9574                && self
9575                    .conn
9576                    .query("SELECT rowid FROM fts_messages LIMIT 1")
9577                    .is_ok();
9578            Ok((fts_schema_rows, fts_queryable))
9579        })();
9580
9581        let (fts_schema_rows, fts_queryable) = match inspection {
9582            Ok(result) => result,
9583            Err(err) => {
9584                tracing::warn!(
9585                    error = %err,
9586                    "frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
9587                );
9588                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9589                self.record_fts_franken_rebuild_generation()?;
9590                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9591            }
9592        };
9593
9594        if fts_schema_rows != 1 || !fts_queryable {
9595            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9596            self.record_fts_franken_rebuild_generation()?;
9597            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9598        }
9599
9600        let total_messages =
9601            self.conn
9602                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
9603                    row.get_typed::<i64>(0)
9604                })?;
9605        let indexed_messages =
9606            self.conn
9607                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
9608                    row.get_typed::<i64>(0)
9609                })?;
9610
9611        if indexed_messages == total_messages {
9612            self.set_fts_messages_present_cache(true);
9613            return Ok(FtsConsistencyRepair::AlreadyHealthy {
9614                rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
9615            });
9616        }
9617
9618        if indexed_messages > total_messages {
9619            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9620            self.record_fts_franken_rebuild_generation()?;
9621            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9622        }
9623
9624        let inserted_rows = self
9625            .stream_fts_rows_via_frankensqlite(true)
9626            .with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
9627        let repaired_rows =
9628            self.conn
9629                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
9630                    row.get_typed::<i64>(0)
9631                })?;
9632        if repaired_rows == total_messages {
9633            self.set_fts_messages_present_cache(true);
9634            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
9635                inserted_rows,
9636                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
9637            });
9638        }
9639
9640        // The incremental catch-up found nothing to insert, yet the gap
9641        // between total_messages (all rows, including orphans) and
9642        // indexed_messages (only rows with valid conversation_id, since the
9643        // FTS INSERT inner-joins on conversations) remains.  A full rebuild
9644        // cannot close this gap either — the orphaned messages will be
9645        // excluded again — so falling through to one would just re-do ~5 min
9646        // of work on every startup.  Accept the current state.
9647        if inserted_rows == 0 {
9648            tracing::debug!(
9649                target: "cass::fts_rebuild",
9650                indexed_messages = repaired_rows,
9651                total_messages,
9652                un_indexable_gap = total_messages.saturating_sub(repaired_rows),
9653                "FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
9654            );
9655            self.set_fts_messages_present_cache(true);
9656            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
9657                inserted_rows: 0,
9658                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
9659            });
9660        }
9661
9662        // Incremental made progress but didn't fully close the gap — something
9663        // is genuinely inconsistent, so do a full rebuild.
9664        let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9665        self.record_fts_franken_rebuild_generation()?;
9666        Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
9667    }
9668
9669    pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
9670        self.invalidate_fts_messages_present_cache();
9671        self.conn
9672            .execute("DROP TABLE IF EXISTS fts_messages;")
9673            .with_context(|| "dropping derived fts_messages before frankensqlite rebuild")?;
9674        self.conn
9675            .execute_compat(FTS5_REGISTER_SQL, fparams![])
9676            .with_context(|| "creating derived fts_messages via frankensqlite rebuild")?;
9677        self.set_fts_messages_present_cache(true);
9678
9679        self.stream_fts_rows_via_frankensqlite(false)
9680    }
9681
9682    fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
9683        let batch_size = fts_rebuild_batch_size().max(1);
9684        let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
9685        let mut total_inserted: usize = 0;
9686        let mut total_skipped_orphans: usize = 0;
9687        let mut total_skipped_existing: usize = 0;
9688        let mut last_rowid: i64 = 0;
9689        let conversation_by_id = self.load_fts_conversation_projection_map()?;
9690        let agent_slug_by_id = self.load_fts_agent_slug_map()?;
9691        let workspace_path_by_id = self.load_fts_workspace_path_map()?;
9692        let existing_fts_rowids = if missing_only {
9693            Some(self.load_fts_message_rowid_set()?)
9694        } else {
9695            None
9696        };
9697        let mut entries = Vec::new();
9698        let mut pending_chars = 0usize;
9699
9700        loop {
9701            let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
9702            let fetched_count = rows.len();
9703            if fetched_count == 0 {
9704                break;
9705            }
9706
9707            let inserted_before_batch = total_inserted;
9708            let skipped_before_batch = total_skipped_orphans;
9709            let existing_before_batch = total_skipped_existing;
9710
9711            for row in rows {
9712                last_rowid = row.rowid;
9713                if existing_fts_rowids
9714                    .as_ref()
9715                    .is_some_and(|rowids| rowids.contains(&row.message_id))
9716                {
9717                    total_skipped_existing = total_skipped_existing.saturating_add(1);
9718                    continue;
9719                }
9720                let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
9721                    total_skipped_orphans = total_skipped_orphans.saturating_add(1);
9722                    continue;
9723                };
9724                let agent = conversation
9725                    .agent_id
9726                    .and_then(|agent_id| agent_slug_by_id.get(&agent_id))
9727                    .filter(|slug| !slug.is_empty())
9728                    .cloned()
9729                    .unwrap_or_else(|| "unknown".to_string());
9730                let workspace = conversation
9731                    .workspace_id
9732                    .and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
9733                    .cloned()
9734                    .unwrap_or_default();
9735                pending_chars = pending_chars.saturating_add(row.content.len());
9736                entries.push(FtsEntry {
9737                    content: row.content,
9738                    title: conversation.title.clone(),
9739                    agent,
9740                    workspace,
9741                    source_path: conversation.source_path.clone(),
9742                    created_at: row.created_at,
9743                    message_id: row.message_id,
9744                });
9745                if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9746                    || pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9747                {
9748                    total_inserted = total_inserted.saturating_add(
9749                        franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
9750                    );
9751                    entries.clear();
9752                    pending_chars = 0;
9753                }
9754            }
9755
9756            if !entries.is_empty() {
9757                total_inserted = total_inserted.saturating_add(
9758                    franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
9759                );
9760                entries.clear();
9761                pending_chars = 0;
9762            }
9763
9764            tracing::debug!(
9765                target: "cass::fts_rebuild",
9766                batch_rows = fetched_count,
9767                batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
9768                batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
9769                batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
9770                total_inserted,
9771                total_skipped_orphans,
9772                total_skipped_existing,
9773                last_rowid,
9774                missing_only,
9775                "FTS streaming maintenance batch complete"
9776            );
9777
9778            if fetched_count < batch_size {
9779                break;
9780            }
9781        }
9782
9783        Ok(total_inserted)
9784    }
9785
9786    fn fetch_fts_rebuild_message_rows(
9787        &self,
9788        last_rowid: i64,
9789        batch_limit: i64,
9790    ) -> Result<Vec<FtsRebuildMessageRow>> {
9791        self.conn
9792            .query_map_collect(
9793                "SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
9794                 FROM messages m
9795                 WHERE m.rowid > ?1
9796                 ORDER BY m.rowid
9797                 LIMIT ?2",
9798                fparams![last_rowid, batch_limit],
9799                |row| {
9800                    Ok(FtsRebuildMessageRow {
9801                        rowid: row.get_typed(0)?,
9802                        message_id: row.get_typed(1)?,
9803                        conversation_id: row.get_typed(2)?,
9804                        content: row.get_typed::<Option<String>>(3)?.unwrap_or_default(),
9805                        created_at: row.get_typed(4)?,
9806                    })
9807                },
9808            )
9809            .with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
9810    }
9811
9812    fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
9813        let rows: Vec<i64> = self
9814            .conn
9815            .query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
9816                row.get_typed(0)
9817            })
9818            .with_context(|| "loading existing FTS message rowids")?;
9819        Ok(rows.into_iter().collect())
9820    }
9821
9822    fn load_fts_conversation_projection_map(
9823        &self,
9824    ) -> Result<HashMap<i64, FtsConversationProjection>> {
9825        let rows: Vec<(i64, FtsConversationProjection)> = self
9826            .conn
9827            .query_map_collect(
9828                "SELECT id, title, agent_id, workspace_id, source_path
9829                 FROM conversations",
9830                fparams![],
9831                |row| {
9832                    Ok((
9833                        row.get_typed(0)?,
9834                        FtsConversationProjection {
9835                            title: row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
9836                            agent_id: row.get_typed(2)?,
9837                            workspace_id: row.get_typed(3)?,
9838                            source_path: row.get_typed::<Option<String>>(4)?.unwrap_or_default(),
9839                        },
9840                    ))
9841                },
9842            )
9843            .with_context(|| "loading FTS conversation projection map")?;
9844        Ok(rows.into_iter().collect())
9845    }
9846
9847    fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
9848        let rows: Vec<(i64, String)> = self
9849            .conn
9850            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
9851                Ok((
9852                    row.get_typed(0)?,
9853                    row.get_typed::<Option<String>>(1)?
9854                        .unwrap_or_else(|| "unknown".to_string()),
9855                ))
9856            })
9857            .with_context(|| "loading FTS agent slug map")?;
9858        Ok(rows.into_iter().collect())
9859    }
9860
9861    fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
9862        let rows: Vec<(i64, String)> = self
9863            .conn
9864            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
9865                Ok((
9866                    row.get_typed(0)?,
9867                    row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
9868                ))
9869            })
9870            .with_context(|| "loading FTS workspace path map")?;
9871        Ok(rows.into_iter().collect())
9872    }
9873
9874    /// Fetch all messages for embedding generation.
9875    pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
9876        // COALESCE(c.agent_id, 0) so legacy V1 conversations with NULL
9877        // agent_id don't cause a runtime row-decode failure (agent_id in
9878        // MessageForEmbedding is i64).  saturating_u32_from_i64 downstream
9879        // turns 0 into the "unknown agent" sentinel for doc-id hashing.
9880        self.conn
9881            .query_map_collect(
9882                "SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
9883                 FROM messages m
9884                 JOIN conversations c ON m.conversation_id = c.id
9885                 ORDER BY m.id",
9886                fparams![],
9887                |row| {
9888                    let source_id: String = row.get_typed::<Option<String>>(4)?
9889                        .unwrap_or_else(|| "local".to_string());
9890                    Ok(MessageForEmbedding {
9891                        message_id: row.get_typed(0)?,
9892                        created_at: row.get_typed(1)?,
9893                        agent_id: row.get_typed(2)?,
9894                        workspace_id: row.get_typed(3)?,
9895                        source_id_hash: crc32fast::hash(source_id.as_bytes()),
9896                        role: row.get_typed(5)?,
9897                        content: row.get_typed(6)?,
9898                    })
9899                },
9900            )
9901            .with_context(|| "fetching messages for embedding")
9902    }
9903
9904    /// Get the watermark for incremental semantic embedding.
9905    pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
9906        let result: Result<String, _> = self.conn.query_row_map(
9907            "SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
9908            fparams![],
9909            |row| row.get_typed(0),
9910        );
9911        match result.optional() {
9912            Ok(Some(s)) => Ok(s.parse().ok()),
9913            Ok(None) => Ok(None),
9914            Err(e) => Err(e.into()),
9915        }
9916    }
9917
9918    /// Set the watermark for incremental semantic embedding.
9919    pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
9920        self.conn.execute_compat(
9921            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)",
9922            fparams![id.to_string()],
9923        )?;
9924        Ok(())
9925    }
9926
9927    /// Get embedding jobs for a database path.
9928    pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
9929        self.conn
9930            .query_map_collect(
9931                "SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
9932                 FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
9933                fparams![db_path],
9934                |row| {
9935                    Ok(EmbeddingJobRow {
9936                        id: row.get_typed(0)?,
9937                        db_path: row.get_typed(1)?,
9938                        model_id: row.get_typed(2)?,
9939                        status: row.get_typed(3)?,
9940                        total_docs: row.get_typed(4)?,
9941                        completed_docs: row.get_typed(5)?,
9942                        error_message: row.get_typed(6)?,
9943                        created_at: row.get_typed(7)?,
9944                        started_at: row.get_typed(8)?,
9945                        completed_at: row.get_typed(9)?,
9946                    })
9947                },
9948            )
9949            .with_context(|| format!("fetching embedding jobs for {db_path}"))
9950    }
9951
9952    /// Create or update an embedding job.
9953    pub fn upsert_embedding_job(
9954        &self,
9955        db_path: &str,
9956        model_id: &str,
9957        total_docs: i64,
9958    ) -> Result<i64> {
9959        let updated = self.conn.execute_compat(
9960            "UPDATE embedding_jobs
9961             SET total_docs = ?3
9962             WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
9963            fparams![db_path, model_id, total_docs],
9964        )?;
9965        if updated == 0 {
9966            let insert_result = self.conn.execute_compat(
9967                "INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
9968                fparams![db_path, model_id, total_docs],
9969            );
9970            if let Err(err) = insert_result {
9971                if !matches!(err, frankensqlite::FrankenError::UniqueViolation { .. }) {
9972                    return Err(err.into());
9973                }
9974                self.conn.execute_compat(
9975                    "UPDATE embedding_jobs
9976                     SET total_docs = ?3
9977                     WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
9978                    fparams![db_path, model_id, total_docs],
9979                )?;
9980            }
9981        }
9982        self.conn
9983            .query_row_map(
9984                "SELECT id FROM embedding_jobs
9985                 WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
9986                 ORDER BY id DESC
9987                 LIMIT 1",
9988                fparams![db_path, model_id],
9989                |row| row.get_typed(0),
9990            )
9991            .with_context(|| "resolving embedding job id after upsert")
9992    }
9993
9994    /// Mark an embedding job as started.
9995    pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
9996        self.conn.execute_compat(
9997            "UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1",
9998            fparams![job_id],
9999        )?;
10000        Ok(())
10001    }
10002
10003    /// Mark an embedding job as completed.
10004    pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
10005        self.conn.execute_compat(
10006            "UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1",
10007            fparams![job_id],
10008        )?;
10009        Ok(())
10010    }
10011
10012    /// Mark an embedding job as failed.
10013    pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
10014        self.conn.execute_compat(
10015            "UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
10016            fparams![job_id, error],
10017        )?;
10018        Ok(())
10019    }
10020
10021    /// Cancel embedding jobs for a database path.
10022    pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
10023        if let Some(mid) = model_id {
10024            Ok(self.conn.execute_compat(
10025                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10026                fparams![db_path, mid],
10027            )?)
10028        } else {
10029            Ok(self.conn.execute_compat(
10030                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
10031                fparams![db_path],
10032            )?)
10033        }
10034    }
10035
10036    /// Update embedding job progress.
10037    pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
10038        self.conn.execute_compat(
10039            "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1",
10040            fparams![job_id, completed_docs],
10041        )?;
10042        Ok(())
10043    }
10044
10045    // =====================================================================
10046    // Analytics query methods
10047    // =====================================================================
10048
10049    /// Get session count for a date range using materialized stats.
10050    /// Returns (count, is_from_cache) where is_from_cache is true if from daily_stats.
10051    ///
10052    /// Falls back to COUNT(*) query when daily_stats table is empty or stale.
10053    pub fn count_sessions_in_range(
10054        &self,
10055        start_ts_ms: Option<i64>,
10056        end_ts_ms: Option<i64>,
10057        agent_slug: Option<&str>,
10058        source_id: Option<&str>,
10059    ) -> Result<(i64, bool)> {
10060        let agent = agent_slug.unwrap_or("all");
10061        let source = source_id.unwrap_or("all");
10062
10063        // Check if we have materialized stats
10064        let stats_count: i64 = self
10065            .conn
10066            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10067                row.get_typed(0)
10068            })
10069            .unwrap_or(0);
10070
10071        if stats_count == 0 {
10072            return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
10073        }
10074
10075        // Use materialized stats
10076        let start_day = start_ts_ms.map(Self::day_id_from_millis);
10077        let end_day = end_ts_ms.map(Self::day_id_from_millis);
10078
10079        let count: i64 = match (start_day, end_day) {
10080            (Some(start), Some(end)) => self.conn.query_row_map(
10081                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10082                 WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
10083                fparams![start, end, agent, source],
10084                |row| row.get_typed(0),
10085            )?,
10086            (Some(start), None) => self.conn.query_row_map(
10087                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10088                 WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
10089                fparams![start, agent, source],
10090                |row| row.get_typed(0),
10091            )?,
10092            (None, Some(end)) => self.conn.query_row_map(
10093                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10094                 WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
10095                fparams![end, agent, source],
10096                |row| row.get_typed(0),
10097            )?,
10098            (None, None) => self.conn.query_row_map(
10099                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10100                 WHERE agent_slug = ?1 AND source_id = ?2",
10101                fparams![agent, source],
10102                |row| row.get_typed(0),
10103            )?,
10104        };
10105
10106        Ok((count, true))
10107    }
10108
10109    /// Direct COUNT(*) query as fallback when daily_stats is empty.
10110    fn count_sessions_direct(
10111        &self,
10112        start_ts_ms: Option<i64>,
10113        end_ts_ms: Option<i64>,
10114        agent_slug: Option<&str>,
10115        source_id: Option<&str>,
10116    ) -> Result<(i64, bool)> {
10117        // Build dynamic SQL with positional params.  Single-table scan of
10118        // conversations; filter on agent slug via an EXISTS subquery only
10119        // when that filter is actually requested.  This avoids the unneeded
10120        // 2-table JOIN (which also silently dropped legacy conversations
10121        // with NULL agent_id) and sidesteps frankensqlite's materialization
10122        // fallback entirely.
10123        let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
10124        let mut param_values: Vec<ParamValue> = Vec::new();
10125        let mut idx = 1;
10126
10127        if let Some(start) = start_ts_ms {
10128            sql.push_str(&format!(" AND c.started_at >= ?{idx}"));
10129            param_values.push(ParamValue::from(start));
10130            idx += 1;
10131        }
10132        if let Some(end) = end_ts_ms {
10133            sql.push_str(&format!(" AND c.started_at <= ?{idx}"));
10134            param_values.push(ParamValue::from(end));
10135            idx += 1;
10136        }
10137        if let Some(agent) = agent_slug
10138            && agent != "all"
10139        {
10140            sql.push_str(&format!(
10141                " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{idx})"
10142            ));
10143            param_values.push(ParamValue::from(agent));
10144            idx += 1;
10145        }
10146        if let Some(source) = source_id
10147            && source != "all"
10148        {
10149            sql.push_str(&format!(" AND c.source_id = ?{idx}"));
10150            param_values.push(ParamValue::from(source));
10151            let _ = idx; // suppress unused warning
10152        }
10153
10154        let count: i64 = self
10155            .conn
10156            .query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
10157        Ok((count, false))
10158    }
10159
10160    /// Get daily histogram data for a date range.
10161    pub fn get_daily_histogram(
10162        &self,
10163        start_ts_ms: i64,
10164        end_ts_ms: i64,
10165        agent_slug: Option<&str>,
10166        source_id: Option<&str>,
10167    ) -> Result<Vec<DailyCount>> {
10168        let start_day = Self::day_id_from_millis(start_ts_ms);
10169        let end_day = Self::day_id_from_millis(end_ts_ms);
10170        let agent = agent_slug.unwrap_or("all");
10171        let source = source_id.unwrap_or("all");
10172
10173        let rows = self.conn.query_map_collect(
10174            "SELECT day_id, session_count, message_count, total_chars
10175             FROM daily_stats
10176             WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
10177             ORDER BY day_id",
10178            fparams![start_day, end_day, agent, source],
10179            |row| {
10180                Ok(DailyCount {
10181                    day_id: row.get_typed(0)?,
10182                    sessions: row.get_typed(1)?,
10183                    messages: row.get_typed(2)?,
10184                    chars: row.get_typed(3)?,
10185                })
10186            },
10187        )?;
10188
10189        Ok(rows)
10190    }
10191
10192    /// Check health of daily stats table.
10193    pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
10194        let row_count: i64 =
10195            self.conn
10196                .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10197                    row.get_typed(0)
10198                })?;
10199
10200        let oldest_update: Option<i64> = self.conn.query_row_map(
10201            "SELECT MIN(last_updated) FROM daily_stats",
10202            fparams![],
10203            |row| row.get_typed(0),
10204        )?;
10205
10206        let conversation_count: i64 =
10207            self.conn
10208                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
10209                    row.get_typed(0)
10210                })?;
10211
10212        let materialized_total: i64 = self.conn.query_row_map(
10213            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10214                 WHERE agent_slug = 'all' AND source_id = 'all'",
10215            fparams![],
10216            |row| row.get_typed(0),
10217        )?;
10218
10219        Ok(DailyStatsHealth {
10220            populated: row_count > 0,
10221            row_count,
10222            oldest_update_ms: oldest_update,
10223            conversation_count,
10224            materialized_total,
10225            drift: (conversation_count - materialized_total).abs(),
10226        })
10227    }
10228
10229    /// Batch insert multiple conversations with full analytics (token usage,
10230    /// message metrics, rollups).  Frankensqlite equivalent of
10231    /// `SqliteStorage::insert_conversations_batched`.
10232    pub fn insert_conversations_batched(
10233        &self,
10234        conversations: &[(i64, Option<i64>, &Conversation)],
10235    ) -> Result<Vec<InsertOutcome>> {
10236        if conversations.is_empty() {
10237            return Ok(Vec::new());
10238        }
10239
10240        self.ensure_sources_for_batch(conversations)?;
10241
10242        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
10243        let defer_analytics_updates = defer_analytics_updates_enabled();
10244
10245        let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
10246            tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
10247            PricingTable { entries: Vec::new() }
10248        });
10249        let mut pricing_diag = PricingDiagnostics::default();
10250
10251        let mut tx = self.conn.transaction()?;
10252
10253        // Bug #167: Ensure all referenced agents, workspaces, and sources
10254        // exist inside the transaction so FK checks pass.  The caller resolves
10255        // IDs via ensure_agent / ensure_workspace / ensure_sources_for_batch
10256        // outside the transaction, but those autocommit writes may not be
10257        // visible inside the transaction snapshot in frankensqlite.  Re-verify
10258        // (and insert if missing) within the tx.
10259        ensure_agents_in_tx(&tx, conversations)?;
10260        ensure_workspaces_in_tx(&tx, conversations)?;
10261        ensure_sources_in_tx(&tx, conversations)?;
10262
10263        let mut outcomes = Vec::with_capacity(conversations.len());
10264        let mut fts_entries = Vec::new();
10265        let mut fts_pending_chars = 0usize;
10266        let mut fts_inserted_total = 0usize;
10267        let mut fts_count_total = 0usize;
10268        let mut stats = StatsAggregator::new();
10269        let mut token_stats = TokenStatsAggregator::new();
10270        let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
10271        let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
10272        let mut rollup_agg = AnalyticsRollupAggregator::new();
10273        let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
10274        let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
10275        let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
10276            HashMap::new();
10277        let mut pending_message_replay_fingerprints: HashMap<
10278            i64,
10279            HashSet<MessageReplayFingerprint>,
10280        > = HashMap::new();
10281
10282        for &(agent_id, workspace_id, raw_conv) in conversations {
10283            let normalized_conv = normalized_conversation_for_storage(raw_conv);
10284            let conv = normalized_conv.as_ref();
10285            let mut total_chars: i64 = 0;
10286            let mut inserted_indices = Vec::with_capacity(conv.messages.len());
10287            let mut inserted_messages: Vec<(i64, &Message)> =
10288                Vec::with_capacity(conv.messages.len());
10289            let mut session_count_delta = 1_i64;
10290            let conversation_key = conversation_merge_key(agent_id, conv);
10291
10292            let existing_conv_id = if let Some(existing_id) =
10293                pending_conversation_ids.get(&conversation_key)
10294            {
10295                Some(*existing_id)
10296            } else {
10297                let existing_id =
10298                    franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
10299                if let Some(existing_id) = existing_id {
10300                    pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10301                }
10302                existing_id
10303            };
10304
10305            let conv_id = if let Some(existing_id) = existing_conv_id {
10306                session_count_delta = 0;
10307                let ExistingMessageLookup {
10308                    by_idx: mut existing_messages,
10309                    replay: mut existing_replay_fingerprints,
10310                } = franken_existing_message_lookup_with_pending(
10311                    &tx,
10312                    existing_id,
10313                    &conv.messages,
10314                    &mut pending_message_fingerprints,
10315                    &mut pending_message_replay_fingerprints,
10316                )?;
10317                let ExistingConversationNewMessages {
10318                    messages: new_messages,
10319                    new_chars,
10320                    idx_collision_count,
10321                    first_collision_idx,
10322                } = collect_new_messages_for_existing_conversation(
10323                    existing_id,
10324                    conv,
10325                    &mut existing_messages,
10326                    &mut existing_replay_fingerprints,
10327                    "skipping replay-equivalent recovered message with shifted idx during batched merge",
10328                );
10329                let (inserted_last_idx, inserted_last_created_at) =
10330                    borrowed_messages_tail_state(&new_messages);
10331                let inserted_message_ids =
10332                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10333                total_chars += new_chars;
10334                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10335                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10336                    if !defer_lexical_updates {
10337                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10338                        fts_count_total += 1;
10339                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
10340                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10341                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10342                        {
10343                            flush_pending_fts_entries(
10344                                self,
10345                                &tx,
10346                                &mut fts_entries,
10347                                &mut fts_pending_chars,
10348                                &mut fts_inserted_total,
10349                            )?;
10350                        }
10351                    }
10352                    inserted_indices.push(msg.idx);
10353                    inserted_messages.push((msg_id, msg));
10354                }
10355
10356                if idx_collision_count > 0 {
10357                    tracing::warn!(
10358                        conversation_id = existing_id,
10359                        collision_count = idx_collision_count,
10360                        first_idx = first_collision_idx,
10361                        source_path = %conv.source_path.display(),
10362                        "message idx collisions encountered during batched conversation merge; retaining canonical message variants"
10363                    );
10364                }
10365
10366                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10367                franken_update_conversation_tail_state(
10368                    &tx,
10369                    existing_id,
10370                    conv_last_ts,
10371                    inserted_last_idx,
10372                    inserted_last_created_at,
10373                )?;
10374                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
10375                {
10376                    franken_update_external_conversation_tail_lookup_key(
10377                        &tx,
10378                        &lookup_key,
10379                        conv_last_ts,
10380                        inserted_last_idx,
10381                        inserted_last_created_at,
10382                    )?;
10383                }
10384
10385                pending_message_fingerprints.insert(existing_id, existing_messages);
10386                pending_message_replay_fingerprints
10387                    .insert(existing_id, existing_replay_fingerprints);
10388
10389                existing_id
10390            } else {
10391                match franken_insert_conversation_or_get_existing(
10392                    &tx,
10393                    agent_id,
10394                    workspace_id,
10395                    conv,
10396                )? {
10397                    ConversationInsertStatus::Inserted(new_conv_id) => {
10398                        pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
10399                        let pending_messages =
10400                            pending_message_fingerprints.entry(new_conv_id).or_default();
10401                        let pending_replay_fingerprints = pending_message_replay_fingerprints
10402                            .entry(new_conv_id)
10403                            .or_default();
10404                        let mut new_messages = Vec::new();
10405                        for msg in &conv.messages {
10406                            let incoming_replay = message_replay_fingerprint(msg);
10407                            if pending_messages.contains_key(&msg.idx)
10408                                || pending_replay_fingerprints.contains(&incoming_replay)
10409                            {
10410                                continue;
10411                            }
10412                            pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
10413                            pending_replay_fingerprints.insert(incoming_replay);
10414                            new_messages.push(msg);
10415                        }
10416                        let inserted_message_ids =
10417                            franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
10418                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10419                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10420                            if !defer_lexical_updates {
10421                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10422                                fts_count_total += 1;
10423                                fts_pending_chars =
10424                                    fts_pending_chars.saturating_add(msg.content.len());
10425                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10426                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10427                                {
10428                                    flush_pending_fts_entries(
10429                                        self,
10430                                        &tx,
10431                                        &mut fts_entries,
10432                                        &mut fts_pending_chars,
10433                                        &mut fts_inserted_total,
10434                                    )?;
10435                                }
10436                            }
10437                            total_chars += msg.content.len() as i64;
10438                            inserted_indices.push(msg.idx);
10439                            inserted_messages.push((msg_id, msg));
10440                        }
10441                        new_conv_id
10442                    }
10443                    ConversationInsertStatus::Existing(existing_id) => {
10444                        session_count_delta = 0;
10445                        pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10446                        let ExistingMessageLookup {
10447                            by_idx: mut existing_messages,
10448                            replay: mut existing_replay_fingerprints,
10449                        } = franken_existing_message_lookup_with_pending(
10450                            &tx,
10451                            existing_id,
10452                            &conv.messages,
10453                            &mut pending_message_fingerprints,
10454                            &mut pending_message_replay_fingerprints,
10455                        )?;
10456                        let ExistingConversationNewMessages {
10457                            messages: new_messages,
10458                            new_chars,
10459                            idx_collision_count,
10460                            first_collision_idx,
10461                        } = collect_new_messages_for_existing_conversation(
10462                            existing_id,
10463                            conv,
10464                            &mut existing_messages,
10465                            &mut existing_replay_fingerprints,
10466                            "skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
10467                        );
10468                        let (inserted_last_idx, inserted_last_created_at) =
10469                            borrowed_messages_tail_state(&new_messages);
10470                        let inserted_message_ids =
10471                            franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10472                        total_chars += new_chars;
10473                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10474                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10475                            if !defer_lexical_updates {
10476                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10477                                fts_count_total += 1;
10478                                fts_pending_chars =
10479                                    fts_pending_chars.saturating_add(msg.content.len());
10480                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10481                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10482                                {
10483                                    flush_pending_fts_entries(
10484                                        self,
10485                                        &tx,
10486                                        &mut fts_entries,
10487                                        &mut fts_pending_chars,
10488                                        &mut fts_inserted_total,
10489                                    )?;
10490                                }
10491                            }
10492                            inserted_indices.push(msg.idx);
10493                            inserted_messages.push((msg_id, msg));
10494                        }
10495
10496                        if idx_collision_count > 0 {
10497                            tracing::warn!(
10498                                conversation_id = existing_id,
10499                                collision_count = idx_collision_count,
10500                                first_idx = first_collision_idx,
10501                                source_path = %conv.source_path.display(),
10502                                "message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
10503                            );
10504                        }
10505
10506                        let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10507                        franken_update_conversation_tail_state(
10508                            &tx,
10509                            existing_id,
10510                            conv_last_ts,
10511                            inserted_last_idx,
10512                            inserted_last_created_at,
10513                        )?;
10514                        if let Some(lookup_key) =
10515                            conversation_external_lookup_key_for_conv(agent_id, conv)
10516                        {
10517                            franken_update_external_conversation_tail_lookup_key(
10518                                &tx,
10519                                &lookup_key,
10520                                conv_last_ts,
10521                                inserted_last_idx,
10522                                inserted_last_created_at,
10523                            )?;
10524                        }
10525
10526                        pending_message_fingerprints.insert(existing_id, existing_messages);
10527                        pending_message_replay_fingerprints
10528                            .insert(existing_id, existing_replay_fingerprints);
10529
10530                        existing_id
10531                    }
10532                }
10533            };
10534
10535            if !defer_analytics_updates {
10536                let delta = StatsDelta {
10537                    session_count_delta,
10538                    message_count_delta: inserted_messages.len() as i64,
10539                    total_chars_delta: total_chars,
10540                };
10541
10542                let effective_started_at = conversation_effective_started_at(conv);
10543                let day_id = effective_started_at
10544                    .map(FrankenStorage::day_id_from_millis)
10545                    .unwrap_or(0);
10546                stats.record_delta(
10547                    &conv.agent_slug,
10548                    &conv.source_id,
10549                    day_id,
10550                    delta.session_count_delta,
10551                    delta.message_count_delta,
10552                    delta.total_chars_delta,
10553                );
10554
10555                let conv_day_id = day_id;
10556                let mut session_model_family = String::from("unknown");
10557                let mut has_any_tokens = false;
10558
10559                for &(message_id, msg) in &inserted_messages {
10560                    let role_s = role_str(&msg.role);
10561                    let usage = if historical_raw_json(&msg.extra_json).is_some() {
10562                        crate::connectors::extract_tokens_for_agent(
10563                            &conv.agent_slug,
10564                            &serde_json::Value::Null,
10565                            &msg.content,
10566                            &role_s,
10567                        )
10568                    } else {
10569                        crate::connectors::extract_tokens_for_agent(
10570                            &conv.agent_slug,
10571                            &msg.extra_json,
10572                            &msg.content,
10573                            &role_s,
10574                        )
10575                    };
10576
10577                    let msg_ts = msg
10578                        .created_at
10579                        .or(conversation_effective_started_at(conv))
10580                        .unwrap_or(0);
10581                    let msg_day_id = if msg_ts > 0 {
10582                        FrankenStorage::day_id_from_millis(msg_ts)
10583                    } else {
10584                        conv_day_id
10585                    };
10586
10587                    let model_info = usage
10588                        .model_name
10589                        .as_deref()
10590                        .map(crate::connectors::normalize_model);
10591
10592                    let model_family = model_info
10593                        .as_ref()
10594                        .map(|i| i.family.clone())
10595                        .unwrap_or_else(|| "unknown".into());
10596                    let model_tier = model_info
10597                        .as_ref()
10598                        .map(|i| i.tier.clone())
10599                        .unwrap_or_else(|| "unknown".into());
10600                    let provider = usage
10601                        .provider
10602                        .clone()
10603                        .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
10604                        .unwrap_or_else(|| "unknown".into());
10605
10606                    if model_family != "unknown" {
10607                        session_model_family = model_family.clone();
10608                    }
10609
10610                    let estimated_cost = pricing_table.compute_cost(
10611                        usage.model_name.as_deref(),
10612                        msg_day_id,
10613                        usage.input_tokens,
10614                        usage.output_tokens,
10615                        usage.cache_read_tokens,
10616                        usage.cache_creation_tokens,
10617                    );
10618                    if estimated_cost.is_some() {
10619                        pricing_diag.record_priced();
10620                    } else if usage.has_token_data() {
10621                        pricing_diag.record_unpriced(usage.model_name.as_deref());
10622                    }
10623
10624                    token_stats.record(
10625                        &conv.agent_slug,
10626                        &conv.source_id,
10627                        msg_day_id,
10628                        &model_family,
10629                        &role_s,
10630                        &usage,
10631                        msg.content.len() as i64,
10632                        estimated_cost.unwrap_or(0.0),
10633                    );
10634
10635                    if usage.has_token_data() {
10636                        has_any_tokens = true;
10637                    }
10638
10639                    let content_chars = msg.content.len() as i64;
10640                    let content_tokens_est = content_chars / 4;
10641                    let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
10642                    let has_plan = has_plan_for_role(&role_s, &msg.content);
10643
10644                    token_entries.push(TokenUsageEntry {
10645                        message_id,
10646                        conversation_id: conv_id,
10647                        agent_id,
10648                        workspace_id,
10649                        source_id: conv.source_id.clone(),
10650                        timestamp_ms: msg_ts,
10651                        day_id: msg_day_id,
10652                        model_name: usage.model_name.clone(),
10653                        model_family: Some(model_family.clone()),
10654                        model_tier: Some(model_tier.clone()),
10655                        service_tier: usage.service_tier.clone(),
10656                        provider: Some(provider.clone()),
10657                        input_tokens: usage.input_tokens,
10658                        output_tokens: usage.output_tokens,
10659                        cache_read_tokens: usage.cache_read_tokens,
10660                        cache_creation_tokens: usage.cache_creation_tokens,
10661                        thinking_tokens: usage.thinking_tokens,
10662                        total_tokens: usage.total_tokens(),
10663                        estimated_cost_usd: estimated_cost,
10664                        role: role_s.to_string(),
10665                        content_chars,
10666                        has_tool_calls: usage.has_tool_calls,
10667                        tool_call_count: usage.tool_call_count,
10668                        data_source: usage.data_source.as_str().to_string(),
10669                    });
10670
10671                    let mm = MessageMetricsEntry {
10672                        message_id,
10673                        created_at_ms: msg_ts,
10674                        hour_id: msg_hour_id,
10675                        day_id: msg_day_id,
10676                        agent_slug: conv.agent_slug.clone(),
10677                        workspace_id: workspace_id.unwrap_or(0),
10678                        source_id: conv.source_id.clone(),
10679                        role: role_s.to_string(),
10680                        content_chars,
10681                        content_tokens_est,
10682                        model_name: usage.model_name.clone(),
10683                        model_family: model_family.clone(),
10684                        model_tier: model_tier.clone(),
10685                        provider,
10686                        api_input_tokens: usage.input_tokens,
10687                        api_output_tokens: usage.output_tokens,
10688                        api_cache_read_tokens: usage.cache_read_tokens,
10689                        api_cache_creation_tokens: usage.cache_creation_tokens,
10690                        api_thinking_tokens: usage.thinking_tokens,
10691                        api_service_tier: usage.service_tier.clone(),
10692                        api_data_source: usage.data_source.as_str().to_string(),
10693                        tool_call_count: usage.tool_call_count as i64,
10694                        has_tool_calls: usage.has_tool_calls,
10695                        has_plan,
10696                    };
10697                    rollup_agg.record(&mm);
10698                    metrics_entries.push(mm);
10699                }
10700
10701                if session_count_delta > 0 {
10702                    token_stats.record_session(
10703                        &conv.agent_slug,
10704                        &conv.source_id,
10705                        conv_day_id,
10706                        &session_model_family,
10707                    );
10708                }
10709
10710                if has_any_tokens {
10711                    conv_ids_to_summarize.push(conv_id);
10712                }
10713            }
10714
10715            outcomes.push(InsertOutcome {
10716                conversation_id: conv_id,
10717                conversation_inserted: session_count_delta > 0,
10718                inserted_indices,
10719            });
10720        }
10721
10722        // Batch insert all FTS entries at once
10723        if !defer_lexical_updates {
10724            flush_pending_fts_entries(
10725                self,
10726                &tx,
10727                &mut fts_entries,
10728                &mut fts_pending_chars,
10729                &mut fts_inserted_total,
10730            )?;
10731        }
10732        if !defer_lexical_updates && fts_count_total > 0 {
10733            tracing::debug!(
10734                target: "cass::perf::fts5",
10735                total = fts_count_total,
10736                inserted = fts_inserted_total,
10737                conversations = conversations.len(),
10738                "franken_batch_fts_insert_complete"
10739            );
10740        }
10741
10742        // Batched daily_stats update
10743        if !defer_analytics_updates && !stats.is_empty() {
10744            let entries = stats.expand();
10745            let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
10746            tracing::debug!(
10747                target: "cass::perf::daily_stats",
10748                raw = stats.raw_entry_count(),
10749                expanded = entries.len(),
10750                affected = affected,
10751                "franken_batched_stats_update_complete"
10752            );
10753        }
10754
10755        // Batch insert token_usage rows
10756        if !defer_analytics_updates && !token_entries.is_empty() {
10757            let token_count = token_entries.len();
10758            let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
10759            tracing::debug!(
10760                target: "cass::perf::token_usage",
10761                total = token_count,
10762                inserted = inserted,
10763                "franken_batch_token_usage_insert_complete"
10764            );
10765        }
10766
10767        // Batched token_daily_stats update
10768        if !defer_analytics_updates && !token_stats.is_empty() {
10769            let entries = token_stats.expand();
10770            let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
10771            tracing::debug!(
10772                target: "cass::perf::token_daily_stats",
10773                raw = token_stats.raw_entry_count(),
10774                expanded = entries.len(),
10775                affected = affected,
10776                "franken_batched_token_stats_update_complete"
10777            );
10778        }
10779
10780        // Batch insert message_metrics rows
10781        if !defer_analytics_updates && !metrics_entries.is_empty() {
10782            let mm_count = metrics_entries.len();
10783            let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
10784            tracing::debug!(
10785                target: "cass::perf::message_metrics",
10786                total = mm_count,
10787                inserted = inserted,
10788                "franken_batch_message_metrics_insert_complete"
10789            );
10790        }
10791
10792        // Flush usage_hourly + usage_daily rollups
10793        if !defer_analytics_updates && !rollup_agg.is_empty() {
10794            let (hourly, daily, models_daily) =
10795                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
10796            tracing::debug!(
10797                target: "cass::perf::usage_rollups",
10798                hourly_buckets = rollup_agg.hourly_entry_count(),
10799                daily_buckets = rollup_agg.daily_entry_count(),
10800                models_daily_buckets = rollup_agg.models_daily_entry_count(),
10801                hourly_affected = hourly,
10802                daily_affected = daily,
10803                models_daily_affected = models_daily,
10804                "franken_batched_usage_rollups_complete"
10805            );
10806        }
10807
10808        // Update conversation-level token summaries
10809        if !defer_analytics_updates {
10810            for conv_id in &conv_ids_to_summarize {
10811                franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
10812            }
10813        }
10814
10815        tx.commit()?;
10816
10817        pricing_diag.log_summary();
10818
10819        Ok(outcomes)
10820    }
10821}
10822
10823fn normalized_storage_source_parts(
10824    source_id: Option<&str>,
10825    origin_kind: Option<&str>,
10826    origin_host: Option<&str>,
10827) -> (String, SourceKind, Option<String>) {
10828    let host_label = crate::search::tantivy::normalized_index_origin_host(origin_host);
10829    let source_id = crate::search::tantivy::normalized_index_source_id(
10830        source_id,
10831        origin_kind,
10832        host_label.as_deref(),
10833    );
10834
10835    if source_id == LOCAL_SOURCE_ID {
10836        (source_id, SourceKind::Local, None)
10837    } else {
10838        (source_id, SourceKind::Ssh, host_label)
10839    }
10840}
10841
10842fn normalized_source_for_conversation(conv: &Conversation) -> Source {
10843    let (id, kind, host_label) = normalized_storage_source_parts(
10844        Some(conv.source_id.as_str()),
10845        None,
10846        conv.origin_host.as_deref(),
10847    );
10848    Source {
10849        id,
10850        kind,
10851        host_label,
10852        machine_id: None,
10853        platform: None,
10854        config_json: None,
10855        created_at: None,
10856        updated_at: None,
10857    }
10858}
10859
10860fn is_bootstrap_local_source(source: &Source) -> bool {
10861    source.id == LOCAL_SOURCE_ID
10862        && matches!(source.kind, SourceKind::Local)
10863        && source.host_label.is_none()
10864        && source.machine_id.is_none()
10865        && source.platform.is_none()
10866        && source.config_json.is_none()
10867        && source.created_at.is_none()
10868        && source.updated_at.is_none()
10869}
10870
10871fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
10872    let normalized_source = normalized_source_for_conversation(conv);
10873    if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
10874        Cow::Borrowed(conv)
10875    } else {
10876        let mut normalized = conv.clone();
10877        normalized.source_id = normalized_source.id;
10878        normalized.origin_host = normalized_source.host_label;
10879        Cow::Owned(normalized)
10880    }
10881}
10882
10883impl FrankenStorage {
10884    fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
10885        let source = normalized_source_for_conversation(conv);
10886        if is_bootstrap_local_source(&source) {
10887            // `open()` and schema repair always seed the canonical local source row.
10888            // Avoid an autocommit UPDATE on every local conversation insert.
10889            return Ok(());
10890        }
10891        let cache_key = EnsuredConversationSourceKey::from_source(&source);
10892        if self.conversation_source_already_ensured(&cache_key) {
10893            return Ok(());
10894        }
10895        self.upsert_source(&source)?;
10896        self.mark_conversation_source_ensured(cache_key);
10897        Ok(())
10898    }
10899
10900    fn ensure_sources_for_batch(
10901        &self,
10902        conversations: &[(i64, Option<i64>, &Conversation)],
10903    ) -> Result<()> {
10904        let mut seen = HashSet::with_capacity(conversations.len());
10905        for &(_, _, conv) in conversations {
10906            let source = normalized_source_for_conversation(conv);
10907            if seen.insert(source.id.clone()) {
10908                if is_bootstrap_local_source(&source) {
10909                    continue;
10910                }
10911                self.upsert_source(&source)?;
10912                self.mark_conversation_source_ensured(EnsuredConversationSourceKey::from_source(
10913                    &source,
10914                ));
10915            }
10916        }
10917        Ok(())
10918    }
10919}
10920
10921// =========================================================================
10922// FrankenStorage transaction helper functions
10923// =========================================================================
10924
10925/// Get last_insert_rowid from a frankensqlite transaction.
10926fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
10927    tx.last_insert_rowid()
10928        .ok()
10929        .filter(|&id| id > 0)
10930        .with_context(|| "last_insert_rowid() returned NULL or 0 after INSERT")
10931}
10932
10933/// Bug #167: Ensure all agents referenced by a batch exist within the
10934/// transaction.  The caller already resolved `agent_id` values via
10935/// `ensure_agent` outside the transaction, but those autocommit writes may
10936/// not be visible inside a frankensqlite transaction snapshot.  This function
10937/// checks each unique agent_id and creates a stub row if it's missing.
10938fn ensure_agents_in_tx(
10939    tx: &FrankenTransaction<'_>,
10940    conversations: &[(i64, Option<i64>, &Conversation)],
10941) -> Result<()> {
10942    let mut seen = HashSet::new();
10943    let now = FrankenStorage::now_millis();
10944    for &(agent_id, _, conv) in conversations {
10945        if !seen.insert(agent_id) {
10946            continue;
10947        }
10948        let exists: i64 = tx.query_row_map(
10949            "SELECT COUNT(*) FROM agents WHERE id = ?1",
10950            fparams![agent_id],
10951            |row| row.get_typed(0),
10952        )?;
10953        if exists == 0 {
10954            tracing::debug!(
10955                target: "cass::fk_guard",
10956                agent_id,
10957                slug = %conv.agent_slug,
10958                "inserting agent row inside transaction to satisfy FK constraint"
10959            );
10960            // INSERT OR IGNORE: the slug might already exist with a different
10961            // id from a concurrent writer.  If the slug row exists, the FK
10962            // constraint is already satisfied (the caller just got a stale id).
10963            tx.execute_compat(
10964                "INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
10965                 VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
10966                fparams![
10967                    agent_id,
10968                    conv.agent_slug.as_str(),
10969                    conv.agent_slug.as_str(),
10970                    now,
10971                    now
10972                ],
10973            )?;
10974        }
10975    }
10976    Ok(())
10977}
10978
10979/// Bug #167: Ensure all workspaces referenced by a batch exist within the
10980/// transaction.  Same rationale as `ensure_agents_in_tx`.
10981fn ensure_workspaces_in_tx(
10982    tx: &FrankenTransaction<'_>,
10983    conversations: &[(i64, Option<i64>, &Conversation)],
10984) -> Result<()> {
10985    let mut seen = HashSet::new();
10986    for &(_, workspace_id, conv) in conversations {
10987        let ws_id = match workspace_id {
10988            Some(id) => id,
10989            None => continue,
10990        };
10991        if !seen.insert(ws_id) {
10992            continue;
10993        }
10994        let exists: i64 = tx.query_row_map(
10995            "SELECT COUNT(*) FROM workspaces WHERE id = ?1",
10996            fparams![ws_id],
10997            |row| row.get_typed(0),
10998        )?;
10999        if exists == 0 {
11000            let path_str = conv
11001                .workspace
11002                .as_ref()
11003                .map(|p| p.to_string_lossy().to_string())
11004                .unwrap_or_default();
11005            tracing::debug!(
11006                target: "cass::fk_guard",
11007                workspace_id = ws_id,
11008                path = %path_str,
11009                "inserting workspace row inside transaction to satisfy FK constraint"
11010            );
11011            tx.execute_compat(
11012                "INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
11013                fparams![ws_id, path_str.as_str()],
11014            )?;
11015        }
11016    }
11017    Ok(())
11018}
11019
11020/// Bug #167: Ensure all sources referenced by a batch exist within the
11021/// transaction.  Same rationale as `ensure_agents_in_tx` — source_id is a
11022/// TEXT FK on the conversations table.
11023fn ensure_sources_in_tx(
11024    tx: &FrankenTransaction<'_>,
11025    conversations: &[(i64, Option<i64>, &Conversation)],
11026) -> Result<()> {
11027    let mut seen = HashSet::new();
11028    for &(_, _, conv) in conversations {
11029        let (source_id, source_kind, host_label) = normalized_storage_source_parts(
11030            Some(conv.source_id.as_str()),
11031            None,
11032            conv.origin_host.as_deref(),
11033        );
11034        if !seen.insert(source_id.clone()) {
11035            continue;
11036        }
11037        let exists: i64 = tx.query_row_map(
11038            "SELECT COUNT(*) FROM sources WHERE id = ?1",
11039            fparams![source_id.as_str()],
11040            |row| row.get_typed(0),
11041        )?;
11042        if exists == 0 {
11043            let kind_str = source_kind.to_string();
11044            let now = FrankenStorage::now_millis();
11045            tracing::debug!(
11046                target: "cass::fk_guard",
11047                source_id = %source_id,
11048                kind = kind_str.as_str(),
11049                "inserting source row inside transaction to satisfy FK constraint"
11050            );
11051            tx.execute_compat(
11052                "INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
11053                 VALUES(?1, ?2, ?3, ?4, ?5)",
11054                fparams![
11055                    source_id.as_str(),
11056                    kind_str.as_str(),
11057                    host_label.as_deref(),
11058                    now,
11059                    now
11060                ],
11061            )?;
11062        }
11063    }
11064    Ok(())
11065}
11066
11067fn env_flag_enabled(name: &str) -> bool {
11068    dotenvy::var(name)
11069        .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
11070        .unwrap_or(false)
11071}
11072
11073fn defer_storage_lexical_updates_enabled() -> bool {
11074    env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
11075}
11076
11077fn defer_analytics_updates_enabled() -> bool {
11078    env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES")
11079}
11080
/// Outcome of an insert-or-reuse attempt for a conversation row.
enum ConversationInsertStatus {
    /// A new conversation row was written; carries its id.
    Inserted(i64),
    /// A matching conversation was already stored; carries the existing id.
    Existing(i64),
}
11085
11086fn franken_find_external_conversation_tail_lookup(
11087    tx: &FrankenTransaction<'_>,
11088    lookup_key: &str,
11089) -> Result<Option<ExistingConversationWithTail>> {
11090    let params = [SqliteValue::from(lookup_key)];
11091    let row = tx
11092        .query_row_with_params(
11093            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
11094             FROM conversation_external_tail_lookup
11095             WHERE lookup_key = ?1",
11096            &params,
11097        )
11098        .optional()?;
11099    let Some(row) = row else {
11100        return Ok(None);
11101    };
11102    let id = row.get_typed(0)?;
11103    let ended_at = row.get_typed(1)?;
11104    let last_message_idx = row.get_typed(2)?;
11105    let last_message_created_at = row.get_typed(3)?;
11106    Ok(Some(ExistingConversationWithTail {
11107        id,
11108        tail_state: existing_conversation_tail_state_from_cached(
11109            last_message_idx,
11110            last_message_created_at,
11111            ended_at,
11112        ),
11113    }))
11114}
11115
11116fn franken_find_external_conversation_lookup(
11117    tx: &FrankenTransaction<'_>,
11118    lookup_key: &str,
11119) -> Result<Option<i64>> {
11120    Ok(franken_find_external_conversation_tail_lookup(tx, lookup_key)?.map(|existing| existing.id))
11121}
11122
11123fn franken_insert_external_conversation_tail_lookup_key(
11124    tx: &FrankenTransaction<'_>,
11125    lookup_key: &str,
11126    conversation_id: i64,
11127    ended_at: Option<i64>,
11128    last_message_idx: Option<i64>,
11129    last_message_created_at: Option<i64>,
11130) -> Result<()> {
11131    let params = [
11132        SqliteValue::from(lookup_key),
11133        SqliteValue::from(conversation_id),
11134        SqliteValue::from(ended_at),
11135        SqliteValue::from(last_message_idx),
11136        SqliteValue::from(last_message_created_at),
11137    ];
11138    tx.execute_with_params(
11139        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
11140             lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
11141         ) VALUES(?1, ?2, ?3, ?4, ?5)",
11142        &params,
11143    )?;
11144    Ok(())
11145}
11146
11147fn franken_insert_external_conversation_tail_lookup(
11148    tx: &FrankenTransaction<'_>,
11149    source_id: &str,
11150    agent_id: i64,
11151    external_id: &str,
11152    existing: ExistingConversationWithTail,
11153) -> Result<()> {
11154    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
11155    let ended_at = existing.tail_state.and_then(|state| state.ended_at);
11156    let last_message_idx = existing.tail_state.map(|state| state.last_message_idx);
11157    let last_message_created_at = existing
11158        .tail_state
11159        .map(|state| state.last_message_created_at);
11160    franken_insert_external_conversation_tail_lookup_key(
11161        tx,
11162        &lookup_key,
11163        existing.id,
11164        ended_at,
11165        last_message_idx,
11166        last_message_created_at,
11167    )
11168}
11169
11170fn franken_update_external_conversation_tail_lookup_key(
11171    tx: &FrankenTransaction<'_>,
11172    lookup_key: &str,
11173    ended_at_candidate: Option<i64>,
11174    last_message_idx_candidate: Option<i64>,
11175    last_message_created_at_candidate: Option<i64>,
11176) -> Result<()> {
11177    if ended_at_candidate.is_none()
11178        && last_message_idx_candidate.is_none()
11179        && last_message_created_at_candidate.is_none()
11180    {
11181        return Ok(());
11182    }
11183    tx.execute_compat(
11184        "UPDATE conversation_external_tail_lookup
11185         SET ended_at = CASE
11186                 WHEN ?1 IS NULL THEN ended_at
11187                 ELSE MAX(IFNULL(ended_at, 0), ?1)
11188             END,
11189             last_message_idx = CASE
11190                 WHEN ?2 IS NULL THEN last_message_idx
11191                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
11192                 ELSE last_message_idx
11193             END,
11194             last_message_created_at = CASE
11195                 WHEN ?3 IS NULL THEN last_message_created_at
11196                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
11197                 ELSE last_message_created_at
11198             END
11199         WHERE lookup_key = ?4",
11200        fparams![
11201            ended_at_candidate,
11202            last_message_idx_candidate,
11203            last_message_created_at_candidate,
11204            lookup_key
11205        ],
11206    )?;
11207    Ok(())
11208}
11209
11210fn franken_set_external_conversation_tail_lookup_after_append(
11211    tx: &FrankenTransaction<'_>,
11212    lookup_key: &str,
11213    ended_at: i64,
11214    last_message_idx: i64,
11215    last_message_created_at: i64,
11216) -> Result<()> {
11217    tx.execute_compat(
11218        "UPDATE conversation_external_tail_lookup
11219         SET ended_at = ?1,
11220             last_message_idx = ?2,
11221             last_message_created_at = ?3
11222         WHERE lookup_key = ?4",
11223        fparams![
11224            ended_at,
11225            last_message_idx,
11226            last_message_created_at,
11227            lookup_key
11228        ],
11229    )?;
11230    Ok(())
11231}
11232
11233fn franken_update_external_conversation_tail_after_append(
11234    tx: &FrankenTransaction<'_>,
11235    agent_id: i64,
11236    conv: &Conversation,
11237    used_append_tail_plan: bool,
11238    exact_append_set: bool,
11239    inserted_last_idx: Option<i64>,
11240    inserted_last_created_at: Option<i64>,
11241) -> Result<()> {
11242    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
11243        return Ok(());
11244    };
11245
11246    if exact_append_set
11247        && let (Some(last_message_idx), Some(last_message_created_at)) =
11248            (inserted_last_idx, inserted_last_created_at)
11249    {
11250        return franken_set_external_conversation_tail_lookup_after_append(
11251            tx,
11252            &lookup_key,
11253            last_message_created_at,
11254            last_message_idx,
11255            last_message_created_at,
11256        );
11257    }
11258
11259    let ended_at_candidate = if used_append_tail_plan {
11260        inserted_last_created_at
11261    } else {
11262        conv.messages.iter().filter_map(|m| m.created_at).max()
11263    };
11264    franken_update_external_conversation_tail_lookup_key(
11265        tx,
11266        &lookup_key,
11267        ended_at_candidate,
11268        inserted_last_idx,
11269        inserted_last_created_at,
11270    )
11271}
11272
11273fn franken_find_existing_conversation_by_key(
11274    tx: &FrankenTransaction<'_>,
11275    key: &PendingConversationKey,
11276    conv: Option<&Conversation>,
11277) -> Result<Option<i64>> {
11278    franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
11279}
11280
11281fn franken_find_existing_conversation_by_key_after_conflict(
11282    tx: &FrankenTransaction<'_>,
11283    key: &PendingConversationKey,
11284    conv: Option<&Conversation>,
11285) -> Result<Option<i64>> {
11286    franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
11287}
11288
11289fn franken_find_existing_conversation_by_key_impl(
11290    tx: &FrankenTransaction<'_>,
11291    key: &PendingConversationKey,
11292    conv: Option<&Conversation>,
11293    allow_legacy_external_scan: bool,
11294) -> Result<Option<i64>> {
11295    match key {
11296        PendingConversationKey::External {
11297            source_id,
11298            agent_id,
11299            external_id,
11300        } => {
11301            let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
11302            if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
11303                return Ok(Some(existing_id));
11304            }
11305            if !allow_legacy_external_scan {
11306                return Ok(None);
11307            }
11308
11309            let existing_id = tx
11310                .query_row_map(
11311                    "SELECT id
11312                 FROM conversations
11313                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
11314                    fparams![source_id.as_str(), *agent_id, external_id.as_str()],
11315                    |row| row.get_typed(0),
11316                )
11317                .optional()?;
11318            if let Some(existing_id) = existing_id {
11319                let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
11320                franken_insert_external_conversation_tail_lookup_key(
11321                    tx,
11322                    &lookup_key,
11323                    existing_id,
11324                    tail_state.and_then(|state| state.ended_at),
11325                    tail_state.map(|state| state.last_message_idx),
11326                    tail_state.map(|state| state.last_message_created_at),
11327                )?;
11328                Ok(Some(existing_id))
11329            } else {
11330                Ok(None)
11331            }
11332        }
11333        PendingConversationKey::SourcePath {
11334            source_id,
11335            agent_id,
11336            source_path,
11337            started_at,
11338        } => {
11339            let exact_match = tx
11340                .query_row_map(
11341                    "SELECT c.id
11342                     FROM conversations c
11343                     WHERE c.source_id = ?1
11344                       AND c.agent_id = ?2
11345                       AND c.source_path = ?3
11346                       AND ((
11347                            COALESCE(
11348                                c.started_at,
11349                                (SELECT MIN(created_at)
11350                                 FROM messages
11351                                 WHERE conversation_id = c.id
11352                                   AND created_at IS NOT NULL)
11353                            ) IS NULL
11354                            AND ?4 IS NULL
11355                       ) OR COALESCE(
11356                            c.started_at,
11357                            (SELECT MIN(created_at)
11358                             FROM messages
11359                             WHERE conversation_id = c.id
11360                               AND created_at IS NOT NULL)
11361                       ) = ?4)
11362                     ORDER BY c.id
11363                     LIMIT 1",
11364                    fparams![
11365                        source_id.as_str(),
11366                        *agent_id,
11367                        source_path.as_str(),
11368                        *started_at
11369                    ],
11370                    |row| row.get_typed(0),
11371                )
11372                .optional()?;
11373            if exact_match.is_some() {
11374                return Ok(exact_match);
11375            }
11376
11377            let Some(conv) = conv else {
11378                return Ok(None);
11379            };
11380            let incoming_fingerprints = conversation_message_fingerprints(conv);
11381            if incoming_fingerprints.is_empty() {
11382                return Ok(None);
11383            }
11384            let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
11385
11386            let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
11387                "SELECT
11388                     c.id,
11389                     COALESCE(
11390                         c.started_at,
11391                         (SELECT MIN(created_at)
11392                          FROM messages
11393                          WHERE conversation_id = c.id
11394                            AND created_at IS NOT NULL)
11395                     ) AS effective_started_at
11396                 FROM conversations c
11397                 WHERE c.source_id = ?1
11398                   AND c.agent_id = ?2
11399                   AND c.source_path = ?3
11400                 ORDER BY c.id",
11401                fparams![source_id.as_str(), *agent_id, source_path.as_str()],
11402                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
11403            )?;
11404
11405            let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
11406            for (candidate_id, candidate_started_at) in candidates {
11407                let existing_fingerprints =
11408                    franken_existing_message_fingerprints(tx, candidate_id)?;
11409                let existing_replay_fingerprints =
11410                    replay_fingerprints_from_merge_set(&existing_fingerprints);
11411                let Some(evidence) = conversation_merge_evidence(
11412                    &incoming_fingerprints,
11413                    &incoming_replay_fingerprints,
11414                    &existing_fingerprints,
11415                    &existing_replay_fingerprints,
11416                    *started_at,
11417                    candidate_started_at,
11418                ) else {
11419                    continue;
11420                };
11421
11422                let candidate_key = (
11423                    evidence.exact_overlap,
11424                    evidence.replay_overlap,
11425                    evidence.started_close,
11426                    evidence.smaller_replay_set,
11427                    std::cmp::Reverse(evidence.start_distance_ms),
11428                );
11429                let should_replace = best_candidate
11430                    .as_ref()
11431                    .map(|(_, best_evidence)| {
11432                        candidate_key
11433                            > (
11434                                best_evidence.exact_overlap,
11435                                best_evidence.replay_overlap,
11436                                best_evidence.started_close,
11437                                best_evidence.smaller_replay_set,
11438                                std::cmp::Reverse(best_evidence.start_distance_ms),
11439                            )
11440                    })
11441                    .unwrap_or(true);
11442
11443                if should_replace {
11444                    best_candidate = Some((candidate_id, evidence));
11445                }
11446            }
11447
11448            Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
11449        }
11450    }
11451}
11452
11453fn franken_insert_conversation_or_get_existing(
11454    tx: &FrankenTransaction<'_>,
11455    agent_id: i64,
11456    workspace_id: Option<i64>,
11457    conv: &Conversation,
11458) -> Result<ConversationInsertStatus> {
11459    let conversation_key = conversation_merge_key(agent_id, conv);
11460    if let Some(existing_id) =
11461        franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))?
11462    {
11463        return Ok(ConversationInsertStatus::Existing(existing_id));
11464    }
11465
11466    franken_insert_conversation_or_get_existing_after_miss(
11467        tx,
11468        agent_id,
11469        workspace_id,
11470        conv,
11471        &conversation_key,
11472    )
11473}
11474
11475fn franken_insert_conversation_or_get_existing_after_miss(
11476    tx: &FrankenTransaction<'_>,
11477    agent_id: i64,
11478    workspace_id: Option<i64>,
11479    conv: &Conversation,
11480    conversation_key: &PendingConversationKey,
11481) -> Result<ConversationInsertStatus> {
11482    match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
11483        Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
11484        Ok(None) => {
11485            // A concurrent writer won the unique-provenance race. Resolve the
11486            // canonical row so callers can merge messages into it.
11487            let existing_id =
11488                franken_find_existing_conversation_by_key_after_conflict(
11489                    tx,
11490                    conversation_key,
11491                    Some(conv),
11492                )?
11493                    .with_context(|| {
11494                        format!(
11495                            "conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
11496                            conv.source_id,
11497                            agent_id,
11498                            conv.external_id,
11499                            conv.source_path.display()
11500                        )
11501                    })?;
11502            tracing::warn!(
11503                source_id = %conv.source_id,
11504                agent_id,
11505                external_id = ?conv.external_id,
11506                existing_id,
11507                source_path = %conv.source_path.display(),
11508                "conversation INSERT: duplicate gracefully recovered, reusing existing row"
11509            );
11510            Ok(ConversationInsertStatus::Existing(existing_id))
11511        }
11512        Err(error) => {
11513            tracing::error!(
11514                source_id = %conv.source_id,
11515                agent_id,
11516                external_id = ?conv.external_id,
11517                error = %error,
11518                source_path = %conv.source_path.display(),
11519                "franken_insert_conversation failed"
11520            );
11521            Err(error)
11522        }
11523    }
11524}
11525
11526/// Insert a conversation into the DB within a frankensqlite transaction.
11527///
11528/// Uses a plain `INSERT` so the common miss path stays on the slim direct
11529/// insert lane. Duplicate provenance conflicts are converted into `Ok(None)`
11530/// so callers can recover the canonical row and merge messages into it.
11531fn franken_insert_conversation(
11532    tx: &FrankenTransaction<'_>,
11533    agent_id: i64,
11534    workspace_id: Option<i64>,
11535    conv: &Conversation,
11536) -> Result<Option<i64>> {
11537    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
11538    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
11539    let metadata_bin_bytes = metadata_bin.as_deref();
11540
11541    match tx.execute_compat(
11542        "INSERT INTO conversations(
11543            agent_id, workspace_id, source_id, external_id, title, source_path,
11544            started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
11545            last_message_idx, last_message_created_at
11546        ) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
11547        fparams![
11548            agent_id,
11549            workspace_id,
11550            conv.source_id.as_str(),
11551            conv.external_id.as_deref(),
11552            conv.title.as_deref(),
11553            path_to_string(&conv.source_path),
11554            conv.started_at,
11555            conv.ended_at,
11556            conv.approx_tokens,
11557            metadata_json_str.as_deref(),
11558            conv.origin_host.as_deref(),
11559            metadata_bin_bytes,
11560            last_message_idx,
11561            last_message_created_at
11562        ],
11563    ) {
11564        Ok(_) => {
11565            let conv_id = franken_last_rowid(tx)?;
11566            franken_insert_conversation_tail_state(
11567                tx,
11568                conv_id,
11569                conv.ended_at,
11570                last_message_idx,
11571                last_message_created_at,
11572            )?;
11573            if let Some(external_id) = conv.external_id.as_deref() {
11574                franken_insert_external_conversation_tail_lookup(
11575                    tx,
11576                    conv.source_id.as_str(),
11577                    agent_id,
11578                    external_id,
11579                    ExistingConversationWithTail {
11580                        id: conv_id,
11581                        tail_state: existing_conversation_tail_state_from_cached(
11582                            last_message_idx,
11583                            last_message_created_at,
11584                            conv.ended_at,
11585                        ),
11586                    },
11587                )?;
11588            }
11589            Ok(Some(conv_id))
11590        }
11591        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
11592            tracing::debug!(
11593                source_id = %conv.source_id,
11594                agent_id,
11595                external_id = ?conv.external_id,
11596                source_path = %conv.source_path.display(),
11597                "conversation INSERT: duplicate provenance conflict"
11598            );
11599            Ok(None)
11600        }
11601        Err(error) => Err(error.into()),
11602    }
11603}
11604
11605type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11606
11607fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
11608    if let Some(raw) = historical_raw_json(value) {
11609        Ok((Some(Cow::Borrowed(raw)), None))
11610    } else if value.is_null() {
11611        Ok((Some(Cow::Borrowed("null")), None))
11612    } else if value.as_object().is_some_and(|object| object.is_empty()) {
11613        Ok((None, None))
11614    } else if let Some(metadata_bin) = serialize_json_to_msgpack(value) {
11615        Ok((None, Some(metadata_bin)))
11616    } else {
11617        Ok((Some(Cow::Owned(serde_json::to_string(value)?)), None))
11618    }
11619}
11620
11621fn franken_insert_new_message(
11622    tx: &FrankenTransaction<'_>,
11623    conversation_id: i64,
11624    msg: &Message,
11625) -> Result<i64> {
11626    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11627    let extra_bin_bytes = extra_bin.as_deref();
11628
11629    tx.execute_compat(
11630        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
11631         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
11632            fparams![
11633                conversation_id,
11634                msg.idx,
11635                role_as_str(&msg.role),
11636                msg.author.as_deref(),
11637                msg.created_at,
11638                msg.content.as_str(),
11639                extra_json_str.as_deref(),
11640                extra_bin_bytes
11641        ],
11642    )?;
11643    franken_last_rowid(tx)
11644}
11645
11646type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11647
11648fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
11649    if let Some(raw) = historical_raw_json(&msg.extra_json) {
11650        Ok((Some(Cow::Borrowed(raw)), None))
11651    } else if msg.extra_json.is_null() {
11652        Ok((None, None))
11653    } else {
11654        let extra_bin = serialize_json_to_msgpack(&msg.extra_json);
11655        if extra_bin.is_some() {
11656            Ok((None, extra_bin))
11657        } else {
11658            Ok((
11659                Some(Cow::Owned(serde_json::to_string(&msg.extra_json)?)),
11660                None,
11661            ))
11662        }
11663    }
11664}
11665
/// Rows per multi-row INSERT for proven-new messages.
///
/// Every row binds 8 parameters, so a full 100-row batch binds 800 — below
/// SQLite's default `SQLITE_MAX_VARIABLE_NUMBER` of 999 — while still
/// amortizing statement parse cost.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;

/// Rows per multi-row INSERT on the append path.
///
/// Once the tail-state hot table took conversation-row rewrites off the append
/// path, 50-row chunks beat the previous 20-row setting on the append-merge
/// profile, whereas 100-row chunks slightly regress the 20-message workload.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;

/// Return the cached multi-row `INSERT INTO messages ... VALUES ...` statement
/// for exactly `row_count` rows.
///
/// The table is built once, on first use, for every row count from 1 up to the
/// larger of the two batch-size constants. Index 0 holds an empty string.
///
/// # Panics
///
/// Panics when `row_count` exceeds the largest cached batch size.
fn message_insert_batch_sql(row_count: usize) -> &'static str {
    static SQL_BY_ROW_COUNT: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();

    let table = SQL_BY_ROW_COUNT.get_or_init(|| {
        let largest_batch = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
        let mut table = Vec::with_capacity(largest_batch + 1);
        // Slot 0 is filler so the row count can be used directly as the index.
        table.push(String::new());

        // Grow the VALUES list one tuple at a time; after appending tuple `row`
        // it is exactly the placeholder list for `row + 1` rows.
        let mut placeholders = String::new();
        for row in 0..largest_batch {
            if row > 0 {
                placeholders.push(',');
            }
            let first_param = row * 8 + 1;
            let slots: Vec<String> = (first_param..first_param + 8)
                .map(|slot| format!("?{slot}"))
                .collect();
            placeholders.push('(');
            placeholders.push_str(&slots.join(","));
            placeholders.push(')');

            table.push(format!(
                "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES {placeholders}"
            ));
        }
        table
    });

    table
        .get(row_count)
        .map(String::as_str)
        .expect("message insert batch size must be covered by the cached SQL table")
}
11716
11717fn franken_batch_insert_new_messages(
11718    tx: &FrankenTransaction<'_>,
11719    conversation_id: i64,
11720    messages: &[&Message],
11721) -> Result<Vec<i64>> {
11722    franken_batch_insert_new_messages_with_batch_size(
11723        tx,
11724        conversation_id,
11725        messages,
11726        MESSAGE_INSERT_BATCH_SIZE,
11727    )
11728}
11729
11730fn franken_append_insert_new_messages(
11731    tx: &FrankenTransaction<'_>,
11732    conversation_id: i64,
11733    messages: &[&Message],
11734) -> Result<Vec<i64>> {
11735    franken_batch_insert_new_messages_with_batch_size(
11736        tx,
11737        conversation_id,
11738        messages,
11739        APPEND_MESSAGE_INSERT_BATCH_SIZE,
11740    )
11741}
11742
11743fn franken_batch_insert_new_messages_with_batch_size(
11744    tx: &FrankenTransaction<'_>,
11745    conversation_id: i64,
11746    messages: &[&Message],
11747    batch_size: usize,
11748) -> Result<Vec<i64>> {
11749    let batch_size = batch_size.max(1);
11750    let mut inserted_ids = Vec::with_capacity(messages.len());
11751    for chunk in messages.chunks(batch_size) {
11752        if chunk.len() == 1 {
11753            inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
11754            continue;
11755        }
11756        let sql = message_insert_batch_sql(chunk.len());
11757
11758        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
11759        for msg in chunk {
11760            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11761            param_values.push(SqliteValue::from(conversation_id));
11762            param_values.push(SqliteValue::from(msg.idx));
11763            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
11764            param_values.push(SqliteValue::from(msg.author.as_deref()));
11765            param_values.push(SqliteValue::from(msg.created_at));
11766            param_values.push(SqliteValue::from(msg.content.as_str()));
11767            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
11768            param_values.push(SqliteValue::from(extra_bin.as_deref()));
11769        }
11770
11771        tx.execute_with_params(sql, &param_values)?;
11772
11773        let last_id = franken_last_rowid(tx)?;
11774        let first_id = last_id
11775            .checked_sub((chunk.len() - 1) as i64)
11776            .with_context(|| {
11777                format!(
11778                    "inferring rowid range for {}-row message batch ending at {last_id}",
11779                    chunk.len()
11780                )
11781            })?;
11782        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
11783    }
11784
11785    Ok(inserted_ids)
11786}
11787
/// Test-only variant of the single-row message insert that records per-stage
/// timings and counters into `profile`.
///
/// Counts one single-row call and one row, then times payload construction,
/// statement execution and the rowid read separately. A stage's duration is
/// only recorded when that stage succeeds.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
    profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
    profile.single_row_calls += 1;
    profile.batch_rows += 1;

    let payload_timer = Instant::now();
    let (extra_text, extra_packed) = franken_message_insert_payload(msg)?;
    profile.payload_duration += payload_timer.elapsed();

    let execute_timer = Instant::now();
    tx.execute_compat(
        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            extra_text.as_deref(),
            extra_packed.as_deref()
        ],
    )?;
    profile.execute_duration += execute_timer.elapsed();

    let rowid_timer = Instant::now();
    let new_rowid = franken_last_rowid(tx)?;
    profile.rowid_duration += rowid_timer.elapsed();

    Ok(new_rowid)
}
11825
/// Test-only profiled batch insert using the default chunk size
/// (`MESSAGE_INSERT_BATCH_SIZE`).
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
11841
/// Test-only profiled batch insert using the append-path chunk size
/// (`APPEND_MESSAGE_INSERT_BATCH_SIZE`).
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = APPEND_MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
11857
/// Test-only, instrumented twin of
/// `franken_batch_insert_new_messages_with_batch_size`.
///
/// Performs the same chunked insert while accumulating counters and per-stage
/// durations (SQL lookup, payload construction, parameter building, execution,
/// rowid inference) into `profile`. Single-message chunks are routed through
/// `franken_insert_new_message_with_profile`.
///
/// `batch_size` is clamped to `1..=max(MESSAGE_INSERT_BATCH_SIZE,
/// APPEND_MESSAGE_INSERT_BATCH_SIZE)`, because `message_insert_batch_sql`
/// panics for row counts beyond the statements it caches.
///
/// # Errors
///
/// Propagates payload-serialization and statement-execution errors, and fails
/// if the inferred first rowid of a chunk would underflow.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    // Never ask the SQL cache for more rows than it was built for.
    let max_cached_rows = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
    let batch_size = batch_size.min(max_cached_rows).max(1);

    let mut inserted_ids = Vec::with_capacity(messages.len());
    for chunk in messages.chunks(batch_size) {
        if let [single] = chunk {
            inserted_ids.push(franken_insert_new_message_with_profile(
                tx,
                conversation_id,
                *single,
                profile,
            )?);
            continue;
        }

        profile.batch_calls += 1;
        profile.batch_rows += chunk.len();

        let sql_build_start = Instant::now();
        let sql = message_insert_batch_sql(chunk.len());
        profile.sql_build_duration += sql_build_start.elapsed();

        // Eight bound values per row, in the column order of the cached SQL.
        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
        for msg in chunk {
            let payload_start = Instant::now();
            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_start.elapsed();

            let param_build_start = Instant::now();
            param_values.push(SqliteValue::from(conversation_id));
            param_values.push(SqliteValue::from(msg.idx));
            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
            param_values.push(SqliteValue::from(msg.author.as_deref()));
            param_values.push(SqliteValue::from(msg.created_at));
            param_values.push(SqliteValue::from(msg.content.as_str()));
            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
            param_values.push(SqliteValue::from(extra_bin.as_deref()));
            profile.param_build_duration += param_build_start.elapsed();
        }

        let execute_start = Instant::now();
        tx.execute_with_params(sql, &param_values)?;
        profile.execute_duration += execute_start.elapsed();

        // Only the last rowid is observable; derive the first one from it.
        let rowid_start = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        let first_id = i64::try_from(chunk.len() - 1)
            .ok()
            .and_then(|span| last_id.checked_sub(span))
            .with_context(|| {
                format!(
                    "inferring rowid range for {}-row message batch ending at {last_id}",
                    chunk.len()
                )
            })?;
        inserted_ids.extend(first_id..=last_id);
        profile.rowid_duration += rowid_start.elapsed();
    }

    Ok(inserted_ids)
}
11924
11925/// Insert snippets within a frankensqlite transaction.
11926fn franken_insert_snippets(
11927    tx: &FrankenTransaction<'_>,
11928    message_id: i64,
11929    snippets: &[Snippet],
11930) -> Result<()> {
11931    for snip in snippets {
11932        let file_path_str = snip.file_path.as_ref().map(path_to_string);
11933        tx.execute_compat(
11934            "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
11935             VALUES(?1,?2,?3,?4,?5,?6)",
11936            fparams![
11937                message_id,
11938                file_path_str.as_deref(),
11939                snip.start_line,
11940                snip.end_line,
11941                snip.language.as_deref(),
11942                snip.snippet_text.as_deref()
11943            ],
11944        )?;
11945    }
11946    Ok(())
11947}
11948
11949fn franken_existing_message_fingerprints(
11950    tx: &FrankenTransaction<'_>,
11951    conversation_id: i64,
11952) -> Result<HashSet<MessageMergeFingerprint>> {
11953    let rows = tx.query_params(
11954        "SELECT idx, role, author, created_at, content
11955         FROM messages
11956         WHERE conversation_id = ?1",
11957        fparams![conversation_id],
11958    )?;
11959    let mut fingerprints = HashSet::with_capacity(rows.len());
11960    for row in rows {
11961        let role: String = row.get_typed(1)?;
11962        let content: String = row.get_typed(4)?;
11963        fingerprints.insert(MessageMergeFingerprint {
11964            idx: row.get_typed(0)?,
11965            created_at: row.get_typed(3)?,
11966            role: role_from_str(&role),
11967            author: row.get_typed(2)?,
11968            content_hash: *blake3::hash(content.as_bytes()).as_bytes(),
11969        });
11970    }
11971    Ok(fingerprints)
11972}
11973
/// Fingerprints of messages already stored for a conversation, as produced by
/// `franken_existing_message_lookup` and its pending-cache wrapper.
struct ExistingMessageLookup {
    /// Stored-message merge fingerprints keyed by message `idx`.
    by_idx: HashMap<i64, MessageMergeFingerprint>,
    /// Replay fingerprints of stored messages (created_at, role, author and
    /// content hash — no `idx`).
    replay: HashSet<MessageReplayFingerprint>,
}
11978
/// Build the fingerprint lookup of already-stored messages that is relevant
/// for merging `incoming_messages` into `conversation_id`.
///
/// Strategy, in order:
///
/// 1. Empty input returns an empty lookup without touching the database.
/// 2. Exact-idx fast path: each incoming message is probed by
///    `(conversation_id, idx)`. If every probe finds a row whose merge
///    fingerprint equals the incoming message's, the lookup is built from just
///    those rows and returned.
/// 3. Otherwise a fallback query runs:
///    * if any incoming message lacks `created_at`, all messages of the
///      conversation are loaded;
///    * else two bounded queries are issued — one over the incoming idx range,
///      one over the incoming `created_at` range — and their rows combined.
///
/// In the fallback result, `by_idx` only contains rows whose idx lies within
/// the incoming `[min_idx, max_idx]` range. `replay` contains every loaded row
/// in full-scan mode, and only rows whose `created_at` lies within the
/// incoming bounds in bounded mode.
fn franken_existing_message_lookup(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    incoming_messages: &[Message],
) -> Result<ExistingMessageLookup> {
    if incoming_messages.is_empty() {
        return Ok(ExistingMessageLookup {
            by_idx: HashMap::new(),
            replay: HashSet::new(),
        });
    }

    // Inclusive idx range covered by the incoming messages. The `unwrap_or`
    // defaults cannot trigger here because the slice is non-empty.
    let min_idx = incoming_messages
        .iter()
        .map(|msg| msg.idx)
        .min()
        .unwrap_or(0);
    let max_idx = incoming_messages
        .iter()
        .map(|msg| msg.idx)
        .max()
        .unwrap_or(min_idx);
    // A message without a timestamp cannot be covered by a created_at range
    // query, so the fallback must load the whole conversation.
    let requires_full_scan = incoming_messages.iter().any(|msg| msg.created_at.is_none());
    // Inclusive (min, max) of the incoming timestamps; `None` when no incoming
    // message carries one.
    let created_bounds = incoming_messages
        .iter()
        .filter_map(|msg| msg.created_at)
        .fold(None, |bounds: Option<(i64, i64)>, created_at| {
            Some(match bounds {
                Some((min_created_at, max_created_at)) => (
                    min_created_at.min(created_at),
                    max_created_at.max(created_at),
                ),
                None => (created_at, created_at),
            })
        });

    // Fast path: one point lookup per incoming message. The first missing row
    // or fingerprint mismatch abandons this path and discards what was
    // collected so far.
    let mut indexed_by_idx = HashMap::with_capacity(incoming_messages.len());
    let mut indexed_replay = HashSet::with_capacity(incoming_messages.len());
    let mut exact_idx_match = true;
    for msg in incoming_messages {
        record_message_lookup_exact_idx_probe();
        let Some((role, author, created_at, content)) = tx
            .query_row_map(
                "SELECT role, author, created_at, content
                 FROM messages INDEXED BY sqlite_autoindex_messages_1
                 WHERE conversation_id = ?1 AND idx = ?2
                 LIMIT 1",
                fparams![conversation_id, msg.idx],
                |row| {
                    Ok((
                        row.get_typed::<String>(0)?,
                        row.get_typed::<Option<String>>(1)?,
                        row.get_typed::<Option<i64>>(2)?,
                        row.get_typed::<String>(3)?,
                    ))
                },
            )
            .optional()?
        else {
            // No stored row at this idx.
            exact_idx_match = false;
            break;
        };
        let role = role_from_str(&role);
        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
        let fingerprint = MessageMergeFingerprint {
            idx: msg.idx,
            created_at,
            role: role.clone(),
            author: author.clone(),
            content_hash,
        };
        if fingerprint != message_merge_fingerprint(msg) {
            // A row exists at this idx but differs from the incoming message.
            exact_idx_match = false;
            break;
        }
        indexed_by_idx.insert(msg.idx, fingerprint);
        indexed_replay.insert(MessageReplayFingerprint {
            created_at,
            role,
            author,
            content_hash,
        });
    }

    if exact_idx_match {
        return Ok(ExistingMessageLookup {
            by_idx: indexed_by_idx,
            replay: indexed_replay,
        });
    }

    // Fallback: load candidate rows. `replay_full_scan` records whether the
    // rows are the entire conversation (true) or the bounded subset (false).
    let (rows, replay_full_scan) = if requires_full_scan {
        let rows = tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1",
            fparams![conversation_id],
        )?;
        record_message_lookup_full_scan_query(rows.len());
        (rows, true)
    } else if let Some((min_created_at, max_created_at)) = created_bounds {
        // Two bounded queries; a row may be returned by both, which is
        // harmless because the results are folded into a map and a set below.
        let mut rows = tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1
               AND idx >= ?2
               AND idx <= ?3",
            fparams![conversation_id, min_idx, max_idx],
        )?;
        rows.extend(tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1
               AND created_at IS NOT NULL
               AND created_at >= ?2
               AND created_at <= ?3",
            fparams![conversation_id, min_created_at, max_created_at],
        )?);
        record_message_lookup_bounded_queries(2, rows.len());
        (rows, false)
    } else {
        // NOTE(review): with non-empty input and `requires_full_scan == false`
        // every message has a timestamp, so `created_bounds` is `Some` and
        // this branch appears unreachable; it reads as a defensive fallback.
        let rows = tx.query_params(
            "SELECT idx, role, author, created_at, content
             FROM messages INDEXED BY sqlite_autoindex_messages_1
             WHERE conversation_id = ?1",
            fparams![conversation_id],
        )?;
        record_message_lookup_full_scan_query(rows.len());
        (rows, true)
    };

    let mut by_idx = HashMap::with_capacity(rows.len());
    let mut replay = HashSet::with_capacity(rows.len());
    for row in rows {
        let idx: i64 = row.get_typed(0)?;
        let role: String = row.get_typed(1)?;
        let author: Option<String> = row.get_typed(2)?;
        let created_at: Option<i64> = row.get_typed(3)?;
        let content: String = row.get_typed(4)?;
        let role = role_from_str(&role);
        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();

        // Only rows inside the incoming idx range are indexed by idx.
        if idx >= min_idx && idx <= max_idx {
            by_idx.insert(
                idx,
                MessageMergeFingerprint {
                    idx,
                    created_at,
                    role: role.clone(),
                    author: author.clone(),
                    content_hash,
                },
            );
        }

        // In bounded mode, rows fetched by the idx query whose timestamp is
        // missing or outside the incoming bounds are kept out of the replay set.
        let replay_matches = if replay_full_scan {
            true
        } else if let Some((min_created_at, max_created_at)) = created_bounds {
            created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
        } else {
            true
        };
        if replay_matches {
            replay.insert(MessageReplayFingerprint {
                created_at,
                role,
                author,
                content_hash,
            });
        }
    }

    Ok(ExistingMessageLookup { by_idx, replay })
}
12153
12154fn franken_existing_message_lookup_with_pending(
12155    tx: &FrankenTransaction<'_>,
12156    conversation_id: i64,
12157    incoming_messages: &[Message],
12158    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12159    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12160) -> Result<ExistingMessageLookup> {
12161    if let (Some(by_idx), Some(replay)) = (
12162        pending_message_fingerprints.get(&conversation_id),
12163        pending_message_replay_fingerprints.get(&conversation_id),
12164    ) {
12165        if incoming_messages.iter().all(|msg| {
12166            by_idx.contains_key(&msg.idx) || replay.contains(&message_replay_fingerprint(msg))
12167        }) {
12168            return Ok(ExistingMessageLookup {
12169                by_idx: by_idx.clone(),
12170                replay: replay.clone(),
12171            });
12172        }
12173
12174        let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12175        let mut merged_by_idx = by_idx.clone();
12176        let mut merged_replay = replay.clone();
12177        merged_by_idx.extend(fresh.by_idx);
12178        merged_replay.extend(fresh.replay);
12179        pending_message_fingerprints.insert(conversation_id, merged_by_idx.clone());
12180        pending_message_replay_fingerprints.insert(conversation_id, merged_replay.clone());
12181        return Ok(ExistingMessageLookup {
12182            by_idx: merged_by_idx,
12183            replay: merged_replay,
12184        });
12185    }
12186
12187    let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12188    pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
12189    pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
12190    Ok(lookup)
12191}
12192
12193/// Batch insert FTS5 entries within a frankensqlite transaction.
12194fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
12195    if entries.is_empty() {
12196        return Ok(0);
12197    }
12198
12199    let mut inserted = 0;
12200
12201    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12202        let placeholders: String = chunk
12203            .iter()
12204            .enumerate()
12205            .map(|(i, _)| {
12206                let base = i * 7 + 1; // +1 for 1-indexed params
12207                format!(
12208                    "(?{},?{},?{},?{},?{},?{},?{})",
12209                    base,
12210                    base + 1,
12211                    base + 2,
12212                    base + 3,
12213                    base + 4,
12214                    base + 5,
12215                    base + 6
12216                )
12217            })
12218            .collect::<Vec<_>>()
12219            .join(",");
12220
12221        let sql = format!(
12222            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12223        );
12224
12225        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12226        for entry in chunk {
12227            param_values.push(SqliteValue::from(entry.message_id));
12228            param_values.push(SqliteValue::from(entry.content.as_str()));
12229            param_values.push(SqliteValue::from(entry.title.as_str()));
12230            param_values.push(SqliteValue::from(entry.agent.as_str()));
12231            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12232            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12233            param_values.push(SqliteValue::from(entry.created_at));
12234        }
12235
12236        match tx.execute_with_params(&sql, &param_values) {
12237            Ok(_) => {
12238                inserted += chunk.len();
12239            }
12240            Err(err) => {
12241                tracing::warn!(
12242                    error = %err,
12243                    chunk_docs = chunk.len(),
12244                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
12245                );
12246                return Ok(inserted);
12247            }
12248        }
12249    }
12250
12251    Ok(inserted)
12252}
12253
12254fn franken_batch_insert_fts_on_connection(
12255    conn: &FrankenConnection,
12256    entries: &[FtsEntry],
12257) -> Result<usize> {
12258    if entries.is_empty() {
12259        return Ok(0);
12260    }
12261
12262    let mut inserted = 0;
12263
12264    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12265        let placeholders: String = chunk
12266            .iter()
12267            .enumerate()
12268            .map(|(i, _)| {
12269                let base = i * 7 + 1;
12270                format!(
12271                    "(?{},?{},?{},?{},?{},?{},?{})",
12272                    base,
12273                    base + 1,
12274                    base + 2,
12275                    base + 3,
12276                    base + 4,
12277                    base + 5,
12278                    base + 6
12279                )
12280            })
12281            .collect::<Vec<_>>()
12282            .join(",");
12283
12284        let sql = format!(
12285            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12286        );
12287
12288        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12289        for entry in chunk {
12290            param_values.push(SqliteValue::from(entry.message_id));
12291            param_values.push(SqliteValue::from(entry.content.as_str()));
12292            param_values.push(SqliteValue::from(entry.title.as_str()));
12293            param_values.push(SqliteValue::from(entry.agent.as_str()));
12294            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12295            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12296            param_values.push(SqliteValue::from(entry.created_at));
12297        }
12298
12299        conn.execute_with_params(&sql, &param_values)
12300            .with_context(|| {
12301                format!(
12302                    "inserting {} rows into fts_messages during streaming FTS maintenance",
12303                    chunk.len()
12304                )
12305            })?;
12306        inserted += chunk.len();
12307    }
12308
12309    Ok(inserted)
12310}
12311
12312/// Update daily stats within a frankensqlite transaction.
12313fn franken_update_daily_stats_in_tx(
12314    storage: &FrankenStorage,
12315    tx: &FrankenTransaction<'_>,
12316    agent_slug: &str,
12317    source_id: &str,
12318    started_at: Option<i64>,
12319    delta: StatsDelta,
12320) -> Result<()> {
12321    let day_id = started_at
12322        .map(FrankenStorage::day_id_from_millis)
12323        .unwrap_or(0);
12324    let now = FrankenStorage::now_millis();
12325
12326    let targets = [
12327        DailyStatsTarget {
12328            day_id,
12329            agent_slug,
12330            source_id,
12331        },
12332        DailyStatsTarget {
12333            day_id,
12334            agent_slug: "all",
12335            source_id,
12336        },
12337        DailyStatsTarget {
12338            day_id,
12339            agent_slug,
12340            source_id: "all",
12341        },
12342        DailyStatsTarget {
12343            day_id,
12344            agent_slug: "all",
12345            source_id: "all",
12346        },
12347    ];
12348
12349    if agent_slug != "all"
12350        && source_id != "all"
12351        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
12352    {
12353        return Ok(());
12354    }
12355
12356    for target in targets {
12357        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
12358    }
12359
12360    Ok(())
12361}
12362
/// Identifies one `daily_stats` row by its `(day_id, agent_slug, source_id)` key.
#[derive(Copy, Clone)]
struct DailyStatsTarget<'a> {
    /// Day bucket of the row.
    day_id: i64,
    /// Agent slug of the row; the literal `"all"` is used for aggregate rows.
    agent_slug: &'a str,
    /// Source id of the row; the literal `"all"` is used for aggregate rows.
    source_id: &'a str,
}
12369
12370fn franken_update_ensured_daily_stats_targets_in_tx(
12371    storage: &FrankenStorage,
12372    tx: &FrankenTransaction<'_>,
12373    targets: &[DailyStatsTarget<'_>; 4],
12374    now: i64,
12375    delta: StatsDelta,
12376) -> Result<bool> {
12377    let cache_keys = targets.map(|target| {
12378        EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id)
12379    });
12380    if !storage.daily_stats_keys_already_ensured(&cache_keys) {
12381        return Ok(false);
12382    }
12383
12384    let primary = targets[0];
12385    let rows_changed = tx.execute_compat(
12386        "UPDATE daily_stats
12387         SET session_count = session_count + ?4,
12388             message_count = message_count + ?5,
12389             total_chars = total_chars + ?6,
12390             last_updated = ?7
12391         WHERE day_id = ?1
12392           AND ((agent_slug = ?2 AND source_id = ?3)
12393                OR (agent_slug = 'all' AND source_id = ?3)
12394                OR (agent_slug = ?2 AND source_id = 'all')
12395                OR (agent_slug = 'all' AND source_id = 'all'))",
12396        fparams![
12397            primary.day_id,
12398            primary.agent_slug,
12399            primary.source_id,
12400            delta.session_count_delta,
12401            delta.message_count_delta,
12402            delta.total_chars_delta,
12403            now
12404        ],
12405    )?;
12406    if rows_changed == targets.len() {
12407        return Ok(true);
12408    }
12409
12410    for (target, cache_key) in targets.iter().copied().zip(cache_keys) {
12411        let exists = tx
12412            .query_row_map(
12413                "SELECT 1 FROM daily_stats
12414                 WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
12415                 LIMIT 1",
12416                fparams![target.day_id, target.agent_slug, target.source_id],
12417                |row| row.get_typed::<i64>(0),
12418            )
12419            .optional()?
12420            .is_some();
12421        if exists {
12422            continue;
12423        }
12424
12425        tx.execute_compat(
12426            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12427             VALUES(?1,?2,?3,?4,?5,?6,?7)",
12428            fparams![
12429                target.day_id,
12430                target.agent_slug,
12431                target.source_id,
12432                delta.session_count_delta,
12433                delta.message_count_delta,
12434                delta.total_chars_delta,
12435                now
12436            ],
12437        )?;
12438        storage.mark_daily_stats_key_ensured(cache_key);
12439    }
12440
12441    Ok(true)
12442}
12443
12444fn franken_apply_daily_stats_delta_in_tx(
12445    storage: &FrankenStorage,
12446    tx: &FrankenTransaction<'_>,
12447    target: DailyStatsTarget<'_>,
12448    now: i64,
12449    delta: StatsDelta,
12450) -> Result<()> {
12451    let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
12452    if storage.daily_stats_key_already_ensured(&cache_key) {
12453        let rows_changed = tx.execute_compat(
12454            "UPDATE daily_stats
12455             SET session_count = session_count + ?4,
12456                 message_count = message_count + ?5,
12457                 total_chars = total_chars + ?6,
12458                 last_updated = ?7
12459             WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
12460            fparams![
12461                target.day_id,
12462                target.agent_slug,
12463                target.source_id,
12464                delta.session_count_delta,
12465                delta.message_count_delta,
12466                delta.total_chars_delta,
12467                now
12468            ],
12469        )?;
12470        if rows_changed > 0 {
12471            return Ok(());
12472        }
12473    }
12474
12475    tx.execute_compat(
12476        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12477         VALUES(?1,?2,?3,?4,?5,?6,?7)
12478         ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12479            session_count = session_count + excluded.session_count,
12480            message_count = message_count + excluded.message_count,
12481            total_chars = total_chars + excluded.total_chars,
12482            last_updated = excluded.last_updated",
12483        fparams![
12484            target.day_id,
12485            target.agent_slug,
12486            target.source_id,
12487            delta.session_count_delta,
12488            delta.message_count_delta,
12489            delta.total_chars_delta,
12490            now
12491        ],
12492    )?;
12493    storage.mark_daily_stats_key_ensured(cache_key);
12494    Ok(())
12495}
12496
12497// -------------------------------------------------------------------------
12498// Frankensqlite batch helpers
12499// -------------------------------------------------------------------------
12500
12501/// Batch upsert daily_stats within a frankensqlite transaction.
12502fn franken_update_daily_stats_batched_in_tx(
12503    tx: &FrankenTransaction<'_>,
12504    entries: &[(i64, String, String, StatsDelta)],
12505) -> Result<usize> {
12506    if entries.is_empty() {
12507        return Ok(0);
12508    }
12509
12510    let now = FrankenStorage::now_millis();
12511    let mut total_affected = 0;
12512
12513    // Keep frankensqlite UPSERTs row-wise inside the transaction. The
12514    // multi-row VALUES ... ON CONFLICT form still falls back through
12515    // INSERT...SELECT in fsqlite-core, which rejects UPSERT/RETURNING during
12516    // real cass indexing.
12517    for (day_id, agent, source, delta) in entries {
12518        total_affected += tx.execute_compat(
12519            "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12520             VALUES(?1,?2,?3,?4,?5,?6,?7)
12521             ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12522                 session_count = session_count + excluded.session_count,
12523                 message_count = message_count + excluded.message_count,
12524                 total_chars = total_chars + excluded.total_chars,
12525                 last_updated = excluded.last_updated",
12526            fparams![
12527                *day_id,
12528                agent.as_str(),
12529                source.as_str(),
12530                delta.session_count_delta,
12531                delta.message_count_delta,
12532                delta.total_chars_delta,
12533                now
12534            ],
12535        )?;
12536    }
12537
12538    Ok(total_affected)
12539}
12540
12541/// Batch insert token_usage rows within a frankensqlite transaction.
12542///
12543/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12544/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12545/// UPSERT/OR IGNORE conflict clauses.
12546fn franken_insert_token_usage_batched_in_tx(
12547    tx: &FrankenTransaction<'_>,
12548    entries: &[TokenUsageEntry],
12549) -> Result<usize> {
12550    if entries.is_empty() {
12551        return Ok(0);
12552    }
12553
12554    let mut total_inserted = 0;
12555
12556    for e in entries {
12557        let params_vec: Vec<ParamValue> = vec![
12558            ParamValue::from(e.message_id),
12559            ParamValue::from(e.conversation_id),
12560            ParamValue::from(e.agent_id),
12561            ParamValue::from(e.workspace_id),
12562            ParamValue::from(e.source_id.clone()),
12563            ParamValue::from(e.timestamp_ms),
12564            ParamValue::from(e.day_id),
12565            ParamValue::from(e.model_name.clone()),
12566            ParamValue::from(e.model_family.clone()),
12567            ParamValue::from(e.model_tier.clone()),
12568            ParamValue::from(e.service_tier.clone()),
12569            ParamValue::from(e.provider.clone()),
12570            ParamValue::from(e.input_tokens),
12571            ParamValue::from(e.output_tokens),
12572            ParamValue::from(e.cache_read_tokens),
12573            ParamValue::from(e.cache_creation_tokens),
12574            ParamValue::from(e.thinking_tokens),
12575            ParamValue::from(e.total_tokens),
12576            ParamValue::from(e.estimated_cost_usd),
12577            ParamValue::from(e.role.clone()),
12578            ParamValue::from(e.content_chars),
12579            ParamValue::from(e.has_tool_calls as i64),
12580            ParamValue::from(e.tool_call_count as i64),
12581            ParamValue::from(e.data_source.clone()),
12582        ];
12583
12584        let values = param_slice_to_values(&params_vec);
12585        total_inserted += tx.execute_with_params(
12586            "INSERT OR IGNORE INTO token_usage (
12587                message_id, conversation_id, agent_id, workspace_id, source_id,
12588                timestamp_ms, day_id,
12589                model_name, model_family, model_tier, service_tier, provider,
12590                input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
12591                thinking_tokens, total_tokens, estimated_cost_usd,
12592                role, content_chars, has_tool_calls, tool_call_count, data_source
12593            )
12594            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12595            &values,
12596        )?;
12597    }
12598
12599    Ok(total_inserted)
12600}
12601
12602/// Batch upsert token_daily_stats within a frankensqlite transaction.
12603fn franken_update_token_daily_stats_batched_in_tx(
12604    tx: &FrankenTransaction<'_>,
12605    entries: &[(i64, String, String, String, TokenStatsDelta)],
12606) -> Result<usize> {
12607    if entries.is_empty() {
12608        return Ok(0);
12609    }
12610
12611    let now = FrankenStorage::now_millis();
12612    let mut total_affected = 0;
12613
12614    for (day_id, agent, source, model, delta) in entries {
12615        total_affected += tx.execute_compat(
12616            "INSERT INTO token_daily_stats (
12617                day_id, agent_slug, source_id, model_family,
12618                api_call_count, user_message_count, assistant_message_count, tool_message_count,
12619                total_input_tokens, total_output_tokens, total_cache_read_tokens,
12620                total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
12621                total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
12622                last_updated
12623            )
12624            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
12625            ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
12626                api_call_count = api_call_count + excluded.api_call_count,
12627                user_message_count = user_message_count + excluded.user_message_count,
12628                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12629                tool_message_count = tool_message_count + excluded.tool_message_count,
12630                total_input_tokens = total_input_tokens + excluded.total_input_tokens,
12631                total_output_tokens = total_output_tokens + excluded.total_output_tokens,
12632                total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
12633                total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
12634                total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
12635                grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
12636                total_content_chars = total_content_chars + excluded.total_content_chars,
12637                total_tool_calls = total_tool_calls + excluded.total_tool_calls,
12638                estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
12639                session_count = session_count + excluded.session_count,
12640                last_updated = excluded.last_updated",
12641            fparams![
12642                *day_id,
12643                agent.as_str(),
12644                source.as_str(),
12645                model.as_str(),
12646                delta.api_call_count,
12647                delta.user_message_count,
12648                delta.assistant_message_count,
12649                delta.tool_message_count,
12650                delta.total_input_tokens,
12651                delta.total_output_tokens,
12652                delta.total_cache_read_tokens,
12653                delta.total_cache_creation_tokens,
12654                delta.total_thinking_tokens,
12655                delta.grand_total_tokens,
12656                delta.total_content_chars,
12657                delta.total_tool_calls,
12658                delta.estimated_cost_usd,
12659                delta.session_count,
12660                now
12661            ],
12662        )?;
12663    }
12664
12665    Ok(total_affected)
12666}
12667
12668/// Batch insert message_metrics rows within a frankensqlite transaction.
12669///
12670/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12671/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12672/// UPSERT/OR IGNORE conflict clauses.
12673fn franken_insert_message_metrics_batched_in_tx(
12674    tx: &FrankenTransaction<'_>,
12675    entries: &[MessageMetricsEntry],
12676) -> Result<usize> {
12677    if entries.is_empty() {
12678        return Ok(0);
12679    }
12680
12681    let mut total_inserted = 0;
12682
12683    for e in entries {
12684        let params_vec: Vec<ParamValue> = vec![
12685            ParamValue::from(e.message_id),
12686            ParamValue::from(e.created_at_ms),
12687            ParamValue::from(e.hour_id),
12688            ParamValue::from(e.day_id),
12689            ParamValue::from(e.agent_slug.clone()),
12690            ParamValue::from(e.workspace_id),
12691            ParamValue::from(e.source_id.clone()),
12692            ParamValue::from(e.role.clone()),
12693            ParamValue::from(e.content_chars),
12694            ParamValue::from(e.content_tokens_est),
12695            ParamValue::from(e.model_name.clone()),
12696            ParamValue::from(e.model_family.clone()),
12697            ParamValue::from(e.model_tier.clone()),
12698            ParamValue::from(e.provider.clone()),
12699            ParamValue::from(e.api_input_tokens),
12700            ParamValue::from(e.api_output_tokens),
12701            ParamValue::from(e.api_cache_read_tokens),
12702            ParamValue::from(e.api_cache_creation_tokens),
12703            ParamValue::from(e.api_thinking_tokens),
12704            ParamValue::from(e.api_service_tier.clone()),
12705            ParamValue::from(e.api_data_source.clone()),
12706            ParamValue::from(e.tool_call_count),
12707            ParamValue::from(e.has_tool_calls as i64),
12708            ParamValue::from(e.has_plan as i64),
12709        ];
12710
12711        let values = param_slice_to_values(&params_vec);
12712        total_inserted += tx.execute_with_params(
12713            "INSERT OR IGNORE INTO message_metrics (
12714                message_id, created_at_ms, hour_id, day_id,
12715                agent_slug, workspace_id, source_id, role,
12716                content_chars, content_tokens_est,
12717                model_name, model_family, model_tier, provider,
12718                api_input_tokens, api_output_tokens, api_cache_read_tokens,
12719                api_cache_creation_tokens, api_thinking_tokens,
12720                api_service_tier, api_data_source,
12721                tool_call_count, has_tool_calls, has_plan
12722            )
12723            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12724            &values,
12725        )?;
12726    }
12727
12728    Ok(total_inserted)
12729}
12730
12731/// Flush one rollup table (shared logic for hourly + daily) within a frankensqlite transaction.
12732fn franken_flush_rollup_table(
12733    tx: &FrankenTransaction<'_>,
12734    table: &str,
12735    bucket_col: &str,
12736    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
12737    now: i64,
12738) -> Result<usize> {
12739    if deltas.is_empty() {
12740        return Ok(0);
12741    }
12742
12743    let mut total_affected = 0;
12744
12745    for ((bucket_id, agent, workspace_id, source), d) in deltas {
12746        let sql = format!(
12747            "INSERT INTO {table} (
12748                {bucket_col}, agent_slug, workspace_id, source_id,
12749                message_count, user_message_count, assistant_message_count,
12750                tool_call_count, plan_message_count, plan_content_tokens_est_total,
12751                plan_api_tokens_total, api_coverage_message_count,
12752                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
12753                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
12754                api_cache_read_tokens_total, api_cache_creation_tokens_total,
12755                api_thinking_tokens_total, last_updated
12756            )
12757            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
12758            ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
12759                message_count = message_count + excluded.message_count,
12760                user_message_count = user_message_count + excluded.user_message_count,
12761                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12762                tool_call_count = tool_call_count + excluded.tool_call_count,
12763                plan_message_count = plan_message_count + excluded.plan_message_count,
12764                plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
12765                plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
12766                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
12767                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
12768                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
12769                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
12770                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
12771                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
12772                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
12773                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
12774                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
12775                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
12776                last_updated = excluded.last_updated"
12777        );
12778
12779        total_affected += tx.execute_compat(
12780            &sql,
12781            fparams![
12782                *bucket_id,
12783                agent.as_str(),
12784                *workspace_id,
12785                source.as_str(),
12786                d.message_count,
12787                d.user_message_count,
12788                d.assistant_message_count,
12789                d.tool_call_count,
12790                d.plan_message_count,
12791                d.plan_content_tokens_est_total,
12792                d.plan_api_tokens_total,
12793                d.api_coverage_message_count,
12794                d.content_tokens_est_total,
12795                d.content_tokens_est_user,
12796                d.content_tokens_est_assistant,
12797                d.api_tokens_total,
12798                d.api_input_tokens_total,
12799                d.api_output_tokens_total,
12800                d.api_cache_read_tokens_total,
12801                d.api_cache_creation_tokens_total,
12802                d.api_thinking_tokens_total,
12803                now
12804            ],
12805        )?;
12806    }
12807
12808    Ok(total_affected)
12809}
12810
12811/// Flush usage_models_daily rollup within a frankensqlite transaction.
12812fn franken_flush_model_daily_rollup_table(
12813    tx: &FrankenTransaction<'_>,
12814    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
12815    now: i64,
12816) -> Result<usize> {
12817    if deltas.is_empty() {
12818        return Ok(0);
12819    }
12820
12821    let mut total_affected = 0;
12822
12823    for ((day_id, agent, workspace_id, source, model_family, model_tier), d) in deltas {
12824        total_affected += tx.execute_compat(
12825            "INSERT INTO usage_models_daily (
12826                day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
12827                message_count, user_message_count, assistant_message_count,
12828                tool_call_count, plan_message_count, api_coverage_message_count,
12829                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
12830                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
12831                api_cache_read_tokens_total, api_cache_creation_tokens_total,
12832                api_thinking_tokens_total, last_updated
12833            )
12834            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
12835            ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
12836                message_count = message_count + excluded.message_count,
12837                user_message_count = user_message_count + excluded.user_message_count,
12838                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12839                tool_call_count = tool_call_count + excluded.tool_call_count,
12840                plan_message_count = plan_message_count + excluded.plan_message_count,
12841                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
12842                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
12843                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
12844                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
12845                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
12846                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
12847                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
12848                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
12849                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
12850                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
12851                last_updated = excluded.last_updated",
12852            fparams![
12853                *day_id,
12854                agent.as_str(),
12855                *workspace_id,
12856                source.as_str(),
12857                model_family.as_str(),
12858                model_tier.as_str(),
12859                d.message_count,
12860                d.user_message_count,
12861                d.assistant_message_count,
12862                d.tool_call_count,
12863                d.plan_message_count,
12864                d.api_coverage_message_count,
12865                d.content_tokens_est_total,
12866                d.content_tokens_est_user,
12867                d.content_tokens_est_assistant,
12868                d.api_tokens_total,
12869                d.api_input_tokens_total,
12870                d.api_output_tokens_total,
12871                d.api_cache_read_tokens_total,
12872                d.api_cache_creation_tokens_total,
12873                d.api_thinking_tokens_total,
12874                now
12875            ],
12876        )?;
12877    }
12878
12879    Ok(total_affected)
12880}
12881
12882/// Flush AnalyticsRollupAggregator deltas via frankensqlite transaction.
12883fn franken_flush_analytics_rollups_in_tx(
12884    tx: &FrankenTransaction<'_>,
12885    agg: &AnalyticsRollupAggregator,
12886) -> Result<(usize, usize, usize)> {
12887    let now = FrankenStorage::now_millis();
12888
12889    let hourly_affected =
12890        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?;
12891    let daily_affected = franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?;
12892    let models_daily_affected = franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?;
12893
12894    Ok((hourly_affected, daily_affected, models_daily_affected))
12895}
12896
12897/// Update conversation-level token summary columns via frankensqlite transaction.
12898fn franken_update_conversation_token_summaries_in_tx(
12899    tx: &FrankenTransaction<'_>,
12900    conversation_id: i64,
12901) -> Result<()> {
12902    tx.execute_compat(
12903        "UPDATE conversations SET
12904            total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
12905            total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
12906            total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
12907            total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
12908            grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
12909            estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
12910            primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
12911                             AND model_name IS NOT NULL
12912                             GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
12913            api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12914                              AND data_source = 'api'),
12915            tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
12916            user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12917                                  AND role = 'user'),
12918            assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12919                                       AND role IN ('assistant', 'agent'))
12920         WHERE id = ?1",
12921        fparams![conversation_id],
12922    )?;
12923    Ok(())
12924}
12925
12926impl FrankenStorage {
12927    /// Rebuild token_daily_stats from the token_usage ledger.
12928    pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
12929        const CONVERSATION_BATCH_SIZE: usize = 1_000;
12930        const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;
12931
12932        let total_usage_rows: i64 =
12933            self.conn
12934                .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
12935                    row.get_typed(0)
12936                })?;
12937        tracing::info!(
12938            target: "cass::analytics",
12939            total_usage_rows,
12940            "token_daily_stats_rebuild_start"
12941        );
12942
12943        let mut tx = self.conn.transaction()?;
12944        tx.execute("DELETE FROM token_daily_stats")?;
12945
12946        let mut last_conversation_id = 0_i64;
12947        let mut rows_created = 0_usize;
12948
12949        loop {
12950            let conversation_rows = tx.query_map_collect(
12951                "SELECT c.id, c.started_at, c.source_id,
12952                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
12953                 FROM conversations c
12954                 WHERE c.id > ?1
12955                 ORDER BY c.id
12956                 LIMIT ?2",
12957                fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
12958                |row| {
12959                    Ok((
12960                        row.get_typed::<i64>(0)?,
12961                        row.get_typed::<Option<i64>>(1)?,
12962                        row.get_typed::<String>(2)?,
12963                        row.get_typed::<String>(3)?,
12964                    ))
12965                },
12966            )?;
12967            if conversation_rows.is_empty() {
12968                break;
12969            }
12970
12971            let mut aggregate = TokenStatsAggregator::new();
12972
12973            for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
12974                last_conversation_id = conversation_id;
12975                let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
12976                let mut last_token_usage_id = 0_i64;
12977                let mut session_model_family = String::from("unknown");
12978
12979                loop {
12980                    let usage_rows = tx.query_map_collect(
12981                        "SELECT id, day_id, role,
12982                                COALESCE(model_family, 'unknown'),
12983                                input_tokens, output_tokens, cache_read_tokens,
12984                                cache_creation_tokens, thinking_tokens,
12985                                has_tool_calls, tool_call_count,
12986                                content_chars, estimated_cost_usd
12987                         FROM token_usage
12988                         WHERE conversation_id = ?1
12989                           AND id > ?2
12990                         ORDER BY id
12991                         LIMIT ?3",
12992                        fparams![
12993                            conversation_id,
12994                            last_token_usage_id,
12995                            TOKEN_USAGE_BATCH_SIZE as i64
12996                        ],
12997                        |row| {
12998                            Ok((
12999                                row.get_typed::<i64>(0)?,
13000                                row.get_typed::<i64>(1)?,
13001                                row.get_typed::<String>(2)?,
13002                                row.get_typed::<String>(3)?,
13003                                row.get_typed::<Option<i64>>(4)?,
13004                                row.get_typed::<Option<i64>>(5)?,
13005                                row.get_typed::<Option<i64>>(6)?,
13006                                row.get_typed::<Option<i64>>(7)?,
13007                                row.get_typed::<Option<i64>>(8)?,
13008                                row.get_typed::<i64>(9)?,
13009                                row.get_typed::<i64>(10)?,
13010                                row.get_typed::<i64>(11)?,
13011                                row.get_typed::<Option<f64>>(12)?,
13012                            ))
13013                        },
13014                    )?;
13015                    if usage_rows.is_empty() {
13016                        break;
13017                    }
13018
13019                    for (
13020                        token_usage_id,
13021                        day_id,
13022                        role,
13023                        model_family,
13024                        input_tokens,
13025                        output_tokens,
13026                        cache_read_tokens,
13027                        cache_creation_tokens,
13028                        thinking_tokens,
13029                        has_tool_calls,
13030                        tool_call_count,
13031                        content_chars,
13032                        estimated_cost_usd,
13033                    ) in usage_rows
13034                    {
13035                        last_token_usage_id = token_usage_id;
13036                        if model_family != "unknown" {
13037                            session_model_family = model_family.clone();
13038                        }
13039                        let usage = crate::connectors::ExtractedTokenUsage {
13040                            model_name: None,
13041                            provider: None,
13042                            input_tokens,
13043                            output_tokens,
13044                            cache_read_tokens,
13045                            cache_creation_tokens,
13046                            thinking_tokens,
13047                            service_tier: None,
13048                            has_tool_calls: has_tool_calls != 0,
13049                            tool_call_count: u32::try_from(tool_call_count.max(0)).unwrap_or(0),
13050                            data_source: franken_agent_detection::TokenDataSource::Api,
13051                        };
13052                        aggregate.record(
13053                            &agent_slug,
13054                            &source_id,
13055                            day_id,
13056                            &model_family,
13057                            &role,
13058                            &usage,
13059                            content_chars,
13060                            estimated_cost_usd.unwrap_or(0.0),
13061                        );
13062                    }
13063                }
13064
13065                aggregate.record_session(
13066                    &agent_slug,
13067                    &source_id,
13068                    conversation_day_id,
13069                    &session_model_family,
13070                );
13071            }
13072
13073            let entries = aggregate.expand();
13074            rows_created = rows_created.saturating_add(entries.len());
13075            franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
13076        }
13077
13078        tx.commit()?;
13079
13080        tracing::info!(
13081            target: "cass::analytics",
13082            rows_created,
13083            "token_daily_stats_rebuild_complete"
13084        );
13085
13086        Ok(rows_created)
13087    }
13088
13089    /// Rebuild analytics tables (message_metrics + rollups) from existing
13090    /// messages in the database. Does NOT re-parse raw agent session files.
13091    pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
13092        let start = Instant::now();
13093
13094        let total_messages: i64 =
13095            self.conn
13096                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13097                    row.get_typed(0)
13098                })?;
13099        tracing::info!(
13100            target: "cass::analytics",
13101            total_messages,
13102            "analytics_rebuild_start"
13103        );
13104
13105        let mut tx = self.conn.transaction()?;
13106
13107        tx.execute("DELETE FROM message_metrics")?;
13108        tx.execute("DELETE FROM usage_hourly")?;
13109        tx.execute("DELETE FROM usage_daily")?;
13110        tx.execute("DELETE FROM usage_models_daily")?;
13111
13112        const CHUNK_SIZE: i64 = 10_000;
13113        let mut offset: i64 = 0;
13114        let mut total_inserted: usize = 0;
13115        let mut usage_hourly_rows: usize = 0;
13116        let mut usage_daily_rows: usize = 0;
13117        let mut usage_models_daily_rows: usize = 0;
13118
13119        loop {
13120            #[allow(clippy::type_complexity)]
13121            let rows: Vec<(
13122                i64,
13123                String,
13124                String,
13125                Option<serde_json::Value>,
13126                Option<i64>,
13127                Option<i64>,
13128                String,
13129                Option<i64>,
13130                String,
13131            )> = tx.query_map_collect(
13132                // Avoid the 3-table JOIN with LIMIT/OFFSET that triggers
13133                // frankensqlite's materialization fallback (see 860acb12).
13134                // Inline the agent slug lookup as a correlated subquery and
13135                // fall back to 'unknown' for NULL agent_id, matching the
13136                // FTS / lexical rebuild paths.
13137                "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
13138                        m.created_at,
13139                        c.id AS conv_id, c.started_at AS conv_started_at,
13140                        c.source_id, c.workspace_id,
13141                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
13142                 FROM messages m
13143                 JOIN conversations c ON m.conversation_id = c.id
13144                 ORDER BY m.id
13145                 LIMIT ?1 OFFSET ?2",
13146                fparams![CHUNK_SIZE, offset],
13147                |row| {
13148                    let msg_id: i64 = row.get_typed(0)?;
13149                    let role: String = row.get_typed(2)?;
13150                    let content: String = row.get_typed(3)?;
13151                    let extra_json = row
13152                        .get_typed::<Option<String>>(4)?
13153                        .and_then(|s| serde_json::from_str(&s).ok())
13154                        .or_else(|| {
13155                            row.get_typed::<Option<Vec<u8>>>(5)
13156                                .ok()
13157                                .flatten()
13158                                .and_then(|b| rmp_serde::from_slice(&b).ok())
13159                        });
13160                    let msg_ts: Option<i64> = row.get_typed(6)?;
13161                    let conv_started_at: Option<i64> = row.get_typed(8)?;
13162                    let source_id: String = row.get_typed(9)?;
13163                    let workspace_id: Option<i64> = row.get_typed(10)?;
13164                    let agent_slug: String = row.get_typed(11)?;
13165                    let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
13166
13167                    Ok((
13168                        msg_id,
13169                        role,
13170                        content,
13171                        extra_json,
13172                        Some(effective_ts),
13173                        workspace_id,
13174                        source_id,
13175                        conv_started_at,
13176                        agent_slug,
13177                    ))
13178                },
13179            )?;
13180
13181            if rows.is_empty() {
13182                break;
13183            }
13184
13185            let chunk_len = rows.len();
13186            let mut entries = Vec::with_capacity(chunk_len);
13187            let mut rollup_agg = AnalyticsRollupAggregator::new();
13188
13189            for (
13190                msg_id,
13191                role,
13192                content,
13193                extra_json,
13194                effective_ts,
13195                workspace_id,
13196                source_id,
13197                _conv_started_at,
13198                agent_slug,
13199            ) in &rows
13200            {
13201                let ts = effective_ts.unwrap_or(0);
13202                let day_id = Self::day_id_from_millis(ts);
13203                let hour_id = Self::hour_id_from_millis(ts);
13204                let content_chars = content.len() as i64;
13205                let content_tokens_est = content_chars / 4;
13206                let extra = extra_json
13207                    .as_ref()
13208                    .cloned()
13209                    .unwrap_or(serde_json::Value::Null);
13210                let usage =
13211                    crate::connectors::extract_tokens_for_agent(agent_slug, &extra, content, role);
13212                let model_info = usage
13213                    .model_name
13214                    .as_deref()
13215                    .map(crate::connectors::normalize_model);
13216                let model_family = model_info
13217                    .as_ref()
13218                    .map(|i| i.family.clone())
13219                    .unwrap_or_else(|| "unknown".into());
13220                let model_tier = model_info
13221                    .as_ref()
13222                    .map(|i| i.tier.clone())
13223                    .unwrap_or_else(|| "unknown".into());
13224                let provider = usage
13225                    .provider
13226                    .clone()
13227                    .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
13228                    .unwrap_or_else(|| "unknown".into());
13229
13230                let entry = MessageMetricsEntry {
13231                    message_id: *msg_id,
13232                    created_at_ms: ts,
13233                    hour_id,
13234                    day_id,
13235                    agent_slug: agent_slug.clone(),
13236                    workspace_id: workspace_id.unwrap_or(0),
13237                    source_id: source_id.clone(),
13238                    role: role.clone(),
13239                    content_chars,
13240                    content_tokens_est,
13241                    model_name: usage.model_name.clone(),
13242                    model_family,
13243                    model_tier,
13244                    provider,
13245                    api_input_tokens: usage.input_tokens,
13246                    api_output_tokens: usage.output_tokens,
13247                    api_cache_read_tokens: usage.cache_read_tokens,
13248                    api_cache_creation_tokens: usage.cache_creation_tokens,
13249                    api_thinking_tokens: usage.thinking_tokens,
13250                    api_service_tier: usage.service_tier,
13251                    api_data_source: usage.data_source.as_str().to_string(),
13252                    tool_call_count: usage.tool_call_count as i64,
13253                    has_tool_calls: usage.has_tool_calls,
13254                    has_plan: has_plan_for_role(role, content),
13255                };
13256                rollup_agg.record(&entry);
13257                entries.push(entry);
13258            }
13259
13260            total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
13261            let (hourly, daily, models_daily) =
13262                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
13263            usage_hourly_rows += hourly;
13264            usage_daily_rows += daily;
13265            usage_models_daily_rows += models_daily;
13266            offset += chunk_len as i64;
13267
13268            tracing::debug!(
13269                target: "cass::analytics",
13270                offset,
13271                chunk = chunk_len,
13272                inserted = entries.len(),
13273                total = total_inserted,
13274                "analytics_rebuild_chunk"
13275            );
13276
13277            if (chunk_len as i64) < CHUNK_SIZE {
13278                break;
13279            }
13280        }
13281
13282        tx.commit()?;
13283
13284        let elapsed = start.elapsed();
13285        let elapsed_ms = elapsed.as_millis() as u64;
13286        let msgs_per_sec = if elapsed_ms > 0 {
13287            (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
13288        } else {
13289            0.0
13290        };
13291
13292        tracing::info!(
13293            target: "cass::analytics",
13294            message_metrics_rows = total_inserted,
13295            usage_hourly_rows,
13296            usage_daily_rows,
13297            usage_models_daily_rows,
13298            elapsed_ms,
13299            messages_per_sec = format!("{:.0}", msgs_per_sec),
13300            "analytics_rebuild_complete"
13301        );
13302
13303        Ok(AnalyticsRebuildResult {
13304            message_metrics_rows: total_inserted,
13305            usage_hourly_rows,
13306            usage_daily_rows,
13307            usage_models_daily_rows,
13308            elapsed_ms,
13309            messages_per_sec: msgs_per_sec,
13310        })
13311    }
13312
13313    /// Rebuild all daily stats from scratch.
13314    pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
13315        const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
13316        const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;
13317
13318        let mut conversation_batch_size = rebuild_batch_size_env(
13319            "CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
13320            DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
13321        );
13322        let mut message_batch_size = rebuild_batch_size_env(
13323            "CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
13324            DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
13325        );
13326
13327        let total_messages: i64 =
13328            self.conn
13329                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13330                    row.get_typed(0)
13331                })?;
13332        let message_metrics_rows: i64 =
13333            self.conn
13334                .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
13335                    row.get_typed(0)
13336                })?;
13337        let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;
13338
13339        tracing::info!(
13340            target: "cass::perf::daily_stats",
13341            total_messages,
13342            message_metrics_rows,
13343            use_message_metrics,
13344            "daily_stats rebuild selected message source"
13345        );
13346
13347        let mut tx = self.conn.transaction()?;
13348        tx.execute("DELETE FROM daily_stats")?;
13349
13350        let mut last_conversation_id = 0_i64;
13351        let mut conversation_batch_count = 0_usize;
13352        let mut conversations_processed = 0_usize;
13353        let mut messages_processed = 0_usize;
13354        let mut message_batch_count = 0_usize;
13355        let mut raw_entries_flushed = 0_usize;
13356        let mut expanded_entries_flushed = 0_usize;
13357        let message_scan_sql = if use_message_metrics {
13358            "SELECT m.idx, mm.content_chars
13359             FROM messages m
13360             JOIN message_metrics mm ON mm.message_id = m.id
13361             WHERE m.conversation_id = ?1
13362               AND m.idx > ?2
13363             ORDER BY m.conversation_id, m.idx
13364             LIMIT ?3"
13365        } else {
13366            "SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
13367             FROM messages m
13368             WHERE m.conversation_id = ?1
13369               AND m.idx > ?2
13370             ORDER BY m.conversation_id, m.idx
13371             LIMIT ?3"
13372        };
13373
13374        loop {
13375            // Avoid the 2-table JOIN with LIMIT that triggers frankensqlite's
13376            // materialization fallback (which is what the OOM retry below is
13377            // defending against — see 860acb12).  Inline agent slug via
13378            // correlated subquery and degrade NULL agent_id to 'unknown' for
13379            // consistency with the lexical/FTS rebuild paths.
13380            let conversation_rows = match self.conn.query_with_params(
13381                "SELECT c.id, c.started_at,
13382                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
13383                        c.source_id
13384                 FROM conversations c
13385                 WHERE c.id > ?1
13386                 ORDER BY c.id
13387                 LIMIT ?2",
13388                &params_from_iter([
13389                    ParamValue::from(last_conversation_id),
13390                    ParamValue::from(conversation_batch_size as i64),
13391                ]),
13392            ) {
13393                Ok(rows) => rows,
13394                Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
13395                    let previous_batch_size = conversation_batch_size;
13396                    conversation_batch_size = (conversation_batch_size / 2).max(1);
13397                    tracing::warn!(
13398                        previous_batch_size,
13399                        conversation_batch_size,
13400                        last_conversation_id,
13401                        "daily_stats conversation scan ran out of memory; retrying with smaller batch"
13402                    );
13403                    continue;
13404                }
13405                Err(err) => return Err(err.into()),
13406            };
13407            if conversation_rows.is_empty() {
13408                break;
13409            }
13410
13411            let mut aggregate = StatsAggregator::new();
13412            let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
13413                Vec::with_capacity(conversation_rows.len());
13414            for row in &conversation_rows {
13415                let conversation_id: i64 = row.get_typed(0)?;
13416                let started_at: Option<i64> = row.get_typed(1)?;
13417                let agent_slug: String = row.get_typed(2)?;
13418                let source_id: String = row.get_typed(3)?;
13419                last_conversation_id = conversation_id;
13420                let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
13421                aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
13422                conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
13423                conversations_processed += 1;
13424            }
13425
13426            conversation_batch_count += 1;
13427            raw_entries_flushed += aggregate.raw_entry_count();
13428            let entries = aggregate.expand();
13429            expanded_entries_flushed += entries.len();
13430            if !entries.is_empty() {
13431                franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
13432            }
13433            if conversation_batch_count.is_multiple_of(25) {
13434                tracing::info!(
13435                    target: "cass::perf::daily_stats",
13436                    conversations_processed,
13437                    batches = conversation_batch_count,
13438                    batch_size = conversation_batch_size,
13439                    last_conversation_id,
13440                    "daily_stats rebuild conversation scan progress"
13441                );
13442            }
13443            if conversation_batch_meta.is_empty() {
13444                continue;
13445            }
13446
13447            for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
13448                let mut cursor_message_idx = -1_i64;
13449                loop {
13450                    let message_rows = match self.conn.query_with_params(
13451                        message_scan_sql,
13452                        &params_from_iter([
13453                            ParamValue::from(conversation_id),
13454                            ParamValue::from(cursor_message_idx),
13455                            ParamValue::from(message_batch_size as i64),
13456                        ]),
13457                    ) {
13458                        Ok(rows) => rows,
13459                        Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
13460                            let previous_batch_size = message_batch_size;
13461                            message_batch_size = (message_batch_size / 2).max(1);
13462                            tracing::warn!(
13463                                previous_batch_size,
13464                                message_batch_size,
13465                                conversation_id,
13466                                cursor_message_idx,
13467                                "daily_stats message scan ran out of memory; retrying with smaller batch"
13468                            );
13469                            continue;
13470                        }
13471                        Err(err) => return Err(err.into()),
13472                    };
13473                    if message_rows.is_empty() {
13474                        break;
13475                    }
13476
13477                    let mut aggregate = StatsAggregator::new();
13478                    for row in &message_rows {
13479                        let message_idx: i64 = row.get_typed(0)?;
13480                        let content_len: i64 = row.get_typed(1)?;
13481                        cursor_message_idx = message_idx;
13482                        aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
13483                        messages_processed += 1;
13484                    }
13485
13486                    message_batch_count += 1;
13487                    raw_entries_flushed += aggregate.raw_entry_count();
13488                    let entries = aggregate.expand();
13489                    expanded_entries_flushed += entries.len();
13490                    if !entries.is_empty() {
13491                        franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
13492                    }
13493                    if message_batch_count.is_multiple_of(50) {
13494                        tracing::info!(
13495                            target: "cass::perf::daily_stats",
13496                            messages_processed,
13497                            batches = message_batch_count,
13498                            batch_size = message_batch_size,
13499                            source = if use_message_metrics {
13500                                "message_metrics"
13501                            } else {
13502                                "messages"
13503                            },
13504                            conversation_id,
13505                            cursor_message_idx,
13506                            "daily_stats rebuild message scan progress"
13507                        );
13508                    }
13509                }
13510            }
13511        }
13512
13513        let rows_created: i64 =
13514            tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
13515                row.get_typed(0)
13516            })?;
13517        let total_sessions: i64 = tx.query_row_map(
13518            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
13519            fparams![],
13520            |row| row.get_typed(0),
13521        )?;
13522
13523        tx.commit()?;
13524
13525        tracing::info!(
13526            target: "cass::perf::daily_stats",
13527            rows_created,
13528            total_sessions,
13529            conversations_processed,
13530            conversation_batches = conversation_batch_count,
13531            conversation_batch_size,
13532            message_batches = message_batch_count,
13533            message_batch_size,
13534            messages_processed,
13535            use_message_metrics,
13536            raw_entries_flushed,
13537            expanded_entries_flushed,
13538            "Daily stats rebuilt from conversations"
13539        );
13540
13541        Ok(DailyStatsRebuildResult {
13542            rows_created,
13543            total_sessions,
13544        })
13545    }
13546}
13547
13548// SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
13549// All methods are available through FrankenStorage.
13550
13551// -------------------------------------------------------------------------
13552// IndexingCache (Opt 7.2) - N+1 Prevention for Agent/Workspace IDs
13553// -------------------------------------------------------------------------
13554
/// Per-batch memo of agent and workspace row IDs used while indexing.
///
/// Looking an ID up through the cache avoids repeating the same
/// `ensure_agent` / `ensure_workspace` database call for every conversation
/// in a batch (the classic N+1 pattern). The cache is meant to live for one
/// batch on one thread, so it carries no synchronization.
///
/// # Usage
/// ```ignore
/// let mut cache = IndexingCache::new();
/// for conv in conversations {
///     let agent_id = cache.get_or_insert_agent(storage, &agent)?;
///     let workspace_id = cache.get_or_insert_workspace(storage, workspace)?;
///     // ... use agent_id and workspace_id
/// }
/// ```
///
/// # Rollback
/// Setting the environment variable `CASS_SQLITE_CACHE=0` is the switch for
/// bypassing the cache in favour of direct DB calls (useful for debugging).
#[derive(Debug, Default)]
pub struct IndexingCache {
    // Agent slug -> agent row ID.
    agent_ids: HashMap<String, i64>,
    // Workspace path -> workspace row ID.
    workspace_ids: HashMap<PathBuf, i64>,
    // Lookups answered from one of the maps.
    hits: u64,
    // Lookups that had to go to storage.
    misses: u64,
}
13581
13582pub trait IndexingCacheStorage {
13583    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
13584    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
13585}
13586
13587impl IndexingCacheStorage for FrankenStorage {
13588    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
13589        self.ensure_agent(agent)
13590    }
13591
13592    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
13593        self.ensure_workspace(path, display_name)
13594    }
13595}
13596
13597// IndexingCacheStorage for SqliteStorage removed: SqliteStorage is a type alias for FrankenStorage.
13598
13599impl IndexingCache {
13600    /// Create a new empty cache.
13601    pub fn new() -> Self {
13602        Self {
13603            agent_ids: HashMap::new(),
13604            workspace_ids: HashMap::new(),
13605            hits: 0,
13606            misses: 0,
13607        }
13608    }
13609
13610    /// Check if caching is enabled via environment variable.
13611    /// Returns true unless CASS_SQLITE_CACHE is set to "0" or "false".
13612    pub fn is_enabled() -> bool {
13613        dotenvy::var("CASS_SQLITE_CACHE")
13614            .map(|v| v != "0" && v.to_lowercase() != "false")
13615            .unwrap_or(true)
13616    }
13617
13618    /// Get or insert an agent ID, using cache if available.
13619    ///
13620    /// Returns the cached ID if present, otherwise calls ensure_agent
13621    /// and caches the result.
13622    pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
13623    where
13624        S: IndexingCacheStorage + ?Sized,
13625    {
13626        if let Some(&cached) = self.agent_ids.get(&agent.slug) {
13627            self.hits += 1;
13628            return Ok(cached);
13629        }
13630
13631        self.misses += 1;
13632        let id = storage.ensure_indexing_agent(agent)?;
13633        self.agent_ids.insert(agent.slug.clone(), id);
13634        Ok(id)
13635    }
13636
13637    /// Get or insert a workspace ID, using cache if available.
13638    ///
13639    /// Returns the cached ID if present, otherwise calls ensure_workspace
13640    /// and caches the result.
13641    pub fn get_or_insert_workspace(
13642        &mut self,
13643        storage: &(impl IndexingCacheStorage + ?Sized),
13644        path: &Path,
13645        display_name: Option<&str>,
13646    ) -> Result<i64> {
13647        if let Some(&cached) = self.workspace_ids.get(path) {
13648            self.hits += 1;
13649            return Ok(cached);
13650        }
13651
13652        self.misses += 1;
13653        let id = storage.ensure_indexing_workspace(path, display_name)?;
13654        self.workspace_ids.insert(path.to_path_buf(), id);
13655        Ok(id)
13656    }
13657
13658    /// Get cache statistics: (hits, misses, hit_rate).
13659    pub fn stats(&self) -> (u64, u64, f64) {
13660        let total = self.hits + self.misses;
13661        let hit_rate = if total > 0 {
13662            self.hits as f64 / total as f64
13663        } else {
13664            0.0
13665        };
13666        (self.hits, self.misses, hit_rate)
13667    }
13668
13669    /// Clear the cache, resetting all state.
13670    pub fn clear(&mut self) {
13671        self.agent_ids.clear();
13672        self.workspace_ids.clear();
13673        self.hits = 0;
13674        self.misses = 0;
13675    }
13676
13677    /// Number of cached agents.
13678    pub fn agent_count(&self) -> usize {
13679        self.agent_ids.len()
13680    }
13681
13682    /// Number of cached workspaces.
13683    pub fn workspace_count(&self) -> usize {
13684        self.workspace_ids.len()
13685    }
13686}
13687
13688// -------------------------------------------------------------------------
13689// StatsAggregator (kzxu) - Batched Daily Stats Updates
13690// -------------------------------------------------------------------------
13691// Aggregates daily stats in memory during batch ingestion, then flushes
13692// to the database in a single batched INSERT...ON CONFLICT operation.
13693// This prevents N×4 database writes (4 permutations per conversation).
13694
/// Running totals for one (day_id, agent, source) bucket of daily stats.
///
/// All three fields are signed deltas that get added onto whatever is
/// already stored for the bucket.
#[derive(Debug, Default, Clone, Copy)]
pub struct StatsDelta {
    /// Change in the number of sessions.
    pub session_count_delta: i64,
    /// Change in the number of messages.
    pub message_count_delta: i64,
    /// Change in the total character count.
    pub total_chars_delta: i64,
}
13702
13703/// In-memory aggregator for batched daily stats updates.
13704///
13705/// During batch ingestion, we accumulate deltas per (day_id, agent, source) key.
13706/// After processing all conversations, call `expand()` to generate the 4
13707/// permutations per raw entry, then flush via `SqliteStorage::update_daily_stats_batched`.
13708///
13709/// # Example
13710/// ```ignore
13711/// let mut agg = StatsAggregator::new();
13712/// for conv in conversations {
13713///     agg.record(&conv.agent_slug, source_id, day_id, msg_count, char_count);
13714/// }
13715/// let entries = agg.expand();
13716/// storage.update_daily_stats_batched(&entries)?;
13717/// ```
13718#[derive(Debug, Default)]
13719pub struct StatsAggregator {
13720    /// Raw deltas keyed by (day_id, agent_slug, source_id).
13721    /// Only stores specific (non-"all") combinations.
13722    deltas: HashMap<(i64, String, String), StatsDelta>,
13723}
13724
13725impl StatsAggregator {
13726    /// Create a new empty aggregator.
13727    pub fn new() -> Self {
13728        Self {
13729            deltas: HashMap::new(),
13730        }
13731    }
13732
13733    /// Record a conversation's contribution to stats (session + messages + chars).
13734    ///
13735    /// This increments session_count by 1.
13736    ///
13737    /// # Arguments
13738    /// * `agent_slug` - The specific agent slug (not "all")
13739    /// * `source_id` - The specific source ID (not "all")
13740    /// * `day_id` - Days since 2020-01-01 (from `SqliteStorage::day_id_from_millis`)
13741    /// * `message_count` - Number of messages in the conversation
13742    /// * `total_chars` - Total character count across all messages
13743    pub fn record(
13744        &mut self,
13745        agent_slug: &str,
13746        source_id: &str,
13747        day_id: i64,
13748        message_count: i64,
13749        total_chars: i64,
13750    ) {
13751        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
13752    }
13753
13754    /// Record an arbitrary delta. Use this for append-only updates where
13755    /// `session_count_delta` may be 0 but message/char deltas are non-zero.
13756    pub fn record_delta(
13757        &mut self,
13758        agent_slug: &str,
13759        source_id: &str,
13760        day_id: i64,
13761        session_count_delta: i64,
13762        message_count_delta: i64,
13763        total_chars_delta: i64,
13764    ) {
13765        if session_count_delta == 0 && message_count_delta == 0 && total_chars_delta == 0 {
13766            return;
13767        }
13768        let key = (day_id, agent_slug.to_owned(), source_id.to_owned());
13769        let delta = self.deltas.entry(key).or_default();
13770        delta.session_count_delta += session_count_delta;
13771        delta.message_count_delta += message_count_delta;
13772        delta.total_chars_delta += total_chars_delta;
13773    }
13774
13775    /// Expand raw deltas into the 4 permutation keys:
13776    /// - (agent, source) - specific both
13777    /// - ("all", source) - all agents, specific source
13778    /// - (agent, "all") - specific agent, all sources
13779    /// - ("all", "all") - totals
13780    ///
13781    /// Returns entries sorted by (day_id, agent_slug, source_id) for deterministic batching.
13782    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
13783        let mut expanded: HashMap<(i64, String, String), StatsDelta> = HashMap::new();
13784
13785        for ((day_id, agent, source), delta) in &self.deltas {
13786            let permutations = [
13787                (agent.as_str(), source.as_str()),
13788                ("all", source.as_str()),
13789                (agent.as_str(), "all"),
13790                ("all", "all"),
13791            ];
13792
13793            // Ensure we don't double-apply deltas if agent/source is already "all".
13794            for idx in 0..permutations.len() {
13795                let (a, s) = permutations[idx];
13796                if permutations[..idx].contains(&(a, s)) {
13797                    continue;
13798                }
13799                let key = (*day_id, a.to_owned(), s.to_owned());
13800                let entry = expanded.entry(key).or_default();
13801                entry.session_count_delta += delta.session_count_delta;
13802                entry.message_count_delta += delta.message_count_delta;
13803                entry.total_chars_delta += delta.total_chars_delta;
13804            }
13805        }
13806
13807        let mut out: Vec<(i64, String, String, StatsDelta)> = expanded
13808            .into_iter()
13809            .map(|((d, a, s), delta)| (d, a, s, delta))
13810            .collect();
13811        out.sort_by(|(d1, a1, s1, _), (d2, a2, s2, _)| {
13812            d1.cmp(d2).then_with(|| a1.cmp(a2)).then_with(|| s1.cmp(s2))
13813        });
13814        out
13815    }
13816
13817    /// Check if the aggregator is empty (no data recorded).
13818    pub fn is_empty(&self) -> bool {
13819        self.deltas.is_empty()
13820    }
13821
13822    /// Get number of distinct raw (day, agent, source) combinations recorded.
13823    pub fn raw_entry_count(&self) -> usize {
13824        self.deltas.len()
13825    }
13826}
13827
13828// -------------------------------------------------------------------------
13829// TokenStatsAggregator — Batched Token Analytics Daily Stats
13830// -------------------------------------------------------------------------
13831// Mirrors StatsAggregator pattern for token-level metrics.
13832// Aggregates token usage in memory during batch ingestion, then flushes
13833// to token_daily_stats in a single batched INSERT...ON CONFLICT operation.
13834
/// Running token totals for one (day_id, agent, source, model_family) bucket.
///
/// Field meanings below describe how `TokenStatsAggregator` fills them.
#[derive(Debug, Default, Clone)]
pub struct TokenStatsDelta {
    /// One per recorded message, regardless of role.
    pub api_call_count: i64,
    /// Recorded messages whose role is `"user"`.
    pub user_message_count: i64,
    /// Recorded messages whose role is `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Recorded messages whose role is `"tool"`.
    pub tool_message_count: i64,
    /// Sum of input tokens (missing values count as 0).
    pub total_input_tokens: i64,
    /// Sum of output tokens (missing values count as 0).
    pub total_output_tokens: i64,
    /// Sum of cache-read tokens (missing values count as 0).
    pub total_cache_read_tokens: i64,
    /// Sum of cache-creation tokens (missing values count as 0).
    pub total_cache_creation_tokens: i64,
    /// Sum of thinking tokens (missing values count as 0).
    pub total_thinking_tokens: i64,
    /// Sum of each usage record's `total_tokens()` (missing values count as 0).
    pub grand_total_tokens: i64,
    /// Sum of per-message content character counts.
    pub total_content_chars: i64,
    /// Sum of per-message tool call counts.
    pub total_tool_calls: i64,
    /// Sum of per-message estimated cost in USD.
    pub estimated_cost_usd: f64,
    /// Session bumps recorded via `record_session`.
    pub session_count: i64,
}
13853
13854/// In-memory aggregator for batched token daily stats updates.
13855///
13856/// During batch ingestion, accumulate token deltas per (day_id, agent, source, model_family) key.
13857/// After processing, call `expand()` to generate the 5 permutation keys, then flush via
13858/// `update_token_daily_stats_batched_in_tx`.
13859#[derive(Debug, Default)]
13860pub struct TokenStatsAggregator {
13861    /// Raw deltas keyed by (day_id, agent_slug, source_id, model_family).
13862    deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
13863}
13864
13865impl TokenStatsAggregator {
13866    pub fn new() -> Self {
13867        Self {
13868            deltas: HashMap::new(),
13869        }
13870    }
13871
13872    /// Record a single message's token contribution.
13873    #[allow(clippy::too_many_arguments)]
13874    pub fn record(
13875        &mut self,
13876        agent_slug: &str,
13877        source_id: &str,
13878        day_id: i64,
13879        model_family: &str,
13880        role: &str,
13881        usage: &crate::connectors::ExtractedTokenUsage,
13882        content_chars: i64,
13883        estimated_cost_usd: f64,
13884    ) {
13885        let key = (
13886            day_id,
13887            agent_slug.to_owned(),
13888            source_id.to_owned(),
13889            model_family.to_owned(),
13890        );
13891        let delta = self.deltas.entry(key).or_default();
13892
13893        delta.api_call_count += 1;
13894        match role {
13895            "user" => delta.user_message_count += 1,
13896            "assistant" | "agent" => delta.assistant_message_count += 1,
13897            "tool" => delta.tool_message_count += 1,
13898            _ => {}
13899        }
13900
13901        delta.total_input_tokens += usage.input_tokens.unwrap_or(0);
13902        delta.total_output_tokens += usage.output_tokens.unwrap_or(0);
13903        delta.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
13904        delta.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
13905        delta.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
13906        delta.grand_total_tokens += usage.total_tokens().unwrap_or(0);
13907        delta.total_content_chars += content_chars;
13908        delta.total_tool_calls += usage.tool_call_count as i64;
13909        delta.estimated_cost_usd += estimated_cost_usd;
13910    }
13911
13912    /// Record a session count bump for a given day/agent/source/model.
13913    pub fn record_session(
13914        &mut self,
13915        agent_slug: &str,
13916        source_id: &str,
13917        day_id: i64,
13918        model_family: &str,
13919    ) {
13920        let key = (
13921            day_id,
13922            agent_slug.to_owned(),
13923            source_id.to_owned(),
13924            model_family.to_owned(),
13925        );
13926        self.deltas.entry(key).or_default().session_count += 1;
13927    }
13928
13929    /// Expand raw deltas into 5 permutation keys for the 4-dimensional composite PK:
13930    /// - (agent, source, model)  — specific all three
13931    /// - ("all", source, model)  — all agents
13932    /// - (agent, "all", model)   — all sources
13933    /// - (agent, source, "all")  — all models
13934    /// - ("all", "all", "all")   — global total
13935    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
13936        let mut expanded: HashMap<(i64, String, String, String), TokenStatsDelta> = HashMap::new();
13937
13938        for ((day_id, agent, source, model), delta) in &self.deltas {
13939            let permutations = [
13940                (agent.as_str(), source.as_str(), model.as_str()),
13941                ("all", source.as_str(), model.as_str()),
13942                (agent.as_str(), "all", model.as_str()),
13943                (agent.as_str(), source.as_str(), "all"),
13944                ("all", "all", "all"),
13945            ];
13946
13947            for idx in 0..permutations.len() {
13948                let (a, s, m) = permutations[idx];
13949                // Deduplicate if agent/source/model is already "all"
13950                if permutations[..idx].contains(&(a, s, m)) {
13951                    continue;
13952                }
13953                let key = (*day_id, a.to_owned(), s.to_owned(), m.to_owned());
13954                let entry = expanded.entry(key).or_default();
13955                entry.api_call_count += delta.api_call_count;
13956                entry.user_message_count += delta.user_message_count;
13957                entry.assistant_message_count += delta.assistant_message_count;
13958                entry.tool_message_count += delta.tool_message_count;
13959                entry.total_input_tokens += delta.total_input_tokens;
13960                entry.total_output_tokens += delta.total_output_tokens;
13961                entry.total_cache_read_tokens += delta.total_cache_read_tokens;
13962                entry.total_cache_creation_tokens += delta.total_cache_creation_tokens;
13963                entry.total_thinking_tokens += delta.total_thinking_tokens;
13964                entry.grand_total_tokens += delta.grand_total_tokens;
13965                entry.total_content_chars += delta.total_content_chars;
13966                entry.total_tool_calls += delta.total_tool_calls;
13967                entry.estimated_cost_usd += delta.estimated_cost_usd;
13968                entry.session_count += delta.session_count;
13969            }
13970        }
13971
13972        let mut out: Vec<(i64, String, String, String, TokenStatsDelta)> = expanded
13973            .into_iter()
13974            .map(|((d, a, s, m), delta)| (d, a, s, m, delta))
13975            .collect();
13976        out.sort_by(|(d1, a1, s1, m1, _), (d2, a2, s2, m2, _)| {
13977            d1.cmp(d2)
13978                .then_with(|| a1.cmp(a2))
13979                .then_with(|| s1.cmp(s2))
13980                .then_with(|| m1.cmp(m2))
13981        });
13982        out
13983    }
13984
13985    pub fn is_empty(&self) -> bool {
13986        self.deltas.is_empty()
13987    }
13988
13989    pub fn raw_entry_count(&self) -> usize {
13990        self.deltas.len()
13991    }
13992}
13993
13994// -------------------------------------------------------------------------
13995// AnalyticsRollupAggregator — Batched usage_hourly + usage_daily Updates
13996// -------------------------------------------------------------------------
13997// Accumulates per-message deltas in memory, then flushes to both
13998// usage_hourly and usage_daily in a single batched operation.
13999
/// Running totals for one (bucket, agent_slug, workspace_id, source_id)
/// rollup key.
///
/// Field meanings below describe how `AnalyticsRollupAggregator::record`
/// fills them.
#[derive(Debug, Default, Clone)]
pub struct UsageRollupDelta {
    /// One per recorded message.
    pub message_count: i64,
    /// Messages whose role is `"user"`.
    pub user_message_count: i64,
    /// Messages whose role is `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Sum of per-message tool call counts.
    pub tool_call_count: i64,
    /// Messages flagged `has_plan`.
    pub plan_message_count: i64,
    /// Estimated content tokens of plan messages.
    pub plan_content_tokens_est_total: i64,
    /// API token totals of plan messages whose data source is `"api"`.
    pub plan_api_tokens_total: i64,
    /// Messages whose data source is `"api"`.
    pub api_coverage_message_count: i64,
    /// Estimated content tokens over all messages.
    pub content_tokens_est_total: i64,
    /// Estimated content tokens of user messages.
    pub content_tokens_est_user: i64,
    /// Estimated content tokens of assistant/agent messages.
    pub content_tokens_est_assistant: i64,
    /// Input + output + cache-read + cache-creation + thinking tokens of
    /// `"api"` messages.
    pub api_tokens_total: i64,
    /// Input tokens of `"api"` messages.
    pub api_input_tokens_total: i64,
    /// Output tokens of `"api"` messages.
    pub api_output_tokens_total: i64,
    /// Cache-read tokens of `"api"` messages.
    pub api_cache_read_tokens_total: i64,
    /// Cache-creation tokens of `"api"` messages.
    pub api_cache_creation_tokens_total: i64,
    /// Thinking tokens of `"api"` messages.
    pub api_thinking_tokens_total: i64,
}
14021
/// One `message_metrics` row waiting to be batch-inserted.
///
/// The same value is also what `AnalyticsRollupAggregator::record` consumes;
/// the field notes below describe how that method reads it.
#[derive(Clone, Debug)]
pub struct MessageMetricsEntry {
    pub message_id: i64,
    pub created_at_ms: i64,
    /// Bucket id used for the hourly rollup key.
    pub hour_id: i64,
    /// Bucket id used for the daily and per-model daily rollup keys.
    pub day_id: i64,
    /// Part of every rollup key.
    pub agent_slug: String,
    /// Part of every rollup key.
    pub workspace_id: i64,
    /// Part of every rollup key.
    pub source_id: String,
    /// Compared against `"user"`, `"assistant"` and `"agent"`.
    pub role: String,
    pub content_chars: i64,
    /// Estimated token count of the message content.
    pub content_tokens_est: i64,
    pub model_name: Option<String>,
    /// Part of the per-model daily rollup key.
    pub model_family: String,
    /// Part of the per-model daily rollup key.
    pub model_tier: String,
    pub provider: String,
    /// API-reported token figures; `None` is treated as 0 when summed.
    pub api_input_tokens: Option<i64>,
    pub api_output_tokens: Option<i64>,
    pub api_cache_read_tokens: Option<i64>,
    pub api_cache_creation_tokens: Option<i64>,
    pub api_thinking_tokens: Option<i64>,
    pub api_service_tier: Option<String>,
    /// API totals are only accumulated when this equals `"api"`.
    pub api_data_source: String,
    /// Added to the rollup's tool call total.
    pub tool_call_count: i64,
    pub has_tool_calls: bool,
    /// Whether the message counts towards the plan totals.
    pub has_plan: bool,
}
14050
14051/// In-memory aggregator for batched usage_hourly and usage_daily rollup updates.
14052///
14053/// Keyed by (bucket_id, agent_slug, workspace_id, source_id).
14054/// Maintains separate hourly and daily delta maps.
14055#[derive(Debug, Default)]
14056pub struct AnalyticsRollupAggregator {
14057    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14058    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14059    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
14060}
14061
14062impl AnalyticsRollupAggregator {
14063    pub fn new() -> Self {
14064        Self::default()
14065    }
14066
14067    /// Record a single message's contribution to both hourly and daily rollups.
14068    pub fn record(&mut self, entry: &MessageMetricsEntry) {
14069        let content_est = entry.content_tokens_est;
14070        let api_total = entry.api_input_tokens.unwrap_or(0)
14071            + entry.api_output_tokens.unwrap_or(0)
14072            + entry.api_cache_read_tokens.unwrap_or(0)
14073            + entry.api_cache_creation_tokens.unwrap_or(0)
14074            + entry.api_thinking_tokens.unwrap_or(0);
14075        let is_api = entry.api_data_source == "api";
14076        let is_user = entry.role == "user";
14077        let is_assistant = entry.role == "assistant" || entry.role == "agent";
14078
14079        // Apply to both hourly and daily
14080        for (map, bucket_id) in [
14081            (&mut self.hourly, entry.hour_id),
14082            (&mut self.daily, entry.day_id),
14083        ] {
14084            let key = (
14085                bucket_id,
14086                entry.agent_slug.clone(),
14087                entry.workspace_id,
14088                entry.source_id.clone(),
14089            );
14090            let d = map.entry(key).or_default();
14091            d.message_count += 1;
14092            if is_user {
14093                d.user_message_count += 1;
14094                d.content_tokens_est_user += content_est;
14095            }
14096            if is_assistant {
14097                d.assistant_message_count += 1;
14098                d.content_tokens_est_assistant += content_est;
14099            }
14100            d.tool_call_count += entry.tool_call_count;
14101            if entry.has_plan {
14102                d.plan_message_count += 1;
14103                d.plan_content_tokens_est_total += content_est;
14104                if is_api {
14105                    d.plan_api_tokens_total += api_total;
14106                }
14107            }
14108            if is_api {
14109                d.api_coverage_message_count += 1;
14110                d.api_tokens_total += api_total;
14111                d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14112                d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14113                d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14114                d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14115                d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14116            }
14117            d.content_tokens_est_total += content_est;
14118        }
14119
14120        let model_key = (
14121            entry.day_id,
14122            entry.agent_slug.clone(),
14123            entry.workspace_id,
14124            entry.source_id.clone(),
14125            entry.model_family.clone(),
14126            entry.model_tier.clone(),
14127        );
14128        let d = self.models_daily.entry(model_key).or_default();
14129        d.message_count += 1;
14130        if is_user {
14131            d.user_message_count += 1;
14132            d.content_tokens_est_user += content_est;
14133        }
14134        if is_assistant {
14135            d.assistant_message_count += 1;
14136            d.content_tokens_est_assistant += content_est;
14137        }
14138        d.tool_call_count += entry.tool_call_count;
14139        if entry.has_plan {
14140            d.plan_message_count += 1;
14141            d.plan_content_tokens_est_total += content_est;
14142            if is_api {
14143                d.plan_api_tokens_total += api_total;
14144            }
14145        }
14146        if is_api {
14147            d.api_coverage_message_count += 1;
14148            d.api_tokens_total += api_total;
14149            d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14150            d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14151            d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14152            d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14153            d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14154        }
14155        d.content_tokens_est_total += content_est;
14156    }
14157
14158    pub fn is_empty(&self) -> bool {
14159        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
14160    }
14161
14162    pub fn hourly_entry_count(&self) -> usize {
14163        self.hourly.len()
14164    }
14165
14166    pub fn daily_entry_count(&self) -> usize {
14167        self.daily.len()
14168    }
14169
14170    pub fn models_daily_entry_count(&self) -> usize {
14171        self.models_daily.len()
14172    }
14173}
14174
14175/// Whether the current role should be considered for plan attribution.
14176///
14177/// Plan attribution v2 defaults to assistant/agent messages only.
14178fn has_plan_for_role(role: &str, content: &str) -> bool {
14179    let role = role.trim();
14180    (role.eq_ignore_ascii_case("assistant") || role.eq_ignore_ascii_case("agent"))
14181        && has_plan_heuristic(content)
14182}
14183
14184/// Heuristic to detect "plan" messages.
14185///
14186/// v2 behavior:
14187/// - Require an explicit plan marker near the top of the message.
14188/// - Require structured steps (numbered or bullets) to reduce false positives.
14189/// - Avoid classifying tool-output blobs as plans.
14190fn has_plan_heuristic(content: &str) -> bool {
14191    if content.len() < 24 {
14192        return false;
14193    }
14194
14195    let lower = content.to_lowercase();
14196
14197    // Ignore tool-output-like blobs unless they also have a strong plan header.
14198    let looks_like_tool_blob = lower.contains("```")
14199        || lower.contains("\"tool\"")
14200        || lower.contains("stdout:")
14201        || lower.contains("stderr:")
14202        || lower.contains("exit code:");
14203
14204    let mut lines: Vec<&str> = Vec::with_capacity(60);
14205    let mut in_fenced_code = false;
14206    for raw in lower.lines() {
14207        let line = raw.trim();
14208        if line.starts_with("```") {
14209            in_fenced_code = !in_fenced_code;
14210            continue;
14211        }
14212        if in_fenced_code || line.is_empty() {
14213            continue;
14214        }
14215        lines.push(line);
14216        if lines.len() >= 60 {
14217            break;
14218        }
14219    }
14220
14221    let header_pos = lines.iter().position(|line| {
14222        line.starts_with("## plan")
14223            || line.starts_with("# plan")
14224            || line.starts_with("plan:")
14225            || line.starts_with("implementation plan")
14226            || line.starts_with("next steps:")
14227            || line.starts_with("action plan:")
14228    });
14229    let preview_top = lines.iter().take(8).copied().collect::<Vec<_>>().join("\n");
14230    let header_near_top = header_pos.is_some_and(|idx| idx <= 6) || preview_top.contains("plan:");
14231
14232    if !header_near_top {
14233        return false;
14234    }
14235    if looks_like_tool_blob && header_pos.is_none() {
14236        return false;
14237    }
14238
14239    let numbered_steps = lines
14240        .iter()
14241        .filter(|line| is_numbered_step_line(line))
14242        .count();
14243    let bullet_steps = lines
14244        .iter()
14245        .filter(|line| {
14246            line.starts_with("- ")
14247                || line.starts_with("* ")
14248                || line.starts_with("+ ")
14249                || line.starts_with("- [ ] ")
14250                || line.starts_with("- [x] ")
14251        })
14252        .count();
14253
14254    numbered_steps >= 2 || (numbered_steps >= 1 && bullet_steps >= 1) || bullet_steps >= 3
14255}
14256
/// Recognize list items such as `1. foo` or `12) bar`.
///
/// After optional leading whitespace the line must start with one to three
/// ASCII digits, followed by `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let candidate = line.trim_start();
    let after_digits = candidate.trim_start_matches(|c: char| c.is_ascii_digit());
    // ASCII digits are one byte each, so the length difference is the digit count.
    let digits = candidate.len() - after_digits.len();
    (1..=3).contains(&digits)
        && (after_digits.starts_with(". ") || after_digits.starts_with(") "))
}
14266
/// One `token_usage` row waiting to be batch-inserted.
///
/// Optional fields are `None` when the corresponding value is not available
/// for the message.
#[derive(Clone, Debug)]
pub struct TokenUsageEntry {
    // --- Row identity and placement ---
    pub message_id: i64,
    pub conversation_id: i64,
    pub agent_id: i64,
    pub workspace_id: Option<i64>,
    pub source_id: String,
    pub timestamp_ms: i64,
    // NOTE(review): presumably days since 2020-01-01 like the other `day_id`
    // values in this file — confirm against the producer.
    pub day_id: i64,

    // --- Model / provider description ---
    pub model_name: Option<String>,
    pub model_family: Option<String>,
    pub model_tier: Option<String>,
    pub service_tier: Option<String>,
    pub provider: Option<String>,

    // --- Token figures and cost ---
    pub input_tokens: Option<i64>,
    pub output_tokens: Option<i64>,
    pub cache_read_tokens: Option<i64>,
    pub cache_creation_tokens: Option<i64>,
    pub thinking_tokens: Option<i64>,
    pub total_tokens: Option<i64>,
    pub estimated_cost_usd: Option<f64>,

    // --- Message description ---
    pub role: String,
    pub content_chars: i64,
    pub has_tool_calls: bool,
    pub tool_call_count: u32,
    pub data_source: String,
}
14295
14296// -------------------------------------------------------------------------
14297// PricingTable — In-memory cache for model_pricing lookups (bead z9fse.10)
14298// -------------------------------------------------------------------------
14299
/// A single row of the `model_pricing` table, held in memory.
///
/// Each field mirrors the column of the same name, except
/// `effective_day_id`, which is derived from the `effective_date` column.
#[derive(Clone, Debug)]
pub struct PricingEntry {
    /// Pattern matched against model names with SQL `LIKE` semantics.
    pub model_pattern: String,
    pub provider: String,
    pub input_cost_per_mtok: f64,
    pub output_cost_per_mtok: f64,
    /// `None` when the row has no cache-read price.
    pub cache_read_cost_per_mtok: Option<f64>,
    /// `None` when the row has no cache-creation price.
    pub cache_creation_cost_per_mtok: Option<f64>,
    /// Effective date as day_id (days since 2020-01-01).
    pub effective_day_id: i64,
}
14312
/// Pricing-coverage counters gathered over one batch operation.
#[derive(Clone, Debug, Default)]
pub struct PricingDiagnostics {
    /// Bumped once per `record_priced` call.
    pub priced_count: u64,
    /// Bumped once per `record_unpriced` call.
    pub unpriced_count: u64,
    /// Unpriced model name → number of occurrences; a missing model name is
    /// filed under `"(none)"`.
    pub unknown_models: HashMap<String, u64>,
}
14321
14322impl PricingDiagnostics {
14323    fn record_priced(&mut self) {
14324        self.priced_count += 1;
14325    }
14326
14327    fn record_unpriced(&mut self, model_name: Option<&str>) {
14328        self.unpriced_count += 1;
14329        let key = model_name.unwrap_or("(none)").to_string();
14330        *self.unknown_models.entry(key).or_insert(0) += 1;
14331    }
14332
14333    /// Log a summary of pricing coverage.
14334    pub fn log_summary(&self) {
14335        let total = self.priced_count + self.unpriced_count;
14336        if total == 0 {
14337            return;
14338        }
14339        let pct = (self.priced_count as f64 / total as f64) * 100.0;
14340        tracing::info!(
14341            target: "cass::analytics::pricing",
14342            priced = self.priced_count,
14343            unpriced = self.unpriced_count,
14344            total = total,
14345            coverage_pct = format!("{pct:.1}%"),
14346            "pricing coverage"
14347        );
14348        if !self.unknown_models.is_empty() {
14349            let mut sorted: Vec<_> = self.unknown_models.iter().collect();
14350            sorted.sort_by(|a, b| b.1.cmp(a.1));
14351            for (model, count) in sorted.iter().take(5) {
14352                tracing::debug!(
14353                    target: "cass::analytics::pricing",
14354                    model = model.as_str(),
14355                    count = count,
14356                    "unknown model (no pricing)"
14357                );
14358            }
14359        }
14360    }
14361}
14362
14363/// In-memory pricing table loaded from `model_pricing` for fast lookups.
14364#[derive(Debug, Clone)]
14365pub struct PricingTable {
14366    entries: Vec<PricingEntry>,
14367}
14368
14369impl PricingTable {
14370    /// Load all pricing entries from the database.
14371    pub fn load(conn: &FrankenConnection) -> Result<Self> {
14372        Self::franken_load(conn)
14373    }
14374
14375    /// Load all pricing entries from a frankensqlite connection.
14376    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
14377        let rows = conn.query(
14378            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
14379                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
14380             FROM model_pricing
14381             ORDER BY effective_date DESC",
14382        )?;
14383        let mut entries = Vec::with_capacity(rows.len());
14384        for row in &rows {
14385            let effective_date: String = row.get_typed(6)?;
14386            let effective_day_id = date_str_to_day_id(&effective_date)?;
14387            entries.push(PricingEntry {
14388                model_pattern: row.get_typed(0)?,
14389                provider: row.get_typed(1)?,
14390                input_cost_per_mtok: row.get_typed(2)?,
14391                output_cost_per_mtok: row.get_typed(3)?,
14392                cache_read_cost_per_mtok: row.get_typed(4)?,
14393                cache_creation_cost_per_mtok: row.get_typed(5)?,
14394                effective_day_id,
14395            });
14396        }
14397        Ok(Self { entries })
14398    }
14399
14400    /// Look up the best pricing entry for a given model name and date.
14401    ///
14402    /// Selection rules:
14403    /// 1. Pattern must match model_name (SQL LIKE semantics).
14404    /// 2. effective_day_id must be <= message_day_id.
14405    /// 3. Among matches, prefer the most recent effective_date.
14406    /// 4. Tie-break by pattern specificity (longest pattern wins).
14407    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
14408        let mut best: Option<&PricingEntry> = None;
14409
14410        for entry in &self.entries {
14411            if entry.effective_day_id > message_day_id {
14412                continue;
14413            }
14414            if !sql_like_match(model_name, &entry.model_pattern) {
14415                continue;
14416            }
14417
14418            match best {
14419                None => best = Some(entry),
14420                Some(current) => {
14421                    if entry.effective_day_id > current.effective_day_id
14422                        || (entry.effective_day_id == current.effective_day_id
14423                            && entry.model_pattern.len() > current.model_pattern.len())
14424                    {
14425                        best = Some(entry);
14426                    }
14427                }
14428            }
14429        }
14430
14431        best
14432    }
14433
14434    /// Compute estimated cost in USD for a set of token counts.
14435    ///
14436    /// Returns `None` if no pricing entry matches or if no token counts are available.
14437    pub fn compute_cost(
14438        &self,
14439        model_name: Option<&str>,
14440        message_day_id: i64,
14441        input_tokens: Option<i64>,
14442        output_tokens: Option<i64>,
14443        cache_read_tokens: Option<i64>,
14444        cache_creation_tokens: Option<i64>,
14445    ) -> Option<f64> {
14446        let model = model_name?;
14447        let pricing = self.lookup(model, message_day_id)?;
14448
14449        if input_tokens.is_none() && output_tokens.is_none() {
14450            return None;
14451        }
14452
14453        let mut cost = 0.0;
14454        let cache_read = cache_read_tokens.unwrap_or(0);
14455        let cache_creation = cache_creation_tokens.unwrap_or(0);
14456        // input_tokens includes cache tokens as a subset; subtract them
14457        // so we don't charge at both the full input rate AND the cache rate.
14458        let non_cache_input = input_tokens
14459            .unwrap_or(0)
14460            .saturating_sub(cache_read)
14461            .saturating_sub(cache_creation)
14462            .max(0);
14463        cost += non_cache_input as f64 * pricing.input_cost_per_mtok / 1_000_000.0;
14464        cost += output_tokens.unwrap_or(0) as f64 * pricing.output_cost_per_mtok / 1_000_000.0;
14465
14466        if let Some(cache_price) = pricing.cache_read_cost_per_mtok {
14467            cost += cache_read as f64 * cache_price / 1_000_000.0;
14468        }
14469        if let Some(cache_price) = pricing.cache_creation_cost_per_mtok {
14470            cost += cache_creation as f64 * cache_price / 1_000_000.0;
14471        }
14472
14473        Some(cost)
14474    }
14475
14476    /// Whether the pricing table has any entries.
14477    pub fn is_empty(&self) -> bool {
14478        self.entries.is_empty()
14479    }
14480}
14481
14482/// Convert "YYYY-MM-DD" date string to day_id (days since 2020-01-01),
14483/// matching the format produced by `day_id_from_millis`.
14484fn date_str_to_day_id(s: &str) -> Result<i64> {
14485    use chrono::NaiveDate;
14486    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
14487        Some(d) => d,
14488        None => unreachable!(),
14489    };
14490    NaiveDate::parse_from_str(s, "%Y-%m-%d")
14491        .map(|d| (d - EPOCH_2020).num_days())
14492        .with_context(|| format!("invalid effective_date '{s}'"))
14493}
14494
/// SQL `LIKE` pattern matcher, case-insensitive for ASCII letters.
///
/// `%` matches any sequence of characters (including the empty sequence)
/// and `_` matches exactly one character. There is no escape syntax: every
/// other byte of `pattern` must match literally.
fn sql_like_match(value: &str, pattern: &str) -> bool {
    sql_like_match_bytes(
        value.to_ascii_lowercase().as_bytes(),
        pattern.to_ascii_lowercase().as_bytes(),
    )
}

/// Byte length of the UTF-8 character whose first byte is `b`.
///
/// Only the lead byte is inspected; the caller is responsible for checking
/// that this many bytes are actually available.
fn utf8_char_len(b: u8) -> usize {
    if b < 0x80 {
        1
    } else if b < 0xE0 {
        2
    } else if b < 0xF0 {
        3
    } else {
        4
    }
}

/// Case-sensitive `LIKE` matcher over the UTF-8 bytes of a value and pattern.
///
/// Both slices are expected to be valid UTF-8 (they come from `&str`).
/// `_` consumes one whole character and `%` only ever grows by whole
/// characters, so a wildcard never splits a multi-byte sequence.
///
/// The match is iterative: on a mismatch only the most recent `%` is widened
/// by one character and matching resumes right after it. Earlier wildcards
/// never need revisiting, because taking the earliest possible match for the
/// text between two `%` leaves the longest remainder for the rest of the
/// pattern. This keeps the cost at O(len(val) * len(pat)) with constant
/// stack, where naive recursion is exponential for patterns with many `%`.
fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
    let mut v = 0; // index of the next unmatched byte in `val`
    let mut p = 0; // index of the next unmatched byte in `pat`
    // Most recent `%`: (pattern index just past it, value index up to which
    // that wildcard currently extends).
    let mut resume: Option<(usize, usize)> = None;

    while v < val.len() {
        match pat.get(p).copied() {
            Some(b'%') => {
                // A run of `%` behaves like a single one. Start by letting
                // it match nothing.
                while pat.get(p) == Some(&b'%') {
                    p += 1;
                }
                resume = Some((p, v));
                continue;
            }
            Some(b'_') => {
                let width = utf8_char_len(val[v]);
                if v + width <= val.len() {
                    v += width;
                    p += 1;
                    continue;
                }
            }
            Some(literal) if literal == val[v] => {
                v += 1;
                p += 1;
                continue;
            }
            _ => {}
        }

        // Mismatch (or pattern exhausted with value bytes left): let the
        // last `%` swallow one more character and retry from just after it.
        let Some((after_wildcard, covered)) = resume else {
            return false;
        };
        let widened = covered + utf8_char_len(val[covered]);
        if widened > val.len() {
            return false;
        }
        resume = Some((after_wildcard, widened));
        p = after_wildcard;
        v = widened;
    }

    // The value is fully consumed; whatever is left of the pattern must be
    // able to match the empty string, i.e. consist solely of `%`.
    pat[p..].iter().all(|&b| b == b'%')
}
14552
14553fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
14554    dotenvy::var(var)
14555        .ok()
14556        .and_then(|raw| raw.parse::<usize>().ok())
14557        .filter(|value| *value > 0)
14558        .unwrap_or(default)
14559}
14560
/// Reports whether `err` stands for a genuine out-of-memory failure: either
/// the typed `FrankenError::OutOfMemory` variant or a message that is exactly
/// "out of memory" / "not enough memory".
///
/// Substring matching on the rendered error chain is *deliberately* avoided.
/// frankensqlite's `FrankenError::OutOfMemory` renders as the literal
/// "out of memory" and is also raised for several internal conditions that
/// are not process-level OOM (VFS buffer / VDBE register allocation), and a
/// contextual message such as "connector parse failed: not enough memory in
/// record" must never be routed into the OOM-bisect/quarantine path. The
/// same downcast idiom is used by `retryable_franken_anyhow` above.
fn is_out_of_memory_error<E>(err: &E) -> bool
where
    E: OutOfMemoryProbe + ?Sized,
{
    OutOfMemoryProbe::is_out_of_memory(err)
}

/// Implemented by error types that can tell whether they represent an
/// out-of-memory condition.
trait OutOfMemoryProbe {
    /// Returns `true` when this error should be treated as out-of-memory.
    fn is_out_of_memory(&self) -> bool;
}
14577
14578impl OutOfMemoryProbe for anyhow::Error {
14579    fn is_out_of_memory(&self) -> bool {
14580        self.chain().any(|cause| {
14581            if cause
14582                .downcast_ref::<frankensqlite::FrankenError>()
14583                .is_some_and(|err| matches!(err, frankensqlite::FrankenError::OutOfMemory))
14584            {
14585                return true;
14586            }
14587            is_exact_out_of_memory_message(&cause.to_string())
14588        })
14589    }
14590}
14591
14592impl OutOfMemoryProbe for frankensqlite::FrankenError {
14593    fn is_out_of_memory(&self) -> bool {
14594        matches!(self, frankensqlite::FrankenError::OutOfMemory)
14595    }
14596}
14597
/// Returns `true` when `message`, after trimming surrounding whitespace, is
/// exactly "out of memory" or "not enough memory" (ASCII case-insensitive).
/// Messages that merely contain one of those phrases do not match.
fn is_exact_out_of_memory_message(message: &str) -> bool {
    let trimmed = message.trim();
    ["out of memory", "not enough memory"]
        .iter()
        .any(|phrase| trimmed.eq_ignore_ascii_case(phrase))
}
14604
14605// Second SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
14606// All methods (insert_conversation_tree, list_agents, list_conversations, etc.) are
14607// available through FrankenStorage.
14608
/// Daily count data for histogram display.
///
/// NOTE(review): the per-field notes are inferred from the field names;
/// confirm them against the query that fills this struct.
#[derive(Debug, Clone)]
pub struct DailyCount {
    /// Identifier of the day bucket. Presumably the same "days since
    /// 2020-01-01" day_id used elsewhere in this file — TODO confirm.
    pub day_id: i64,
    /// Number of sessions counted for the day.
    pub sessions: i64,
    /// Number of messages counted for the day.
    pub messages: i64,
    /// Number of characters counted for the day.
    pub chars: i64,
}
14617
/// Result of an analytics rebuild operation.
///
/// NOTE(review): the `*_rows` fields presumably count rows written to the
/// analytics tables of the same name — confirm at the construction site.
#[derive(Debug, Clone)]
pub struct AnalyticsRebuildResult {
    /// Row count for the message-metrics data.
    pub message_metrics_rows: usize,
    /// Row count for the hourly usage rollup.
    pub usage_hourly_rows: usize,
    /// Row count for the daily usage rollup.
    pub usage_daily_rows: usize,
    /// Row count for the per-model daily usage rollup.
    pub usage_models_daily_rows: usize,
    /// Duration of the rebuild in milliseconds.
    pub elapsed_ms: u64,
    /// Throughput of the rebuild in messages per second.
    pub messages_per_sec: f64,
}
14628
/// Result of rebuilding daily stats.
#[derive(Debug, Clone)]
pub struct DailyStatsRebuildResult {
    /// Number of rows created by the rebuild.
    pub rows_created: i64,
    /// Total session count reported by the rebuild.
    /// NOTE(review): exact scope (all agents / all days?) is not visible
    /// here — confirm at the construction site.
    pub total_sessions: i64,
}
14635
/// Result of purging archived data for a single agent.
///
/// `Default` yields zero for both counters.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AgentArchivePurgeResult {
    /// Number of conversations deleted by the purge.
    pub conversations_deleted: usize,
    /// Number of messages deleted by the purge.
    pub messages_deleted: usize,
}
14642
/// Health status of daily stats table.
///
/// NOTE(review): the field notes below are inferred from names only;
/// confirm them against the code that builds this struct.
#[derive(Debug, Clone)]
pub struct DailyStatsHealth {
    /// Whether the daily stats table is considered populated.
    pub populated: bool,
    /// Number of rows in the daily stats table.
    pub row_count: i64,
    /// Oldest update timestamp in milliseconds, if any row has one.
    pub oldest_update_ms: Option<i64>,
    /// Number of conversations used as the reference count.
    pub conversation_count: i64,
    /// Total recorded in the materialized daily stats.
    pub materialized_total: i64,
    /// Presumably the gap between `conversation_count` and
    /// `materialized_total` — TODO confirm sign and definition.
    pub drift: i64,
}
14653
14654// -------------------------------------------------------------------------
14655// FTS5 Batch Insert (P2 Opt 2.1)
14656// -------------------------------------------------------------------------
14657
/// Batch size (rows per statement) for FTS5 inserts.
///
/// Each row binds 7 parameters (rowid + 6 columns). With SQLite's default
/// `SQLITE_MAX_VARIABLE_NUMBER` of 999 that allows at most 999 / 7 ≈ 142
/// rows per statement; 100 is used for safety margin and memory efficiency.
const FTS5_BATCH_SIZE: usize = 100;
14662
/// One message row as read during an FTS rebuild.
///
/// NOTE(review): purpose inferred from the name and fields; the code that
/// produces and consumes this row is outside this view — confirm.
#[derive(Debug, Clone)]
struct FtsRebuildMessageRow {
    /// Row id associated with the row (presumably the FTS rowid — confirm).
    rowid: i64,
    /// Id of the message.
    message_id: i64,
    /// Id of the conversation the message belongs to.
    conversation_id: i64,
    /// Message text.
    content: String,
    /// Message timestamp, when one is recorded.
    created_at: Option<i64>,
}
14671
/// Conversation-level columns carried alongside messages for FTS.
///
/// NOTE(review): purpose inferred from the name and fields — confirm
/// against the rebuild code that uses it.
#[derive(Debug, Clone)]
struct FtsConversationProjection {
    /// Conversation title.
    title: String,
    /// Id of the owning agent, when known.
    agent_id: Option<i64>,
    /// Id of the workspace, when the conversation has one.
    workspace_id: Option<i64>,
    /// Source path of the conversation as a string.
    source_path: String,
}
14679
/// Entry for pending FTS5 insert.
///
/// The per-field notes describe how `FtsEntry::from_message` fills each
/// field; entries built elsewhere may differ.
#[derive(Debug, Clone)]
pub struct FtsEntry {
    /// Message text (`Message::content`).
    pub content: String,
    /// Conversation title, or an empty string when the conversation has none.
    pub title: String,
    /// Agent slug of the conversation.
    pub agent: String,
    /// Workspace path rendered lossily as a string, or empty when absent.
    pub workspace: String,
    /// Conversation source path rendered lossily as a string.
    pub source_path: String,
    /// Message timestamp, falling back to the conversation's start time.
    pub created_at: Option<i64>,
    /// Id of the message this entry indexes.
    pub message_id: i64,
}
14691
14692impl FtsEntry {
14693    /// Create an FTS entry from a message and conversation.
14694    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
14695        FtsEntry {
14696            content: msg.content.clone(),
14697            title: conv.title.clone().unwrap_or_default(),
14698            agent: conv.agent_slug.clone(),
14699            workspace: conv
14700                .workspace
14701                .as_ref()
14702                .map(|p| p.to_string_lossy().into_owned())
14703                .unwrap_or_default(),
14704            source_path: path_to_string(&conv.source_path),
14705            created_at: msg.created_at.or(conv.started_at),
14706            message_id,
14707        }
14708    }
14709}
14710
// NOTE(review): presumably the thresholds (entry count / total content
// characters) at which buffered `FtsEntry` values are flushed — the code
// that reads these constants is outside this view; confirm there.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1024 * 1024;
14713
14714/// Default batch size for the FTS rebuild INSERT (Bug #168).  When
14715/// `fts_messages` is empty but `messages` has 100K+ rows, a single unbounded
14716/// INSERT-SELECT OOMs.  This constant caps each batch so peak memory stays
14717/// bounded.  Override via `CASS_FTS_REBUILD_BATCH_SIZE` for tuning.
14718const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
14719
14720/// Read the FTS rebuild batch size from the environment, falling back to the
14721/// compiled-in default.
14722fn fts_rebuild_batch_size() -> usize {
14723    dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
14724        .ok()
14725        .and_then(|v| v.parse::<usize>().ok())
14726        .filter(|&n| n > 0)
14727        .unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
14728}
14729
14730fn flush_pending_fts_entries(
14731    storage: &FrankenStorage,
14732    tx: &FrankenTransaction<'_>,
14733    entries: &mut Vec<FtsEntry>,
14734    pending_chars: &mut usize,
14735    inserted_total: &mut usize,
14736) -> Result<()> {
14737    if entries.is_empty() {
14738        return Ok(());
14739    }
14740
14741    if storage.fts_messages_present_cached(tx) {
14742        *inserted_total += franken_batch_insert_fts(tx, entries)?;
14743    }
14744    entries.clear();
14745    *pending_chars = 0;
14746    Ok(())
14747}
14748
/// Render a path as an owned `String`, replacing any non-UTF-8 sequences
/// with U+FFFD (lossy conversion).
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    String::from(path.to_string_lossy())
}
14752
14753fn role_str(role: &MessageRole) -> String {
14754    role_as_str(role).to_owned()
14755}
14756
14757fn role_as_str(role: &MessageRole) -> &str {
14758    match role {
14759        MessageRole::User => "user",
14760        MessageRole::Agent => "agent",
14761        MessageRole::Tool => "tool",
14762        MessageRole::System => "system",
14763        MessageRole::Other(v) => v.as_str(),
14764    }
14765}
14766
14767fn agent_kind_str(kind: AgentKind) -> String {
14768    match kind {
14769        AgentKind::Cli => "cli".into(),
14770        AgentKind::VsCode => "vscode".into(),
14771        AgentKind::Hybrid => "hybrid".into(),
14772    }
14773}
14774
14775// =============================================================================
14776// Tests (bead yln.4)
14777// =============================================================================
14778
14779#[cfg(test)]
14780mod tests {
14781    use super::*;
14782    use serial_test::serial;
14783    use tempfile::TempDir;
14784
14785    struct EnvGuard {
14786        key: &'static str,
14787        previous: Option<String>,
14788    }
14789
14790    impl Drop for EnvGuard {
14791        fn drop(&mut self) {
14792            if let Some(value) = &self.previous {
14793                // SAFETY: test helper restores prior process env for isolation.
14794                unsafe {
14795                    std::env::set_var(self.key, value);
14796                }
14797            } else {
14798                // SAFETY: test helper restores prior process env for isolation.
14799                unsafe {
14800                    std::env::remove_var(self.key);
14801                }
14802            }
14803        }
14804    }
14805
14806    fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
14807        let previous = dotenvy::var(key).ok();
14808        // SAFETY: test helper toggles a process-local env var for isolation.
14809        unsafe {
14810            std::env::set_var(key, value.as_ref());
14811        }
14812        EnvGuard { key, previous }
14813    }
14814
14815    #[test]
14816    fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
14817        let dir = TempDir::new().unwrap();
14818        let canonical = dir.path().join("agent_search.db");
14819        let scratch = dir.path().join("scratch.db");
14820
14821        assert_eq!(
14822            doctor_mutation_lock_path_for_db_open(&canonical),
14823            Some(dir.path().join("doctor/locks/doctor-repair.lock"))
14824        );
14825        assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch), None);
14826    }
14827
14828    #[test]
14829    fn doctor_lock_metadata_pid_detection_is_exact() {
14830        let current = std::process::id();
14831
14832        assert!(doctor_lock_metadata_pid_is_current_process(&format!(
14833            "schema_version=1\npid={current}\nmode=safe_auto_run\n"
14834        )));
14835        assert!(!doctor_lock_metadata_pid_is_current_process(
14836            "schema_version=1\npid=not-a-pid\n"
14837        ));
14838        assert!(!doctor_lock_metadata_pid_is_current_process(&format!(
14839            "pid={}\n",
14840            current.saturating_add(1)
14841        )));
14842    }
14843
    /// While the doctor mutation lock file is exclusively locked and its
    /// metadata names a PID other than this process, opening the canonical
    /// database must fail with an error that mentions the active lock.
    #[test]
    fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Create the database once and close it again before taking the lock.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        // Hold the lock and record pid=1 as its owner, i.e. not this test
        // process.
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
        lock_file.sync_all().unwrap();

        // A short timeout keeps the expected failure fast.
        let err =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect_err("active doctor mutation lock must block canonical DB opens");
        let message = err.to_string();
        assert!(
            message.contains("doctor mutation lock") && message.contains("active"),
            "error should identify the active doctor mutation lock: {message}"
        );

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
14879
    /// When the lock metadata names this very process as the owner, a
    /// read-only open of the canonical database must still succeed even
    /// though the lock file is exclusively locked.
    #[test]
    fn doctor_storage_open_allows_current_doctor_process_probe() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Create the database once and close it again before taking the lock.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        // Hold the lock and record the current PID as its owner.
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
        lock_file.sync_all().unwrap();

        let conn =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect(
                    "doctor process must be able to run post-repair read probes under its own lock",
                );
        // Close the connection before releasing the lock.
        drop(conn);

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
14913
14914    #[test]
14915    fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
14916        let mut attempts = Vec::new();
14917
14918        let selected = disable_autocommit_retain(|pragma| {
14919            attempts.push(pragma);
14920            if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
14921                Err("compat namespace unavailable")
14922            } else {
14923                Ok(())
14924            }
14925        })
14926        .expect("canonical pragma should disable autocommit retain");
14927
14928        assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
14929        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
14930    }
14931
14932    #[test]
14933    fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
14934        let mut attempts = Vec::new();
14935
14936        let err = disable_autocommit_retain(|pragma| {
14937            attempts.push(pragma);
14938            Err("unsupported pragma")
14939        })
14940        .expect_err("unsupported autocommit retain controls should fail closed");
14941
14942        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
14943        let message = err.to_string();
14944        assert!(
14945            message.contains("refusing to keep a long-lived MVCC connection"),
14946            "error should force callers away from unbounded snapshot retention: {message}"
14947        );
14948        assert!(
14949            message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
14950                && message.contains("PRAGMA autocommit_retain = OFF;"),
14951            "error should preserve attempted PRAGMAs for diagnostics: {message}"
14952        );
14953    }
14954
    /// Open a rusqlite connection on `db_path` for the narrow purpose of
    /// injecting (or inspecting the raw projection of) sqlite_master
    /// corruption patterns in test fixtures.
    ///
    /// Frankensqlite intentionally does not support `PRAGMA writable_schema`
    /// writes or raw inserts to sqlite_master (see AGENTS.md: "PRAGMA
    /// writable_schema: Not supported for write operations"), so these
    /// fixtures retain rusqlite as the standard-SQLite interop layer. All
    /// callers are in this test module and run under #[cfg(test)]; no
    /// production code path touches rusqlite here.
    ///
    /// Panics if the connection cannot be opened.
    fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
        rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
    }
14966
    /// Test fixture: create a database at `db_path` with the historical
    /// recovery schema and insert `conversations` into it with plain SQL.
    ///
    /// A single agent row (id 1, slug "codex") owns every conversation.
    /// Conversation ids are the 1-based position in `conversations`, a
    /// conversation's workspace row (if any) reuses the conversation id, and
    /// message ids increase sequentially across all conversations.
    /// Any failure panics via `unwrap`.
    fn seed_historical_db_direct(
        db_path: &Path,
        conversations: &[crate::model::types::Conversation],
    ) {
        // Ensure the parent directory exists before opening the database.
        if let Some(parent) = db_path.parent() {
            fs::create_dir_all(parent).unwrap();
        }

        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
        conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
        // The one agent that every seeded conversation references.
        conn.execute_compat(
            "INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
        )
        .unwrap();

        // Message ids are global, not per conversation.
        let mut next_message_id = 1_i64;
        for (conv_index, conv) in conversations.iter().enumerate() {
            let conversation_id = i64::try_from(conv_index + 1).unwrap();
            // Insert a workspace row only for conversations that have one;
            // the path doubles as the display name.
            let workspace_id = conv.workspace.as_ref().map(|workspace| {
                let workspace_id = conversation_id;
                let workspace_path = workspace.to_string_lossy().into_owned();
                conn.execute_compat(
                    "INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
                    fparams![
                        workspace_id,
                        workspace_path.as_str(),
                        workspace_path.as_str()
                    ],
                )
                .unwrap();
                workspace_id
            });
            let source_path = conv.source_path.to_string_lossy().into_owned();
            let metadata_json = conv.metadata_json.to_string();
            conn.execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host
                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
                fparams![
                    conversation_id,
                    1_i64,
                    workspace_id,
                    conv.source_id.as_str(),
                    conv.external_id.as_deref(),
                    conv.title.as_deref(),
                    source_path.as_str(),
                    conv.started_at,
                    conv.ended_at,
                    conv.approx_tokens,
                    metadata_json.as_str(),
                    conv.origin_host.as_deref()
                ],
            )
            .unwrap();

            for msg in &conv.messages {
                let extra_json = msg.extra_json.to_string();
                let role = role_str(&msg.role);
                conn.execute_compat(
                    "INSERT INTO messages(
                        id, conversation_id, idx, role, author, created_at, content, extra_json
                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
                    fparams![
                        next_message_id,
                        conversation_id,
                        msg.idx,
                        role.as_str(),
                        msg.author.as_deref(),
                        msg.created_at,
                        msg.content.as_str(),
                        extra_json.as_str()
                    ],
                )
                .unwrap();
                next_message_id += 1;
            }
        }
    }
15048
15049    // =========================================================================
15050    // User data file protection tests (bead yln.4)
15051    // =========================================================================
15052
15053    #[test]
15054    fn is_user_data_file_detects_bookmarks() {
15055        assert!(is_user_data_file(Path::new("/data/bookmarks.db")));
15056        assert!(is_user_data_file(Path::new("bookmarks.db")));
15057    }
15058
15059    #[test]
15060    fn is_user_data_file_detects_tui_state() {
15061        assert!(is_user_data_file(Path::new("/data/tui_state.json")));
15062    }
15063
15064    #[test]
15065    fn is_user_data_file_detects_sources_toml() {
15066        assert!(is_user_data_file(Path::new("/config/sources.toml")));
15067    }
15068
15069    #[test]
15070    fn is_user_data_file_detects_env() {
15071        assert!(is_user_data_file(Path::new(".env")));
15072    }
15073
15074    #[test]
15075    fn is_user_data_file_rejects_other_files() {
15076        assert!(!is_user_data_file(Path::new("index.db")));
15077        assert!(!is_user_data_file(Path::new("conversations.db")));
15078        assert!(!is_user_data_file(Path::new("random.txt")));
15079    }
15080
15081    // =========================================================================
15082    // Backup creation tests (bead yln.4)
15083    // =========================================================================
15084
15085    #[test]
15086    fn create_backup_returns_none_for_nonexistent() {
15087        let dir = TempDir::new().unwrap();
15088        let db_path = dir.path().join("nonexistent.db");
15089        let result = create_backup(&db_path).unwrap();
15090        assert!(result.is_none());
15091    }
15092
15093    #[test]
15094    fn create_backup_creates_named_file() {
15095        let dir = TempDir::new().unwrap();
15096        let db_path = dir.path().join("test.db");
15097        std::fs::write(&db_path, b"test data").unwrap();
15098
15099        let backup_path = create_backup(&db_path).unwrap();
15100        assert!(backup_path.is_some());
15101        let backup = backup_path.unwrap();
15102        assert!(backup.exists());
15103        assert!(
15104            backup
15105                .file_name()
15106                .unwrap()
15107                .to_str()
15108                .unwrap()
15109                .contains("backup")
15110        );
15111    }
15112
15113    #[test]
15114    fn create_backup_paths_are_unique() {
15115        let dir = TempDir::new().unwrap();
15116        let db_path = dir.path().join("test.db");
15117        std::fs::write(&db_path, b"test data").unwrap();
15118
15119        let first = create_backup(&db_path).unwrap().unwrap();
15120        let second = create_backup(&db_path).unwrap().unwrap();
15121
15122        assert_ne!(first, second);
15123        assert!(first.exists());
15124        assert!(second.exists());
15125    }
15126
    /// Query-plan regression test: fetching a conversation's messages
    /// ordered by `idx` must compile to a plan that contains a `SeekGE`
    /// opcode and no `SorterOpen` opcode.
    #[test]
    fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
        use std::path::PathBuf;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        let storage = SqliteStorage::open(&db_path).unwrap();

        // Seed one agent and one two-message conversation so the query has
        // real rows to plan against.
        let agent = Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent).unwrap();
        let conversation = Conversation {
            id: None,
            agent_slug: "claude_code".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("conv-1".into()),
            title: Some("Lexical rebuild".into()),
            source_path: PathBuf::from("/tmp/conv-1.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![
                Message {
                    id: None,
                    idx: 0,
                    role: MessageRole::User,
                    author: Some("user".into()),
                    created_at: Some(1_700_000_000_010),
                    content: "first".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: Vec::new(),
                },
                Message {
                    id: None,
                    idx: 1,
                    role: MessageRole::Agent,
                    author: Some("assistant".into()),
                    created_at: Some(1_700_000_000_020),
                    content: "second".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: Vec::new(),
                },
            ],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        // Look the stored id up by external id.
        let conversation_id = storage
            .conn
            .query_row_map(
                "SELECT id FROM conversations WHERE external_id = ?1",
                fparams!["conv-1"],
                |row| row.get_typed::<i64>(0),
            )
            .unwrap();

        // Column 1 of each EXPLAIN row is collected as the opcode name.
        let opcodes: Vec<String> = storage
            .conn
            .query_map_collect(
                "EXPLAIN \
                 SELECT id, idx, role, author, created_at, content \
                 FROM messages \
                 WHERE conversation_id = ?1 ORDER BY idx",
                fparams![conversation_id],
                |row| row.get_typed(1),
            )
            .unwrap();

        assert!(
            opcodes.iter().any(|opcode| opcode == "SeekGE"),
            "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
        );
        assert!(
            !opcodes.iter().any(|opcode| opcode == "SorterOpen"),
            "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
        );
    }
15213
15214    #[test]
15215    fn schema_check_rebuild_classification_ignores_transient_errors() {
15216        assert!(!schema_check_error_requires_rebuild(
15217            &frankensqlite::FrankenError::Busy
15218        ));
15219        assert!(!schema_check_error_requires_rebuild(
15220            &frankensqlite::FrankenError::DatabaseLocked {
15221                path: PathBuf::from("/tmp/test.db"),
15222            }
15223        ));
15224        assert!(!schema_check_error_requires_rebuild(
15225            &frankensqlite::FrankenError::CannotOpen {
15226                path: PathBuf::from("/tmp/test.db"),
15227            }
15228        ));
15229        assert!(!schema_check_error_requires_rebuild(
15230            &frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"))
15231        ));
15232    }
15233
15234    #[test]
15235    fn schema_check_rebuild_classification_keeps_corruption_errors() {
15236        assert!(schema_check_error_requires_rebuild(
15237            &frankensqlite::FrankenError::DatabaseCorrupt {
15238                detail: "bad header".to_string(),
15239            }
15240        ));
15241        assert!(schema_check_error_requires_rebuild(
15242            &frankensqlite::FrankenError::WalCorrupt {
15243                detail: "bad wal".to_string(),
15244            }
15245        ));
15246        assert!(schema_check_error_requires_rebuild(
15247            &frankensqlite::FrankenError::NotADatabase {
15248                path: PathBuf::from("/tmp/test.db"),
15249            }
15250        ));
15251        assert!(schema_check_error_requires_rebuild(
15252            &frankensqlite::FrankenError::ShortRead {
15253                expected: 4096,
15254                actual: 64,
15255            }
15256        ));
15257    }
15258
15259    #[test]
15260    fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
15261        let retryable_errors = [
15262            frankensqlite::FrankenError::Busy,
15263            frankensqlite::FrankenError::BusyRecovery,
15264            frankensqlite::FrankenError::BusySnapshot {
15265                conflicting_pages: "1,2".to_string(),
15266            },
15267            frankensqlite::FrankenError::DatabaseLocked {
15268                path: PathBuf::from("/tmp/test.db"),
15269            },
15270            frankensqlite::FrankenError::LockFailed {
15271                detail: "fcntl lock still held".to_string(),
15272            },
15273            frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
15274            frankensqlite::FrankenError::SerializationFailure { page: 11 },
15275            frankensqlite::FrankenError::Internal("database is locked".to_string()),
15276        ];
15277
15278        for err in retryable_errors {
15279            assert!(
15280                backup_vacuum_error_requires_consistent_retry(&err),
15281                "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
15282            );
15283        }
15284
15285        assert!(!backup_vacuum_error_requires_consistent_retry(
15286            &frankensqlite::FrankenError::NotADatabase {
15287                path: PathBuf::from("/tmp/test.db")
15288            }
15289        ));
15290        assert!(!backup_vacuum_error_requires_consistent_retry(
15291            &frankensqlite::FrankenError::DatabaseCorrupt {
15292                detail: "bad header".to_string()
15293            }
15294        ));
15295    }
15296
15297    #[test]
15298    fn create_backup_uses_hidden_vacuum_stage_path() {
15299        let backup_path = PathBuf::from("/tmp/test.db.backup.123.456.0");
15300        let stage_path = vacuum_stage_backup_path(&backup_path);
15301        let stage_name = stage_path
15302            .file_name()
15303            .and_then(|name| name.to_str())
15304            .unwrap_or_default();
15305
15306        assert!(stage_name.starts_with('.'));
15307        assert!(stage_name.ends_with(".vacuum-in-progress"));
15308        assert!(
15309            !is_backup_root_name(stage_name, "test.db.backup."),
15310            "incomplete VACUUM output must not be discoverable as a backup root"
15311        );
15312    }
15313
15314    #[test]
15315    fn create_backup_preserves_content() {
15316        let dir = TempDir::new().unwrap();
15317        let db_path = dir.path().join("test.db");
15318        let original_content = b"test database content 12345";
15319        std::fs::write(&db_path, original_content).unwrap();
15320
15321        let backup_path = create_backup(&db_path).unwrap().unwrap();
15322        let backup_content = std::fs::read(&backup_path).unwrap();
15323        assert_eq!(backup_content, original_content);
15324    }
15325
15326    #[test]
15327    fn create_backup_copies_sidecars_when_present() {
15328        let dir = TempDir::new().unwrap();
15329        let db_path = dir.path().join("test.db");
15330        std::fs::write(&db_path, b"db").unwrap();
15331        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15332        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15333
15334        let backup_path = create_backup(&db_path).unwrap().unwrap();
15335
15336        assert_eq!(
15337            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15338            b"wal"
15339        );
15340        assert_eq!(
15341            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15342            b"shm"
15343        );
15344    }
15345
15346    #[test]
15347    #[cfg(unix)]
15348    fn create_backup_rejects_symlink_root_during_raw_fallback() {
15349        use std::os::unix::fs::symlink;
15350
15351        let dir = TempDir::new().unwrap();
15352        let outside_db = dir.path().join("outside.db");
15353        let db_path = dir.path().join("test.db");
15354        std::fs::write(&outside_db, b"not sqlite").unwrap();
15355        symlink(&outside_db, &db_path).unwrap();
15356
15357        let err = create_backup(&db_path).unwrap_err();
15358
15359        assert!(
15360            err.to_string().contains("bundle symlink"),
15361            "unexpected error: {err:#}"
15362        );
15363        assert_eq!(std::fs::read(&outside_db).unwrap(), b"not sqlite");
15364        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15365            .unwrap()
15366            .filter_map(|entry| entry.ok())
15367            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15368            .filter(|name| name.starts_with("test.db.backup."))
15369            .collect();
15370        assert!(
15371            backup_roots.is_empty(),
15372            "symlinked backup source must not publish backup roots: {backup_roots:?}"
15373        );
15374    }
15375
15376    #[test]
15377    #[cfg(unix)]
15378    fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
15379        use std::os::unix::fs::symlink;
15380
15381        let dir = TempDir::new().unwrap();
15382        let db_path = dir.path().join("test.db");
15383        let outside_wal = dir.path().join("outside.wal");
15384        let wal_path = database_sidecar_path(&db_path, "-wal");
15385        std::fs::write(&db_path, b"not sqlite").unwrap();
15386        std::fs::write(&outside_wal, b"outside wal").unwrap();
15387        symlink(&outside_wal, &wal_path).unwrap();
15388
15389        let err = create_backup(&db_path).unwrap_err();
15390
15391        assert!(
15392            err.to_string().contains("bundle symlink"),
15393            "unexpected error: {err:#}"
15394        );
15395        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15396        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15397            .unwrap()
15398            .filter_map(|entry| entry.ok())
15399            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15400            .filter(|name| name.starts_with("test.db.backup."))
15401            .collect();
15402        assert!(
15403            backup_roots.is_empty(),
15404            "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
15405        );
15406    }
15407
15408    // =========================================================================
15409    // Backup cleanup tests (bead yln.4)
15410    // =========================================================================
15411
15412    #[test]
15413    fn cleanup_old_backups_keeps_recent() {
15414        let dir = TempDir::new().unwrap();
15415        let db_path = dir.path().join("test.db");
15416
15417        // Create 5 backup files with different timestamps
15418        for i in 0..5 {
15419            let backup_name = format!("test.db.backup.{}", 1000 + i);
15420            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15421        }
15422
15423        cleanup_old_backups(&db_path, 3).unwrap();
15424
15425        // Count remaining backup files
15426        let backups: Vec<_> = std::fs::read_dir(dir.path())
15427            .unwrap()
15428            .filter_map(|e| e.ok())
15429            .filter(|e| e.file_name().to_str().unwrap_or("").contains("backup"))
15430            .collect();
15431
15432        assert_eq!(backups.len(), 3);
15433    }
15434
15435    #[test]
15436    fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
15437        let dir = TempDir::new().unwrap();
15438        let db_path = dir.path().join("test.db");
15439
15440        for i in 0..3 {
15441            let backup_name = format!("test.db.backup.{}", 1000 + i);
15442            let backup_path = dir.path().join(&backup_name);
15443            std::fs::write(&backup_path, format!("backup {i}")).unwrap();
15444            std::fs::write(format!("{}-wal", backup_path.display()), b"wal").unwrap();
15445            std::fs::write(format!("{}-shm", backup_path.display()), b"shm").unwrap();
15446            std::thread::sleep(std::time::Duration::from_millis(20));
15447        }
15448
15449        cleanup_old_backups(&db_path, 2).unwrap();
15450
15451        let mut roots = Vec::new();
15452        let mut wals = Vec::new();
15453        let mut shms = Vec::new();
15454        for entry in std::fs::read_dir(dir.path())
15455            .unwrap()
15456            .filter_map(|e| e.ok())
15457        {
15458            let name = entry.file_name().to_string_lossy().into_owned();
15459            if name.ends_with("-wal") {
15460                wals.push(name);
15461            } else if name.ends_with("-shm") {
15462                shms.push(name);
15463            } else if name.contains("backup") {
15464                roots.push(name);
15465            }
15466        }
15467
15468        assert_eq!(roots.len(), 2, "should keep two backup roots");
15469        assert_eq!(
15470            wals.len(),
15471            2,
15472            "should keep WAL sidecars only for retained backups"
15473        );
15474        assert_eq!(
15475            shms.len(),
15476            2,
15477            "should keep SHM sidecars only for retained backups"
15478        );
15479    }
15480
15481    #[test]
15482    fn move_database_bundle_moves_database_and_sidecars() {
15483        let dir = TempDir::new().unwrap();
15484        let db_path = dir.path().join("test.db");
15485        let backup_path = dir.path().join("test.db.corrupt");
15486
15487        std::fs::write(&db_path, b"db").unwrap();
15488        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15489        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15490
15491        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15492        assert_eq!(
15493            moved,
15494            DatabaseBundleMoveResult {
15495                database: true,
15496                wal: true,
15497                shm: true
15498            }
15499        );
15500        assert!(moved.moved_any());
15501
15502        assert!(!db_path.exists());
15503        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15504        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15505
15506        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15507        assert_eq!(
15508            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15509            b"wal"
15510        );
15511        assert_eq!(
15512            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15513            b"shm"
15514        );
15515    }
15516
15517    #[test]
15518    fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
15519        let dir = TempDir::new().unwrap();
15520        let db_path = dir.path().join("test.db");
15521        let backup_path = dir.path().join("test.db.corrupt");
15522
15523        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15524        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15525
15526        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15527        assert_eq!(
15528            moved,
15529            DatabaseBundleMoveResult {
15530                database: false,
15531                wal: true,
15532                shm: true
15533            }
15534        );
15535        assert!(moved.moved_any());
15536        assert!(!db_path.exists());
15537        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15538        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15539        assert_eq!(
15540            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15541            b"wal"
15542        );
15543        assert_eq!(
15544            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15545            b"shm"
15546        );
15547    }
15548
15549    #[test]
15550    #[cfg(unix)]
15551    fn move_database_bundle_moves_dangling_symlink_database_root() {
15552        use std::os::unix::fs::symlink;
15553
15554        let dir = TempDir::new().unwrap();
15555        let db_path = dir.path().join("test.db");
15556        let backup_path = dir.path().join("test.db.corrupt");
15557        let missing_target = dir.path().join("missing-target.db");
15558
15559        symlink(&missing_target, &db_path).unwrap();
15560
15561        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15562
15563        assert_eq!(
15564            moved,
15565            DatabaseBundleMoveResult {
15566                database: true,
15567                wal: false,
15568                shm: false
15569            }
15570        );
15571        assert!(std::fs::symlink_metadata(&db_path).is_err());
15572        assert!(
15573            std::fs::symlink_metadata(&backup_path)
15574                .unwrap()
15575                .file_type()
15576                .is_symlink()
15577        );
15578        assert!(!missing_target.exists());
15579    }
15580
15581    #[test]
15582    #[cfg(unix)]
15583    fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
15584        use std::os::unix::fs::symlink;
15585
15586        let dir = TempDir::new().unwrap();
15587        let db_path = dir.path().join("test.db");
15588        let backup_path = dir.path().join("test.db.corrupt");
15589        let missing_wal_target = dir.path().join("missing-wal");
15590        let missing_shm_target = dir.path().join("missing-shm");
15591        let wal_path = database_sidecar_path(&db_path, "-wal");
15592        let shm_path = database_sidecar_path(&db_path, "-shm");
15593
15594        symlink(&missing_wal_target, &wal_path).unwrap();
15595        symlink(&missing_shm_target, &shm_path).unwrap();
15596
15597        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15598
15599        assert_eq!(
15600            moved,
15601            DatabaseBundleMoveResult {
15602                database: false,
15603                wal: true,
15604                shm: true
15605            }
15606        );
15607        assert!(std::fs::symlink_metadata(&wal_path).is_err());
15608        assert!(std::fs::symlink_metadata(&shm_path).is_err());
15609        assert!(
15610            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-wal"))
15611                .unwrap()
15612                .file_type()
15613                .is_symlink()
15614        );
15615        assert!(
15616            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-shm"))
15617                .unwrap()
15618                .file_type()
15619                .is_symlink()
15620        );
15621        assert!(!missing_wal_target.exists());
15622        assert!(!missing_shm_target.exists());
15623    }
15624
15625    #[test]
15626    fn copy_database_bundle_copies_database_and_sidecars() {
15627        let dir = TempDir::new().unwrap();
15628        let db_path = dir.path().join("test.db");
15629        let copied_path = dir.path().join("copy.db");
15630
15631        std::fs::write(&db_path, b"db").unwrap();
15632        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15633        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15634
15635        copy_database_bundle(&db_path, &copied_path).unwrap();
15636
15637        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15638        assert_eq!(
15639            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15640            b"wal"
15641        );
15642        assert_eq!(
15643            std::fs::read(database_sidecar_path(&copied_path, "-shm")).unwrap(),
15644            b"shm"
15645        );
15646        assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
15647    }
15648
15649    #[test]
15650    fn copy_database_bundle_creates_destination_parent() {
15651        let dir = TempDir::new().unwrap();
15652        let db_path = dir.path().join("test.db");
15653        let copied_path = dir.path().join("nested/copies/copy.db");
15654
15655        std::fs::write(&db_path, b"db").unwrap();
15656        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15657
15658        copy_database_bundle(&db_path, &copied_path).unwrap();
15659
15660        assert!(copied_path.parent().unwrap().is_dir());
15661        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15662        assert_eq!(
15663            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15664            b"wal"
15665        );
15666    }
15667
15668    #[test]
15669    #[cfg(unix)]
15670    fn copy_database_bundle_rejects_symlink_source_root() {
15671        use std::os::unix::fs::symlink;
15672
15673        let dir = TempDir::new().unwrap();
15674        let outside_db = dir.path().join("outside.db");
15675        let db_path = dir.path().join("test.db");
15676        let copied_path = dir.path().join("copy.db");
15677
15678        std::fs::write(&outside_db, b"outside").unwrap();
15679        symlink(&outside_db, &db_path).unwrap();
15680
15681        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15682
15683        assert!(
15684            err.to_string().contains("bundle symlink"),
15685            "unexpected error: {err:#}"
15686        );
15687        assert!(!copied_path.exists());
15688        assert_eq!(std::fs::read(&outside_db).unwrap(), b"outside");
15689    }
15690
15691    #[test]
15692    #[cfg(unix)]
15693    fn copy_database_bundle_rejects_symlink_sidecar() {
15694        use std::os::unix::fs::symlink;
15695
15696        let dir = TempDir::new().unwrap();
15697        let db_path = dir.path().join("test.db");
15698        let copied_path = dir.path().join("copy.db");
15699        let outside_wal = dir.path().join("outside.wal");
15700        let wal_path = database_sidecar_path(&db_path, "-wal");
15701
15702        std::fs::write(&db_path, b"db").unwrap();
15703        std::fs::write(&outside_wal, b"outside wal").unwrap();
15704        symlink(&outside_wal, &wal_path).unwrap();
15705
15706        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15707
15708        assert!(
15709            err.to_string().contains("bundle symlink"),
15710            "unexpected error: {err:#}"
15711        );
15712        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15713        assert!(!copied_path.exists());
15714        assert!(!database_sidecar_path(&copied_path, "-wal").exists());
15715    }
15716
15717    #[test]
15718    fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
15719        let dir = TempDir::new().unwrap();
15720        let db_path = dir.path().join("test.db");
15721        let backup_path = dir.path().join("nested/backups/test.db.corrupt");
15722
15723        std::fs::write(&db_path, b"db").unwrap();
15724        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15725        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15726
15727        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15728        assert_eq!(
15729            moved,
15730            DatabaseBundleMoveResult {
15731                database: true,
15732                wal: true,
15733                shm: true
15734            }
15735        );
15736        assert!(backup_path.parent().unwrap().is_dir());
15737        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15738        assert_eq!(
15739            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15740            b"wal"
15741        );
15742        assert_eq!(
15743            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15744            b"shm"
15745        );
15746    }
15747
15748    #[test]
15749    fn remove_database_files_removes_orphan_sidecars_without_main_db() {
15750        let dir = TempDir::new().unwrap();
15751        let db_path = dir.path().join("test.db");
15752
15753        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15754        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15755
15756        remove_database_files(&db_path).unwrap();
15757
15758        assert!(!db_path.exists());
15759        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15760        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15761    }
15762
15763    #[test]
15764    fn cleanup_old_backups_ignores_backup_named_directories() {
15765        let dir = TempDir::new().unwrap();
15766        let db_path = dir.path().join("test.db");
15767
15768        for i in 0..3 {
15769            let backup_name = format!("test.db.backup.{}", 1000 + i);
15770            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15771        }
15772        std::fs::create_dir(dir.path().join("test.db.backup.directory")).unwrap();
15773
15774        cleanup_old_backups(&db_path, 2).unwrap();
15775
15776        let mut backup_files = Vec::new();
15777        let mut backup_dirs = Vec::new();
15778        for entry in std::fs::read_dir(dir.path())
15779            .unwrap()
15780            .filter_map(|e| e.ok())
15781        {
15782            let name = entry.file_name().to_string_lossy().into_owned();
15783            if !name.starts_with("test.db.backup.") {
15784                continue;
15785            }
15786            if entry.path().is_dir() {
15787                backup_dirs.push(name);
15788            } else {
15789                backup_files.push(name);
15790            }
15791        }
15792
15793        assert_eq!(
15794            backup_files.len(),
15795            2,
15796            "only real backup files count toward retention"
15797        );
15798        assert_eq!(
15799            backup_dirs.len(),
15800            1,
15801            "backup-named directories should be ignored"
15802        );
15803    }
15804
15805    // =========================================================================
15806    // Storage open/create tests (bead yln.4)
15807    // =========================================================================
15808
15809    #[test]
15810    fn open_creates_new_database() {
15811        let dir = TempDir::new().unwrap();
15812        let db_path = dir.path().join("new.db");
15813        assert!(!db_path.exists());
15814
15815        let storage = SqliteStorage::open(&db_path).unwrap();
15816        assert!(db_path.exists());
15817        storage.close().unwrap();
15818    }
15819
15820    #[test]
15821    fn open_readonly_fails_for_nonexistent() {
15822        let dir = TempDir::new().unwrap();
15823        let db_path = dir.path().join("nonexistent.db");
15824        let result = SqliteStorage::open_readonly(&db_path);
15825        assert!(result.is_err());
15826    }
15827
15828    #[test]
15829    fn open_readonly_succeeds_for_existing() {
15830        let dir = TempDir::new().unwrap();
15831        let db_path = dir.path().join("existing.db");
15832
15833        // Create first
15834        let _storage = SqliteStorage::open(&db_path).unwrap();
15835        drop(_storage);
15836
15837        // Now open readonly
15838        let storage = SqliteStorage::open_readonly(&db_path).unwrap();
15839        assert!(storage.schema_version().is_ok());
15840    }
15841
15842    #[test]
15843    fn reopen_existing_current_schema_is_idempotent() {
15844        let dir = TempDir::new().unwrap();
15845        let db_path = dir.path().join("existing.db");
15846
15847        // First open creates and migrates to current schema.
15848        {
15849            let storage = SqliteStorage::open(&db_path).unwrap();
15850            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15851        }
15852
15853        // Re-open should not fail on current schema.
15854        let reopened = SqliteStorage::open(&db_path).unwrap();
15855        assert_eq!(
15856            reopened.schema_version().unwrap(),
15857            CURRENT_SCHEMA_VERSION,
15858            "reopening current schema DB should be idempotent"
15859        );
15860    }
15861
15862    #[test]
15863    fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
15864        let dir = TempDir::new().unwrap();
15865        let db_path = dir.path().join("existing.db");
15866
15867        // Create DB at current schema.
15868        {
15869            let storage = SqliteStorage::open(&db_path).unwrap();
15870            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15871        }
15872
15873        // Should open normally, not require rebuild.
15874        let reopened = SqliteStorage::open_or_rebuild(&db_path)
15875            .expect("current schema DB should open without rebuild");
15876        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15877    }
15878
15879    #[test]
15880    fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
15881        let dir = TempDir::new().unwrap();
15882        let db_path = dir.path().join("db_dir");
15883        std::fs::create_dir(&db_path).unwrap();
15884
15885        let result = SqliteStorage::open_or_rebuild(&db_path);
15886
15887        match result {
15888            Err(MigrationError::Database(_)) | Err(MigrationError::Io(_)) => {}
15889            Err(MigrationError::RebuildRequired { reason, .. }) => {
15890                panic!("should not rebuild non-database path: {reason}")
15891            }
15892            Err(MigrationError::Other(msg)) => {
15893                panic!("should preserve underlying open error, got Other: {msg}")
15894            }
15895            Ok(_) => panic!("directory path must not open as a database"),
15896        }
15897
15898        assert!(
15899            db_path.is_dir(),
15900            "non-database directory must be left in place"
15901        );
15902    }
15903
15904    // =========================================================================
15905    // Schema version tests (bead yln.4)
15906    // =========================================================================
15907
15908    #[test]
15909    fn schema_version_returns_current() {
15910        let dir = TempDir::new().unwrap();
15911        let db_path = dir.path().join("test.db");
15912        let storage = SqliteStorage::open(&db_path).unwrap();
15913        let version = storage.schema_version().unwrap();
15914        assert!(version >= 5, "Schema version should be at least 5");
15915    }
15916
15917    // =========================================================================
15918    // Current analytics/schema smoke test (bead z9fse.11)
15919    // =========================================================================
15920
15921    #[test]
15922    fn migration_v13_creates_analytics_tables() {
15923        let dir = TempDir::new().unwrap();
15924        let db_path = dir.path().join("test.db");
15925        let storage = SqliteStorage::open(&db_path).unwrap();
15926
15927        // Schema version should be current.
15928        let version = storage.schema_version().unwrap();
15929        assert_eq!(
15930            version, CURRENT_SCHEMA_VERSION,
15931            "Schema version must match CURRENT_SCHEMA_VERSION after migration"
15932        );
15933
15934        let conn = storage.raw();
15935
15936        // Helper: collect column names from PRAGMA table_info
15937        fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
15938            conn.query_map_collect(
15939                &format!("PRAGMA table_info({})", table),
15940                fparams![],
15941                |row: &FrankenRow| row.get_typed(1),
15942            )
15943            .unwrap()
15944        }
15945
15946        // Helper: collect index names from PRAGMA index_list
15947        fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
15948            conn.query_map_collect(
15949                &format!("PRAGMA index_list({})", table),
15950                fparams![],
15951                |row: &FrankenRow| row.get_typed(1),
15952            )
15953            .unwrap()
15954        }
15955
15956        // Verify message_metrics table exists with expected columns
15957        let mm_cols = col_names(conn, "message_metrics");
15958        for expected in &[
15959            "message_id",
15960            "hour_id",
15961            "day_id",
15962            "content_tokens_est",
15963            "model_name",
15964            "model_family",
15965            "model_tier",
15966            "provider",
15967            "api_input_tokens",
15968            "has_plan",
15969            "agent_slug",
15970            "role",
15971            "api_data_source",
15972        ] {
15973            assert!(
15974                mm_cols.contains(&expected.to_string()),
15975                "message_metrics missing column: {expected}"
15976            );
15977        }
15978
15979        // Verify usage_hourly table
15980        let uh_cols = col_names(conn, "usage_hourly");
15981        for expected in &[
15982            "hour_id",
15983            "plan_message_count",
15984            "plan_content_tokens_est_total",
15985            "plan_api_tokens_total",
15986            "api_coverage_message_count",
15987            "content_tokens_est_user",
15988            "api_thinking_tokens_total",
15989        ] {
15990            assert!(
15991                uh_cols.contains(&expected.to_string()),
15992                "usage_hourly missing column: {expected}"
15993            );
15994        }
15995
15996        // Verify usage_daily table
15997        let ud_cols = col_names(conn, "usage_daily");
15998        for expected in &[
15999            "day_id",
16000            "plan_content_tokens_est_total",
16001            "plan_api_tokens_total",
16002            "api_thinking_tokens_total",
16003            "content_tokens_est_assistant",
16004            "message_count",
16005        ] {
16006            assert!(
16007                ud_cols.contains(&expected.to_string()),
16008                "usage_daily missing column: {expected}"
16009            );
16010        }
16011
16012        // Verify usage_models_daily table
16013        let umd_cols = col_names(conn, "usage_models_daily");
16014        for expected in &[
16015            "day_id",
16016            "model_family",
16017            "model_tier",
16018            "message_count",
16019            "api_tokens_total",
16020            "api_coverage_message_count",
16021        ] {
16022            assert!(
16023                umd_cols.contains(&expected.to_string()),
16024                "usage_models_daily missing column: {expected}"
16025            );
16026        }
16027
16028        // Verify indexes on message_metrics
16029        let mm_idxs = idx_names(conn, "message_metrics");
16030        assert!(
16031            mm_idxs.iter().any(|n| n.contains("idx_mm_hour")),
16032            "message_metrics must have hour index"
16033        );
16034        assert!(
16035            mm_idxs.iter().any(|n| n.contains("idx_mm_agent_day")),
16036            "message_metrics must have agent+day index"
16037        );
16038        assert!(
16039            mm_idxs
16040                .iter()
16041                .any(|n| n.contains("idx_mm_model_family_day")),
16042            "message_metrics must have model_family+day index"
16043        );
16044
16045        // Verify indexes on usage_hourly
16046        let uh_idxs = idx_names(conn, "usage_hourly");
16047        assert!(
16048            uh_idxs.iter().any(|n| n.contains("idx_uh_agent")),
16049            "usage_hourly must have agent index"
16050        );
16051
16052        // Verify indexes on usage_daily
16053        let ud_idxs = idx_names(conn, "usage_daily");
16054        assert!(
16055            ud_idxs.iter().any(|n| n.contains("idx_ud_agent")),
16056            "usage_daily must have agent index"
16057        );
16058
16059        // Verify indexes on usage_models_daily
16060        let umd_idxs = idx_names(conn, "usage_models_daily");
16061        assert!(
16062            umd_idxs.iter().any(|n| n.contains("idx_umd_model_day")),
16063            "usage_models_daily must have model+day index"
16064        );
16065
16066        let conversation_cols = col_names(conn, "conversations");
16067        assert!(
16068            conversation_cols.contains(&"last_message_idx".to_string())
16069                && conversation_cols.contains(&"last_message_created_at".to_string()),
16070            "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
16071        );
16072        let fts_schema_rows: i64 = conn
16073            .query_row_map(
16074                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
16075                fparams![],
16076                |row: &FrankenRow| row.get_typed(0),
16077            )
16078            .unwrap();
16079        assert_eq!(
16080            fts_schema_rows, 0,
16081            "fresh schema should not create and immediately drop derived fts_messages"
16082        );
16083        let integrity: Vec<String> = conn
16084            .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
16085                row.get_typed(0)
16086            })
16087            .unwrap();
16088        assert_eq!(
16089            integrity,
16090            vec!["ok".to_string()],
16091            "fresh schema must pass SQLite integrity_check"
16092        );
16093    }
16094
16095    #[test]
16096    fn hour_id_round_trip() {
16097        // 2026-02-06 12:00:00 UTC
16098        let ts_ms = 1_770_508_800_000_i64;
16099        let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
16100        let day_id = SqliteStorage::day_id_from_millis(ts_ms);
16101
16102        // hour_id should be 24x day_id (approximately)
16103        assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");
16104
16105        // Round-trip: millis_from_hour_id should give start of that hour
16106        let back = SqliteStorage::millis_from_hour_id(hour_id);
16107        assert!(
16108            back <= ts_ms && ts_ms - back < 3_600_000,
16109            "Round-trip should land within the same hour"
16110        );
16111    }
16112
16113    #[test]
16114    fn day_and_hour_ids_floor_negative_millis() {
16115        // One millisecond before the Unix epoch should still floor into the
16116        // previous second/hour/day rather than truncating toward zero.
16117        let ts_ms = -1_i64;
16118        let expected_secs = -1_i64;
16119        let epoch_2020_secs = 1_577_836_800_i64;
16120
16121        assert_eq!(
16122            SqliteStorage::day_id_from_millis(ts_ms),
16123            (expected_secs - epoch_2020_secs).div_euclid(86_400)
16124        );
16125        assert_eq!(
16126            SqliteStorage::hour_id_from_millis(ts_ms),
16127            (expected_secs - epoch_2020_secs).div_euclid(3_600)
16128        );
16129    }
16130
16131    #[test]
16132    fn migration_v13_from_v10() {
16133        let dir = TempDir::new().unwrap();
16134        let db_path = dir.path().join("test.db");
16135
16136        // Open at v10 first by faking it
16137        {
16138            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
16139            conn.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
16140            conn.execute_batch(
16141                "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);",
16142            )
16143            .unwrap();
16144            conn.execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
16145                .unwrap();
16146            // Apply V1-V10 so schema is correct
16147            let mut tx = conn.transaction().unwrap();
16148            tx.execute_batch(MIGRATION_V1).unwrap();
16149            tx.execute_batch(MIGRATION_V2).unwrap();
16150            tx.execute_batch(MIGRATION_V4).unwrap();
16151            tx.execute_batch(MIGRATION_V5).unwrap();
16152            tx.execute_batch(MIGRATION_V6).unwrap();
16153            tx.execute_batch(MIGRATION_V7).unwrap();
16154            tx.execute_batch(MIGRATION_V8).unwrap();
16155            tx.execute_batch(MIGRATION_V9).unwrap();
16156            tx.execute_batch(MIGRATION_V10).unwrap();
16157            tx.execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
16158                .unwrap();
16159            tx.commit().unwrap();
16160        }
16161        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
16162
16163        // Now open with SqliteStorage — should auto-migrate to current schema
16164        let storage = SqliteStorage::open(&db_path).unwrap();
16165        let version = storage.schema_version().unwrap();
16166        assert_eq!(
16167            version, CURRENT_SCHEMA_VERSION,
16168            "Should have migrated from v10 to the current schema"
16169        );
16170
16171        // Verify new tables exist
16172        let count: i64 = storage
16173            .raw()
16174            .query_row_map(
16175                "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
16176                &[],
16177                |row: &FrankenRow| row.get_typed::<i64>(0),
16178            )
16179            .unwrap();
16180        assert_eq!(count, 4, "All 4 analytics tables should exist");
16181    }
16182
16183    // =========================================================================
16184    // Analytics ingest integration test (bead z9fse.2)
16185    // =========================================================================
16186
16187    #[test]
16188    fn analytics_ingest_populates_metrics_and_rollups() {
16189        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16190        use std::path::PathBuf;
16191
16192        let dir = TempDir::new().unwrap();
16193        let db_path = dir.path().join("test.db");
16194        let storage = SqliteStorage::open(&db_path).unwrap();
16195
16196        // Register agent + workspace
16197        let agent = Agent {
16198            id: None,
16199            slug: "claude_code".into(),
16200            name: "Claude Code".into(),
16201            version: Some("1.0".into()),
16202            kind: AgentKind::Cli,
16203        };
16204        let agent_id = storage.ensure_agent(&agent).unwrap();
16205
16206        // Create a synthetic conversation with 3 messages at a known timestamp
16207        // 2026-02-06 10:30:00 UTC → day_id = 2228, hour_id = 53472
16208        let ts_ms = 1_770_551_400_000_i64;
16209        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
16210        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
16211
16212        // Include a JSON usage block on the assistant message (like Claude Code data)
16213        let usage_json = serde_json::json!({
16214            "message": {
16215                "model": "claude-opus-4-6",
16216                "usage": {
16217                    "input_tokens": 100,
16218                    "output_tokens": 50,
16219                    "cache_read_input_tokens": 200,
16220                    "cache_creation_input_tokens": 30,
16221                    "service_tier": "standard"
16222                }
16223            }
16224        });
16225
16226        let conv = Conversation {
16227            id: None,
16228            agent_slug: "claude_code".into(),
16229            workspace: None,
16230            external_id: Some("test-conv-1".into()),
16231            title: Some("Test conversation".into()),
16232            source_path: PathBuf::from("/tmp/test.jsonl"),
16233            started_at: Some(ts_ms),
16234            ended_at: Some(ts_ms + 60_000),
16235            approx_tokens: None,
16236            metadata_json: serde_json::Value::Null,
16237            messages: vec![
16238                Message {
16239                    id: None,
16240                    idx: 0,
16241                    role: MessageRole::User,
16242                    author: None,
16243                    created_at: Some(ts_ms),
16244                    content: "Hello, can you help me with a plan?".into(),
16245                    extra_json: serde_json::Value::Null,
16246                    snippets: vec![],
16247                },
16248                Message {
16249                    id: None,
16250                    idx: 1,
16251                    role: MessageRole::Agent,
16252                    author: None,
16253                    created_at: Some(ts_ms + 30_000),
16254                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
16255                    extra_json: usage_json,
16256                    snippets: vec![],
16257                },
16258                Message {
16259                    id: None,
16260                    idx: 2,
16261                    role: MessageRole::User,
16262                    author: None,
16263                    created_at: Some(ts_ms + 60_000),
16264                    content: "Great, let's proceed!".into(),
16265                    extra_json: serde_json::Value::Null,
16266                    snippets: vec![],
16267                },
16268            ],
16269            source_id: "local".into(),
16270            origin_host: None,
16271        };
16272
16273        let outcomes = storage
16274            .insert_conversations_batched(&[(agent_id, None, &conv)])
16275            .unwrap();
16276        assert_eq!(outcomes.len(), 1);
16277        assert_eq!(outcomes[0].inserted_indices.len(), 3);
16278
16279        let conn = storage.raw();
16280
16281        // Verify message_metrics rows
16282        let mm_count: i64 = conn
16283            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16284                row.get_typed::<i64>(0)
16285            })
16286            .unwrap();
16287        assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");
16288
16289        // Verify hour_id and day_id are correct
16290        #[allow(clippy::type_complexity)]
16291        let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
16292            .query_map_collect(
16293                "SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
16294                fparams![],
16295                |row: &FrankenRow| {
16296                    Ok((
16297                        row.get_typed(0)?,
16298                        row.get_typed(1)?,
16299                        row.get_typed(2)?,
16300                        row.get_typed(3)?,
16301                        row.get_typed(4)?,
16302                        row.get_typed(5)?,
16303                        row.get_typed(6)?,
16304                        row.get_typed(7)?,
16305                        row.get_typed(8)?,
16306                    ))
16307                },
16308            )
16309            .unwrap();
16310
16311        assert_eq!(rows.len(), 3);
16312        // All messages in the same hour/day
16313        assert_eq!(rows[0].0, expected_hour);
16314        assert_eq!(rows[0].1, expected_day);
16315        // First message is user
16316        assert_eq!(rows[0].2, "user");
16317        // Second message (assistant) should have has_plan=1 (contains "## Plan" + numbered steps)
16318        assert_eq!(
16319            rows[1].4, 1,
16320            "Assistant message with plan should have has_plan=1"
16321        );
16322        // Second message should have api data source
16323        assert_eq!(
16324            rows[1].5, "api",
16325            "Claude Code assistant message should have api data source"
16326        );
16327        // First and third (user) messages should be estimated
16328        assert_eq!(rows[0].5, "estimated");
16329        assert_eq!(rows[2].5, "estimated");
16330        assert_eq!(rows[1].6, "claude");
16331        assert_eq!(rows[1].7, "opus");
16332        assert_eq!(rows[1].8, "anthropic");
16333        assert_eq!(rows[0].6, "unknown");
16334        // content_tokens_est = chars / 4
16335        let user_chars = "Hello, can you help me with a plan?".len() as i64;
16336        assert_eq!(rows[0].3, user_chars / 4);
16337
16338        // Verify usage_hourly rollup
16339        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
16340            i64,
16341            i64,
16342            i64,
16343            i64,
16344            i64,
16345            i64,
16346            i64,
16347        ) = conn
16348            .query_row_map(
16349                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
16350                        plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
16351                 FROM usage_hourly WHERE hour_id = ?",
16352                fparams![expected_hour],
16353                |row: &FrankenRow| {
16354                    Ok((
16355                        row.get_typed(0)?,
16356                        row.get_typed(1)?,
16357                        row.get_typed(2)?,
16358                        row.get_typed(3)?,
16359                        row.get_typed(4)?,
16360                        row.get_typed(5)?,
16361                        row.get_typed(6)?,
16362                    ))
16363                },
16364            )
16365            .unwrap();
16366        assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
16367        assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
16368        assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
16369        assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
16370        assert!(
16371            uh_plan_content > 0,
16372            "Hourly rollup should include plan content tokens"
16373        );
16374        assert!(
16375            uh_plan_api > 0,
16376            "Hourly rollup should include plan API tokens"
16377        );
16378        assert_eq!(
16379            uh_api_cov, 1,
16380            "Hourly rollup should have 1 API-covered message"
16381        );
16382
16383        // Verify usage_daily rollup matches hourly (same day)
16384        let (ud_msg, ud_api_cov): (i64, i64) = conn
16385            .query_row_map(
16386                "SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
16387                fparams![expected_day],
16388                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
16389            )
16390            .unwrap();
16391        assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
16392        assert_eq!(
16393            ud_api_cov, 1,
16394            "Daily api_coverage should be 1 (only assistant msg has real API data)"
16395        );
16396
16397        // Verify the API input tokens from message_metrics (only API-sourced)
16398        let api_only_input: i64 = conn
16399            .query_row_map(
16400                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
16401                fparams![expected_day],
16402                |row: &FrankenRow| row.get_typed::<i64>(0),
16403            )
16404            .unwrap();
16405        assert_eq!(
16406            api_only_input, 100,
16407            "Only API-sourced input tokens should be 100"
16408        );
16409
16410        // Verify rollups match summed message_metrics
16411        let mm_total_content_est: i64 = conn
16412            .query_row_map(
16413                "SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
16414                fparams![expected_day],
16415                |row| row.get_typed::<i64>(0),
16416            )
16417            .unwrap();
16418        let mm_plan_content_est: i64 = conn
16419            .query_row_map(
16420                "SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
16421                fparams![expected_day],
16422                |row: &FrankenRow| row.get_typed::<i64>(0),
16423            )
16424            .unwrap();
16425        let mm_plan_api_total: i64 = conn
16426            .query_row_map(
16427                "SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
16428                 FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
16429                fparams![expected_day],
16430                |row: &FrankenRow| row.get_typed::<i64>(0),
16431            )
16432            .unwrap();
16433        let ud_content_est: i64 = conn
16434            .query_row_map(
16435                "SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
16436                fparams![expected_day],
16437                |row| row.get_typed::<i64>(0),
16438            )
16439            .unwrap();
16440        let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
16441            .query_row_map(
16442                "SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
16443                fparams![expected_day],
16444                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
16445            )
16446            .unwrap();
16447        assert_eq!(
16448            mm_total_content_est, ud_content_est,
16449            "Daily rollup content_tokens_est_total must equal SUM of message_metrics"
16450        );
16451        assert_eq!(
16452            mm_plan_content_est, ud_plan_content_est,
16453            "Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
16454        );
16455        assert_eq!(
16456            mm_plan_api_total, ud_plan_api_total,
16457            "Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
16458        );
16459
16460        // Verify model rollup rows
16461        let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
16462            i64,
16463            i64,
16464            i64,
16465            i64,
16466            i64,
16467        ) = conn
16468            .query_row_map(
16469                "SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
16470                 FROM usage_models_daily
16471                 WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
16472                fparams![expected_day],
16473                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?, row.get_typed(3)?, row.get_typed(4)?)),
16474            )
16475            .unwrap();
16476        assert_eq!(claude_msg, 1);
16477        assert_eq!(claude_user, 0);
16478        assert_eq!(claude_asst, 1);
16479        assert_eq!(claude_api_total, 380);
16480        assert_eq!(claude_api_cov, 1);
16481
16482        let unknown_msg: i64 = conn
16483            .query_row_map(
16484                "SELECT message_count FROM usage_models_daily
16485                 WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
16486                fparams![expected_day],
16487                |row| row.get_typed(0),
16488            )
16489            .unwrap();
16490        assert_eq!(
16491            unknown_msg, 2,
16492            "user messages should land in unknown model bucket"
16493        );
16494    }
16495
16496    #[test]
16497    fn has_plan_heuristic_detects_plans() {
16498        assert!(has_plan_heuristic(
16499            "## Plan\n\n1. First step\n2. Second step"
16500        ));
16501        assert!(has_plan_heuristic(
16502            "# Plan\nHere is what we will do:\n1. Step one\n2. Step two"
16503        ));
16504        assert!(has_plan_heuristic(
16505            "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests"
16506        ));
16507        assert!(has_plan_heuristic(
16508            "Next steps:\n1. Update schema\n2. Rebuild rollups"
16509        ));
16510        assert!(!has_plan_heuristic("Hello world"));
16511        assert!(!has_plan_heuristic("Short"));
16512        assert!(!has_plan_heuristic(
16513            "This is a regular message without plans"
16514        ));
16515        assert!(!has_plan_heuristic(
16516            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```"
16517        ));
16518    }
16519
16520    #[test]
16521    fn has_plan_for_role_only_counts_assistant_messages() {
16522        let plan_text = "## Plan\n1. First\n2. Second";
16523        assert!(has_plan_for_role("assistant", plan_text));
16524        assert!(has_plan_for_role("agent", plan_text));
16525        assert!(has_plan_for_role("Assistant", plan_text));
16526        assert!(!has_plan_for_role("user", plan_text));
16527        assert!(!has_plan_for_role("tool", plan_text));
16528    }
16529
16530    #[test]
16531    fn api_rollups_require_api_data_source() {
16532        let mut agg = AnalyticsRollupAggregator::new();
16533
16534        let estimated_plan = MessageMetricsEntry {
16535            message_id: 1,
16536            created_at_ms: 0,
16537            hour_id: 1,
16538            day_id: 1,
16539            agent_slug: "codex".into(),
16540            workspace_id: 0,
16541            source_id: "local".into(),
16542            role: "assistant".into(),
16543            content_chars: 120,
16544            content_tokens_est: 30,
16545            model_name: None,
16546            model_family: "unknown".into(),
16547            model_tier: "unknown".into(),
16548            provider: "unknown".into(),
16549            api_input_tokens: Some(100),
16550            api_output_tokens: Some(50),
16551            api_cache_read_tokens: Some(0),
16552            api_cache_creation_tokens: Some(0),
16553            api_thinking_tokens: Some(0),
16554            api_service_tier: None,
16555            api_data_source: "estimated".into(),
16556            tool_call_count: 0,
16557            has_tool_calls: false,
16558            has_plan: true,
16559        };
16560        agg.record(&estimated_plan);
16561
16562        let api_plan = MessageMetricsEntry {
16563            message_id: 2,
16564            created_at_ms: 0,
16565            hour_id: 1,
16566            day_id: 1,
16567            agent_slug: "codex".into(),
16568            workspace_id: 0,
16569            source_id: "local".into(),
16570            role: "assistant".into(),
16571            content_chars: 80,
16572            content_tokens_est: 20,
16573            model_name: None,
16574            model_family: "unknown".into(),
16575            model_tier: "unknown".into(),
16576            provider: "unknown".into(),
16577            api_input_tokens: Some(40),
16578            api_output_tokens: Some(10),
16579            api_cache_read_tokens: Some(0),
16580            api_cache_creation_tokens: Some(0),
16581            api_thinking_tokens: Some(0),
16582            api_service_tier: None,
16583            api_data_source: "api".into(),
16584            tool_call_count: 0,
16585            has_tool_calls: false,
16586            has_plan: true,
16587        };
16588        agg.record(&api_plan);
16589
16590        let key = (1_i64, "codex".to_string(), 0_i64, "local".to_string());
16591        let hourly = agg.hourly.get(&key).expect("hourly rollup key must exist");
16592        let daily = agg.daily.get(&key).expect("daily rollup key must exist");
16593        let model_key = (
16594            1_i64,
16595            "codex".to_string(),
16596            0_i64,
16597            "local".to_string(),
16598            "unknown".to_string(),
16599            "unknown".to_string(),
16600        );
16601        let models_daily = agg
16602            .models_daily
16603            .get(&model_key)
16604            .expect("model rollup key must exist");
16605
16606        // Content rollup includes both plan messages.
16607        assert_eq!(hourly.plan_message_count, 2);
16608        assert_eq!(hourly.plan_content_tokens_est_total, 50);
16609        // API plan tokens must include only api_data_source='api' rows.
16610        assert_eq!(hourly.plan_api_tokens_total, 50);
16611        assert_eq!(daily.plan_api_tokens_total, 50);
16612        assert_eq!(models_daily.plan_api_tokens_total, 50);
16613        // Overall API totals must also exclude estimated rows.
16614        assert_eq!(hourly.api_tokens_total, 50);
16615        assert_eq!(hourly.api_input_tokens_total, 40);
16616        assert_eq!(hourly.api_output_tokens_total, 10);
16617        assert_eq!(hourly.api_coverage_message_count, 1);
16618        assert_eq!(daily.api_tokens_total, 50);
16619        assert_eq!(models_daily.api_tokens_total, 50);
16620    }
16621
16622    #[test]
16623    fn has_plan_heuristic_curated_corpus_thresholds() {
16624        // Cross-agent-style positives.
16625        let positives = [
16626            "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
16627            "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
16628            "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
16629            "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
16630            "# Plan\n1. Gather requirements\n2. Ship changes",
16631            "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
16632        ];
16633
16634        // Typical false positives we want to avoid.
16635        let negatives = [
16636            "The plan is to move fast and fix things later.",
16637            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
16638            "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
16639            "I can help with that request. Let me know if you want details.",
16640            "Here is a list:\n- apples\n- oranges",
16641            "Status update: completed tasks and blockers below.",
16642        ];
16643
16644        let tp = positives
16645            .iter()
16646            .filter(|msg| has_plan_heuristic(msg))
16647            .count();
16648        let fp = negatives
16649            .iter()
16650            .filter(|msg| has_plan_heuristic(msg))
16651            .count();
16652
16653        let recall = tp as f64 / positives.len() as f64;
16654        let false_positive_rate = fp as f64 / negatives.len() as f64;
16655
16656        assert!(
16657            recall >= 0.80,
16658            "plan heuristic recall too low: got {recall:.2}"
16659        );
16660        assert!(
16661            false_positive_rate <= 0.20,
16662            "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
16663        );
16664    }
16665
16666    #[test]
16667    fn rebuild_analytics_repopulates_from_messages() {
16668        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16669        use std::path::PathBuf;
16670
16671        let dir = TempDir::new().unwrap();
16672        let db_path = dir.path().join("test.db");
16673        let storage = SqliteStorage::open(&db_path).unwrap();
16674
16675        // Register agent
16676        let agent = Agent {
16677            id: None,
16678            slug: "claude_code".into(),
16679            name: "Claude Code".into(),
16680            version: Some("1.0".into()),
16681            kind: AgentKind::Cli,
16682        };
16683        let agent_id = storage.ensure_agent(&agent).unwrap();
16684
16685        // 2026-02-06 10:30:00 UTC
16686        let ts_ms = 1_770_551_400_000_i64;
16687        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
16688        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
16689
16690        let usage_json = serde_json::json!({
16691            "message": {
16692                "model": "claude-opus-4-6",
16693                "usage": {
16694                    "input_tokens": 100,
16695                    "output_tokens": 50,
16696                    "cache_read_input_tokens": 200,
16697                    "cache_creation_input_tokens": 30,
16698                    "service_tier": "standard"
16699                }
16700            }
16701        });
16702
16703        let conv = Conversation {
16704            id: None,
16705            agent_slug: "claude_code".into(),
16706            workspace: None,
16707            external_id: Some("test-rebuild-1".into()),
16708            title: Some("Test conversation".into()),
16709            source_path: PathBuf::from("/tmp/test.jsonl"),
16710            started_at: Some(ts_ms),
16711            ended_at: Some(ts_ms + 60_000),
16712            approx_tokens: None,
16713            metadata_json: serde_json::Value::Null,
16714            messages: vec![
16715                Message {
16716                    id: None,
16717                    idx: 0,
16718                    role: MessageRole::User,
16719                    author: None,
16720                    created_at: Some(ts_ms),
16721                    content: "Hello, can you help me with a plan?".into(),
16722                    extra_json: serde_json::Value::Null,
16723                    snippets: vec![],
16724                },
16725                Message {
16726                    id: None,
16727                    idx: 1,
16728                    role: MessageRole::Agent,
16729                    author: None,
16730                    created_at: Some(ts_ms + 30_000),
16731                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
16732                    extra_json: usage_json,
16733                    snippets: vec![],
16734                },
16735                Message {
16736                    id: None,
16737                    idx: 2,
16738                    role: MessageRole::User,
16739                    author: None,
16740                    created_at: Some(ts_ms + 60_000),
16741                    content: "Great, let's proceed!".into(),
16742                    extra_json: serde_json::Value::Null,
16743                    snippets: vec![],
16744                },
16745            ],
16746            source_id: "local".into(),
16747            origin_host: None,
16748        };
16749
16750        storage
16751            .insert_conversations_batched(&[(agent_id, None, &conv)])
16752            .unwrap();
16753
16754        // Save original analytics state
16755        let conn = storage.raw();
16756        let orig_mm: i64 = conn
16757            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16758                row.get_typed(0)
16759            })
16760            .unwrap();
16761        let orig_hourly: i64 = conn
16762            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
16763                row.get_typed(0)
16764            })
16765            .unwrap();
16766        let orig_daily: i64 = conn
16767            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
16768                row.get_typed(0)
16769            })
16770            .unwrap();
16771        let orig_models_daily: i64 = conn
16772            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
16773                row.get_typed(0)
16774            })
16775            .unwrap();
16776        let orig_api_input: i64 = conn
16777            .query_row_map(
16778                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
16779                &[],
16780                |row: &FrankenRow| row.get_typed(0),
16781            )
16782            .unwrap();
16783
16784        assert_eq!(orig_mm, 3);
16785        assert!(orig_hourly > 0);
16786        assert!(orig_daily > 0);
16787        assert!(orig_models_daily > 0);
16788
16789        // Destroy analytics tables (simulate corruption)
16790        conn.execute("DELETE FROM message_metrics").unwrap();
16791        conn.execute("DELETE FROM usage_hourly").unwrap();
16792        conn.execute("DELETE FROM usage_daily").unwrap();
16793        conn.execute("DELETE FROM usage_models_daily").unwrap();
16794
16795        // Verify they're empty
16796        let zero: i64 = conn
16797            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16798                row.get_typed(0)
16799            })
16800            .unwrap();
16801        assert_eq!(zero, 0);
16802
16803        // Rebuild analytics
16804        let result = storage.rebuild_analytics().unwrap();
16805
16806        assert_eq!(result.message_metrics_rows, 3);
16807        assert!(result.usage_hourly_rows > 0);
16808        assert!(result.usage_daily_rows > 0);
16809        assert!(result.usage_models_daily_rows > 0);
16810        assert!(
16811            result.elapsed_ms < 10_000,
16812            "Rebuild should be fast for 3 msgs"
16813        );
16814
16815        // Verify rebuilt data matches
16816        let conn = storage.raw();
16817        let rebuilt_mm: i64 = conn
16818            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16819                row.get_typed(0)
16820            })
16821            .unwrap();
16822        assert_eq!(
16823            rebuilt_mm, orig_mm,
16824            "Rebuilt message_metrics count should match"
16825        );
16826
16827        let rebuilt_hourly: i64 = conn
16828            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
16829                row.get_typed(0)
16830            })
16831            .unwrap();
16832        assert_eq!(
16833            rebuilt_hourly, orig_hourly,
16834            "Rebuilt hourly rows should match"
16835        );
16836
16837        let rebuilt_daily: i64 = conn
16838            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
16839                row.get_typed(0)
16840            })
16841            .unwrap();
16842        assert_eq!(rebuilt_daily, orig_daily, "Rebuilt daily rows should match");
16843
16844        let rebuilt_models_daily: i64 = conn
16845            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
16846                row.get_typed(0)
16847            })
16848            .unwrap();
16849        assert_eq!(
16850            rebuilt_models_daily, orig_models_daily,
16851            "Rebuilt model rollup rows should match"
16852        );
16853
16854        // Verify API token data preserved through rebuild
16855        let rebuilt_api_input: i64 = conn
16856            .query_row_map(
16857                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
16858                &[],
16859                |row: &FrankenRow| row.get_typed(0),
16860            )
16861            .unwrap();
16862        assert_eq!(
16863            rebuilt_api_input, orig_api_input,
16864            "Rebuilt API input tokens should match original"
16865        );
16866
16867        // Verify rollups have correct data
16868        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
16869            i64,
16870            i64,
16871            i64,
16872            i64,
16873            i64,
16874            i64,
16875        ) = conn
16876            .query_row_map(
16877                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
16878                        plan_content_tokens_est_total, plan_api_tokens_total
16879                 FROM usage_hourly WHERE hour_id = ?",
16880                fparams![expected_hour],
16881                |row: &FrankenRow| {
16882                    Ok((
16883                        row.get_typed(0)?,
16884                        row.get_typed(1)?,
16885                        row.get_typed(2)?,
16886                        row.get_typed(3)?,
16887                        row.get_typed(4)?,
16888                        row.get_typed(5)?,
16889                    ))
16890                },
16891            )
16892            .unwrap();
16893        assert_eq!(uh_msg, 3);
16894        assert_eq!(uh_user, 2);
16895        assert_eq!(uh_asst, 1);
16896        assert_eq!(uh_plan, 1);
16897        assert!(uh_plan_content > 0);
16898        assert!(uh_plan_api > 0);
16899
16900        let ud_msg: i64 = conn
16901            .query_row_map(
16902                "SELECT message_count FROM usage_daily WHERE day_id = ?",
16903                fparams![expected_day],
16904                |row| row.get_typed(0),
16905            )
16906            .unwrap();
16907        assert_eq!(ud_msg, 3);
16908    }
16909
16910    #[test]
16911    fn insert_conversations_batched_flushes_large_fts_batches() {
16912        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16913        use std::path::PathBuf;
16914
16915        let dir = TempDir::new().unwrap();
16916        let db_path = dir.path().join("test.db");
16917        let storage = SqliteStorage::open(&db_path).unwrap();
16918        // V14 drops fts_messages during migration; cass normally recreates it
16919        // during startup via `ensure_search_fallback_fts_consistency`. Tests
16920        // that inspect fts_messages directly need to run the same repair pass
16921        // to exercise the "insert flushes FTS" contract.
16922        storage
16923            .ensure_search_fallback_fts_consistency()
16924            .expect("ensure FTS consistency before insert");
16925
16926        let agent = Agent {
16927            id: None,
16928            slug: "codex".into(),
16929            name: "Codex".into(),
16930            version: Some("0.2.3".into()),
16931            kind: AgentKind::Cli,
16932        };
16933        let agent_id = storage.ensure_agent(&agent).unwrap();
16934
16935        let content = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
16936        let messages: Vec<_> = (0_i64..2)
16937            .map(|i| Message {
16938                id: None,
16939                idx: i,
16940                role: MessageRole::Agent,
16941                author: None,
16942                created_at: Some(1_700_000_000_000 + i),
16943                content: format!("{i}-{content}"),
16944                extra_json: serde_json::Value::Null,
16945                snippets: Vec::new(),
16946            })
16947            .collect();
16948        let conv = Conversation {
16949            id: None,
16950            agent_slug: "codex".into(),
16951            workspace: Some(PathBuf::from("/tmp/workspace")),
16952            external_id: Some("fts-large-batch".into()),
16953            title: Some("FTS Large Batch".into()),
16954            source_path: PathBuf::from("/tmp/rollout.jsonl"),
16955            started_at: Some(1_700_000_000_000),
16956            ended_at: Some(1_700_000_000_999),
16957            approx_tokens: None,
16958            metadata_json: serde_json::Value::Null,
16959            messages,
16960            source_id: "local".into(),
16961            origin_host: None,
16962        };
16963
16964        let outcomes = storage
16965            .insert_conversations_batched(&[(agent_id, None, &conv)])
16966            .unwrap();
16967        assert_eq!(outcomes.len(), 1);
16968        assert_eq!(outcomes[0].inserted_indices.len(), conv.messages.len());
16969
16970        let message_count: i64 = storage
16971            .conn
16972            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
16973                row.get_typed(0)
16974            })
16975            .unwrap();
16976        let fts_count: i64 = storage
16977            .conn
16978            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
16979                row.get_typed(0)
16980            })
16981            .unwrap();
16982
16983        assert_eq!(message_count, conv.messages.len() as i64);
16984        assert_eq!(fts_count, conv.messages.len() as i64);
16985    }
16986
16987    fn make_profiled_storage_remote_conversation(
16988        external_id: i64,
16989        msg_count: usize,
16990    ) -> Conversation {
16991        Conversation {
16992            id: None,
16993            agent_slug: "codex".into(),
16994            workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
16995            external_id: Some(format!("profiled-storage-remote-{external_id}")),
16996            title: Some(format!(
16997                "Profiled storage remote conversation {external_id}"
16998            )),
16999            source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
17000            started_at: Some(10_000 + external_id * 100),
17001            ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
17002            approx_tokens: Some(msg_count as i64 * 32),
17003            metadata_json: serde_json::json!({ "bench": true }),
17004            messages: (0..msg_count)
17005                .map(|idx| Message {
17006                    id: None,
17007                    idx: idx as i64,
17008                    role: if idx % 2 == 0 {
17009                        MessageRole::User
17010                    } else {
17011                        MessageRole::Agent
17012                    },
17013                    author: Some("tester".into()),
17014                    created_at: Some(20_000 + external_id * 100 + idx as i64),
17015                    content: format!(
17016                        "profiled storage remote content ext={external_id} idx={idx} {}",
17017                        "x".repeat(64)
17018                    ),
17019                    extra_json: serde_json::json!({ "idx": idx }),
17020                    snippets: Vec::new(),
17021                })
17022                .collect(),
17023            source_id: "profiled-storage-remote-source".into(),
17024            origin_host: Some("builder-profile".into()),
17025        }
17026    }
17027
17028    fn make_profiled_append_remote_merge_conversation(
17029        external_id: i64,
17030        msg_count: usize,
17031    ) -> Conversation {
17032        let base_ts = 100_000 + external_id * 1_000;
17033        Conversation {
17034            id: None,
17035            agent_slug: "codex".into(),
17036            workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
17037            external_id: Some(format!("profiled-append-remote-{external_id}")),
17038            title: Some(format!("Profiled append remote conversation {external_id}")),
17039            source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
17040            started_at: Some(base_ts),
17041            ended_at: Some(base_ts + msg_count as i64),
17042            approx_tokens: Some(msg_count as i64 * 50),
17043            metadata_json: serde_json::json!({ "bench": true }),
17044            messages: (0..msg_count)
17045                .map(|idx| Message {
17046                    id: None,
17047                    idx: idx as i64,
17048                    role: if idx % 2 == 0 {
17049                        MessageRole::User
17050                    } else {
17051                        MessageRole::Agent
17052                    },
17053                    author: Some(format!("model-{}", external_id % 5)),
17054                    created_at: Some(base_ts + idx as i64),
17055                    content: format!(
17056                        "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
17057                        external_id, idx
17058                    ),
17059                    extra_json: serde_json::json!({ "bench": true }),
17060                    snippets: Vec::new(),
17061                })
17062                .collect(),
17063            source_id: "profiled-append-remote-source".into(),
17064            origin_host: Some("builder-profile".into()),
17065        }
17066    }
17067
17068    #[test]
17069    fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
17070        let dir = TempDir::new().unwrap();
17071        let db_path = dir.path().join("batched-message-ids.db");
17072        let storage = SqliteStorage::open(&db_path).unwrap();
17073        let agent_id = storage
17074            .ensure_agent(&Agent {
17075                id: None,
17076                slug: "codex".into(),
17077                name: "Codex".into(),
17078                version: None,
17079                kind: AgentKind::Cli,
17080            })
17081            .unwrap();
17082        let workspace_id = storage
17083            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17084            .unwrap();
17085        let mut conv = make_profiled_storage_remote_conversation(42, 5);
17086        for (idx, msg) in conv.messages.iter_mut().enumerate() {
17087            msg.snippets.push(Snippet {
17088                id: None,
17089                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
17090                start_line: Some((idx + 1) as i64),
17091                end_line: Some((idx + 2) as i64),
17092                language: Some("rust".into()),
17093                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
17094            });
17095        }
17096        let outcome = storage
17097            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
17098            .unwrap();
17099
17100        let message_count: i64 = storage
17101            .conn
17102            .query_row_map(
17103                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17104                fparams![outcome.conversation_id],
17105                |row| row.get_typed(0),
17106            )
17107            .unwrap();
17108        let joined_snippet_count: i64 = storage
17109            .conn
17110            .query_row_map(
17111                "SELECT COUNT(*)
17112                 FROM snippets s
17113                 JOIN messages m ON s.message_id = m.id
17114                 WHERE m.conversation_id = ?1",
17115                fparams![outcome.conversation_id],
17116                |row| row.get_typed(0),
17117            )
17118            .unwrap();
17119
17120        assert_eq!(message_count, conv.messages.len() as i64);
17121        assert_eq!(joined_snippet_count, conv.messages.len() as i64);
17122    }
17123
17124    #[test]
17125    fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
17126        let dir = TempDir::new().unwrap();
17127        let db_path = dir.path().join("batched-append-message-ids.db");
17128        let storage = SqliteStorage::open(&db_path).unwrap();
17129        let agent_id = storage
17130            .ensure_agent(&Agent {
17131                id: None,
17132                slug: "codex".into(),
17133                name: "Codex".into(),
17134                version: None,
17135                kind: AgentKind::Cli,
17136            })
17137            .unwrap();
17138        let workspace_id = storage
17139            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17140            .unwrap();
17141
17142        let mut initial = make_profiled_storage_remote_conversation(77, 2);
17143        for (idx, msg) in initial.messages.iter_mut().enumerate() {
17144            msg.snippets.push(Snippet {
17145                id: None,
17146                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
17147                start_line: Some((idx + 1) as i64),
17148                end_line: Some((idx + 2) as i64),
17149                language: Some("rust".into()),
17150                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
17151            });
17152        }
17153        let first = storage
17154            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17155            .unwrap();
17156        assert_eq!(first.inserted_indices, vec![0, 1]);
17157
17158        let mut appended = make_profiled_storage_remote_conversation(77, 5);
17159        for (idx, msg) in appended.messages.iter_mut().enumerate() {
17160            msg.snippets.push(Snippet {
17161                id: None,
17162                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
17163                start_line: Some((idx + 10) as i64),
17164                end_line: Some((idx + 11) as i64),
17165                language: Some("rust".into()),
17166                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
17167            });
17168        }
17169        let second = storage
17170            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17171            .unwrap();
17172        assert_eq!(second.conversation_id, first.conversation_id);
17173        assert_eq!(second.inserted_indices, vec![2, 3, 4]);
17174
17175        let message_count: i64 = storage
17176            .conn
17177            .query_row_map(
17178                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17179                fparams![first.conversation_id],
17180                |row| row.get_typed(0),
17181            )
17182            .unwrap();
17183        let joined_snippets: Vec<(i64, String)> = storage
17184            .conn
17185            .query_map_collect(
17186                "SELECT m.idx, s.file_path
17187                 FROM snippets s
17188                 JOIN messages m ON s.message_id = m.id
17189                 WHERE m.conversation_id = ?1
17190                 ORDER BY m.idx, s.id",
17191                fparams![first.conversation_id],
17192                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17193            )
17194            .unwrap();
17195
17196        assert_eq!(message_count, 5);
17197        assert_eq!(
17198            joined_snippets,
17199            vec![
17200                (0, "src/append_initial_0.rs".to_string()),
17201                (1, "src/append_initial_1.rs".to_string()),
17202                (2, "src/append_full_2.rs".to_string()),
17203                (3, "src/append_full_3.rs".to_string()),
17204                (4, "src/append_full_4.rs".to_string()),
17205            ]
17206        );
17207    }
17208
17209    #[test]
17210    fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
17211        let dir = TempDir::new().unwrap();
17212        let db_path = dir.path().join("external-lookup-rehydrate.db");
17213        let storage = SqliteStorage::open(&db_path).unwrap();
17214        let agent_id = storage
17215            .ensure_agent(&Agent {
17216                id: None,
17217                slug: "codex".into(),
17218                name: "Codex".into(),
17219                version: None,
17220                kind: AgentKind::Cli,
17221            })
17222            .unwrap();
17223        let workspace_id = storage
17224            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17225            .unwrap();
17226
17227        let initial = make_profiled_storage_remote_conversation(88, 2);
17228        let first = storage
17229            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17230            .unwrap();
17231        let external_id = initial.external_id.as_deref().unwrap();
17232        let lookup_key =
17233            conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
17234        let lookup_id: i64 = storage
17235            .conn
17236            .query_row_map(
17237                "SELECT conversation_id
17238                 FROM conversation_external_tail_lookup
17239                 WHERE lookup_key = ?1",
17240                fparams![lookup_key.as_str()],
17241                |row| row.get_typed(0),
17242            )
17243            .unwrap();
17244        assert_eq!(lookup_id, first.conversation_id);
17245
17246        storage
17247            .conn
17248            .execute_compat(
17249                "DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
17250                fparams![lookup_key.as_str()],
17251            )
17252            .unwrap();
17253
17254        let appended = make_profiled_storage_remote_conversation(88, 4);
17255        let second = storage
17256            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17257            .unwrap();
17258        assert_eq!(second.conversation_id, first.conversation_id);
17259        assert_eq!(second.inserted_indices, vec![2, 3]);
17260
17261        let conversation_count: i64 = storage
17262            .conn
17263            .query_row_map(
17264                "SELECT COUNT(*)
17265                 FROM conversations
17266                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
17267                fparams![initial.source_id.as_str(), agent_id, external_id],
17268                |row| row.get_typed(0),
17269            )
17270            .unwrap();
17271        let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
17272            .conn
17273            .query_row_map(
17274                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
17275                 FROM conversation_external_tail_lookup
17276                 WHERE lookup_key = ?1",
17277                fparams![lookup_key.as_str()],
17278                |row| {
17279                    Ok((
17280                        row.get_typed(0)?,
17281                        row.get_typed(1)?,
17282                        row.get_typed(2)?,
17283                        row.get_typed(3)?,
17284                    ))
17285                },
17286            )
17287            .unwrap();
17288        let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
17289            .conn
17290            .query_row_map(
17291                "SELECT ended_at, last_message_idx, last_message_created_at
17292                 FROM conversation_tail_state
17293                 WHERE conversation_id = ?1",
17294                fparams![first.conversation_id],
17295                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
17296            )
17297            .unwrap();
17298        assert_eq!(conversation_count, 1);
17299        assert_eq!(
17300            restored_lookup,
17301            (
17302                first.conversation_id,
17303                tail_state.0,
17304                tail_state.1,
17305                tail_state.2
17306            )
17307        );
17308        assert_eq!(
17309            tail_state,
17310            (
17311                appended.messages[3].created_at,
17312                Some(3),
17313                appended.messages[3].created_at
17314            )
17315        );
17316    }
17317
17318    #[test]
17319    fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
17320        let dir = TempDir::new().unwrap();
17321        let db_path = dir.path().join("test.db");
17322        let storage = SqliteStorage::open(&db_path).unwrap();
17323        let agent_id = storage
17324            .ensure_agent(&Agent {
17325                id: None,
17326                slug: "codex".into(),
17327                name: "Codex".into(),
17328                version: None,
17329                kind: AgentKind::Cli,
17330            })
17331            .unwrap();
17332        let workspace = PathBuf::from("/ws/profiled-storage-remote");
17333        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17334
17335        storage
17336            .insert_conversation_tree(
17337                agent_id,
17338                Some(workspace_id),
17339                &make_profiled_storage_remote_conversation(0, 3),
17340            )
17341            .unwrap();
17342        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17343
17344        storage
17345            .insert_conversation_tree(
17346                agent_id,
17347                Some(workspace_id),
17348                &make_profiled_storage_remote_conversation(1, 2),
17349            )
17350            .unwrap();
17351
17352        let row_count: i64 = storage
17353            .conn
17354            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
17355                row.get_typed(0)
17356            })
17357            .unwrap();
17358        let (session_count, message_count): (i64, i64) = storage
17359            .conn
17360            .query_row_map(
17361                "SELECT session_count, message_count
17362                 FROM daily_stats
17363                 WHERE agent_slug = 'all' AND source_id = 'all'",
17364                fparams![],
17365                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17366            )
17367            .unwrap();
17368
17369        assert_eq!(row_count, 4);
17370        assert_eq!(session_count, 1);
17371        assert_eq!(message_count, 2);
17372    }
17373
17374    #[test]
17375    #[serial]
17376    fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
17377        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17378
17379        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17380            let dir = TempDir::new().unwrap();
17381            let db_path = dir.path().join(format!("profile-{msg_count}.db"));
17382            let storage = SqliteStorage::open(&db_path).unwrap();
17383            let agent_id = storage
17384                .ensure_agent(&Agent {
17385                    id: None,
17386                    slug: "codex".into(),
17387                    name: "Codex".into(),
17388                    version: None,
17389                    kind: AgentKind::Cli,
17390                })
17391                .unwrap();
17392            let workspace = PathBuf::from("/ws/profiled-storage-remote");
17393            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17394
17395            storage
17396                .insert_conversation_tree(
17397                    agent_id,
17398                    Some(workspace_id),
17399                    &make_profiled_storage_remote_conversation(0, msg_count),
17400                )
17401                .unwrap();
17402
17403            let mut profile = InsertConversationTreePerfProfile::default();
17404            for external_id in 1..=iterations {
17405                storage
17406                    .insert_conversation_tree_with_profile(
17407                        agent_id,
17408                        Some(workspace_id),
17409                        &make_profiled_storage_remote_conversation(external_id as i64, msg_count),
17410                        &mut profile,
17411                    )
17412                    .unwrap();
17413            }
17414
17415            let accounted_duration = profile.source_duration
17416                + profile.tx_open_duration
17417                + profile.existing_lookup_duration
17418                + profile.conversation_row_duration
17419                + profile.message_insert_duration
17420                + profile.snippet_insert_duration
17421                + profile.fts_entry_duration
17422                + profile.fts_flush_duration
17423                + profile.analytics_duration
17424                + profile.commit_duration;
17425            assert_eq!(profile.invocations, iterations);
17426            assert_eq!(profile.messages, iterations * msg_count);
17427            assert_eq!(profile.inserted_messages, iterations * msg_count);
17428            assert!(
17429                profile.total_duration >= accounted_duration,
17430                "accounted stage durations cannot exceed total duration"
17431            );
17432
17433            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
17434        }
17435    }
17436
17437    #[test]
17438    #[serial]
17439    fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
17440        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17441
17442        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17443            let dir = TempDir::new().unwrap();
17444            let db_path = dir.path().join(format!("append-profile-{msg_count}.db"));
17445            let storage = SqliteStorage::open(&db_path).unwrap();
17446            let agent_id = storage
17447                .ensure_agent(&Agent {
17448                    id: None,
17449                    slug: "codex".into(),
17450                    name: "Codex".into(),
17451                    version: None,
17452                    kind: AgentKind::Cli,
17453                })
17454                .unwrap();
17455            let workspace = PathBuf::from("/ws/profiled-append-remote");
17456            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17457
17458            for external_id in 0..iterations {
17459                storage
17460                    .insert_conversation_tree(
17461                        agent_id,
17462                        Some(workspace_id),
17463                        &make_profiled_append_remote_merge_conversation(
17464                            external_id as i64,
17465                            msg_count,
17466                        ),
17467                    )
17468                    .unwrap();
17469            }
17470
17471            let mut profile = InsertConversationTreePerfProfile::default();
17472            for external_id in 0..iterations {
17473                storage
17474                    .append_existing_conversation_with_profile(
17475                        agent_id,
17476                        Some(workspace_id),
17477                        &make_profiled_append_remote_merge_conversation(
17478                            external_id as i64,
17479                            msg_count * 2,
17480                        ),
17481                        &mut profile,
17482                    )
17483                    .unwrap();
17484            }
17485
17486            let accounted_duration = profile.source_duration
17487                + profile.tx_open_duration
17488                + profile.existing_lookup_duration
17489                + profile.existing_idx_lookup_duration
17490                + profile.existing_replay_lookup_duration
17491                + profile.dedupe_filter_duration
17492                + profile.conversation_row_duration
17493                + profile.message_insert_duration
17494                + profile.snippet_insert_duration
17495                + profile.fts_entry_duration
17496                + profile.fts_flush_duration
17497                + profile.analytics_duration
17498                + profile.commit_duration;
17499            assert_eq!(profile.invocations, iterations);
17500            assert_eq!(profile.messages, iterations * msg_count * 2);
17501            assert_eq!(profile.inserted_messages, iterations * msg_count);
17502            assert!(
17503                profile.total_duration >= accounted_duration,
17504                "accounted append stage durations cannot exceed total duration"
17505            );
17506
17507            profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
17508        }
17509    }
17510
17511    #[test]
17512    fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
17513        let dir = TempDir::new().unwrap();
17514        let db_path = dir.path().join("test.db");
17515        let storage = SqliteStorage::open(&db_path).unwrap();
17516        let started_at = 1_700_000_000_000_i64;
17517        let day_id = FrankenStorage::day_id_from_millis(started_at);
17518        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
17519
17520        storage
17521            .conn
17522            .execute_compat(
17523                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17524                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17525                fparams![1_i64, "codex", "Codex", "cli"],
17526            )
17527            .unwrap();
17528        storage
17529            .conn
17530            .execute_compat(
17531                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17532                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17533                fparams![2_i64, "claude", "Claude", "cli"],
17534            )
17535            .unwrap();
17536
17537        storage
17538            .conn
17539            .execute_compat(
17540                "INSERT INTO conversations (
17541                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17542                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17543                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
17544                fparams![
17545                    1_i64,
17546                    1_i64,
17547                    LOCAL_SOURCE_ID,
17548                    "daily-a",
17549                    "Daily A",
17550                    "/tmp/daily-a.jsonl",
17551                    started_at,
17552                    started_at + 200,
17553                    "{}"
17554                ],
17555            )
17556            .unwrap();
17557        storage
17558            .conn
17559            .execute_compat(
17560                "INSERT INTO conversations (
17561                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17562                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17563                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
17564                fparams![
17565                    2_i64,
17566                    2_i64,
17567                    LOCAL_SOURCE_ID,
17568                    "daily-b",
17569                    "Daily B",
17570                    "/tmp/daily-b.jsonl",
17571                    started_at,
17572                    started_at + 300,
17573                    "{}"
17574                ],
17575            )
17576            .unwrap();
17577
17578        storage
17579            .conn
17580            .execute_compat(
17581                "INSERT INTO messages (
17582                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17583                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17584                fparams![1_i64, 1_i64, 0_i64, "user", started_at, "hello"],
17585            )
17586            .unwrap();
17587        storage
17588            .conn
17589            .execute_compat(
17590                "INSERT INTO messages (
17591                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17592                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17593                fparams![2_i64, 1_i64, 1_i64, "assistant", started_at + 100, "response"],
17594            )
17595            .unwrap();
17596        storage
17597            .conn
17598            .execute_compat(
17599                "INSERT INTO messages (
17600                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17601                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17602                fparams![3_i64, 2_i64, 0_i64, "user", started_at + 50, "abc"],
17603            )
17604            .unwrap();
17605
17606        for (message_id, agent_slug, role, content_len) in [
17607            (1_i64, "codex", "user", 5_i64),
17608            (2_i64, "codex", "assistant", 8_i64),
17609            (3_i64, "claude", "user", 3_i64),
17610        ] {
17611            storage
17612                .conn
17613                .execute_compat(
17614                    "INSERT INTO message_metrics (
17615                        message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
17616                        role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
17617                        api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
17618                        api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
17619                        model_name, model_family, model_tier, provider
17620                     ) VALUES (
17621                        ?1, ?2, ?3, ?4, ?5, ?6, ?7,
17622                        ?8, ?9, ?10, ?11, ?12,
17623                        ?13, ?14, ?15,
17624                        ?16, ?17, ?18, ?19, ?20,
17625                        ?21, ?22, ?23, ?24
17626                     )",
17627                    fparams![
17628                        message_id,
17629                        started_at,
17630                        hour_id,
17631                        day_id,
17632                        agent_slug,
17633                        0_i64,
17634                        LOCAL_SOURCE_ID,
17635                        role,
17636                        content_len,
17637                        content_len / 4,
17638                        0_i64,
17639                        0_i64,
17640                        0_i64,
17641                        0_i64,
17642                        0_i64,
17643                        "",
17644                        "estimated",
17645                        0_i64,
17646                        0_i64,
17647                        0_i64,
17648                        "",
17649                        "unknown",
17650                        "unknown",
17651                        "unknown"
17652                    ],
17653                )
17654                .unwrap();
17655        }
17656
17657        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17658
17659        let rebuilt = storage.rebuild_daily_stats().unwrap();
17660        assert_eq!(rebuilt.total_sessions, 2);
17661
17662        let health = storage.daily_stats_health().unwrap();
17663        assert_eq!(health.conversation_count, 2);
17664        assert_eq!(health.materialized_total, 2);
17665        assert_eq!(health.drift, 0);
17666
17667        let total_messages: i64 = storage
17668            .conn
17669            .query_row_map(
17670                "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17671                fparams![],
17672                |row| row.get_typed(0),
17673            )
17674            .unwrap();
17675        assert_eq!(total_messages, 3);
17676    }
17677
17678    #[test]
17679    fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
17680        let dir = TempDir::new().unwrap();
17681        let db_path = dir.path().join("test.db");
17682        let storage = SqliteStorage::open(&db_path).unwrap();
17683
17684        let content = "ASCII🙂é漢字";
17685        let expected_bytes = content.len() as i64;
17686        let started_at = 1_704_067_200_000_i64;
17687        let day_id = FrankenStorage::day_id_from_millis(started_at);
17688        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
17689
17690        storage
17691            .conn
17692            .execute_compat(
17693                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17694                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17695                fparams![1_i64, "tester", "Tester", "cli"],
17696            )
17697            .unwrap();
17698        storage
17699            .conn
17700            .execute_compat(
17701                "INSERT INTO conversations (
17702                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17703                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17704                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
17705                fparams![
17706                    1_i64,
17707                    1_i64,
17708                    LOCAL_SOURCE_ID,
17709                    "unicode-metrics",
17710                    "Unicode Metrics",
17711                    "/tmp/unicode-metrics.jsonl",
17712                    started_at,
17713                    "{}"
17714                ],
17715            )
17716            .unwrap();
17717        storage
17718            .conn
17719            .execute_compat(
17720                "INSERT INTO messages (
17721                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17722                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17723                fparams![1_i64, 1_i64, 0_i64, "user", started_at, content],
17724            )
17725            .unwrap();
17726        storage
17727            .conn
17728            .execute_compat(
17729                "INSERT INTO message_metrics (
17730                    message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
17731                    role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
17732                    api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
17733                    api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
17734                    model_name, model_family, model_tier, provider
17735                 ) VALUES (
17736                    ?1, ?2, ?3, ?4, ?5, ?6, ?7,
17737                    ?8, ?9, ?10, ?11, ?12,
17738                    ?13, ?14, ?15,
17739                    ?16, ?17, ?18, ?19, ?20,
17740                    ?21, ?22, ?23, ?24
17741                 )",
17742                fparams![
17743                    1_i64,
17744                    started_at,
17745                    hour_id,
17746                    day_id,
17747                    "tester",
17748                    0_i64,
17749                    LOCAL_SOURCE_ID,
17750                    "user",
17751                    expected_bytes,
17752                    expected_bytes / 4,
17753                    0_i64,
17754                    0_i64,
17755                    0_i64,
17756                    0_i64,
17757                    0_i64,
17758                    "",
17759                    "estimated",
17760                    0_i64,
17761                    0_i64,
17762                    0_i64,
17763                    "",
17764                    "unknown",
17765                    "unknown",
17766                    "unknown"
17767                ],
17768            )
17769            .unwrap();
17770
17771        let mut tx = storage.conn.transaction().unwrap();
17772        franken_update_daily_stats_in_tx(
17773            &storage,
17774            &tx,
17775            "tester",
17776            LOCAL_SOURCE_ID,
17777            Some(started_at),
17778            StatsDelta {
17779                session_count_delta: 1,
17780                message_count_delta: 1,
17781                total_chars_delta: expected_bytes,
17782            },
17783        )
17784        .unwrap();
17785        tx.commit().unwrap();
17786
17787        let inline_total: i64 = storage
17788            .conn
17789            .query_row_map(
17790                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17791                fparams![],
17792                |row| row.get_typed(0),
17793            )
17794            .unwrap();
17795        assert_eq!(inline_total, expected_bytes);
17796
17797        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17798
17799        let rebuilt = storage.rebuild_daily_stats().unwrap();
17800        assert_eq!(rebuilt.total_sessions, 1);
17801
17802        let rebuilt_total: i64 = storage
17803            .conn
17804            .query_row_map(
17805                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17806                fparams![],
17807                |row| row.get_typed(0),
17808            )
17809            .unwrap();
17810        assert_eq!(rebuilt_total, expected_bytes);
17811    }
17812
17813    #[test]
17814    fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
17815        let dir = TempDir::new().unwrap();
17816        let db_path = dir.path().join("test.db");
17817        let storage = SqliteStorage::open(&db_path).unwrap();
17818
17819        let content = "fallback🙂é漢字";
17820        let expected_bytes = content.len() as i64;
17821        let started_at = 1_704_067_200_000_i64;
17822        storage
17823            .conn
17824            .execute_compat(
17825                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17826                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17827                fparams![1_i64, "tester", "Tester", "cli"],
17828            )
17829            .unwrap();
17830        storage
17831            .conn
17832            .execute_compat(
17833                "INSERT INTO conversations (
17834                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17835                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17836                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
17837                fparams![
17838                    1_i64,
17839                    1_i64,
17840                    LOCAL_SOURCE_ID,
17841                    "unicode-fallback",
17842                    "Unicode Fallback",
17843                    "/tmp/unicode-fallback.jsonl",
17844                    started_at,
17845                    "{}"
17846                ],
17847            )
17848            .unwrap();
17849        storage
17850            .conn
17851            .execute_compat(
17852                "INSERT INTO messages (
17853                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17854                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17855                fparams![1_i64, 1_i64, 0_i64, "assistant", started_at, content],
17856            )
17857            .unwrap();
17858
17859        let mut tx = storage.conn.transaction().unwrap();
17860        franken_update_daily_stats_in_tx(
17861            &storage,
17862            &tx,
17863            "tester",
17864            LOCAL_SOURCE_ID,
17865            Some(started_at),
17866            StatsDelta {
17867                session_count_delta: 1,
17868                message_count_delta: 1,
17869                total_chars_delta: expected_bytes,
17870            },
17871        )
17872        .unwrap();
17873        tx.commit().unwrap();
17874
17875        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17876
17877        let rebuilt = storage.rebuild_daily_stats().unwrap();
17878        assert_eq!(rebuilt.total_sessions, 1);
17879
17880        let rebuilt_total: i64 = storage
17881            .conn
17882            .query_row_map(
17883                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17884                fparams![],
17885                |row| row.get_typed(0),
17886            )
17887            .unwrap();
17888        assert_eq!(rebuilt_total, expected_bytes);
17889    }
17890
17891    #[test]
17892    fn insert_conversations_batched_appends_duplicate_external_id() {
17893        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17894        use std::path::PathBuf;
17895
17896        let dir = TempDir::new().unwrap();
17897        let db_path = dir.path().join("test.db");
17898        let storage = SqliteStorage::open(&db_path).unwrap();
17899
17900        let agent = Agent {
17901            id: None,
17902            slug: "codex".into(),
17903            name: "Codex".into(),
17904            version: Some("0.2.3".into()),
17905            kind: AgentKind::Cli,
17906        };
17907        let agent_id = storage.ensure_agent(&agent).unwrap();
17908
17909        let base_conv = |messages: Vec<Message>| Conversation {
17910            id: None,
17911            agent_slug: "codex".into(),
17912            workspace: Some(PathBuf::from("/tmp/workspace")),
17913            external_id: Some("shared-session".into()),
17914            title: Some("Shared Session".into()),
17915            source_path: PathBuf::from("/tmp/rollout.jsonl"),
17916            started_at: Some(1_700_000_000_000),
17917            ended_at: Some(1_700_000_000_999),
17918            approx_tokens: None,
17919            metadata_json: serde_json::Value::Null,
17920            messages,
17921            source_id: "local".into(),
17922            origin_host: None,
17923        };
17924
17925        let conv_a = base_conv(vec![
17926            Message {
17927                id: None,
17928                idx: 0,
17929                role: MessageRole::User,
17930                author: None,
17931                created_at: Some(1_700_000_000_000),
17932                content: "first".into(),
17933                extra_json: serde_json::Value::Null,
17934                snippets: Vec::new(),
17935            },
17936            Message {
17937                id: None,
17938                idx: 1,
17939                role: MessageRole::Agent,
17940                author: None,
17941                created_at: Some(1_700_000_000_100),
17942                content: "second".into(),
17943                extra_json: serde_json::Value::Null,
17944                snippets: Vec::new(),
17945            },
17946        ]);
17947        let conv_b = base_conv(vec![
17948            Message {
17949                id: None,
17950                idx: 0,
17951                role: MessageRole::User,
17952                author: None,
17953                created_at: Some(1_700_000_000_000),
17954                content: "first".into(),
17955                extra_json: serde_json::Value::Null,
17956                snippets: Vec::new(),
17957            },
17958            Message {
17959                id: None,
17960                idx: 1,
17961                role: MessageRole::Agent,
17962                author: None,
17963                created_at: Some(1_700_000_000_100),
17964                content: "second".into(),
17965                extra_json: serde_json::Value::Null,
17966                snippets: Vec::new(),
17967            },
17968            Message {
17969                id: None,
17970                idx: 2,
17971                role: MessageRole::User,
17972                author: None,
17973                created_at: Some(1_700_000_000_200),
17974                content: "third".into(),
17975                extra_json: serde_json::Value::Null,
17976                snippets: Vec::new(),
17977            },
17978            Message {
17979                id: None,
17980                idx: 3,
17981                role: MessageRole::Agent,
17982                author: None,
17983                created_at: Some(1_700_000_000_300),
17984                content: "fourth".into(),
17985                extra_json: serde_json::Value::Null,
17986                snippets: Vec::new(),
17987            },
17988        ]);
17989
17990        let outcomes = storage
17991            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
17992            .unwrap();
17993        assert_eq!(outcomes.len(), 2);
17994        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
17995        assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
17996        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
17997
17998        let conversation_count: i64 = storage
17999            .conn
18000            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18001                row.get_typed(0)
18002            })
18003            .unwrap();
18004        let conversation_count_not_indexed: i64 = storage
18005            .conn
18006            .query_row_map(
18007                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18008                fparams![],
18009                |row| row.get_typed(0),
18010            )
18011            .unwrap();
18012        let conversation_count_source_index: i64 = storage
18013            .conn
18014            .query_row_map(
18015                "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
18016                fparams![],
18017                |row| row.get_typed(0),
18018            )
18019            .unwrap();
18020        let message_count: i64 = storage
18021            .conn
18022            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18023                row.get_typed(0)
18024            })
18025            .unwrap();
18026        let reopened_storage = SqliteStorage::open(&db_path).unwrap();
18027        let reopened_conversation_count: i64 = reopened_storage
18028            .conn
18029            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18030                row.get_typed(0)
18031            })
18032            .unwrap();
18033        let reopened_conversation_count_not_indexed: i64 = reopened_storage
18034            .conn
18035            .query_row_map(
18036                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18037                fparams![],
18038                |row| row.get_typed(0),
18039            )
18040            .unwrap();
18041        let reopened_conversation_ids: Vec<i64> = reopened_storage
18042            .conn
18043            .query_map_collect(
18044                "SELECT id FROM conversations ORDER BY id",
18045                fparams![],
18046                |row| row.get_typed(0),
18047            )
18048            .unwrap();
18049        let reopened_conversation_ids_not_indexed: Vec<i64> = reopened_storage
18050            .conn
18051            .query_map_collect(
18052                "SELECT id FROM conversations NOT INDEXED ORDER BY id",
18053                fparams![],
18054                |row| row.get_typed(0),
18055            )
18056            .unwrap();
18057        let reopened_conversation_ids_source_index: Vec<i64> = reopened_storage
18058            .conn
18059            .query_map_collect(
18060                "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
18061                fparams![],
18062                |row| row.get_typed(0),
18063            )
18064            .unwrap();
18065
18066        assert_eq!(reopened_conversation_ids, vec![outcomes[0].conversation_id]);
18067        assert_eq!(
18068            reopened_conversation_ids_not_indexed,
18069            vec![outcomes[0].conversation_id]
18070        );
18071        assert_eq!(
18072            reopened_conversation_ids_source_index,
18073            vec![outcomes[0].conversation_id]
18074        );
18075        assert_eq!(reopened_conversation_count, 1);
18076        assert_eq!(reopened_conversation_count_not_indexed, 1);
18077        assert_eq!(conversation_count_not_indexed, 1);
18078        assert_eq!(conversation_count_source_index, 1);
18079        assert_eq!(conversation_count, 1);
18080        assert_eq!(message_count, 4);
18081    }
18082
18083    #[test]
18084    fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
18085        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18086        use std::path::PathBuf;
18087
18088        let dir = TempDir::new().unwrap();
18089        let db_path = dir.path().join("test.db");
18090        let storage = SqliteStorage::open(&db_path).unwrap();
18091
18092        let agent = Agent {
18093            id: None,
18094            slug: "codex".into(),
18095            name: "Codex".into(),
18096            version: Some("0.2.3".into()),
18097            kind: AgentKind::Cli,
18098        };
18099        let agent_id = storage.ensure_agent(&agent).unwrap();
18100
18101        let conv = Conversation {
18102            id: None,
18103            agent_slug: "codex".into(),
18104            workspace: Some(PathBuf::from("/tmp/workspace")),
18105            external_id: Some("recover-duplicate".into()),
18106            title: Some("Recover Duplicate".into()),
18107            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18108            started_at: Some(1_700_000_000_000),
18109            ended_at: Some(1_700_000_000_100),
18110            approx_tokens: None,
18111            metadata_json: serde_json::Value::Null,
18112            messages: vec![Message {
18113                id: None,
18114                idx: 0,
18115                role: MessageRole::User,
18116                author: None,
18117                created_at: Some(1_700_000_000_000),
18118                content: "hello".into(),
18119                extra_json: serde_json::Value::Null,
18120                snippets: Vec::new(),
18121            }],
18122            source_id: "local".into(),
18123            origin_host: None,
18124        };
18125
18126        let tx = storage.conn.transaction().unwrap();
18127        let inserted_id = franken_insert_conversation(&tx, agent_id, None, &conv)
18128            .unwrap()
18129            .expect("first insert should succeed");
18130
18131        let conversation_key = conversation_merge_key(agent_id, &conv);
18132        let resolved = franken_insert_conversation_or_get_existing_after_miss(
18133            &tx,
18134            agent_id,
18135            None,
18136            &conv,
18137            &conversation_key,
18138        )
18139        .unwrap();
18140
18141        match resolved {
18142            ConversationInsertStatus::Existing(existing_id) => {
18143                assert_eq!(existing_id, inserted_id);
18144            }
18145            ConversationInsertStatus::Inserted(new_id) => {
18146                panic!("expected existing conversation id, got freshly inserted {new_id}");
18147            }
18148        }
18149
18150        let conversation_count: i64 = tx
18151            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18152                row.get_typed(0)
18153            })
18154            .unwrap();
18155        assert_eq!(conversation_count, 1);
18156    }
18157
18158    #[test]
18159    fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
18160        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18161        use std::path::PathBuf;
18162
18163        let dir = TempDir::new().unwrap();
18164        let db_path = dir.path().join("test.db");
18165        let storage = SqliteStorage::open(&db_path).unwrap();
18166
18167        let agent = Agent {
18168            id: None,
18169            slug: "codex".into(),
18170            name: "Codex".into(),
18171            version: Some("0.2.3".into()),
18172            kind: AgentKind::Cli,
18173        };
18174        let agent_id = storage.ensure_agent(&agent).unwrap();
18175
18176        let base_conv = |messages: Vec<Message>| Conversation {
18177            id: None,
18178            agent_slug: "codex".into(),
18179            workspace: Some(PathBuf::from("/tmp/workspace")),
18180            external_id: Some("shared-session-gap".into()),
18181            title: Some("Shared Session Gap".into()),
18182            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18183            started_at: Some(1_700_000_000_000),
18184            ended_at: Some(1_700_000_000_999),
18185            approx_tokens: None,
18186            metadata_json: serde_json::Value::Null,
18187            messages,
18188            source_id: "local".into(),
18189            origin_host: None,
18190        };
18191
18192        let conv_a = base_conv(vec![
18193            Message {
18194                id: None,
18195                idx: 2,
18196                role: MessageRole::User,
18197                author: None,
18198                created_at: Some(1_700_000_000_200),
18199                content: "third".into(),
18200                extra_json: serde_json::Value::Null,
18201                snippets: Vec::new(),
18202            },
18203            Message {
18204                id: None,
18205                idx: 3,
18206                role: MessageRole::Agent,
18207                author: None,
18208                created_at: Some(1_700_000_000_300),
18209                content: "fourth".into(),
18210                extra_json: serde_json::Value::Null,
18211                snippets: Vec::new(),
18212            },
18213        ]);
18214        let conv_b = base_conv(vec![
18215            Message {
18216                id: None,
18217                idx: 0,
18218                role: MessageRole::User,
18219                author: None,
18220                created_at: Some(1_700_000_000_000),
18221                content: "first".into(),
18222                extra_json: serde_json::Value::Null,
18223                snippets: Vec::new(),
18224            },
18225            Message {
18226                id: None,
18227                idx: 1,
18228                role: MessageRole::Agent,
18229                author: None,
18230                created_at: Some(1_700_000_000_100),
18231                content: "second".into(),
18232                extra_json: serde_json::Value::Null,
18233                snippets: Vec::new(),
18234            },
18235            Message {
18236                id: None,
18237                idx: 3,
18238                role: MessageRole::Agent,
18239                author: None,
18240                created_at: Some(1_700_000_000_300),
18241                content: "fourth".into(),
18242                extra_json: serde_json::Value::Null,
18243                snippets: Vec::new(),
18244            },
18245        ]);
18246
18247        let outcomes = storage
18248            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
18249            .unwrap();
18250        assert_eq!(outcomes.len(), 2);
18251        assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
18252        assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
18253        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
18254
18255        let stored_indices: Vec<i64> = storage
18256            .conn
18257            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18258                row.get_typed(0)
18259            })
18260            .unwrap();
18261        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
18262    }
18263
18264    #[test]
18265    fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
18266        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18267        use std::path::PathBuf;
18268
18269        let dir = TempDir::new().unwrap();
18270        let db_path = dir.path().join("test.db");
18271        let storage = SqliteStorage::open(&db_path).unwrap();
18272
18273        let agent = Agent {
18274            id: None,
18275            slug: "codex".into(),
18276            name: "Codex".into(),
18277            version: Some("0.2.3".into()),
18278            kind: AgentKind::Cli,
18279        };
18280        let agent_id = storage.ensure_agent(&agent).unwrap();
18281
18282        let make_message = |idx: i64, content: &str| Message {
18283            id: None,
18284            idx,
18285            role: if idx == 0 {
18286                MessageRole::User
18287            } else {
18288                MessageRole::Agent
18289            },
18290            author: None,
18291            created_at: Some(1_700_000_000_000 + idx),
18292            content: content.into(),
18293            extra_json: serde_json::Value::Null,
18294            snippets: Vec::new(),
18295        };
18296
18297        let base_conv = |messages: Vec<Message>| Conversation {
18298            id: None,
18299            agent_slug: "codex".into(),
18300            workspace: Some(PathBuf::from("/tmp/workspace")),
18301            external_id: Some("partial-cache-session".into()),
18302            title: Some("Partial cache session".into()),
18303            source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
18304            started_at: Some(1_700_000_000_000),
18305            ended_at: Some(1_700_000_000_100),
18306            approx_tokens: None,
18307            metadata_json: serde_json::Value::Null,
18308            messages,
18309            source_id: "local".into(),
18310            origin_host: None,
18311        };
18312
18313        let canonical = base_conv(vec![
18314            make_message(0, "canonical zero"),
18315            make_message(20, "canonical twenty"),
18316        ]);
18317        storage
18318            .insert_conversation_tree(agent_id, None, &canonical)
18319            .unwrap();
18320
18321        let exact_prefix = base_conv(vec![make_message(0, "canonical zero")]);
18322        let conflicting_tail = base_conv(vec![make_message(20, "conflicting twenty")]);
18323
18324        let outcomes = storage
18325            .insert_conversations_batched(&[
18326                (agent_id, None, &exact_prefix),
18327                (agent_id, None, &conflicting_tail),
18328            ])
18329            .unwrap();
18330
18331        assert_eq!(outcomes.len(), 2);
18332        assert!(outcomes[0].inserted_indices.is_empty());
18333        assert!(
18334            outcomes[1].inserted_indices.is_empty(),
18335            "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
18336        );
18337
18338        let stored_messages: Vec<(i64, String)> = storage
18339            .conn
18340            .query_map_collect(
18341                "SELECT idx, content FROM messages ORDER BY idx",
18342                fparams![],
18343                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18344            )
18345            .unwrap();
18346        assert_eq!(
18347            stored_messages,
18348            vec![
18349                (0, "canonical zero".to_string()),
18350                (20, "canonical twenty".to_string()),
18351            ]
18352        );
18353    }
18354
18355    #[test]
18356    fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
18357        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18358        use std::path::PathBuf;
18359
18360        const MESSAGE_COUNT: i64 = 64;
18361
18362        let dir = TempDir::new().unwrap();
18363        let db_path = dir.path().join("test.db");
18364        let storage = SqliteStorage::open(&db_path).unwrap();
18365
18366        let agent = Agent {
18367            id: None,
18368            slug: "codex".into(),
18369            name: "Codex".into(),
18370            version: Some("0.2.3".into()),
18371            kind: AgentKind::Cli,
18372        };
18373        let agent_id = storage.ensure_agent(&agent).unwrap();
18374
18375        let messages: Vec<Message> = (0..MESSAGE_COUNT)
18376            .map(|idx| Message {
18377                id: None,
18378                idx,
18379                role: if idx % 2 == 0 {
18380                    MessageRole::User
18381                } else {
18382                    MessageRole::Agent
18383                },
18384                author: None,
18385                created_at: Some(1_700_000_000_000 + idx),
18386                content: format!("message {idx}"),
18387                extra_json: serde_json::Value::Null,
18388                snippets: Vec::new(),
18389            })
18390            .collect();
18391
18392        let conversation = Conversation {
18393            id: None,
18394            agent_slug: "codex".into(),
18395            workspace: Some(PathBuf::from("/tmp/workspace")),
18396            external_id: Some("large-reprocess-session".into()),
18397            title: Some("Large Reprocess Session".into()),
18398            source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
18399            started_at: Some(1_700_000_000_000),
18400            ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
18401            approx_tokens: None,
18402            metadata_json: serde_json::Value::Null,
18403            messages,
18404            source_id: "local".into(),
18405            origin_host: None,
18406        };
18407
18408        let first = storage
18409            .insert_conversations_batched(&[(agent_id, None, &conversation)])
18410            .unwrap();
18411        let second = storage
18412            .insert_conversations_batched(&[(agent_id, None, &conversation)])
18413            .unwrap();
18414
18415        assert_eq!(first.len(), 1);
18416        assert_eq!(second.len(), 1);
18417        assert_eq!(first[0].inserted_indices.len(), MESSAGE_COUNT as usize);
18418        assert!(
18419            second[0].inserted_indices.is_empty(),
18420            "full reprocessing of a large conversation must not attempt duplicate idx inserts"
18421        );
18422        assert_eq!(first[0].conversation_id, second[0].conversation_id);
18423
18424        let conversation_count: i64 = storage
18425            .conn
18426            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18427                row.get_typed(0)
18428            })
18429            .unwrap();
18430        let message_count: i64 = storage
18431            .conn
18432            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18433                row.get_typed(0)
18434            })
18435            .unwrap();
18436
18437        assert_eq!(conversation_count, 1);
18438        assert_eq!(message_count, MESSAGE_COUNT);
18439    }
18440
    /// Inserts ten conversations with distinct external ids from rayon workers
    /// (chunks of three, one writer connection per chunk) and checks that every
    /// conversation gets all three of its messages inserted, receives its own
    /// conversation id, and is visible through a freshly opened connection.
    #[test]
    fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
        use crate::connectors::{NormalizedConversation, NormalizedMessage};
        use crate::indexer::persist::map_to_internal;
        use crate::model::types::{Agent, AgentKind};
        use frankensqlite::compat::{ConnectionExt, RowExt};
        use rand::RngExt;
        use rayon::prelude::*;

        // True when `err` itself, or failing that its root cause, is a
        // `FrankenError` of one of the busy/conflict variants listed below.
        fn retryable_franken_error(err: &anyhow::Error) -> bool {
            err.downcast_ref::<frankensqlite::FrankenError>()
                .or_else(|| {
                    err.root_cause()
                        .downcast_ref::<frankensqlite::FrankenError>()
                })
                .is_some_and(|inner| {
                    matches!(
                        inner,
                        frankensqlite::FrankenError::Busy
                            | frankensqlite::FrankenError::BusyRecovery
                            | frankensqlite::FrankenError::BusySnapshot { .. }
                            | frankensqlite::FrankenError::WriteConflict { .. }
                            | frankensqlite::FrankenError::SerializationFailure { .. }
                    )
                })
        }

        // Calls `f` up to 25 times. After a retryable failure it sleeps for the
        // current backoff plus a random jitter of at most the same amount; the
        // backoff starts at 4 ms and doubles up to a 512 ms cap. Non-retryable
        // errors, and the error from the final attempt, are returned as-is.
        fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
        where
            F: FnMut() -> anyhow::Result<T>,
        {
            let mut rng = rand::rng();
            let mut backoff_ms = 4_u64;
            for attempt in 0..=24 {
                match f() {
                    Ok(value) => return Ok(value),
                    // The `attempt < 24` guard makes the last attempt fall
                    // through to the plain `Err` arm below.
                    Err(err) if attempt < 24 && retryable_franken_error(&err) => {
                        let sleep_ms = backoff_ms + rng.random_range(0..=backoff_ms);
                        std::thread::sleep(Duration::from_millis(sleep_ms));
                        backoff_ms = (backoff_ms * 2).min(512);
                    }
                    Err(err) => return Err(err),
                }
            }
            unreachable!("retry loop must return on success or final failure")
        }

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("parallel_insert_conversation_tree.db");
        // Open and immediately drop a handle before the workers start.
        // NOTE(review): presumably this creates the file and schema up front — confirm.
        let seed = FrankenStorage::open(&db_path).unwrap();
        drop(seed);

        // Ten conversations, each with its own external id, workspace and
        // source path; agent slugs cycle through three values.
        let conversations: Vec<NormalizedConversation> = (0..10)
            .map(|i| NormalizedConversation {
                agent_slug: format!("agent-{}", i % 3),
                external_id: Some(format!("conv-{i}")),
                title: Some(format!("Conversation {i}")),
                workspace: Some(PathBuf::from(format!("/ws/{i}"))),
                source_path: PathBuf::from(format!("/log/{i}.jsonl")),
                started_at: Some(1_000 + i * 100),
                ended_at: Some(1_000 + i * 100 + 50),
                metadata: serde_json::json!({}),
                messages: (0..3)
                    .map(|j| NormalizedMessage {
                        idx: j,
                        role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
                        author: Some("tester".into()),
                        created_at: Some(1_000 + i * 100 + j * 10),
                        content: format!("parallel-distinct-test conv={i} msg={j}"),
                        extra: serde_json::json!({}),
                        snippets: vec![],
                        invocations: Vec::new(),
                    })
                    .collect(),
            })
            .collect();

        // Each parallel chunk opens its own writer and keeps its own id caches.
        // Results are (external_id, conversation_id, inserted_indices) tuples.
        let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
            .par_chunks(3)
            .map(|chunk| {
                let storage = FrankenStorage::open_writer(&db_path).unwrap();
                let mut agent_cache: HashMap<String, i64> = HashMap::new();
                let mut workspace_cache: HashMap<PathBuf, i64> = HashMap::new();
                let mut chunk_outcomes = Vec::with_capacity(chunk.len());

                for conv in chunk {
                    let agent_slug = conv.agent_slug.clone();
                    let workspace = conv.workspace.clone();
                    let external_id = conv.external_id.clone().expect("external id");
                    let internal = map_to_internal(conv);
                    // The whole ensure-agent / ensure-workspace / insert
                    // sequence is retried as a unit. Cache entries written
                    // before a failed attempt are reused on the next one.
                    let outcome = with_retry(|| {
                        let agent_id = if let Some(id) = agent_cache.get(&agent_slug) {
                            *id
                        } else {
                            let agent = Agent {
                                id: None,
                                slug: agent_slug.clone(),
                                name: agent_slug.clone(),
                                version: None,
                                kind: AgentKind::Cli,
                            };
                            let id = storage.ensure_agent(&agent)?;
                            agent_cache.insert(agent_slug.clone(), id);
                            id
                        };
                        let workspace_id = if let Some(path) = &workspace {
                            if let Some(id) = workspace_cache.get(path) {
                                Some(*id)
                            } else {
                                let id = storage.ensure_workspace(path, None)?;
                                workspace_cache.insert(path.clone(), id);
                                Some(id)
                            }
                        } else {
                            None
                        };
                        storage.insert_conversation_tree(agent_id, workspace_id, &internal)
                    })
                    .unwrap();
                    chunk_outcomes.push((
                        external_id,
                        outcome.conversation_id,
                        outcome.inserted_indices,
                    ));
                }

                storage.close().unwrap();
                chunk_outcomes
            })
            .flatten()
            .collect();
        // Sort by external id so failure output does not depend on scheduling.
        outcomes.sort_by(|left, right| left.0.cmp(&right.0));

        // Every conversation must have had all three messages inserted.
        assert!(
            outcomes
                .iter()
                .all(|(_, _, inserted_indices)| inserted_indices == &vec![0, 1, 2]),
            "unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
        );

        // No two conversations may have been given the same row id.
        let distinct_ids: HashSet<i64> = outcomes
            .iter()
            .map(|(_, conversation_id, _)| *conversation_id)
            .collect();
        assert_eq!(
            distinct_ids.len(),
            conversations.len(),
            "unique external ids must produce distinct conversation ids: {outcomes:?}"
        );

        // Re-read through a fresh connection after all writers have closed.
        let reader = FrankenStorage::open(&db_path).unwrap();
        let stored_rows: Vec<(i64, String)> = reader
            .raw()
            .query_map_collect(
                "SELECT id, external_id FROM conversations ORDER BY id",
                &[],
                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
            )
            .unwrap();
        let stored_count: i64 = reader
            .raw()
            .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
                row.get_typed(0)
            })
            .unwrap();

        assert_eq!(
            stored_count as usize,
            conversations.len(),
            "parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
        );
        assert_eq!(
            stored_rows.len(),
            conversations.len(),
            "parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
        );
    }
18618
18619    #[test]
18620    fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
18621        use crate::connectors::{NormalizedConversation, NormalizedMessage};
18622        use crate::indexer::persist::map_to_internal;
18623        use crate::model::types::{Agent, AgentKind};
18624        use std::path::PathBuf;
18625
18626        let dir = TempDir::new().unwrap();
18627        let db_path = dir.path().join("test.db");
18628        let storage = SqliteStorage::open(&db_path).unwrap();
18629
18630        let agent = Agent {
18631            id: None,
18632            slug: "codex".into(),
18633            name: "Codex".into(),
18634            version: Some("0.2.3".into()),
18635            kind: AgentKind::Cli,
18636        };
18637        let agent_id = storage.ensure_agent(&agent).unwrap();
18638
18639        let base_conv = |messages: Vec<NormalizedMessage>| NormalizedConversation {
18640            agent_slug: "codex".into(),
18641            workspace: Some(PathBuf::from("/tmp/workspace")),
18642            external_id: Some("tree-gap-session".into()),
18643            title: Some("Tree Gap Session".into()),
18644            source_path: PathBuf::from("/tmp/tree.jsonl"),
18645            started_at: Some(1_700_000_000_000),
18646            ended_at: Some(1_700_000_000_999),
18647            metadata: serde_json::Value::Null,
18648            messages,
18649        };
18650
18651        let conv_a = map_to_internal(&base_conv(vec![
18652            NormalizedMessage {
18653                idx: 2,
18654                role: "user".into(),
18655                author: None,
18656                created_at: Some(1_700_000_000_200),
18657                content: "third".into(),
18658                extra: serde_json::Value::Null,
18659                snippets: Vec::new(),
18660                invocations: Vec::new(),
18661            },
18662            NormalizedMessage {
18663                idx: 3,
18664                role: "assistant".into(),
18665                author: None,
18666                created_at: Some(1_700_000_000_300),
18667                content: "fourth".into(),
18668                extra: serde_json::Value::Null,
18669                snippets: Vec::new(),
18670                invocations: Vec::new(),
18671            },
18672        ]));
18673        let conv_b = map_to_internal(&base_conv(vec![
18674            NormalizedMessage {
18675                idx: 0,
18676                role: "user".into(),
18677                author: None,
18678                created_at: Some(1_700_000_000_000),
18679                content: "first".into(),
18680                extra: serde_json::Value::Null,
18681                snippets: Vec::new(),
18682                invocations: Vec::new(),
18683            },
18684            NormalizedMessage {
18685                idx: 1,
18686                role: "assistant".into(),
18687                author: None,
18688                created_at: Some(1_700_000_000_100),
18689                content: "second".into(),
18690                extra: serde_json::Value::Null,
18691                snippets: Vec::new(),
18692                invocations: Vec::new(),
18693            },
18694            NormalizedMessage {
18695                idx: 3,
18696                role: "assistant".into(),
18697                author: None,
18698                created_at: Some(1_700_000_000_300),
18699                content: "fourth".into(),
18700                extra: serde_json::Value::Null,
18701                snippets: Vec::new(),
18702                invocations: Vec::new(),
18703            },
18704        ]));
18705
18706        let first = storage
18707            .insert_conversation_tree(agent_id, None, &conv_a)
18708            .unwrap();
18709        let second = storage
18710            .insert_conversation_tree(agent_id, None, &conv_b)
18711            .unwrap();
18712
18713        assert_eq!(first.inserted_indices, vec![2, 3]);
18714        assert_eq!(second.inserted_indices, vec![0, 1]);
18715        assert_eq!(first.conversation_id, second.conversation_id);
18716
18717        let stored_indices: Vec<i64> = storage
18718            .conn
18719            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18720                row.get_typed(0)
18721            })
18722            .unwrap();
18723        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
18724    }
18725
18726    #[test]
18727    fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
18728        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18729        use std::path::PathBuf;
18730
18731        let dir = TempDir::new().unwrap();
18732        let db_path = dir.path().join("test.db");
18733        let storage = SqliteStorage::open(&db_path).unwrap();
18734
18735        let agent = Agent {
18736            id: None,
18737            slug: "codex".into(),
18738            name: "Codex".into(),
18739            version: Some("0.2.3".into()),
18740            kind: AgentKind::Cli,
18741        };
18742        let agent_id = storage.ensure_agent(&agent).unwrap();
18743
18744        let conversation = Conversation {
18745            id: None,
18746            agent_slug: "codex".into(),
18747            workspace: Some(PathBuf::from("/tmp/workspace")),
18748            external_id: Some("duplicate-new-session".into()),
18749            title: Some("Duplicate New Session".into()),
18750            source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
18751            started_at: Some(1_700_000_000_000),
18752            ended_at: Some(1_700_000_000_999),
18753            approx_tokens: None,
18754            metadata_json: serde_json::Value::Null,
18755            messages: vec![
18756                Message {
18757                    id: None,
18758                    idx: 0,
18759                    role: MessageRole::User,
18760                    author: None,
18761                    created_at: Some(1_700_000_000_000),
18762                    content: "first canonical".into(),
18763                    extra_json: serde_json::Value::Null,
18764                    snippets: Vec::new(),
18765                },
18766                Message {
18767                    id: None,
18768                    idx: 0,
18769                    role: MessageRole::User,
18770                    author: None,
18771                    created_at: Some(1_700_000_000_001),
18772                    content: "duplicate idx should be skipped".into(),
18773                    extra_json: serde_json::Value::Null,
18774                    snippets: Vec::new(),
18775                },
18776                Message {
18777                    id: None,
18778                    idx: 1,
18779                    role: MessageRole::Agent,
18780                    author: None,
18781                    created_at: Some(1_700_000_000_100),
18782                    content: "second".into(),
18783                    extra_json: serde_json::Value::Null,
18784                    snippets: Vec::new(),
18785                },
18786            ],
18787            source_id: "local".into(),
18788            origin_host: None,
18789        };
18790
18791        let outcome = storage
18792            .insert_conversation_tree(agent_id, None, &conversation)
18793            .unwrap();
18794
18795        assert_eq!(outcome.inserted_indices, vec![0, 1]);
18796
18797        let stored_messages: Vec<(i64, String)> = storage
18798            .conn
18799            .query_map_collect(
18800                "SELECT idx, content FROM messages ORDER BY idx",
18801                fparams![],
18802                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18803            )
18804            .unwrap();
18805        assert_eq!(
18806            stored_messages,
18807            vec![
18808                (0, "first canonical".to_string()),
18809                (1, "second".to_string())
18810            ]
18811        );
18812    }
18813
18814    #[test]
18815    fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
18816        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18817        use std::path::PathBuf;
18818
18819        let dir = TempDir::new().unwrap();
18820        let db_path = dir.path().join("test.db");
18821        let storage = SqliteStorage::open(&db_path).unwrap();
18822
18823        let agent = Agent {
18824            id: None,
18825            slug: "codex".into(),
18826            name: "Codex".into(),
18827            version: Some("0.2.3".into()),
18828            kind: AgentKind::Cli,
18829        };
18830        let agent_id = storage.ensure_agent(&agent).unwrap();
18831
18832        let base_conv = |messages: Vec<Message>| Conversation {
18833            id: None,
18834            agent_slug: "codex".into(),
18835            workspace: Some(PathBuf::from("/tmp/workspace")),
18836            external_id: None,
18837            title: Some("Source Path Merge".into()),
18838            source_path: PathBuf::from("/tmp/shared-session.jsonl"),
18839            started_at: Some(1_700_000_000_000),
18840            ended_at: Some(1_700_000_000_999),
18841            approx_tokens: None,
18842            metadata_json: serde_json::Value::Null,
18843            messages,
18844            source_id: "local".into(),
18845            origin_host: None,
18846        };
18847
18848        let first = storage
18849            .insert_conversation_tree(
18850                agent_id,
18851                None,
18852                &base_conv(vec![
18853                    Message {
18854                        id: None,
18855                        idx: 0,
18856                        role: MessageRole::User,
18857                        author: None,
18858                        created_at: Some(1_700_000_000_000),
18859                        content: "first".into(),
18860                        extra_json: serde_json::Value::Null,
18861                        snippets: Vec::new(),
18862                    },
18863                    Message {
18864                        id: None,
18865                        idx: 1,
18866                        role: MessageRole::Agent,
18867                        author: None,
18868                        created_at: Some(1_700_000_000_100),
18869                        content: "second".into(),
18870                        extra_json: serde_json::Value::Null,
18871                        snippets: Vec::new(),
18872                    },
18873                ]),
18874            )
18875            .unwrap();
18876
18877        let second = storage
18878            .insert_conversation_tree(
18879                agent_id,
18880                None,
18881                &base_conv(vec![
18882                    Message {
18883                        id: None,
18884                        idx: 1,
18885                        role: MessageRole::Agent,
18886                        author: None,
18887                        created_at: Some(1_700_000_000_100),
18888                        content: "second".into(),
18889                        extra_json: serde_json::Value::Null,
18890                        snippets: Vec::new(),
18891                    },
18892                    Message {
18893                        id: None,
18894                        idx: 2,
18895                        role: MessageRole::User,
18896                        author: None,
18897                        created_at: Some(1_700_000_000_200),
18898                        content: "third".into(),
18899                        extra_json: serde_json::Value::Null,
18900                        snippets: Vec::new(),
18901                    },
18902                ]),
18903            )
18904            .unwrap();
18905
18906        assert_eq!(first.conversation_id, second.conversation_id);
18907        assert_eq!(first.inserted_indices, vec![0, 1]);
18908        assert_eq!(second.inserted_indices, vec![2]);
18909
18910        let stored_indices: Vec<i64> = storage
18911            .conn
18912            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18913                row.get_typed(0)
18914            })
18915            .unwrap();
18916        assert_eq!(stored_indices, vec![0, 1, 2]);
18917    }
18918
18919    #[test]
18920    fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
18921        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18922        use std::path::PathBuf;
18923
18924        let dir = TempDir::new().unwrap();
18925        let db_path = dir.path().join("test.db");
18926        let storage = SqliteStorage::open(&db_path).unwrap();
18927
18928        let agent = Agent {
18929            id: None,
18930            slug: "codex".into(),
18931            name: "Codex".into(),
18932            version: Some("0.2.3".into()),
18933            kind: AgentKind::Cli,
18934        };
18935        let agent_id = storage.ensure_agent(&agent).unwrap();
18936
18937        let base_conv = |started_at: Option<i64>, messages: Vec<Message>| Conversation {
18938            id: None,
18939            agent_slug: "codex".into(),
18940            workspace: Some(PathBuf::from("/tmp/workspace")),
18941            external_id: None,
18942            title: Some("Drift Merge".into()),
18943            source_path: PathBuf::from("/tmp/drift-session.jsonl"),
18944            started_at,
18945            ended_at: Some(1_700_000_000_999),
18946            approx_tokens: None,
18947            metadata_json: serde_json::Value::Null,
18948            messages,
18949            source_id: "local".into(),
18950            origin_host: None,
18951        };
18952
18953        let first = storage
18954            .insert_conversation_tree(
18955                agent_id,
18956                None,
18957                &base_conv(
18958                    Some(1_700_000_000_000),
18959                    vec![
18960                        Message {
18961                            id: None,
18962                            idx: 0,
18963                            role: MessageRole::User,
18964                            author: None,
18965                            created_at: Some(1_700_000_000_000),
18966                            content: "first".into(),
18967                            extra_json: serde_json::Value::Null,
18968                            snippets: Vec::new(),
18969                        },
18970                        Message {
18971                            id: None,
18972                            idx: 1,
18973                            role: MessageRole::Agent,
18974                            author: None,
18975                            created_at: Some(1_700_000_000_100),
18976                            content: "second".into(),
18977                            extra_json: serde_json::Value::Null,
18978                            snippets: Vec::new(),
18979                        },
18980                    ],
18981                ),
18982            )
18983            .unwrap();
18984
18985        let second = storage
18986            .insert_conversation_tree(
18987                agent_id,
18988                None,
18989                &base_conv(
18990                    Some(1_700_000_004_000),
18991                    vec![
18992                        Message {
18993                            id: None,
18994                            idx: 1,
18995                            role: MessageRole::Agent,
18996                            author: None,
18997                            created_at: Some(1_700_000_000_100),
18998                            content: "second".into(),
18999                            extra_json: serde_json::Value::Null,
19000                            snippets: Vec::new(),
19001                        },
19002                        Message {
19003                            id: None,
19004                            idx: 2,
19005                            role: MessageRole::User,
19006                            author: None,
19007                            created_at: Some(1_700_000_004_200),
19008                            content: "third".into(),
19009                            extra_json: serde_json::Value::Null,
19010                            snippets: Vec::new(),
19011                        },
19012                    ],
19013                ),
19014            )
19015            .unwrap();
19016
19017        assert_eq!(first.conversation_id, second.conversation_id);
19018        assert_eq!(second.inserted_indices, vec![2]);
19019    }
19020
19021    #[test]
19022    fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
19023        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19024        use std::path::PathBuf;
19025
19026        let dir = TempDir::new().unwrap();
19027        let db_path = dir.path().join("test.db");
19028        let storage = SqliteStorage::open(&db_path).unwrap();
19029
19030        let agent = Agent {
19031            id: None,
19032            slug: "codex".into(),
19033            name: "Codex".into(),
19034            version: Some("0.2.3".into()),
19035            kind: AgentKind::Cli,
19036        };
19037        let agent_id = storage.ensure_agent(&agent).unwrap();
19038
19039        let make_conv = |started_at: i64, idx: i64, content: &str| Conversation {
19040            id: None,
19041            agent_slug: "codex".into(),
19042            workspace: Some(PathBuf::from("/tmp/workspace")),
19043            external_id: None,
19044            title: Some("Partial overlap".into()),
19045            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19046            started_at: Some(started_at),
19047            ended_at: Some(started_at + 500),
19048            approx_tokens: None,
19049            metadata_json: serde_json::Value::Null,
19050            messages: vec![Message {
19051                id: None,
19052                idx,
19053                role: MessageRole::User,
19054                author: None,
19055                created_at: Some(started_at),
19056                content: content.into(),
19057                extra_json: serde_json::Value::Null,
19058                snippets: Vec::new(),
19059            }],
19060            source_id: "local".into(),
19061            origin_host: None,
19062        };
19063
19064        storage
19065            .insert_conversation_tree(
19066                agent_id,
19067                None,
19068                &Conversation {
19069                    messages: vec![
19070                        Message {
19071                            id: None,
19072                            idx: 0,
19073                            role: MessageRole::User,
19074                            author: None,
19075                            created_at: Some(1_700_000_000_000),
19076                            content: "shared opener".into(),
19077                            extra_json: serde_json::Value::Null,
19078                            snippets: Vec::new(),
19079                        },
19080                        Message {
19081                            id: None,
19082                            idx: 1,
19083                            role: MessageRole::Agent,
19084                            author: None,
19085                            created_at: Some(1_700_000_000_100),
19086                            content: "first session unique".into(),
19087                            extra_json: serde_json::Value::Null,
19088                            snippets: Vec::new(),
19089                        },
19090                    ],
19091                    ..make_conv(1_700_000_000_000, 0, "unused")
19092                },
19093            )
19094            .unwrap();
19095        storage
19096            .insert_conversation_tree(
19097                agent_id,
19098                None,
19099                &make_conv(1_700_000_900_000, 0, "shared opener"),
19100            )
19101            .unwrap();
19102
19103        let conversation_count: i64 = storage
19104            .conn
19105            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19106                row.get_typed(0)
19107            })
19108            .unwrap();
19109        assert_eq!(conversation_count, 2);
19110    }
19111
19112    #[test]
19113    fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
19114        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19115        use std::path::PathBuf;
19116
19117        let dir = TempDir::new().unwrap();
19118        let db_path = dir.path().join("test.db");
19119        let storage = SqliteStorage::open(&db_path).unwrap();
19120
19121        let agent = Agent {
19122            id: None,
19123            slug: "codex".into(),
19124            name: "Codex".into(),
19125            version: Some("0.2.3".into()),
19126            kind: AgentKind::Cli,
19127        };
19128        let agent_id = storage.ensure_agent(&agent).unwrap();
19129
19130        let make_conv = |started_at: i64, created_at: i64, content: &str| Conversation {
19131            id: None,
19132            agent_slug: "codex".into(),
19133            workspace: Some(PathBuf::from("/tmp/workspace")),
19134            external_id: None,
19135            title: Some("Same Path Different Session".into()),
19136            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19137            started_at: Some(started_at),
19138            ended_at: Some(started_at + 500),
19139            approx_tokens: None,
19140            metadata_json: serde_json::Value::Null,
19141            messages: vec![Message {
19142                id: None,
19143                idx: 0,
19144                role: MessageRole::User,
19145                author: None,
19146                created_at: Some(created_at),
19147                content: content.into(),
19148                extra_json: serde_json::Value::Null,
19149                snippets: Vec::new(),
19150            }],
19151            source_id: "local".into(),
19152            origin_host: None,
19153        };
19154
19155        storage
19156            .insert_conversation_tree(
19157                agent_id,
19158                None,
19159                &make_conv(1_700_000_000_000, 1_700_000_000_000, "first session"),
19160            )
19161            .unwrap();
19162        storage
19163            .insert_conversation_tree(
19164                agent_id,
19165                None,
19166                &make_conv(1_700_000_900_000, 1_700_000_900_000, "second session"),
19167            )
19168            .unwrap();
19169
19170        let conversation_count: i64 = storage
19171            .conn
19172            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19173                row.get_typed(0)
19174            })
19175            .unwrap();
19176        assert_eq!(conversation_count, 2);
19177    }
19178
19179    #[test]
19180    fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
19181        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19182        use std::path::PathBuf;
19183
19184        let dir = TempDir::new().unwrap();
19185        let db_path = dir.path().join("test.db");
19186        let storage = SqliteStorage::open(&db_path).unwrap();
19187
19188        let agent = Agent {
19189            id: None,
19190            slug: "codex".into(),
19191            name: "Codex".into(),
19192            version: Some("0.2.3".into()),
19193            kind: AgentKind::Cli,
19194        };
19195        let agent_id = storage.ensure_agent(&agent).unwrap();
19196
19197        let make_conv = |started_at: i64, messages: Vec<Message>| Conversation {
19198            id: None,
19199            agent_slug: "codex".into(),
19200            workspace: Some(PathBuf::from("/tmp/workspace")),
19201            external_id: None,
19202            title: Some("Shifted replay".into()),
19203            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
19204            started_at: Some(started_at),
19205            ended_at: Some(started_at + 500),
19206            approx_tokens: None,
19207            metadata_json: serde_json::Value::Null,
19208            messages,
19209            source_id: "local".into(),
19210            origin_host: None,
19211        };
19212
19213        let first = storage
19214            .insert_conversation_tree(
19215                agent_id,
19216                None,
19217                &make_conv(
19218                    1_700_000_000_000,
19219                    vec![
19220                        Message {
19221                            id: None,
19222                            idx: 0,
19223                            role: MessageRole::User,
19224                            author: None,
19225                            created_at: Some(1_700_000_000_000),
19226                            content: "first".into(),
19227                            extra_json: serde_json::Value::Null,
19228                            snippets: Vec::new(),
19229                        },
19230                        Message {
19231                            id: None,
19232                            idx: 1,
19233                            role: MessageRole::Agent,
19234                            author: None,
19235                            created_at: Some(1_700_000_000_100),
19236                            content: "second".into(),
19237                            extra_json: serde_json::Value::Null,
19238                            snippets: Vec::new(),
19239                        },
19240                    ],
19241                ),
19242            )
19243            .unwrap();
19244
19245        let second = storage
19246            .insert_conversation_tree(
19247                agent_id,
19248                None,
19249                &make_conv(
19250                    1_700_000_900_000,
19251                    vec![
19252                        Message {
19253                            id: None,
19254                            idx: 10,
19255                            role: MessageRole::User,
19256                            author: None,
19257                            created_at: Some(1_700_000_000_000),
19258                            content: "first".into(),
19259                            extra_json: serde_json::Value::Null,
19260                            snippets: Vec::new(),
19261                        },
19262                        Message {
19263                            id: None,
19264                            idx: 11,
19265                            role: MessageRole::Agent,
19266                            author: None,
19267                            created_at: Some(1_700_000_000_100),
19268                            content: "second".into(),
19269                            extra_json: serde_json::Value::Null,
19270                            snippets: Vec::new(),
19271                        },
19272                        Message {
19273                            id: None,
19274                            idx: 12,
19275                            role: MessageRole::User,
19276                            author: None,
19277                            created_at: Some(1_700_000_000_200),
19278                            content: "third".into(),
19279                            extra_json: serde_json::Value::Null,
19280                            snippets: Vec::new(),
19281                        },
19282                    ],
19283                ),
19284            )
19285            .unwrap();
19286
19287        assert_eq!(first.conversation_id, second.conversation_id);
19288        assert_eq!(second.inserted_indices, vec![12]);
19289
19290        let stored_indices: Vec<i64> = storage
19291            .conn
19292            .query_map_collect(
19293                "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
19294                fparams![first.conversation_id],
19295                |row| row.get_typed(0),
19296            )
19297            .unwrap();
19298        assert_eq!(stored_indices, vec![0, 1, 12]);
19299    }
19300
19301    #[test]
19302    fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
19303        use crate::model::types::{Conversation, Message, MessageRole};
19304        use std::path::PathBuf;
19305
19306        fn base_conv(source_path: &str, messages: Vec<Message>) -> Conversation {
19307            Conversation {
19308                id: None,
19309                agent_slug: "codex".into(),
19310                workspace: Some(PathBuf::from("/tmp/workspace")),
19311                external_id: None,
19312                title: Some("Recovered".into()),
19313                source_path: PathBuf::from(source_path),
19314                started_at: Some(1_700_000_000_000),
19315                ended_at: Some(1_700_000_000_999),
19316                approx_tokens: None,
19317                metadata_json: serde_json::Value::Null,
19318                messages,
19319                source_id: "local".into(),
19320                origin_host: None,
19321            }
19322        }
19323
19324        let dir = TempDir::new().unwrap();
19325        let canonical_db = dir.path().join("agent_search.db");
19326        let storage = SqliteStorage::open(&canonical_db).unwrap();
19327
19328        let overlapping_a = base_conv(
19329            "/tmp/shared-history.jsonl",
19330            vec![
19331                Message {
19332                    id: None,
19333                    idx: 0,
19334                    role: MessageRole::User,
19335                    author: None,
19336                    created_at: Some(1_700_000_000_000),
19337                    content: "first".into(),
19338                    extra_json: serde_json::Value::Null,
19339                    snippets: Vec::new(),
19340                },
19341                Message {
19342                    id: None,
19343                    idx: 1,
19344                    role: MessageRole::Agent,
19345                    author: None,
19346                    created_at: Some(1_700_000_000_100),
19347                    content: "second".into(),
19348                    extra_json: serde_json::Value::Null,
19349                    snippets: Vec::new(),
19350                },
19351            ],
19352        );
19353        let overlapping_b = base_conv(
19354            "/tmp/shared-history.jsonl",
19355            vec![
19356                Message {
19357                    id: None,
19358                    idx: 1,
19359                    role: MessageRole::Agent,
19360                    author: None,
19361                    created_at: Some(1_700_000_000_100),
19362                    content: "second".into(),
19363                    extra_json: serde_json::Value::Null,
19364                    snippets: Vec::new(),
19365                },
19366                Message {
19367                    id: None,
19368                    idx: 2,
19369                    role: MessageRole::User,
19370                    author: None,
19371                    created_at: Some(1_700_000_000_200),
19372                    content: "third".into(),
19373                    extra_json: serde_json::Value::Null,
19374                    snippets: Vec::new(),
19375                },
19376            ],
19377        );
19378        let unique = Conversation {
19379            source_path: PathBuf::from("/tmp/unique-history.jsonl"),
19380            messages: vec![Message {
19381                id: None,
19382                idx: 0,
19383                role: MessageRole::User,
19384                author: None,
19385                created_at: Some(1_700_000_001_000),
19386                content: "unique".into(),
19387                extra_json: serde_json::Value::Null,
19388                snippets: Vec::new(),
19389            }],
19390            started_at: Some(1_700_000_001_000),
19391            ended_at: Some(1_700_000_001_100),
19392            ..base_conv("/tmp/unique-history.jsonl", Vec::new())
19393        };
19394
19395        seed_historical_db_direct(
19396            &dir.path()
19397                .join("backups/agent_search.db.20260322T020200.bak"),
19398            std::slice::from_ref(&overlapping_a),
19399        );
19400        seed_historical_db_direct(
19401            &dir.path().join("agent_search.corrupt.20260324_212907"),
19402            &[overlapping_b, unique],
19403        );
19404
19405        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19406        assert_eq!(first.bundles_considered, 2);
19407        assert_eq!(first.bundles_imported, 2);
19408        assert_eq!(first.messages_imported, 4);
19409
19410        let conversations = storage.list_conversations(10, 0).unwrap();
19411        assert_eq!(conversations.len(), 2);
19412
19413        let shared_id = conversations
19414            .iter()
19415            .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
19416            .and_then(|conv| conv.id)
19417            .unwrap();
19418        let shared_indices: Vec<i64> = storage
19419            .fetch_messages(shared_id)
19420            .unwrap()
19421            .into_iter()
19422            .map(|msg| msg.idx)
19423            .collect();
19424        assert_eq!(shared_indices, vec![0, 1, 2]);
19425
19426        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19427        assert_eq!(second.bundles_imported, 0);
19428        assert_eq!(second.messages_imported, 0);
19429    }
19430
19431    #[test]
19432    fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
19433        use crate::model::types::{Conversation, Message, MessageRole};
19434        use std::path::PathBuf;
19435
19436        let dir = TempDir::new().unwrap();
19437        let canonical_db = dir.path().join("agent_search.db");
19438        let storage = SqliteStorage::open(&canonical_db).unwrap();
19439
19440        let host_only_remote = Conversation {
19441            id: None,
19442            agent_slug: "codex".into(),
19443            workspace: Some(PathBuf::from("/tmp/workspace")),
19444            external_id: None,
19445            title: Some("Recovered Host Only Remote".into()),
19446            source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
19447            started_at: Some(1_700_000_000_000),
19448            ended_at: Some(1_700_000_000_999),
19449            approx_tokens: None,
19450            metadata_json: serde_json::Value::Null,
19451            messages: vec![Message {
19452                id: None,
19453                idx: 0,
19454                role: MessageRole::User,
19455                author: None,
19456                created_at: Some(1_700_000_000_000),
19457                content: "host-only remote".into(),
19458                extra_json: serde_json::Value::Null,
19459                snippets: Vec::new(),
19460            }],
19461            source_id: "   ".into(),
19462            origin_host: Some("builder-5".into()),
19463        };
19464
19465        let historical_db = dir
19466            .path()
19467            .join("backups/agent_search.db.20260322T020200.bak");
19468        seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
19469
19470        let historical_conn =
19471            FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
19472        historical_conn
19473            .execute_compat(
19474                "INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
19475                fparams!["   ", "ssh", "builder-5", 0_i64, 0_i64],
19476            )
19477            .unwrap();
19478        historical_conn
19479            .execute_compat(
19480                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
19481                fparams!["   ", "builder-5", "/tmp/host-only-history.jsonl"],
19482            )
19483            .unwrap();
19484        historical_conn
19485            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
19486            .unwrap();
19487        drop(historical_conn);
19488
19489        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19490        assert_eq!(first.bundles_imported, 1);
19491        assert_eq!(first.messages_imported, 1);
19492
19493        let source_ids = storage.get_source_ids().unwrap();
19494        assert_eq!(source_ids, vec!["builder-5".to_string()]);
19495
19496        let conversations = storage.list_conversations(10, 0).unwrap();
19497        assert_eq!(conversations.len(), 1);
19498        assert_eq!(conversations[0].source_id, "builder-5");
19499        assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
19500    }
19501
19502    #[test]
19503    fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
19504        use crate::model::types::{Conversation, Message, MessageRole};
19505        use std::path::PathBuf;
19506
19507        let mut attempts: Vec<Vec<usize>> = Vec::new();
19508        let entry = HistoricalBatchEntry {
19509            source_row_id: 77,
19510            agent_id: 1,
19511            workspace_id: None,
19512            conversation: Conversation {
19513                id: None,
19514                agent_slug: "gemini".into(),
19515                workspace: Some(PathBuf::from("/tmp/workspace")),
19516                external_id: Some("conv-77".into()),
19517                title: Some("Large recovered conversation".into()),
19518                source_path: PathBuf::from("/tmp/history.jsonl"),
19519                started_at: Some(1_700_000_000_000),
19520                ended_at: Some(1_700_000_000_999),
19521                approx_tokens: None,
19522                metadata_json: serde_json::Value::Null,
19523                messages: (0..4)
19524                    .map(|idx| Message {
19525                        id: None,
19526                        idx,
19527                        role: MessageRole::User,
19528                        author: None,
19529                        created_at: Some(1_700_000_000_000 + idx),
19530                        content: format!("message-{idx}"),
19531                        extra_json: serde_json::Value::Null,
19532                        snippets: Vec::new(),
19533                    })
19534                    .collect(),
19535                source_id: LOCAL_SOURCE_ID.into(),
19536                origin_host: None,
19537            },
19538        };
19539
19540        let totals = SqliteStorage::import_historical_batch_with_retry(
19541            std::slice::from_ref(&entry),
19542            &mut |batch| {
19543                attempts.push(
19544                    batch
19545                        .iter()
19546                        .map(|entry| entry.conversation.messages.len())
19547                        .collect(),
19548                );
19549                let total_messages: usize = batch
19550                    .iter()
19551                    .map(|entry| entry.conversation.messages.len())
19552                    .sum();
19553                if total_messages > 1 {
19554                    Err(anyhow!("out of memory"))
19555                } else {
19556                    Ok(HistoricalBatchImportTotals {
19557                        inserted_source_rows: batch.len(),
19558                        inserted_messages: total_messages,
19559                    })
19560                }
19561            },
19562        )
19563        .unwrap();
19564
19565        assert_eq!(
19566            totals,
19567            HistoricalBatchImportTotals {
19568                inserted_source_rows: 1,
19569                inserted_messages: 4,
19570            }
19571        );
19572        assert_eq!(attempts.first().cloned(), Some(vec![4]));
19573        assert!(
19574            attempts.iter().filter(|sizes| sizes == &&vec![1]).count() >= 4,
19575            "expected recursive fallback to reach one-message slices"
19576        );
19577    }
19578
19579    #[test]
19580    fn salvage_historical_databases_resumes_from_progress_checkpoint() {
19581        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19582        use std::path::PathBuf;
19583
19584        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19585            Conversation {
19586                id: None,
19587                agent_slug: "codex".into(),
19588                workspace: Some(PathBuf::from("/tmp/workspace")),
19589                external_id: Some(format!("conv-{idx_seed}")),
19590                title: Some(format!("Recovered {idx_seed}")),
19591                source_path: PathBuf::from(source_path),
19592                started_at: Some(1_700_000_000_000 + idx_seed),
19593                ended_at: Some(1_700_000_000_100 + idx_seed),
19594                approx_tokens: None,
19595                metadata_json: serde_json::Value::Null,
19596                messages: vec![Message {
19597                    id: None,
19598                    idx: 0,
19599                    role: MessageRole::User,
19600                    author: None,
19601                    created_at: Some(1_700_000_000_000 + idx_seed),
19602                    content: format!("message-{idx_seed}"),
19603                    extra_json: serde_json::Value::Null,
19604                    snippets: Vec::new(),
19605                }],
19606                source_id: LOCAL_SOURCE_ID.into(),
19607                origin_host: None,
19608            }
19609        }
19610
19611        let dir = TempDir::new().unwrap();
19612        let canonical_db = dir.path().join("agent_search.db");
19613        let backup_db = dir
19614            .path()
19615            .join("backups/agent_search.db.20260322T020200.bak");
19616        let storage = SqliteStorage::open(&canonical_db).unwrap();
19617        let conv_a = make_conv("/tmp/one.jsonl", 1);
19618        let conv_b = make_conv("/tmp/two.jsonl", 2);
19619        let conv_c = make_conv("/tmp/three.jsonl", 3);
19620        seed_historical_db_direct(
19621            &backup_db,
19622            &[conv_a.clone(), conv_b.clone(), conv_c.clone()],
19623        );
19624
19625        let agent = Agent {
19626            id: None,
19627            slug: "codex".into(),
19628            name: "Codex".into(),
19629            version: Some("0.2.3".into()),
19630            kind: AgentKind::Cli,
19631        };
19632        let agent_id = storage.ensure_agent(&agent).unwrap();
19633        storage
19634            .insert_conversation_tree(agent_id, None, &conv_a)
19635            .unwrap();
19636
19637        let bundle = discover_historical_database_bundles(&canonical_db)
19638            .into_iter()
19639            .find(|bundle| bundle.root_path == backup_db)
19640            .unwrap();
19641        let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19642            .unwrap()
19643            .query_row_map(
19644                "SELECT id FROM conversations WHERE source_path = ?1",
19645                fparams!["/tmp/one.jsonl"],
19646                |row| row.get_typed(0),
19647            )
19648            .unwrap();
19649        storage
19650            .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
19651            .unwrap();
19652
19653        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19654        assert_eq!(outcome.bundles_imported, 1);
19655        assert_eq!(outcome.conversations_imported, 52);
19656        assert_eq!(outcome.messages_imported, 101);
19657        assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);
19658
19659        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19660        let progress_left: Option<String> = storage
19661            .conn
19662            .query_row_map(
19663                "SELECT value FROM meta WHERE key = ?1",
19664                fparams![progress_key.as_str()],
19665                |row| row.get_typed(0),
19666            )
19667            .optional()
19668            .unwrap();
19669        assert!(
19670            progress_left.is_none(),
19671            "completed salvage should clear bundle progress"
19672        );
19673
19674        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19675        assert_eq!(second.bundles_imported, 0);
19676        assert_eq!(second.messages_imported, 0);
19677    }
19678
19679    #[test]
19680    fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
19681        // Regression for issue #247 (coding_agent_session_search-r8pcy): a bundle
19682        // whose progress checkpoint already covers the backup's entire conversation
19683        // row-id space (daemon OOM-killed after the last batch committed but before
19684        // the completion ledger marker landed) must be ledgered + skipped, not
19685        // re-scanned O(n) with imported=0 every batch.
19686        use crate::model::types::{Conversation, Message, MessageRole};
19687        use std::path::PathBuf;
19688
19689        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19690            Conversation {
19691                id: None,
19692                agent_slug: "codex".into(),
19693                workspace: Some(PathBuf::from("/tmp/workspace")),
19694                external_id: Some(format!("conv-{idx_seed}")),
19695                title: Some(format!("Recovered {idx_seed}")),
19696                source_path: PathBuf::from(source_path),
19697                started_at: Some(1_700_000_000_000 + idx_seed),
19698                ended_at: Some(1_700_000_000_100 + idx_seed),
19699                approx_tokens: None,
19700                metadata_json: serde_json::Value::Null,
19701                messages: vec![Message {
19702                    id: None,
19703                    idx: 0,
19704                    role: MessageRole::User,
19705                    author: None,
19706                    created_at: Some(1_700_000_000_000 + idx_seed),
19707                    content: format!("message-{idx_seed}"),
19708                    extra_json: serde_json::Value::Null,
19709                    snippets: Vec::new(),
19710                }],
19711                source_id: LOCAL_SOURCE_ID.into(),
19712                origin_host: None,
19713            }
19714        }
19715
19716        let dir = TempDir::new().unwrap();
19717        let canonical_db = dir.path().join("agent_search.db");
19718        let backup_db = dir
19719            .path()
19720            .join("backups/agent_search.db.20260322T020200.bak");
19721        let storage = SqliteStorage::open(&canonical_db).unwrap();
19722        seed_historical_db_direct(
19723            &backup_db,
19724            &[
19725                make_conv("/tmp/one.jsonl", 1),
19726                make_conv("/tmp/two.jsonl", 2),
19727                make_conv("/tmp/three.jsonl", 3),
19728            ],
19729        );
19730
19731        let bundle = discover_historical_database_bundles(&canonical_db)
19732            .into_iter()
19733            .find(|bundle| bundle.root_path == backup_db)
19734            .unwrap();
19735
19736        // Checkpoint high-water mark == backup's max conversation id.
19737        let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19738            .unwrap()
19739            .query_row_map(
19740                "SELECT COALESCE(MAX(id), 0) FROM conversations",
19741                fparams![],
19742                |row| row.get_typed(0),
19743            )
19744            .unwrap();
19745        assert!(backup_max_id > 0, "seeded backup should have conversations");
19746        storage
19747            .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
19748            .unwrap();
19749
19750        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19751        assert_eq!(
19752            outcome.bundles_imported, 0,
19753            "fully-checkpointed bundle must not be re-scanned"
19754        );
19755        assert_eq!(outcome.conversations_imported, 0);
19756        assert_eq!(outcome.messages_imported, 0);
19757        assert_eq!(
19758            storage.list_conversations(10, 0).unwrap().len(),
19759            0,
19760            "skip path must not import anything"
19761        );
19762        assert!(
19763            storage.historical_bundle_already_imported(&bundle).unwrap(),
19764            "skipped bundle must be ledgered as salvaged so future runs short-circuit"
19765        );
19766
19767        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19768        let progress_left: Option<String> = storage
19769            .conn
19770            .query_row_map(
19771                "SELECT value FROM meta WHERE key = ?1",
19772                fparams![progress_key.as_str()],
19773                |row| row.get_typed(0),
19774            )
19775            .optional()
19776            .unwrap();
19777        assert!(
19778            progress_left.is_none(),
19779            "skip path must clear the bundle progress checkpoint"
19780        );
19781    }
19782
19783    #[test]
19784    fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
19785        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19786        use std::path::PathBuf;
19787
19788        let dir = TempDir::new().unwrap();
19789        let db_path = dir.path().join("agent_search.db");
19790        let storage = SqliteStorage::open(&db_path).unwrap();
19791        let agent = Agent {
19792            id: None,
19793            slug: "codex".into(),
19794            name: "Codex".into(),
19795            version: Some("0.2.3".into()),
19796            kind: AgentKind::Cli,
19797        };
19798        let agent_id = storage.ensure_agent(&agent).unwrap();
19799
19800        let make_conv = |source_path: &str, started_at: i64| Conversation {
19801            id: None,
19802            agent_slug: "codex".into(),
19803            workspace: Some(PathBuf::from("/tmp/workspace")),
19804            external_id: Some(source_path.to_string()),
19805            title: Some(source_path.to_string()),
19806            source_path: PathBuf::from(source_path),
19807            started_at: Some(started_at),
19808            ended_at: Some(started_at + 1),
19809            approx_tokens: None,
19810            metadata_json: serde_json::Value::Null,
19811            messages: vec![Message {
19812                id: None,
19813                idx: 0,
19814                role: MessageRole::User,
19815                author: None,
19816                created_at: Some(started_at),
19817                content: format!("message for {source_path}"),
19818                extra_json: serde_json::Value::Null,
19819                snippets: Vec::new(),
19820            }],
19821            source_id: LOCAL_SOURCE_ID.into(),
19822            origin_host: None,
19823        };
19824
19825        let conv_a = make_conv("/tmp/a.jsonl", 3_000);
19826        let conv_b = make_conv("/tmp/b.jsonl", 1_000);
19827        let conv_c = make_conv("/tmp/c.jsonl", 2_000);
19828
19829        storage
19830            .insert_conversation_tree(agent_id, None, &conv_a)
19831            .unwrap();
19832        storage
19833            .insert_conversation_tree(agent_id, None, &conv_b)
19834            .unwrap();
19835        storage
19836            .insert_conversation_tree(agent_id, None, &conv_c)
19837            .unwrap();
19838
19839        let user_order: Vec<PathBuf> = storage
19840            .list_conversations(10, 0)
19841            .unwrap()
19842            .into_iter()
19843            .map(|conv| conv.source_path)
19844            .collect();
19845        assert_eq!(
19846            user_order,
19847            vec![
19848                PathBuf::from("/tmp/a.jsonl"),
19849                PathBuf::from("/tmp/c.jsonl"),
19850                PathBuf::from("/tmp/b.jsonl"),
19851            ]
19852        );
19853
19854        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
19855        let rebuild_order: Vec<PathBuf> = storage
19856            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
19857            .unwrap()
19858            .into_iter()
19859            .map(|conv| conv.source_path)
19860            .collect();
19861        assert_eq!(
19862            rebuild_order,
19863            vec![
19864                PathBuf::from("/tmp/a.jsonl"),
19865                PathBuf::from("/tmp/b.jsonl"),
19866                PathBuf::from("/tmp/c.jsonl"),
19867            ]
19868        );
19869
19870        let first_page = storage
19871            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
19872            .unwrap();
19873        let first_page_paths: Vec<PathBuf> = first_page
19874            .iter()
19875            .map(|conv| conv.source_path.clone())
19876            .collect();
19877        assert_eq!(
19878            first_page_paths,
19879            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
19880        );
19881
19882        let second_page = storage
19883            .list_conversations_for_lexical_rebuild_after_id(
19884                2,
19885                first_page
19886                    .last()
19887                    .and_then(|conv| conv.id)
19888                    .expect("first page should include an id"),
19889                &agent_slugs,
19890                &workspace_paths,
19891            )
19892            .unwrap();
19893        let second_page_paths: Vec<PathBuf> = second_page
19894            .iter()
19895            .map(|conv| conv.source_path.clone())
19896            .collect();
19897        assert_eq!(second_page_paths, vec![PathBuf::from("/tmp/c.jsonl")]);
19898
19899        let bounded_page = storage
19900            .list_conversations_for_lexical_rebuild_after_id_through_id(
19901                10,
19902                0,
19903                first_page
19904                    .last()
19905                    .and_then(|conv| conv.id)
19906                    .expect("first page should include an id"),
19907                &agent_slugs,
19908                &workspace_paths,
19909            )
19910            .unwrap();
19911        let bounded_paths: Vec<PathBuf> = bounded_page
19912            .iter()
19913            .map(|conv| conv.source_path.clone())
19914            .collect();
19915        assert_eq!(
19916            bounded_paths,
19917            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
19918        );
19919    }
19920
19921    #[test]
19922    fn keyset_traversal_handles_sparse_holey_conversation_ids() {
19923        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19924        use std::path::PathBuf;
19925
19926        let dir = TempDir::new().unwrap();
19927        let db_path = dir.path().join("agent_search.db");
19928        let storage = SqliteStorage::open(&db_path).unwrap();
19929        let agent = Agent {
19930            id: None,
19931            slug: "codex".into(),
19932            name: "Codex".into(),
19933            version: Some("0.2.3".into()),
19934            kind: AgentKind::Cli,
19935        };
19936        let agent_id = storage.ensure_agent(&agent).unwrap();
19937
19938        let make_conv = |label: &str, ts: i64| Conversation {
19939            id: None,
19940            agent_slug: "codex".into(),
19941            workspace: Some(PathBuf::from("/tmp/workspace")),
19942            external_id: Some(label.to_string()),
19943            title: Some(label.to_string()),
19944            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
19945            started_at: Some(ts),
19946            ended_at: Some(ts + 1),
19947            approx_tokens: None,
19948            metadata_json: serde_json::Value::Null,
19949            messages: vec![Message {
19950                id: None,
19951                idx: 0,
19952                role: MessageRole::User,
19953                author: None,
19954                created_at: Some(ts),
19955                content: format!("msg for {label}"),
19956                extra_json: serde_json::Value::Null,
19957                snippets: Vec::new(),
19958            }],
19959            source_id: LOCAL_SOURCE_ID.into(),
19960            origin_host: None,
19961        };
19962
19963        for i in 0..6 {
19964            storage
19965                .insert_conversation_tree(
19966                    agent_id,
19967                    None,
19968                    &make_conv(&format!("conv-{i}"), 1000 + i),
19969                )
19970                .unwrap();
19971        }
19972
19973        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
19974        storage
19975            .conn
19976            .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
19977            .unwrap();
19978        storage
19979            .conn
19980            .execute_compat(
19981                "DELETE FROM messages WHERE conversation_id IN (2, 4)",
19982                fparams![],
19983            )
19984            .unwrap();
19985        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
19986
19987        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
19988
19989        let page1 = storage
19990            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
19991            .unwrap();
19992        assert_eq!(page1.len(), 2);
19993        let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
19994        assert_eq!(page1_ids, vec![1, 3]);
19995
19996        let page2 = storage
19997            .list_conversations_for_lexical_rebuild_after_id(
19998                2,
19999                *page1_ids.last().unwrap(),
20000                &agent_slugs,
20001                &workspace_paths,
20002            )
20003            .unwrap();
20004        assert_eq!(page2.len(), 2);
20005        let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
20006        assert_eq!(page2_ids, vec![5, 6]);
20007
20008        let page3 = storage
20009            .list_conversations_for_lexical_rebuild_after_id(
20010                2,
20011                *page2_ids.last().unwrap(),
20012                &agent_slugs,
20013                &workspace_paths,
20014            )
20015            .unwrap();
20016        assert!(page3.is_empty());
20017
20018        let all_ids: Vec<i64> = page1_ids.iter().chain(page2_ids.iter()).copied().collect();
20019        assert_eq!(all_ids, vec![1, 3, 5, 6]);
20020    }
20021
20022    #[test]
20023    fn keyset_traversal_through_id_with_sparse_ranges() {
20024        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20025        use std::path::PathBuf;
20026
20027        let dir = TempDir::new().unwrap();
20028        let db_path = dir.path().join("agent_search.db");
20029        let storage = SqliteStorage::open(&db_path).unwrap();
20030        let agent = Agent {
20031            id: None,
20032            slug: "codex".into(),
20033            name: "Codex".into(),
20034            version: Some("0.2.3".into()),
20035            kind: AgentKind::Cli,
20036        };
20037        let agent_id = storage.ensure_agent(&agent).unwrap();
20038
20039        let make_conv = |label: &str, ts: i64| Conversation {
20040            id: None,
20041            agent_slug: "codex".into(),
20042            workspace: Some(PathBuf::from("/tmp/workspace")),
20043            external_id: Some(label.to_string()),
20044            title: Some(label.to_string()),
20045            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20046            started_at: Some(ts),
20047            ended_at: Some(ts + 1),
20048            approx_tokens: None,
20049            metadata_json: serde_json::Value::Null,
20050            messages: vec![Message {
20051                id: None,
20052                idx: 0,
20053                role: MessageRole::User,
20054                author: None,
20055                created_at: Some(ts),
20056                content: format!("msg for {label}"),
20057                extra_json: serde_json::Value::Null,
20058                snippets: Vec::new(),
20059            }],
20060            source_id: LOCAL_SOURCE_ID.into(),
20061            origin_host: None,
20062        };
20063
20064        for i in 0..10 {
20065            storage
20066                .insert_conversation_tree(
20067                    agent_id,
20068                    None,
20069                    &make_conv(&format!("conv-{i}"), 1000 + i),
20070                )
20071                .unwrap();
20072        }
20073
20074        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20075        storage
20076            .conn
20077            .execute_compat(
20078                "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
20079                fparams![],
20080            )
20081            .unwrap();
20082        storage
20083            .conn
20084            .execute_compat(
20085                "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
20086                fparams![],
20087            )
20088            .unwrap();
20089        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20090
20091        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20092
20093        let through_5 = storage
20094            .list_conversations_for_lexical_rebuild_after_id_through_id(
20095                100,
20096                0,
20097                5,
20098                &agent_slugs,
20099                &workspace_paths,
20100            )
20101            .unwrap();
20102        let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
20103        assert_eq!(through_5_ids, vec![1, 2, 4]);
20104
20105        let after_4_through_10 = storage
20106            .list_conversations_for_lexical_rebuild_after_id_through_id(
20107                100,
20108                4,
20109                10,
20110                &agent_slugs,
20111                &workspace_paths,
20112            )
20113            .unwrap();
20114        let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
20115        assert_eq!(ids, vec![6, 9, 10]);
20116
20117        let after_10 = storage
20118            .list_conversations_for_lexical_rebuild_after_id_through_id(
20119                100,
20120                10,
20121                20,
20122                &agent_slugs,
20123                &workspace_paths,
20124            )
20125            .unwrap();
20126        assert!(after_10.is_empty());
20127    }
20128
20129    #[test]
20130    fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
20131     {
20132        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20133        use std::path::PathBuf;
20134
20135        let dir = TempDir::new().unwrap();
20136        let db_path = dir.path().join("agent_search.db");
20137        let storage = SqliteStorage::open(&db_path).unwrap();
20138        let agent = Agent {
20139            id: None,
20140            slug: "codex".into(),
20141            name: "Codex".into(),
20142            version: Some("0.2.3".into()),
20143            kind: AgentKind::Cli,
20144        };
20145        let agent_id = storage.ensure_agent(&agent).unwrap();
20146
20147        let insert = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
20148            storage
20149                .insert_conversation_tree(
20150                    agent_id,
20151                    None,
20152                    &Conversation {
20153                        id: None,
20154                        agent_slug: "codex".into(),
20155                        workspace: Some(PathBuf::from("/tmp/workspace")),
20156                        external_id: Some(external_id.to_string()),
20157                        title: Some(external_id.to_string()),
20158                        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
20159                        started_at: Some(base_ts),
20160                        ended_at: Some(base_ts + 100),
20161                        approx_tokens: None,
20162                        metadata_json: serde_json::Value::Null,
20163                        messages,
20164                        source_id: LOCAL_SOURCE_ID.into(),
20165                        origin_host: None,
20166                    },
20167                )
20168                .unwrap()
20169                .conversation_id
20170        };
20171
20172        let ascii_id = insert(
20173            "footprint-ascii",
20174            1_700_000_000_000,
20175            vec![
20176                Message {
20177                    id: None,
20178                    idx: 0,
20179                    role: MessageRole::User,
20180                    author: None,
20181                    created_at: Some(1_700_000_000_001),
20182                    content: "abc".into(),
20183                    extra_json: serde_json::Value::Null,
20184                    snippets: Vec::new(),
20185                },
20186                Message {
20187                    id: None,
20188                    idx: 1,
20189                    role: MessageRole::Agent,
20190                    author: None,
20191                    created_at: Some(1_700_000_000_002),
20192                    content: "defg".into(),
20193                    extra_json: serde_json::Value::Null,
20194                    snippets: Vec::new(),
20195                },
20196            ],
20197        );
20198        let empty_id = insert("footprint-empty", 1_700_000_001_000, Vec::new());
20199        let utf8_id = insert(
20200            "footprint-utf8",
20201            1_700_000_002_000,
20202            vec![Message {
20203                id: None,
20204                idx: 0,
20205                role: MessageRole::Tool,
20206                author: None,
20207                created_at: Some(1_700_000_002_001),
20208                content: "hé🙂".into(),
20209                extra_json: serde_json::Value::Null,
20210                snippets: Vec::new(),
20211            }],
20212        );
20213        let sparse_id = insert(
20214            "footprint-sparse",
20215            1_700_000_003_000,
20216            vec![Message {
20217                id: None,
20218                idx: 10,
20219                role: MessageRole::User,
20220                author: None,
20221                created_at: Some(1_700_000_003_010),
20222                content: "sparse".into(),
20223                extra_json: serde_json::Value::Null,
20224                snippets: Vec::new(),
20225            }],
20226        );
20227        storage
20228            .conn
20229            .execute_compat(
20230                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20231                fparams![utf8_id],
20232            )
20233            .unwrap();
20234
20235        let footprints = storage
20236            .list_conversation_footprints_for_lexical_rebuild()
20237            .unwrap();
20238        assert_eq!(
20239            footprints,
20240            vec![
20241                LexicalRebuildConversationFootprintRow {
20242                    conversation_id: ascii_id,
20243                    message_count: 2,
20244                    message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20245                },
20246                LexicalRebuildConversationFootprintRow {
20247                    conversation_id: empty_id,
20248                    message_count: 0,
20249                    message_bytes: 0,
20250                },
20251                LexicalRebuildConversationFootprintRow {
20252                    conversation_id: utf8_id,
20253                    message_count: 1,
20254                    message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20255                },
20256                LexicalRebuildConversationFootprintRow {
20257                    conversation_id: sparse_id,
20258                    message_count: 11,
20259                    message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20260                },
20261            ]
20262        );
20263    }
20264
20265    #[test]
20266    fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
20267        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20268        use std::path::PathBuf;
20269
20270        let dir = TempDir::new().unwrap();
20271        let db_path = dir.path().join("agent_search.db");
20272        let storage = SqliteStorage::open(&db_path).unwrap();
20273        let agent = Agent {
20274            id: None,
20275            slug: "codex".into(),
20276            name: "Codex".into(),
20277            version: Some("0.2.3".into()),
20278            kind: AgentKind::Cli,
20279        };
20280        let agent_id = storage.ensure_agent(&agent).unwrap();
20281        let conversation_id = storage
20282            .insert_conversation_tree(
20283                agent_id,
20284                None,
20285                &Conversation {
20286                    id: None,
20287                    agent_slug: "codex".into(),
20288                    workspace: Some(PathBuf::from("/tmp/workspace")),
20289                    external_id: Some("footprint-missing-tail".to_string()),
20290                    title: Some("footprint-missing-tail".to_string()),
20291                    source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
20292                    started_at: Some(1_700_000_000_000),
20293                    ended_at: Some(1_700_000_000_100),
20294                    approx_tokens: None,
20295                    metadata_json: serde_json::Value::Null,
20296                    messages: vec![Message {
20297                        id: None,
20298                        idx: 10,
20299                        role: MessageRole::User,
20300                        author: None,
20301                        created_at: Some(1_700_000_000_010),
20302                        content: "legacy sparse tail".into(),
20303                        extra_json: serde_json::Value::Null,
20304                        snippets: Vec::new(),
20305                    }],
20306                    source_id: LOCAL_SOURCE_ID.into(),
20307                    origin_host: None,
20308                },
20309            )
20310            .unwrap()
20311            .conversation_id;
20312
20313        storage
20314            .conn
20315            .execute_compat(
20316                "UPDATE conversations
20317                 SET last_message_idx = NULL, last_message_created_at = NULL
20318                 WHERE id = ?1",
20319                fparams![conversation_id],
20320            )
20321            .unwrap();
20322        storage
20323            .conn
20324            .execute_compat(
20325                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20326                fparams![conversation_id],
20327            )
20328            .unwrap();
20329
20330        let footprints = storage
20331            .list_conversation_footprints_for_lexical_rebuild()
20332            .unwrap();
20333
20334        assert_eq!(
20335            footprints,
20336            vec![LexicalRebuildConversationFootprintRow {
20337                conversation_id,
20338                message_count: 11,
20339                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20340            }],
20341            "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
20342        );
20343    }
20344
20345    #[test]
20346    fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
20347        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20348        use std::path::PathBuf;
20349
20350        let dir = TempDir::new().unwrap();
20351        let db_path = dir.path().join("agent_search.db");
20352        let storage = SqliteStorage::open(&db_path).unwrap();
20353        let agent = Agent {
20354            id: None,
20355            slug: "codex".into(),
20356            name: "Codex".into(),
20357            version: Some("0.2.3".into()),
20358            kind: AgentKind::Cli,
20359        };
20360        let agent_id = storage.ensure_agent(&agent).unwrap();
20361        let conversation_id = storage
20362            .insert_conversation_tree(
20363                agent_id,
20364                None,
20365                &Conversation {
20366                    id: None,
20367                    agent_slug: "codex".into(),
20368                    workspace: Some(PathBuf::from("/tmp/workspace")),
20369                    external_id: Some("footprint-stale-tail".to_string()),
20370                    title: Some("footprint-stale-tail".to_string()),
20371                    source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
20372                    started_at: Some(1_700_000_000_000),
20373                    ended_at: Some(1_700_000_000_100),
20374                    approx_tokens: None,
20375                    metadata_json: serde_json::Value::Null,
20376                    messages: (0..3)
20377                        .map(|idx| Message {
20378                            id: None,
20379                            idx,
20380                            role: MessageRole::User,
20381                            author: None,
20382                            created_at: Some(1_700_000_000_010 + idx),
20383                            content: format!("message {idx}"),
20384                            extra_json: serde_json::Value::Null,
20385                            snippets: Vec::new(),
20386                        })
20387                        .collect(),
20388                    source_id: LOCAL_SOURCE_ID.into(),
20389                    origin_host: None,
20390                },
20391            )
20392            .unwrap()
20393            .conversation_id;
20394
20395        storage
20396            .conn
20397            .execute_compat(
20398                "UPDATE conversations
20399                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20400                 WHERE id = ?1",
20401                fparams![conversation_id],
20402            )
20403            .unwrap();
20404        storage
20405            .conn
20406            .execute_compat(
20407                "UPDATE conversation_tail_state
20408                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20409                 WHERE conversation_id = ?1",
20410                fparams![conversation_id],
20411            )
20412            .unwrap();
20413
20414        let footprints = storage
20415            .list_conversation_footprints_for_lexical_rebuild()
20416            .unwrap();
20417
20418        assert_eq!(
20419            footprints,
20420            vec![LexicalRebuildConversationFootprintRow {
20421                conversation_id,
20422                message_count: 3,
20423                message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20424            }],
20425            "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
20426        );
20427    }
20428
20429    #[test]
20430    fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
20431        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20432        use std::path::PathBuf;
20433
20434        let dir = TempDir::new().unwrap();
20435        let db_path = dir.path().join("agent_search.db");
20436        let storage = SqliteStorage::open(&db_path).unwrap();
20437        let agent = Agent {
20438            id: None,
20439            slug: "codex".into(),
20440            name: "Codex".into(),
20441            version: Some("0.2.3".into()),
20442            kind: AgentKind::Cli,
20443        };
20444        let agent_id = storage.ensure_agent(&agent).unwrap();
20445        let conversation_id = storage
20446            .insert_conversation_tree(
20447                agent_id,
20448                None,
20449                &Conversation {
20450                    id: None,
20451                    agent_slug: "codex".into(),
20452                    workspace: Some(PathBuf::from("/tmp/workspace")),
20453                    external_id: Some("footprint-missing-tail-table".to_string()),
20454                    title: Some("footprint-missing-tail-table".to_string()),
20455                    source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
20456                    started_at: Some(1_700_000_000_000),
20457                    ended_at: Some(1_700_000_000_100),
20458                    approx_tokens: None,
20459                    metadata_json: serde_json::Value::Null,
20460                    messages: vec![Message {
20461                        id: None,
20462                        idx: 10,
20463                        role: MessageRole::User,
20464                        author: None,
20465                        created_at: Some(1_700_000_000_010),
20466                        content: "legacy sparse tail without hot table".into(),
20467                        extra_json: serde_json::Value::Null,
20468                        snippets: Vec::new(),
20469                    }],
20470                    source_id: LOCAL_SOURCE_ID.into(),
20471                    origin_host: None,
20472                },
20473            )
20474            .unwrap()
20475            .conversation_id;
20476
20477        storage
20478            .conn
20479            .execute_compat(
20480                "UPDATE conversations
20481                 SET last_message_idx = NULL, last_message_created_at = NULL
20482                 WHERE id = ?1",
20483                fparams![conversation_id],
20484            )
20485            .unwrap();
20486        storage
20487            .conn
20488            .execute_compat("DROP TABLE conversation_tail_state", fparams![])
20489            .unwrap();
20490
20491        let footprints = storage
20492            .list_conversation_footprints_for_lexical_rebuild()
20493            .unwrap();
20494
20495        assert_eq!(
20496            footprints,
20497            vec![LexicalRebuildConversationFootprintRow {
20498                conversation_id,
20499                message_count: 11,
20500                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20501            }],
20502            "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
20503        );
20504    }
20505
20506    #[test]
20507    fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
20508        let fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
20509            .join("tests")
20510            .join("fixtures")
20511            .join("search_demo_data")
20512            .join("agent_search.db");
20513        let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();
20514
20515        let footprints = storage
20516            .list_conversation_footprints_for_lexical_rebuild()
20517            .unwrap();
20518
20519        assert!(
20520            !footprints.is_empty(),
20521            "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
20522        );
20523        assert!(
20524            footprints
20525                .iter()
20526                .all(|footprint| footprint.message_count > 0),
20527            "legacy fixture conversations should derive message counts from messages when tail caches are absent"
20528        );
20529    }
20530
20531    #[test]
20532    fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
20533        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20534        use std::path::PathBuf;
20535
20536        let dir = TempDir::new().unwrap();
20537        let db_path = dir.path().join("agent_search.db");
20538        let storage = SqliteStorage::open(&db_path).unwrap();
20539        let agent = Agent {
20540            id: None,
20541            slug: "codex".into(),
20542            name: "Codex".into(),
20543            version: Some("0.2.3".into()),
20544            kind: AgentKind::Cli,
20545        };
20546        let agent_id = storage.ensure_agent(&agent).unwrap();
20547        let conversation = Conversation {
20548            id: None,
20549            agent_slug: "codex".into(),
20550            workspace: Some(PathBuf::from("/tmp/workspace")),
20551            external_id: Some("legacy-blank-source".into()),
20552            title: Some("Legacy blank source".into()),
20553            source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
20554            started_at: Some(1_700_000_000_000),
20555            ended_at: Some(1_700_000_000_100),
20556            approx_tokens: None,
20557            metadata_json: serde_json::Value::Null,
20558            messages: vec![Message {
20559                id: None,
20560                idx: 0,
20561                role: MessageRole::User,
20562                author: None,
20563                created_at: Some(1_700_000_000_000),
20564                content: "hello".into(),
20565                extra_json: serde_json::Value::Null,
20566                snippets: Vec::new(),
20567            }],
20568            source_id: LOCAL_SOURCE_ID.into(),
20569            origin_host: None,
20570        };
20571
20572        let conversation_id = storage
20573            .insert_conversation_tree(agent_id, None, &conversation)
20574            .unwrap()
20575            .conversation_id;
20576        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20577        storage
20578            .conn
20579            .execute_compat(
20580                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
20581                fparams!["   ", "dev@laptop", conversation_id],
20582            )
20583            .unwrap();
20584        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20585
20586        let listed = storage.list_conversations(10, 0).unwrap();
20587        assert_eq!(listed.len(), 1);
20588        assert_eq!(listed[0].source_id, "dev@laptop");
20589        assert_eq!(listed[0].origin_host.as_deref(), Some("dev@laptop"));
20590
20591        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20592        let rebuild_listed = storage
20593            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
20594            .unwrap();
20595        assert_eq!(rebuild_listed.len(), 1);
20596        assert_eq!(rebuild_listed[0].source_id, "dev@laptop");
20597        assert_eq!(rebuild_listed[0].origin_host.as_deref(), Some("dev@laptop"));
20598    }
20599
    /// End-to-end check of `seed_canonical_from_best_historical_bundle`.
    ///
    /// Steps, in order:
    /// 1. Build a source DB under `backups/` holding one conversation with one
    ///    message, plus runtime meta (last scan ts, last indexed at, last
    ///    embedded message id) and a `historical_bundle_salvaged:stale` key.
    /// 2. Rewrite that file through a raw fixture connection so it reports
    ///    schema version 13 and carries two `fts_messages` rows in
    ///    `sqlite_master`.
    /// 3. Open and close a fresh canonical DB, then seed it from the bundle.
    /// 4. Assert the import counts, that the runtime meta values are absent in
    ///    the canonical DB, that exactly one salvage key exists, that only one
    ///    `fts_messages` schema row remains, and that `fts_messages` can be
    ///    queried after the FTS consistency pass.
    #[test]
    fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
        use std::path::PathBuf;

        let dir = TempDir::new().unwrap();
        let canonical_db = dir.path().join("agent_search.db");
        let source_db = dir
            .path()
            .join("backups/agent_search.db.20260322T020200.bak");

        fs::create_dir_all(source_db.parent().unwrap()).unwrap();

        let source = SqliteStorage::open(&source_db).unwrap();
        let agent = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: Some("0.2.3".into()),
            kind: AgentKind::Cli,
        };
        let agent_id = source.ensure_agent(&agent).unwrap();
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("seed-conv".into()),
            title: Some("Historical seed".into()),
            source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::json!({"seed": true}),
            messages: vec![Message {
                id: None,
                idx: 0,
                role: MessageRole::Agent,
                author: Some("assistant".into()),
                created_at: Some(1_700_000_000_050),
                content: "seeded message".into(),
                extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
                snippets: Vec::new(),
            }],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        source
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        // Runtime bookkeeping written into the source bundle; the assertions
        // further down expect none of these values in the seeded canonical DB.
        source.set_last_scan_ts(123).unwrap();
        source.set_last_indexed_at(456).unwrap();
        source.set_last_embedded_message_id(789).unwrap();
        source
            .conn
            .execute_compat(
                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
                fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
            )
            .unwrap();
        // Release the storage handle before the same file is reopened through
        // the raw fixture connection below.
        drop(source);

        // Legacy "duplicate FTS" fixture reconstruction.
        //
        // Post-V14 migration cass drops the V13-era fts_messages virtual table
        // and recreates it lazily, so a freshly-opened canonical DB has zero
        // fts_messages entries in sqlite_master. To reproduce the historical
        // failure mode this test exercises — a legacy v13 bundle with a
        // duplicated CREATE VIRTUAL TABLE row — we have to inject *both*
        // entries: the original V13-era contentless row and the buggy duplicate
        // row. Before V14 existed the original was already present after
        // migration and only the duplicate needed manual injection.
        let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
        let legacy = rusqlite_test_fixture_conn(&source_db);
        // Roll the recorded schema version back to 13, forget migration 14,
        // and enable direct writes to sqlite_master for the inserts below.
        legacy
            .execute_batch(
                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
                 DELETE FROM _schema_migrations WHERE version = 14;
                 PRAGMA writable_schema = ON;",
            )
            .unwrap();
        legacy
            .execute(
                "DELETE FROM meta WHERE key = ?1",
                [FTS_FRANKEN_REBUILD_META_KEY],
            )
            .unwrap();
        // Inject the V13 original first.
        legacy
            .execute(
                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
                [legacy_v13_fts_sql],
            )
            .unwrap();
        // Then the duplicate that's the real subject of the fixup logic.
        legacy
            .execute(
                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
                [duplicate_legacy_fts_sql],
            )
            .unwrap();
        legacy
            .execute_batch("PRAGMA writable_schema = OFF;")
            .unwrap();
        drop(legacy);

        // Verify fixture with rusqlite+writable_schema to see raw
        // sqlite_master rows (frankensqlite deduplicates schema entries).
        {
            let verify = rusqlite_test_fixture_conn(&source_db);
            verify
                .execute_batch("PRAGMA writable_schema = ON;")
                .unwrap();
            let fts_entries: i64 = verify
                .query_row(
                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                    [],
                    |row| row.get(0),
                )
                .unwrap();
            assert_eq!(
                fts_entries, 2,
                "test fixture should reproduce the duplicate legacy fts_messages rows"
            );
            let msg_count: i64 = verify
                .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
                .unwrap();
            assert_eq!(msg_count, 1);
        }

        // Open and immediately close the canonical DB before seeding.
        // NOTE(review): presumably this creates and migrates an empty
        // canonical file — confirm against `SqliteStorage::open`.
        let fresh = SqliteStorage::open(&canonical_db).unwrap();
        drop(fresh);

        let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
            .unwrap()
            .unwrap();
        assert_eq!(outcome.bundles_imported, 1);
        assert_eq!(outcome.conversations_imported, 1);
        assert_eq!(outcome.messages_imported, 1);

        // The seeded data must be visible through a plain read-only connection.
        let readonly = open_franken_with_flags(
            &canonical_db.to_string_lossy(),
            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
        )
        .unwrap();
        let readonly_message_count: i64 = readonly
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(readonly_message_count, 1);

        let seeded = SqliteStorage::open(&canonical_db).unwrap();
        assert_eq!(
            seeded
                .count_sessions_in_range(None, None, None, None)
                .unwrap()
                .0,
            1
        );
        let message_count: i64 = seeded
            .conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(message_count, 1);
        // The source bundle's scan/embedding bookkeeping must not carry over.
        assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
        assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);

        let last_indexed: Option<String> = seeded
            .conn
            .query_row_map(
                "SELECT value FROM meta WHERE key = 'last_indexed_at'",
                fparams![],
                |row| row.get_typed(0),
            )
            .optional()
            .unwrap();
        assert!(last_indexed.is_none());

        // Exactly one salvage marker is expected; only the number of keys is
        // checked here, not which key it is.
        let salvage_keys: Vec<String> = seeded
            .conn
            .query_map_collect(
                "SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(salvage_keys.len(), 1);

        let reopened_readonly = open_franken_with_flags(
            &canonical_db.to_string_lossy(),
            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
        )
        .unwrap();
        let reopened_fts_entries: i64 = reopened_readonly
            .query_row_map(
                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(
            reopened_fts_entries, 1,
            "seeded canonical db should keep a single stock-SQLite fts_messages schema row"
        );
        let reopened_message_count: i64 = reopened_readonly
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .unwrap();
        assert_eq!(reopened_message_count, 1);

        let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
        assert_eq!(
            franken_seeded.schema_version().unwrap(),
            CURRENT_SCHEMA_VERSION
        );
        // Post-V14 fts_messages is recreated lazily. `FrankenStorage::open`
        // alone doesn't re-register the virtual table for the frankensqlite
        // query engine — the consistency pass does, and this is exactly what
        // normal cass startup runs before the first search. Invoke it
        // explicitly so the query below exercises the expected post-repair
        // state rather than the between-steps state.
        franken_seeded
            .ensure_search_fallback_fts_consistency()
            .expect("ensure FTS consistency after seed");
        let post_franken_schema_rows: i64 = franken_seeded
            .raw()
            .query_row_map(
                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(post_franken_schema_rows, 1);
        // Final smoke check: the FTS table must accept a query without error.
        assert!(
            franken_seeded
                .raw()
                .query("SELECT rowid FROM fts_messages LIMIT 1")
                .is_ok()
        );
    }
20846
20847    #[test]
20848    fn failed_baseline_seed_preserves_existing_canonical_bundle() {
20849        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20850        use std::path::PathBuf;
20851
20852        let dir = TempDir::new().unwrap();
20853        let canonical_db = dir.path().join("agent_search.db");
20854        let source_db = dir
20855            .path()
20856            .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
20857
20858        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
20859
20860        let canonical = SqliteStorage::open(&canonical_db).unwrap();
20861        canonical
20862            .conn
20863            .execute_compat(
20864                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
20865                fparams!["sentinel", "keep-me"],
20866            )
20867            .unwrap();
20868        drop(canonical);
20869
20870        let source = SqliteStorage::open(&source_db).unwrap();
20871        let agent = Agent {
20872            id: None,
20873            slug: "codex".into(),
20874            name: "Codex".into(),
20875            version: Some("0.2.3".into()),
20876            kind: AgentKind::Cli,
20877        };
20878        let agent_id = source.ensure_agent(&agent).unwrap();
20879        let conversation = Conversation {
20880            id: None,
20881            agent_slug: "codex".into(),
20882            workspace: Some(PathBuf::from("/tmp/workspace")),
20883            external_id: Some("bad-seed-conv".into()),
20884            title: Some("Bad seed".into()),
20885            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
20886            started_at: Some(1_700_000_000_000),
20887            ended_at: Some(1_700_000_000_100),
20888            approx_tokens: Some(42),
20889            metadata_json: serde_json::json!({"seed": "bad"}),
20890            messages: vec![Message {
20891                id: None,
20892                idx: 0,
20893                role: MessageRole::Agent,
20894                author: Some("assistant".into()),
20895                created_at: Some(1_700_000_000_050),
20896                content: "this seed should fail".into(),
20897                extra_json: serde_json::Value::Null,
20898                snippets: Vec::new(),
20899            }],
20900            source_id: LOCAL_SOURCE_ID.into(),
20901            origin_host: None,
20902        };
20903        source
20904            .insert_conversation_tree(agent_id, None, &conversation)
20905            .unwrap();
20906        drop(source);
20907
20908        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
20909        legacy
20910            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
20911            .unwrap();
20912        drop(legacy);
20913
20914        let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
20915        assert!(
20916            err.to_string()
20917                .contains("schema_version 12 is too old for baseline import"),
20918            "unexpected seed error: {err:#}"
20919        );
20920
20921        let reopened = SqliteStorage::open(&canonical_db).unwrap();
20922        let sentinel: Option<String> = reopened
20923            .conn
20924            .query_row_map(
20925                "SELECT value FROM meta WHERE key = 'sentinel'",
20926                fparams![],
20927                |row| row.get_typed(0),
20928            )
20929            .optional()
20930            .unwrap();
20931        assert_eq!(sentinel.as_deref(), Some("keep-me"));
20932
20933        let conversation_count: i64 = reopened
20934            .conn
20935            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
20936                row.get_typed(0)
20937            })
20938            .unwrap();
20939        assert_eq!(conversation_count, 0);
20940
20941        let readonly = open_franken_with_flags(
20942            &canonical_db.to_string_lossy(),
20943            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
20944        )
20945        .unwrap();
20946        let readonly_conversation_count: i64 = readonly
20947            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
20948                row.get_typed(0)
20949            })
20950            .unwrap();
20951        assert_eq!(readonly_conversation_count, 0);
20952    }
20953
20954    #[test]
20955    fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
20956        let dir = TempDir::new().unwrap();
20957        let db_path = dir.path().join("test.db");
20958        let storage = SqliteStorage::open(&db_path).unwrap();
20959
20960        let agent = Agent {
20961            id: None,
20962            slug: "codex".into(),
20963            name: "Codex".into(),
20964            version: Some("0.2.3".into()),
20965            kind: AgentKind::Cli,
20966        };
20967        let agent_id = storage.ensure_agent(&agent).unwrap();
20968
20969        let conversation = Conversation {
20970            id: None,
20971            agent_slug: "codex".into(),
20972            workspace: Some(PathBuf::from("/tmp/workspace")),
20973            external_id: Some("lexical-rebuild-test".into()),
20974            title: Some("Lexical rebuild".into()),
20975            source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
20976            started_at: Some(1_700_000_000_000),
20977            ended_at: Some(1_700_000_000_100),
20978            approx_tokens: Some(42),
20979            metadata_json: serde_json::Value::Null,
20980            messages: vec![Message {
20981                id: None,
20982                idx: 0,
20983                role: MessageRole::Agent,
20984                author: Some("assistant".into()),
20985                created_at: Some(1_700_000_000_050),
20986                content: "indexed text".into(),
20987                extra_json: serde_json::json!({
20988                    "usage": { "total_tokens": 1234 },
20989                    "irrelevant_blob": "still preserved in canonical storage"
20990                }),
20991                snippets: Vec::new(),
20992            }],
20993            source_id: LOCAL_SOURCE_ID.into(),
20994            origin_host: None,
20995        };
20996
20997        let inserted = storage
20998            .insert_conversation_tree(agent_id, None, &conversation)
20999            .unwrap();
21000        let conversation_id = inserted.conversation_id;
21001
21002        let stored = storage.fetch_messages(conversation_id).unwrap();
21003        assert_eq!(stored.len(), 1);
21004        assert!(!stored[0].extra_json.is_null());
21005
21006        let lexical = storage
21007            .fetch_messages_for_lexical_rebuild(conversation_id)
21008            .unwrap();
21009        assert_eq!(lexical.len(), 1);
21010        assert_eq!(lexical[0].content, "indexed text");
21011        assert_eq!(lexical[0].author.as_deref(), Some("assistant"));
21012        assert!(lexical[0].extra_json.is_null());
21013    }
21014
21015    #[test]
21016    fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
21017        let dir = TempDir::new().unwrap();
21018        let db_path = dir.path().join("test.db");
21019        let storage = SqliteStorage::open(&db_path).unwrap();
21020
21021        let agent = Agent {
21022            id: None,
21023            slug: "codex".into(),
21024            name: "Codex".into(),
21025            version: Some("0.2.3".into()),
21026            kind: AgentKind::Cli,
21027        };
21028        let agent_id = storage.ensure_agent(&agent).unwrap();
21029
21030        let first = Conversation {
21031            id: None,
21032            agent_slug: "codex".into(),
21033            workspace: Some(PathBuf::from("/tmp/workspace")),
21034            external_id: Some("lexical-batch-1".into()),
21035            title: Some("Lexical batch 1".into()),
21036            source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
21037            started_at: Some(1_700_000_000_000),
21038            ended_at: Some(1_700_000_000_100),
21039            approx_tokens: Some(42),
21040            metadata_json: serde_json::Value::Null,
21041            messages: vec![
21042                Message {
21043                    id: None,
21044                    idx: 0,
21045                    role: MessageRole::User,
21046                    author: Some("user".into()),
21047                    created_at: Some(1_700_000_000_010),
21048                    content: "first-a".into(),
21049                    extra_json: serde_json::json!({"opaque": true}),
21050                    snippets: Vec::new(),
21051                },
21052                Message {
21053                    id: None,
21054                    idx: 1,
21055                    role: MessageRole::Agent,
21056                    author: Some("assistant".into()),
21057                    created_at: Some(1_700_000_000_020),
21058                    content: "first-b".into(),
21059                    extra_json: serde_json::json!({"opaque": true}),
21060                    snippets: Vec::new(),
21061                },
21062            ],
21063            source_id: LOCAL_SOURCE_ID.into(),
21064            origin_host: None,
21065        };
21066
21067        let second = Conversation {
21068            id: None,
21069            agent_slug: "codex".into(),
21070            workspace: Some(PathBuf::from("/tmp/workspace")),
21071            external_id: Some("lexical-batch-2".into()),
21072            title: Some("Lexical batch 2".into()),
21073            source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
21074            started_at: Some(1_700_000_000_200),
21075            ended_at: Some(1_700_000_000_300),
21076            approx_tokens: Some(84),
21077            metadata_json: serde_json::Value::Null,
21078            messages: vec![Message {
21079                id: None,
21080                idx: 0,
21081                role: MessageRole::Tool,
21082                author: Some("tool".into()),
21083                created_at: Some(1_700_000_000_210),
21084                content: "second-a".into(),
21085                extra_json: serde_json::json!({"opaque": true}),
21086                snippets: Vec::new(),
21087            }],
21088            source_id: LOCAL_SOURCE_ID.into(),
21089            origin_host: None,
21090        };
21091        let third = Conversation {
21092            external_id: Some("lexical-batch-3".into()),
21093            title: Some("Lexical batch 3".into()),
21094            source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
21095            messages: vec![Message {
21096                id: None,
21097                idx: 0,
21098                role: MessageRole::System,
21099                author: Some("system".into()),
21100                created_at: Some(1_700_000_000_410),
21101                content: "third-a".into(),
21102                extra_json: serde_json::json!({"opaque": true}),
21103                snippets: Vec::new(),
21104            }],
21105            ..second.clone()
21106        };
21107
21108        let first_id = storage
21109            .insert_conversation_tree(agent_id, None, &first)
21110            .unwrap()
21111            .conversation_id;
21112        let second_id = storage
21113            .insert_conversation_tree(agent_id, None, &second)
21114            .unwrap()
21115            .conversation_id;
21116        let third_id = storage
21117            .insert_conversation_tree(agent_id, None, &third)
21118            .unwrap()
21119            .conversation_id;
21120
21121        let lexical = storage
21122            .fetch_messages_for_lexical_rebuild_batch(&[third_id, first_id], None, None)
21123            .unwrap();
21124
21125        let first_messages = lexical.get(&first_id).expect("first conversation");
21126        assert_eq!(first_messages.len(), 2);
21127        assert_eq!(first_messages[0].content, "first-a");
21128        assert_eq!(first_messages[1].content, "first-b");
21129        assert!(
21130            first_messages
21131                .iter()
21132                .all(|message| message.extra_json.is_null())
21133        );
21134
21135        assert!(
21136            !lexical.contains_key(&second_id),
21137            "batch fetch must exclude conversations not requested by the caller"
21138        );
21139
21140        let third_messages = lexical.get(&third_id).expect("third conversation");
21141        assert_eq!(third_messages.len(), 1);
21142        assert_eq!(third_messages[0].content, "third-a");
21143        assert!(third_messages[0].extra_json.is_null());
21144    }
21145
21146    #[test]
21147    fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
21148        let dir = TempDir::new().unwrap();
21149        let db_path = dir.path().join("test.db");
21150        let storage = SqliteStorage::open(&db_path).unwrap();
21151
21152        let agent = Agent {
21153            id: None,
21154            slug: "codex".into(),
21155            name: "Codex".into(),
21156            version: Some("0.2.3".into()),
21157            kind: AgentKind::Cli,
21158        };
21159        let agent_id = storage.ensure_agent(&agent).unwrap();
21160
21161        let conversation = Conversation {
21162            id: None,
21163            agent_slug: "codex".into(),
21164            workspace: Some(PathBuf::from("/tmp/workspace")),
21165            external_id: Some("lexical-batch-guard".into()),
21166            title: Some("Lexical batch guard".into()),
21167            source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
21168            started_at: Some(1_700_000_000_000),
21169            ended_at: Some(1_700_000_000_100),
21170            approx_tokens: Some(42),
21171            metadata_json: serde_json::Value::Null,
21172            messages: vec![
21173                Message {
21174                    id: None,
21175                    idx: 0,
21176                    role: MessageRole::User,
21177                    author: Some("user".into()),
21178                    created_at: Some(1_700_000_000_010),
21179                    content: "123456".into(),
21180                    extra_json: serde_json::Value::Null,
21181                    snippets: Vec::new(),
21182                },
21183                Message {
21184                    id: None,
21185                    idx: 1,
21186                    role: MessageRole::Agent,
21187                    author: Some("assistant".into()),
21188                    created_at: Some(1_700_000_000_020),
21189                    content: "abcdef".into(),
21190                    extra_json: serde_json::Value::Null,
21191                    snippets: Vec::new(),
21192                },
21193            ],
21194            source_id: LOCAL_SOURCE_ID.into(),
21195            origin_host: None,
21196        };
21197
21198        let conversation_id = storage
21199            .insert_conversation_tree(agent_id, None, &conversation)
21200            .unwrap()
21201            .conversation_id;
21202
21203        let error = storage
21204            .fetch_messages_for_lexical_rebuild_batch(&[conversation_id], Some(10), Some(8))
21205            .expect_err("guardrail should reject oversized batch content");
21206
21207        let message = format!("{error:#}");
21208        assert!(
21209            message.contains("content-byte guardrail"),
21210            "expected guardrail reason in error, got {message}"
21211        );
21212    }
21213
21214    #[test]
21215    fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
21216        let dir = TempDir::new().unwrap();
21217        let db_path = dir.path().join("manual-rows.db");
21218        let storage = FrankenStorage::open(&db_path).unwrap();
21219        let conn = storage.raw();
21220
21221        conn.execute(
21222            "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
21223             VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
21224        )
21225        .unwrap();
21226        conn.execute(
21227            "INSERT INTO conversations
21228             (id, agent_id, external_id, title, source_path, source_id, started_at)
21229             VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
21230        )
21231        .unwrap();
21232        conn.execute(
21233            "INSERT INTO messages
21234             (id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
21235             VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
21236        )
21237        .unwrap();
21238
21239        let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
21240        assert_eq!(lexical.len(), 1);
21241        assert_eq!(lexical[0].content, "manual body");
21242
21243        let full = storage.fetch_messages(1).unwrap();
21244        assert_eq!(full.len(), 1);
21245        assert_eq!(full[0].content, "manual body");
21246        assert_eq!(full[0].author.as_deref(), Some("tester"));
21247        assert_eq!(full[0].extra_json, serde_json::json!({ "k": 1 }));
21248    }
21249
21250    #[test]
21251    fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
21252        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21253        use std::path::PathBuf;
21254
21255        let dir = TempDir::new().unwrap();
21256        let db_path = dir.path().join("agent_search.db");
21257        let storage = SqliteStorage::open(&db_path).unwrap();
21258
21259        let agent = Agent {
21260            id: None,
21261            slug: "claude_code".into(),
21262            name: "Claude Code".into(),
21263            version: None,
21264            kind: AgentKind::Cli,
21265        };
21266        let agent_id = storage.ensure_agent(&agent).unwrap();
21267
21268        for (external_id, base_ts) in [
21269            ("conv-1", 1_700_000_000_000_i64),
21270            ("conv-2", 1_700_000_001_000_i64),
21271        ] {
21272            let conversation = Conversation {
21273                id: None,
21274                agent_slug: "claude_code".into(),
21275                workspace: Some(PathBuf::from("/tmp/workspace")),
21276                external_id: Some(external_id.to_string()),
21277                title: Some("Lexical rebuild".into()),
21278                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21279                started_at: Some(base_ts),
21280                ended_at: Some(base_ts + 100),
21281                approx_tokens: None,
21282                metadata_json: serde_json::Value::Null,
21283                messages: vec![
21284                    Message {
21285                        id: None,
21286                        idx: 0,
21287                        role: MessageRole::User,
21288                        author: Some("user".into()),
21289                        created_at: Some(base_ts + 10),
21290                        content: format!("{external_id}-first"),
21291                        extra_json: serde_json::Value::Null,
21292                        snippets: Vec::new(),
21293                    },
21294                    Message {
21295                        id: None,
21296                        idx: 1,
21297                        role: MessageRole::Agent,
21298                        author: Some("assistant".into()),
21299                        created_at: Some(base_ts + 20),
21300                        content: format!("{external_id}-second"),
21301                        extra_json: serde_json::Value::Null,
21302                        snippets: Vec::new(),
21303                    },
21304                ],
21305                source_id: LOCAL_SOURCE_ID.into(),
21306                origin_host: None,
21307            };
21308            storage
21309                .insert_conversation_tree(agent_id, None, &conversation)
21310                .unwrap();
21311        }
21312
21313        let conversation_ids: Vec<i64> = storage
21314            .conn
21315            .query_map_collect(
21316                "SELECT id FROM conversations ORDER BY id",
21317                fparams![],
21318                |row| row.get_typed(0),
21319            )
21320            .unwrap();
21321        assert_eq!(conversation_ids.len(), 2);
21322
21323        let plan_details: Vec<String> = storage
21324            .conn
21325            .query_map_collect(
21326                "EXPLAIN QUERY PLAN \
21327                 SELECT conversation_id, id, idx, role, author, created_at, content \
21328                 FROM messages \
21329                 WHERE conversation_id IN (?1, ?2) \
21330                 ORDER BY conversation_id ASC, idx ASC",
21331                fparams![conversation_ids[0], conversation_ids[1]],
21332                |row| row.get_typed(3),
21333            )
21334            .unwrap();
21335
21336        assert!(
21337            plan_details
21338                .iter()
21339                .any(|detail| detail.contains("sqlite_autoindex_messages_1")),
21340            "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
21341        );
21342        assert!(
21343            !plan_details
21344                .iter()
21345                .any(|detail| detail.contains("TEMP B-TREE")),
21346            "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
21347        );
21348    }
21349
21350    #[test]
21351    fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
21352        let dir = TempDir::new().unwrap();
21353        let db_path = dir.path().join("test.db");
21354        let storage = SqliteStorage::open(&db_path).unwrap();
21355
21356        let agent = Agent {
21357            id: None,
21358            slug: "codex".into(),
21359            name: "Codex".into(),
21360            version: Some("0.2.3".into()),
21361            kind: AgentKind::Cli,
21362        };
21363        let agent_id = storage.ensure_agent(&agent).unwrap();
21364
21365        let first = Conversation {
21366            id: None,
21367            agent_slug: "codex".into(),
21368            workspace: Some(PathBuf::from("/tmp/workspace")),
21369            external_id: Some("lexical-stream-1".into()),
21370            title: Some("Lexical stream 1".into()),
21371            source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
21372            started_at: Some(1_700_000_000_000),
21373            ended_at: Some(1_700_000_000_100),
21374            approx_tokens: Some(42),
21375            metadata_json: serde_json::Value::Null,
21376            messages: vec![
21377                Message {
21378                    id: None,
21379                    idx: 0,
21380                    role: MessageRole::User,
21381                    author: Some("user".into()),
21382                    created_at: Some(1_700_000_000_010),
21383                    content: "first-a".into(),
21384                    extra_json: serde_json::json!({"opaque": true}),
21385                    snippets: Vec::new(),
21386                },
21387                Message {
21388                    id: None,
21389                    idx: 1,
21390                    role: MessageRole::Agent,
21391                    author: Some("assistant".into()),
21392                    created_at: Some(1_700_000_000_020),
21393                    content: "first-b".into(),
21394                    extra_json: serde_json::json!({"opaque": true}),
21395                    snippets: Vec::new(),
21396                },
21397            ],
21398            source_id: LOCAL_SOURCE_ID.into(),
21399            origin_host: None,
21400        };
21401
21402        let second = Conversation {
21403            id: None,
21404            agent_slug: "codex".into(),
21405            workspace: Some(PathBuf::from("/tmp/workspace")),
21406            external_id: Some("lexical-stream-2".into()),
21407            title: Some("Lexical stream 2".into()),
21408            source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
21409            started_at: Some(1_700_000_000_200),
21410            ended_at: Some(1_700_000_000_300),
21411            approx_tokens: Some(84),
21412            metadata_json: serde_json::Value::Null,
21413            messages: vec![Message {
21414                id: None,
21415                idx: 0,
21416                role: MessageRole::Tool,
21417                author: Some("tool".into()),
21418                created_at: Some(1_700_000_000_210),
21419                content: "second-a".into(),
21420                extra_json: serde_json::json!({"opaque": true}),
21421                snippets: Vec::new(),
21422            }],
21423            source_id: LOCAL_SOURCE_ID.into(),
21424            origin_host: None,
21425        };
21426
21427        let first_id = storage
21428            .insert_conversation_tree(agent_id, None, &first)
21429            .unwrap()
21430            .conversation_id;
21431        let second_id = storage
21432            .insert_conversation_tree(agent_id, None, &second)
21433            .unwrap()
21434            .conversation_id;
21435
21436        let mut streamed = Vec::new();
21437        storage
21438            .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
21439                streamed.push((
21440                    row.conversation_id,
21441                    row.idx,
21442                    row.role,
21443                    row.author,
21444                    row.content,
21445                ));
21446                Ok(())
21447            })
21448            .unwrap();
21449
21450        assert_eq!(
21451            streamed,
21452            vec![
21453                (
21454                    first_id,
21455                    0,
21456                    "user".to_string(),
21457                    Some("user".to_string()),
21458                    "first-a".to_string(),
21459                ),
21460                (
21461                    first_id,
21462                    1,
21463                    "agent".to_string(),
21464                    Some("assistant".to_string()),
21465                    "first-b".to_string(),
21466                ),
21467                (
21468                    second_id,
21469                    0,
21470                    "tool".to_string(),
21471                    Some("tool".to_string()),
21472                    "second-a".to_string(),
21473                ),
21474            ]
21475        );
21476    }
21477
21478    #[test]
21479    fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
21480        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21481        use std::path::PathBuf;
21482
21483        let dir = TempDir::new().unwrap();
21484        let db_path = dir.path().join("agent_search.db");
21485        let storage = SqliteStorage::open(&db_path).unwrap();
21486
21487        let agent = Agent {
21488            id: None,
21489            slug: "claude_code".into(),
21490            name: "Claude Code".into(),
21491            version: Some("1.2.3".into()),
21492            kind: AgentKind::Cli,
21493        };
21494        let agent_id = storage.ensure_agent(&agent).unwrap();
21495
21496        let first = Conversation {
21497            id: None,
21498            agent_slug: "claude_code".into(),
21499            workspace: Some(PathBuf::from("/tmp/workspace")),
21500            external_id: Some("lexical-range-1".into()),
21501            title: Some("Lexical range 1".into()),
21502            source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
21503            started_at: Some(1_700_000_000_000),
21504            ended_at: Some(1_700_000_000_100),
21505            approx_tokens: Some(42),
21506            metadata_json: serde_json::Value::Null,
21507            messages: vec![Message {
21508                id: None,
21509                idx: 0,
21510                role: MessageRole::User,
21511                author: Some("user".into()),
21512                created_at: Some(1_700_000_000_010),
21513                content: "first-only".into(),
21514                extra_json: serde_json::json!({"opaque": true}),
21515                snippets: Vec::new(),
21516            }],
21517            source_id: LOCAL_SOURCE_ID.into(),
21518            origin_host: None,
21519        };
21520
21521        let second = Conversation {
21522            id: None,
21523            agent_slug: "claude_code".into(),
21524            workspace: Some(PathBuf::from("/tmp/workspace")),
21525            external_id: Some("lexical-range-2".into()),
21526            title: Some("Lexical range 2".into()),
21527            source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
21528            started_at: Some(1_700_000_000_200),
21529            ended_at: Some(1_700_000_000_300),
21530            approx_tokens: Some(84),
21531            metadata_json: serde_json::Value::Null,
21532            messages: vec![Message {
21533                id: None,
21534                idx: 0,
21535                role: MessageRole::Tool,
21536                author: Some("tool".into()),
21537                created_at: Some(1_700_000_000_210),
21538                content: "second-should-not-appear".into(),
21539                extra_json: serde_json::json!({"opaque": true}),
21540                snippets: Vec::new(),
21541            }],
21542            source_id: LOCAL_SOURCE_ID.into(),
21543            origin_host: None,
21544        };
21545
21546        let first_id = storage
21547            .insert_conversation_tree(agent_id, None, &first)
21548            .unwrap()
21549            .conversation_id;
21550        let second_id = storage
21551            .insert_conversation_tree(agent_id, None, &second)
21552            .unwrap()
21553            .conversation_id;
21554
21555        let mut streamed = Vec::new();
21556        storage
21557            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21558                first_id,
21559                first_id,
21560                |row| {
21561                    streamed.push((row.conversation_id, row.idx, row.content));
21562                    Ok(())
21563                },
21564            )
21565            .unwrap();
21566
21567        assert_eq!(streamed, vec![(first_id, 0, "first-only".to_string())]);
21568        assert!(
21569            streamed
21570                .iter()
21571                .all(|(conversation_id, _, _)| *conversation_id != second_id),
21572            "upper bound should exclude later conversation ids"
21573        );
21574    }
21575
21576    #[test]
21577    fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
21578        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21579        use std::path::PathBuf;
21580
21581        let dir = TempDir::new().unwrap();
21582        let db_path = dir.path().join("agent_search.db");
21583        let storage = SqliteStorage::open(&db_path).unwrap();
21584
21585        let claude_agent_id = storage
21586            .ensure_agent(&Agent {
21587                id: None,
21588                slug: "claude_code".into(),
21589                name: "Claude Code".into(),
21590                version: None,
21591                kind: AgentKind::Cli,
21592            })
21593            .unwrap();
21594        let aider_agent_id = storage
21595            .ensure_agent(&Agent {
21596                id: None,
21597                slug: "aider".into(),
21598                name: "Aider".into(),
21599                version: None,
21600                kind: AgentKind::Cli,
21601            })
21602            .unwrap();
21603
21604        type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
21605
21606        let mut expected = Vec::new();
21607        let mut first_conversation_id = None;
21608        let mut last_conversation_id = None;
21609        let mut insert_conversation =
21610            |agent_id: i64,
21611             external_id: &str,
21612             title: &str,
21613             source_path: &str,
21614             started_at: i64,
21615             message_specs: Vec<MessageSpec>| {
21616                let conversation = Conversation {
21617                    id: None,
21618                    agent_slug: if agent_id == aider_agent_id {
21619                        "aider".into()
21620                    } else {
21621                        "claude_code".into()
21622                    },
21623                    workspace: Some(PathBuf::from("/tmp/workspace")),
21624                    external_id: Some(external_id.to_string()),
21625                    title: Some(title.to_string()),
21626                    source_path: PathBuf::from(source_path),
21627                    started_at: Some(started_at),
21628                    ended_at: Some(started_at + 100),
21629                    approx_tokens: None,
21630                    metadata_json: serde_json::Value::Null,
21631                    messages: message_specs
21632                        .iter()
21633                        .map(|(idx, role, author, created_at, content)| Message {
21634                            id: None,
21635                            idx: *idx,
21636                            role: role.clone(),
21637                            author: author.clone(),
21638                            created_at: *created_at,
21639                            content: content.clone(),
21640                            extra_json: serde_json::Value::Null,
21641                            snippets: Vec::new(),
21642                        })
21643                        .collect(),
21644                    source_id: LOCAL_SOURCE_ID.into(),
21645                    origin_host: None,
21646                };
21647                let conversation_id = storage
21648                    .insert_conversation_tree(agent_id, None, &conversation)
21649                    .unwrap()
21650                    .conversation_id;
21651                if first_conversation_id.is_none() {
21652                    first_conversation_id = Some(conversation_id);
21653                }
21654                last_conversation_id = Some(conversation_id);
21655                expected.extend(message_specs.into_iter().map(
21656                    |(idx, role, author, created_at, content)| {
21657                        (
21658                            conversation_id,
21659                            idx,
21660                            match role {
21661                                MessageRole::User => "user".to_string(),
21662                                MessageRole::Agent => "agent".to_string(),
21663                                MessageRole::Tool => "tool".to_string(),
21664                                MessageRole::System => "system".to_string(),
21665                                MessageRole::Other(other) => other,
21666                            },
21667                            author,
21668                            created_at,
21669                            content,
21670                        )
21671                    },
21672                ));
21673            };
21674
21675        for (label, base_ts) in [
21676            ("alpha", 1_700_000_000_000_i64),
21677            ("beta", 1_700_000_001_000_i64),
21678            ("gamma", 1_700_000_002_000_i64),
21679            ("delta", 1_700_000_003_000_i64),
21680            ("epsilon", 1_700_000_004_000_i64),
21681        ] {
21682            insert_conversation(
21683                claude_agent_id,
21684                &format!("lexical-{label}"),
21685                &format!("Lexical {label}"),
21686                &format!("/tmp/{label}.jsonl"),
21687                base_ts,
21688                vec![
21689                    (
21690                        0,
21691                        MessageRole::User,
21692                        None,
21693                        Some(base_ts + 10),
21694                        format!("{label}_content"),
21695                    ),
21696                    (
21697                        1,
21698                        MessageRole::Agent,
21699                        None,
21700                        Some(base_ts + 20),
21701                        format!("{label}_content_response"),
21702                    ),
21703                ],
21704            );
21705        }
21706
21707        insert_conversation(
21708            aider_agent_id,
21709            "lexical-aider-history",
21710            "Aider Chat: coding_agent_session_search",
21711            "/tmp/.aider.chat.history.md",
21712            1_764_619_673_394,
21713            vec![
21714                (
21715                    0,
21716                    MessageRole::System,
21717                    Some("system".to_string()),
21718                    None,
21719                    "# aider chat started at 2025-12-01 20:07:47".to_string(),
21720                ),
21721                (
21722                    1,
21723                    MessageRole::User,
21724                    Some("user".to_string()),
21725                    None,
21726                    "/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
21727                ),
21728            ],
21729        );
21730        insert_conversation(
21731            aider_agent_id,
21732            "lexical-aider-fixture",
21733            "Aider Chat: aider",
21734            "/tmp/tests/fixtures/aider/.aider.chat.history.md",
21735            1_764_621_401_399,
21736            vec![
21737                (
21738                    0,
21739                    MessageRole::User,
21740                    Some("user".to_string()),
21741                    None,
21742                    "/add src/main.rs".to_string(),
21743                ),
21744                (
21745                    1,
21746                    MessageRole::Agent,
21747                    Some("assistant".to_string()),
21748                    None,
21749                    "Added src/main.rs to the chat.
21750
21751#### /add src/main.rs"
21752                        .to_string(),
21753                ),
21754                (
21755                    2,
21756                    MessageRole::User,
21757                    Some("user".to_string()),
21758                    None,
21759                    "Please refactor.".to_string(),
21760                ),
21761                (
21762                    3,
21763                    MessageRole::Agent,
21764                    Some("assistant".to_string()),
21765                    None,
21766                    "Sure, here is the code.".to_string(),
21767                ),
21768            ],
21769        );
21770
21771        let mut streamed = Vec::new();
21772        storage
21773            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21774                first_conversation_id.unwrap(),
21775                last_conversation_id.unwrap(),
21776                |row| {
21777                    streamed.push((
21778                        row.conversation_id,
21779                        row.idx,
21780                        row.role,
21781                        row.author,
21782                        row.created_at,
21783                        row.content,
21784                    ));
21785                    Ok(())
21786                },
21787            )
21788            .unwrap();
21789
21790        assert_eq!(streamed, expected);
21791    }
21792
21793    #[test]
21794    fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
21795        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21796        use std::path::PathBuf;
21797
21798        let dir = TempDir::new().unwrap();
21799        let db_path = dir.path().join("agent_search.db");
21800        let storage = SqliteStorage::open(&db_path).unwrap();
21801
21802        let agent = Agent {
21803            id: None,
21804            slug: "claude_code".into(),
21805            name: "Claude Code".into(),
21806            version: None,
21807            kind: AgentKind::Cli,
21808        };
21809        let agent_id = storage.ensure_agent(&agent).unwrap();
21810
21811        for (external_id, base_ts) in [
21812            ("conv-1", 1_700_000_000_000_i64),
21813            ("conv-2", 1_700_000_001_000_i64),
21814        ] {
21815            let conversation = Conversation {
21816                id: None,
21817                agent_slug: "claude_code".into(),
21818                workspace: Some(PathBuf::from("/tmp/workspace")),
21819                external_id: Some(external_id.to_string()),
21820                title: Some("Lexical rebuild".into()),
21821                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21822                started_at: Some(base_ts),
21823                ended_at: Some(base_ts + 100),
21824                approx_tokens: None,
21825                metadata_json: serde_json::Value::Null,
21826                messages: vec![
21827                    Message {
21828                        id: None,
21829                        idx: 0,
21830                        role: MessageRole::User,
21831                        author: Some("user".into()),
21832                        created_at: Some(base_ts + 10),
21833                        content: format!("{external_id}-first"),
21834                        extra_json: serde_json::Value::Null,
21835                        snippets: Vec::new(),
21836                    },
21837                    Message {
21838                        id: None,
21839                        idx: 1,
21840                        role: MessageRole::Agent,
21841                        author: Some("assistant".into()),
21842                        created_at: Some(base_ts + 20),
21843                        content: format!("{external_id}-second"),
21844                        extra_json: serde_json::Value::Null,
21845                        snippets: Vec::new(),
21846                    },
21847                ],
21848                source_id: LOCAL_SOURCE_ID.into(),
21849                origin_host: None,
21850            };
21851            storage
21852                .insert_conversation_tree(agent_id, None, &conversation)
21853                .unwrap();
21854        }
21855
21856        let first_id: i64 = storage
21857            .conn
21858            .query_row_map(
21859                "SELECT id FROM conversations ORDER BY id LIMIT 1",
21860                fparams![],
21861                |row| row.get_typed(0),
21862            )
21863            .unwrap();
21864        let last_id: i64 = storage
21865            .conn
21866            .query_row_map(
21867                "SELECT id FROM conversations ORDER BY id DESC LIMIT 1",
21868                fparams![],
21869                |row| row.get_typed(0),
21870            )
21871            .unwrap();
21872
21873        let conversation_plan_details: Vec<String> = storage
21874            .conn
21875            .query_map_collect(
21876                "EXPLAIN QUERY PLAN                  SELECT id FROM conversations                  WHERE id >= ?1 AND id <= ?2                  ORDER BY id ASC",
21877                fparams![first_id, last_id],
21878                |row| row.get_typed(3),
21879            )
21880            .unwrap();
21881        assert!(
21882            !conversation_plan_details
21883                .iter()
21884                .any(|detail| detail.contains("TEMP B-TREE")),
21885            "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
21886        );
21887
21888        let message_plan_details: Vec<String> = storage
21889            .conn
21890            .query_map_collect(
21891                "EXPLAIN QUERY PLAN                  SELECT id, idx, role, author, created_at, content                  FROM messages INDEXED BY sqlite_autoindex_messages_1                  WHERE conversation_id = ?1                  ORDER BY idx",
21892                fparams![first_id],
21893                |row| row.get_typed(3),
21894            )
21895            .unwrap();
21896        assert!(
21897            message_plan_details
21898                .iter()
21899                .any(|detail| detail.contains("sqlite_autoindex_messages_1")
21900                    || detail.contains("idx_messages_conv_idx")),
21901            "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
21902        );
21903        assert!(
21904            !message_plan_details
21905                .iter()
21906                .any(|detail| detail.contains("TEMP B-TREE")),
21907            "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
21908        );
21909    }
21910
21911    #[test]
21912    fn discover_historical_database_bundles_prefers_larger_archives_first() {
21913        let dir = TempDir::new().unwrap();
21914        let canonical_db = dir.path().join("agent_search.db");
21915        fs::write(&canonical_db, b"canonical").unwrap();
21916
21917        let smaller = dir.path().join("agent_search.corrupt.small");
21918        fs::write(&smaller, vec![0_u8; 32]).unwrap();
21919
21920        let backups_dir = dir.path().join("backups");
21921        fs::create_dir_all(&backups_dir).unwrap();
21922        let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
21923        fs::write(&larger, vec![0_u8; 128]).unwrap();
21924
21925        let bundles = discover_historical_database_bundles(&canonical_db);
21926        let ordered_paths: Vec<PathBuf> =
21927            bundles.into_iter().map(|bundle| bundle.root_path).collect();
21928
21929        assert_eq!(ordered_paths, vec![larger, smaller]);
21930    }
21931
21932    #[test]
21933    fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
21934        let dir = TempDir::new().unwrap();
21935        let canonical_db = dir.path().join("agent_search.db");
21936        fs::write(&canonical_db, b"canonical").unwrap();
21937
21938        let larger_corrupt = dir.path().join("agent_search.corrupt.20260324_212907");
21939        fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();
21940
21941        let backups_dir = dir.path().join("backups");
21942        fs::create_dir_all(&backups_dir).unwrap();
21943        let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
21944        let conn = FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
21945        conn.execute_batch(
21946            "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
21947             CREATE TABLE messages (
21948                 id INTEGER PRIMARY KEY,
21949                 conversation_id INTEGER NOT NULL,
21950                 idx INTEGER NOT NULL,
21951                 content TEXT
21952             );
21953             INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
21954             INSERT INTO messages(id, conversation_id, idx, content)
21955             VALUES (1, 1, 0, 'seed');",
21956        )
21957        .unwrap();
21958        drop(conn);
21959
21960        let bundles = discover_historical_database_bundles(&canonical_db);
21961        let ordered_paths: Vec<PathBuf> = bundles
21962            .iter()
21963            .map(|bundle| bundle.root_path.clone())
21964            .collect();
21965
21966        assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
21967        assert!(bundles[0].supports_direct_readonly);
21968        assert!(!bundles[1].supports_direct_readonly);
21969    }
21970
21971    #[test]
21972    fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
21973        let dir = TempDir::new().unwrap();
21974        let canonical_db = dir.path().join("agent_search.db");
21975        let storage = SqliteStorage::open(&canonical_db).unwrap();
21976
21977        let quarantined = dir.path().join("agent_search.corrupt.20260324_212907");
21978        fs::write(&quarantined, b"not a sqlite database").unwrap();
21979
21980        let discovered: Vec<PathBuf> = discover_historical_database_bundles(&canonical_db)
21981            .into_iter()
21982            .map(|bundle| bundle.root_path)
21983            .collect();
21984        assert_eq!(discovered, vec![quarantined]);
21985
21986        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
21987        assert_eq!(outcome.bundles_considered, 1);
21988        assert_eq!(outcome.bundles_imported, 0);
21989        assert_eq!(outcome.conversations_imported, 0);
21990        assert_eq!(outcome.messages_imported, 0);
21991        assert!(storage.list_conversations(10, 0).unwrap().is_empty());
21992    }
21993
21994    #[test]
21995    fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
21996        let dir = TempDir::new().unwrap();
21997        let canonical_db = dir.path().join("agent_search.db");
21998        fs::write(&canonical_db, b"canonical").unwrap();
21999
22000        let repair_lab_dir = dir.path().join("repair-lab").join("live-copy");
22001        fs::create_dir_all(&repair_lab_dir).unwrap();
22002        let repair_lab_db = repair_lab_dir.join("agent_search.db");
22003        fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
22004        fs::write(
22005            repair_lab_dir.join("agent_search.rebuild-test.db"),
22006            vec![0_u8; 192],
22007        )
22008        .unwrap();
22009
22010        let snapshots_dir = dir.path().join("snapshots").join("20260324T013201Z");
22011        fs::create_dir_all(&snapshots_dir).unwrap();
22012        let snapshot_db = snapshots_dir.join("agent_search.db");
22013        fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();
22014
22015        let bundles = discover_historical_database_bundles(&canonical_db);
22016        let ordered_paths: Vec<PathBuf> =
22017            bundles.into_iter().map(|bundle| bundle.root_path).collect();
22018
22019        assert!(ordered_paths.contains(&repair_lab_db));
22020        assert!(ordered_paths.contains(&snapshot_db));
22021        assert!(
22022            !ordered_paths
22023                .iter()
22024                .any(|path| path.file_name().and_then(|name| name.to_str())
22025                    == Some("agent_search.rebuild-test.db"))
22026        );
22027    }
22028
22029    #[test]
22030    fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
22031        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22032
22033        let dir = TempDir::new().unwrap();
22034        let canonical_db = dir.path().join("agent_search.db");
22035        fs::write(&canonical_db, b"canonical").unwrap();
22036
22037        let replay_dir = dir
22038            .path()
22039            .join("repair-lab")
22040            .join("replay-20260324T070101Z");
22041        fs::create_dir_all(&replay_dir).unwrap();
22042        let replay_db = replay_dir.join("agent_search.db");
22043        let replay_storage = SqliteStorage::open(&replay_db).unwrap();
22044        let agent = Agent {
22045            id: None,
22046            slug: "codex".into(),
22047            name: "Codex".into(),
22048            version: Some("0.2.3".into()),
22049            kind: AgentKind::Cli,
22050        };
22051        let agent_id = replay_storage.ensure_agent(&agent).unwrap();
22052        let conversation = Conversation {
22053            id: None,
22054            agent_slug: "codex".into(),
22055            workspace: Some(PathBuf::from("/tmp/workspace")),
22056            external_id: Some("replay-conv".into()),
22057            title: Some("Replay bundle".into()),
22058            source_path: PathBuf::from("/tmp/replay.jsonl"),
22059            started_at: Some(1_700_000_000_000),
22060            ended_at: Some(1_700_000_000_100),
22061            approx_tokens: Some(42),
22062            metadata_json: serde_json::Value::Null,
22063            messages: vec![Message {
22064                id: None,
22065                idx: 0,
22066                role: MessageRole::Agent,
22067                author: Some("assistant".into()),
22068                created_at: Some(1_700_000_000_050),
22069                content: "replay message".into(),
22070                extra_json: serde_json::Value::Null,
22071                snippets: Vec::new(),
22072            }],
22073            source_id: LOCAL_SOURCE_ID.into(),
22074            origin_host: None,
22075        };
22076        replay_storage
22077            .insert_conversation_tree(agent_id, None, &conversation)
22078            .unwrap();
22079        drop(replay_storage);
22080
22081        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
22082        let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
22083        replay_legacy
22084            .execute_batch(
22085                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
22086                 DELETE FROM _schema_migrations WHERE version = 14;
22087                 PRAGMA writable_schema = ON;",
22088            )
22089            .unwrap();
22090        replay_legacy
22091            .execute(
22092                "DELETE FROM meta WHERE key = ?1",
22093                [FTS_FRANKEN_REBUILD_META_KEY],
22094            )
22095            .unwrap();
22096        replay_legacy
22097            .execute(
22098                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22099                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22100                [duplicate_legacy_fts_sql],
22101            )
22102            .unwrap();
22103        replay_legacy
22104            .execute_batch("PRAGMA writable_schema = OFF;")
22105            .unwrap();
22106        drop(replay_legacy);
22107
22108        let backups_dir = dir.path().join("backups");
22109        fs::create_dir_all(&backups_dir).unwrap();
22110        let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
22111        let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
22112        let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
22113        clean_storage
22114            .insert_conversation_tree(clean_agent_id, None, &conversation)
22115            .unwrap();
22116        drop(clean_storage);
22117
22118        let bundles = discover_historical_database_bundles(&canonical_db);
22119        let ordered_paths: Vec<PathBuf> = bundles
22120            .iter()
22121            .map(|bundle| bundle.root_path.clone())
22122            .collect();
22123
22124        assert_eq!(ordered_paths[0], clean_backup);
22125        assert_eq!(ordered_paths[1], replay_db);
22126        assert_eq!(
22127            bundles[0].probe.schema_version,
22128            Some(CURRENT_SCHEMA_VERSION)
22129        );
22130        // Post-V14 cass drops the fts_messages virtual table during migration
22131        // and recreates it lazily on first open, so a freshly-migrated "clean"
22132        // backup has zero fts_messages rows in sqlite_master. The bundle is
22133        // still ranked as healthy by `bundle_health_rank` because 0 rows is a
22134        // legitimate lazy-FTS state (see comment there).
22135        assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
22136        // `fts_queryable` mirrors a direct rusqlite probe; with 0 sqlite_master
22137        // rows the table isn't queryable until lazy repair runs.
22138        assert!(!bundles[0].probe.fts_queryable);
22139        assert_eq!(bundles[1].probe.schema_version, Some(13));
22140        // The replay bundle had V14 run (dropping fts_messages → 0 rows), then
22141        // the test rolls meta.schema_version back to 13, deletes the V14
22142        // marker, and manually injects a duplicate sqlite_master row. Net
22143        // result: one synthetic (malformed) fts_messages entry.
22144        assert_eq!(bundles[1].probe.fts_schema_rows, Some(1));
22145    }
22146
22147    #[test]
22148    fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
22149        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22150
22151        let dir = TempDir::new().unwrap();
22152        let db_path = dir.path().join("fts-catchup.db");
22153        let storage = SqliteStorage::open(&db_path).unwrap();
22154        let agent = Agent {
22155            id: None,
22156            slug: "codex".into(),
22157            name: "Codex".into(),
22158            version: Some("0.2.3".into()),
22159            kind: AgentKind::Cli,
22160        };
22161        let agent_id = storage.ensure_agent(&agent).unwrap();
22162        let conversation = Conversation {
22163            id: None,
22164            agent_slug: "codex".into(),
22165            workspace: Some(PathBuf::from("/tmp/workspace")),
22166            external_id: Some("fts-catchup".into()),
22167            title: Some("FTS catchup".into()),
22168            source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
22169            started_at: Some(1_700_000_000_000),
22170            ended_at: Some(1_700_000_000_100),
22171            approx_tokens: Some(42),
22172            metadata_json: serde_json::Value::Null,
22173            messages: vec![Message {
22174                id: None,
22175                idx: 0,
22176                role: MessageRole::User,
22177                author: Some("user".into()),
22178                created_at: Some(1_700_000_000_050),
22179                content: "initial message".into(),
22180                extra_json: serde_json::Value::Null,
22181                snippets: Vec::new(),
22182            }],
22183            source_id: LOCAL_SOURCE_ID.into(),
22184            origin_host: None,
22185        };
22186        storage
22187            .insert_conversation_tree(agent_id, None, &conversation)
22188            .unwrap();
22189        drop(storage);
22190
22191        rebuild_fts_via_rusqlite(&db_path).unwrap();
22192
22193        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22194        let conversation_id: i64 = conn
22195            .query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
22196                row.get_typed(0)
22197            })
22198            .unwrap();
22199        conn.execute_compat(
22200            "INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
22201             VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
22202            fparams![conversation_id],
22203        )
22204        .unwrap();
22205        drop(conn);
22206
22207        let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
22208        assert_eq!(
22209            repair,
22210            FtsConsistencyRepair::IncrementalCatchUp {
22211                inserted_rows: 1,
22212                total_rows: 2
22213            }
22214        );
22215
22216        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22217        let auth_rows: i64 = conn
22218            .query_row_map(
22219                "SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
22220                fparams![],
22221                |row| row.get_typed(0),
22222            )
22223            .unwrap();
22224        assert_eq!(auth_rows, 1);
22225    }
22226
22227    #[test]
22228    fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
22229        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22230
22231        let dir = TempDir::new().unwrap();
22232        let db_path = dir.path().join("fts-duplicate-rebuild.db");
22233
22234        let storage = SqliteStorage::open(&db_path).unwrap();
22235        let agent = Agent {
22236            id: None,
22237            slug: "codex".into(),
22238            name: "Codex".into(),
22239            version: Some("0.2.3".into()),
22240            kind: AgentKind::Cli,
22241        };
22242        let agent_id = storage.ensure_agent(&agent).unwrap();
22243        let conversation = Conversation {
22244            id: None,
22245            agent_slug: "codex".into(),
22246            workspace: Some(PathBuf::from("/ws")),
22247            external_id: Some("retro".into()),
22248            title: Some("retro".into()),
22249            source_path: PathBuf::from("/tmp/retro.jsonl"),
22250            started_at: Some(42),
22251            ended_at: Some(42),
22252            approx_tokens: None,
22253            metadata_json: serde_json::Value::Null,
22254            messages: vec![Message {
22255                id: None,
22256                idx: 0,
22257                role: MessageRole::User,
22258                author: None,
22259                created_at: Some(42),
22260                content: "retro investigation".into(),
22261                extra_json: serde_json::Value::Null,
22262                snippets: Vec::new(),
22263            }],
22264            source_id: LOCAL_SOURCE_ID.into(),
22265            origin_host: None,
22266        };
22267        storage
22268            .insert_conversation_tree(agent_id, None, &conversation)
22269            .unwrap();
22270        drop(storage);
22271        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
22272
22273        let conn = rusqlite_test_fixture_conn(&db_path);
22274        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
22275        conn.execute(
22276            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22277             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22278            ["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
22279        )
22280        .unwrap();
22281        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
22282        let duplicate_rows: i64 = conn
22283            .query_row(
22284                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
22285                [],
22286                |row| row.get(0),
22287            )
22288            .unwrap();
22289        assert_eq!(duplicate_rows, 2);
22290        drop(conn);
22291
22292        let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
22293        assert_eq!(inserted, 1);
22294
22295        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22296        let schema_rows = franken_fts_schema_rows(&conn).unwrap();
22297        assert_eq!(
22298            schema_rows, 1,
22299            "DROP TABLE should leave one clean FTS schema"
22300        );
22301        let match_count: i64 = conn
22302            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
22303                row.get_typed(0)
22304            })
22305            .unwrap();
22306        assert_eq!(match_count, 1);
22307    }
22308
22309    // =========================================================================
22310    // Agent storage tests (bead yln.4)
22311    // =========================================================================
22312
22313    #[test]
22314    fn ensure_agent_creates_new() {
22315        let dir = TempDir::new().unwrap();
22316        let db_path = dir.path().join("test.db");
22317        let storage = SqliteStorage::open(&db_path).unwrap();
22318
22319        let agent = Agent {
22320            id: None,
22321            slug: "test_agent".into(),
22322            name: "Test Agent".into(),
22323            version: Some("1.0".into()),
22324            kind: AgentKind::Cli,
22325        };
22326
22327        let id = storage.ensure_agent(&agent).unwrap();
22328        assert!(id > 0);
22329    }
22330
22331    #[test]
22332    fn ensure_agent_returns_existing_id() {
22333        let dir = TempDir::new().unwrap();
22334        let db_path = dir.path().join("test.db");
22335        let storage = SqliteStorage::open(&db_path).unwrap();
22336
22337        let agent = Agent {
22338            id: None,
22339            slug: "codex".into(),
22340            name: "Codex".into(),
22341            version: None,
22342            kind: AgentKind::Cli,
22343        };
22344
22345        let id1 = storage.ensure_agent(&agent).unwrap();
22346        let id2 = storage.ensure_agent(&agent).unwrap();
22347        assert_eq!(id1, id2);
22348    }
22349
22350    #[test]
22351    fn ensure_agent_unchanged_preserves_updated_at() {
22352        let dir = TempDir::new().unwrap();
22353        let db_path = dir.path().join("test.db");
22354        let storage = SqliteStorage::open(&db_path).unwrap();
22355
22356        let agent = Agent {
22357            id: None,
22358            slug: "codex".into(),
22359            name: "Codex".into(),
22360            version: Some("1.0".into()),
22361            kind: AgentKind::Cli,
22362        };
22363
22364        storage.ensure_agent(&agent).unwrap();
22365        let initial_updated_at: i64 = storage
22366            .conn
22367            .query_row_map(
22368                "SELECT updated_at FROM agents WHERE slug = ?1",
22369                fparams![agent.slug.as_str()],
22370                |row| row.get_typed(0),
22371            )
22372            .unwrap();
22373        std::thread::sleep(std::time::Duration::from_millis(5));
22374
22375        storage.ensure_agent(&agent).unwrap();
22376        let fetched_updated_at: i64 = storage
22377            .conn
22378            .query_row_map(
22379                "SELECT updated_at FROM agents WHERE slug = ?1",
22380                fparams![agent.slug.as_str()],
22381                |row| row.get_typed(0),
22382            )
22383            .unwrap();
22384
22385        assert_eq!(fetched_updated_at, initial_updated_at);
22386    }
22387
22388    #[test]
22389    fn ensure_agent_changed_metadata_updates_cached_slug() {
22390        let dir = TempDir::new().unwrap();
22391        let db_path = dir.path().join("test.db");
22392        let storage = SqliteStorage::open(&db_path).unwrap();
22393
22394        let mut agent = Agent {
22395            id: None,
22396            slug: "codex".into(),
22397            name: "Codex".into(),
22398            version: Some("1.0".into()),
22399            kind: AgentKind::Cli,
22400        };
22401
22402        let id1 = storage.ensure_agent(&agent).unwrap();
22403        agent.name = "Codex CLI".into();
22404        agent.version = Some("1.1".into());
22405        let id2 = storage.ensure_agent(&agent).unwrap();
22406
22407        let fetched: (String, Option<String>) = storage
22408            .conn
22409            .query_row_map(
22410                "SELECT name, version FROM agents WHERE slug = ?1",
22411                fparams![agent.slug.as_str()],
22412                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
22413            )
22414            .unwrap();
22415
22416        assert_eq!(id1, id2);
22417        assert_eq!(fetched, ("Codex CLI".into(), Some("1.1".into())));
22418    }
22419
22420    #[test]
22421    fn list_agents_returns_inserted() {
22422        let dir = TempDir::new().unwrap();
22423        let db_path = dir.path().join("test.db");
22424        let storage = SqliteStorage::open(&db_path).unwrap();
22425
22426        let agent = Agent {
22427            id: None,
22428            slug: "new_agent".into(),
22429            name: "New Agent".into(),
22430            version: None,
22431            kind: AgentKind::VsCode,
22432        };
22433        storage.ensure_agent(&agent).unwrap();
22434
22435        let agents = storage.list_agents().unwrap();
22436        assert!(agents.iter().any(|a| a.slug == "new_agent"));
22437    }
22438
22439    // =========================================================================
22440    // Workspace storage tests (bead yln.4)
22441    // =========================================================================
22442
22443    #[test]
22444    fn ensure_workspace_creates_new() {
22445        let dir = TempDir::new().unwrap();
22446        let db_path = dir.path().join("test.db");
22447        let storage = SqliteStorage::open(&db_path).unwrap();
22448
22449        let id = storage
22450            .ensure_workspace(Path::new("/home/user/project"), Some("My Project"))
22451            .unwrap();
22452        assert!(id > 0);
22453    }
22454
22455    #[test]
22456    fn ensure_workspace_returns_existing() {
22457        let dir = TempDir::new().unwrap();
22458        let db_path = dir.path().join("test.db");
22459        let storage = SqliteStorage::open(&db_path).unwrap();
22460
22461        let path = Path::new("/home/user/myproject");
22462        let id1 = storage.ensure_workspace(path, None).unwrap();
22463        let id2 = storage.ensure_workspace(path, None).unwrap();
22464        assert_eq!(id1, id2);
22465    }
22466
22467    #[test]
22468    fn ensure_workspace_changed_display_name_updates_cached_path() {
22469        let dir = TempDir::new().unwrap();
22470        let db_path = dir.path().join("test.db");
22471        let storage = SqliteStorage::open(&db_path).unwrap();
22472
22473        let path = Path::new("/home/user/myproject");
22474        let id1 = storage.ensure_workspace(path, Some("Before")).unwrap();
22475        let id2 = storage.ensure_workspace(path, Some("After")).unwrap();
22476
22477        let display_name: Option<String> = storage
22478            .conn
22479            .query_row_map(
22480                "SELECT display_name FROM workspaces WHERE path = ?1",
22481                fparams![path.to_string_lossy().as_ref()],
22482                |row| row.get_typed(0),
22483            )
22484            .unwrap();
22485
22486        assert_eq!(id1, id2);
22487        assert_eq!(display_name.as_deref(), Some("After"));
22488    }
22489
22490    #[test]
22491    fn list_workspaces_returns_inserted() {
22492        let dir = TempDir::new().unwrap();
22493        let db_path = dir.path().join("test.db");
22494        let storage = SqliteStorage::open(&db_path).unwrap();
22495
22496        storage
22497            .ensure_workspace(Path::new("/test/workspace"), Some("Test WS"))
22498            .unwrap();
22499
22500        let workspaces = storage.list_workspaces().unwrap();
22501        assert!(
22502            workspaces
22503                .iter()
22504                .any(|w| w.path.to_str() == Some("/test/workspace"))
22505        );
22506    }
22507
22508    // =========================================================================
22509    // Source storage tests (bead yln.4)
22510    // =========================================================================
22511
22512    #[test]
22513    fn upsert_source_creates_new() {
22514        let dir = TempDir::new().unwrap();
22515        let db_path = dir.path().join("test.db");
22516        let storage = SqliteStorage::open(&db_path).unwrap();
22517
22518        let source = Source {
22519            id: "test-laptop".into(),
22520            kind: SourceKind::Ssh,
22521            host_label: Some("test.local".into()),
22522            machine_id: Some("test-machine-id".into()),
22523            platform: None,
22524            config_json: None,
22525            created_at: Some(SqliteStorage::now_millis()),
22526            updated_at: None,
22527        };
22528
22529        storage.upsert_source(&source).unwrap();
22530        let fetched = storage.get_source("test-laptop").unwrap();
22531        assert!(fetched.is_some());
22532        assert_eq!(fetched.unwrap().host_label, Some("test.local".into()));
22533    }
22534
22535    #[test]
22536    fn upsert_source_updates_existing() {
22537        let dir = TempDir::new().unwrap();
22538        let db_path = dir.path().join("test.db");
22539        let storage = SqliteStorage::open(&db_path).unwrap();
22540
22541        let source1 = Source {
22542            id: "my-source".into(),
22543            kind: SourceKind::Ssh,
22544            host_label: Some("Original Label".into()),
22545            machine_id: None,
22546            platform: None,
22547            config_json: None,
22548            created_at: Some(SqliteStorage::now_millis()),
22549            updated_at: None,
22550        };
22551        storage.upsert_source(&source1).unwrap();
22552
22553        let source2 = Source {
22554            id: "my-source".into(),
22555            kind: SourceKind::Ssh,
22556            host_label: Some("Updated Label".into()),
22557            machine_id: None,
22558            platform: Some("linux".into()),
22559            config_json: None,
22560            created_at: Some(SqliteStorage::now_millis()),
22561            updated_at: Some(SqliteStorage::now_millis()),
22562        };
22563        storage.upsert_source(&source2).unwrap();
22564
22565        let fetched = storage.get_source("my-source").unwrap().unwrap();
22566        assert_eq!(fetched.host_label, Some("Updated Label".into()));
22567        assert!(fetched.platform.is_some());
22568    }
22569
22570    #[test]
22571    fn upsert_source_unchanged_preserves_updated_at() {
22572        let dir = TempDir::new().unwrap();
22573        let db_path = dir.path().join("test.db");
22574        let storage = SqliteStorage::open(&db_path).unwrap();
22575
22576        let source = Source {
22577            id: "stable-source".into(),
22578            kind: SourceKind::Ssh,
22579            host_label: Some("builder.local".into()),
22580            machine_id: None,
22581            platform: Some("linux".into()),
22582            config_json: Some(serde_json::json!({"role": "bench"})),
22583            created_at: None,
22584            updated_at: None,
22585        };
22586
22587        storage.upsert_source(&source).unwrap();
22588        let initial = storage.get_source("stable-source").unwrap().unwrap();
22589        std::thread::sleep(std::time::Duration::from_millis(5));
22590
22591        storage.upsert_source(&source).unwrap();
22592        let fetched = storage.get_source("stable-source").unwrap().unwrap();
22593
22594        assert_eq!(fetched.created_at, initial.created_at);
22595        assert_eq!(fetched.updated_at, initial.updated_at);
22596        assert_eq!(fetched.host_label, initial.host_label);
22597        assert_eq!(fetched.platform, initial.platform);
22598        assert_eq!(fetched.config_json, initial.config_json);
22599    }
22600
22601    #[test]
22602    fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
22603        let dir = TempDir::new().unwrap();
22604        let db_path = dir.path().join("test.db");
22605        let storage = SqliteStorage::open(&db_path).unwrap();
22606
22607        let conversation = Conversation {
22608            id: None,
22609            agent_slug: "codex".into(),
22610            workspace: Some(PathBuf::from("/ws/cache-recreate")),
22611            external_id: Some("cache-recreate".into()),
22612            title: Some("Cache Recreate".into()),
22613            source_path: PathBuf::from("/log/cache-recreate.jsonl"),
22614            started_at: Some(1_700_000_000_000),
22615            ended_at: Some(1_700_000_000_001),
22616            approx_tokens: Some(16),
22617            metadata_json: serde_json::json!({}),
22618            messages: vec![Message {
22619                id: None,
22620                idx: 0,
22621                role: MessageRole::User,
22622                author: Some("tester".into()),
22623                created_at: Some(1_700_000_000_000),
22624                content: "cache recreate".into(),
22625                extra_json: serde_json::json!({}),
22626                snippets: Vec::new(),
22627            }],
22628            source_id: "cache-remote-source".into(),
22629            origin_host: Some("builder-cache".into()),
22630        };
22631
22632        storage
22633            .ensure_source_for_conversation(&conversation)
22634            .unwrap();
22635        assert!(storage.get_source("cache-remote-source").unwrap().is_some());
22636
22637        let deleted = storage.delete_source("cache-remote-source", false).unwrap();
22638        assert!(deleted);
22639        assert!(storage.get_source("cache-remote-source").unwrap().is_none());
22640
22641        storage
22642            .ensure_source_for_conversation(&conversation)
22643            .unwrap();
22644        let recreated = storage.get_source("cache-remote-source").unwrap();
22645        assert!(recreated.is_some());
22646        assert_eq!(
22647            recreated.unwrap().host_label.as_deref(),
22648            Some("builder-cache")
22649        );
22650    }
22651
22652    #[test]
22653    fn delete_source_removes_entry() {
22654        let dir = TempDir::new().unwrap();
22655        let db_path = dir.path().join("test.db");
22656        let storage = SqliteStorage::open(&db_path).unwrap();
22657
22658        let source = Source {
22659            id: "to-delete".into(),
22660            kind: SourceKind::Local,
22661            host_label: None,
22662            machine_id: None,
22663            platform: None,
22664            config_json: None,
22665            created_at: Some(SqliteStorage::now_millis()),
22666            updated_at: None,
22667        };
22668        storage.upsert_source(&source).unwrap();
22669
22670        let deleted = storage.delete_source("to-delete", false).unwrap();
22671        assert!(deleted);
22672
22673        let fetched = storage.get_source("to-delete").unwrap();
22674        assert!(fetched.is_none());
22675    }
22676
22677    #[test]
22678    fn delete_source_cannot_delete_local() {
22679        let dir = TempDir::new().unwrap();
22680        let db_path = dir.path().join("test.db");
22681        let storage = SqliteStorage::open(&db_path).unwrap();
22682
22683        let result = storage.delete_source(LOCAL_SOURCE_ID, false);
22684        assert!(result.is_err());
22685    }
22686
22687    #[test]
22688    fn list_sources_includes_local() {
22689        let dir = TempDir::new().unwrap();
22690        let db_path = dir.path().join("test.db");
22691        let storage = SqliteStorage::open(&db_path).unwrap();
22692
22693        let sources = storage.list_sources().unwrap();
22694        assert!(sources.iter().any(|s| s.id == LOCAL_SOURCE_ID));
22695    }
22696
22697    #[test]
22698    fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
22699        let dir = TempDir::new().unwrap();
22700        let db_path = dir.path().join("test.db");
22701        let storage = SqliteStorage::open(&db_path).unwrap();
22702
22703        let agent_id = storage
22704            .ensure_agent(&Agent {
22705                id: None,
22706                slug: "codex".into(),
22707                name: "Codex".into(),
22708                version: None,
22709                kind: AgentKind::Cli,
22710            })
22711            .unwrap();
22712
22713        let conversation = Conversation {
22714            id: None,
22715            agent_slug: "codex".into(),
22716            workspace: None,
22717            external_id: Some("blank-local-source".into()),
22718            title: Some("Blank local source".into()),
22719            source_path: dir.path().join("blank-local.jsonl"),
22720            started_at: Some(1_700_000_000_000),
22721            ended_at: Some(1_700_000_000_001),
22722            approx_tokens: None,
22723            metadata_json: serde_json::Value::Null,
22724            messages: vec![Message {
22725                id: None,
22726                idx: 0,
22727                role: MessageRole::User,
22728                author: None,
22729                created_at: Some(1_700_000_000_000),
22730                content: "hello".into(),
22731                extra_json: serde_json::Value::Null,
22732                snippets: Vec::new(),
22733            }],
22734            source_id: "   ".into(),
22735            origin_host: None,
22736        };
22737
22738        storage
22739            .insert_conversation_tree(agent_id, None, &conversation)
22740            .unwrap();
22741
22742        assert!(storage.get_source("   ").unwrap().is_none());
22743        let source = storage
22744            .get_source(LOCAL_SOURCE_ID)
22745            .unwrap()
22746            .expect("local source row should exist");
22747        assert_eq!(source.kind, SourceKind::Local);
22748        assert_eq!(source.host_label, None);
22749
22750        let conversations = storage.list_conversations(10, 0).unwrap();
22751        assert_eq!(conversations.len(), 1);
22752        assert_eq!(conversations[0].source_id, LOCAL_SOURCE_ID);
22753        assert_eq!(conversations[0].origin_host, None);
22754    }
22755
22756    #[test]
22757    fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
22758        let dir = TempDir::new().unwrap();
22759        let db_path = dir.path().join("test.db");
22760        let storage = SqliteStorage::open(&db_path).unwrap();
22761
22762        let agent_id = storage
22763            .ensure_agent(&Agent {
22764                id: None,
22765                slug: "codex".into(),
22766                name: "Codex".into(),
22767                version: None,
22768                kind: AgentKind::Cli,
22769            })
22770            .unwrap();
22771
22772        let bootstrap_updated_at: i64 = storage
22773            .conn
22774            .query_row_map(
22775                "SELECT updated_at FROM sources WHERE id = ?1",
22776                fparams![LOCAL_SOURCE_ID],
22777                |row| row.get_typed(0),
22778            )
22779            .unwrap();
22780
22781        let make_conversation = |external_id: &str, suffix: &str| Conversation {
22782            id: None,
22783            agent_slug: "codex".into(),
22784            workspace: None,
22785            external_id: Some(external_id.into()),
22786            title: Some(format!("Local source {suffix}")),
22787            source_path: dir.path().join(format!("local-{suffix}.jsonl")),
22788            started_at: Some(1_700_000_000_000),
22789            ended_at: Some(1_700_000_000_001),
22790            approx_tokens: None,
22791            metadata_json: serde_json::Value::Null,
22792            messages: vec![Message {
22793                id: None,
22794                idx: 0,
22795                role: MessageRole::User,
22796                author: None,
22797                created_at: Some(1_700_000_000_000),
22798                content: format!("hello-{suffix}"),
22799                extra_json: serde_json::Value::Null,
22800                snippets: Vec::new(),
22801            }],
22802            source_id: LOCAL_SOURCE_ID.into(),
22803            origin_host: None,
22804        };
22805
22806        std::thread::sleep(std::time::Duration::from_millis(5));
22807        storage
22808            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-1", "one"))
22809            .unwrap();
22810        let after_first_insert: i64 = storage
22811            .conn
22812            .query_row_map(
22813                "SELECT updated_at FROM sources WHERE id = ?1",
22814                fparams![LOCAL_SOURCE_ID],
22815                |row| row.get_typed(0),
22816            )
22817            .unwrap();
22818
22819        std::thread::sleep(std::time::Duration::from_millis(5));
22820        storage
22821            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-2", "two"))
22822            .unwrap();
22823        let after_second_insert: i64 = storage
22824            .conn
22825            .query_row_map(
22826                "SELECT updated_at FROM sources WHERE id = ?1",
22827                fparams![LOCAL_SOURCE_ID],
22828                |row| row.get_typed(0),
22829            )
22830            .unwrap();
22831
22832        assert_eq!(after_first_insert, bootstrap_updated_at);
22833        assert_eq!(after_second_insert, bootstrap_updated_at);
22834    }
22835
22836    #[test]
22837    fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
22838        let dir = TempDir::new().unwrap();
22839        let db_path = dir.path().join("test.db");
22840        let storage = SqliteStorage::open(&db_path).unwrap();
22841
22842        let agent_id = storage
22843            .ensure_agent(&Agent {
22844                id: None,
22845                slug: "codex".into(),
22846                name: "Codex".into(),
22847                version: None,
22848                kind: AgentKind::Cli,
22849            })
22850            .unwrap();
22851
22852        let conversation = Conversation {
22853            id: None,
22854            agent_slug: "codex".into(),
22855            workspace: None,
22856            external_id: Some("blank-remote-source".into()),
22857            title: Some("Blank remote source".into()),
22858            source_path: dir.path().join("blank-remote.jsonl"),
22859            started_at: Some(1_700_000_000_000),
22860            ended_at: Some(1_700_000_000_001),
22861            approx_tokens: None,
22862            metadata_json: serde_json::Value::Null,
22863            messages: vec![Message {
22864                id: None,
22865                idx: 0,
22866                role: MessageRole::User,
22867                author: None,
22868                created_at: Some(1_700_000_000_000),
22869                content: "hello".into(),
22870                extra_json: serde_json::Value::Null,
22871                snippets: Vec::new(),
22872            }],
22873            source_id: "   ".into(),
22874            origin_host: Some("user@work-laptop".into()),
22875        };
22876
22877        storage
22878            .insert_conversation_tree(agent_id, None, &conversation)
22879            .unwrap();
22880
22881        assert!(storage.get_source("   ").unwrap().is_none());
22882        let source = storage
22883            .get_source("user@work-laptop")
22884            .unwrap()
22885            .expect("normalized remote source row should exist");
22886        assert_eq!(source.kind, SourceKind::Ssh);
22887        assert_eq!(source.host_label.as_deref(), Some("user@work-laptop"));
22888
22889        let conversations = storage.list_conversations(10, 0).unwrap();
22890        assert_eq!(conversations.len(), 1);
22891        assert_eq!(conversations[0].source_id, "user@work-laptop");
22892        assert_eq!(
22893            conversations[0].origin_host.as_deref(),
22894            Some("user@work-laptop")
22895        );
22896    }
22897
22898    #[test]
22899    fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
22900        let dir = TempDir::new().unwrap();
22901        let db_path = dir.path().join("test.db");
22902        let storage = SqliteStorage::open(&db_path).unwrap();
22903
22904        let agent_id = storage
22905            .ensure_agent(&Agent {
22906                id: None,
22907                slug: "codex".into(),
22908                name: "Codex".into(),
22909                version: None,
22910                kind: AgentKind::Cli,
22911            })
22912            .unwrap();
22913
22914        let conversation = Conversation {
22915            id: None,
22916            agent_slug: "codex".into(),
22917            workspace: None,
22918            external_id: Some("batched-blank-remote-source".into()),
22919            title: Some("Batched blank remote source".into()),
22920            source_path: dir.path().join("batched-blank-remote.jsonl"),
22921            started_at: Some(1_700_000_000_000),
22922            ended_at: Some(1_700_000_000_001),
22923            approx_tokens: None,
22924            metadata_json: serde_json::Value::Null,
22925            messages: vec![Message {
22926                id: None,
22927                idx: 0,
22928                role: MessageRole::User,
22929                author: None,
22930                created_at: Some(1_700_000_000_000),
22931                content: "hello".into(),
22932                extra_json: serde_json::Value::Null,
22933                snippets: Vec::new(),
22934            }],
22935            source_id: "   ".into(),
22936            origin_host: Some("user@batch-host".into()),
22937        };
22938
22939        storage
22940            .insert_conversations_batched(&[(agent_id, None, &conversation)])
22941            .unwrap();
22942
22943        assert!(storage.get_source("   ").unwrap().is_none());
22944        let source = storage
22945            .get_source("user@batch-host")
22946            .unwrap()
22947            .expect("normalized batched remote source row should exist");
22948        assert_eq!(source.kind, SourceKind::Ssh);
22949        assert_eq!(source.host_label.as_deref(), Some("user@batch-host"));
22950
22951        let conversations = storage.list_conversations(10, 0).unwrap();
22952        assert_eq!(conversations.len(), 1);
22953        assert_eq!(conversations[0].source_id, "user@batch-host");
22954        assert_eq!(
22955            conversations[0].origin_host.as_deref(),
22956            Some("user@batch-host")
22957        );
22958    }
22959
22960    #[test]
22961    fn get_source_ids_excludes_local() {
22962        let dir = TempDir::new().unwrap();
22963        let db_path = dir.path().join("test.db");
22964        let storage = SqliteStorage::open(&db_path).unwrap();
22965
22966        // Add a non-local source
22967        let source = Source {
22968            id: "remote-1".into(),
22969            kind: SourceKind::Ssh,
22970            host_label: Some("server".into()),
22971            machine_id: None,
22972            platform: None,
22973            config_json: None,
22974            created_at: Some(SqliteStorage::now_millis()),
22975            updated_at: None,
22976        };
22977        storage.upsert_source(&source).unwrap();
22978
22979        let ids = storage.get_source_ids().unwrap();
22980        assert!(!ids.contains(&LOCAL_SOURCE_ID.to_string()));
22981        assert!(ids.contains(&"remote-1".to_string()));
22982    }
22983
22984    // =========================================================================
22985    // Scan timestamp tests (bead yln.4)
22986    // =========================================================================
22987
22988    #[test]
22989    fn get_last_scan_ts_returns_none_initially() {
22990        let dir = TempDir::new().unwrap();
22991        let db_path = dir.path().join("test.db");
22992        let storage = SqliteStorage::open(&db_path).unwrap();
22993
22994        let ts = storage.get_last_scan_ts().unwrap();
22995        assert!(ts.is_none());
22996    }
22997
22998    #[test]
22999    fn set_and_get_last_scan_ts() {
23000        let dir = TempDir::new().unwrap();
23001        let db_path = dir.path().join("test.db");
23002        let storage = SqliteStorage::open(&db_path).unwrap();
23003
23004        let expected_ts = 1700000000000_i64;
23005        storage.set_last_scan_ts(expected_ts).unwrap();
23006
23007        let actual_ts = storage.get_last_scan_ts().unwrap();
23008        assert_eq!(actual_ts, Some(expected_ts));
23009    }
23010
23011    // =========================================================================
23012    // now_millis utility test (bead yln.4)
23013    // =========================================================================
23014
23015    #[test]
23016    fn now_millis_returns_reasonable_value() {
23017        let ts = SqliteStorage::now_millis();
23018        // Should be after Jan 1, 2020 (approx 1577836800000)
23019        assert!(ts > 1577836800000);
23020        // Should be before Jan 1, 2100 (approx 4102444800000)
23021        assert!(ts < 4102444800000);
23022    }
23023
23024    // =========================================================================
23025    // Binary Metadata Serialization Tests (Opt 3.1)
23026    // =========================================================================
23027
23028    #[test]
23029    fn msgpack_roundtrip_basic_object() {
23030        let value = serde_json::json!({
23031            "key": "value",
23032            "number": 42,
23033            "nested": { "inner": true }
23034        });
23035
23036        let bytes = serialize_json_to_msgpack(&value).expect("should serialize");
23037        let recovered = deserialize_msgpack_to_json(&bytes);
23038
23039        assert_eq!(value, recovered);
23040    }
23041
23042    #[test]
23043    fn msgpack_returns_none_for_null() {
23044        let value = serde_json::Value::Null;
23045        assert!(serialize_json_to_msgpack(&value).is_none());
23046    }
23047
23048    #[test]
23049    fn message_insert_stores_null_extra_json_as_sql_null() {
23050        let dir = TempDir::new().unwrap();
23051        let db_path = dir.path().join("test.db");
23052        let storage = SqliteStorage::open(&db_path).unwrap();
23053        let agent_id = storage
23054            .ensure_agent(&Agent {
23055                id: None,
23056                slug: "codex".into(),
23057                name: "Codex".into(),
23058                version: None,
23059                kind: AgentKind::Cli,
23060            })
23061            .unwrap();
23062        let conversation = Conversation {
23063            id: None,
23064            agent_slug: "codex".into(),
23065            workspace: None,
23066            external_id: Some("null-extra-json".into()),
23067            title: Some("Null extra_json".into()),
23068            source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
23069            started_at: Some(1_700_000_000_000),
23070            ended_at: Some(1_700_000_000_001),
23071            approx_tokens: None,
23072            metadata_json: serde_json::Value::Null,
23073            messages: vec![Message {
23074                id: None,
23075                idx: 0,
23076                role: MessageRole::User,
23077                author: None,
23078                created_at: Some(1_700_000_000_000),
23079                content: "null metadata message".into(),
23080                extra_json: serde_json::Value::Null,
23081                snippets: Vec::new(),
23082            }],
23083            source_id: LOCAL_SOURCE_ID.into(),
23084            origin_host: None,
23085        };
23086
23087        let conversation_id = storage
23088            .insert_conversation_tree(agent_id, None, &conversation)
23089            .unwrap()
23090            .conversation_id;
23091
23092        let (extra_json, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23093            .conn
23094            .query_row_map(
23095                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23096                fparams![conversation_id],
23097                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23098            )
23099            .unwrap();
23100        assert!(extra_json.is_none());
23101        assert!(extra_bin.is_none());
23102
23103        let stored = storage.fetch_messages(conversation_id).unwrap();
23104        assert!(stored[0].extra_json.is_null());
23105    }
23106
23107    #[test]
23108    fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
23109        let dir = TempDir::new().unwrap();
23110        let db_path = dir.path().join("test.db");
23111        let storage = SqliteStorage::open(&db_path).unwrap();
23112        let agent_id = storage
23113            .ensure_agent(&Agent {
23114                id: None,
23115                slug: "codex".into(),
23116                name: "Codex".into(),
23117                version: None,
23118                kind: AgentKind::Cli,
23119            })
23120            .unwrap();
23121        let extra_json = serde_json::json!({ "idx": 7, "kind": "profile" });
23122        let conversation = Conversation {
23123            id: None,
23124            agent_slug: "codex".into(),
23125            workspace: None,
23126            external_id: Some("msgpack-extra-json".into()),
23127            title: Some("MessagePack extra_json".into()),
23128            source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
23129            started_at: Some(1_700_000_000_000),
23130            ended_at: Some(1_700_000_000_001),
23131            approx_tokens: None,
23132            metadata_json: serde_json::Value::Null,
23133            messages: vec![Message {
23134                id: None,
23135                idx: 0,
23136                role: MessageRole::User,
23137                author: None,
23138                created_at: Some(1_700_000_000_000),
23139                content: "msgpack metadata message".into(),
23140                extra_json: extra_json.clone(),
23141                snippets: Vec::new(),
23142            }],
23143            source_id: LOCAL_SOURCE_ID.into(),
23144            origin_host: None,
23145        };
23146
23147        let conversation_id = storage
23148            .insert_conversation_tree(agent_id, None, &conversation)
23149            .unwrap()
23150            .conversation_id;
23151
23152        let (extra_json_text, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23153            .conn
23154            .query_row_map(
23155                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23156                fparams![conversation_id],
23157                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23158            )
23159            .unwrap();
23160        assert!(extra_json_text.is_none());
23161        assert!(extra_bin.is_some());
23162
23163        let stored = storage.fetch_messages(conversation_id).unwrap();
23164        assert_eq!(stored[0].extra_json, extra_json);
23165    }
23166
23167    #[test]
23168    fn conversation_insert_preserves_null_metadata_json_as_json_null() {
23169        let dir = TempDir::new().unwrap();
23170        let db_path = dir.path().join("test.db");
23171        let storage = SqliteStorage::open(&db_path).unwrap();
23172        let agent_id = storage
23173            .ensure_agent(&Agent {
23174                id: None,
23175                slug: "codex".into(),
23176                name: "Codex".into(),
23177                version: None,
23178                kind: AgentKind::Cli,
23179            })
23180            .unwrap();
23181        let conversation = Conversation {
23182            id: None,
23183            agent_slug: "codex".into(),
23184            workspace: None,
23185            external_id: Some("null-conversation-metadata".into()),
23186            title: Some("Null conversation metadata".into()),
23187            source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
23188            started_at: Some(1_700_000_000_000),
23189            ended_at: Some(1_700_000_000_001),
23190            approx_tokens: None,
23191            metadata_json: serde_json::Value::Null,
23192            messages: vec![Message {
23193                id: None,
23194                idx: 0,
23195                role: MessageRole::User,
23196                author: None,
23197                created_at: Some(1_700_000_000_000),
23198                content: "null conversation metadata message".into(),
23199                extra_json: serde_json::Value::Null,
23200                snippets: Vec::new(),
23201            }],
23202            source_id: LOCAL_SOURCE_ID.into(),
23203            origin_host: None,
23204        };
23205
23206        storage
23207            .insert_conversation_tree(agent_id, None, &conversation)
23208            .unwrap();
23209
23210        let (metadata_json, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23211            .conn
23212            .query_row_map(
23213                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23214                fparams!["null-conversation-metadata"],
23215                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23216            )
23217            .unwrap();
23218        assert_eq!(metadata_json.as_deref(), Some("null"));
23219        assert!(metadata_bin.is_none());
23220
23221        let listed = storage.list_conversations(10, 0).unwrap();
23222        assert!(listed[0].metadata_json.is_null());
23223    }
23224
23225    #[test]
23226    fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
23227        let dir = TempDir::new().unwrap();
23228        let db_path = dir.path().join("test.db");
23229        let storage = SqliteStorage::open(&db_path).unwrap();
23230        let agent_id = storage
23231            .ensure_agent(&Agent {
23232                id: None,
23233                slug: "codex".into(),
23234                name: "Codex".into(),
23235                version: None,
23236                kind: AgentKind::Cli,
23237            })
23238            .unwrap();
23239        let metadata_json = serde_json::json!({ "bench": true, "source": "profile" });
23240        let conversation = Conversation {
23241            id: None,
23242            agent_slug: "codex".into(),
23243            workspace: None,
23244            external_id: Some("msgpack-conversation-metadata".into()),
23245            title: Some("MessagePack conversation metadata".into()),
23246            source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
23247            started_at: Some(1_700_000_000_000),
23248            ended_at: Some(1_700_000_000_001),
23249            approx_tokens: None,
23250            metadata_json: metadata_json.clone(),
23251            messages: vec![Message {
23252                id: None,
23253                idx: 0,
23254                role: MessageRole::User,
23255                author: None,
23256                created_at: Some(1_700_000_000_000),
23257                content: "msgpack conversation metadata message".into(),
23258                extra_json: serde_json::Value::Null,
23259                snippets: Vec::new(),
23260            }],
23261            source_id: LOCAL_SOURCE_ID.into(),
23262            origin_host: None,
23263        };
23264
23265        storage
23266            .insert_conversation_tree(agent_id, None, &conversation)
23267            .unwrap();
23268
23269        let (metadata_text, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23270            .conn
23271            .query_row_map(
23272                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23273                fparams!["msgpack-conversation-metadata"],
23274                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23275            )
23276            .unwrap();
23277        assert!(metadata_text.is_none());
23278        assert!(metadata_bin.is_some());
23279
23280        let listed = storage.list_conversations(10, 0).unwrap();
23281        assert_eq!(listed[0].metadata_json, metadata_json);
23282    }
23283
23284    #[test]
23285    fn msgpack_returns_none_for_empty_object() {
23286        let value = serde_json::json!({});
23287        assert!(serialize_json_to_msgpack(&value).is_none());
23288    }
23289
23290    #[test]
23291    fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
23292        let raw = format!("{{\"blob\":\"{}\"}}", "x".repeat(1_000_000));
23293
23294        let value = parse_historical_json_column(Some(raw.clone()));
23295
23296        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23297        assert_eq!(json_value_size_hint(&value), raw.len());
23298    }
23299
23300    #[test]
23301    fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
23302        let raw = String::from("{\"ok\":true,\"n\":1}");
23303
23304        let value = parse_historical_json_column(Some(raw.clone()));
23305
23306        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23307    }
23308
23309    #[test]
23310    fn msgpack_serializes_non_empty_array() {
23311        let value = serde_json::json!([1, 2, 3]);
23312        let bytes = serialize_json_to_msgpack(&value).expect("should serialize array");
23313        let recovered = deserialize_msgpack_to_json(&bytes);
23314        assert_eq!(value, recovered);
23315    }
23316
23317    #[test]
23318    fn msgpack_smaller_than_json() {
23319        let value = serde_json::json!({
23320            "field_name_one": "some_value",
23321            "field_name_two": 123456,
23322            "field_name_three": [1, 2, 3, 4, 5],
23323            "field_name_four": { "nested": true }
23324        });
23325
23326        let json_bytes = serde_json::to_vec(&value).unwrap();
23327        let msgpack_bytes = serialize_json_to_msgpack(&value).unwrap();
23328
23329        // MessagePack should be smaller due to more compact encoding
23330        assert!(
23331            msgpack_bytes.len() < json_bytes.len(),
23332            "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
23333            msgpack_bytes.len(),
23334            json_bytes.len()
23335        );
23336    }
23337
23338    #[test]
23339    fn migration_v7_adds_binary_columns() {
23340        let dir = TempDir::new().unwrap();
23341        let db_path = dir.path().join("test.db");
23342        let storage = SqliteStorage::open(&db_path).unwrap();
23343
23344        // Verify metadata_bin column exists
23345        let has_metadata_bin = storage
23346            .raw()
23347            .query("PRAGMA table_info(conversations)")
23348            .unwrap()
23349            .iter()
23350            .any(|row| row.get_typed::<String>(1).unwrap() == "metadata_bin");
23351        assert!(
23352            has_metadata_bin,
23353            "conversations should have metadata_bin column"
23354        );
23355
23356        // Verify extra_bin column exists
23357        let has_extra_bin = storage
23358            .raw()
23359            .query("PRAGMA table_info(messages)")
23360            .unwrap()
23361            .iter()
23362            .any(|row| row.get_typed::<String>(1).unwrap() == "extra_bin");
23363        assert!(has_extra_bin, "messages should have extra_bin column");
23364    }
23365
23366    #[test]
23367    fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
23368        let dir = TempDir::new().unwrap();
23369        let db_path = dir.path().join("append-tail-state-cache.db");
23370        let storage = SqliteStorage::open(&db_path).unwrap();
23371        let agent_id = storage
23372            .ensure_agent(&Agent {
23373                id: None,
23374                slug: "codex".into(),
23375                name: "Codex".into(),
23376                version: None,
23377                kind: AgentKind::Cli,
23378            })
23379            .unwrap();
23380        let workspace = PathBuf::from("/ws/profiled-append-remote");
23381        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
23382
23383        let initial = make_profiled_append_remote_merge_conversation(11, 5);
23384        let insert_outcome = storage
23385            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
23386            .unwrap();
23387        let conversation_id = insert_outcome.conversation_id;
23388
23389        let initial_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23390            .raw()
23391            .query_row_map(
23392                "SELECT ended_at, last_message_idx, last_message_created_at
23393                 FROM conversation_tail_state
23394                 WHERE conversation_id = ?1",
23395                fparams![conversation_id],
23396                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23397            )
23398            .unwrap();
23399        assert_eq!(initial_tail, (Some(111_005), Some(4), Some(111_004)));
23400
23401        storage
23402            .raw()
23403            .execute_compat(
23404                "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
23405                fparams![111_999_i64, conversation_id],
23406            )
23407            .unwrap();
23408        storage
23409            .raw()
23410            .execute_compat(
23411                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
23412                fparams![conversation_id],
23413            )
23414            .unwrap();
23415
23416        let appended = make_profiled_append_remote_merge_conversation(11, 10);
23417        let append_outcome = storage
23418            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
23419            .unwrap();
23420        assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);
23421
23422        let final_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23423            .raw()
23424            .query_row_map(
23425                "SELECT ended_at, last_message_idx, last_message_created_at
23426                 FROM conversation_tail_state
23427                 WHERE conversation_id = ?1",
23428                fparams![conversation_id],
23429                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23430            )
23431            .unwrap();
23432        assert_eq!(final_tail, (Some(111_999), Some(9), Some(111_009)));
23433    }
23434
23435    #[test]
23436    fn msgpack_deserialize_empty_returns_default() {
23437        let recovered = deserialize_msgpack_to_json(&[]);
23438        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23439    }
23440
23441    #[test]
23442    fn msgpack_deserialize_garbage_returns_default() {
23443        // Use truncated msgpack data that will fail to parse
23444        // 0x85 indicates a fixmap with 5 elements, but we don't provide them
23445        let recovered = deserialize_msgpack_to_json(&[0x85]);
23446        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23447    }
23448
/// Feeds three raw buckets into a `StatsAggregator` and checks both the raw
/// entry count and the rolled-up output of `expand`.
#[test]
fn stats_aggregator_collects_and_expands() {
    let mut aggregator = StatsAggregator::new();
    assert!(aggregator.is_empty());

    // (agent, source, day, messages, chars); each record counts as one session.
    let raw_samples = [
        ("claude", "local", 100, 5, 500),
        ("codex", "local", 100, 3, 300),
        ("claude", "local", 101, 2, 200),
    ];
    for (agent, source, day, messages, chars) in raw_samples {
        aggregator.record(agent, source, day, messages, chars);
    }

    assert!(!aggregator.is_empty());
    assert_eq!(aggregator.raw_entry_count(), 3);

    // Each raw bucket fans out into four keys:
    //   (agent, source), (all, source), (agent, all), (all, all)
    // and keys shared by several raw buckets are summed.
    //
    // Day 100 (claude: 1 sess / 5 msgs / 500 chars, codex: 1 / 3 / 300):
    //   (claude, local), (claude, all) -> 1 sess, 5 msgs, 500 chars
    //   (codex, local),  (codex, all)  -> 1 sess, 3 msgs, 300 chars
    //   (all, local),    (all, all)    -> 2 sess, 8 msgs, 800 chars
    //   => 6 distinct keys
    //
    // Day 101 (claude only: 1 sess / 2 msgs / 200 chars):
    //   (claude, local), (all, local), (claude, all), (all, all)
    //   => 4 distinct keys
    //
    // 6 + 4 = 10 expanded entries overall.
    let expanded = aggregator.expand();
    assert_eq!(expanded.len(), 10);

    // Spot-check the grand total for day 100.
    let day100_total = expanded
        .iter()
        .find(|(day, agent, source, _)| *day == 100 && agent == "all" && source == "all")
        .unwrap();
    let delta = &day100_total.3;
    assert_eq!(delta.session_count_delta, 2);
    assert_eq!(delta.message_count_delta, 8);
    assert_eq!(delta.total_chars_delta, 800);
}
23504
23505    // =========================================================================
23506    // LazyFrankenDb tests (bd-1ueu)
23507    // =========================================================================
23508
/// Constructing a `LazyFrankenDb` must not open the underlying database.
#[test]
fn lazy_franken_db_not_open_before_get() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("lazy_test.db");

    // Materialize a real database at `path`; this handle stays alive until
    // the end of the test.
    let _seed_storage = SqliteStorage::open(&path).unwrap();

    let lazy_db = LazyFrankenDb::new(path);
    let already_open = lazy_db.is_open();
    assert!(!already_open, "LazyFrankenDb must not open on construction");
}
23523
/// The first `get` call must open the database and leave it open afterwards.
#[test]
fn lazy_franken_db_opens_on_first_get() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("lazy_test.db");

    // Create a real database at `path`, then release the creating handle
    // straight away.
    drop(SqliteStorage::open(&path).unwrap());

    let lazy_db = LazyFrankenDb::new(path);
    assert!(!lazy_db.is_open());

    let conn = lazy_db.get("test").expect("should open successfully");
    let count_sql = "SELECT COUNT(*) FROM conversations";
    let conversation_count: i64 = conn
        .query_row_map(count_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(conversation_count, 0);
    // Release the connection handle before inspecting the lazy wrapper.
    drop(conn);

    assert!(lazy_db.is_open(), "LazyFrankenDb must be open after get()");
}
23547
/// Two consecutive `get` calls on the same `LazyFrankenDb` must both yield a
/// usable connection, and a table created through the first must be visible
/// through the second.
#[test]
fn lazy_franken_db_reuses_connection() {
    let tmp = TempDir::new().unwrap();
    let path = tmp.path().join("lazy_test.db");
    drop(SqliteStorage::open(&path).unwrap());

    let lazy_db = LazyFrankenDb::new(path);

    // First access opens the database and creates a scratch table.
    let first = lazy_db.get("first").unwrap();
    first
        .execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
        .unwrap();
    // Release the first handle before asking for another one.
    drop(first);

    // Second access must still see the scratch table.
    let second = lazy_db.get("second").unwrap();
    let count_sql = "SELECT COUNT(*) FROM test_tbl";
    let row_count: i64 = second
        .query_row_map(count_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(row_count, 0);
    drop(second);
}
23575
/// `get` on a path with no database file must fail with `LazyDbError::NotFound`.
#[test]
fn lazy_franken_db_not_found_error() {
    let tmp = TempDir::new().unwrap();
    let missing_path = tmp.path().join("nonexistent.db");

    let lazy_db = LazyFrankenDb::new(missing_path);
    let outcome = lazy_db.get("test");
    assert!(outcome.is_err());

    let err = outcome.unwrap_err();
    let is_not_found = matches!(err, LazyDbError::NotFound(_));
    assert!(is_not_found, "should return NotFound for missing DB");
}
23589
/// `path()` must return the path the wrapper was constructed with.
#[test]
fn lazy_franken_db_path_accessor() {
    let configured = PathBuf::from("/tmp/test_lazy.db");
    let lazy_db = LazyFrankenDb::new(configured.clone());
    let reported = lazy_db.path();
    assert_eq!(reported, configured.as_path());
}
23596
23597    // =========================================================================
23598    // Pricing / cost estimation tests (bead z9fse.10)
23599    // =========================================================================
23600
/// Table-driven check of `sql_like_match` against `%` and `_` wildcards,
/// exact patterns, and differently-cased input.
#[test]
fn sql_like_match_basic_patterns() {
    // (input, pattern, expected match result)
    let cases = [
        // Trailing wildcard
        ("claude-opus-4-20250101", "claude-opus-4%", true),
        ("claude-opus-4", "claude-opus-4%", true),
        ("claude-sonnet-4", "claude-opus-4%", false),
        // Middle wildcard (gemini pattern)
        ("gemini-2.0-flash-001", "gemini-2%flash%", true),
        ("gemini-2-flash", "gemini-2%flash%", true),
        ("gemini-2-pro", "gemini-2%flash%", false),
        // Exact match
        ("hello", "hello", true),
        ("hello!", "hello", false),
        // Underscore wildcard
        ("gpt-4o", "gpt-4_", true),
        ("gpt-4oo", "gpt-4_", false),
        // Case insensitive
        ("Claude-Opus-4", "claude-opus-4%", true),
    ];

    for (input, pattern, expected) in cases {
        assert_eq!(
            sql_like_match(input, pattern),
            expected,
            "sql_like_match({input:?}, {pattern:?})"
        );
    }
}
23623
/// Known dates must map to their day offset from 2020-01-01, and malformed
/// input must be rejected.
#[test]
fn date_str_to_day_id_converts_correctly() {
    // (date, days after 2020-01-01)
    let known_dates = [("2025-10-01", 2100), ("2024-04-01", 1552)];
    for (date, expected_day_id) in known_dates {
        assert_eq!(date_str_to_day_id(date).unwrap(), expected_day_id);
    }

    let malformed = date_str_to_day_id("invalid");
    assert!(malformed.is_err());
}
23632
/// `lookup` must return the entry whose pattern matches the model name, and
/// `None` when no pattern matches.
#[test]
fn pricing_table_lookup_selects_matching_entry() {
    let effective_day = date_str_to_day_id("2025-10-01").unwrap();
    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();

    let opus_pricing = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: Some(1.5),
        cache_creation_cost_per_mtok: Some(18.75),
        effective_day_id: effective_day,
    };
    let sonnet_pricing = PricingEntry {
        model_pattern: "claude-sonnet-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 3.0,
        output_cost_per_mtok: 15.0,
        cache_read_cost_per_mtok: Some(0.3),
        cache_creation_cost_per_mtok: Some(3.75),
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![opus_pricing, sonnet_pricing],
    };

    // Each model name must resolve to the entry carrying its input price.
    let opus_input = table
        .lookup("claude-opus-4-20260101", lookup_day)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(opus_input, Some(15.0));

    let sonnet_input = table
        .lookup("claude-sonnet-4-latest", lookup_day)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(sonnet_input, Some(3.0));

    let unknown = table.lookup("unknown-model", lookup_day);
    assert!(unknown.is_none());
}
23670
/// With two entries for the same pattern, `lookup` must pick the one in
/// effect on the requested day and return `None` before any entry applies.
#[test]
fn pricing_table_lookup_respects_effective_date() {
    let effective_day_1 = date_str_to_day_id("2025-10-01").unwrap();
    let effective_day_2 = date_str_to_day_id("2026-01-01").unwrap();

    // Both entries share pattern and provider; only prices and start day differ.
    let opus_entry = |input_cost, output_cost, effective_day_id| PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: input_cost,
        output_cost_per_mtok: output_cost,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id,
    };
    let table = PricingTable {
        entries: vec![
            opus_entry(15.0, 75.0, effective_day_1),
            opus_entry(12.0, 60.0, effective_day_2),
        ],
    };

    // Before price drop
    let before_drop = date_str_to_day_id("2025-11-01").unwrap();
    let input_before = table
        .lookup("claude-opus-4", before_drop)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(input_before, Some(15.0));

    // After price drop
    let after_drop = date_str_to_day_id("2026-02-01").unwrap();
    let input_after = table
        .lookup("claude-opus-4", after_drop)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(input_after, Some(12.0));

    // Before all pricing
    let before_any_pricing = date_str_to_day_id("2024-01-01").unwrap();
    assert!(table.lookup("claude-opus-4", before_any_pricing).is_none());
}
23715
/// When several patterns match a model on the same effective day, the longer
/// pattern must win; models matched only by the short pattern must still
/// resolve to it.
#[test]
fn pricing_table_lookup_specificity_tiebreak() {
    let effective_day = date_str_to_day_id("2025-01-01").unwrap();
    let lookup_day = date_str_to_day_id("2026-01-01").unwrap();

    let openai_entry = |pattern: &str, input_cost, output_cost| PricingEntry {
        model_pattern: pattern.into(),
        provider: "openai".into(),
        input_cost_per_mtok: input_cost,
        output_cost_per_mtok: output_cost,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![
            openai_entry("gpt-4%", 10.0, 30.0),
            openai_entry("gpt-4-turbo%", 5.0, 15.0),
        ],
    };

    // Longer pattern wins for specific model
    let turbo_input = table
        .lookup("gpt-4-turbo-2025", lookup_day)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(turbo_input, Some(5.0));

    // Shorter pattern matches broader model
    let broad_input = table
        .lookup("gpt-4o", lookup_day)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(broad_input, Some(10.0));
}
23753
/// `compute_cost` with only input and output token counts must price each at
/// its per-million-token rate.
#[test]
fn pricing_table_compute_cost_basic() {
    let effective_day = date_str_to_day_id("2025-10-01").unwrap();
    let opus_pricing = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: Some(1.5),
        cache_creation_cost_per_mtok: Some(18.75),
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![opus_pricing],
    };

    let usage_day = date_str_to_day_id("2026-02-06").unwrap();
    let cost = table.compute_cost(
        Some("claude-opus-4-latest"),
        usage_day,
        Some(1000),
        Some(500),
        None,
        None,
    );
    assert!(cost.is_some());

    // 1000 * 15.0 / 1M + 500 * 75.0 / 1M = 0.015 + 0.0375 = 0.0525
    let deviation = (cost.unwrap() - 0.0525).abs();
    assert!(deviation < 1e-10);
}
23781
/// `compute_cost` must bill cache-read and cache-creation tokens at their own
/// rates and only the remaining input tokens at the full input rate.
#[test]
fn pricing_table_compute_cost_with_cache() {
    let effective_day = date_str_to_day_id("2025-10-01").unwrap();
    let opus_pricing = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: Some(1.5),
        cache_creation_cost_per_mtok: Some(18.75),
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![opus_pricing],
    };

    let usage_day = date_str_to_day_id("2026-02-06").unwrap();
    let cost = table.compute_cost(
        Some("claude-opus-4-latest"),
        usage_day,
        Some(1_000_000),
        Some(100_000),
        Some(500_000),
        Some(200_000),
    );
    assert!(cost.is_some());

    // Cache tokens are excluded from the input count so they are not charged
    // twice (once at the input rate and once at the cache rate):
    //   non-cache input: 300K * 15    / 1M = 4.5
    //   output:          100K * 75    / 1M = 7.5
    //   cache_read:      500K * 1.5   / 1M = 0.75
    //   cache_creation:  200K * 18.75 / 1M = 3.75
    //   total                              = 16.5
    let deviation = (cost.unwrap() - 16.5).abs();
    assert!(deviation < 1e-10);
}
23813
/// `compute_cost` must yield `None` when the model is unmatched, when no
/// model name is given, and when no token counts are supplied.
#[test]
fn pricing_table_compute_cost_returns_none_for_unknown_model() {
    let effective_day = date_str_to_day_id("2025-10-01").unwrap();
    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
    let opus_pricing = PricingEntry {
        model_pattern: "claude-opus-4%".into(),
        provider: "anthropic".into(),
        input_cost_per_mtok: 15.0,
        output_cost_per_mtok: 75.0,
        cache_read_cost_per_mtok: None,
        cache_creation_cost_per_mtok: None,
        effective_day_id: effective_day,
    };
    let table = PricingTable {
        entries: vec![opus_pricing],
    };

    // Model name that matches no pattern.
    let unmatched_model = table.compute_cost(
        Some("unknown-model"),
        lookup_day,
        Some(1000),
        Some(500),
        None,
        None,
    );
    assert!(unmatched_model.is_none());

    // No model name at all.
    let missing_model = table.compute_cost(None, lookup_day, Some(1000), Some(500), None, None);
    assert!(missing_model.is_none());

    // Matching model but no token counts.
    let missing_tokens =
        table.compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None);
    assert!(missing_tokens.is_none());
}
23853
/// A freshly opened database must yield a non-empty pricing table in which
/// the opus and gemini flash models resolve to the expected input prices.
#[test]
fn pricing_table_load_from_db() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("test.db")).unwrap();

    let table = PricingTable::load(&storage.conn).unwrap();
    assert!(!table.is_empty());

    let lookup_day = date_str_to_day_id("2026-02-06").unwrap();

    let opus_input = table
        .lookup("claude-opus-4-latest", lookup_day)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(opus_input, Some(15.0));

    let flash_input = table
        .lookup("gemini-2.0-flash-001", lookup_day)
        .map(|entry| entry.input_cost_per_mtok);
    assert_eq!(flash_input, Some(0.075));
}
23873
/// `PricingTable::load` must fail with an "invalid effective_date" error when
/// a pricing row carries an unparseable date.
#[test]
fn pricing_table_load_rejects_invalid_effective_date() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("test.db")).unwrap();

    // Plant one row whose effective_date is not a date.
    let insert_sql = "INSERT INTO model_pricing (
                    model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)";
    storage
        .conn
        .execute_compat(
            insert_sql,
            fparams![
                "broken-model%",
                "test",
                1.0_f64,
                2.0_f64,
                Option::<f64>::None,
                Option::<f64>::None,
                "not-a-date"
            ],
        )
        .unwrap();

    let load_error = PricingTable::load(&storage.conn).unwrap_err();
    let message = load_error.to_string();
    assert!(message.contains("invalid effective_date"));
}
23902
/// `PricingDiagnostics` must count priced and unpriced records separately and
/// tally unpriced models by name, filing a missing name under "(none)".
#[test]
fn pricing_diagnostics_tracks_coverage() {
    let mut diag = PricingDiagnostics::default();

    for _ in 0..2 {
        diag.record_priced();
    }
    for model in [Some("custom-model-v1"), Some("custom-model-v1"), None] {
        diag.record_unpriced(model);
    }

    assert_eq!(diag.priced_count, 2);
    assert_eq!(diag.unpriced_count, 3);

    let unknown = &diag.unknown_models;
    assert_eq!(unknown.len(), 2);
    assert_eq!(unknown["custom-model-v1"], 2);
    assert_eq!(unknown["(none)"], 1);
}
23918
23919    // =========================================================================
23920    // FrankenStorage migration tests (bead 2j6p6)
23921    // =========================================================================
23922
23923    /// Helper: create a FrankenStorage wrapping an in-memory connection and
23924    /// run migrations. This exercises the same code path as `open()` but avoids
23925    /// frankensqlite's file-based autoindex renaming limitation (V5 uses
23926    /// ALTER TABLE RENAME which triggers sqlite_autoindex lookup issues on
23927    /// file-based pagers).
23928    fn franken_storage_in_memory() -> FrankenStorage {
23929        let conn = FrankenConnection::open(":memory:").unwrap();
23930        let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));
23931        storage.run_migrations().unwrap();
23932        storage.apply_config().unwrap();
23933        storage
23934    }
23935
/// Verifies that a freshly migrated in-memory `FrankenStorage` reports the
/// current schema version, contains every expected table, and records the
/// V13 base schema plus each later migration in `_schema_migrations`.
#[test]
fn franken_migrations_create_all_tables() {
    let storage = franken_storage_in_memory();

    // Should be at CURRENT_SCHEMA_VERSION.
    assert_eq!(
        storage.schema_version().unwrap(),
        CURRENT_SCHEMA_VERSION,
        "fresh FrankenStorage should be at current schema version"
    );

    let rows = storage
        .raw()
        .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
        .unwrap();
    // Decode strictly: a row that cannot be read as a string fails the test
    // here instead of silently vanishing and later looking like a missing
    // table.
    let table_names: HashSet<String> = rows
        .iter()
        .map(|row| row.get_typed::<String>(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();

    const EXPECTED_TABLES: &[&str] = &[
        // Core tables from V1.
        "meta",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
        "tags",
        "conversation_tags",
        // V4 sources table.
        "sources",
        // V8 daily_stats table.
        "daily_stats",
        // V9 embedding_jobs table.
        "embedding_jobs",
        // V11 message_metrics, usage_hourly, usage_daily tables.
        "message_metrics",
        "usage_hourly",
        "usage_daily",
        // Tail-state cache and external-id lookup tables.
        "conversation_tail_state",
        "conversation_external_lookup",
        "conversation_external_tail_lookup",
    ];
    // Collect every absent table so one failure reports all of them.
    let missing: Vec<&str> = EXPECTED_TABLES
        .iter()
        .copied()
        .filter(|name| !table_names.contains(*name))
        .collect();
    assert!(missing.is_empty(), "missing tables: {missing:?}");

    // Fresh frankensqlite databases should record the combined V13 base
    // schema plus every additive post-V13 migration.
    let expected_versions: Vec<i64> = (13..=CURRENT_SCHEMA_VERSION).collect();

    let rows = storage
        .raw()
        .query("SELECT COUNT(*) FROM _schema_migrations;")
        .unwrap();
    let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
    assert_eq!(
        count,
        i64::try_from(expected_versions.len()).unwrap(),
        "_schema_migrations should record the V13 base schema and post-V13 migrations"
    );

    // The recorded versions should be exactly 13..=CURRENT_SCHEMA_VERSION,
    // in ascending order.
    let rows = storage
        .raw()
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, _>>()
        .unwrap();
    assert_eq!(
        versions, expected_versions,
        "_schema_migrations should contain v13 through current"
    );
}
24040
/// Running migrations again on an already-migrated connection must leave the
/// schema version unchanged.
#[test]
fn franken_migrations_idempotent() {
    let storage = franken_storage_in_memory();
    let version_before = storage.schema_version().unwrap();
    assert_eq!(version_before, CURRENT_SCHEMA_VERSION);

    // Second run on the same connection.
    storage.run_migrations().unwrap();
    let version_after = storage.schema_version().unwrap();
    assert_eq!(version_after, CURRENT_SCHEMA_VERSION);
}
24050
/// Rewinds a migrated database to schema version 19 with an empty
/// `conversation_external_tail_lookup` table, re-runs migrations, and checks
/// that the lookup row for an existing conversation is restored.
#[test]
fn migration_v20_backfills_conversation_external_tail_lookup() {
    let storage = franken_storage_in_memory();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace_path = PathBuf::from("/ws/profiled-storage-remote");
    let workspace_id = storage.ensure_workspace(&workspace_path, None).unwrap();

    // Non-ASCII source and external ids exercise the lookup key as well.
    let mut conv = make_profiled_storage_remote_conversation(1919, 2);
    conv.source_id = "profiled-storage-remote-source-東京".into();
    conv.external_id = Some("profiled-storage-remote-☃-1919".into());
    let outcome = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
        .unwrap();
    let lookup_key = conversation_external_lookup_key(
        &conv.source_id,
        agent_id,
        conv.external_id.as_deref().unwrap(),
    );

    // Undo V20: empty the lookup table, forget the migration record, and
    // roll the meta schema version back to 19.
    for rewind_sql in [
        "DELETE FROM conversation_external_tail_lookup",
        "DELETE FROM _schema_migrations WHERE version = 20",
    ] {
        storage.raw().execute(rewind_sql).unwrap();
    }
    storage
        .raw()
        .execute_compat(
            "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
            fparams!["19"],
        )
        .unwrap();

    storage.run_migrations().unwrap();

    let expected = (
        outcome.conversation_id,
        conv.ended_at,
        Some(1),
        conv.messages[1].created_at,
    );
    let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
        .raw()
        .query_row_map(
            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
                 FROM conversation_external_tail_lookup
                 WHERE lookup_key = ?1",
            fparams![lookup_key.as_str()],
            |row| {
                let conversation_id = row.get_typed(0)?;
                let ended_at = row.get_typed(1)?;
                let last_message_idx = row.get_typed(2)?;
                let last_message_created_at = row.get_typed(3)?;
                Ok((
                    conversation_id,
                    ended_at,
                    last_message_idx,
                    last_message_created_at,
                ))
            },
        )
        .unwrap();
    assert_eq!(backfilled, expected);
    assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
}
24121
/// Applies the v15 tail-state cache migration to a minimal hand-built schema
/// and checks that it reports `true` once and `false` on a repeat, adds the
/// tail columns to `conversations`, leaves `conversation_tail_state` empty,
/// and records itself exactly once.
#[test]
fn migration_v15_creates_lazy_tail_state_cache() {
    let conn = FrankenConnection::open(":memory:").unwrap();

    // Minimal pre-v15 schema with a little data in it.
    conn.execute_batch(
        "CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 ended_at INTEGER
             );
             CREATE TABLE messages (
                 id INTEGER PRIMARY KEY,
                 conversation_id INTEGER NOT NULL,
                 idx INTEGER NOT NULL,
                 created_at INTEGER
             );
             INSERT INTO conversations(id, ended_at) VALUES
                 (1, 1710000000300),
                 (2, NULL);
             INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
                 (10, 1, 0, 1710000000100),
                 (11, 1, 1, 1710000000200),
                 (12, 2, 0, 1710000000400);",
    )
    .unwrap();

    // Bookkeeping table the migration records itself in.
    conn.execute(
        "CREATE TABLE _schema_migrations (
                version INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
             );",
    )
    .unwrap();

    let first_run = apply_conversation_tail_state_cache_migration(&conn).unwrap();
    assert!(first_run, "v15 migration should apply once");
    let second_run = apply_conversation_tail_state_cache_migration(&conn).unwrap();
    assert!(
        !second_run,
        "v15 migration should be idempotent once recorded"
    );

    let column_rows = conn.query("PRAGMA table_info(conversations);").unwrap();
    let column_names: HashSet<String> = column_rows
        .iter()
        .map(|row| row.get_typed(1))
        .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
        .unwrap();
    for added_column in ["last_message_idx", "last_message_created_at"] {
        assert!(column_names.contains(added_column));
    }

    // Reads the first column of the first row of a single-value query.
    let scalar_i64 = |sql: &str| {
        let value: i64 = conn
            .query(sql)
            .unwrap()
            .first()
            .unwrap()
            .get_typed(0)
            .unwrap();
        value
    };

    let tail_rows = scalar_i64("SELECT COUNT(*) FROM conversation_tail_state;");
    assert_eq!(
        tail_rows, 0,
        "v15 should create the cache without an open-time message scan"
    );

    let applied = scalar_i64("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;");
    assert_eq!(applied, 1);
}
24194
/// Running the token-column repair on a bare `conversations` table must add
/// every column listed in `REQUIRED_CONVERSATION_TOKEN_COLUMNS`, and running
/// it a second time must also succeed.
#[test]
fn schema_repair_adds_missing_conversations_token_columns() {
    let conn = FrankenConnection::open(":memory:").unwrap();
    conn.execute_batch(
        "CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 agent_id INTEGER NOT NULL,
                 source_path TEXT NOT NULL
             );",
    )
    .unwrap();
    let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));

    // Two passes: the first repairs, the second must not fail.
    for _ in 0..2 {
        storage.repair_missing_conversation_token_columns().unwrap();
    }

    let present = franken_table_column_names(&storage.conn, "conversations").unwrap();
    let required_names = REQUIRED_CONVERSATION_TOKEN_COLUMNS
        .iter()
        .map(|&(name, _)| name);
    for column_name in required_names {
        assert!(
            present.contains(column_name),
            "schema repair should add conversations.{column_name}"
        );
    }
}
24219
24220    #[test]
24221    fn franken_meta_schema_version_in_sync() {
24222        let storage = franken_storage_in_memory();
24223
24224        // meta.schema_version should be kept in sync.
24225        let rows = storage
24226            .raw()
24227            .query("SELECT value FROM meta WHERE key = 'schema_version';")
24228            .unwrap();
24229        let meta_version: String = rows.first().unwrap().get_typed(0).unwrap();
24230        assert_eq!(
24231            meta_version,
24232            CURRENT_SCHEMA_VERSION.to_string(),
24233            "meta.schema_version should match CURRENT_SCHEMA_VERSION"
24234        );
24235    }
24236
24237    #[test]
24238    fn franken_transition_from_meta_version() {
24239        let dir = TempDir::new().unwrap();
24240        let db_path = dir.path().join("test_transition.db");
24241
24242        // Simulate an existing database created by SqliteStorage at version 10.
24243        // We create just enough schema to test the transition.
24244        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24245        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
24246            .unwrap();
24247        conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
24248            .unwrap();
24249        // Create a dummy conversations table so transition doesn't think it's corrupted.
24250        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
24251            .unwrap();
24252        drop(conn);
24253
24254        // Now run the transition function.
24255        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24256        transition_from_meta_version(&conn).unwrap();
24257
24258        // _schema_migrations should exist with entries for versions 1..=10.
24259        let rows = conn
24260            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24261            .unwrap();
24262        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24263        assert_eq!(
24264            versions,
24265            (1..=10).collect::<Vec<i64>>(),
24266            "transition should backfill versions 1..=10"
24267        );
24268    }
24269
24270    #[test]
24271    fn franken_transition_from_current_meta_backfills_current_schema_marker() {
24272        let dir = TempDir::new().unwrap();
24273        let db_path = dir.path().join("test_current_transition.db");
24274
24275        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24276        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
24277            .unwrap();
24278        conn.execute_compat(
24279            "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
24280            &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
24281        )
24282        .unwrap();
24283        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
24284            .unwrap();
24285        drop(conn);
24286
24287        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24288        transition_from_meta_version(&conn).unwrap();
24289
24290        let rows = conn
24291            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24292            .unwrap();
24293        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24294        assert_eq!(
24295            versions,
24296            (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24297            "current meta schema marker should backfill every known migration"
24298        );
24299    }
24300
24301    #[test]
24302    fn franken_transition_skips_when_already_done() {
24303        let dir = TempDir::new().unwrap();
24304        let db_path = dir.path().join("test_transition_skip.db");
24305
24306        // Create a DB that already has _schema_migrations.
24307        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24308        conn.execute(
24309            "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
24310        ).unwrap();
24311        conn.execute("INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');")
24312            .unwrap();
24313
24314        // Transition should be a no-op.
24315        transition_from_meta_version(&conn).unwrap();
24316
24317        // Should still have exactly 1 entry.
24318        let rows = conn
24319            .query("SELECT COUNT(*) FROM _schema_migrations;")
24320            .unwrap();
24321        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
24322        assert_eq!(
24323            count, 1,
24324            "transition should not re-run on already-transitioned DB"
24325        );
24326    }
24327
24328    #[test]
24329    fn franken_transition_fresh_db_is_noop() {
24330        let dir = TempDir::new().unwrap();
24331        let db_path = dir.path().join("test_fresh_noop.db");
24332
24333        // Empty database — no meta table, no tables at all.
24334        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24335        transition_from_meta_version(&conn).unwrap();
24336
24337        // _schema_migrations should NOT have been created.
24338        let res = conn.query("SELECT * FROM \"_schema_migrations\";");
24339        assert!(
24340            res.is_err(),
24341            "transition should not create _schema_migrations on fresh DB"
24342        );
24343    }
24344
24345    #[test]
24346    fn franken_transition_with_fts_virtual_table_succeeds() {
24347        let dir = TempDir::new().unwrap();
24348        let db_path = dir.path().join("test_transition_with_fts.db");
24349
24350        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24351        conn.execute_batch(
24352            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
24353             INSERT INTO meta(key, value) VALUES('schema_version', '13');
24354             CREATE TABLE conversations (id INTEGER PRIMARY KEY);
24355             CREATE VIRTUAL TABLE fts_messages USING fts5(
24356                 content,
24357                 title,
24358                 agent,
24359                 workspace,
24360                 source_path,
24361                 created_at,
24362                 content='',
24363                 tokenize='porter unicode61'
24364             );",
24365        )
24366        .unwrap();
24367        drop(conn);
24368
24369        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24370        transition_from_meta_version(&conn).unwrap();
24371
24372        let rows = conn
24373            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24374            .unwrap();
24375        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24376        assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
24377    }
24378
24379    #[test]
24380    fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
24381        let dir = TempDir::new().unwrap();
24382        let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");
24383
24384        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24385        conn.execute_batch(
24386            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
24387             INSERT INTO meta(key, value) VALUES('schema_version', '13');
24388             CREATE TABLE agents (
24389                 id INTEGER PRIMARY KEY,
24390                 slug TEXT NOT NULL
24391             );
24392             CREATE TABLE workspaces (
24393                 id INTEGER PRIMARY KEY,
24394                 path TEXT NOT NULL
24395             );
24396             CREATE TABLE sources (
24397                 id TEXT PRIMARY KEY,
24398                 kind TEXT NOT NULL,
24399                 host_label TEXT,
24400                 machine_id TEXT,
24401                 platform TEXT,
24402                 config_json TEXT,
24403                 created_at INTEGER NOT NULL,
24404                 updated_at INTEGER NOT NULL
24405             );
24406             CREATE TABLE conversations (
24407                 id INTEGER PRIMARY KEY,
24408                 agent_id INTEGER NOT NULL,
24409                 workspace_id INTEGER,
24410                 source_id TEXT NOT NULL DEFAULT 'local',
24411                 external_id TEXT,
24412                 title TEXT,
24413                 source_path TEXT NOT NULL,
24414                 started_at INTEGER,
24415                 ended_at INTEGER
24416             );
24417             CREATE TABLE messages (
24418                 id INTEGER PRIMARY KEY,
24419                 conversation_id INTEGER NOT NULL,
24420                 idx INTEGER NOT NULL,
24421                 role TEXT NOT NULL,
24422                 author TEXT,
24423                 created_at INTEGER,
24424                 content TEXT NOT NULL,
24425                 extra_json TEXT,
24426                 extra_bin BLOB
24427             );
24428             INSERT INTO agents(id, slug) VALUES (1, 'codex');
24429             INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
24430             INSERT INTO sources(id, kind, host_label, created_at, updated_at)
24431             VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
24432             INSERT INTO conversations(
24433                 id,
24434                 agent_id,
24435                 workspace_id,
24436                 source_id,
24437                 external_id,
24438                 title,
24439                 source_path,
24440                 started_at
24441             )
24442             VALUES (
24443                 1,
24444                 1,
24445                 1,
24446                 'local',
24447                 'legacy-session',
24448                 'legacy session',
24449                 '/tmp/legacy.jsonl',
24450                 1710000000000
24451             );
24452             INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
24453             VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
24454             CREATE VIRTUAL TABLE fts_messages USING fts5(
24455                 content,
24456                 title,
24457                 agent,
24458                 workspace,
24459                 source_path,
24460                 created_at,
24461                 message_id,
24462                 content='',
24463                 tokenize='porter unicode61'
24464             );",
24465        )
24466        .unwrap();
24467        drop(conn);
24468
24469        let storage = FrankenStorage::open(&db_path).unwrap();
24470        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24471
24472        let rows = storage
24473            .raw()
24474            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24475            .unwrap();
24476        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24477        assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
24478    }
24479
24480    #[test]
24481    fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
24482        let dir = TempDir::new().unwrap();
24483        let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");
24484
24485        let storage = FrankenStorage::open(&db_path).unwrap();
24486        let agent = Agent {
24487            id: None,
24488            slug: "codex".into(),
24489            name: "Codex".into(),
24490            version: None,
24491            kind: AgentKind::Cli,
24492        };
24493        let agent_id = storage.ensure_agent(&agent).unwrap();
24494        let conversation = Conversation {
24495            id: None,
24496            agent_slug: "codex".into(),
24497            workspace: Some(PathBuf::from("/tmp/workspace")),
24498            external_id: Some("dup-fts-schema".into()),
24499            title: Some("Duplicate FTS schema".into()),
24500            source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
24501            started_at: Some(1_700_000_000_000),
24502            ended_at: Some(1_700_000_000_100),
24503            approx_tokens: Some(42),
24504            metadata_json: serde_json::Value::Null,
24505            messages: vec![Message {
24506                id: None,
24507                idx: 0,
24508                role: MessageRole::User,
24509                author: Some("user".into()),
24510                created_at: Some(1_700_000_000_050),
24511                content: "message that should remain queryable".into(),
24512                extra_json: serde_json::Value::Null,
24513                snippets: Vec::new(),
24514            }],
24515            source_id: LOCAL_SOURCE_ID.into(),
24516            origin_host: None,
24517        };
24518        storage
24519            .insert_conversation_tree(agent_id, None, &conversation)
24520            .unwrap();
24521        drop(storage);
24522        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
24523
24524        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
24525        let conn = rusqlite_test_fixture_conn(&db_path);
24526        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
24527        conn.execute(
24528            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
24529             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
24530            [duplicate_legacy_fts_sql],
24531        )
24532        .unwrap();
24533        conn.execute(
24534            "DELETE FROM meta WHERE key = ?1",
24535            [FTS_FRANKEN_REBUILD_META_KEY],
24536        )
24537        .unwrap();
24538        // Simulate a pre-fix upgraded database that has never gone through the
24539        // authoritative frankensqlite FTS rebuild generation yet.
24540        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
24541
24542        let duplicate_rows: i64 = conn
24543            .query_row(
24544                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
24545                [],
24546                |row| row.get(0),
24547            )
24548            .unwrap();
24549        assert_eq!(duplicate_rows, 2);
24550        drop(conn);
24551
24552        let reopened = FrankenStorage::open(&db_path).unwrap();
24553        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24554        let generation_rows: Vec<String> = reopened
24555            .raw()
24556            .query_map_collect(
24557                "SELECT value FROM meta WHERE key = ?1",
24558                fparams![FTS_FRANKEN_REBUILD_META_KEY],
24559                |row| row.get_typed(0),
24560            )
24561            .unwrap();
24562        assert_eq!(
24563            generation_rows.len(),
24564            0,
24565            "canonical open should not eagerly rewrite FTS repair metadata"
24566        );
24567        reopened.ensure_search_fallback_fts_consistency().unwrap();
24568        let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24569        assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);
24570
24571        let total_messages: i64 = reopened
24572            .raw()
24573            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
24574                row.get_typed(0)
24575            })
24576            .unwrap();
24577        let total_fts_rows: i64 = reopened
24578            .raw()
24579            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
24580                row.get_typed(0)
24581            })
24582            .unwrap();
24583        assert_eq!(total_fts_rows, total_messages);
24584    }
24585
24586    #[test]
24587    fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
24588        let dir = TempDir::new().unwrap();
24589        let db_path = dir.path().join("fresh-franken-storage-open.db");
24590
24591        let storage = FrankenStorage::open(&db_path).unwrap();
24592        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24593
24594        // The FTS5 virtual table is no longer created eagerly by the
24595        // migration runner (V14 drops the old internal-content table and the
24596        // current contentless table is recreated lazily — see MIGRATION_V14).
24597        // Invoke the repair path to match normal cass startup, then assert
24598        // there is exactly one fts_messages entry in sqlite_schema (no
24599        // duplicates).
24600        storage
24601            .ensure_search_fallback_fts_consistency()
24602            .expect("ensure FTS consistency after fresh open");
24603        drop(storage);
24604
24605        let c_reader = FrankenConnection::open(db_path.to_string_lossy().into_owned())
24606            .expect("open DB via frankensqlite for sqlite_master inspection");
24607        assert_eq!(
24608            franken_fts_schema_rows(&c_reader).unwrap(),
24609            1,
24610            "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
24611        );
24612        drop(c_reader);
24613
24614        let storage = FrankenStorage::open(&db_path).unwrap();
24615        assert!(
24616            storage
24617                .raw()
24618                .query("SELECT rowid FROM fts_messages LIMIT 1")
24619                .is_ok(),
24620            "fts_messages must be queryable through frankensqlite after open"
24621        );
24622    }
24623
24624    #[test]
24625    fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
24626        let dir = TempDir::new().unwrap();
24627        let db_path = dir.path().join("test_repair_missing_analytics.db");
24628
24629        {
24630            let storage = FrankenStorage::open(&db_path).unwrap();
24631            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24632        }
24633
24634        {
24635            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24636            for table in &[
24637                "usage_models_daily",
24638                "usage_daily",
24639                "usage_hourly",
24640                "message_metrics",
24641                "token_daily_stats",
24642                "token_usage",
24643                "model_pricing",
24644                "embedding_jobs",
24645                "daily_stats",
24646            ] {
24647                conn.execute(&format!("DROP TABLE IF EXISTS {table}"))
24648                    .unwrap();
24649            }
24650            conn.execute_compat(
24651                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
24652                &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
24653            )
24654            .unwrap();
24655        }
24656
24657        let repaired = FrankenStorage::open(&db_path).unwrap();
24658        assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24659
24660        let analytics_count: i64 = repaired
24661            .raw()
24662            .query_row_map(
24663                "SELECT COUNT(*) FROM sqlite_master
24664                 WHERE type='table'
24665                   AND name IN (
24666                     'daily_stats',
24667                     'embedding_jobs',
24668                     'token_usage',
24669                     'token_daily_stats',
24670                     'model_pricing',
24671                     'message_metrics',
24672                     'usage_hourly',
24673                     'usage_daily',
24674                     'usage_models_daily'
24675                   )",
24676                &[],
24677                |row| row.get_typed(0),
24678            )
24679            .unwrap();
24680        assert_eq!(
24681            analytics_count, 9,
24682            "open() should recreate the missing analytics tables even when schema_version already says current"
24683        );
24684    }
24685
24686    #[test]
24687    fn current_schema_repair_batches_cover_every_required_probe() {
24688        let missing_tables: Vec<&'static str> = REQUIRED_CURRENT_SCHEMA_TABLE_PROBES
24689            .iter()
24690            .map(|(table_name, _)| *table_name)
24691            .collect();
24692
24693        let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();
24694        let covered_tables: HashSet<&'static str> = batches
24695            .iter()
24696            .flat_map(|batch| batch.tables.iter().copied())
24697            .collect();
24698
24699        for table_name in missing_tables {
24700            assert!(
24701                covered_tables.contains(table_name),
24702                "missing repair coverage for {table_name}"
24703            );
24704        }
24705    }
24706
24707    #[test]
24708    fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
24709        for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
24710            assert!(
24711                !batch.sql.contains("CREATE TABLE IF NOT EXISTS meta"),
24712                "repair batch {} should not recreate meta",
24713                batch.name
24714            );
24715            assert!(
24716                !batch.sql.contains("CREATE TABLE IF NOT EXISTS agents"),
24717                "repair batch {} should not recreate agents",
24718                batch.name
24719            );
24720            assert!(
24721                !batch.sql.contains("CREATE TABLE IF NOT EXISTS workspaces"),
24722                "repair batch {} should not recreate workspaces",
24723                batch.name
24724            );
24725            assert!(
24726                !batch
24727                    .sql
24728                    .contains("CREATE TABLE IF NOT EXISTS conversations"),
24729                "repair batch {} should not recreate conversations",
24730                batch.name
24731            );
24732            assert!(
24733                !batch.sql.contains("CREATE TABLE IF NOT EXISTS messages"),
24734                "repair batch {} should not recreate messages",
24735                batch.name
24736            );
24737            assert!(
24738                !batch.sql.contains("CREATE TABLE IF NOT EXISTS snippets"),
24739                "repair batch {} should not recreate snippets",
24740                batch.name
24741            );
24742            assert!(
24743                !batch.sql.contains("CREATE VIRTUAL TABLE fts_messages"),
24744                "repair batch {} should not recreate FTS tables",
24745                batch.name
24746            );
24747            assert!(
24748                !batch.sql.contains("DROP TABLE"),
24749                "repair batch {} should never drop tables",
24750                batch.name
24751            );
24752        }
24753    }
24754
24755    #[test]
24756    fn build_cass_migrations_applies_combined_v13() {
24757        let conn = FrankenConnection::open(":memory:").unwrap();
24758        let base_result = build_cass_migrations_before_tail_cache()
24759            .run(&conn)
24760            .unwrap();
24761        assert!(apply_conversation_tail_state_cache_migration(&conn).unwrap());
24762        let post_result = build_cass_migrations_after_tail_cache().run(&conn).unwrap();
24763
24764        assert!(base_result.was_fresh);
24765        let mut applied = base_result.applied;
24766        applied.push(15);
24767        applied.extend(post_result.applied);
24768        assert_eq!(
24769            applied,
24770            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24771            "should apply combined V13 plus additive post-V13 migrations"
24772        );
24773        let current: i64 = conn
24774            .query("SELECT MAX(version) FROM _schema_migrations;")
24775            .unwrap()
24776            .first()
24777            .unwrap()
24778            .get_typed(0)
24779            .unwrap();
24780        assert_eq!(current, CURRENT_SCHEMA_VERSION);
24781    }
24782
24783    #[test]
24784    fn franken_insert_conversations_batched_populates_analytics_rollups() {
24785        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
24786        use frankensqlite::compat::{ConnectionExt, RowExt};
24787        use std::path::PathBuf;
24788
24789        let dir = TempDir::new().unwrap();
24790        let db_path = dir.path().join("franken-index.db");
24791        let storage = FrankenStorage::open(&db_path).unwrap();
24792
24793        let agent = Agent {
24794            id: None,
24795            slug: "claude_code".into(),
24796            name: "Claude Code".into(),
24797            version: Some("1.0".into()),
24798            kind: AgentKind::Cli,
24799        };
24800        let agent_id = storage.ensure_agent(&agent).unwrap();
24801
24802        let ts_ms = 1_770_551_400_000_i64;
24803        let usage_json = serde_json::json!({
24804            "message": {
24805                "model": "claude-opus-4-6",
24806                "usage": {
24807                    "input_tokens": 100,
24808                    "output_tokens": 50,
24809                    "cache_read_input_tokens": 25,
24810                    "cache_creation_input_tokens": 10,
24811                    "service_tier": "standard"
24812                }
24813            }
24814        });
24815
24816        let conv = Conversation {
24817            id: None,
24818            agent_slug: "claude_code".into(),
24819            workspace: Some(PathBuf::from("/tmp/workspace")),
24820            external_id: Some("franken-batch-upsert".into()),
24821            title: Some("Franken batch upsert".into()),
24822            source_path: PathBuf::from("/tmp/franken.jsonl"),
24823            started_at: Some(ts_ms),
24824            ended_at: Some(ts_ms + 60_000),
24825            approx_tokens: None,
24826            metadata_json: serde_json::Value::Null,
24827            messages: vec![
24828                Message {
24829                    id: None,
24830                    idx: 0,
24831                    role: MessageRole::User,
24832                    author: None,
24833                    created_at: Some(ts_ms),
24834                    content: "Please make a plan.".into(),
24835                    extra_json: serde_json::Value::Null,
24836                    snippets: vec![],
24837                },
24838                Message {
24839                    id: None,
24840                    idx: 1,
24841                    role: MessageRole::Agent,
24842                    author: None,
24843                    created_at: Some(ts_ms + 30_000),
24844                    content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
24845                    extra_json: usage_json,
24846                    snippets: vec![],
24847                },
24848            ],
24849            source_id: "local".into(),
24850            origin_host: None,
24851        };
24852
24853        let outcomes = storage
24854            .insert_conversations_batched(&[(agent_id, None, &conv)])
24855            .unwrap();
24856        assert_eq!(outcomes.len(), 1);
24857        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
24858
24859        let conn = storage.raw();
24860        let daily_stats_rows: i64 = conn
24861            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
24862                row.get_typed(0)
24863            })
24864            .unwrap();
24865        let token_daily_rows: i64 = conn
24866            .query_row_map(
24867                "SELECT COUNT(*) FROM token_daily_stats",
24868                fparams![],
24869                |row| row.get_typed(0),
24870            )
24871            .unwrap();
24872        let usage_daily_rows: i64 = conn
24873            .query_row_map("SELECT COUNT(*) FROM usage_daily", fparams![], |row| {
24874                row.get_typed(0)
24875            })
24876            .unwrap();
24877        let model_daily_rows: i64 = conn
24878            .query_row_map(
24879                "SELECT COUNT(*) FROM usage_models_daily",
24880                fparams![],
24881                |row| row.get_typed(0),
24882            )
24883            .unwrap();
24884
24885        assert!(daily_stats_rows > 0, "daily_stats should be populated");
24886        assert!(
24887            token_daily_rows > 0,
24888            "token_daily_stats should be populated"
24889        );
24890        assert!(usage_daily_rows > 0, "usage_daily should be populated");
24891        assert!(
24892            model_daily_rows > 0,
24893            "usage_models_daily should be populated"
24894        );
24895    }
24896
24897    // =========================================================================
24898    // FrankenConnectionManager tests (bead 3rlf8)
24899    // =========================================================================
24900
24901    #[test]
24902    fn connection_manager_creates_readers() {
24903        let dir = TempDir::new().unwrap();
24904        let db_path = dir.path().join("cm.db");
24905
24906        // Create the DB first
24907        let fs = FrankenStorage::open(&db_path).unwrap();
24908        drop(fs);
24909
24910        let config = ConnectionManagerConfig {
24911            reader_count: 3,
24912            max_writers: 2,
24913        };
24914        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
24915        assert_eq!(mgr.reader_count(), 3);
24916        assert_eq!(mgr.max_writers(), 2);
24917    }
24918
24919    #[test]
24920    fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
24921        let dir = TempDir::new().unwrap();
24922        let db_path = dir.path().join("cm.db");
24923
24924        let fs = FrankenStorage::open(&db_path).unwrap();
24925        drop(fs);
24926
24927        let mgr = std::sync::Arc::new(
24928            FrankenConnectionManager::new(
24929                &db_path,
24930                ConnectionManagerConfig {
24931                    reader_count: 0,
24932                    max_writers: 0,
24933                },
24934            )
24935            .unwrap(),
24936        );
24937        assert_eq!(mgr.reader_count(), 1);
24938        assert_eq!(mgr.max_writers(), 1);
24939
24940        let (tx, rx) = std::sync::mpsc::channel();
24941        let mgr_for_thread = std::sync::Arc::clone(&mgr);
24942        std::thread::spawn(move || {
24943            let result = mgr_for_thread.writer().map(|mut guard| {
24944                guard.mark_committed();
24945            });
24946            tx.send(result.is_ok()).expect("writer result send");
24947        });
24948
24949        assert!(
24950            rx.recv_timeout(Duration::from_secs(10)).unwrap(),
24951            "writer acquisition should not block forever when configured with zero writer slots"
24952        );
24953    }
24954
24955    #[test]
24956    fn connection_manager_reader_round_robin() {
24957        let dir = TempDir::new().unwrap();
24958        let db_path = dir.path().join("cm.db");
24959
24960        let fs = FrankenStorage::open(&db_path).unwrap();
24961        drop(fs);
24962
24963        let config = ConnectionManagerConfig {
24964            reader_count: 2,
24965            max_writers: 1,
24966        };
24967        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
24968
24969        // Reader index should advance (round-robin)
24970        let idx_before = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
24971        let _r1 = mgr.reader();
24972        let idx_after = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
24973        assert_eq!(idx_after, idx_before + 1, "reader index should advance");
24974    }
24975
24976    #[test]
24977    fn connection_manager_writer_reads_and_writes() {
24978        use frankensqlite::compat::RowExt;
24979
24980        let dir = TempDir::new().unwrap();
24981        let db_path = dir.path().join("cm.db");
24982
24983        let fs = FrankenStorage::open(&db_path).unwrap();
24984        drop(fs);
24985
24986        let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();
24987
24988        // Acquire writer and insert data
24989        {
24990            let mut guard = mgr.writer().unwrap();
24991            guard
24992                .storage()
24993                .raw()
24994                .execute("CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)")
24995                .unwrap();
24996            guard
24997                .storage()
24998                .raw()
24999                .execute("INSERT INTO cm_test (val) VALUES ('hello')")
25000                .unwrap();
25001            guard.mark_committed();
25002        }
25003
25004        // Verify via reader (returns MutexGuard<SendFrankenConnection>)
25005        let reader_guard = mgr.reader();
25006        let rows = reader_guard.query("SELECT val FROM cm_test").unwrap();
25007        assert_eq!(rows.len(), 1);
25008        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "hello");
25009    }
25010
25011    #[test]
25012    fn connection_manager_writer_guard_drops_releases_slot() {
25013        let dir = TempDir::new().unwrap();
25014        let db_path = dir.path().join("cm.db");
25015
25016        let fs = FrankenStorage::open(&db_path).unwrap();
25017        drop(fs);
25018
25019        let config = ConnectionManagerConfig {
25020            reader_count: 1,
25021            max_writers: 1,
25022        };
25023        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25024
25025        // Acquire and release writer
25026        {
25027            let mut guard = mgr.writer().unwrap();
25028            guard.mark_committed();
25029        }
25030
25031        // Should be able to acquire again (slot released)
25032        let mut guard2 = mgr.writer().unwrap();
25033        guard2.mark_committed();
25034    }
25035
25036    #[test]
25037    fn connection_manager_concurrent_writer_works() {
25038        use frankensqlite::compat::RowExt;
25039
25040        let dir = TempDir::new().unwrap();
25041        let db_path = dir.path().join("cm.db");
25042
25043        let fs = FrankenStorage::open(&db_path).unwrap();
25044        drop(fs);
25045
25046        let config = ConnectionManagerConfig {
25047            reader_count: 1,
25048            max_writers: 2,
25049        };
25050        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25051
25052        {
25053            let mut guard = mgr.concurrent_writer().unwrap();
25054            guard
25055                .storage()
25056                .raw()
25057                .execute("CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)")
25058                .unwrap();
25059            guard
25060                .storage()
25061                .raw()
25062                .execute("INSERT INTO cm_conc (val) VALUES ('concurrent')")
25063                .unwrap();
25064            guard.mark_committed();
25065        }
25066
25067        let reader_guard = mgr.reader();
25068        let rows = reader_guard.query("SELECT val FROM cm_conc").unwrap();
25069        assert_eq!(rows.len(), 1);
25070        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "concurrent");
25071    }
25072
25073    #[test]
25074    fn connection_manager_default_config() {
25075        let config = ConnectionManagerConfig::default();
25076        assert_eq!(config.reader_count, 4);
25077        assert!(config.max_writers > 0);
25078    }
25079
25080    #[test]
25081    fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
25082        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25083        use std::path::PathBuf;
25084
25085        fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
25086            let agent = Agent {
25087                id: None,
25088                slug: agent_slug.into(),
25089                name: agent_slug.into(),
25090                version: None,
25091                kind: AgentKind::Cli,
25092            };
25093            let agent_id = storage.ensure_agent(&agent).unwrap();
25094            let conversation = Conversation {
25095                id: None,
25096                agent_slug: agent_slug.into(),
25097                workspace: Some(PathBuf::from("/tmp/workspace")),
25098                external_id: Some(format!("{agent_slug}-{marker}")),
25099                title: Some(format!("{agent_slug} {marker}")),
25100                source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
25101                started_at: Some(1_700_000_000_000),
25102                ended_at: Some(1_700_000_000_100),
25103                approx_tokens: None,
25104                metadata_json: serde_json::Value::Null,
25105                messages: vec![
25106                    Message {
25107                        id: None,
25108                        idx: 0,
25109                        role: MessageRole::User,
25110                        author: Some("user".into()),
25111                        created_at: Some(1_700_000_000_010),
25112                        content: format!("{agent_slug} {marker} user"),
25113                        extra_json: serde_json::Value::Null,
25114                        snippets: Vec::new(),
25115                    },
25116                    Message {
25117                        id: None,
25118                        idx: 1,
25119                        role: MessageRole::Agent,
25120                        author: Some("assistant".into()),
25121                        created_at: Some(1_700_000_000_020),
25122                        content: format!("{agent_slug} {marker} assistant"),
25123                        extra_json: serde_json::Value::Null,
25124                        snippets: Vec::new(),
25125                    },
25126                ],
25127                source_id: LOCAL_SOURCE_ID.into(),
25128                origin_host: None,
25129            };
25130            storage
25131                .insert_conversation_tree(agent_id, None, &conversation)
25132                .unwrap();
25133        }
25134
25135        let dir = TempDir::new().unwrap();
25136        let db_path = dir.path().join("agent_search.db");
25137        let storage = FrankenStorage::open(&db_path).unwrap();
25138
25139        seed_conversation(&storage, "openclaw", "purge-target");
25140        seed_conversation(&storage, "codex", "keep-target");
25141
25142        let purge = storage.purge_agent_archive_data("openclaw").unwrap();
25143        assert_eq!(purge.conversations_deleted, 1);
25144        assert_eq!(purge.messages_deleted, 2);
25145
25146        storage.rebuild_fts().unwrap();
25147        storage.rebuild_analytics().unwrap();
25148        storage.rebuild_daily_stats().unwrap();
25149        storage.rebuild_token_daily_stats().unwrap();
25150
25151        let agents = storage.list_agents().unwrap();
25152        assert_eq!(agents.len(), 1);
25153        assert_eq!(agents[0].slug, "codex");
25154        assert_eq!(storage.total_conversation_count().unwrap(), 1);
25155        assert_eq!(storage.total_message_count().unwrap(), 2);
25156
25157        let fts_rows: i64 = storage
25158            .raw()
25159            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
25160                row.get_typed(0)
25161            })
25162            .unwrap();
25163        assert_eq!(fts_rows, 2);
25164
25165        let total_daily_sessions: i64 = storage
25166            .raw()
25167            .query_row_map(
25168                "SELECT COALESCE(SUM(session_count), 0)
25169                 FROM daily_stats
25170                 WHERE agent_slug = 'all' AND source_id = 'all'",
25171                fparams![],
25172                |row| row.get_typed(0),
25173            )
25174            .unwrap();
25175        assert_eq!(total_daily_sessions, 1);
25176
25177        let openclaw_token_rows: i64 = storage
25178            .raw()
25179            .query_row_map(
25180                "SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'",
25181                fparams![],
25182                |row| row.get_typed(0),
25183            )
25184            .unwrap();
25185        assert_eq!(openclaw_token_rows, 0);
25186    }
25187
25188    /// Regression for cass#202: a `Connection` dropped mid-transaction can
25189    /// leave child rows persisted without a matching parent. The next indexer
25190    /// pass then trips `FOREIGN KEY constraint failed` on every write, the
25191    /// session never gets marked indexed, and the pending backlog grows
25192    /// without bound. `cleanup_orphan_fk_rows` is the indexer-startup
25193    /// self-heal that breaks the cycle.
25194    #[test]
25195    fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
25196        let dir = TempDir::new().unwrap();
25197        let db_path = dir.path().join("orphan_fk_self_heal.db");
25198        let storage = FrankenStorage::open(&db_path).unwrap();
25199
25200        // Plant orphan rows directly: rows whose FK parent does not exist.
25201        // FK enforcement is temporarily off so the planted rows can land.
25202        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25203
25204        // Seed a real conversation so a subset of children DO have valid
25205        // parents — we want the cleanup to be precise, not a table-flush.
25206        storage
25207            .raw()
25208            .execute_compat(
25209                "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
25210                 VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
25211                fparams![],
25212            )
25213            .unwrap();
25214        storage
25215            .raw()
25216            .execute_compat(
25217                "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
25218                 VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
25219                fparams![],
25220            )
25221            .unwrap();
25222        storage
25223            .raw()
25224            .execute_compat(
25225                "INSERT INTO messages(id, conversation_id, idx, role, content) \
25226                 VALUES(1, 1, 0, 'user', 'real message')",
25227                fparams![],
25228            )
25229            .unwrap();
25230
25231        // Plant orphan messages referencing conversation_id=99999 (does not exist)
25232        // and conversation_id=0 (the specific shape reported in #202). Distinct
25233        // (conversation_id, idx) pairs are required by the UNIQUE constraint.
25234        for (mid, cid, idx) in [(101_i64, 99_999_i64, 0_i64), (102, 0, 0), (103, 0, 1)] {
25235            storage
25236                .raw()
25237                .execute_compat(
25238                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25239                     VALUES(?1, ?2, ?3, 'user', 'orphan message')",
25240                    fparams![mid, cid, idx],
25241                )
25242                .unwrap();
25243        }
25244
25245        // Rows below are not directly orphaned because their immediate
25246        // `messages` parent exists, but that parent is itself orphaned. The
25247        // cleanup deletes them explicitly before deleting orphan messages so the
25248        // FK cascade engine does not have to run one delete program per orphan.
25249        for message_id in [1_i64, 101_i64, 102_i64] {
25250            storage
25251                .raw()
25252                .execute_compat(
25253                    "INSERT INTO message_metrics(
25254                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25255                         role, content_chars, content_tokens_est
25256                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
25257                    fparams![message_id],
25258                )
25259                .unwrap();
25260            storage
25261                .raw()
25262                .execute_compat(
25263                    "INSERT INTO token_usage(
25264                         message_id, conversation_id, agent_id, timestamp_ms, day_id,
25265                         role, content_chars
25266                     ) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
25267                    fparams![message_id],
25268                )
25269                .unwrap();
25270        }
25271
25272        // Plant a directly-orphan snippet — message_id=99999 does not exist
25273        // anywhere, so this exercises the snippets DELETE path rather than
25274        // riding on the cascade from the orphan-message DELETE.
25275        storage
25276            .raw()
25277            .execute_compat(
25278                "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
25279                 VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
25280                fparams![],
25281            )
25282            .unwrap();
25283
25284        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25285
25286        // Sanity: the planted orphans are visible.
25287        let messages_before: i64 = storage
25288            .raw()
25289            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25290                row.get_typed(0)
25291            })
25292            .unwrap();
25293        assert_eq!(messages_before, 4); // 1 real + 3 orphans
25294        let snippets_before: i64 = storage
25295            .raw()
25296            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25297                row.get_typed(0)
25298            })
25299            .unwrap();
25300        assert_eq!(snippets_before, 1);
25301        let metrics_before: i64 = storage
25302            .raw()
25303            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25304                row.get_typed(0)
25305            })
25306            .unwrap();
25307        assert_eq!(metrics_before, 3);
25308        let token_usage_before: i64 = storage
25309            .raw()
25310            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25311                row.get_typed(0)
25312            })
25313            .unwrap();
25314        assert_eq!(token_usage_before, 3);
25315
25316        // Run the self-heal.
25317        let report = storage.cleanup_orphan_fk_rows().unwrap();
25318
25319        // 3 orphan messages + 1 directly-orphan snippet = 4 primary orphans
25320        // reported. Dependent message_metrics/token_usage rows for orphan
25321        // messages are pruned too, but they are not double-counted because the
25322        // orphan message is the root row that made them invalid.
25323        let messages_after: i64 = storage
25324            .raw()
25325            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25326                row.get_typed(0)
25327            })
25328            .unwrap();
25329        assert_eq!(messages_after, 1, "real message must be preserved");
25330        let snippets_after: i64 = storage
25331            .raw()
25332            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25333                row.get_typed(0)
25334            })
25335            .unwrap();
25336        assert_eq!(snippets_after, 0);
25337        let metrics_after: i64 = storage
25338            .raw()
25339            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25340                row.get_typed(0)
25341            })
25342            .unwrap();
25343        assert_eq!(metrics_after, 1, "real message metric must be preserved");
25344        let token_usage_after: i64 = storage
25345            .raw()
25346            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25347                row.get_typed(0)
25348            })
25349            .unwrap();
25350        assert_eq!(token_usage_after, 1, "real token row must be preserved");
25351
25352        assert_eq!(report.total, 4, "report total: {:?}", report);
25353        let messages_count = report
25354            .per_table
25355            .iter()
25356            .find(|(t, _)| *t == "messages")
25357            .map(|(_, c)| *c);
25358        assert_eq!(messages_count, Some(3));
25359        let snippets_count = report
25360            .per_table
25361            .iter()
25362            .find(|(t, _)| *t == "snippets")
25363            .map(|(_, c)| *c);
25364        assert_eq!(snippets_count, Some(1));
25365
25366        // Second invocation on a now-clean DB must be a no-op.
25367        let second = storage.cleanup_orphan_fk_rows().unwrap();
25368        assert_eq!(second.total, 0);
25369        assert!(second.per_table.is_empty());
25370    }
25371
25372    #[test]
25373    fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
25374        let dir = TempDir::new().unwrap();
25375        let db_path = dir.path().join("orphan_fk_chunked_self_heal.db");
25376        let storage = FrankenStorage::open(&db_path).unwrap();
25377        let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;
25378
25379        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25380        {
25381            let mut tx = storage.raw().transaction().unwrap();
25382            for idx in 0..orphan_count {
25383                let message_id = 10_000_i64 + i64::try_from(idx).unwrap();
25384                let conversation_id = 20_000_i64 + i64::try_from(idx).unwrap();
25385                tx.execute_compat(
25386                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25387                     VALUES(?1, ?2, 0, 'user', 'orphan message')",
25388                    fparams![message_id, conversation_id],
25389                )
25390                .unwrap();
25391                tx.execute_compat(
25392                    "INSERT INTO message_metrics(
25393                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25394                         role, content_chars, content_tokens_est
25395                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
25396                    fparams![message_id],
25397                )
25398                .unwrap();
25399            }
25400            tx.commit().unwrap();
25401        }
25402        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25403
25404        let report = storage.cleanup_orphan_fk_rows().unwrap();
25405
25406        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25407        let messages_count = report
25408            .per_table
25409            .iter()
25410            .find(|(table, _)| *table == "messages")
25411            .map(|(_, count)| *count);
25412        assert_eq!(messages_count, Some(i64::try_from(orphan_count).unwrap()));
25413        let messages_after: i64 = storage
25414            .raw()
25415            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25416                row.get_typed(0)
25417            })
25418            .unwrap();
25419        assert_eq!(messages_after, 0);
25420        let metrics_after: i64 = storage
25421            .raw()
25422            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25423                row.get_typed(0)
25424            })
25425            .unwrap();
25426        assert_eq!(metrics_after, 0);
25427    }
25428
25429    #[test]
25430    fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
25431        let dir = TempDir::new().unwrap();
25432        let db_path = dir.path().join("direct_orphan_fk_paged_self_heal.db");
25433        let storage = FrankenStorage::open(&db_path).unwrap();
25434        let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;
25435
25436        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25437        {
25438            let mut tx = storage.raw().transaction().unwrap();
25439            for idx in 0..orphan_count {
25440                let message_id = 50_000_i64 + i64::try_from(idx).unwrap();
25441                tx.execute_compat(
25442                    "INSERT INTO message_metrics(
25443                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25444                         role, content_chars, content_tokens_est
25445                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
25446                    fparams![message_id],
25447                )
25448                .unwrap();
25449            }
25450            tx.commit().unwrap();
25451        }
25452        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25453
25454        let report = storage.cleanup_orphan_fk_rows().unwrap();
25455
25456        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25457        let metrics_count = report
25458            .per_table
25459            .iter()
25460            .filter(|(table, _)| *table == "message_metrics")
25461            .map(|(_, count)| *count)
25462            .sum::<i64>();
25463        assert_eq!(metrics_count, i64::try_from(orphan_count).unwrap());
25464        assert_eq!(
25465            report
25466                .per_table
25467                .iter()
25468                .filter(|(table, _)| *table == "message_metrics")
25469                .count(),
25470            1,
25471            "paged cleanup should aggregate report entries by table: {report:?}"
25472        );
25473        let metrics_after: i64 = storage
25474            .raw()
25475            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25476                row.get_typed(0)
25477            })
25478            .unwrap();
25479        assert_eq!(metrics_after, 0);
25480    }
25481}