Skip to main content

coding_agent_search/storage/
sqlite.rs

1//! `SQLite` backend: schema, pragmas, and migrations.
2
3use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
4use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
5use anyhow::{Context, Result, anyhow, bail};
6use frankensqlite::{
7    Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
8    compat::{
9        ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
10        OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
11        Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
12        open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
13    },
14    migrate::MigrationRunner,
15};
16use serde::{Deserialize, Serialize};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::io::{BufRead, BufReader, Write};
22use std::process::{Command, Stdio};
23use std::sync::{
24    Arc,
25    atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
26};
27
/// Frankensqlite parameter list builder.
///
/// Expands to a `&[ParamValue]` slice. With no arguments it yields an empty
/// slice; otherwise every argument is converted with `ParamValue::from`.
/// A trailing comma is accepted.
macro_rules! fparams {
    // No arguments: an empty, explicitly typed parameter slice.
    () => {
        &[] as &[ParamValue]
    };
    // One or more comma-separated expressions, each converted via `ParamValue::from`.
    ($($val:expr),+ $(,)?) => {
        &[$(ParamValue::from($val)),+] as &[ParamValue]
    };
}
37use std::path::{Path, PathBuf};
38use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
39use thiserror::Error;
40use tracing::info;
41
/// How long `LazyFrankenDb::get` lets `acquire_doctor_mutation_db_open_guard`
/// wait for the doctor mutation lock before giving up (30 seconds).
const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
/// Upper bound (64 KiB) on how many bytes of the doctor lock file are read
/// when looking for its `pid=` entry.
const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
44
45// -------------------------------------------------------------------------
46// Lazy FrankenSQLite Connection (bd-1ueu)
47// -------------------------------------------------------------------------
48// Defers opening the database until first use, cutting startup cost for
49// commands that may not need the DB at all.  Thread-safe via parking_lot
50// Mutex; logs the reason and duration of the open on first access.
51
/// Error from lazy database initialization.
///
/// Returned by `LazyFrankenDb::get` and `LazyFrankenDb::get_with_timeout`.
#[derive(Debug, Error)]
pub enum LazyDbError {
    /// The database path did not exist when the first open was attempted.
    #[error("Database not found at {0}")]
    NotFound(PathBuf),
    /// Opening the connection failed. This variant is also used when the
    /// doctor mutation lock could not be acquired and when the timed open
    /// did not finish in time; in those cases `source` is a
    /// `FrankenError::Internal` carrying the description.
    #[error("Failed to open FrankenSQLite database at {path}: {source}")]
    FrankenOpenFailed {
        /// Path of the database that was being opened.
        path: PathBuf,
        /// Underlying frankensqlite error.
        source: frankensqlite::FrankenError,
    },
}
63
64// -------------------------------------------------------------------------
65// LazyFrankenDb — lazy wrapper around FrankenConnection
66// -------------------------------------------------------------------------
67
/// Wrapper around `FrankenConnection` that implements `Send`.
///
/// `FrankenConnection` is `!Send` because it uses `Rc` internally.
/// However, the `Rc` values are entirely self-contained within the Connection
/// and are not shared externally.  When wrapped in a `Mutex`,
/// exclusive access is guaranteed, making cross-thread transfer safe.
///
/// Tuple fields: `.0` is the connection itself; `.1` and `.2` hold the
/// index-writer checkpoint page count and busy timeout in milliseconds
/// (see `new_with_index_writer_state`). `new` fills both with their
/// `UNSET_*` sentinel constants.
pub struct SendFrankenConnection(FrankenConnection, i64, u64);
75
// SAFETY: Rc fields inside FrankenConnection are not cloned or shared externally.
// The Mutex<Option<SendFrankenConnection>> ensures exclusive access.
// NOTE(review): this argument relies on every user keeping the wrapper behind a
// mutex (or otherwise moving it between threads as a whole); the type system
// does not enforce that, so new call sites must uphold it manually.
unsafe impl Send for SendFrankenConnection {}
79
80impl SendFrankenConnection {
81    pub(crate) fn new(conn: FrankenConnection) -> Self {
82        Self(
83            conn,
84            UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
85            UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
86        )
87    }
88
89    pub(crate) fn new_with_index_writer_state(
90        conn: FrankenConnection,
91        checkpoint_pages: i64,
92        busy_timeout_ms: u64,
93    ) -> Self {
94        Self(conn, checkpoint_pages, busy_timeout_ms)
95    }
96
97    pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
98        (self.0, self.1, self.2)
99    }
100}
101
102impl std::ops::Deref for SendFrankenConnection {
103    type Target = FrankenConnection;
104    fn deref(&self) -> &FrankenConnection {
105        &self.0
106    }
107}
108
/// Lazy-opening wrapper for `FrankenConnection` (frankensqlite).
///
/// Constructing a `LazyFrankenDb` is cheap (no I/O).  The underlying
/// `FrankenConnection` is opened on the first call to [`LazyFrankenDb::get`]
/// (or [`LazyFrankenDb::get_with_timeout`]).
/// Subsequent calls return the cached connection.
pub struct LazyFrankenDb {
    /// Location of the database file; checked for existence on first open.
    path: PathBuf,
    /// `None` until the first successful open, then the cached connection.
    conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
118
/// RAII guard that dereferences to the inner `FrankenConnection`.
///
/// Holds the `LazyFrankenDb` mutex for as long as it is alive, so other
/// callers of `get`/`get_with_timeout` block until it is dropped.
/// Dereferencing panics if the guarded slot is still `None`.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);
121
122impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_tuple("LazyFrankenDbGuard")
125            .field(&self.0.is_some())
126            .finish()
127    }
128}
129
130impl std::ops::Deref for LazyFrankenDbGuard<'_> {
131    type Target = FrankenConnection;
132    fn deref(&self) -> &FrankenConnection {
133        self.0
134            .as_ref()
135            .expect("LazyFrankenDb connection must be initialized before access")
136    }
137}
138
139impl LazyFrankenDb {
140    /// Create a lazy handle pointing at `path`.  No I/O is performed.
141    pub fn new(path: PathBuf) -> Self {
142        Self {
143            path,
144            conn: parking_lot::Mutex::new(None),
145        }
146    }
147
148    /// Resolve path from optional CLI overrides.
149    ///
150    /// Uses `data_dir / agent_search.db` as fallback.
151    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
152        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
153        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
154        Self::new(path)
155    }
156
157    /// Get the connection, opening the database on first access.
158    ///
159    /// `reason` is logged alongside the open duration so callers can
160    /// identify which command triggered the open.
161    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
162        let mut guard = self.conn.lock();
163        if guard.is_none() {
164            if !self.path.exists() {
165                return Err(LazyDbError::NotFound(self.path.clone()));
166            }
167            let start = Instant::now();
168            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
169                &self.path,
170                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
171            )
172            .map_err(|err| LazyDbError::FrankenOpenFailed {
173                path: self.path.clone(),
174                source: frankensqlite::FrankenError::Internal(err.to_string()),
175            })?;
176            let conn =
177                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
178                    LazyDbError::FrankenOpenFailed {
179                        path: self.path.clone(),
180                        source: e,
181                    }
182                })?;
183            let elapsed_ms = start.elapsed().as_millis();
184            info!(
185                path = %self.path.display(),
186                elapsed_ms = elapsed_ms,
187                reason = reason,
188                "lazily opened FrankenSQLite database"
189            );
190            *guard = Some(SendFrankenConnection::new(conn));
191        }
192        Ok(LazyFrankenDbGuard(guard))
193    }
194
195    /// Get the connection with a timeout, opening the database on first access.
196    ///
197    /// Like [`get`] but spawns the open in a background thread and waits up to
198    /// `timeout` for it to complete. Returns `LazyDbError::FrankenOpenFailed`
199    /// with a descriptive message if the timeout elapses. Fix for #128.
200    pub fn get_with_timeout(
201        &self,
202        reason: &str,
203        timeout: Duration,
204    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
205        let mut guard = self.conn.lock();
206        if guard.is_none() {
207            if !self.path.exists() {
208                return Err(LazyDbError::NotFound(self.path.clone()));
209            }
210            let start = Instant::now();
211            let path_owned = self.path.to_string_lossy().into_owned();
212            let path_for_guard = self.path.clone();
213            let (tx, rx) = std::sync::mpsc::channel();
214            std::thread::spawn(move || {
215                let _doctor_guard =
216                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
217                        Ok(guard) => guard,
218                        Err(err) => {
219                            let _ = tx
220                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
221                            return;
222                        }
223                    };
224                let _ =
225                    tx.send(FrankenConnection::open(path_owned).map(SendFrankenConnection::new));
226            });
227            let conn = rx
228                .recv_timeout(timeout)
229                .map_err(|_| LazyDbError::FrankenOpenFailed {
230                    path: self.path.clone(),
231                    source: frankensqlite::FrankenError::Internal(format!(
232                        "database open timed out after {}s (possible corruption or lock contention)",
233                        timeout.as_secs()
234                    )),
235                })?
236                .map_err(|e| LazyDbError::FrankenOpenFailed {
237                    path: self.path.clone(),
238                    source: e,
239                })?;
240            let elapsed_ms = start.elapsed().as_millis();
241            info!(
242                path = %self.path.display(),
243                elapsed_ms = elapsed_ms,
244                reason = reason,
245                "lazily opened FrankenSQLite database (with timeout)"
246            );
247            *guard = Some(conn);
248        }
249        Ok(LazyFrankenDbGuard(guard))
250    }
251
252    /// Path to the database file (even if not yet opened).
253    pub fn path(&self) -> &Path {
254        &self.path
255    }
256
257    /// Whether the connection has been opened.
258    pub fn is_open(&self) -> bool {
259        self.conn.lock().is_some()
260    }
261}
262
/// Shared state advanced by `next_franken_retry_jitter_ms` on every call.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
/// Number of live `DoctorMutationDbOpenBypassGuard`s; any value above zero
/// makes `acquire_doctor_mutation_db_open_guard` skip the lock.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
/// Gate for the message-lookup counters below; they only advance while this is `true`.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Incremented by `record_message_lookup_exact_idx_probe`.
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
/// Incremented by `record_message_lookup_bounded_queries` (by its `query_count`).
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Incremented by `record_message_lookup_full_scan_query`.
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Row totals reported by the bounded and full-scan recorders.
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
270
/// Point-in-time copy of the global message-lookup trace counters, as
/// produced by `message_lookup_trace_snapshot`.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
    /// Value of `MESSAGE_LOOKUP_EXACT_IDX_PROBES`.
    pub exact_idx_probes: u64,
    /// Value of `MESSAGE_LOOKUP_BOUNDED_QUERIES`.
    pub bounded_lookup_queries: u64,
    /// Value of `MESSAGE_LOOKUP_FULL_SCAN_QUERIES`.
    pub full_scan_queries: u64,
    /// Value of `MESSAGE_LOOKUP_ROWS_MATERIALIZED`.
    pub rows_materialized: u64,
}
278
279impl MessageLookupTraceCounters {
280    pub(crate) fn saturating_sub(self, before: Self) -> Self {
281        Self {
282            exact_idx_probes: self
283                .exact_idx_probes
284                .saturating_sub(before.exact_idx_probes),
285            bounded_lookup_queries: self
286                .bounded_lookup_queries
287                .saturating_sub(before.bounded_lookup_queries),
288            full_scan_queries: self
289                .full_scan_queries
290                .saturating_sub(before.full_scan_queries),
291            rows_materialized: self
292                .rows_materialized
293                .saturating_sub(before.rows_materialized),
294        }
295    }
296
297    pub(crate) fn lookups_against_global(self) -> u64 {
298        self.exact_idx_probes.saturating_add(self.rows_materialized)
299    }
300}
301
302pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
303    MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed)
304}
305
306pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
307    MessageLookupTraceCounters {
308        exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
309        bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
310        full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
311        rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
312    }
313}
314
315fn record_message_lookup_exact_idx_probe() {
316    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
317        MESSAGE_LOOKUP_EXACT_IDX_PROBES.fetch_add(1, Ordering::Relaxed);
318    }
319}
320
321fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
322    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
323        MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
324        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
325    }
326}
327
328fn record_message_lookup_full_scan_query(rows: usize) {
329    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
330        MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
331        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
332    }
333}
334
/// Token returned by `enter_doctor_mutation_db_open_bypass`.
///
/// While at least one is alive, `acquire_doctor_mutation_db_open_guard`
/// returns without taking the doctor lock. Dropping it decrements the
/// process-wide bypass depth, so obtain it only through
/// `enter_doctor_mutation_db_open_bypass` to keep the count balanced.
pub(crate) struct DoctorMutationDbOpenBypassGuard;
336
337impl Drop for DoctorMutationDbOpenBypassGuard {
338    fn drop(&mut self) {
339        DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
340    }
341}
342
343pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
344    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
345    DoctorMutationDbOpenBypassGuard
346}
347
348fn doctor_mutation_db_open_bypass_active() -> bool {
349    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst) > 0
350}
351
352fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
353    let mut value = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
354    value ^= value >> 30;
355    value = value.wrapping_mul(0xbf58_476d_1ce4_e5b9);
356    value ^= value >> 27;
357    value = value.wrapping_mul(0x94d0_49bb_1331_11eb);
358    value ^= value >> 31;
359    value % max_inclusive.saturating_add(1)
360}
361
362/// Sleep with jittered exponential backoff to avoid lock-step retry storms
363/// when many threads hit the same transient SQLite/frankensqlite contention.
364pub(crate) fn sleep_with_franken_retry_backoff(
365    backoff: &mut Duration,
366    remaining: Duration,
367    max_backoff: Duration,
368) {
369    let capped = (*backoff).min(remaining);
370    let extra_budget = remaining.saturating_sub(capped).min(capped);
371    let extra_ms = extra_budget.as_millis().min(u128::from(u64::MAX)) as u64;
372    let sleep_for = if extra_ms == 0 {
373        capped
374    } else {
375        capped
376            .saturating_add(Duration::from_millis(next_franken_retry_jitter_ms(
377                extra_ms,
378            )))
379            .min(remaining)
380    };
381    std::thread::sleep(sleep_for);
382    *backoff = backoff.saturating_mul(2).min(max_backoff);
383}
384
/// Result of `acquire_doctor_mutation_db_open_guard`.
///
/// `Some(file)` holds the lock file on which a shared lock was taken; the
/// lock is released when the guard drops. `None` means no lock was taken.
struct DoctorMutationDbOpenGuard(Option<fs::File>);
386
387impl Drop for DoctorMutationDbOpenGuard {
388    fn drop(&mut self) {
389        if let Some(file) = self.0.as_ref() {
390            let _ = fs2::FileExt::unlock(file);
391        }
392    }
393}
394
/// Map a database path to the doctor repair lock file that guards it.
///
/// Only databases named exactly `agent_search.db` take part in doctor
/// locking; any other file name (or a path without a parent) yields `None`.
/// The lock lives at `<db parent>/doctor/locks/doctor-repair.lock`.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let is_canonical_db = matches!(
        db_path.file_name().and_then(|name| name.to_str()),
        Some("agent_search.db")
    );
    if !is_canonical_db {
        return None;
    }

    let mut lock_path = db_path.parent()?.to_path_buf();
    lock_path.extend(["doctor", "locks", "doctor-repair.lock"].iter());
    Some(lock_path)
}
408
/// Check whether lock-file metadata names this process as the lock owner.
///
/// `raw` is scanned line by line for a `key=value` entry whose trimmed key is
/// `pid` and whose trimmed value parses as a `u32` equal to
/// `std::process::id()`. Lines without `=` are ignored.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let current_pid = std::process::id();
    for line in raw.lines() {
        if let Some((key, value)) = line.split_once('=') {
            let names_this_process = key.trim() == "pid"
                && matches!(value.trim().parse::<u32>(), Ok(pid) if pid == current_pid);
            if names_this_process {
                return true;
            }
        }
    }
    false
}
421
422fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
423    use std::io::Read as _;
424
425    let Ok(mut file) = file.try_clone() else {
426        return false;
427    };
428    let mut raw = String::new();
429    let _ = std::io::Read::take(&mut file, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ)
430        .read_to_string(&mut raw);
431    doctor_lock_metadata_pid_is_current_process(&raw)
432}
433
/// Decide whether a failed lock attempt means the lock is currently held by
/// someone else, as opposed to a genuine I/O failure.
///
/// `WouldBlock` counts on every platform. On Windows, raw OS error 33 is
/// treated the same way.
fn doctor_mutation_lock_error_is_active(err: &std::io::Error) -> bool {
    // Windows reports a contended file lock as raw OS error 33.
    const WINDOWS_LOCK_CONTENDED: i32 = 33;

    err.kind() == std::io::ErrorKind::WouldBlock
        || (cfg!(windows) && err.raw_os_error() == Some(WINDOWS_LOCK_CONTENDED))
}
448
/// Take a shared lock on the doctor repair lock file before `db_path` is opened.
///
/// Returns a guard holding no lock (`None`) when:
/// - `db_path` is not subject to doctor locking (see
///   `doctor_mutation_lock_path_for_db_open`),
/// - a bypass scope is active, or
/// - the lock file's metadata names the current process.
///
/// Otherwise the lock file is opened (and created if missing, along with its
/// parent directories) and a shared lock is attempted. While the lock is
/// reported as held elsewhere, the attempt is retried with jittered backoff
/// (starting at 4ms, capped at 128ms) until `timeout` has elapsed.
///
/// # Errors
/// - the lock directory or lock file cannot be created/opened,
/// - the lock is still held elsewhere once `timeout` has elapsed,
/// - locking fails for any other reason.
///
/// NOTE(review): `Instant::now() + timeout` panics on overflow, so extremely
/// large timeouts (e.g. `Duration::MAX`) are not safe here — confirm no caller
/// passes one.
fn acquire_doctor_mutation_db_open_guard(
    db_path: &Path,
    timeout: Duration,
) -> Result<DoctorMutationDbOpenGuard> {
    // Databases without an associated doctor lock need no guard.
    let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
        return Ok(DoctorMutationDbOpenGuard(None));
    };
    // A live bypass guard disables the locking in this process.
    if doctor_mutation_db_open_bypass_active() {
        return Ok(DoctorMutationDbOpenGuard(None));
    }

    if let Some(parent) = lock_path.parent() {
        fs::create_dir_all(parent).with_context(|| {
            format!(
                "creating doctor mutation lock directory {} before opening {}",
                parent.display(),
                db_path.display()
            )
        })?;
    }

    let deadline = Instant::now() + timeout;
    let mut backoff = Duration::from_millis(4);
    loop {
        // The lock file is re-opened on every attempt; `truncate(false)` keeps
        // any metadata already written to it.
        let file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .with_context(|| {
                format!(
                    "opening doctor mutation lock {} before opening {}",
                    lock_path.display(),
                    db_path.display()
                )
            })?;

        // If the metadata names this very process, do not lock against ourselves.
        if doctor_lock_file_pid_is_current_process(&file) {
            return Ok(DoctorMutationDbOpenGuard(None));
        }

        match fs2::FileExt::try_lock_shared(&file) {
            Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
            // Lock is held elsewhere: back off and retry until the deadline.
            Err(err) if doctor_mutation_lock_error_is_active(&err) => {
                let now = Instant::now();
                if now >= deadline {
                    return Err(anyhow!(
                        "doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
                        lock_path.display(),
                        db_path.display(),
                        timeout.as_millis()
                    ));
                }
                let remaining = deadline.saturating_duration_since(now);
                sleep_with_franken_retry_backoff(
                    &mut backoff,
                    remaining,
                    Duration::from_millis(128),
                );
            }
            // Any other locking failure is not retried.
            Err(err) => {
                return Err(anyhow!(
                    "failed to acquire shared doctor mutation lock {} before opening {}: {}",
                    lock_path.display(),
                    db_path.display(),
                    err
                ));
            }
        }
    }
}
521
522pub(crate) fn open_franken_storage_with_timeout(
523    path: &Path,
524    timeout: Duration,
525) -> Result<FrankenStorage> {
526    if !path.exists() {
527        return Err(anyhow!("Database not found at {}", path.display()));
528    }
529
530    let deadline = Instant::now() + timeout;
531    let mut backoff = Duration::from_millis(4);
532    loop {
533        match FrankenStorage::open(path) {
534            Ok(storage) => return Ok(storage),
535            Err(err) if retryable_franken_anyhow(&err) => {
536                let now = Instant::now();
537                if now >= deadline {
538                    return Err(err);
539                }
540                let remaining = deadline.saturating_duration_since(now);
541                sleep_with_franken_retry_backoff(
542                    &mut backoff,
543                    remaining,
544                    Duration::from_millis(128),
545                );
546            }
547            Err(err) => return Err(err),
548        }
549    }
550}
551
/// Open storage at `path` only if its recorded schema version is current.
///
/// Returns `Ok(None)` when the file does not exist or when the
/// `schema_version` row in `meta` is missing, unreadable, or different from
/// `CURRENT_SCHEMA_VERSION`. In the mismatch case the connection is closed
/// without a checkpoint (falling back to a best-effort close) before
/// returning.
///
/// On a version match the storage is prepared via
/// `transition_from_meta_version`, `repair_missing_current_schema_objects`
/// and `apply_config`, in that order, and returned.
///
/// # Errors
/// Propagates failures from opening the raw connection and from the three
/// preparation steps.
pub(crate) fn open_current_schema_storage_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<Option<FrankenStorage>> {
    if !path.exists() {
        return Ok(None);
    }

    let mut storage = FrankenStorage::new(
        open_franken_raw_connection_with_timeout(path, timeout)?,
        path.to_path_buf(),
    );
    storage.apply_open_stage_busy_timeout();

    // Any failure along this chain (query error, no row, wrong type, bad
    // integer) collapses to `None`, which is treated as "not current" below.
    let version = storage
        .raw()
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .ok()
        .and_then(|rows| rows.first().cloned())
        .and_then(|row| row.get_typed::<String>(0).ok())
        .and_then(|raw| raw.parse::<i64>().ok());

    if version != Some(CURRENT_SCHEMA_VERSION) {
        // Close the connection before reporting that the schema is not current.
        if let Err(close_err) = storage.close_without_checkpoint_in_place() {
            tracing::debug!(
                error = %close_err,
                db_path = %path.display(),
                "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
            );
            storage.close_best_effort_in_place();
        }
        return Ok(None);
    }

    transition_from_meta_version(&storage.conn)?;
    storage.repair_missing_current_schema_objects()?;
    storage.apply_config()?;
    Ok(Some(storage))
}
591
592pub(crate) fn open_franken_readonly_storage_with_timeout(
593    path: &Path,
594    timeout: Duration,
595) -> Result<FrankenStorage> {
596    if !path.exists() {
597        return Err(anyhow!("Database not found at {}", path.display()));
598    }
599
600    let deadline = Instant::now() + timeout;
601    let mut backoff = Duration::from_millis(4);
602    loop {
603        match FrankenStorage::open_readonly(path) {
604            Ok(storage) => return Ok(storage),
605            Err(err) if retryable_franken_anyhow(&err) => {
606                let now = Instant::now();
607                if now >= deadline {
608                    return Err(err);
609                }
610                let remaining = deadline.saturating_duration_since(now);
611                sleep_with_franken_retry_backoff(
612                    &mut backoff,
613                    remaining,
614                    Duration::from_millis(128),
615                );
616            }
617            Err(err) => return Err(err),
618        }
619    }
620}
621
622pub(crate) fn open_franken_raw_connection_with_timeout(
623    path: &Path,
624    timeout: Duration,
625) -> Result<FrankenConnection> {
626    if !path.exists() {
627        return Err(anyhow!("Database not found at {}", path.display()));
628    }
629
630    let path_str = path.to_string_lossy().to_string();
631    let deadline = Instant::now() + timeout;
632    let mut backoff = Duration::from_millis(4);
633    loop {
634        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
635        match FrankenConnection::open(&path_str)
636            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
637        {
638            Ok(conn) => return Ok(conn),
639            Err(err) if retryable_franken_anyhow(&err) => {
640                let now = Instant::now();
641                if now >= deadline {
642                    return Err(err);
643                }
644                let remaining = deadline.saturating_duration_since(now);
645                sleep_with_franken_retry_backoff(
646                    &mut backoff,
647                    remaining,
648                    Duration::from_millis(128),
649                );
650            }
651            Err(err) => return Err(err),
652        }
653    }
654}
655
656pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
657    path: &Path,
658    timeout: Duration,
659) -> Result<FrankenConnection> {
660    if !path.exists() {
661        return Err(anyhow!("Database not found at {}", path.display()));
662    }
663
664    let path_str = path.to_string_lossy().to_string();
665    let deadline = Instant::now() + timeout;
666    let mut backoff = Duration::from_millis(4);
667    loop {
668        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
669        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
670            .with_context(|| {
671                format!(
672                    "opening raw frankensqlite db readonly at {}",
673                    path.display()
674                )
675            }) {
676            Ok(conn) => return Ok(conn),
677            Err(err) if retryable_franken_anyhow(&err) => {
678                let now = Instant::now();
679                if now >= deadline {
680                    return Err(err);
681                }
682                let remaining = deadline.saturating_duration_since(now);
683                sleep_with_franken_retry_backoff(
684                    &mut backoff,
685                    remaining,
686                    Duration::from_millis(128),
687                );
688            }
689            Err(err) => return Err(err),
690        }
691    }
692}
693
694pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
695    matches!(
696        err,
697        frankensqlite::FrankenError::Busy
698            | frankensqlite::FrankenError::BusyRecovery
699            | frankensqlite::FrankenError::BusySnapshot { .. }
700            | frankensqlite::FrankenError::DatabaseLocked { .. }
701            | frankensqlite::FrankenError::LockFailed { .. }
702            | frankensqlite::FrankenError::WriteConflict { .. }
703            | frankensqlite::FrankenError::SerializationFailure { .. }
704    ) || retryable_storage_error_message(&err.to_string())
705}
706
/// Heuristic check for transient contention based on the error text alone.
///
/// Matches case-insensitively (ASCII) on any of a fixed set of substrings.
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const TRANSIENT_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];

    let normalized = message.to_ascii_lowercase();
    TRANSIENT_MARKERS
        .iter()
        .any(|marker| normalized.contains(*marker))
}
716
717pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
718    err.chain().any(|cause| {
719        cause
720            .downcast_ref::<frankensqlite::FrankenError>()
721            .is_some_and(retryable_franken_error)
722            || retryable_storage_error_message(&cause.to_string())
723    })
724}
725
726impl Drop for LazyFrankenDb {
727    fn drop(&mut self) {
728        let Some(mut conn) = self.conn.get_mut().take() else {
729            return;
730        };
731        conn.0.close_best_effort_in_place();
732    }
733}
734
735// -------------------------------------------------------------------------
736// FrankenSQLite Connection Manager (bead 3rlf8)
737// -------------------------------------------------------------------------
738// Multi-connection management: reader pool + concurrent writer connections.
739// Replaces the LazyFrankenDb single-connection bottleneck for high-throughput
740// scenarios (indexer parallel writes, concurrent TUI reads + indexer writes).
741
/// Configuration for the [`FrankenConnectionManager`].
///
/// `FrankenConnectionManager::new` raises either value to at least 1.
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of pre-opened reader connections (default: 4).
    pub reader_count: usize,
    /// Maximum concurrent writer connections (default: available parallelism,
    /// or 4 when that cannot be determined).
    pub max_writers: usize,
}
750
751impl Default for ConnectionManagerConfig {
752    fn default() -> Self {
753        let cpus = std::thread::available_parallelism()
754            .map(|n| n.get())
755            .unwrap_or(4);
756        Self {
757            reader_count: 4,
758            max_writers: cpus,
759        }
760    }
761}
762
/// Multi-connection manager for frankensqlite.
///
/// Provides:
/// - A pool of pre-opened reader connections (round-robin, Mutex-protected)
/// - Controlled creation of writer connections with token-based limits
/// - RAII guards that auto-rollback uncommitted transactions on drop
///
/// Thread-safe: reader connections are wrapped in Mutex (FrankenConnection is !Sync).
/// Writer connections are created per-request (each thread gets its own).
pub struct FrankenConnectionManager {
    /// Database file every reader and writer connection is opened against.
    db_path: PathBuf,
    /// Pre-opened reader connections; never empty (`new` opens at least one).
    readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
    /// Monotonic counter used to pick the next reader round-robin.
    reader_idx: std::sync::atomic::AtomicUsize,
    /// Token-based writer limit: channel pre-filled with `max_writers` tokens.
    /// `recv()` = acquire slot, `send()` = release slot.
    writer_tokens: (
        crossbeam_channel::Sender<()>,
        crossbeam_channel::Receiver<()>,
    ),
    /// Effective configuration, after `new` has clamped both counts to at least 1.
    config: ConnectionManagerConfig,
}
784
// SAFETY: FrankenConnectionManager is Send+Sync because:
// - readers wrapped in Mutex<SendFrankenConnection> (exclusive access)
// - writer_tokens uses crossbeam (Send+Sync)
// - db_path is PathBuf (Send+Sync)
// - reader_idx is an atomic and config holds only plain integers
// NOTE(review): given `unsafe impl Send for SendFrankenConnection`, every field
// looks Send+Sync on its own, so these manual impls may be redundant — confirm
// before removing, and re-check this argument whenever a field is added.
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
791
792impl FrankenConnectionManager {
793    /// Create a new connection manager.
794    ///
795    /// Opens `config.reader_count` reader connections immediately.
796    /// Writer connections are created on demand (up to `config.max_writers`).
797    pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
798        let db_path = db_path.into();
799        let path_str = db_path.to_string_lossy().to_string();
800
801        let reader_count = config.reader_count.max(1);
802        let mut readers = Vec::with_capacity(reader_count);
803        for _ in 0..reader_count {
804            let conn = FrankenConnection::open(&path_str)
805                .with_context(|| format!("opening reader connection at {}", db_path.display()))?;
806            // Apply read-tuned config (no migration, no write PRAGMAs)
807            let _ = conn.execute("PRAGMA busy_timeout = 5000;"); // match writer config
808            let _ = conn.execute("PRAGMA cache_size = -16384;"); // 16MB reader cache
809            readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
810        }
811
812        let max_writers = config.max_writers.max(1);
813
814        // Pre-fill bounded channel with tokens (acts as counting semaphore).
815        // A zero-capacity channel with no initial tokens would make the first
816        // writer acquisition block forever.
817        let (tx, rx) = crossbeam_channel::bounded(max_writers);
818        for _ in 0..max_writers {
819            tx.send(())
820                .map_err(|_| anyhow!("writer token channel closed during initialization"))?;
821        }
822
823        Ok(Self {
824            db_path,
825            readers,
826            reader_idx: std::sync::atomic::AtomicUsize::new(0),
827            writer_tokens: (tx, rx),
828            config: ConnectionManagerConfig {
829                reader_count,
830                max_writers,
831            },
832        })
833    }
834
835    /// Get a reader connection (round-robin from the pool).
836    ///
837    /// Returns a mutex guard wrapping the connection. The guard prevents
838    /// concurrent access to the same connection (FrankenConnection is !Sync).
839    pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
840        let idx = self
841            .reader_idx
842            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
843        self.readers[idx % self.readers.len()].lock()
844    }
845
846    /// Acquire a writer connection.
847    ///
848    /// Opens a new frankensqlite connection with full config (no migration).
849    /// Blocks if `max_writers` connections are already in use.
850    /// The returned [`WriterGuard`] auto-rolls back on drop.
851    pub fn writer(&self) -> Result<WriterGuard<'_>> {
852        self.writer_tokens
853            .1
854            .recv()
855            .map_err(|_| anyhow!("writer token channel closed"))?;
856        let path_str = self.db_path.to_string_lossy().to_string();
857        let conn = match FrankenConnection::open(&path_str) {
858            Ok(c) => c,
859            Err(e) => {
860                let _ = self.writer_tokens.0.send(());
861                return Err(anyhow::Error::from(e).context(format!(
862                    "opening writer connection at {}",
863                    self.db_path.display()
864                )));
865            }
866        };
867        let storage = FrankenStorage::new(conn, self.db_path.clone());
868        if let Err(e) = storage.apply_config() {
869            let _ = self.writer_tokens.0.send(());
870            return Err(e);
871        }
872        Ok(WriterGuard {
873            storage,
874            mgr: self,
875            committed: false,
876        })
877    }
878
879    /// Acquire a concurrent writer connection (BEGIN CONCURRENT via MVCC).
880    ///
881    /// Similar to [`writer`] but tuned for the parallel indexer write pool.
882    /// Uses reduced cache size and is designed for short-lived batch inserts.
883    pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
884        self.writer_tokens
885            .1
886            .recv()
887            .map_err(|_| anyhow!("writer token channel closed"))?;
888        let path_str = self.db_path.to_string_lossy().to_string();
889        let conn = match FrankenConnection::open(&path_str) {
890            Ok(c) => c,
891            Err(e) => {
892                let _ = self.writer_tokens.0.send(());
893                return Err(anyhow::Error::from(e).context(format!(
894                    "opening concurrent writer at {}",
895                    self.db_path.display()
896                )));
897            }
898        };
899        let storage = FrankenStorage::new(conn, self.db_path.clone());
900        if let Err(e) = storage.apply_config() {
901            let _ = self.writer_tokens.0.send(());
902            return Err(e);
903        }
904        // Reduced cache for concurrent writers (they're short-lived)
905        let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
906        Ok(WriterGuard {
907            storage,
908            mgr: self,
909            committed: false,
910        })
911    }
912
913    /// Database path managed by this pool.
914    pub fn db_path(&self) -> &Path {
915        &self.db_path
916    }
917
918    /// Number of reader connections in the pool.
919    pub fn reader_count(&self) -> usize {
920        self.readers.len()
921    }
922
923    /// Maximum concurrent writers allowed.
924    pub fn max_writers(&self) -> usize {
925        self.config.max_writers
926    }
927}
928
929impl Drop for FrankenConnectionManager {
930    fn drop(&mut self) {
931        for reader in &mut self.readers {
932            reader.get_mut().0.close_best_effort_in_place();
933        }
934    }
935}
936
937/// RAII guard for a writer connection.
938///
939/// Provides access to a [`FrankenStorage`] for write operations.
940/// Releases the writer semaphore slot when dropped.
941pub struct WriterGuard<'a> {
942    storage: FrankenStorage,
943    mgr: &'a FrankenConnectionManager,
944    committed: bool,
945}
946
947impl<'a> WriterGuard<'a> {
948    /// Access the underlying storage for read/write operations.
949    pub fn storage(&self) -> &FrankenStorage {
950        &self.storage
951    }
952
953    /// Mark this writer as successfully committed.
954    ///
955    /// Call after your transaction's `commit()` succeeds. Prevents the drop
956    /// guard from attempting a rollback.
957    pub fn mark_committed(&mut self) {
958        self.committed = true;
959    }
960}
961
962impl Drop for WriterGuard<'_> {
963    fn drop(&mut self) {
964        if !self.committed {
965            // Best-effort rollback — connection may already be in autocommit
966            let _ = self.storage.raw().execute("ROLLBACK;");
967        }
968        self.storage.close_best_effort_in_place();
969        // Release writer token
970        let _ = self.mgr.writer_tokens.0.send(());
971    }
972}
973
974// -------------------------------------------------------------------------
975// Binary Metadata Serialization (Opt 3.1)
976// -------------------------------------------------------------------------
977// MessagePack provides 50-70% storage reduction vs JSON and faster parsing.
978// New rows use binary columns; existing JSON is read on fallback.
979
980/// Serialize a JSON value to MessagePack bytes.
981/// Returns None for null/empty values to save storage.
982fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
983    if value.is_null() || value.as_object().is_some_and(|o| o.is_empty()) {
984        return None;
985    }
986    rmp_serde::to_vec(value).ok()
987}
988
989/// Deserialize MessagePack bytes to a JSON value.
990/// Returns default Value::Object({}) on error or empty input.
991fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
992    if bytes.is_empty() {
993        return serde_json::Value::Object(serde_json::Map::new());
994    }
995    rmp_serde::from_slice(bytes).unwrap_or_else(|e| {
996        tracing::debug!(
997            error = %e,
998            bytes_len = bytes.len(),
999            "Failed to deserialize metadata - returning empty object"
1000        );
1001        serde_json::Value::Object(serde_json::Map::new())
1002    })
1003}
1004
1005/// Read metadata from a frankensqlite Row, preferring binary (msgpack) over JSON.
1006fn franken_read_metadata_compat(
1007    row: &FrankenRow,
1008    json_idx: usize,
1009    bin_idx: usize,
1010) -> serde_json::Value {
1011    // Try binary column first (new format)
1012    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1013        && !bytes.is_empty()
1014    {
1015        return deserialize_msgpack_to_json(&bytes);
1016    }
1017
1018    // Fall back to JSON column (old format or migration in progress)
1019    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1020        return serde_json::from_str(&json_str)
1021            .unwrap_or_else(|_| serde_json::Value::Object(serde_json::Map::new()));
1022    }
1023
1024    serde_json::Value::Object(serde_json::Map::new())
1025}
1026
1027fn franken_read_message_extra_compat(
1028    row: &FrankenRow,
1029    json_idx: usize,
1030    bin_idx: usize,
1031) -> serde_json::Value {
1032    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1033        && !bytes.is_empty()
1034    {
1035        return deserialize_msgpack_to_json(&bytes);
1036    }
1037
1038    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1039        return serde_json::from_str(&json_str).unwrap_or(serde_json::Value::Null);
1040    }
1041
1042    serde_json::Value::Null
1043}
1044
1045// -------------------------------------------------------------------------
1046// Migration Error Types (P1.5)
1047// -------------------------------------------------------------------------
1048
/// Error type for schema migration operations.
///
/// `Database` and `Io` are produced automatically through `#[from]`
/// conversions, so `?` works directly on those error types.
#[derive(Debug, Error)]
pub enum MigrationError {
    /// The schema requires a full rebuild. The database has been backed up.
    #[error("Rebuild required: {reason}")]
    RebuildRequired {
        /// Explanation shown in the `Display` output.
        reason: String,
        /// Backup location, when one is known; not part of the `Display` output.
        backup_path: Option<std::path::PathBuf>,
    },

    /// A database error occurred during migration.
    #[error("Database error: {0}")]
    Database(#[from] frankensqlite::FrankenError),

    /// An I/O error occurred during backup.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// Other migration error.
    #[error("{0}")]
    Other(String),
}
1071
1072impl From<anyhow::Error> for MigrationError {
1073    fn from(e: anyhow::Error) -> Self {
1074        MigrationError::Other(e.to_string())
1075    }
1076}
1077
/// Maximum number of backup files to retain.
const MAX_BACKUPS: usize = 3;
/// PRAGMA run on the backup connection before `VACUUM INTO`; sets a 30 000 ms
/// busy timeout.
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
1081
/// File names holding user-authored state; these must NEVER be deleted during
/// a rebuild.
const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];

/// Report whether `path` names a user-authored file that a rebuild must keep.
///
/// Only the final path component is compared (exact, case-sensitive match
/// against [`USER_DATA_FILES`]). Paths without a final component, or whose
/// final component is not valid UTF-8, are never treated as user data.
pub fn is_user_data_file(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|raw| raw.to_str()) else {
        return false;
    };
    USER_DATA_FILES
        .iter()
        .any(|protected| *protected == file_name)
}
1092
/// SQL to register the FTS5 virtual table on a frankensqlite connection.
///
/// FrankenSQLite skips virtual-table entries (rootpage=0) when loading
/// `sqlite_master` from a stock-SQLite database.  Executing this CREATE
/// triggers the legacy FTS5 fallback path and materialises the table so
/// subsequent FTS queries work.
pub const FTS5_REGISTER_SQL: &str = "\
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
        content, title, agent, workspace, source_path, \
        created_at UNINDEXED, \
        content='', tokenize='porter'\
    )";

// Meta keys and generation numbers for FTS-rebuild and daily-stats bookkeeping.
// NOTE(review): how these are read and written is not visible in this part of
// the file — confirm against the code that uses them before changing a value.
const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
const DAILY_STATS_HEALTH_GENERATION: i64 = 1;

/// SQL to clear all rows from the contentless `fts_messages` table.
///
/// Contentless FTS5 tables reject ordinary `DELETE FROM ...` statements.
pub const FTS5_DELETE_ALL_SQL: &str =
    "INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";

/// Shadow tables that the integrity checks in this file expect to accompany
/// `fts_messages`.
pub const FTS_MESSAGES_REQUIRED_SHADOW_TABLES: [&str; 5] = [
    "fts_messages_config",
    "fts_messages_content",
    "fts_messages_data",
    "fts_messages_docsize",
    "fts_messages_idx",
];

/// Zero-row query used to check that `fts_messages` can be queried at all.
pub const FTS_MESSAGES_INTEGRITY_PROBE_SQL: &str = "SELECT * FROM fts_messages LIMIT 0";

/// Operator-facing recovery advice appended to FTS corruption errors.
pub const FTS_MESSAGES_CORRUPTION_RECOVERY_HINT: &str = "Stop all cass index/watch processes, back up the current database, then run \
     'cass doctor check --json' for a read-only diagnosis before using a supported \
     repair/rebuild path.";
1132
/// Describes a structurally broken `fts_messages` index.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FtsMessagesIntegrityError {
    /// Shadow tables found to be missing (may be empty).
    missing_shadow_tables: Vec<&'static str>,
    /// SQL statement that failed, when one is known.
    failed_sql: Option<&'static str>,
    /// Text of the underlying error, when one is known.
    source_error: Option<String>,
}

impl FtsMessagesIntegrityError {
    /// Assemble an error from its three parts without any validation.
    fn new(
        missing_shadow_tables: Vec<&'static str>,
        failed_sql: Option<&'static str>,
        source_error: Option<String>,
    ) -> Self {
        FtsMessagesIntegrityError {
            missing_shadow_tables,
            failed_sql,
            source_error,
        }
    }

    /// Names of the shadow tables recorded as missing, in insertion order.
    pub fn missing_shadow_tables(&self) -> &[&'static str] {
        self.missing_shadow_tables.as_slice()
    }
}
1157
1158impl std::fmt::Display for FtsMessagesIntegrityError {
1159    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
1160        write!(
1161            f,
1162            "CASS database FTS5 index is corrupt: fts_messages exists, but required FTS5 shadow tables are missing or unreadable"
1163        )?;
1164        if !self.missing_shadow_tables.is_empty() {
1165            write!(
1166                f,
1167                "; missing shadow tables: {}",
1168                self.missing_shadow_tables.join(", ")
1169            )?;
1170        }
1171        if let Some(sql) = self.failed_sql {
1172            write!(f, "; failed SQL: {sql}")?;
1173        }
1174        if let Some(source_error) = &self.source_error {
1175            write!(f, "; error: {source_error}")?;
1176        }
1177        write!(
1178            f,
1179            ". Suggested recovery: {FTS_MESSAGES_CORRUPTION_RECOVERY_HINT}"
1180        )
1181    }
1182}
1183
1184impl std::error::Error for FtsMessagesIntegrityError {}
1185
1186pub fn fts_messages_integrity_error_from_message(
1187    source_error: impl Into<String>,
1188) -> Option<FtsMessagesIntegrityError> {
1189    let source_error = source_error.into();
1190    let lower = source_error.to_ascii_lowercase();
1191    if !lower.contains("fts_messages") {
1192        return None;
1193    }
1194
1195    let mentions_structural_fts_failure = lower.contains("shadow table")
1196        || lower.contains("vtable constructor failed")
1197        || lower.contains("sqlite_corrupt")
1198        || lower.contains("databasecorrupt")
1199        || lower.contains("database corrupt")
1200        || lower.contains("missing required");
1201    if !mentions_structural_fts_failure {
1202        return None;
1203    }
1204
1205    let missing_shadow_tables = FTS_MESSAGES_REQUIRED_SHADOW_TABLES
1206        .iter()
1207        .copied()
1208        .filter(|table| lower.contains(&table.to_ascii_lowercase()))
1209        .collect::<Vec<_>>();
1210
1211    Some(FtsMessagesIntegrityError::new(
1212        missing_shadow_tables,
1213        Some(FTS_MESSAGES_INTEGRITY_PROBE_SQL),
1214        Some(source_error),
1215    ))
1216}
1217
/// Decide whether an `fts_messages` CREATE statement is one for which missing
/// shadow-table metadata is acceptable.
///
/// The SQL is compared with all whitespace removed and ASCII letters
/// lowercased. It qualifies when it is an FTS5 definition (`using fts5(`) that
/// declares `content=''` and never mentions `message_id`.
fn fts_schema_tolerates_missing_shadow_metadata(sql: &str) -> bool {
    let mut compact: String = sql.split_whitespace().collect();
    compact.make_ascii_lowercase();

    let is_contentless_fts5 = compact.contains("usingfts5(") && compact.contains("content=''");
    is_contentless_fts5 && !compact.contains("message_id")
}
1228
1229pub fn validate_fts_messages_integrity_for_connection(conn: &FrankenConnection) -> Result<()> {
1230    let fts_schema_sql: Vec<String> = conn
1231        .query_map_collect(
1232            "SELECT sql FROM sqlite_master WHERE type = 'table' AND name = 'fts_messages'",
1233            fparams![],
1234            |row: &FrankenRow| row.get_typed::<String>(0),
1235        )
1236        .with_context(|| "checking for fts_messages in sqlite_master")?;
1237    if fts_schema_sql.is_empty() {
1238        return Ok(());
1239    }
1240
1241    let probe_error = conn.query(FTS_MESSAGES_INTEGRITY_PROBE_SQL).err();
1242    if probe_error.is_none()
1243        && fts_schema_sql
1244            .iter()
1245            .all(|sql| fts_schema_tolerates_missing_shadow_metadata(sql))
1246    {
1247        return Ok(());
1248    }
1249
1250    let present_shadow_tables: HashSet<String> = conn
1251        .query_map_collect(
1252            "SELECT name FROM sqlite_master
1253             WHERE type = 'table'
1254               AND name IN (
1255                 'fts_messages_config',
1256                 'fts_messages_content',
1257                 'fts_messages_data',
1258                 'fts_messages_docsize',
1259                 'fts_messages_idx'
1260               )",
1261            fparams![],
1262            |row: &FrankenRow| row.get_typed::<String>(0),
1263        )
1264        .map(|rows| rows.into_iter().collect())
1265        .map_err(|err| {
1266            FtsMessagesIntegrityError::new(
1267                Vec::new(),
1268                Some(
1269                    "SELECT name FROM sqlite_master WHERE name IN \
1270                     ('fts_messages_config','fts_messages_content','fts_messages_data','fts_messages_docsize','fts_messages_idx')",
1271                ),
1272                Some(err.to_string()),
1273            )
1274        })?;
1275    let missing_shadow_tables = FTS_MESSAGES_REQUIRED_SHADOW_TABLES
1276        .iter()
1277        .copied()
1278        .filter(|table| !present_shadow_tables.contains(*table))
1279        .collect::<Vec<_>>();
1280
1281    // If every required shadow table is present, the FTS5 schema is
1282    // structurally sound. A probe-SQL failure here typically reflects an
1283    // incomplete FTS5 runtime emulation (e.g. frankensqlite's vtable path)
1284    // rather than fixture corruption — and conflating the two would
1285    // wrongly reject every database with the new message_id schema that
1286    // frankensqlite happens to serve via a different code path. Returning
1287    // Ok here keeps the false-positive surface narrow; the truly-missing-
1288    // shadow case below still surfaces as before.
1289    if missing_shadow_tables.is_empty() {
1290        return Ok(());
1291    }
1292
1293    Err(FtsMessagesIntegrityError::new(
1294        missing_shadow_tables,
1295        probe_error
1296            .as_ref()
1297            .map(|_| FTS_MESSAGES_INTEGRITY_PROBE_SQL),
1298        probe_error.map(|err| err.to_string()),
1299    )
1300    .into())
1301}
1302
/// Test helper: make sure the database at `db_path` has a fresh FTS schema.
///
/// Despite the name this runs entirely through `FrankenStorage`: it opens the
/// database and calls `rebuild_fts_via_frankensqlite`, discarding the value
/// that call returns. Per the original note, the rebuild also populates rows,
/// matching the historical meaning of "fresh FTS" (schema exists and is
/// consistent with message rows).
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
1318
/// Test helper: rebuild the FTS index for `db_path` and record the rebuild
/// generation.
///
/// Returns whatever count `rebuild_fts_via_frankensqlite` reports. The
/// generation is only recorded after the rebuild itself succeeded.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let open_context = || {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_context)?;
    storage.rebuild_fts_via_frankensqlite().and_then(|inserted| {
        storage.record_fts_franken_rebuild_generation()?;
        Ok(inserted)
    })
}
1331
1332pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
1333    // Delegates to the FrankenStorage-native path. The function name retains
1334    // the `_via_rusqlite` suffix only for backwards compatibility with the
1335    // few test-site callers; all operations now run through frankensqlite.
1336    let storage = FrankenStorage::open(db_path).with_context(|| {
1337        format!(
1338            "opening frankensqlite db at {} for FTS consistency check",
1339            db_path.display()
1340        )
1341    })?;
1342    storage.ensure_search_fallback_fts_consistency()
1343}
1344
1345/// Create a uniquely named backup of the database file.
1346///
1347/// Returns the path to the backup file, or None if the source doesn't exist.
1348pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
1349    if !bundle_path_exists(db_path)? {
1350        return Ok(None);
1351    }
1352
1353    if !copyable_bundle_file_exists(db_path)? {
1354        return Ok(None);
1355    }
1356    let _ = copyable_bundle_sidecar_sources(db_path)?;
1357
1358    let backup_path = unique_backup_path(db_path);
1359    let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
1360
1361    // Try to use SQLite's VACUUM INTO command first, which safely handles WAL files
1362    // and produces a clean, minimized backup.
1363    match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
1364        Ok(()) => {
1365            fs::rename(&vacuum_stage_path, &backup_path)?;
1366        }
1367        Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
1368            tracing::warn!(
1369                db_path = %db_path.display(),
1370                error = %err,
1371                "create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
1372            );
1373            return Err(MigrationError::Database(err));
1374        }
1375        Err(err) => {
1376            tracing::warn!(
1377                db_path = %db_path.display(),
1378                error = %err,
1379                "create_backup: VACUUM INTO failed; falling back to raw evidence copy"
1380            );
1381        }
1382    }
1383
1384    if backup_path.exists() {
1385        sync_file_if_exists(&backup_path)?;
1386        if let Some(parent) = backup_path.parent() {
1387            sync_parent_directory(parent)?;
1388        }
1389        return Ok(Some(backup_path));
1390    }
1391
1392    // Fallback to a raw evidence copy if VACUUM INTO failed (e.g., older SQLite
1393    // or corruption). Keep this on the same symlink-safe bundle path as
1394    // historical seeding so a malformed archive root cannot make us copy an
1395    // arbitrary symlink target or publish a partial sidecar backup.
1396    copy_database_bundle(db_path, &backup_path)?;
1397
1398    Ok(Some(backup_path))
1399}
1400
1401fn vacuum_into_backup_stage(
1402    db_path: &Path,
1403    stage_path: &Path,
1404) -> std::result::Result<(), frankensqlite::FrankenError> {
1405    let mut conn = open_franken_with_flags(
1406        &db_path.to_string_lossy(),
1407        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
1408    )?;
1409    let result = (|| {
1410        conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
1411        let path_str = stage_path.to_string_lossy();
1412        conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
1413        Ok(())
1414    })();
1415    if let Err(close_err) = conn.close_in_place() {
1416        tracing::warn!(
1417            error = %close_err,
1418            db_path = %db_path.display(),
1419            "create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
1420        );
1421        conn.close_best_effort_in_place();
1422    }
1423    result
1424}
1425
/// Whether a failed backup `VACUUM INTO` should be surfaced to the caller
/// instead of falling back to a raw file copy.
///
/// Delegates entirely to `retryable_franken_error`.
fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
    retryable_franken_error(err)
}
1429
/// Records which members of a database bundle were moved.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// True when at least one bundle member was moved.
    pub fn moved_any(&self) -> bool {
        [self.database, self.wal, self.shm].contains(&true)
    }
}
1442
/// Build the path of a database sidecar by appending `suffix` (for example
/// `"-wal"` or `"-shm"`) to the full text of `path`.
///
/// The suffix is appended at the `OsString` level so that path bytes which
/// are not valid UTF-8 are preserved exactly. A lossy string conversion would
/// replace them with U+FFFD and yield a sidecar path that no longer sits next
/// to `path`.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut raw = path.as_os_str().to_os_string();
    raw.push(suffix);
    PathBuf::from(raw)
}
1446
1447/// Move a database file and its WAL/SHM sidecars to a new basename.
1448///
1449/// This is used for non-destructive quarantine of a corrupted bundle before a
1450/// rebuild. If the main database file is already missing but orphaned sidecars
1451/// remain, those sidecars are still moved so a fresh database can be created
1452/// without inheriting stale WAL state.
1453pub(crate) fn move_database_bundle(
1454    source_root: &Path,
1455    destination_root: &Path,
1456) -> std::io::Result<DatabaseBundleMoveResult> {
1457    let mut moved = DatabaseBundleMoveResult::default();
1458    if let Some(parent) = destination_root.parent() {
1459        fs::create_dir_all(parent)?;
1460        sync_parent_directory(parent)?;
1461    }
1462
1463    if bundle_path_exists(source_root)? {
1464        fs::rename(source_root, destination_root)?;
1465        moved.database = true;
1466    }
1467
1468    let wal_source = database_sidecar_path(source_root, "-wal");
1469    if bundle_path_exists(&wal_source)? {
1470        fs::rename(&wal_source, database_sidecar_path(destination_root, "-wal"))?;
1471        moved.wal = true;
1472    }
1473
1474    let shm_source = database_sidecar_path(source_root, "-shm");
1475    if bundle_path_exists(&shm_source)? {
1476        fs::rename(&shm_source, database_sidecar_path(destination_root, "-shm"))?;
1477        moved.shm = true;
1478    }
1479
1480    if moved.moved_any() {
1481        if let Some(parent) = source_root.parent() {
1482            sync_parent_directory(parent)?;
1483        }
1484        if let Some(parent) = destination_root.parent() {
1485            sync_parent_directory(parent)?;
1486        }
1487    }
1488
1489    Ok(moved)
1490}
1491
/// Report whether anything exists at `path`, without following symlinks.
///
/// A dangling symlink therefore counts as existing. `NotFound` maps to
/// `Ok(false)`; every other metadata error is passed through.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    fs::symlink_metadata(path).map(|_| true).or_else(|err| {
        if err.kind() == std::io::ErrorKind::NotFound {
            Ok(false)
        } else {
            Err(err)
        }
    })
}
1499
1500fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
1501    if let Some(parent) = destination_root.parent() {
1502        fs::create_dir_all(parent).with_context(|| {
1503            format!(
1504                "creating destination directory for database bundle copy: {}",
1505                parent.display()
1506            )
1507        })?;
1508        sync_parent_directory(parent)
1509            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1510    }
1511
1512    if !copyable_bundle_file_exists(source_root)? {
1513        bail!(
1514            "database bundle root is missing before copy: {}",
1515            source_root.display()
1516        );
1517    }
1518
1519    let sidecars = copyable_bundle_sidecar_sources(source_root)?;
1520
1521    fs::copy(source_root, destination_root).with_context(|| {
1522        format!(
1523            "copying database bundle {} -> {}",
1524            source_root.display(),
1525            destination_root.display()
1526        )
1527    })?;
1528    sync_file_if_exists(destination_root).with_context(|| {
1529        format!(
1530            "syncing copied database bundle {}",
1531            destination_root.display()
1532        )
1533    })?;
1534
1535    for (source_sidecar, suffix) in sidecars {
1536        let destination_sidecar = database_sidecar_path(destination_root, suffix);
1537        fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
1538            format!(
1539                "copying database bundle sidecar {} -> {}",
1540                source_sidecar.display(),
1541                destination_sidecar.display()
1542            )
1543        })?;
1544        sync_file_if_exists(&destination_sidecar).with_context(|| {
1545            format!(
1546                "syncing copied database bundle sidecar {}",
1547                destination_sidecar.display()
1548            )
1549        })?;
1550    }
1551
1552    if let Some(parent) = destination_root.parent() {
1553        sync_parent_directory(parent)
1554            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1555    }
1556
1557    Ok(())
1558}
1559
1560fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
1561    let mut sidecars = Vec::new();
1562    for suffix in ["-wal", "-shm"] {
1563        let source_sidecar = database_sidecar_path(source_root, suffix);
1564        if copyable_bundle_file_exists(&source_sidecar)? {
1565            sidecars.push((source_sidecar, suffix));
1566        }
1567    }
1568    Ok(sidecars)
1569}
1570
1571fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
1572    match fs::symlink_metadata(path) {
1573        Ok(metadata) => {
1574            let file_type = metadata.file_type();
1575            if file_type.is_symlink() {
1576                bail!(
1577                    "refusing to copy database bundle symlink: {}",
1578                    path.display()
1579                );
1580            }
1581            if !file_type.is_file() {
1582                bail!(
1583                    "refusing to copy non-file database bundle path: {}",
1584                    path.display()
1585                );
1586            }
1587            Ok(true)
1588        }
1589        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1590        Err(err) => Err(err).with_context(|| {
1591            format!(
1592                "checking database bundle path before copy: {}",
1593                path.display()
1594            )
1595        }),
1596    }
1597}
1598
1599/// Helper to safely remove a database file and its potential WAL/SHM sidecars.
1600pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
1601    let mut removed_any = false;
1602
1603    match fs::remove_file(path) {
1604        Ok(()) => removed_any = true,
1605        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1606        Err(err) => return Err(err),
1607    }
1608
1609    // Best-effort removal of sidecar files (ignore errors if they don't exist)
1610    for suffix in ["-wal", "-shm"] {
1611        match fs::remove_file(database_sidecar_path(path, suffix)) {
1612            Ok(()) => removed_any = true,
1613            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1614            Err(err) => return Err(err),
1615        }
1616    }
1617
1618    if removed_any && let Some(parent) = path.parent() {
1619        sync_parent_directory(parent)?;
1620    }
1621
1622    Ok(())
1623}
1624
/// Flush a directory's metadata to disk (non-Windows).
///
/// Callers pass the result of `Path::parent()`, which is the empty path for a
/// bare relative file name such as `"cass.db"`. Opening `""` fails with
/// `NotFound`, so the empty path is treated as the current directory (`"."`),
/// which is the directory that actually contains such a file.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let dir = if path.as_os_str().is_empty() {
        Path::new(".")
    } else {
        path
    };
    fs::File::open(dir)?.sync_all()
}

/// Windows counterpart: a no-op that always succeeds.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
1634
/// Flush `path` to disk if it exists; do nothing otherwise (non-Windows).
#[cfg(not(windows))]
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::File::open(path)?;
    file.sync_all()
}

/// Flush `path` to disk if it exists; do nothing otherwise (Windows).
///
/// Unlike the non-Windows variant, the file is opened with both read and
/// write access before syncing.
#[cfg(windows)]
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::OpenOptions::new().read(true).write(true).open(path)?;
    file.sync_all()
}
1654
1655/// Remove old backup files, keeping only the most recent `keep_count`.
1656pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
1657    let parent = match db_path.parent() {
1658        Some(p) => p,
1659        None => return Ok(()),
1660    };
1661
1662    let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
1663
1664    let prefix = format!("{}.backup.", db_name);
1665
1666    // Collect backup files matching the pattern
1667    let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
1668
1669    if let Ok(entries) = fs::read_dir(parent) {
1670        for entry in entries.flatten() {
1671            let path = entry.path();
1672            if let Some(name) = path.file_name().and_then(|n| n.to_str())
1673                && is_backup_root_name(name, &prefix)
1674                && let Ok(meta) = fs::metadata(&path)
1675                && meta.is_file()
1676                && let Ok(mtime) = meta.modified()
1677            {
1678                backups.push((path, mtime));
1679            }
1680        }
1681    }
1682
1683    // Sort by modification time, newest first
1684    backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
1685
1686    // Delete oldest backups beyond keep_count
1687    for (path, _) in backups.into_iter().skip(keep_count) {
1688        let _ = fs::remove_file(&path);
1689
1690        // Also try to cleanup potential sidecars from fs::copy fallback
1691        let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
1692        let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
1693    }
1694
1695    Ok(())
1696}
1697
/// A historical copy of the database found on disk (backup, corrupt-quarantine
/// file, repair-lab or snapshot copy), together with the facts used to rank
/// it for salvage.
#[derive(Debug, Clone)]
pub(crate) struct HistoricalDatabaseBundle {
    /// Path of the bundle's main database file.
    root_path: PathBuf,
    /// Size of the main file plus its `-wal` and `-shm` sidecars, in bytes.
    total_bytes: u64,
    /// Modification time of the main file in milliseconds since the Unix
    /// epoch (0 when it could not be read).
    modified_at_ms: i64,
    /// Whether the bundle opened read-only and its `conversations` and
    /// `messages` tables could be read directly.
    supports_direct_readonly: bool,
    /// Schema / FTS / message-id facts gathered when the bundle was discovered.
    probe: HistoricalBundleProbe,
}
1706
/// Lightweight facts read from a historical bundle while ranking it.
#[derive(Debug, Clone, Copy, Default)]
struct HistoricalBundleProbe {
    /// `schema_version` value from the `meta` table, if present and numeric.
    schema_version: Option<i64>,
    /// Number of `sqlite_master` rows named `fts_messages`; `None` when the
    /// count could not be read.
    fts_schema_rows: Option<i64>,
    /// Whether `SELECT COUNT(*) FROM fts_messages` succeeded against the bundle.
    fts_queryable: bool,
    /// Largest `messages.id` in the bundle, or 0 when unavailable.
    max_message_id: i64,
}
1714
/// Test-only snapshot of database health indicators.
// NOTE(review): the code that fills this struct is outside the visible
// section; the field notes below are inferred from the names — confirm.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SqliteDatabaseHealthProbe {
    /// Presumably the `schema_version` recorded in the `meta` table.
    pub schema_version: Option<i64>,
    /// Presumably whether an integrity quick check passed.
    pub quick_check_ok: bool,
    /// Presumably the count of `sqlite_master` rows for `fts_messages`.
    pub fts_schema_rows: i64,
    /// Presumably whether the FTS table answered a query.
    pub fts_queryable: bool,
    /// Presumably the number of rows in `messages`.
    pub message_count: i64,
    /// Presumably the largest `messages.id`.
    pub max_message_id: i64,
}
1726
/// Outcome reported by an FTS consistency check/repair pass.
// NOTE(review): the code that constructs these variants is outside the visible
// section; variant meanings below are inferred from the names — confirm.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FtsConsistencyRepair {
    /// Presumably: the FTS index already matched and nothing was changed.
    AlreadyHealthy {
        rows: usize,
    },
    /// Presumably: only missing rows were added to an existing FTS index.
    IncrementalCatchUp {
        inserted_rows: usize,
        total_rows: usize,
    },
    /// Presumably: the FTS index was rebuilt from scratch.
    Rebuilt {
        inserted_rows: usize,
    },
}
1740
/// Running totals for a historical salvage pass.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct HistoricalSalvageOutcome {
    pub bundles_considered: usize,
    pub bundles_imported: usize,
    pub conversations_imported: usize,
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Adds every counter of `other` onto the matching counter of `self`.
    pub(crate) fn accumulate(&mut self, other: Self) {
        // Destructure so a newly added field cannot be forgotten here.
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;

        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
1757
/// A read-only connection to historical data, plus how it was obtained.
#[derive(Debug)]
struct HistoricalReadConnection {
    /// Connection opened read-only on `root_path`.
    conn: FrankenConnection,
    /// How the data became readable: `"direct-readonly"` when the bundle was
    /// opened as-is, `"sqlite3-recover"` when it was rebuilt via `sqlite3 .recover`.
    method: &'static str,
    /// Database file that `conn` reads: the bundle itself, or the recovered
    /// temporary database.
    root_path: PathBuf,
    /// Keeps the temporary directory of a recovered database alive for as
    /// long as the connection exists; `None` for direct opens.
    _tempdir: Option<tempfile::TempDir>,
}
1765
/// Schema executed on the empty temporary database that receives rows
/// replayed from `sqlite3 .recover`.
///
/// It declares only the six core tables (`sources`, `agents`, `workspaces`,
/// `conversations`, `messages`, `snippets`) with plain column types: apart
/// from the primary keys there are no constraints, indexes, or FTS tables.
// NOTE(review): presumably constraint-free so that partially recovered rows
// are never rejected on insert — confirm.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
    id TEXT PRIMARY KEY,
    kind TEXT,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE agents (
    id INTEGER PRIMARY KEY,
    slug TEXT,
    name TEXT,
    version TEXT,
    kind TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT,
    display_name TEXT
);
CREATE TABLE conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER,
    workspace_id INTEGER,
    source_id TEXT,
    external_id TEXT,
    title TEXT,
    source_path TEXT,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
CREATE TABLE messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER,
    idx INTEGER,
    role TEXT,
    author TEXT,
    created_at INTEGER,
    content TEXT,
    extra_json TEXT,
    extra_bin BLOB
);
CREATE TABLE snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);
";
/// Written as `salvage_version` into the per-bundle import record stored in `meta`.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
/// Format version for persisted salvage progress records.
// NOTE(review): presumably stored in `HistoricalBundleProgress::progress_version` — confirm.
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
/// Five minutes, expressed in milliseconds.
// NOTE(review): presumably the maximum start-time difference allowed when
// merging conversations that share a source path — confirm against its users.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
1843
/// Serializable checkpoint describing how far the import of one historical
/// bundle has progressed.
// NOTE(review): the code reading and writing this record is outside the
// visible section; field notes are inferred from names — confirm.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HistoricalBundleProgress {
    /// Presumably `HISTORICAL_SALVAGE_PROGRESS_VERSION` at write time.
    progress_version: u32,
    /// Presumably the bundle's root path.
    path: String,
    /// Presumably the bundle size in bytes when the checkpoint was taken.
    bytes: u64,
    /// Presumably the bundle's modification time in ms since the Unix epoch.
    modified_at_ms: i64,
    /// Presumably the read method used for the bundle.
    method: String,
    /// Presumably the last source row id whose import fully completed.
    last_completed_source_row_id: i64,
    conversations_imported: usize,
    messages_imported: usize,
    /// Presumably when this checkpoint was last written, in ms since the Unix epoch.
    updated_at_ms: i64,
}
1856
/// One conversation queued for a batched historical import.
// NOTE(review): construction and use are outside the visible section; field
// notes are inferred from names — confirm.
#[derive(Debug, Clone)]
struct HistoricalBatchEntry {
    /// Presumably the conversation's row id in the source (historical) database.
    source_row_id: i64,
    /// Presumably the agent id the conversation is attached to on import.
    agent_id: i64,
    /// Presumably the workspace id on import, when the conversation has one.
    workspace_id: Option<i64>,
    /// The conversation payload itself.
    conversation: Conversation,
}
1864
/// Counters produced by importing one batch of historical rows.
// NOTE(review): producers are outside the visible section; field notes are
// inferred from names — confirm.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct HistoricalBatchImportTotals {
    /// Presumably the number of source conversation rows inserted.
    inserted_source_rows: usize,
    /// Presumably the number of messages inserted.
    inserted_messages: usize,
}
1870
1871fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
1872    let mut roots = Vec::new();
1873    let Some(parent) = db_path.parent() else {
1874        return roots;
1875    };
1876    let db_name = db_path
1877        .file_name()
1878        .and_then(|n| n.to_str())
1879        .unwrap_or("agent_search.db");
1880    let db_stem = db_path
1881        .file_stem()
1882        .and_then(|n| n.to_str())
1883        .unwrap_or("agent_search");
1884
1885    let mut push_root = |path: PathBuf| {
1886        if path == db_path {
1887            return;
1888        }
1889        if !roots.iter().any(|existing| existing == &path) {
1890            roots.push(path);
1891        }
1892    };
1893
1894    if let Ok(entries) = fs::read_dir(parent) {
1895        for entry in entries.flatten() {
1896            let path = entry.path();
1897            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1898                continue;
1899            };
1900            if has_db_sidecar_suffix(name) {
1901                continue;
1902            }
1903            if name.starts_with(&format!("{db_name}.backup."))
1904                || name.starts_with(&format!("{db_stem}.corrupt."))
1905            {
1906                push_root(path);
1907            }
1908        }
1909    }
1910
1911    let backups_dir = parent.join("backups");
1912    if let Ok(entries) = fs::read_dir(backups_dir) {
1913        for entry in entries.flatten() {
1914            let path = entry.path();
1915            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1916                continue;
1917            };
1918            if has_db_sidecar_suffix(name) {
1919                continue;
1920            }
1921            if name.starts_with(&format!("{db_name}.")) && name.ends_with(".bak") {
1922                push_root(path);
1923            }
1924        }
1925    }
1926
1927    push_named_database_children(&mut roots, db_path, &parent.join("repair-lab"), db_name);
1928    push_named_database_children(&mut roots, db_path, &parent.join("snapshots"), db_name);
1929
1930    roots
1931}
1932
/// Appends `<child>/<db_name>` to `roots` for every entry `<child>` of `dir`
/// where that file exists.
///
/// The candidate equal to `canonical_db_path` and candidates already present
/// in `roots` are skipped. An unreadable or missing `dir` leaves `roots`
/// untouched.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(children) = fs::read_dir(dir) else {
        return;
    };

    for child in children.flatten() {
        let candidate = child.path().join(db_name);
        // Checks stay in this order so the filesystem is only consulted for
        // non-canonical candidates.
        let skip = candidate == canonical_db_path
            || !candidate.exists()
            || roots.contains(&candidate);
        if skip {
            continue;
        }
        roots.push(candidate);
    }
}
1951
/// Returns the modification time of `path` in milliseconds since the Unix
/// epoch.
///
/// Yields 0 when the metadata or modification time cannot be read, or when
/// the timestamp lies before the epoch. A millisecond count that does not fit
/// in `i64` saturates at `i64::MAX` instead of being silently truncated.
fn file_mtime_ms(path: &Path) -> i64 {
    fs::metadata(path)
        .and_then(|meta| meta.modified())
        .ok()
        .and_then(|ts| ts.duration_since(UNIX_EPOCH).ok())
        // `as_millis` is a u128; convert explicitly rather than wrapping with `as`.
        .map(|elapsed| i64::try_from(elapsed.as_millis()).unwrap_or(i64::MAX))
        .unwrap_or(0)
}
1960
1961fn bundle_total_bytes(root_path: &Path) -> u64 {
1962    let mut total = fs::metadata(root_path).map(|meta| meta.len()).unwrap_or(0);
1963    for suffix in ["-wal", "-shm"] {
1964        let sidecar = database_sidecar_path(root_path, suffix);
1965        total = total.saturating_add(fs::metadata(sidecar).map(|meta| meta.len()).unwrap_or(0));
1966    }
1967    total
1968}
1969
1970pub(crate) fn discover_historical_database_bundles(
1971    db_path: &Path,
1972) -> Vec<HistoricalDatabaseBundle> {
1973    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
1974        .into_iter()
1975        .filter(|root| root.exists())
1976        .map(|root_path| {
1977            let modified_at_ms = file_mtime_ms(&root_path);
1978            let total_bytes = bundle_total_bytes(&root_path);
1979            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
1980            let probe = probe_historical_bundle(&root_path);
1981            HistoricalDatabaseBundle {
1982                modified_at_ms,
1983                total_bytes,
1984                supports_direct_readonly,
1985                root_path,
1986                probe,
1987            }
1988        })
1989        .filter(|bundle| bundle.total_bytes > 0)
1990        .collect();
1991
1992    fn bundle_priority(path: &Path) -> i32 {
1993        let path_str = path.to_string_lossy();
1994        if path_str.contains("/repair-lab/replay-") {
1995            return 5;
1996        }
1997        if path_str.contains("/repair-lab/") {
1998            return 4;
1999        }
2000        if path_str.contains("/snapshots/") {
2001            return 3;
2002        }
2003        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
2004            return 0;
2005        }
2006        1
2007    }
2008
2009    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
2010        // Classify FTS health. The probe only sets `fts_queryable = true`
2011        // when `fts_schema_rows == Some(1)` (see
2012        // `historical_bundle_fts_queryable_via_frankensqlite`), so we have
2013        // two legitimate "clean" shapes for a bundle:
2014        //
2015        //   * `fts_schema_rows == Some(1) && fts_queryable` — a pre-V14
2016        //     bundle where the FTS virtual table was eagerly created by
2017        //     migration and is queryable right now.
2018        //
2019        //   * `fts_schema_rows == Some(0) && schema_version == Some(V14+)` —
2020        //     a modern bundle where `MIGRATION_V14` dropped fts_messages on
2021        //     purpose and cass recreates it lazily via
2022        //     `ensure_search_fallback_fts_consistency` on the first open.
2023        //     Gating on `schema_version == CURRENT_SCHEMA_VERSION` is critical
2024        //     so an incomplete pre-V14 bundle with 0 fts rows is not promoted
2025        //     alongside real lazy-V14+ bundles. A `None` schema_version
2026        //     (schema marker unreadable) is excluded for the same reason.
2027        //
2028        // Everything else — `Some(1)` without queryability, `Some(n)` for
2029        // n >= 2 (duplicated CREATE VIRTUAL TABLE rows from a broken legacy
2030        // rebuild), `None` entirely, or `Some(0)` on a non-current schema —
2031        // is not "fts clean".
2032        let fts_clean = match bundle.probe.fts_schema_rows {
2033            Some(1) => bundle.probe.fts_queryable,
2034            Some(0) => bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION),
2035            _ => false,
2036        };
2037
2038        let clean_schema14_fts =
2039            bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION) && fts_clean;
2040        if clean_schema14_fts {
2041            return 5;
2042        }
2043
2044        if fts_clean {
2045            return 4;
2046        }
2047
2048        if bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION)
2049            && bundle.supports_direct_readonly
2050        {
2051            return 3;
2052        }
2053
2054        if bundle.supports_direct_readonly {
2055            return 2;
2056        }
2057
2058        1
2059    }
2060
2061    bundles.sort_by(|left, right| {
2062        bundle_health_rank(right)
2063            .cmp(&bundle_health_rank(left))
2064            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
2065            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
2066            .then_with(|| {
2067                right
2068                    .supports_direct_readonly
2069                    .cmp(&left.supports_direct_readonly)
2070            })
2071            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
2072            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
2073            .then_with(|| right.root_path.cmp(&left.root_path))
2074    });
2075    bundles
2076}
2077
2078fn probe_historical_bundle(root_path: &Path) -> HistoricalBundleProbe {
2079    let Ok(conn) = open_historical_bundle_readonly(root_path) else {
2080        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or_default();
2081    };
2082
2083    let schema_version = read_meta_schema_version(&conn).ok().flatten();
2084    let fts_schema_rows: Option<i64> = conn
2085        .query_row_map(
2086            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
2087            fparams![],
2088            |row| row.get_typed(0),
2089        )
2090        .ok();
2091    let fts_queryable =
2092        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
2093    let max_message_id: i64 = conn
2094        .query_row_map(
2095            "SELECT COALESCE(MAX(id), 0) FROM messages",
2096            fparams![],
2097            |row| row.get_typed(0),
2098        )
2099        .unwrap_or(0);
2100
2101    let probe = HistoricalBundleProbe {
2102        schema_version,
2103        fts_schema_rows,
2104        fts_queryable,
2105        max_message_id,
2106    };
2107
2108    if probe.schema_version.is_none()
2109        && probe.fts_schema_rows.is_none()
2110        && probe.max_message_id == 0
2111    {
2112        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or(probe);
2113    }
2114
2115    probe
2116}
2117
2118fn probe_historical_bundle_via_sqlite3_metadata(root_path: &Path) -> Option<HistoricalBundleProbe> {
2119    let bundle_uri = format!("file:{}?immutable=1", root_path.to_string_lossy());
2120    let output = Command::new("sqlite3")
2121        .arg("-batch")
2122        .arg("-noheader")
2123        .arg(&bundle_uri)
2124        .arg(
2125            "PRAGMA writable_schema=ON;
2126             SELECT COALESCE((SELECT value FROM meta WHERE key = 'schema_version'), '');
2127             SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages';
2128             SELECT COALESCE(MAX(id), 0) FROM messages;",
2129        )
2130        .output()
2131        .ok()?;
2132    if !output.status.success() {
2133        return None;
2134    }
2135
2136    let stdout = String::from_utf8(output.stdout).ok()?;
2137    let mut lines = stdout.lines();
2138    let schema_version = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
2139    let fts_schema_rows = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
2140    let max_message_id = lines
2141        .next()
2142        .and_then(|raw| raw.trim().parse::<i64>().ok())
2143        .unwrap_or(0);
2144
2145    Some(HistoricalBundleProbe {
2146        schema_version,
2147        fts_schema_rows,
2148        fts_queryable: false,
2149        max_message_id,
2150    })
2151}
2152
2153fn historical_bundle_fts_queryable_via_frankensqlite(
2154    root_path: &Path,
2155    fts_schema_rows: Option<i64>,
2156) -> bool {
2157    matches!(fts_schema_rows, Some(1))
2158        && FrankenStorage::open_readonly(root_path)
2159            .map(|storage| {
2160                storage
2161                    .raw()
2162                    .query("SELECT COUNT(*) FROM fts_messages")
2163                    .is_ok()
2164            })
2165            .unwrap_or(false)
2166}
2167
2168fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
2169    open_historical_bundle_readonly(root_path)
2170        .and_then(|conn| historical_bundle_has_queryable_core_tables(&conn))
2171        .is_ok()
2172}
2173
2174fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
2175    let found: Option<i64> = conn
2176        .query_row_map(
2177            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
2178            fparams![table],
2179            |row| row.get_typed(0),
2180        )
2181        .optional()
2182        .with_context(|| format!("checking for historical table {table}"))?;
2183    Ok(found.is_some())
2184}
2185
2186fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
2187    if !historical_table_exists(conn, table)? {
2188        return Err(anyhow!(
2189            "historical database missing required table {table}"
2190        ));
2191    }
2192
2193    let sql = format!("SELECT rowid FROM {table} LIMIT 1");
2194    let _: Option<i64> = conn
2195        .query_row_map(&sql, fparams![], |row| row.get_typed(0))
2196        .optional()
2197        .with_context(|| format!("probing rows from historical table {table}"))?;
2198    Ok(())
2199}
2200
2201fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
2202    probe_historical_table_reads(conn, "conversations")?;
2203    probe_historical_table_reads(conn, "messages")?;
2204    Ok(())
2205}
2206
2207fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
2208    let path_str = root_path.to_string_lossy();
2209    let flags = FrankenOpenFlags::SQLITE_OPEN_READ_ONLY;
2210    let conn = open_franken_with_flags(&path_str, flags)
2211        .with_context(|| format!("opening historical database {}", root_path.display()))?;
2212    Ok(conn)
2213}
2214
/// Returns `true` when `line` is an `INSERT INTO` or `INSERT OR IGNORE INTO`
/// statement targeting one of the six core tables, with the table name
/// wrapped in matching single or double quotes.
///
/// The statement must start at the very beginning of the line. The check
/// performs no allocation, which matters because it runs once per line of
/// `sqlite3 .recover` output.
fn is_recoverable_insert_line(line: &str) -> bool {
    const RECOVERABLE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    // Peel off the statement keyword, leaving the quoted table name onwards.
    let Some(target) = line
        .strip_prefix("INSERT INTO ")
        .or_else(|| line.strip_prefix("INSERT OR IGNORE INTO "))
    else {
        return false;
    };

    // The table name must open with `'` or `"` and close with the same quote.
    let mut chars = target.chars();
    let quote = match chars.next() {
        Some(quote @ ('\'' | '"')) => quote,
        _ => return false,
    };
    let after_quote = chars.as_str();

    RECOVERABLE_TABLES.iter().any(|table| {
        after_quote
            .strip_prefix(*table)
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
2232
2233fn recover_historical_bundle_via_sqlite3(
2234    bundle: &HistoricalDatabaseBundle,
2235) -> Result<HistoricalReadConnection> {
2236    let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
2237    let recovered_db = tempdir.path().join("historical-recovered.db");
2238    let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
2239        .with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
2240    temp_conn
2241        .execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
2242        .with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
2243    drop(temp_conn);
2244
2245    let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
2246    let mut recover = Command::new("sqlite3")
2247        .arg(&bundle_uri)
2248        .arg(".recover")
2249        .stdout(Stdio::piped())
2250        .spawn()
2251        .with_context(|| {
2252            format!(
2253                "launching sqlite3 .recover for historical bundle {}",
2254                bundle.root_path.display()
2255            )
2256        })?;
2257    let recover_stdout = recover
2258        .stdout
2259        .take()
2260        .context("capturing sqlite3 .recover stdout")?;
2261
2262    let mut importer = Command::new("sqlite3")
2263        .arg(&recovered_db)
2264        .stdin(Stdio::piped())
2265        .spawn()
2266        .with_context(|| {
2267            format!(
2268                "launching sqlite3 importer for recovered bundle {}",
2269                recovered_db.display()
2270            )
2271        })?;
2272
2273    {
2274        let importer_stdin = importer
2275            .stdin
2276            .as_mut()
2277            .context("opening sqlite3 importer stdin")?;
2278        importer_stdin
2279            .write_all(b"BEGIN;\n")
2280            .context("starting recovery import transaction")?;
2281
2282        let reader = BufReader::new(recover_stdout);
2283        for line in reader.lines() {
2284            let line = line.context("reading sqlite3 .recover output")?;
2285            if is_recoverable_insert_line(&line) {
2286                importer_stdin
2287                    .write_all(line.as_bytes())
2288                    .context("writing recovered INSERT")?;
2289                importer_stdin
2290                    .write_all(b"\n")
2291                    .context("writing recovered INSERT newline")?;
2292            }
2293        }
2294
2295        importer_stdin
2296            .write_all(b"COMMIT;\n")
2297            .context("committing recovery import transaction")?;
2298    }
2299
2300    let importer_status = importer
2301        .wait()
2302        .context("waiting for sqlite3 recovery importer")?;
2303    let recover_status = recover
2304        .wait()
2305        .context("waiting for sqlite3 .recover process")?;
2306    if !importer_status.success() {
2307        anyhow::bail!(
2308            "sqlite3 recovery importer exited with status {} for {} after sqlite3 .recover exited with status {}",
2309            importer_status,
2310            recovered_db.display(),
2311            recover_status
2312        );
2313    }
2314
2315    let conn = open_historical_bundle_readonly(&recovered_db)?;
2316    historical_bundle_has_queryable_core_tables(&conn)?;
2317    if !recover_status.success() {
2318        let (conversations, messages) = historical_bundle_counts(&conn)?;
2319        if conversations == 0 && messages == 0 {
2320            anyhow::bail!(
2321                "sqlite3 .recover exited with status {} for {} and recovered no core rows",
2322                recover_status,
2323                bundle.root_path.display()
2324            );
2325        }
2326        tracing::warn!(
2327            path = %bundle.root_path.display(),
2328            status = %recover_status,
2329            conversations,
2330            messages,
2331            "sqlite3 .recover exited nonzero after emitting recoverable core rows; continuing with recovered subset"
2332        );
2333    }
2334    Ok(HistoricalReadConnection {
2335        conn,
2336        method: "sqlite3-recover",
2337        root_path: recovered_db,
2338        _tempdir: Some(tempdir),
2339    })
2340}
2341
2342fn open_historical_bundle_for_salvage(
2343    bundle: &HistoricalDatabaseBundle,
2344) -> Result<HistoricalReadConnection> {
2345    match open_historical_bundle_readonly(&bundle.root_path) {
2346        Ok(conn) => {
2347            if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
2348                return Ok(HistoricalReadConnection {
2349                    conn,
2350                    method: "direct-readonly",
2351                    root_path: bundle.root_path.clone(),
2352                    _tempdir: None,
2353                });
2354            }
2355        }
2356        Err(err) => {
2357            tracing::warn!(
2358                path = %bundle.root_path.display(),
2359                error = %err,
2360                "historical bundle direct open failed; falling back to sqlite3 .recover"
2361            );
2362        }
2363    }
2364
2365    recover_historical_bundle_via_sqlite3(bundle)
2366}
2367
2368fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
2369    let conversations: i64 =
2370        conn.query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
2371            row.get_typed(0)
2372        })?;
2373    let messages: i64 = conn.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
2374        row.get_typed(0)
2375    })?;
2376    Ok((
2377        usize::try_from(conversations.max(0)).unwrap_or(usize::MAX),
2378        usize::try_from(messages.max(0)).unwrap_or(usize::MAX),
2379    ))
2380}
2381
2382fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
2383    conn.execute(
2384        "DELETE FROM meta
2385         WHERE key LIKE 'historical_bundle_salvaged:%'
2386            OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')",
2387    )?;
2388    Ok(())
2389}
2390
2391fn record_historical_bundle_import(
2392    conn: &FrankenConnection,
2393    bundle: &HistoricalDatabaseBundle,
2394    method: &str,
2395    conversations_imported: usize,
2396    messages_imported: usize,
2397) -> Result<()> {
2398    let key = FrankenStorage::historical_bundle_meta_key(bundle);
2399    let value = serde_json::json!({
2400        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
2401        "path": bundle.root_path.display().to_string(),
2402        "bytes": bundle.total_bytes,
2403        "modified_at_ms": bundle.modified_at_ms,
2404        "method": method,
2405        "conversations_imported": conversations_imported,
2406        "messages_imported": messages_imported,
2407        "recorded_at_ms": FrankenStorage::now_millis(),
2408    });
2409    let value_str = serde_json::to_string(&value)?;
2410    conn.execute_compat(
2411        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
2412        fparams![key, value_str],
2413    )?;
2414    Ok(())
2415}
2416
2417fn scrub_staged_derived_fts_metadata_via_sqlite3(staged_db_path: &Path) -> Result<()> {
2418    let scrub_sql = "PRAGMA writable_schema = ON;
2419         DELETE FROM sqlite_master
2420          WHERE name = 'fts_messages'
2421             OR tbl_name = 'fts_messages'
2422             OR name IN (
2423                'fts_messages_config',
2424                'fts_messages_content',
2425                'fts_messages_data',
2426                'fts_messages_docsize',
2427                'fts_messages_idx'
2428             )
2429             OR tbl_name IN (
2430                'fts_messages_config',
2431                'fts_messages_content',
2432                'fts_messages_data',
2433                'fts_messages_docsize',
2434                'fts_messages_idx'
2435             );
2436         PRAGMA writable_schema = OFF;";
2437
2438    let run_scrub = |disable_defensive: bool| -> Result<std::process::Output> {
2439        let mut command = Command::new("sqlite3");
2440        command.arg("-batch").arg(staged_db_path);
2441        if disable_defensive {
2442            command.arg(".dbconfig defensive off");
2443        }
2444        command.arg(scrub_sql).output().with_context(|| {
2445            format!(
2446                "running sqlite3 staged FTS metadata scrub for {}",
2447                staged_db_path.display()
2448            )
2449        })
2450    };
2451    let render_output = |output: &std::process::Output| -> String {
2452        format!(
2453            "status {}; stdout: {}; stderr: {}",
2454            output.status,
2455            String::from_utf8_lossy(&output.stdout).trim(),
2456            String::from_utf8_lossy(&output.stderr).trim()
2457        )
2458    };
2459
2460    let defensive_off_output = run_scrub(true)?;
2461    if defensive_off_output.status.success() {
2462        return Ok(());
2463    }
2464
2465    let fallback_output = run_scrub(false)?;
2466    if !fallback_output.status.success() {
2467        anyhow::bail!(
2468            "sqlite3 staged FTS metadata scrub failed for {}; defensive-off attempt {}; fallback without .dbconfig {}",
2469            staged_db_path.display(),
2470            render_output(&defensive_off_output),
2471            render_output(&fallback_output)
2472        );
2473    }
2474    Ok(())
2475}
2476
2477fn ensure_seeded_canonical_fts_consistency(staged_db_path: &Path) -> Result<FtsConsistencyRepair> {
2478    match ensure_fts_consistency_via_rusqlite(staged_db_path) {
2479        Ok(repair) => Ok(repair),
2480        Err(err) => {
2481            if fts_messages_integrity_error_from_message(format!("{err:#}")).is_none() {
2482                return Err(err).with_context(|| {
2483                    format!(
2484                        "repairing staged canonical FTS consistency before finalization: {}",
2485                        staged_db_path.display()
2486                    )
2487                });
2488            }
2489
2490            tracing::warn!(
2491                path = %staged_db_path.display(),
2492                error = %err,
2493                "staged historical seed has malformed derived FTS metadata; scrubbing and rebuilding FTS on staged copy"
2494            );
2495            scrub_staged_derived_fts_metadata_via_sqlite3(staged_db_path).with_context(|| {
2496                format!(
2497                    "scrubbing malformed staged FTS metadata before finalization: {}",
2498                    staged_db_path.display()
2499                )
2500            })?;
2501            ensure_fts_consistency_via_rusqlite(staged_db_path).with_context(|| {
2502                format!(
2503                    "repairing staged canonical FTS consistency after metadata scrub: {}",
2504                    staged_db_path.display()
2505                )
2506            })
2507        }
2508    }
2509}
2510
2511fn finalize_seeded_canonical_bundle_via_rusqlite(
2512    canonical_db_path: &Path,
2513    bundle: &HistoricalDatabaseBundle,
2514) -> Result<(usize, usize)> {
2515    let _fts_repair = ensure_seeded_canonical_fts_consistency(canonical_db_path)?;
2516
2517    let path_str = canonical_db_path.to_string_lossy();
2518    let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
2519        format!(
2520            "opening seeded canonical database for post-seed finalization: {}",
2521            canonical_db_path.display()
2522        )
2523    })?;
2524    conn.execute("PRAGMA busy_timeout = 30000;")
2525        .with_context(|| {
2526            format!(
2527                "configuring busy timeout for seeded canonical database {}",
2528                canonical_db_path.display()
2529            )
2530        })?;
2531    let schema_version = read_meta_schema_version(&conn)?;
2532
2533    if let Some(version) = schema_version
2534        && version < CURRENT_SCHEMA_VERSION
2535        && version != 13
2536    {
2537        anyhow::bail!(
2538            "seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
2539        );
2540    }
2541
2542    clear_seeded_runtime_meta(&conn)?;
2543    let (conversations_imported, messages_imported) = historical_bundle_counts(&conn)?;
2544
2545    conn.execute_compat(
2546        "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
2547        fparams![CURRENT_SCHEMA_VERSION.to_string()],
2548    )?;
2549
2550    conn.execute_compat(
2551        "INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
2552        fparams![CURRENT_SCHEMA_VERSION],
2553    )?;
2554    record_historical_bundle_import(
2555        &conn,
2556        bundle,
2557        "baseline-bulk-sql-copy",
2558        conversations_imported,
2559        messages_imported,
2560    )?;
2561    Ok((conversations_imported, messages_imported))
2562}
2563
2564fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
2565    let version: Option<String> = conn
2566        .query_row_map(
2567            "SELECT value FROM meta WHERE key = 'schema_version'",
2568            fparams![],
2569            |row| row.get_typed(0),
2570        )
2571        .optional()?;
2572    Ok(version.and_then(|raw| raw.parse::<i64>().ok()))
2573}
2574
/// Counts the `sqlite_master` rows named `fts_messages` (test-only helper).
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    let row_count = conn.query_row_map(
        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
        fparams![],
        |row| row.get_typed::<i64>(0),
    );
    row_count.context("counting sqlite_master rows for fts_messages via frankensqlite")
}
2584
/// Reports whether a `COUNT(*)` query against `fts_messages` succeeds (test-only helper).
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    let probe = conn.query("SELECT COUNT(*) FROM fts_messages");
    probe.is_ok()
}
2589
/// Collects a health snapshot of the database at `db_path` (test-only).
///
/// Reads the recorded schema version, runs `PRAGMA quick_check(1)`, and checks
/// whether exactly one `fts_messages` schema row exists and can be queried.
/// Message statistics are only queried when the quick check reports `ok`;
/// otherwise both are reported as `0`.
///
/// # Errors
/// Fails when the database cannot be opened or configured, or when any of the
/// probe queries (other than the FTS queryability probe) fails.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let db_path_text = db_path.to_string_lossy();
    let conn = FrankenConnection::open(db_path_text.as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;

    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");

    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    // Only probe the FTS table when its schema row count is exactly one.
    let fts_queryable = if fts_schema_rows == 1 {
        franken_fts_limit_probe(&conn)
    } else {
        false
    };

    // Skip the message statistics entirely when the quick check failed.
    let (message_count, max_message_id) = if quick_check_ok {
        let count: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let max_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (count, max_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
2651
2652struct StagedHistoricalSeed {
2653    tempdir: tempfile::TempDir,
2654    db_path: PathBuf,
2655}
2656
2657fn stage_historical_bundle_for_seed(
2658    canonical_db_path: &Path,
2659    source_root_path: &Path,
2660) -> Result<StagedHistoricalSeed> {
2661    let canonical_parent = canonical_db_path.parent().unwrap_or_else(|| Path::new("."));
2662    fs::create_dir_all(canonical_parent).with_context(|| {
2663        format!(
2664            "creating canonical database directory before bulk historical seed import: {}",
2665            canonical_parent.display()
2666        )
2667    })?;
2668    let tempdir = tempfile::TempDir::new_in(canonical_parent)
2669        .context("creating temporary baseline seed directory")?;
2670    let staged_seed_db = tempdir.path().join("baseline-seed-output.db");
2671    copy_database_bundle(source_root_path, &staged_seed_db)?;
2672
2673    Ok(StagedHistoricalSeed {
2674        tempdir,
2675        db_path: staged_seed_db,
2676    })
2677}
2678
2679fn stage_and_finalize_historical_seed(
2680    canonical_db_path: &Path,
2681    bundle: &HistoricalDatabaseBundle,
2682    source_root_path: &Path,
2683) -> Result<(StagedHistoricalSeed, usize, usize)> {
2684    let staged_seed = stage_historical_bundle_for_seed(canonical_db_path, source_root_path)?;
2685    let (conversations_imported, messages_imported) =
2686        finalize_seeded_canonical_bundle_via_rusqlite(&staged_seed.db_path, bundle)?;
2687    Ok((staged_seed, conversations_imported, messages_imported))
2688}
2689
2690fn promote_staged_historical_seed(
2691    canonical_db_path: &Path,
2692    staged_seed: &StagedHistoricalSeed,
2693) -> Result<()> {
2694    let canonical_backup = staged_seed
2695        .tempdir
2696        .path()
2697        .join("pre-seed-canonical-backup.db");
2698    let had_canonical = canonical_db_path.exists()
2699        || database_sidecar_path(canonical_db_path, "-wal").exists()
2700        || database_sidecar_path(canonical_db_path, "-shm").exists();
2701
2702    if had_canonical {
2703        move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
2704            format!(
2705                "backing up canonical database before promoting staged historical seed import: {}",
2706                canonical_db_path.display()
2707            )
2708        })?;
2709    }
2710
2711    if let Err(err) =
2712        move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
2713            format!(
2714                "promoting staged historical seed database bundle {} into canonical path {}",
2715                staged_seed.db_path.display(),
2716                canonical_db_path.display()
2717            )
2718        })
2719    {
2720        if had_canonical {
2721            let _ = move_database_bundle(&canonical_backup, canonical_db_path);
2722        }
2723        return Err(err);
2724    }
2725
2726    Ok(())
2727}
2728
2729pub(crate) fn seed_canonical_from_best_historical_bundle(
2730    canonical_db_path: &Path,
2731) -> Result<Option<HistoricalSalvageOutcome>> {
2732    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
2733    let mut last_seed_error: Option<anyhow::Error> = None;
2734    for bundle in ordered_bundles {
2735        if let Some(version) = bundle.probe.schema_version
2736            && version < 13
2737        {
2738            let err = anyhow!(
2739                "historical bundle {} schema_version {version} is too old for baseline import",
2740                bundle.root_path.display()
2741            );
2742            tracing::warn!(
2743                path = %bundle.root_path.display(),
2744                schema_version = version,
2745                "historical bundle is too old for baseline seed import"
2746            );
2747            last_seed_error = Some(err);
2748            continue;
2749        }
2750
2751        let (staged_seed, conversations_imported, messages_imported) =
2752            match stage_and_finalize_historical_seed(canonical_db_path, &bundle, &bundle.root_path)
2753            {
2754                Ok(result) => result,
2755                Err(primary_err) => {
2756                    tracing::warn!(
2757                        path = %bundle.root_path.display(),
2758                        error = %primary_err,
2759                        "direct bulk baseline seed from historical bundle failed; trying sqlite3 salvage copy"
2760                    );
2761                    let source = match open_historical_bundle_for_salvage(&bundle).with_context(
2762                        || {
2763                            format!(
2764                                "opening historical seed bundle {} for baseline import",
2765                                bundle.root_path.display()
2766                            )
2767                        },
2768                    ) {
2769                        Ok(source) => source,
2770                        Err(salvage_err) => {
2771                            last_seed_error = Some(anyhow!(
2772                                "direct baseline seed from {} failed: {primary_err:#}; sqlite3 salvage open also failed: {salvage_err:#}",
2773                                bundle.root_path.display()
2774                            ));
2775                            continue;
2776                        }
2777                    };
2778                    match stage_and_finalize_historical_seed(
2779                        canonical_db_path,
2780                        &bundle,
2781                        &source.root_path,
2782                    ) {
2783                        Ok(result) => result,
2784                        Err(err) => {
2785                            tracing::warn!(
2786                                path = %bundle.root_path.display(),
2787                                source_path = %source.root_path.display(),
2788                                error = %err,
2789                                "bulk baseline seed staging from sqlite3-salvaged historical bundle failed; trying next candidate"
2790                            );
2791                            last_seed_error = Some(err);
2792                            continue;
2793                        }
2794                    }
2795                }
2796            };
2797
2798        if conversations_imported == 0 && messages_imported == 0 {
2799            let err = anyhow!(
2800                "historical bundle {} has no core rows for baseline import",
2801                bundle.root_path.display()
2802            );
2803            tracing::warn!(
2804                path = %bundle.root_path.display(),
2805                "historical bundle has no core rows for baseline seed import"
2806            );
2807            last_seed_error = Some(err);
2808            continue;
2809        }
2810
2811        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
2812            tracing::warn!(
2813                path = %bundle.root_path.display(),
2814                error = %err,
2815                "promoting staged historical seed import failed; trying next candidate"
2816            );
2817            last_seed_error = Some(err);
2818            continue;
2819        }
2820
2821        tracing::info!(
2822            path = %bundle.root_path.display(),
2823            conversations_imported,
2824            messages_imported,
2825            "seeded empty canonical database from largest healthy historical bundle"
2826        );
2827
2828        return Ok(Some(HistoricalSalvageOutcome {
2829            bundles_considered: 0,
2830            bundles_imported: 1,
2831            conversations_imported,
2832            messages_imported,
2833        }));
2834    }
2835    if let Some(err) = last_seed_error {
2836        return Err(err);
2837    }
2838    Ok(None)
2839}
2840
2841fn parse_json_column(value: Option<String>) -> serde_json::Value {
2842    value
2843        .and_then(|raw| serde_json::from_str(&raw).ok())
2844        .unwrap_or(serde_json::Value::Null)
2845}
2846
2847const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";
2848
2849fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
2850    serde_json::json!({ HISTORICAL_RAW_JSON_SENTINEL_KEY: raw })
2851}
2852
2853fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
2854    match value {
2855        serde_json::Value::Object(map) if map.len() == 1 => map
2856            .get(HISTORICAL_RAW_JSON_SENTINEL_KEY)
2857            .and_then(serde_json::Value::as_str),
2858        _ => None,
2859    }
2860}
2861
2862fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
2863    match value {
2864        Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
2865        Some(raw) => wrap_historical_raw_json(raw),
2866        None => serde_json::Value::Null,
2867    }
2868}
2869
/// Reports whether the `CASS_DEBUG_HISTORICAL_SALVAGE` environment variable is
/// set (to any value, including an empty one).
fn historical_salvage_debug_enabled() -> bool {
    let debug_flag = std::env::var_os("CASS_DEBUG_HISTORICAL_SALVAGE");
    debug_flag.is_some()
}
2873
/// Upper bounds for one historical import batch, as produced by
/// `historical_import_batch_limits`.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
    /// Limit on conversations per batch.
    conversations: usize,
    /// Limit on messages per batch.
    messages: usize,
    /// Limit on payload size per batch (see `CASS_HISTORICAL_IMPORT_BATCH_CHARS`).
    payload_chars: usize,
}
2880
2881fn env_positive_usize(key: &str) -> Option<usize> {
2882    dotenvy::var(key)
2883        .ok()
2884        .and_then(|value| value.parse::<usize>().ok())
2885        .filter(|value| *value > 0)
2886}
2887
2888fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
2889    let cpu_count = std::thread::available_parallelism()
2890        .map(std::num::NonZeroUsize::get)
2891        .unwrap_or(1);
2892
2893    let default_limits = if cpu_count >= 32 {
2894        HistoricalImportBatchLimits {
2895            conversations: 128,
2896            messages: 16_384,
2897            payload_chars: 12_000_000,
2898        }
2899    } else {
2900        HistoricalImportBatchLimits {
2901            conversations: 32,
2902            messages: 4_096,
2903            payload_chars: 3_000_000,
2904        }
2905    };
2906
2907    HistoricalImportBatchLimits {
2908        conversations: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
2909            .unwrap_or(default_limits.conversations),
2910        messages: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES")
2911            .unwrap_or(default_limits.messages),
2912        payload_chars: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS")
2913            .unwrap_or(default_limits.payload_chars),
2914    }
2915}
2916
2917fn json_value_size_hint(value: &serde_json::Value) -> usize {
2918    if let Some(raw) = historical_raw_json(value) {
2919        return raw.len();
2920    }
2921    match value {
2922        serde_json::Value::Null => 0,
2923        other => serde_json::to_string(other)
2924            .map(|raw| raw.len())
2925            .unwrap_or(0),
2926    }
2927}
2928
2929fn message_payload_size_hint(message: &Message) -> usize {
2930    message
2931        .content
2932        .len()
2933        .saturating_add(json_value_size_hint(&message.extra_json))
2934}
2935
/// Returns `true` when `name` starts with `prefix` and is not a `-wal` or
/// `-shm` sidecar file name.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    if !name.starts_with(prefix) {
        return false;
    }
    !(name.ends_with("-wal") || name.ends_with("-shm"))
}
2939
// Recognizes file names of sqlite sidecar files, which must never be re-opened
// as a DB root: the standard -wal/-shm pair plus frankensqlite's Windows
// advisory-lock sidecars (-lock-shared/-lock-reserved/-lock-pending). Used by
// the directory enumeration paths in `historical_bundle_root_paths`.
// `is_backup_root_name` deliberately does NOT use this check, because the
// existing backup-rotation cleanup must keep sweeping up any pre-existing
// orphan lock sidecars.
fn has_db_sidecar_suffix(name: &str) -> bool {
    for suffix in [
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ] {
        if name.ends_with(suffix) {
            return true;
        }
    }
    false
}
2956
/// Schema version written by this build; exposed publicly for external checks.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest schema version that `check_schema_compatibility` still classifies as
/// migratable in place; anything older is reported as needing a rebuild.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
2960
/// Outcome of a schema compatibility check.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
    /// The stored schema matches the current version; nothing to do.
    Compatible,
    /// The schema is behind but can be upgraded incrementally.
    NeedsMigration,
    /// The schema cannot be used or migrated; the payload explains why a full
    /// rebuild is required.
    NeedsRebuild(String),
}
2971
2972fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
2973    // Only on-disk corruption classes justify destructive rebuild.
2974    // Locking, open, and generic I/O failures are often transient and must
2975    // surface as errors rather than deleting the database under the caller.
2976    matches!(
2977        err,
2978        frankensqlite::FrankenError::DatabaseCorrupt { .. }
2979            | frankensqlite::FrankenError::WalCorrupt { .. }
2980            | frankensqlite::FrankenError::NotADatabase { .. }
2981            | frankensqlite::FrankenError::ShortRead { .. }
2982    )
2983}
2984
/// Builds a sibling path of `path` named
/// `<file name>.backup.<pid>.<nanoseconds since epoch>.<nonce>`.
///
/// The nonce is a process-wide counter, so repeated calls within one process
/// yield distinct names. The file name falls back to `db` when `path` has no
/// UTF-8 file name, and the timestamp falls back to 0 when the system clock
/// is before the Unix epoch.
fn unique_backup_path(path: &Path) -> PathBuf {
    use std::sync::atomic::{AtomicU64, Ordering};

    static NEXT_NONCE: AtomicU64 = AtomicU64::new(0);

    let timestamp = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    let nonce = NEXT_NONCE.fetch_add(1, Ordering::Relaxed);
    let pid = std::process::id();
    let file_name = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "db",
    };

    let backup_name = format!("{file_name}.backup.{pid}.{timestamp}.{nonce}");
    path.with_file_name(backup_name)
}
3002
/// Derives the hidden sibling path `.<file name>.vacuum-in-progress` for
/// `backup_path`, using `db.backup` when it has no UTF-8 file name.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let base_name = match backup_path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => "db.backup",
    };
    let staged_name = format!(".{base_name}.vacuum-in-progress");
    backup_path.with_file_name(staged_name)
}
3010
3011/// Check schema compatibility without modifying the database.
3012///
3013/// Opens the database read-only and checks the schema version.
3014fn check_schema_compatibility(
3015    path: &Path,
3016) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
3017    let mut conn = open_franken_with_flags(
3018        &path.to_string_lossy(),
3019        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
3020    )?;
3021
3022    let result = (|| {
3023        // Check if meta table exists
3024        let meta_exists: i32 = conn.query_row_map(
3025            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
3026            fparams![],
3027            |row| row.get_typed(0),
3028        )?;
3029
3030        if meta_exists == 0 {
3031            // No meta table - could be empty or very old schema, needs rebuild
3032            // But first check if there are any tables at all
3033            let table_count: i32 = conn.query_row_map(
3034                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
3035                fparams![],
3036                |row| row.get_typed(0),
3037            )?;
3038
3039            if table_count == 0 {
3040                // Empty database, will be initialized fresh
3041                return Ok(SchemaCheck::NeedsMigration);
3042            }
3043
3044            // Has tables but no meta - very old or corrupted
3045            return Ok(SchemaCheck::NeedsRebuild(
3046                "Database missing schema version metadata".to_string(),
3047            ));
3048        }
3049
3050        // Get the schema version
3051        let version: Option<i64> = conn
3052            .query_row_map(
3053                "SELECT value FROM meta WHERE key = 'schema_version'",
3054                fparams![],
3055                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
3056            )
3057            .ok()
3058            .flatten();
3059
3060        match version {
3061            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
3062            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
3063                Ok(SchemaCheck::NeedsMigration)
3064            }
3065            Some(v) if v > 0 && v < MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION => {
3066                Ok(SchemaCheck::NeedsRebuild(format!(
3067                    "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
3068                    v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
3069                )))
3070            }
3071            Some(v) => {
3072                // v > SCHEMA_VERSION - database is from a newer version
3073                Ok(SchemaCheck::NeedsRebuild(format!(
3074                    "Schema version {} is newer than supported version {}",
3075                    v, SCHEMA_VERSION
3076                )))
3077            }
3078            None => Ok(SchemaCheck::NeedsRebuild(
3079                "Schema version not found or invalid".to_string(),
3080            )),
3081        }
3082    })();
3083
3084    if let Err(close_err) = conn.close_in_place() {
3085        tracing::warn!(
3086            error = %close_err,
3087            db_path = %path.display(),
3088            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
3089        );
3090        conn.close_best_effort_in_place();
3091    }
3092
3093    result
3094}
3095
3096const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
3097
/// Test-only SQL for schema version 1.
///
/// Enables foreign keys and creates the base tables (`meta`, `agents`,
/// `workspaces`, `conversations`, `messages`, `snippets`, `tags`,
/// `conversation_tags`) plus the `idx_conversations_agent_started` and
/// `idx_messages_conv_idx` indexes. Every statement uses `IF NOT EXISTS`.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    UNIQUE(agent_id, external_id)
);

CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
    ON conversations(agent_id, started_at DESC);

CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
    ON messages(conversation_id, idx);

";
3177
/// Test-only SQL for schema version 2.
///
/// Creates the `fts_messages` FTS5 virtual table (porter tokenizer) if it does
/// not exist and backfills it from `messages` joined with `conversations`,
/// `agents`, and (optionally) `workspaces`.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
3204
/// Test-only SQL for schema version 3.
///
/// Drops `fts_messages` if present, recreates it with the same FTS5 column
/// layout as version 2, and backfills it from the same join.
#[cfg(test)]
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
3233
/// Test-only SQL for schema version 4.
///
/// Creates the `sources` table and inserts the default `'local'` source row
/// (ignored if it already exists).
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,           -- source_id (e.g., 'local', 'work-laptop')
    kind TEXT NOT NULL,            -- 'local', 'ssh', etc.
    host_label TEXT,               -- display label
    machine_id TEXT,               -- optional stable machine id
    platform TEXT,                 -- 'macos', 'linux', 'windows'
    config_json TEXT,              -- JSON blob for extra config (SSH params, path rewrites)
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
3252
/// Test-only SQL for schema version 5.
///
/// Rebuilds `conversations` with the provenance columns `source_id` and
/// `origin_host` and the unique constraint
/// `(source_id, agent_id, external_id)`: a new table is created, existing rows
/// are copied with `source_id = 'local'`, the old table is dropped, and the
/// new one is renamed. The agent/started index is recreated and a `source_id`
/// index is added.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table

-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    UNIQUE(source_id, agent_id, external_id)
);

-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
                               source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
       source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;

-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;

-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
3290
/// Test-only SQL for schema version 6: adds an index on
/// `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
3296
/// Test-only SQL for schema version 7: adds the BLOB columns
/// `conversations.metadata_bin` and `messages.extra_bin`.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
3304
/// Test-only SQL for schema version 8.
///
/// Creates the `daily_stats` rollup table keyed by
/// `(day_id, agent_slug, source_id)` and two indexes on it (by agent and by
/// source).
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans

CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,              -- Days since 2020-01-01 (Unix epoch + offset)
    agent_slug TEXT NOT NULL,             -- 'all' for totals, or specific agent slug
    source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
3324
/// Test-only SQL for schema version 9.
///
/// Creates the `embedding_jobs` table and a partial unique index that allows
/// at most one `pending` or `running` job per `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
3347
/// Schema migration V10 (compiled only under `cfg(test)`): token analytics.
///
/// The script creates the per-message `token_usage` ledger, the pre-aggregated
/// `token_daily_stats` rollup table and the `model_pricing` lookup table
/// (seeded with `INSERT OR IGNORE`), then adds token/cost summary columns to
/// `conversations`.
///
/// NOTE(review): the trailing `ALTER TABLE ... ADD COLUMN` statements carry no
/// existence guard, unlike the `CREATE ... IF NOT EXISTS` statements above
/// them, so this script is presumably not safe to replay against a database
/// that already has those columns — confirm it is applied at most once.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',

    -- Timing
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,

    -- Model identification
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,

    -- Token counts (nullable — not all agents provide all fields)
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,

    -- Cost estimation
    estimated_cost_usd REAL,

    -- Message context
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,

    -- Data quality
    data_source TEXT NOT NULL DEFAULT 'api',

    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',

    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,

    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,

    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,

    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,

    session_count INTEGER NOT NULL DEFAULT 0,

    last_updated INTEGER NOT NULL,

    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
3471
/// Schema migration V14: drops the `fts_messages` table.
///
/// The script only removes the old table; per its embedded comment the
/// contentless replacement is recreated lazily after `open()` rather than by
/// this migration.
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
3481
/// DDL that creates the `conversation_tail_state` table if it is missing.
///
/// The table is keyed by `conversation_id` and stores `ended_at`,
/// `last_message_idx` and `last_message_created_at`.
///
/// NOTE(review): the same `CREATE TABLE` statement is embedded in
/// `MIGRATION_V18`; the two definitions presumably need to stay in sync.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
";
3492
/// Schema migration V16: drops the `idx_messages_conv_idx` index, which the
/// embedded comment describes as redundant with the automatic index backing
/// `UNIQUE(conversation_id, idx)`.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
3499
/// Schema migration V17: drops the `idx_messages_created` index so message
/// inserts no longer have to maintain it.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
3507
/// Schema migration V18: creates `conversation_tail_state` (if missing) and
/// backfills it from `conversations`.
///
/// Only conversations with at least one non-NULL tail column (`ended_at`,
/// `last_message_idx`, `last_message_created_at`) are copied. The copy uses
/// `INSERT OR REPLACE`, so an existing row for the same `conversation_id` is
/// overwritten.
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_tail_state (
    conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
   OR last_message_idx IS NOT NULL
   OR last_message_created_at IS NOT NULL;
";
3530
/// Schema migration V19: creates `conversation_external_lookup` and backfills
/// it for every conversation that has a non-NULL `external_id`.
///
/// The lookup key is the length-prefixed concatenation
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
///
/// NOTE(review): SQLite's `length()` on a TEXT value counts characters, not
/// bytes. Any Rust code that builds the same key must use the same measure or
/// keys containing non-ASCII text will not match — confirm against the key
/// builder, which is not visible here.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
3549
/// Schema migration V20: creates `conversation_external_tail_lookup` and
/// backfills it for every conversation that has a non-NULL `external_id`.
///
/// Each row pairs the length-prefixed lookup key (same construction as in
/// `MIGRATION_V19`) with the conversation id and the tail columns read from
/// `conversation_tail_state`. The tail columns come from scalar subqueries, so
/// they are NULL when no tail-state row exists for the conversation.
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    (SELECT ts.ended_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_idx
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_created_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id)
FROM conversations c
WHERE c.external_id IS NOT NULL;
";
3586
/// Row from the embedding_jobs table.
///
/// Field names mirror the table's columns. The timestamp fields are kept as
/// the raw TEXT values stored in the database.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
    /// Row id of the job.
    pub id: i64,
    /// Database path the job belongs to; part of the "one active job" key.
    pub db_path: String,
    /// Embedding model identifier; part of the "one active job" key.
    pub model_id: String,
    /// Job status string. The schema's partial unique index treats `pending`
    /// and `running` as the active states.
    pub status: String,
    /// Presumably the number of documents the job has to embed — TODO confirm.
    pub total_docs: i64,
    /// Presumably the number of documents embedded so far — TODO confirm.
    pub completed_docs: i64,
    /// Error text, if any was recorded.
    pub error_message: Option<String>,
    /// Creation timestamp (the column defaults to `datetime('now')`).
    pub created_at: String,
    /// Start timestamp; `None` when the column is NULL.
    pub started_at: Option<String>,
    /// Completion timestamp; `None` when the column is NULL.
    pub completed_at: Option<String>,
}
3601
/// Lightweight conversation projection used while rebuilding the lexical index.
///
/// This intentionally omits `metadata_json` / `metadata_bin` and other bulky
/// fields because Tantivy only needs the stable envelope plus provenance
/// identifiers. Reading full metadata here can force frankensqlite to traverse
/// large overflow chains before the first lexical checkpoint is committed.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
    /// Conversation row id, when known.
    pub id: Option<i64>,
    /// Slug of the agent that owns the conversation.
    pub agent_slug: String,
    /// Workspace path, if the conversation has one.
    pub workspace: Option<PathBuf>,
    /// Identifier assigned by the originating source, if any.
    pub external_id: Option<String>,
    /// Conversation title, if any.
    pub title: Option<String>,
    /// Path of the source the conversation was read from.
    pub source_path: PathBuf,
    /// Start timestamp (units not visible here — TODO confirm).
    pub started_at: Option<i64>,
    /// End timestamp (units not visible here — TODO confirm).
    pub ended_at: Option<i64>,
    /// Provenance source identifier.
    pub source_id: String,
    /// Host the conversation originated from, if recorded.
    pub origin_host: Option<String>,
}
3621
/// Lightweight per-conversation footprint used to pre-plan lexical rebuild
/// shard boundaries without re-reading full message bodies in the hot path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
    /// Conversation the footprint describes.
    pub conversation_id: i64,
    /// Number of messages attributed to the conversation.
    pub message_count: usize,
    /// Size of the conversation's messages in bytes. This may be an estimate:
    /// `lexical_rebuild_conversation_footprint_from_count` derives it from the
    /// message count rather than from stored content.
    pub message_bytes: usize,
}
3630
/// Per-message byte estimate used when a footprint is derived from a message
/// count instead of measured content.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Largest number of conversations that may lack tail metadata while the
/// coverage is still considered sufficient.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;

/// Reports whether tail metadata covers enough conversations.
///
/// Coverage is sufficient when the number of uncovered conversations does not
/// exceed `LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT`. An empty
/// corpus, or a covered count at or above the total, is always sufficient.
fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
    total_conversations: usize,
    covered_conversations: usize,
) -> bool {
    // Saturation yields zero both for an empty corpus and for an over-reported
    // covered count, so neither case needs a separate branch.
    let uncovered = total_conversations.saturating_sub(covered_conversations);
    uncovered <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
}
3642
/// Converts a zero-based tail message index into a message count.
///
/// Returns `None` when the index is absent or negative, or when `idx + 1`
/// overflows `u64` or does not fit in `usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|idx| u64::try_from(idx).ok())
        .and_then(|idx| idx.checked_add(1))
        .and_then(|count| usize::try_from(count).ok())
}
3648
3649fn lexical_rebuild_conversation_footprint_from_count(
3650    conversation_id: i64,
3651    message_count: usize,
3652) -> LexicalRebuildConversationFootprintRow {
3653    LexicalRebuildConversationFootprintRow {
3654        conversation_id,
3655        message_count,
3656        message_bytes: message_count
3657            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE),
3658    }
3659}
3660
/// Lightweight message projection used by the streaming lexical rebuild path.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
    /// Id of the conversation the message belongs to.
    pub conversation_id: i64,
    /// Message row id.
    pub id: i64,
    /// Position of the message within its conversation.
    pub idx: i64,
    /// Message role as stored (string form).
    pub role: String,
    /// Author label, if recorded.
    pub author: Option<String>,
    /// Creation timestamp, if recorded (units not visible here — TODO confirm).
    pub created_at: Option<i64>,
    /// Message body text.
    pub content: String,
}
3672
/// Even lighter message projection used only by the grouped lexical rebuild
/// stream hot path. It keeps just the per-message fields the rebuild consumes
/// and tracks the final message id at conversation scope instead.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
    /// Position of the message within its conversation.
    pub idx: i64,
    /// Whether the message's role is the tool role (the role is reduced to
    /// this flag instead of being stored as a string).
    pub is_tool_role: bool,
    /// Creation timestamp, if recorded (units not visible here — TODO confirm).
    pub created_at: Option<i64>,
    /// Message body text.
    pub content: String,
}
3683
/// Collection of grouped rebuild message rows. Up to 32 rows are stored inline
/// in the `SmallVec` before it spills to the heap.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;

/// Compatibility alias retained while call sites finish converging on `FrankenStorage`.
pub type SqliteStorage = FrankenStorage;
3688
/// Primary frankensqlite-backed storage backend.
///
/// Owns one frankensqlite connection plus per-handle state. The `ensured_*`
/// caches sit behind `Arc` so that writer handles created from this storage
/// can share them.
pub struct FrankenStorage {
    /// The underlying frankensqlite connection.
    conn: FrankenConnection,
    /// Filesystem path the connection was opened from; reused to open
    /// additional writer connections.
    db_path: PathBuf,
    /// Starts as `false`. NOTE(review): presumably flipped once an ephemeral
    /// writer preflight succeeds — the setter is not visible here.
    ephemeral_writer_preflight_verified: AtomicBool,
    /// Index-writer checkpoint page setting; holds
    /// `UNSET_INDEX_WRITER_CHECKPOINT_PAGES` until one is recorded.
    index_writer_checkpoint_pages: AtomicI64,
    /// Index-writer busy timeout in milliseconds; holds
    /// `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS` until one is recorded.
    index_writer_busy_timeout_ms: AtomicU64,
    /// Slot for a single reusable ephemeral writer connection.
    cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
    /// Agent keys mapped to their row ids.
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    /// Workspace keys mapped to their row ids.
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    /// Conversation sources already marked as ensured.
    ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
    /// Daily-stats keys already marked as ensured.
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
    /// Tri-state cache (`FTS_MESSAGES_PRESENT_*`) of whether `fts_messages`
    /// exists in the schema.
    fts_messages_present_cache: AtomicI8,
}
3703
/// Keep ordinary storage commits from tripping over frequent auto-checkpoints
/// while still bounding WAL growth. Bulk index paths may override this through
/// their explicit checkpoint policy.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
/// Sentinel stored in `index_writer_checkpoint_pages` before any value is recorded.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
/// Sentinel stored in `index_writer_busy_timeout_ms` before any value is recorded.
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
/// `fts_messages_present_cache` state: presence has not been probed yet.
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
/// `fts_messages_present_cache` state: the table was found to be missing.
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
/// `fts_messages_present_cache` state: the table was found to exist.
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
3713
/// State of the single reusable ephemeral writer slot on a `FrankenStorage`.
enum CachedEphemeralWriter {
    /// No writer is parked and none is checked out.
    Uninitialized,
    /// A writer connection is parked and ready for reuse.
    Cached(Box<SendFrankenConnection>),
    /// The slot has been claimed by a caller; further acquirers open their own
    /// uncached writer.
    InUse,
}
3719
3720#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3721struct EnsuredAgentKey {
3722    slug: String,
3723    name: String,
3724    version: Option<String>,
3725    kind: String,
3726}
3727
3728impl EnsuredAgentKey {
3729    fn from_agent(agent: &Agent) -> Self {
3730        Self {
3731            slug: agent.slug.clone(),
3732            name: agent.name.clone(),
3733            version: agent.version.clone(),
3734            kind: agent_kind_str(agent.kind.clone()),
3735        }
3736    }
3737}
3738
/// Cache key for the ensured-workspaces map: workspace path plus optional
/// display name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}

impl EnsuredWorkspaceKey {
    /// Takes ownership of `path` and copies `display_name` when one is given.
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(|label| label.to_owned());
        Self { path, display_name }
    }
}
3753
3754#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3755struct EnsuredConversationSourceKey {
3756    id: String,
3757    kind: SourceKind,
3758    host_label: Option<String>,
3759}
3760
3761impl EnsuredConversationSourceKey {
3762    fn from_source(source: &Source) -> Self {
3763        Self {
3764            id: source.id.clone(),
3765            kind: source.kind,
3766            host_label: source.host_label.clone(),
3767        }
3768    }
3769}
3770
/// Cache key for the ensured-daily-stats set: day, agent slug and source id.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}

impl EnsuredDailyStatsKey {
    /// Builds a key, copying both string arguments into owned `String`s.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);
        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
3787
/// PRAGMA spellings that turn `autocommit_retain` off.
/// `disable_autocommit_retain` tries them in this order and stops at the first
/// one that succeeds.
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
3792
3793fn disable_autocommit_retain<E>(
3794    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
3795) -> Result<&'static str>
3796where
3797    E: std::fmt::Display,
3798{
3799    let mut failures = Vec::new();
3800    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
3801        match execute(pragma) {
3802            Ok(()) => return Ok(pragma),
3803            Err(err) => {
3804                let error = err.to_string();
3805                tracing::debug!(
3806                    %pragma,
3807                    error = %error,
3808                    "autocommit_retain PRAGMA variant not supported"
3809                );
3810                failures.push(format!("{pragma}: {error}"));
3811            }
3812        }
3813    }
3814
3815    Err(anyhow!(
3816        "failed to disable autocommit_retain on frankensqlite connection; \
3817         refusing to keep a long-lived MVCC connection that may accumulate \
3818         unbounded write snapshots. Upgrade frankensqlite to a version that \
3819         supports one of these PRAGMAs or use a short-lived connection path. \
3820         attempts: {}",
3821        failures.join("; ")
3822    ))
3823}
3824
3825impl FrankenStorage {
3826    fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
3827        Self::new_with_shared_caches(
3828            conn,
3829            db_path,
3830            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3831            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3832            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3833            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3834        )
3835    }
3836
3837    fn new_with_shared_caches(
3838        conn: FrankenConnection,
3839        db_path: PathBuf,
3840        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3841        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3842        ensured_conversation_sources: Arc<
3843            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3844        >,
3845        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3846    ) -> Self {
3847        Self {
3848            conn,
3849            db_path,
3850            ephemeral_writer_preflight_verified: AtomicBool::new(false),
3851            index_writer_checkpoint_pages: AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES),
3852            index_writer_busy_timeout_ms: AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS),
3853            cached_ephemeral_writer: parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized),
3854            ensured_agents,
3855            ensured_workspaces,
3856            ensured_conversation_sources,
3857            ensured_daily_stats_keys,
3858            fts_messages_present_cache: AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN),
3859        }
3860    }
3861
3862    fn apply_open_stage_busy_timeout(&self) {
3863        if let Err(err) = self.conn.execute("PRAGMA busy_timeout = 5000;") {
3864            tracing::debug!(
3865                error = %err,
3866                "failed to apply open-stage busy_timeout before migrations"
3867            );
3868        }
3869    }
3870
3871    /// Open a frankensqlite connection, run migrations, and apply config.
3872    ///
3873    /// This initializes canonical schema state only. Derived fallback search
3874    /// structures like the in-database `fts_messages` table are repaired
3875    /// separately so ordinary opens never block on heavyweight maintenance.
3876    pub fn open(path: &Path) -> Result<Self> {
3877        if let Some(parent) = path.parent() {
3878            fs::create_dir_all(parent)
3879                .with_context(|| format!("creating db directory {}", parent.display()))?;
3880        }
3881
3882        let path_str = path.to_string_lossy().to_string();
3883        let _doctor_guard =
3884            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3885        let conn = FrankenConnection::open(&path_str)
3886            .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
3887        let storage = Self::new(conn, path.to_path_buf());
3888        storage.apply_open_stage_busy_timeout();
3889        storage.run_migrations()?;
3890        storage.repair_missing_current_schema_objects()?;
3891        storage.apply_config()?;
3892        Ok(storage)
3893    }
3894
3895    /// Open a writer connection that skips migration (assumes DB already migrated).
3896    ///
3897    /// Used by the BEGIN CONCURRENT parallel writer pool: each writer needs its
3898    /// own connection with config applied, but migrations have already been run
3899    /// by the primary connection.
3900    pub fn open_writer(path: &Path) -> Result<Self> {
3901        Self::open_writer_with_shared_caches(
3902            path,
3903            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3904            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3905            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3906            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3907        )
3908    }
3909
3910    fn open_writer_with_shared_caches(
3911        path: &Path,
3912        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3913        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3914        ensured_conversation_sources: Arc<
3915            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3916        >,
3917        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3918    ) -> Result<Self> {
3919        let path_str = path.to_string_lossy().to_string();
3920        let _doctor_guard =
3921            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3922        let conn = FrankenConnection::open(&path_str)
3923            .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
3924        let storage = Self::new_with_shared_caches(
3925            conn,
3926            path.to_path_buf(),
3927            ensured_agents,
3928            ensured_workspaces,
3929            ensured_conversation_sources,
3930            ensured_daily_stats_keys,
3931        );
3932        storage.apply_config()?;
3933        Ok(storage)
3934    }
3935
3936    pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
3937        let mut cached = self.cached_ephemeral_writer.lock();
3938        match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
3939            CachedEphemeralWriter::Cached(conn) => {
3940                let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
3941                let writer = Self::new_with_shared_caches(
3942                    conn,
3943                    self.db_path.clone(),
3944                    Arc::clone(&self.ensured_agents),
3945                    Arc::clone(&self.ensured_workspaces),
3946                    Arc::clone(&self.ensured_conversation_sources),
3947                    Arc::clone(&self.ensured_daily_stats_keys),
3948                );
3949                writer
3950                    .index_writer_checkpoint_pages
3951                    .store(checkpoint_pages, Ordering::Relaxed);
3952                writer
3953                    .index_writer_busy_timeout_ms
3954                    .store(busy_timeout_ms, Ordering::Relaxed);
3955                Ok((writer, true))
3956            }
3957            CachedEphemeralWriter::Uninitialized => {
3958                drop(cached);
3959                match Self::open_writer_with_shared_caches(
3960                    &self.db_path,
3961                    Arc::clone(&self.ensured_agents),
3962                    Arc::clone(&self.ensured_workspaces),
3963                    Arc::clone(&self.ensured_conversation_sources),
3964                    Arc::clone(&self.ensured_daily_stats_keys),
3965                ) {
3966                    Ok(writer) => Ok((writer, true)),
3967                    Err(err) => {
3968                        let mut cached = self.cached_ephemeral_writer.lock();
3969                        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3970                            *cached = CachedEphemeralWriter::Uninitialized;
3971                        }
3972                        Err(err)
3973                    }
3974                }
3975            }
3976            CachedEphemeralWriter::InUse => {
3977                *cached = CachedEphemeralWriter::InUse;
3978                drop(cached);
3979                Ok((
3980                    Self::open_writer_with_shared_caches(
3981                        &self.db_path,
3982                        Arc::clone(&self.ensured_agents),
3983                        Arc::clone(&self.ensured_workspaces),
3984                        Arc::clone(&self.ensured_conversation_sources),
3985                        Arc::clone(&self.ensured_daily_stats_keys),
3986                    )?,
3987                    false,
3988                ))
3989            }
3990        }
3991    }
3992
3993    pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
3994        let checkpoint_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
3995        let busy_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
3996        let conn = writer.into_raw();
3997        let mut cached = self.cached_ephemeral_writer.lock();
3998        debug_assert!(
3999            matches!(&*cached, CachedEphemeralWriter::InUse),
4000            "cached ephemeral writer state should be in-use when releasing"
4001        );
4002        *cached = CachedEphemeralWriter::Cached(Box::new(
4003            SendFrankenConnection::new_with_index_writer_state(
4004                conn,
4005                checkpoint_pages,
4006                busy_timeout_ms,
4007            ),
4008        ));
4009    }
4010
4011    pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
4012        writer.close_best_effort_in_place();
4013        let mut cached = self.cached_ephemeral_writer.lock();
4014        if matches!(&*cached, CachedEphemeralWriter::InUse) {
4015            *cached = CachedEphemeralWriter::Uninitialized;
4016        }
4017    }
4018
4019    fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
4020        self.ensured_agents.lock().get(key).copied()
4021    }
4022
4023    fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
4024        self.ensured_agents.lock().insert(key, id);
4025    }
4026
4027    fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
4028        self.ensured_workspaces.lock().get(key).copied()
4029    }
4030
4031    fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
4032        self.ensured_workspaces.lock().insert(key, id);
4033    }
4034
4035    fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
4036        self.ensured_conversation_sources.lock().contains(key)
4037    }
4038
4039    fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
4040        self.ensured_conversation_sources.lock().insert(key);
4041    }
4042
4043    fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
4044        self.ensured_daily_stats_keys.lock().contains(key)
4045    }
4046
4047    fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
4048        let ensured = self.ensured_daily_stats_keys.lock();
4049        keys.iter().all(|key| ensured.contains(key))
4050    }
4051
4052    fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
4053        self.ensured_daily_stats_keys.lock().insert(key);
4054    }
4055
4056    fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
4057        match self.fts_messages_present_cache.load(Ordering::Acquire) {
4058            FTS_MESSAGES_PRESENT_PRESENT => return true,
4059            FTS_MESSAGES_PRESENT_ABSENT => return false,
4060            _ => {}
4061        }
4062
4063        let present = tx
4064            .query_row_map(
4065                "SELECT COUNT(*) FROM sqlite_master
4066                 WHERE name = 'fts_messages'
4067                   AND rootpage > 0",
4068                fparams![],
4069                |row| row.get_typed::<i64>(0),
4070            )
4071            .map(|count| count > 0)
4072            .unwrap_or_else(|err| {
4073                tracing::debug!(
4074                    error = %err,
4075                    "failed to probe fts_messages presence; skipping db-resident FTS maintenance"
4076                );
4077                false
4078            });
4079        self.set_fts_messages_present_cache(present);
4080        present
4081    }
4082
4083    fn set_fts_messages_present_cache(&self, present: bool) {
4084        self.fts_messages_present_cache.store(
4085            if present {
4086                FTS_MESSAGES_PRESENT_PRESENT
4087            } else {
4088                FTS_MESSAGES_PRESENT_ABSENT
4089            },
4090            Ordering::Release,
4091        );
4092    }
4093
4094    fn invalidate_fts_messages_present_cache(&self) {
4095        self.fts_messages_present_cache
4096            .store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
4097    }
4098
4099    fn invalidate_conversation_source_cache(&self, source_id: &str) {
4100        self.ensured_conversation_sources
4101            .lock()
4102            .retain(|key| key.id != source_id);
4103    }
4104
4105    fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
4106        let cached = self.cached_ephemeral_writer.get_mut();
4107        if let CachedEphemeralWriter::Cached(conn) =
4108            std::mem::replace(cached, CachedEphemeralWriter::Uninitialized)
4109        {
4110            let mut conn = conn;
4111            conn.0.close_best_effort_in_place();
4112        }
4113    }
4114
4115    fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
4116        let cached = self.cached_ephemeral_writer.get_mut();
4117        match std::mem::replace(cached, CachedEphemeralWriter::Uninitialized) {
4118            CachedEphemeralWriter::Cached(mut conn) => conn
4119                .0
4120                .close_without_checkpoint_in_place()
4121                .with_context(|| "closing cached frankensqlite writer without final checkpoint"),
4122            CachedEphemeralWriter::Uninitialized | CachedEphemeralWriter::InUse => Ok(()),
4123        }
4124    }
4125
4126    /// Open in read-only mode using frankensqlite compat flags.
4127    pub fn open_readonly(path: &Path) -> Result<Self> {
4128        Self::open_readonly_with_doctor_lock_timeout(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)
4129    }
4130
4131    /// Open in read-only mode with an explicit doctor mutation-lock timeout.
4132    ///
4133    /// This is primarily useful for probes that need to prove a reader would
4134    /// not enter the archive while `cass doctor --fix` owns the repair lock.
4135    pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
4136        let path_str = path.to_string_lossy().to_string();
4137        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
4138        let conn = open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
4139            .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
4140        let storage = Self::new(conn, path.to_path_buf());
4141        storage.apply_readonly_config()?;
4142        Ok(storage)
4143    }
4144
4145    pub fn close(self) -> Result<()> {
4146        let mut this = self;
4147        this.close_cached_ephemeral_writer_best_effort_in_place();
4148        this.conn
4149            .close()
4150            .with_context(|| "closing frankensqlite connection")
4151    }
4152
4153    pub fn close_without_checkpoint(self) -> Result<()> {
4154        let mut this = self;
4155        this.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
4156        this.conn
4157            .close_without_checkpoint()
4158            .with_context(|| "closing frankensqlite connection without final checkpoint")
4159    }
4160
4161    pub fn close_best_effort_in_place(&mut self) {
4162        self.close_cached_ephemeral_writer_best_effort_in_place();
4163        self.conn.close_best_effort_in_place();
4164    }
4165
4166    pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
4167        self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
4168        self.conn
4169            .close_without_checkpoint_in_place()
4170            .with_context(|| "closing frankensqlite connection without final checkpoint")
4171    }
4172
4173    /// Access the raw frankensqlite connection.
4174    pub fn raw(&self) -> &FrankenConnection {
4175        &self.conn
4176    }
4177
4178    /// Consume the storage wrapper and return the underlying frankensqlite
4179    /// connection after migrations/repair have already been applied.
4180    pub fn into_raw(self) -> FrankenConnection {
4181        let mut this = self;
4182        this.close_cached_ephemeral_writer_best_effort_in_place();
4183        this.conn
4184    }
4185
4186    /// Apply connection PRAGMAs for parity with SqliteStorage's `apply_pragmas()`.
4187    ///
4188    /// Frankensqlite supports all PRAGMAs cass uses (journal_mode, synchronous,
4189    /// cache_size, foreign_keys, busy_timeout). Its default journal_mode is already
4190    /// WAL and default synchronous is NORMAL, matching cass's requirements.
4191    ///
4192    pub fn apply_config(&self) -> Result<()> {
4193        // journal_mode: frankensqlite defaults to WAL, same as cass.
4194        // synchronous: frankensqlite defaults to NORMAL, same as cass.
4195        // Both are set explicitly for clarity.
4196        self.conn
4197            .execute("PRAGMA journal_mode = WAL;")
4198            .with_context(|| "setting journal_mode")?;
4199        self.conn
4200            .execute("PRAGMA synchronous = NORMAL;")
4201            .with_context(|| "setting synchronous")?;
4202
4203        // cache_size: 64MB (negative value = KiB).
4204        self.conn
4205            .execute("PRAGMA cache_size = -65536;")
4206            .with_context(|| "setting cache_size")?;
4207
4208        // foreign_keys: enable constraint enforcement.
4209        self.conn
4210            .execute("PRAGMA foreign_keys = ON;")
4211            .with_context(|| "setting foreign_keys")?;
4212
4213        // busy_timeout: 5 seconds (in milliseconds).
4214        self.conn
4215            .execute("PRAGMA busy_timeout = 5000;")
4216            .with_context(|| "setting busy_timeout")?;
4217
4218        // temp_store = MEMORY and mmap_size are C SQLite performance knobs.
4219        // In frankensqlite's architecture (in-memory MVCC engine with pager
4220        // backend), temp_store is always memory-resident and mmap_size does not
4221        // apply. Skipped intentionally — these are no-ops or errors.
4222
4223        // wal_autocheckpoint: use a bounded cadence that avoids checkpointing
4224        // inside common append batches without deferring checkpoints forever.
4225        let checkpoint_pragma =
4226            format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
4227        let _ = self.conn.execute(&checkpoint_pragma);
4228        self.index_writer_checkpoint_pages
4229            .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);
4230        // Explicitly enable concurrent writer mode for BEGIN/transaction paths.
4231        // Try both namespace variants for compatibility across fsqlite builds.
4232        let _ = self.conn.execute("PRAGMA fsqlite.concurrent_mode = ON;");
4233        let _ = self.conn.execute("PRAGMA concurrent_mode = ON;");
4234        // Frankensqlite retained autocommit currently mis-serves same-connection
4235        // read-after-write queries on cass's storage paths; keep it off here
4236        // until the upstream visibility bug is fixed.
4237        //
4238        // CASS #163 item 3: If neither PRAGMA variant succeeds, the MVCC engine
4239        // will accumulate write snapshots for the lifetime of the connection,
4240        // causing unbounded memory growth on long-lived watch-mode handles.
4241        // Log at warn level so the failure is visible instead of silently
4242        // swallowed, and set a flag for callers that need to periodically
4243        // recycle the connection.
4244        let autocommit_pragma =
4245            disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ()))?;
4246        tracing::debug!(
4247            pragma = autocommit_pragma,
4248            "disabled frankensqlite autocommit_retain for storage connection"
4249        );
4250
4251        Ok(())
4252    }
4253
4254    fn apply_readonly_config(&self) -> Result<()> {
4255        self.conn
4256            .execute("PRAGMA query_only = 1;")
4257            .with_context(|| "setting query_only")?;
4258        self.conn
4259            .execute("PRAGMA busy_timeout = 5000;")
4260            .with_context(|| "setting busy_timeout")?;
4261        self.conn
4262            .execute("PRAGMA cache_size = -65536;")
4263            .with_context(|| "setting cache_size")?;
4264        self.conn
4265            .execute("PRAGMA foreign_keys = ON;")
4266            .with_context(|| "setting foreign_keys")?;
4267        Ok(())
4268    }
4269
4270    /// Run all schema migrations, handling transition from meta table versioning.
4271    ///
4272    /// The existing `SqliteStorage` tracks schema version in a `meta` table entry.
4273    /// The new `MigrationRunner` uses a `_schema_migrations` table. This method:
4274    /// 1. Transitions existing databases from meta table → `_schema_migrations`
4275    /// 2. Runs pending migrations via `MigrationRunner`
4276    /// 3. Syncs `meta.schema_version` for backward compatibility
4277    ///
4278    /// # Fresh vs existing databases
4279    ///
4280    /// Fresh databases use a single combined migration (`MIGRATION_FRESH_SCHEMA`)
4281    /// that creates the complete V13 schema directly. This avoids the incremental
4282    /// V5 migration which uses `DROP TABLE` — an operation that triggers a known
4283    /// frankensqlite autoindex limitation.
4284    ///
4285    /// Existing databases (transitioned from SqliteStorage) are typically at
4286    /// V13 or newer already; additive post-V13 migrations are applied normally.
4287    pub fn run_migrations(&self) -> Result<()> {
4288        transition_from_meta_version(&self.conn)?;
4289
4290        let base_result = build_cass_migrations_before_tail_cache()
4291            .run(&self.conn)
4292            .with_context(|| "running base schema migrations")?;
4293
4294        let mut applied = base_result.applied;
4295        if apply_conversation_tail_state_cache_migration(&self.conn)
4296            .with_context(|| "running conversation tail-state cache migration")?
4297        {
4298            applied.push(15);
4299        }
4300
4301        let post_result = build_cass_migrations_after_tail_cache()
4302            .run(&self.conn)
4303            .with_context(|| "running post-tail-cache schema migrations")?;
4304        applied.extend(post_result.applied);
4305
4306        let current = self.schema_version()?;
4307        if !applied.is_empty() {
4308            info!(
4309                applied = ?applied,
4310                current,
4311                was_fresh = base_result.was_fresh,
4312                "frankensqlite schema migrations applied"
4313            );
4314        }
4315
4316        // Keep meta.schema_version in sync for backward compatibility.
4317        self.sync_meta_schema_version(current)?;
4318
4319        Ok(())
4320    }
4321
4322    /// Some historical canonical rebuild paths produced databases whose
4323    /// version markers claim the current schema while post-V10 analytics
4324    /// tables were never materialized. Detect that drift and backfill the
4325    /// idempotent table/index set from the combined schema migration.
4326    fn repair_missing_current_schema_objects(&self) -> Result<()> {
4327        let mut missing_tables = Vec::new();
4328        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4329            if let Err(err) = self.conn.query(probe_sql) {
4330                if error_indicates_missing_table(&err) {
4331                    missing_tables.push(table_name);
4332                    continue;
4333                }
4334                return Err(err).with_context(|| {
4335                    format!("probing required schema table {table_name} for completeness")
4336                });
4337            }
4338        }
4339
4340        if !missing_tables.is_empty() {
4341            info!(
4342                missing_tables = ?missing_tables,
4343                "repairing missing current-schema tables on an already-versioned cass database"
4344            );
4345
4346            for batch in current_schema_repair_batches_for_missing_tables(&missing_tables)? {
4347                self.conn
4348                    .execute_batch(batch.sql)
4349                    .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
4350            }
4351
4352            for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4353                if !missing_tables.contains(&table_name) {
4354                    continue;
4355                }
4356                self.conn
4357                    .query(probe_sql)
4358                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
4359            }
4360        }
4361        self.repair_missing_conversation_token_columns()?;
4362        Ok(())
4363    }
4364
4365    fn repair_missing_conversation_token_columns(&self) -> Result<()> {
4366        let columns = franken_table_column_names(&self.conn, "conversations")
4367            .with_context(|| "inspecting conversations columns for token-summary repair")?;
4368        let mut missing_columns = Vec::new();
4369        for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
4370            if columns.contains(column_name) {
4371                continue;
4372            }
4373            let sql = format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
4374            self.conn.execute(&sql).with_context(|| {
4375                format!("adding missing conversations.{column_name} token-summary column")
4376            })?;
4377            missing_columns.push(column_name);
4378        }
4379        if !missing_columns.is_empty() {
4380            tracing::warn!(
4381                target: "cass::schema_repair",
4382                db_path = %self.db_path.display(),
4383                missing_columns = ?missing_columns,
4384                "cass#222: repaired missing conversations token-summary columns"
4385            );
4386        }
4387        Ok(())
4388    }
4389
4390    /// Detect and remove orphan rows whose FK parent has gone missing.
4391    ///
4392    /// A `Connection` dropped mid-transaction (the `drop_close` warning emitted
4393    /// by frankensqlite's `Drop` impl) can leave child rows persisted without a
4394    /// matching parent — `messages` referencing a `conversation_id` that does
4395    /// not exist, `message_metrics`/`token_usage`/`snippets` referencing a
4396    /// `message_id` that does not exist, etc. With `PRAGMA foreign_keys = ON`,
4397    /// every subsequent indexer pass then trips `FOREIGN KEY constraint failed`
4398    /// on the next write, the session never gets marked indexed, and the
4399    /// pending backlog grows without bound (issue #202).
4400    ///
4401    /// This pass runs at indexer startup as defense in depth: it scans each
4402    /// child table for rows whose parent row has gone missing and removes them
4403    /// in bounded committed chunks, breaking the failure cycle even when the
4404    /// underlying transaction-discipline bug has not been fully root-caused.
4405    /// The pass is idempotent (a clean database is a no-op), and emits a
4406    /// `WARN` after successful cleanup so the upstream `drop_close` condition
4407    /// stays visible.
4408    pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
4409        let mut report = OrphanFkCleanupReport::default();
4410        let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
4411            Ok(ids) => ids,
4412            Err(err) if error_indicates_missing_table(&err) => {
4413                tracing::debug!(
4414                    target: "cass::fk_repair",
4415                    child_table = "messages",
4416                    error = %err,
4417                    "skipping orphan-message probe (table or column unavailable)"
4418                );
4419                Vec::new()
4420            }
4421            Err(err) => return Err(err),
4422        };
4423        if !orphan_message_ids.is_empty() {
4424            report.record("messages", orphan_message_ids.len() as i64);
4425        }
4426
4427        if !orphan_message_ids.is_empty() {
4428            delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
4429                .context("deleting orphan message rows and dependent children")?;
4430        }
4431
4432        for entry in ORPHAN_DIRECT_CHILD_TABLES {
4433            loop {
4434                let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
4435                    Ok(ids) => ids,
4436                    Err(err)
4437                        if error_indicates_missing_table(&err)
4438                            || error_indicates_missing_column(&err) =>
4439                    {
4440                        // Tolerant probe: a missing child/parent table or FK
4441                        // column on older schemas means there is nothing to
4442                        // clean up for this table.
4443                        tracing::debug!(
4444                            target: "cass::fk_repair",
4445                            child_table = entry.child_table,
4446                            error = %err,
4447                            "skipping orphan probe (table or column unavailable)"
4448                        );
4449                        break;
4450                    }
4451                    Err(err) => {
4452                        return Err(err).with_context(|| {
4453                            format!("probing orphan rows in {}", entry.child_table)
4454                        });
4455                    }
4456                };
4457                if ids.is_empty() {
4458                    break;
4459                }
4460
4461                let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
4462                    .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
4463                if deleted == 0 {
4464                    break;
4465                }
4466                report.record(
4467                    entry.child_table,
4468                    i64::try_from(deleted).unwrap_or(i64::MAX),
4469                );
4470            }
4471        }
4472
4473        if report.total == 0 {
4474            return Ok(report);
4475        }
4476
4477        // WARN only fires after a successful commit so the message accurately
4478        // reflects what actually happened on disk. db_path is included so logs
4479        // from concurrent indexers against different databases stay
4480        // disambiguated.
4481        tracing::warn!(
4482            target: "cass::fk_repair",
4483            db_path = %self.db_path.display(),
4484            total_orphans = report.total,
4485            per_table = ?report.per_table,
4486            "cass#202: removed orphan rows left behind by interrupted index transactions"
4487        );
4488
4489        Ok(report)
4490    }
4491
4492    /// Return the current schema version from `_schema_migrations`.
4493    pub fn schema_version(&self) -> Result<i64> {
4494        let rows = self
4495            .conn
4496            .query("SELECT MAX(version) FROM _schema_migrations;")
4497            .with_context(|| "reading schema version from _schema_migrations")?;
4498
4499        if let Some(row) = rows.first()
4500            && let Ok(v) = row.get_typed::<Option<i64>>(0)
4501        {
4502            return Ok(v.unwrap_or(0));
4503        }
4504        Ok(0)
4505    }
4506
4507    /// Keep `meta.schema_version` in sync for backward compatibility with `SqliteStorage`.
4508    fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
4509        // The meta table is created by V1 migration. If it doesn't exist yet,
4510        // there's nothing to sync.
4511        if self.conn.query("SELECT key FROM meta LIMIT 1;").is_err() {
4512            return Ok(());
4513        }
4514
4515        // Only write if the version needs updating to avoid write lock contention
4516        if let Ok(rows) = self
4517            .conn
4518            .query("SELECT value FROM meta WHERE key = 'schema_version';")
4519            && let Some(row) = rows.first()
4520            && let Ok(val) = row.get_typed::<String>(0)
4521            && val == version.to_string()
4522        {
4523            return Ok(()); // Already up to date
4524        }
4525
4526        self.conn
4527            .execute_compat(
4528                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
4529                &[ParamValue::from(version.to_string())],
4530            )
4531            .with_context(|| "syncing meta schema_version")?;
4532
4533        Ok(())
4534    }
4535
4536    /// Resolve the database file path for this connection.
4537    pub fn database_path(&self) -> Result<PathBuf> {
4538        Ok(self.db_path.clone())
4539    }
4540
4541    pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
4542        self.ephemeral_writer_preflight_verified
4543            .load(Ordering::Relaxed)
4544    }
4545
4546    pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
4547        self.ephemeral_writer_preflight_verified
4548            .store(true, Ordering::Relaxed);
4549    }
4550
4551    pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
4552        let pages = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4553        (pages != UNSET_INDEX_WRITER_CHECKPOINT_PAGES).then_some(pages)
4554    }
4555
4556    pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
4557        self.index_writer_checkpoint_pages
4558            .store(pages, Ordering::Relaxed);
4559    }
4560
4561    pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
4562        let timeout_ms = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4563        (timeout_ms != UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS).then_some(timeout_ms)
4564    }
4565
4566    pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
4567        self.index_writer_busy_timeout_ms
4568            .store(timeout_ms, Ordering::Relaxed);
4569    }
4570
4571    /// Open database with migration, backing up if schema is incompatible.
4572    pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
4573        if let Some(parent) = path.parent() {
4574            fs::create_dir_all(parent)?;
4575        }
4576
4577        if path.exists() {
4578            let check_result = check_schema_compatibility(path);
4579            match check_result {
4580                Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => {
4581                    // Continue with normal open
4582                }
4583                Ok(SchemaCheck::NeedsRebuild(reason)) => {
4584                    let backup_path = create_backup(path)?;
4585                    cleanup_old_backups(path, MAX_BACKUPS)?;
4586                    remove_database_files(path)?;
4587                    return Err(MigrationError::RebuildRequired {
4588                        reason,
4589                        backup_path,
4590                    });
4591                }
4592                Err(err) if schema_check_error_requires_rebuild(&err) => {
4593                    let backup_path = create_backup(path)?;
4594                    cleanup_old_backups(path, MAX_BACKUPS)?;
4595                    remove_database_files(path)?;
4596                    return Err(MigrationError::RebuildRequired {
4597                        reason: format!("Database appears corrupted: {err}"),
4598                        backup_path,
4599                    });
4600                }
4601                Err(err) => return Err(MigrationError::Database(err)),
4602            }
4603        }
4604
4605        let storage = Self::open(path).map_err(|e| MigrationError::Other(e.to_string()))?;
4606        Ok(storage)
4607    }
4608}
4609
4610// -------------------------------------------------------------------------
4611// Frankensqlite migration helpers
4612// -------------------------------------------------------------------------
4613
4614/// Build the `MigrationRunner` for the frankensqlite migration path.
4615///
4616/// Uses a single combined migration (version 13) that creates the complete
4617/// final schema in one step. This avoids the V5 `DROP TABLE conversations`
4618/// operation which triggers a known frankensqlite limitation: autoindex entries
4619/// in sqlite_master are not properly cleaned up during DROP TABLE, causing
4620/// "sqlite_master entry not found" errors.
4621///
4622/// For existing databases transitioned from SqliteStorage, the transition
4623/// function backfills `_schema_migrations`; post-V13 additive migrations then
4624/// run normally.
4625fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
4626    MigrationRunner::new()
4627        .add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA)
4628        .add(14, "fts_contentless", MIGRATION_V14)
4629}
4630
4631fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
4632    MigrationRunner::new()
4633        .add(16, "drop_redundant_message_conv_idx", MIGRATION_V16)
4634        .add(17, "drop_message_created_idx", MIGRATION_V17)
4635        .add(18, "conversation_tail_state_hot_table", MIGRATION_V18)
4636        .add(19, "conversation_external_lookup", MIGRATION_V19)
4637        .add(20, "conversation_external_tail_lookup", MIGRATION_V20)
4638}
4639
4640fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
4641    let rows = conn
4642        .query_with_params(
4643            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
4644            &[SqliteValue::from(version)],
4645        )
4646        .with_context(|| format!("checking schema migration version {version}"))?;
4647    Ok(!rows.is_empty())
4648}
4649
4650fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
4651    conn.execute("BEGIN IMMEDIATE;")
4652        .with_context(|| "starting v15 conversation tail-state migration transaction")?;
4653
4654    let result = (|| -> Result<bool> {
4655        if schema_migration_is_applied(conn, 15)? {
4656            conn.execute("COMMIT;")
4657                .with_context(|| "committing already-applied v15 migration transaction")?;
4658            return Ok(false);
4659        }
4660
4661        let started = Instant::now();
4662        let conversation_columns = franken_table_column_names(conn, "conversations")
4663            .with_context(|| "inspecting conversations columns before v15 migration")?;
4664        if !conversation_columns.contains("last_message_idx") {
4665            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;")
4666                .with_context(|| "adding v15 conversations.last_message_idx column")?;
4667        }
4668        if !conversation_columns.contains("last_message_created_at") {
4669            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;")
4670                .with_context(|| "adding v15 conversations.last_message_created_at column")?;
4671        }
4672        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
4673            .with_context(|| "applying v15 conversation tail-state table schema")?;
4674        conn.execute_compat(
4675            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
4676            fparams![15_i64, "conversation_tail_state_cache"],
4677        )
4678        .with_context(|| "recording v15 conversation tail-state migration")?;
4679        conn.execute("COMMIT;")
4680            .with_context(|| "committing v15 conversation tail-state migration")?;
4681        info!(
4682            elapsed_ms = started.elapsed().as_millis(),
4683            "applied v15 conversation tail-state cache migration"
4684        );
4685        Ok(true)
4686    })();
4687
4688    if result.is_err() {
4689        let _ = conn.execute("ROLLBACK;");
4690    }
4691
4692    result
4693}
4694
4695fn franken_table_column_names(
4696    conn: &FrankenConnection,
4697    table_name: &str,
4698) -> Result<HashSet<String>> {
4699    if !table_name
4700        .chars()
4701        .all(|c| c.is_ascii_alphanumeric() || c == '_')
4702    {
4703        return Err(anyhow!(
4704            "unsafe table name for PRAGMA table_info: {table_name}"
4705        ));
4706    }
4707
4708    conn.query_map_collect(
4709        &format!("PRAGMA table_info({table_name})"),
4710        fparams![],
4711        |row: &FrankenRow| row.get_typed::<String>(1),
4712    )
4713    .with_context(|| format!("reading PRAGMA table_info({table_name})"))
4714    .map(|columns| columns.into_iter().collect())
4715}
4716
4717/// Combined V13 schema for fresh databases.
4718///
4719/// Creates the complete final schema in a single migration, avoiding the
4720/// incremental V5 `DROP TABLE conversations` which triggers a frankensqlite
4721/// autoindex limitation. All columns from V1-V13 are included in their
4722/// respective CREATE TABLE statements.
4723///
4724/// Table creation order respects foreign key references:
4725/// sources → agents/workspaces → conversations → messages → snippets, etc.
4726const MIGRATION_FRESH_SCHEMA: &str = r"
4727-- Core tables (V1)
4728CREATE TABLE IF NOT EXISTS meta (
4729    key TEXT PRIMARY KEY,
4730    value TEXT NOT NULL
4731);
4732
4733CREATE TABLE IF NOT EXISTS agents (
4734    id INTEGER PRIMARY KEY,
4735    slug TEXT NOT NULL UNIQUE,
4736    name TEXT NOT NULL,
4737    version TEXT,
4738    kind TEXT NOT NULL,
4739    created_at INTEGER NOT NULL,
4740    updated_at INTEGER NOT NULL
4741);
4742
4743CREATE TABLE IF NOT EXISTS workspaces (
4744    id INTEGER PRIMARY KEY,
4745    path TEXT NOT NULL UNIQUE,
4746    display_name TEXT
4747);
4748
4749-- Sources (V4)
4750CREATE TABLE IF NOT EXISTS sources (
4751    id TEXT PRIMARY KEY,
4752    kind TEXT NOT NULL,
4753    host_label TEXT,
4754    machine_id TEXT,
4755    platform TEXT,
4756    config_json TEXT,
4757    created_at INTEGER NOT NULL,
4758    updated_at INTEGER NOT NULL
4759);
4760
4761INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
4762VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
4763
4764-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
4765CREATE TABLE IF NOT EXISTS conversations (
4766    id INTEGER PRIMARY KEY,
4767    agent_id INTEGER NOT NULL REFERENCES agents(id),
4768    workspace_id INTEGER REFERENCES workspaces(id),
4769    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
4770    external_id TEXT,
4771    title TEXT,
4772    source_path TEXT NOT NULL,
4773    started_at INTEGER,
4774    ended_at INTEGER,
4775    approx_tokens INTEGER,
4776    metadata_json TEXT,
4777    origin_host TEXT,
4778    metadata_bin BLOB,
4779    total_input_tokens INTEGER,
4780    total_output_tokens INTEGER,
4781    total_cache_read_tokens INTEGER,
4782    total_cache_creation_tokens INTEGER,
4783    grand_total_tokens INTEGER,
4784    estimated_cost_usd REAL,
4785    primary_model TEXT,
4786    api_call_count INTEGER,
4787    tool_call_count INTEGER,
4788    user_message_count INTEGER,
4789    assistant_message_count INTEGER,
4790    -- V15 columns are included in the fresh schema so fresh DB creation does
4791    -- not need ALTER TABLE on conversations. That ALTER path can duplicate
4792    -- provenance autoindex state in frankensqlite when the named unique
4793    -- provenance index already exists.
4794    last_message_idx INTEGER,
4795    last_message_created_at INTEGER
4796);
4797
4798-- Named unique index avoids autoindex issues if table is ever recreated
4799CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
4800    ON conversations(source_id, agent_id, external_id);
4801
4802-- Messages: V1 base + V7 extra_bin
4803CREATE TABLE IF NOT EXISTS messages (
4804    id INTEGER PRIMARY KEY,
4805    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
4806    idx INTEGER NOT NULL,
4807    role TEXT NOT NULL,
4808    author TEXT,
4809    created_at INTEGER,
4810    content TEXT NOT NULL,
4811    extra_json TEXT,
4812    extra_bin BLOB,
4813    UNIQUE(conversation_id, idx)
4814);
4815
4816CREATE TABLE IF NOT EXISTS snippets (
4817    id INTEGER PRIMARY KEY,
4818    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
4819    file_path TEXT,
4820    start_line INTEGER,
4821    end_line INTEGER,
4822    language TEXT,
4823    snippet_text TEXT
4824);
4825
4826CREATE TABLE IF NOT EXISTS tags (
4827    id INTEGER PRIMARY KEY,
4828    name TEXT NOT NULL UNIQUE
4829);
4830
4831CREATE TABLE IF NOT EXISTS conversation_tags (
4832    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
4833    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
4834    PRIMARY KEY (conversation_id, tag_id)
4835);
4836
4837-- Daily stats (V8)
4838CREATE TABLE IF NOT EXISTS daily_stats (
4839    day_id INTEGER NOT NULL,
4840    agent_slug TEXT NOT NULL,
4841    source_id TEXT NOT NULL DEFAULT 'all',
4842    session_count INTEGER NOT NULL DEFAULT 0,
4843    message_count INTEGER NOT NULL DEFAULT 0,
4844    total_chars INTEGER NOT NULL DEFAULT 0,
4845    last_updated INTEGER NOT NULL,
4846    PRIMARY KEY (day_id, agent_slug, source_id)
4847);
4848
4849-- Embedding jobs (V9)
4850CREATE TABLE IF NOT EXISTS embedding_jobs (
4851    id INTEGER PRIMARY KEY AUTOINCREMENT,
4852    db_path TEXT NOT NULL,
4853    model_id TEXT NOT NULL,
4854    status TEXT NOT NULL DEFAULT 'pending',
4855    total_docs INTEGER NOT NULL DEFAULT 0,
4856    completed_docs INTEGER NOT NULL DEFAULT 0,
4857    error_message TEXT,
4858    created_at TEXT NOT NULL DEFAULT (datetime('now')),
4859    started_at TEXT,
4860    completed_at TEXT
4861);
4862
4863CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
4864ON embedding_jobs(db_path, model_id)
4865WHERE status IN ('pending', 'running');
4866
4867-- Token usage ledger (V10)
4868CREATE TABLE IF NOT EXISTS token_usage (
4869    id INTEGER PRIMARY KEY AUTOINCREMENT,
4870    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
4871    conversation_id INTEGER NOT NULL,
4872    agent_id INTEGER NOT NULL,
4873    workspace_id INTEGER,
4874    source_id TEXT NOT NULL DEFAULT 'local',
4875    timestamp_ms INTEGER NOT NULL,
4876    day_id INTEGER NOT NULL,
4877    model_name TEXT,
4878    model_family TEXT,
4879    model_tier TEXT,
4880    service_tier TEXT,
4881    provider TEXT,
4882    input_tokens INTEGER,
4883    output_tokens INTEGER,
4884    cache_read_tokens INTEGER,
4885    cache_creation_tokens INTEGER,
4886    thinking_tokens INTEGER,
4887    total_tokens INTEGER,
4888    estimated_cost_usd REAL,
4889    role TEXT NOT NULL,
4890    content_chars INTEGER NOT NULL,
4891    has_tool_calls INTEGER NOT NULL DEFAULT 0,
4892    tool_call_count INTEGER NOT NULL DEFAULT 0,
4893    data_source TEXT NOT NULL DEFAULT 'api',
4894    UNIQUE(message_id)
4895);
4896
4897-- Token daily stats (V10)
4898CREATE TABLE IF NOT EXISTS token_daily_stats (
4899    day_id INTEGER NOT NULL,
4900    agent_slug TEXT NOT NULL,
4901    source_id TEXT NOT NULL DEFAULT 'all',
4902    model_family TEXT NOT NULL DEFAULT 'all',
4903    api_call_count INTEGER NOT NULL DEFAULT 0,
4904    user_message_count INTEGER NOT NULL DEFAULT 0,
4905    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4906    tool_message_count INTEGER NOT NULL DEFAULT 0,
4907    total_input_tokens INTEGER NOT NULL DEFAULT 0,
4908    total_output_tokens INTEGER NOT NULL DEFAULT 0,
4909    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
4910    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
4911    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
4912    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
4913    total_content_chars INTEGER NOT NULL DEFAULT 0,
4914    total_tool_calls INTEGER NOT NULL DEFAULT 0,
4915    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
4916    session_count INTEGER NOT NULL DEFAULT 0,
4917    last_updated INTEGER NOT NULL,
4918    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
4919);
4920
4921-- Model pricing (V10)
4922CREATE TABLE IF NOT EXISTS model_pricing (
4923    model_pattern TEXT NOT NULL,
4924    provider TEXT NOT NULL,
4925    input_cost_per_mtok REAL NOT NULL,
4926    output_cost_per_mtok REAL NOT NULL,
4927    cache_read_cost_per_mtok REAL,
4928    cache_creation_cost_per_mtok REAL,
4929    effective_date TEXT NOT NULL,
4930    PRIMARY KEY (model_pattern, effective_date)
4931);
4932
4933INSERT OR IGNORE INTO model_pricing VALUES
4934    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
4935    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
4936    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
4937    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
4938    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
4939    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
4940    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
4941    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
4942    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
4943    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
4944
4945-- Message metrics: V11 base + V12 model dimensions
4946CREATE TABLE IF NOT EXISTS message_metrics (
4947    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
4948    created_at_ms INTEGER NOT NULL,
4949    hour_id INTEGER NOT NULL,
4950    day_id INTEGER NOT NULL,
4951    agent_slug TEXT NOT NULL,
4952    workspace_id INTEGER NOT NULL DEFAULT 0,
4953    source_id TEXT NOT NULL DEFAULT 'local',
4954    role TEXT NOT NULL,
4955    content_chars INTEGER NOT NULL,
4956    content_tokens_est INTEGER NOT NULL,
4957    api_input_tokens INTEGER,
4958    api_output_tokens INTEGER,
4959    api_cache_read_tokens INTEGER,
4960    api_cache_creation_tokens INTEGER,
4961    api_thinking_tokens INTEGER,
4962    api_service_tier TEXT,
4963    api_data_source TEXT NOT NULL DEFAULT 'estimated',
4964    tool_call_count INTEGER NOT NULL DEFAULT 0,
4965    has_tool_calls INTEGER NOT NULL DEFAULT 0,
4966    has_plan INTEGER NOT NULL DEFAULT 0,
4967    model_name TEXT,
4968    model_family TEXT NOT NULL DEFAULT 'unknown',
4969    model_tier TEXT NOT NULL DEFAULT 'unknown',
4970    provider TEXT NOT NULL DEFAULT 'unknown'
4971);
4972
4973-- Hourly rollups: V11 base + V13 plan columns
4974CREATE TABLE IF NOT EXISTS usage_hourly (
4975    hour_id INTEGER NOT NULL,
4976    agent_slug TEXT NOT NULL,
4977    workspace_id INTEGER NOT NULL DEFAULT 0,
4978    source_id TEXT NOT NULL DEFAULT 'local',
4979    message_count INTEGER NOT NULL DEFAULT 0,
4980    user_message_count INTEGER NOT NULL DEFAULT 0,
4981    assistant_message_count INTEGER NOT NULL DEFAULT 0,
4982    tool_call_count INTEGER NOT NULL DEFAULT 0,
4983    plan_message_count INTEGER NOT NULL DEFAULT 0,
4984    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
4985    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4986    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
4987    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
4988    api_tokens_total INTEGER NOT NULL DEFAULT 0,
4989    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
4990    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
4991    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
4992    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
4993    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
4994    last_updated INTEGER NOT NULL DEFAULT 0,
4995    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
4996    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
4997    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
4998);
4999
5000-- Daily rollups: V11 base + V13 plan columns
5001CREATE TABLE IF NOT EXISTS usage_daily (
5002    day_id INTEGER NOT NULL,
5003    agent_slug TEXT NOT NULL,
5004    workspace_id INTEGER NOT NULL DEFAULT 0,
5005    source_id TEXT NOT NULL DEFAULT 'local',
5006    message_count INTEGER NOT NULL DEFAULT 0,
5007    user_message_count INTEGER NOT NULL DEFAULT 0,
5008    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5009    tool_call_count INTEGER NOT NULL DEFAULT 0,
5010    plan_message_count INTEGER NOT NULL DEFAULT 0,
5011    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5012    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5013    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5014    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5015    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5016    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5017    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5018    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5019    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5020    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5021    last_updated INTEGER NOT NULL DEFAULT 0,
5022    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5023    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5024    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
5025);
5026
5027-- Model daily rollups (V12)
5028CREATE TABLE IF NOT EXISTS usage_models_daily (
5029    day_id INTEGER NOT NULL,
5030    agent_slug TEXT NOT NULL,
5031    workspace_id INTEGER NOT NULL DEFAULT 0,
5032    source_id TEXT NOT NULL DEFAULT 'local',
5033    model_family TEXT NOT NULL DEFAULT 'unknown',
5034    model_tier TEXT NOT NULL DEFAULT 'unknown',
5035    message_count INTEGER NOT NULL DEFAULT 0,
5036    user_message_count INTEGER NOT NULL DEFAULT 0,
5037    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5038    tool_call_count INTEGER NOT NULL DEFAULT 0,
5039    plan_message_count INTEGER NOT NULL DEFAULT 0,
5040    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5041    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5042    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5043    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5044    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5045    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5046    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5047    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5048    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5049    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5050    last_updated INTEGER NOT NULL DEFAULT 0,
5051    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
5052);
5053
5054-- All indexes
5055CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
5056CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
5057CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
5058CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
5059CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
5060CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
5061CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
5062CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
5063CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
5064CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
5065CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
5066CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
5067CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
5068CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
5069CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
5070CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
5071CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
5072CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
5073CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
5074CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
5075CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
5076CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
5077CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
5078CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
5079CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
5080CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
5081CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
5082CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
5083CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
5084CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
5085";
5086
/// A named group of SQL statements registered for one or more tables of the
/// current schema.
///
/// Batches are selected by `current_schema_repair_batches_for_missing_tables`
/// when at least one of their `tables` is reported missing.
5087#[derive(Clone, Copy)]
5088struct SchemaRepairBatch {
    /// Short label identifying the batch.
5089    name: &'static str,
    /// Tables this batch is registered for; the whole batch is selected as soon
    /// as any one of them is missing.
5090    tables: &'static [&'static str],
    /// SQL text associated with the batch.
5091    sql: &'static str,
5092}
5093
/// Repair SQL for the `sources` table.
///
/// Creates the table if it does not exist and seeds the `'local'` source row.
/// The seed uses `INSERT OR IGNORE`, so an existing `'local'` row is left
/// untouched; its timestamps are `strftime('%s','now') * 1000`, i.e. Unix
/// seconds scaled to milliseconds.
5094const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
5095CREATE TABLE IF NOT EXISTS sources (
5096    id TEXT PRIMARY KEY,
5097    kind TEXT NOT NULL,
5098    host_label TEXT,
5099    machine_id TEXT,
5100    platform TEXT,
5101    config_json TEXT,
5102    created_at INTEGER NOT NULL,
5103    updated_at INTEGER NOT NULL
5104);
5105
5106INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
5107VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
5108";
5109
/// Repair SQL for the `daily_stats` table and its two lookup indexes
/// (`idx_daily_stats_agent`, `idx_daily_stats_source`).
///
/// Every statement is `IF NOT EXISTS`, so running it against a database that
/// already has the table or indexes changes nothing. The table definition
/// duplicates the `daily_stats` definition in the combined schema SQL earlier
/// in this file; keep the two in sync.
5110const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
5111CREATE TABLE IF NOT EXISTS daily_stats (
5112    day_id INTEGER NOT NULL,
5113    agent_slug TEXT NOT NULL,
5114    source_id TEXT NOT NULL DEFAULT 'all',
5115    session_count INTEGER NOT NULL DEFAULT 0,
5116    message_count INTEGER NOT NULL DEFAULT 0,
5117    total_chars INTEGER NOT NULL DEFAULT 0,
5118    last_updated INTEGER NOT NULL,
5119    PRIMARY KEY (day_id, agent_slug, source_id)
5120);
5121
5122CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
5123CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
5124";
5125
/// Repair SQL for `conversation_external_lookup`.
///
/// Creates the table if missing, then (re)populates it from `conversations`
/// for every row with a non-NULL `external_id`. `INSERT OR REPLACE` means an
/// existing row with the same `lookup_key` is overwritten.
///
/// The key is built as
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
///
/// NOTE(review): the length prefixes come from SQL `length()`; confirm that the
/// Rust-side key builder measures `source_id` / `external_id` the same way
/// (characters vs. bytes), otherwise non-ASCII ids would produce different keys.
5126const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
5127CREATE TABLE IF NOT EXISTS conversation_external_lookup (
5128    lookup_key TEXT PRIMARY KEY,
5129    conversation_id INTEGER NOT NULL
5130);
5131
5132INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
5133SELECT
5134    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
5135    CAST(agent_id AS TEXT) || ':' ||
5136    CAST(length(external_id) AS TEXT) || ':' || external_id,
5137    id
5138FROM conversations
5139WHERE external_id IS NOT NULL;
5140";
5141
/// Repair SQL for `conversation_tail_state` and
/// `conversation_external_tail_lookup`.
///
/// Both tables are created if missing. The lookup table is then (re)populated
/// from `conversations` (rows with a non-NULL `external_id`) joined to
/// `conversation_tail_state`. The join is a `LEFT JOIN`, so a conversation
/// without a tail-state row still gets a lookup row, with NULL `ended_at`,
/// `last_message_idx` and `last_message_created_at`. `INSERT OR REPLACE`
/// overwrites any existing row with the same `lookup_key`.
///
/// The key format matches the one used for `conversation_external_lookup`:
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
5142const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
5143CREATE TABLE IF NOT EXISTS conversation_tail_state (
5144    conversation_id INTEGER PRIMARY KEY,
5145    ended_at INTEGER,
5146    last_message_idx INTEGER,
5147    last_message_created_at INTEGER
5148);
5149
5150CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
5151    lookup_key TEXT PRIMARY KEY,
5152    conversation_id INTEGER NOT NULL,
5153    ended_at INTEGER,
5154    last_message_idx INTEGER,
5155    last_message_created_at INTEGER
5156);
5157
5158INSERT OR REPLACE INTO conversation_external_tail_lookup (
5159    lookup_key,
5160    conversation_id,
5161    ended_at,
5162    last_message_idx,
5163    last_message_created_at
5164)
5165SELECT
5166    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
5167    CAST(c.agent_id AS TEXT) || ':' ||
5168    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
5169    c.id,
5170    ts.ended_at,
5171    ts.last_message_idx,
5172    ts.last_message_created_at
5173FROM conversations c
5174LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
5175WHERE c.external_id IS NOT NULL;
5176";
5177
/// Repair SQL for the `embedding_jobs` table.
///
/// Creates the table if missing, plus the partial unique index
/// `idx_embedding_jobs_active` on `(db_path, model_id)` restricted to rows
/// whose `status` is `'pending'` or `'running'`. The index therefore allows at
/// most one pending/running job per `(db_path, model_id)` pair while placing
/// no uniqueness constraint on rows in any other status.
5178const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
5179CREATE TABLE IF NOT EXISTS embedding_jobs (
5180    id INTEGER PRIMARY KEY AUTOINCREMENT,
5181    db_path TEXT NOT NULL,
5182    model_id TEXT NOT NULL,
5183    status TEXT NOT NULL DEFAULT 'pending',
5184    total_docs INTEGER NOT NULL DEFAULT 0,
5185    completed_docs INTEGER NOT NULL DEFAULT 0,
5186    error_message TEXT,
5187    created_at TEXT NOT NULL DEFAULT (datetime('now')),
5188    started_at TEXT,
5189    completed_at TEXT
5190);
5191
5192CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
5193ON embedding_jobs(db_path, model_id)
5194WHERE status IN ('pending', 'running');
5195";
5196
/// Repair SQL for the token-analytics tables: `token_usage`,
/// `token_daily_stats` and `model_pricing`, together with their indexes.
///
/// All `CREATE` statements are `IF NOT EXISTS`. The trailing
/// `INSERT OR IGNORE INTO model_pricing` seeds a fixed set of pricing rows;
/// a row whose `(model_pattern, effective_date)` primary key already exists is
/// left untouched.
///
/// NOTE(review): the `%` characters in `model_pattern` are presumably SQL
/// `LIKE` wildcards — confirm against the pricing lookup code.
5197const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
5198CREATE TABLE IF NOT EXISTS token_usage (
5199    id INTEGER PRIMARY KEY AUTOINCREMENT,
5200    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
5201    conversation_id INTEGER NOT NULL,
5202    agent_id INTEGER NOT NULL,
5203    workspace_id INTEGER,
5204    source_id TEXT NOT NULL DEFAULT 'local',
5205    timestamp_ms INTEGER NOT NULL,
5206    day_id INTEGER NOT NULL,
5207    model_name TEXT,
5208    model_family TEXT,
5209    model_tier TEXT,
5210    service_tier TEXT,
5211    provider TEXT,
5212    input_tokens INTEGER,
5213    output_tokens INTEGER,
5214    cache_read_tokens INTEGER,
5215    cache_creation_tokens INTEGER,
5216    thinking_tokens INTEGER,
5217    total_tokens INTEGER,
5218    estimated_cost_usd REAL,
5219    role TEXT NOT NULL,
5220    content_chars INTEGER NOT NULL,
5221    has_tool_calls INTEGER NOT NULL DEFAULT 0,
5222    tool_call_count INTEGER NOT NULL DEFAULT 0,
5223    data_source TEXT NOT NULL DEFAULT 'api',
5224    UNIQUE(message_id)
5225);
5226
5227CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
5228CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
5229CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
5230CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
5231CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
5232
5233CREATE TABLE IF NOT EXISTS token_daily_stats (
5234    day_id INTEGER NOT NULL,
5235    agent_slug TEXT NOT NULL,
5236    source_id TEXT NOT NULL DEFAULT 'all',
5237    model_family TEXT NOT NULL DEFAULT 'all',
5238    api_call_count INTEGER NOT NULL DEFAULT 0,
5239    user_message_count INTEGER NOT NULL DEFAULT 0,
5240    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5241    tool_message_count INTEGER NOT NULL DEFAULT 0,
5242    total_input_tokens INTEGER NOT NULL DEFAULT 0,
5243    total_output_tokens INTEGER NOT NULL DEFAULT 0,
5244    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
5245    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
5246    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
5247    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
5248    total_content_chars INTEGER NOT NULL DEFAULT 0,
5249    total_tool_calls INTEGER NOT NULL DEFAULT 0,
5250    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
5251    session_count INTEGER NOT NULL DEFAULT 0,
5252    last_updated INTEGER NOT NULL,
5253    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
5254);
5255
5256CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
5257CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
5258
5259CREATE TABLE IF NOT EXISTS model_pricing (
5260    model_pattern TEXT NOT NULL,
5261    provider TEXT NOT NULL,
5262    input_cost_per_mtok REAL NOT NULL,
5263    output_cost_per_mtok REAL NOT NULL,
5264    cache_read_cost_per_mtok REAL,
5265    cache_creation_cost_per_mtok REAL,
5266    effective_date TEXT NOT NULL,
5267    PRIMARY KEY (model_pattern, effective_date)
5268);
5269
5270INSERT OR IGNORE INTO model_pricing VALUES
5271    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
5272    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
5273    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
5274    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
5275    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
5276    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
5277    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
5278    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
5279    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
5280    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
5281";
5282
/// Repair SQL for the per-message metrics table and its rollups:
/// `message_metrics`, `usage_hourly`, `usage_daily` and `usage_models_daily`,
/// followed by every index on those four tables.
///
/// All statements are `IF NOT EXISTS` and the SQL inserts no rows, so it only
/// creates empty tables; it does not backfill metrics or rollup data.
/// This SQL is registered under the `message_rollups` batch in
/// `CURRENT_SCHEMA_REPAIR_BATCHES`.
5283const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
5284CREATE TABLE IF NOT EXISTS message_metrics (
5285    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
5286    created_at_ms INTEGER NOT NULL,
5287    hour_id INTEGER NOT NULL,
5288    day_id INTEGER NOT NULL,
5289    agent_slug TEXT NOT NULL,
5290    workspace_id INTEGER NOT NULL DEFAULT 0,
5291    source_id TEXT NOT NULL DEFAULT 'local',
5292    role TEXT NOT NULL,
5293    content_chars INTEGER NOT NULL,
5294    content_tokens_est INTEGER NOT NULL,
5295    api_input_tokens INTEGER,
5296    api_output_tokens INTEGER,
5297    api_cache_read_tokens INTEGER,
5298    api_cache_creation_tokens INTEGER,
5299    api_thinking_tokens INTEGER,
5300    api_service_tier TEXT,
5301    api_data_source TEXT NOT NULL DEFAULT 'estimated',
5302    tool_call_count INTEGER NOT NULL DEFAULT 0,
5303    has_tool_calls INTEGER NOT NULL DEFAULT 0,
5304    has_plan INTEGER NOT NULL DEFAULT 0,
5305    model_name TEXT,
5306    model_family TEXT NOT NULL DEFAULT 'unknown',
5307    model_tier TEXT NOT NULL DEFAULT 'unknown',
5308    provider TEXT NOT NULL DEFAULT 'unknown'
5309);
5310
5311CREATE TABLE IF NOT EXISTS usage_hourly (
5312    hour_id INTEGER NOT NULL,
5313    agent_slug TEXT NOT NULL,
5314    workspace_id INTEGER NOT NULL DEFAULT 0,
5315    source_id TEXT NOT NULL DEFAULT 'local',
5316    message_count INTEGER NOT NULL DEFAULT 0,
5317    user_message_count INTEGER NOT NULL DEFAULT 0,
5318    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5319    tool_call_count INTEGER NOT NULL DEFAULT 0,
5320    plan_message_count INTEGER NOT NULL DEFAULT 0,
5321    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5322    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5323    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5324    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5325    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5326    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5327    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5328    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5329    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5330    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5331    last_updated INTEGER NOT NULL DEFAULT 0,
5332    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5333    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5334    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
5335);
5336
5337CREATE TABLE IF NOT EXISTS usage_daily (
5338    day_id INTEGER NOT NULL,
5339    agent_slug TEXT NOT NULL,
5340    workspace_id INTEGER NOT NULL DEFAULT 0,
5341    source_id TEXT NOT NULL DEFAULT 'local',
5342    message_count INTEGER NOT NULL DEFAULT 0,
5343    user_message_count INTEGER NOT NULL DEFAULT 0,
5344    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5345    tool_call_count INTEGER NOT NULL DEFAULT 0,
5346    plan_message_count INTEGER NOT NULL DEFAULT 0,
5347    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5348    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5349    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5350    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5351    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5352    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5353    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5354    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5355    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5356    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5357    last_updated INTEGER NOT NULL DEFAULT 0,
5358    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5359    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
5360    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
5361);
5362
5363CREATE TABLE IF NOT EXISTS usage_models_daily (
5364    day_id INTEGER NOT NULL,
5365    agent_slug TEXT NOT NULL,
5366    workspace_id INTEGER NOT NULL DEFAULT 0,
5367    source_id TEXT NOT NULL DEFAULT 'local',
5368    model_family TEXT NOT NULL DEFAULT 'unknown',
5369    model_tier TEXT NOT NULL DEFAULT 'unknown',
5370    message_count INTEGER NOT NULL DEFAULT 0,
5371    user_message_count INTEGER NOT NULL DEFAULT 0,
5372    assistant_message_count INTEGER NOT NULL DEFAULT 0,
5373    tool_call_count INTEGER NOT NULL DEFAULT 0,
5374    plan_message_count INTEGER NOT NULL DEFAULT 0,
5375    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
5376    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
5377    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
5378    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
5379    api_tokens_total INTEGER NOT NULL DEFAULT 0,
5380    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
5381    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
5382    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
5383    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
5384    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
5385    last_updated INTEGER NOT NULL DEFAULT 0,
5386    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
5387);
5388
5389CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
5390CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
5391CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
5392CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
5393CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
5394CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
5395CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
5396CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
5397CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
5398CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
5399CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
5400CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
5401CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
5402CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
5403CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
5404CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
5405CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
5406CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
5407";
5408
/// Registry of every repair batch for the current schema.
///
/// `current_schema_repair_batches_for_missing_tables` scans this slice in
/// declaration order and returns the batches in that same order. A table that
/// can be reported missing must appear in the `tables` list of some entry
/// here, otherwise that function returns an error for it.
5409const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
5410    SchemaRepairBatch {
5411        name: "sources",
5412        tables: &["sources"],
5413        sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
5414    },
5415    SchemaRepairBatch {
5416        name: "daily_stats",
5417        tables: &["daily_stats"],
5418        sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
5419    },
5420    SchemaRepairBatch {
5421        name: "conversation_external_lookup",
5422        tables: &["conversation_external_lookup"],
5423        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
5424    },
    // One batch covers both tail tables: its SQL creates both and fills the
    // lookup table from a join against `conversation_tail_state`.
5425    SchemaRepairBatch {
5426        name: "conversation_external_tail_lookup",
5427        tables: &[
5428            "conversation_tail_state",
5429            "conversation_external_tail_lookup",
5430        ],
5431        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
5432    },
5433    SchemaRepairBatch {
5434        name: "embedding_jobs",
5435        tables: &["embedding_jobs"],
5436        sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
5437    },
5438    SchemaRepairBatch {
5439        name: "token_analytics",
5440        tables: &["token_usage", "token_daily_stats", "model_pricing"],
5441        sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
5442    },
5443    SchemaRepairBatch {
5444        name: "message_rollups",
5445        tables: &[
5446            "message_metrics",
5447            "usage_hourly",
5448            "usage_daily",
5449            "usage_models_daily",
5450        ],
5451        sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
5452    },
5453];
5454
5455fn current_schema_repair_batches_for_missing_tables(
5456    missing_tables: &[&'static str],
5457) -> Result<Vec<&'static SchemaRepairBatch>> {
5458    let missing_set: HashSet<&'static str> = missing_tables.iter().copied().collect();
5459    let mut selected_batches = Vec::new();
5460    let mut covered_tables = HashSet::new();
5461
5462    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
5463        if !batch
5464            .tables
5465            .iter()
5466            .any(|table_name| missing_set.contains(table_name))
5467        {
5468            continue;
5469        }
5470        selected_batches.push(batch);
5471        covered_tables.extend(batch.tables.iter().copied());
5472    }
5473
5474    for &table_name in missing_tables {
5475        if !covered_tables.contains(table_name) {
5476            return Err(anyhow!(
5477                "no current-schema repair batch registered for missing table {table_name}"
5478            ));
5479        }
5480    }
5481
5482    Ok(selected_batches)
5483}
5484
5485/// Migration name lookup for backfilling `_schema_migrations` during transition.
///
/// Each entry is `(version, name)`. Entries are listed in ascending version
/// order, and `transition_from_meta_version` stops at the first entry whose
/// version exceeds its backfill target, so new entries must keep that order.
5486const MIGRATION_NAMES: [(i64, &str); 20] = [
5487    (1, "core_tables"),
5488    (2, "fts_messages"),
5489    (3, "fts_messages_rebuild"),
5490    (4, "sources"),
5491    (5, "provenance_columns"),
5492    (6, "source_path_index"),
5493    (7, "msgpack_columns"),
5494    (8, "daily_stats"),
5495    (9, "embedding_jobs"),
5496    (10, "token_analytics"),
5497    (11, "message_metrics"),
5498    (12, "model_dimensions"),
5499    (13, "plan_token_rollups"),
5500    (14, "fts_contentless"),
5501    (15, "conversation_tail_state_cache"),
5502    (16, "drop_redundant_message_conv_idx"),
5503    (17, "drop_message_created_idx"),
5504    (18, "conversation_tail_state_hot_table"),
5505    (19, "conversation_external_lookup"),
5506    (20, "conversation_external_tail_lookup"),
5507];
5508
5509/// Transitions an existing database from `meta` table schema versioning to the
5510/// `_schema_migrations` table used by `MigrationRunner`.
5511///
5512/// The existing `SqliteStorage` tracks schema version as a string value in
5513/// `meta WHERE key = 'schema_version'`. The bead spec references
5514/// `PRAGMA user_version`, but the actual cass code uses the `meta` table.
5515/// This function handles the real code path.
5516///
5517/// Behavior:
5518/// - If `_schema_migrations` already exists → skip (already transitioned)
5519/// - If `meta` table has `schema_version > 0` → create `_schema_migrations`
5520///   and backfill entries for versions `1..=current_version`
5521/// - Legacy V10-V12 databases are represented as V13 in `_schema_migrations`
5522///   because frankensqlite uses one combined V13 base migration instead of
5523///   replaying the old incremental V11-V13 steps.
5524/// - If `meta` table missing or `schema_version = 0` with no tables → fresh DB,
5525///   let `MigrationRunner` handle it
5526/// - If `schema_version = 0` but tables exist → corrupted state, log warning
5527fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
5528    // Avoid sqlite_master enumeration here. Databases with FTS virtual tables
5529    // can trigger frankensqlite parse-recovery on sqlite_master reads, which is
5530    // enough to break the transition on otherwise-healthy legacy cass DBs.
5531    if conn
5532        .query("SELECT version FROM \"_schema_migrations\";")
5533        .is_ok()
5534    {
5535        return Ok(());
5536    }
5537
5538    // Check if the meta table exists.
5539    if conn.query("SELECT key FROM meta;").is_err() {
5540        // No meta table → fresh database, let MigrationRunner handle it.
5541        return Ok(());
5542    }
5543
5544    // Read the current schema version from the meta table.
5545    let rows = conn
5546        .query("SELECT value FROM meta WHERE key = 'schema_version';")
5547        .with_context(|| "reading schema_version from meta")?;
5548
5549    let current_version: i64 = rows
5550        .first()
5551        .and_then(|row| row.get_typed::<String>(0).ok())
5552        .and_then(|s| s.parse().ok())
5553        .unwrap_or(0);
5554
5555    if current_version == 0 {
5556        // Check if tables actually exist (corrupted state: tables present but version=0).
5557        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
5558            // Truly fresh DB (meta table exists but empty/reset). Let MigrationRunner handle it.
5559            return Ok(());
5560        }
5561
5562        // Tables exist but version=0: corrupted state. Log and skip transition;
5563        // MigrationRunner will fail on "table already exists" and surface the error.
5564        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
5565        return Ok(());
5566    }
5567
5568    // Create _schema_migrations and backfill entries for all applied versions.
5569    info!(
5570        current_version,
5571        "transitioning schema tracking from meta table to _schema_migrations"
5572    );
5573
5574    conn.execute(
5575        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
5576            version INTEGER PRIMARY KEY, \
5577            name TEXT NOT NULL, \
5578            applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
5579        );",
5580    )
5581    .with_context(|| "creating _schema_migrations table for transition")?;
5582
5583    let backfill_through_version = if (10..13).contains(&current_version) {
5584        13
5585    } else {
5586        current_version
5587    };
5588
5589    for &(version, name) in &MIGRATION_NAMES {
5590        if version > backfill_through_version {
5591            break;
5592        }
5593        conn.execute_compat(
5594            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
5595            &[ParamValue::from(version), ParamValue::from(name)],
5596        )
5597        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
5598    }
5599
5600    info!(
5601        current_version,
5602        backfill_through_version,
5603        "schema version transition complete: backfilled legacy meta schema versions"
5604    );
5605
5606    Ok(())
5607}
5608
/// `(table name, probe query)` pairs, one per table listed here.
///
/// Every probe is a `SELECT ... LIMIT 1` that names the table and at least one
/// of its columns.
// NOTE(review): presumably a probe that fails marks the table (or column) as
// missing from the current schema — confirm against the caller.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
    ("sources", "SELECT id FROM sources LIMIT 1;"),
    ("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
    (
        "conversation_external_lookup",
        "SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
    ),
    (
        "conversation_tail_state",
        "SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
    ),
    (
        "conversation_external_tail_lookup",
        "SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
    ),
    ("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
    ("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
    (
        "token_daily_stats",
        "SELECT day_id FROM token_daily_stats LIMIT 1;",
    ),
    (
        "model_pricing",
        "SELECT model_pattern FROM model_pricing LIMIT 1;",
    ),
    (
        "message_metrics",
        "SELECT message_id FROM message_metrics LIMIT 1;",
    ),
    ("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
    ("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
    (
        "usage_models_daily",
        "SELECT day_id FROM usage_models_daily LIMIT 1;",
    ),
];
5645
/// `(column name, SQL type)` pairs for the token/cost summary columns.
// NOTE(review): the name suggests these columns belong to `conversations` and
// are added when absent — confirm against the code that consumes this table.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
    ("total_input_tokens", "INTEGER"),
    ("total_output_tokens", "INTEGER"),
    ("total_cache_read_tokens", "INTEGER"),
    ("total_cache_creation_tokens", "INTEGER"),
    ("grand_total_tokens", "INTEGER"),
    ("estimated_cost_usd", "REAL"),
    ("primary_model", "TEXT"),
    ("api_call_count", "INTEGER"),
    ("tool_call_count", "INTEGER"),
    ("user_message_count", "INTEGER"),
    ("assistant_message_count", "INTEGER"),
];
5659
/// Returns `true` when the rendered error text contains `no such table`,
/// compared without regard to ASCII case.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    const MARKER: &[u8] = b"no such table";
    let rendered = err.to_string();
    rendered
        .as_bytes()
        .windows(MARKER.len())
        .any(|window| window.eq_ignore_ascii_case(MARKER))
}
5665
/// Returns `true` when the rendered error text contains `no such column`,
/// compared without regard to ASCII case.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    const MARKER: &[u8] = b"no such column";
    let rendered = err.to_string();
    rendered
        .as_bytes()
        .windows(MARKER.len())
        .any(|window| window.eq_ignore_ascii_case(MARKER))
}
5671
/// Maximum number of ids bound into one chunked `DELETE ... IN (...)` statement
/// during orphan cleanup; also the `LIMIT` bound when paging direct orphan ids.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
5673
5674fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
5675    let min_conversation_id = conn
5676        .query_map_collect(
5677            "SELECT conversation_id
5678             FROM messages
5679             ORDER BY conversation_id ASC
5680             LIMIT 1",
5681            fparams![],
5682            |row| row.get_typed(0),
5683        )
5684        .context("finding minimum message conversation id for orphan FK cleanup")?
5685        .into_iter()
5686        .next();
5687    let Some(min_conversation_id) = min_conversation_id else {
5688        return Ok(Vec::new());
5689    };
5690    let max_conversation_id: i64 = conn
5691        .query_row_map(
5692            "SELECT conversation_id
5693             FROM messages
5694             ORDER BY conversation_id DESC
5695             LIMIT 1",
5696            fparams![],
5697            |row| row.get_typed(0),
5698        )
5699        .context("finding maximum message conversation id for orphan FK cleanup")?;
5700
5701    let parent_conversation_ids: Vec<i64> = conn
5702        .query_map_collect(
5703            "SELECT id
5704             FROM conversations
5705             WHERE id BETWEEN ?1 AND ?2
5706             ORDER BY id",
5707            fparams![min_conversation_id, max_conversation_id],
5708            |row| row.get_typed(0),
5709        )
5710        .context("listing parent conversation ids for orphan FK cleanup")?;
5711
5712    let mut message_ids = Vec::new();
5713    let mut gap_start = min_conversation_id;
5714    for parent_id in parent_conversation_ids {
5715        if parent_id < gap_start {
5716            continue;
5717        }
5718        if parent_id > max_conversation_id {
5719            break;
5720        }
5721        if gap_start < parent_id {
5722            collect_message_ids_for_conversation_gap(
5723                conn,
5724                gap_start,
5725                parent_id.saturating_sub(1),
5726                &mut message_ids,
5727            )?;
5728        }
5729        if parent_id == i64::MAX {
5730            return Ok(message_ids);
5731        }
5732        gap_start = parent_id + 1;
5733    }
5734    if gap_start <= max_conversation_id {
5735        collect_message_ids_for_conversation_gap(
5736            conn,
5737            gap_start,
5738            max_conversation_id,
5739            &mut message_ids,
5740        )?;
5741    }
5742
5743    Ok(message_ids)
5744}
5745
5746fn collect_message_ids_for_conversation_gap(
5747    conn: &FrankenConnection,
5748    gap_start: i64,
5749    gap_end: i64,
5750    message_ids: &mut Vec<i64>,
5751) -> Result<()> {
5752    let (sql, params) = if gap_start == gap_end {
5753        (
5754            "SELECT id FROM messages WHERE conversation_id = ?1",
5755            vec![SqliteValue::from(gap_start)],
5756        )
5757    } else {
5758        (
5759            "SELECT id FROM messages WHERE conversation_id BETWEEN ?1 AND ?2",
5760            vec![SqliteValue::from(gap_start), SqliteValue::from(gap_end)],
5761        )
5762    };
5763    let rows = conn.query_with_params(sql, &params).with_context(|| {
5764        format!("listing orphan message ids for conversation-id gap {gap_start}..={gap_end}")
5765    })?;
5766    message_ids.reserve(rows.len());
5767    for row in rows {
5768        message_ids.push(row.get_typed(0)?);
5769    }
5770    Ok(())
5771}
5772
5773fn delete_rows_by_i64_chunks(
5774    tx: &FrankenTransaction<'_>,
5775    delete_many_sql_prefix: &'static str,
5776    ids: &[i64],
5777) -> Result<usize> {
5778    if ids.is_empty() {
5779        return Ok(0);
5780    }
5781
5782    let full_chunk_sql = delete_rows_by_i64_sql(delete_many_sql_prefix, ORPHAN_FK_ID_CHUNK_SIZE);
5783    let tail_len = ids.len() % ORPHAN_FK_ID_CHUNK_SIZE;
5784    let tail_sql =
5785        (tail_len != 0).then(|| delete_rows_by_i64_sql(delete_many_sql_prefix, tail_len));
5786
5787    let mut deleted = 0;
5788    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5789        let sql = if chunk.len() == ORPHAN_FK_ID_CHUNK_SIZE {
5790            &full_chunk_sql
5791        } else {
5792            tail_sql.as_ref().unwrap_or(&full_chunk_sql)
5793        };
5794        let params = chunk
5795            .iter()
5796            .map(|id| SqliteValue::from(*id))
5797            .collect::<Vec<_>>();
5798        deleted += tx.execute_with_params(sql, &params)?;
5799    }
5800    Ok(deleted)
5801}
5802
5803fn delete_rows_by_i64_sql(delete_many_sql_prefix: &'static str, count: usize) -> String {
5804    let placeholders = sql_placeholders(count);
5805    format!("{delete_many_sql_prefix} ({placeholders})")
5806}
5807
/// Returns `count` `?` placeholders separated by `", "`; empty when `count` is 0.
fn sql_placeholders(count: usize) -> String {
    let mut list = String::with_capacity(count.saturating_mul(3));
    for position in 0..count {
        if position != 0 {
            list.push_str(", ");
        }
        list.push('?');
    }
    list
}
5811
5812fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5813    let mut deleted = 0usize;
5814    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5815        deleted = deleted.saturating_add(delete_orphan_message_id_chunk(conn, chunk)?);
5816    }
5817    Ok(deleted)
5818}
5819
5820fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5821    if ids.is_empty() {
5822        return Ok(0);
5823    }
5824
5825    match delete_orphan_message_id_chunk_once(conn, ids) {
5826        Ok(deleted) => Ok(deleted),
5827        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5828            let split_at = ids.len() / 2;
5829            tracing::warn!(
5830                target: "cass::fk_repair",
5831                rows = ids.len(),
5832                left = split_at,
5833                right = ids.len().saturating_sub(split_at),
5834                error = %err,
5835                "orphan-message cleanup ran out of memory; retrying as smaller batches"
5836            );
5837            let left = delete_orphan_message_id_chunk(conn, &ids[..split_at])?;
5838            let right = delete_orphan_message_id_chunk(conn, &ids[split_at..])?;
5839            Ok(left.saturating_add(right))
5840        }
5841        Err(err) => Err(err),
5842    }
5843}
5844
5845fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5846    let mut tx = conn.transaction()?;
5847    let mut deleted = 0usize;
5848    for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
5849        match delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids) {
5850            Ok(count) => {
5851                deleted = deleted.saturating_add(count);
5852            }
5853            Err(err) if error_indicates_missing_table(&err) => {
5854                tracing::debug!(
5855                    target: "cass::fk_repair",
5856                    child_table = entry.child_table,
5857                    error = %err,
5858                    "skipping orphan-message dependent cleanup (table unavailable)"
5859                );
5860            }
5861            Err(err) => {
5862                return Err(err).with_context(|| {
5863                    format!(
5864                        "deleting rows from {} that depend on orphan messages",
5865                        entry.child_table
5866                    )
5867                });
5868            }
5869        }
5870    }
5871    deleted = deleted.saturating_add(
5872        delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id IN", ids)
5873            .context("deleting orphan rows from messages")?,
5874    );
5875    tx.commit()?;
5876    Ok(deleted)
5877}
5878
5879fn collect_direct_orphan_id_page(
5880    conn: &FrankenConnection,
5881    entry: &'static OrphanFkTable,
5882) -> Result<Vec<i64>> {
5883    Ok(conn.query_map_collect(
5884        entry.orphan_id_page_sql,
5885        fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
5886        |row| row.get_typed(0),
5887    )?)
5888}
5889
5890fn delete_direct_orphan_ids_bisecting_oom(
5891    conn: &FrankenConnection,
5892    entry: &'static OrphanFkTable,
5893    ids: &[i64],
5894) -> Result<usize> {
5895    let mut deleted = 0usize;
5896    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5897        deleted = deleted.saturating_add(delete_direct_orphan_id_chunk(conn, entry, chunk)?);
5898    }
5899    Ok(deleted)
5900}
5901
5902fn delete_direct_orphan_id_chunk(
5903    conn: &FrankenConnection,
5904    entry: &'static OrphanFkTable,
5905    ids: &[i64],
5906) -> Result<usize> {
5907    if ids.is_empty() {
5908        return Ok(0);
5909    }
5910
5911    match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
5912        Ok(deleted) => Ok(deleted),
5913        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5914            let split_at = ids.len() / 2;
5915            tracing::warn!(
5916                target: "cass::fk_repair",
5917                child_table = entry.child_table,
5918                rows = ids.len(),
5919                left = split_at,
5920                right = ids.len().saturating_sub(split_at),
5921                error = %err,
5922                "direct orphan cleanup ran out of memory; retrying as smaller batches"
5923            );
5924            let left = delete_direct_orphan_id_chunk(conn, entry, &ids[..split_at])?;
5925            let right = delete_direct_orphan_id_chunk(conn, entry, &ids[split_at..])?;
5926            Ok(left.saturating_add(right))
5927        }
5928        Err(err) => Err(err),
5929    }
5930}
5931
5932fn delete_direct_orphan_id_chunk_once(
5933    conn: &FrankenConnection,
5934    entry: &'static OrphanFkTable,
5935    ids: &[i64],
5936) -> Result<usize> {
5937    let mut tx = conn.transaction()?;
5938    let deleted = delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids)?;
5939    tx.commit()?;
5940    Ok(deleted)
5941}
5942
/// Tables whose FK parent rows can go missing when an index transaction is
/// dropped mid-flight. The select and delete SQL strings are intentionally
/// static (no dynamic table names) so they can be audited at a glance and so
/// they cannot be subverted by injected identifiers. The select statement
/// yields the integer FK key used by the matching chunked delete.
struct OrphanFkTable {
    /// Name of the child table; used as a field in cleanup log events.
    child_table: &'static str,
    /// Query that returns one page of orphaned FK keys; takes a single bound
    /// parameter, the page size.
    orphan_id_page_sql: &'static str,
    /// `DELETE ... WHERE <fk column> IN` prefix; the parenthesised placeholder
    /// list is appended when the statement is built.
    delete_many_sql_prefix: &'static str,
}
5953
/// Child tables whose own rows can reference a parent that no longer exists.
///
/// The first three entries select `message_id` values with no matching
/// `messages.id`; `conversation_tags` selects `conversation_id` values with no
/// matching `conversations.id`. Each page query is ordered by that key and
/// bounded by `LIMIT ?1`.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
    OrphanFkTable {
        child_table: "message_metrics",
        orphan_id_page_sql: "SELECT message_id FROM message_metrics \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = message_metrics.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "token_usage",
        orphan_id_page_sql: "SELECT message_id FROM token_usage \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = token_usage.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "snippets",
        orphan_id_page_sql: "SELECT message_id FROM snippets \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = snippets.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "conversation_tags",
        orphan_id_page_sql: "SELECT conversation_id FROM conversation_tags \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM conversations \
                                 WHERE conversations.id = conversation_tags.conversation_id\
                             ) \
                             ORDER BY conversation_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
    },
];
6000
/// A table whose rows reference `messages` by `message_id` and are deleted
/// together with an orphan message.
struct OrphanMessageDependentTable {
    /// Name of the dependent table; used in log events and error context.
    child_table: &'static str,
    /// `DELETE ... WHERE message_id IN` prefix; the parenthesised placeholder
    /// list is appended when the statement is built.
    delete_many_sql_prefix: &'static str,
}
6005
/// Dependent tables cleared, in this order, before the orphan `messages` rows
/// themselves are deleted.
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
    OrphanMessageDependentTable {
        child_table: "message_metrics",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "token_usage",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "snippets",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
];
6020
/// Summary of orphan rows detected and removed by `cleanup_orphan_fk_rows`.
///
/// Message-root counts come from the probe phase, while direct child counts
/// come from bounded page deletes. Under the function's intended use — a single
/// indexer-startup pass holding the index run lock — no concurrent writers
/// exist, so these counts match the primary orphan roots identified and
/// removed during cleanup. Dependent rows below an orphan message
/// (`message_metrics` / `token_usage` / `snippets`) are an expected consequence
/// of removing that root orphan and are *not* separately counted in `total` or
/// `per_table`.
#[derive(Debug, Default, Clone)]
pub(crate) struct OrphanFkCleanupReport {
    /// Saturating sum of every recorded count.
    pub total: i64,
    /// One `(table, count)` entry per distinct table, in first-recorded order.
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the entry for `child_table`, creating the entry when the
    /// table has not been recorded yet, and adds `count` to `total`.
    ///
    /// Both additions saturate instead of overflowing.
    fn record(&mut self, child_table: &'static str, count: i64) {
        self.total = self.total.saturating_add(count);

        for (table, tally) in &mut self.per_table {
            if *table == child_table {
                *tally = tally.saturating_add(count);
                return;
            }
        }
        self.per_table.push((child_table, count));
    }
}
6051
/// Result of inserting a conversation and its messages.
// NOTE(review): field meanings below are inferred from the names — confirm
// against the code that builds this value.
pub struct InsertOutcome {
    /// Row id of the conversation.
    pub conversation_id: i64,
    /// Presumably `true` when the conversation row was newly created.
    pub conversation_inserted: bool,
    /// Presumably the `idx` values of the messages that were inserted.
    pub inserted_indices: Vec<i64>,
}
6057
/// Test-only counters and timers for the sub-stages of message insertion.
// NOTE(review): what each counter measures is inferred from its name; the
// code that updates them is outside this view.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
    single_row_calls: usize,
    batch_calls: usize,
    batch_rows: usize,
    payload_duration: Duration,
    sql_build_duration: Duration,
    param_build_duration: Duration,
    execute_duration: Duration,
    rowid_duration: Duration,
}
6070
/// Test-only per-stage timing profile for conversation-tree inserts.
///
/// `log_summary` prints every field; the time in `total_duration` not covered
/// by the individual stage durations is reported there as `residual_ms`.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
    /// Number of profiled calls; divisor for the per-call averages.
    invocations: usize,
    messages: usize,
    inserted_messages: usize,
    total_duration: Duration,
    source_duration: Duration,
    tx_open_duration: Duration,
    existing_lookup_duration: Duration,
    existing_idx_lookup_duration: Duration,
    existing_replay_lookup_duration: Duration,
    dedupe_filter_duration: Duration,
    conversation_row_duration: Duration,
    message_insert_duration: Duration,
    /// Finer-grained breakdown of the message-insert stage.
    message_insert_breakdown: MessageInsertSubstageProfile,
    snippet_insert_duration: Duration,
    fts_entry_duration: Duration,
    fts_flush_duration: Duration,
    analytics_duration: Duration,
    commit_duration: Duration,
}
6093
#[cfg(test)]
impl InsertConversationTreePerfProfile {
    /// Converts a duration to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        1000.0 * duration.as_secs_f64()
    }

    /// Prints a single `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr with
    /// every stage total, the message-insert breakdown, the unaccounted
    /// residual, and per-call averages.
    ///
    /// Averages divide by `invocations`, treating zero invocations as one.
    fn log_summary(&self, label: &str) {
        let ms = Self::millis;
        let breakdown = &self.message_insert_breakdown;
        let per_call = self.invocations.max(1) as f64;

        // Sum of all individually timed stages; whatever remains of the total
        // is reported as residual (clamped at zero).
        let accounted: Duration = [
            self.source_duration,
            self.tx_open_duration,
            self.existing_lookup_duration,
            self.existing_idx_lookup_duration,
            self.existing_replay_lookup_duration,
            self.dedupe_filter_duration,
            self.conversation_row_duration,
            self.message_insert_duration,
            self.snippet_insert_duration,
            self.fts_entry_duration,
            self.fts_flush_duration,
            self.analytics_duration,
            self.commit_duration,
        ]
        .into_iter()
        .sum();
        let residual = self.total_duration.saturating_sub(accounted);

        eprintln!(
            concat!(
                "CASS_INSERT_TREE_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
                "existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
                "conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
                "fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
                "msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
                "residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
                "avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            ms(self.total_duration),
            ms(self.source_duration),
            ms(self.tx_open_duration),
            ms(self.existing_lookup_duration),
            ms(self.existing_idx_lookup_duration),
            ms(self.existing_replay_lookup_duration),
            ms(self.dedupe_filter_duration),
            ms(self.conversation_row_duration),
            ms(self.message_insert_duration),
            ms(self.snippet_insert_duration),
            ms(self.fts_entry_duration),
            ms(self.fts_flush_duration),
            ms(self.analytics_duration),
            ms(self.commit_duration),
            ms(breakdown.payload_duration),
            ms(breakdown.sql_build_duration),
            ms(breakdown.param_build_duration),
            ms(breakdown.execute_duration),
            ms(breakdown.rowid_duration),
            ms(residual),
            ms(self.total_duration) / per_call,
            ms(self.message_insert_duration) / per_call,
            ms(breakdown.execute_duration) / per_call,
            ms(breakdown.payload_duration) / per_call,
            ms(self.snippet_insert_duration) / per_call,
            ms(self.fts_entry_duration) / per_call,
            ms(self.commit_duration) / per_call,
        );
    }
}
6162
/// Identity of a conversation used to look up an existing row.
// NOTE(review): the name suggests these keys also track conversations queued
// within a batch — confirm against the callers.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
    /// Identified by source, agent, and the conversation's external id.
    External {
        source_id: String,
        agent_id: i64,
        external_id: String,
    },
    /// Identified by source, agent, source path, and optional start time.
    // NOTE(review): presumably used when no external id is available.
    SourcePath {
        source_id: String,
        agent_id: i64,
        source_path: String,
        started_at: Option<i64>,
    },
}
6177
/// Builds the lookup key
/// `<source chars>:<source_id>:<agent_id>:<external chars>:<external_id>`.
///
/// The two length prefixes are counts of Unicode scalar values, not bytes.
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_chars = source_id.chars().count();
    let external_chars = external_id.chars().count();
    format!("{source_chars}:{source_id}:{agent_id}:{external_chars}:{external_id}")
}
6185
6186fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
6187    conv.external_id
6188        .as_deref()
6189        .map(|external_id| conversation_external_lookup_key(&conv.source_id, agent_id, external_id))
6190}
6191
/// Identity of a message at a specific position: its `idx`, timestamp, role,
/// author, and a hash of its content.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageMergeFingerprint {
    /// The message's `idx` within its conversation.
    idx: i64,
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    /// BLAKE3 hash of the message content bytes.
    content_hash: [u8; 32],
}
6200
/// Position-independent identity of a message: the same fields as
/// `MessageMergeFingerprint` without `idx`, so the same message stored at a
/// different index still compares equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageReplayFingerprint {
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    /// BLAKE3 hash of the message content bytes.
    content_hash: [u8; 32],
}
6208
/// Measurements comparing two conversations.
// NOTE(review): field meanings are inferred from the names; the code that
// fills and evaluates this struct is outside this view — confirm there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
    /// Presumably the number of shared merge fingerprints (idx included).
    exact_overlap: usize,
    /// Presumably the number of shared replay fingerprints (idx ignored).
    replay_overlap: usize,
    /// Presumably the size of the smaller of the two replay-fingerprint sets.
    smaller_replay_set: usize,
    started_close: bool,
    start_distance_ms: i64,
}
6217
/// Result of filtering an incoming conversation's messages against the ones
/// already stored for an existing conversation.
struct ExistingConversationNewMessages<'a> {
    /// Incoming messages accepted as new, in input order.
    messages: Vec<&'a Message>,
    /// Sum of `content.len()` (a byte count) over the accepted messages.
    new_chars: i64,
    /// Number of incoming messages whose `idx` already exists with a different
    /// fingerprint.
    idx_collision_count: usize,
    /// `idx` of the first such collision, if any.
    first_collision_idx: Option<i64>,
}
6224
/// Tail position of a stored conversation: its last message index and
/// timestamp, plus the conversation's `ended_at` value when one is recorded.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
    last_message_idx: i64,
    last_message_created_at: i64,
    ended_at: Option<i64>,
}
6231
/// An existing conversation's row id together with its tail state, when one
/// is available.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationWithTail {
    /// Row id of the conversation.
    id: i64,
    /// Tail state, if any.
    tail_state: Option<ExistingConversationTailState>,
}
6237
6238fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
6239    conv.started_at
6240        .or_else(|| conv.messages.iter().filter_map(|msg| msg.created_at).min())
6241}
6242
6243fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
6244    (
6245        conv.messages.iter().map(|msg| msg.idx).max(),
6246        conv.messages.iter().filter_map(|msg| msg.created_at).max(),
6247    )
6248}
6249
6250fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
6251    (
6252        messages.iter().map(|msg| msg.idx).max(),
6253        messages.iter().filter_map(|msg| msg.created_at).max(),
6254    )
6255}
6256
6257fn role_from_str(role: &str) -> MessageRole {
6258    match role {
6259        "user" => MessageRole::User,
6260        "agent" | "assistant" => MessageRole::Agent,
6261        "tool" => MessageRole::Tool,
6262        "system" => MessageRole::System,
6263        other => MessageRole::Other(other.to_string()),
6264    }
6265}
6266
6267fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
6268    MessageMergeFingerprint {
6269        idx: msg.idx,
6270        created_at: msg.created_at,
6271        role: msg.role.clone(),
6272        author: msg.author.clone(),
6273        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
6274    }
6275}
6276
6277fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
6278    MessageReplayFingerprint {
6279        created_at: msg.created_at,
6280        role: msg.role.clone(),
6281        author: msg.author.clone(),
6282        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
6283    }
6284}
6285
6286fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
6287    conv.messages
6288        .iter()
6289        .map(message_merge_fingerprint)
6290        .collect()
6291}
6292
6293fn conversation_message_replay_fingerprints(
6294    conv: &Conversation,
6295) -> HashSet<MessageReplayFingerprint> {
6296    conv.messages
6297        .iter()
6298        .map(message_replay_fingerprint)
6299        .collect()
6300}
6301
6302fn replay_fingerprint_from_merge(
6303    fingerprint: &MessageMergeFingerprint,
6304) -> MessageReplayFingerprint {
6305    MessageReplayFingerprint {
6306        created_at: fingerprint.created_at,
6307        role: fingerprint.role.clone(),
6308        author: fingerprint.author.clone(),
6309        content_hash: fingerprint.content_hash,
6310    }
6311}
6312
6313fn replay_fingerprints_from_merge_set(
6314    fingerprints: &HashSet<MessageMergeFingerprint>,
6315) -> HashSet<MessageReplayFingerprint> {
6316    fingerprints
6317        .iter()
6318        .map(replay_fingerprint_from_merge)
6319        .collect()
6320}
6321
6322fn collect_new_messages_for_existing_conversation<'a>(
6323    conversation_id: i64,
6324    conv: &'a Conversation,
6325    existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
6326    existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
6327    replay_skip_log: &'static str,
6328) -> ExistingConversationNewMessages<'a> {
6329    let mut idx_collision_count = 0usize;
6330    let mut first_collision_idx: Option<i64> = None;
6331    let mut new_chars: i64 = 0;
6332    let mut messages = Vec::new();
6333
6334    for msg in &conv.messages {
6335        let incoming_fingerprint = message_merge_fingerprint(msg);
6336        if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
6337            if existing_fingerprint != &incoming_fingerprint {
6338                idx_collision_count = idx_collision_count.saturating_add(1);
6339                first_collision_idx.get_or_insert(msg.idx);
6340            }
6341            continue;
6342        }
6343
6344        let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
6345        if existing_replay_fingerprints.contains(&incoming_replay) {
6346            tracing::debug!(
6347                conversation_id,
6348                idx = msg.idx,
6349                source_path = %conv.source_path.display(),
6350                "{replay_skip_log}"
6351            );
6352            continue;
6353        }
6354
6355        existing_messages.insert(msg.idx, incoming_fingerprint);
6356        existing_replay_fingerprints.insert(incoming_replay);
6357        new_chars += msg.content.len() as i64;
6358        messages.push(msg);
6359    }
6360
6361    ExistingConversationNewMessages {
6362        messages,
6363        new_chars,
6364        idx_collision_count,
6365        first_collision_idx,
6366    }
6367}
6368
6369fn franken_existing_conversation_append_tail_state(
6370    tx: &FrankenTransaction<'_>,
6371    conversation_id: i64,
6372) -> Result<Option<ExistingConversationTailState>> {
6373    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
6374        .query_row_map(
6375            "SELECT last_message_idx, last_message_created_at, ended_at
6376             FROM conversation_tail_state
6377             WHERE conversation_id = ?1",
6378            fparams![conversation_id],
6379            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6380        )
6381        .optional()?;
6382    if let Some(cached) = cached {
6383        let (_, _, cached_ended_at) = cached;
6384        if let Some(tail_state) =
6385            existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
6386        {
6387            return Ok(Some(tail_state));
6388        }
6389    }
6390
6391    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
6392        "SELECT last_message_idx, last_message_created_at, ended_at
6393         FROM conversations
6394         WHERE id = ?1",
6395        fparams![conversation_id],
6396        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6397    )?;
6398    let (_, _, cached_ended_at) = legacy_cached;
6399    if let Some(tail_state) = existing_conversation_tail_state_from_cached(
6400        legacy_cached.0,
6401        legacy_cached.1,
6402        cached_ended_at,
6403    ) {
6404        franken_insert_conversation_tail_state(
6405            tx,
6406            conversation_id,
6407            cached_ended_at,
6408            Some(tail_state.last_message_idx),
6409            Some(tail_state.last_message_created_at),
6410        )?;
6411        return Ok(Some(tail_state));
6412    }
6413
6414    let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
6415        "SELECT MAX(idx), MAX(created_at)
6416         FROM messages
6417         WHERE conversation_id = ?1",
6418        fparams![conversation_id],
6419        |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
6420    )?;
6421    if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
6422        franken_update_conversation_tail_state(
6423            tx,
6424            conversation_id,
6425            None,
6426            Some(last_message_idx),
6427            Some(last_message_created_at),
6428        )?;
6429        return Ok(Some(ExistingConversationTailState {
6430            last_message_idx,
6431            last_message_created_at,
6432            ended_at: cached_ended_at,
6433        }));
6434    }
6435    Ok(None)
6436}
6437
6438fn existing_conversation_tail_state_from_cached(
6439    last_message_idx: Option<i64>,
6440    last_message_created_at: Option<i64>,
6441    ended_at: Option<i64>,
6442) -> Option<ExistingConversationTailState> {
6443    let (last_message_idx, last_message_created_at) =
6444        last_message_idx.zip(last_message_created_at)?;
6445    Some(ExistingConversationTailState {
6446        last_message_idx,
6447        last_message_created_at,
6448        ended_at,
6449    })
6450}
6451
6452fn franken_find_existing_conversation_with_tail_by_key(
6453    tx: &FrankenTransaction<'_>,
6454    key: &PendingConversationKey,
6455    conv: Option<&Conversation>,
6456) -> Result<Option<ExistingConversationWithTail>> {
6457    if let PendingConversationKey::External {
6458        source_id,
6459        agent_id,
6460        external_id,
6461    } = key
6462    {
6463        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
6464        if let Some(existing) = franken_find_external_conversation_tail_lookup(tx, &lookup_key)? {
6465            return Ok(Some(existing));
6466        }
6467        return Ok(None);
6468    }
6469
6470    let Some(id) = franken_find_existing_conversation_by_key(tx, key, conv)? else {
6471        return Ok(None);
6472    };
6473    let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
6474    Ok(Some(ExistingConversationWithTail { id, tail_state }))
6475}
6476
6477fn franken_insert_conversation_tail_state(
6478    tx: &FrankenTransaction<'_>,
6479    conversation_id: i64,
6480    ended_at: Option<i64>,
6481    last_message_idx: Option<i64>,
6482    last_message_created_at: Option<i64>,
6483) -> Result<()> {
6484    if ended_at.is_none() && last_message_idx.is_none() && last_message_created_at.is_none() {
6485        return Ok(());
6486    }
6487    tx.execute_compat(
6488        "INSERT OR REPLACE INTO conversation_tail_state (
6489             conversation_id, ended_at, last_message_idx, last_message_created_at
6490         ) VALUES (?1, ?2, ?3, ?4)",
6491        fparams![
6492            conversation_id,
6493            ended_at,
6494            last_message_idx,
6495            last_message_created_at
6496        ],
6497    )?;
6498    Ok(())
6499}
6500
6501fn franken_update_conversation_tail_columns(
6502    tx: &FrankenTransaction<'_>,
6503    conversation_id: i64,
6504    ended_at_candidate: Option<i64>,
6505    last_message_idx_candidate: Option<i64>,
6506    last_message_created_at_candidate: Option<i64>,
6507) -> Result<()> {
6508    if ended_at_candidate.is_none()
6509        && last_message_idx_candidate.is_none()
6510        && last_message_created_at_candidate.is_none()
6511    {
6512        return Ok(());
6513    }
6514
6515    tx.execute_compat(
6516        "UPDATE conversations
6517         SET ended_at = CASE
6518                 WHEN ?1 IS NULL THEN ended_at
6519                 WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
6520                 ELSE ended_at
6521             END,
6522             last_message_idx = CASE
6523                 WHEN ?2 IS NULL THEN last_message_idx
6524                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6525                 ELSE last_message_idx
6526             END,
6527             last_message_created_at = CASE
6528                 WHEN ?3 IS NULL THEN last_message_created_at
6529                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6530                 ELSE last_message_created_at
6531             END
6532         WHERE id = ?4",
6533        fparams![
6534            ended_at_candidate,
6535            last_message_idx_candidate,
6536            last_message_created_at_candidate,
6537            conversation_id
6538        ],
6539    )?;
6540    Ok(())
6541}
6542
6543fn franken_tail_state_insert_ended_at(
6544    tx: &FrankenTransaction<'_>,
6545    conversation_id: i64,
6546    candidate: Option<i64>,
6547) -> Result<Option<i64>> {
6548    let canonical: Option<i64> = tx
6549        .query_row_map(
6550            "SELECT ended_at FROM conversations WHERE id = ?1",
6551            fparams![conversation_id],
6552            |row| row.get_typed(0),
6553        )
6554        .optional()?
6555        .flatten();
6556    Ok(canonical.max(candidate))
6557}
6558
6559fn franken_update_conversation_tail_state(
6560    tx: &FrankenTransaction<'_>,
6561    conversation_id: i64,
6562    ended_at_candidate: Option<i64>,
6563    last_message_idx_candidate: Option<i64>,
6564    last_message_created_at_candidate: Option<i64>,
6565) -> Result<()> {
6566    if ended_at_candidate.is_none()
6567        && last_message_idx_candidate.is_none()
6568        && last_message_created_at_candidate.is_none()
6569    {
6570        return Ok(());
6571    }
6572
6573    let changed = tx.execute_compat(
6574        "UPDATE conversation_tail_state
6575         SET ended_at = CASE
6576                 WHEN ?1 IS NULL THEN ended_at
6577                 ELSE MAX(IFNULL(ended_at, 0), ?1)
6578             END,
6579             last_message_idx = CASE
6580                 WHEN ?2 IS NULL THEN last_message_idx
6581                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6582                 ELSE last_message_idx
6583             END,
6584             last_message_created_at = CASE
6585                 WHEN ?3 IS NULL THEN last_message_created_at
6586                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6587                 ELSE last_message_created_at
6588             END
6589         WHERE conversation_id = ?4",
6590        fparams![
6591            ended_at_candidate,
6592            last_message_idx_candidate,
6593            last_message_created_at_candidate,
6594            conversation_id
6595        ],
6596    )?;
6597    if changed == 0 {
6598        let insert_ended_at =
6599            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
6600        franken_insert_conversation_tail_state(
6601            tx,
6602            conversation_id,
6603            insert_ended_at,
6604            last_message_idx_candidate,
6605            last_message_created_at_candidate,
6606        )?;
6607    }
6608    franken_update_conversation_tail_columns(
6609        tx,
6610        conversation_id,
6611        ended_at_candidate,
6612        last_message_idx_candidate,
6613        last_message_created_at_candidate,
6614    )?;
6615    Ok(())
6616}
6617
6618fn franken_set_conversation_tail_state_after_append(
6619    tx: &FrankenTransaction<'_>,
6620    conversation_id: i64,
6621    ended_at: i64,
6622    last_message_idx: i64,
6623    last_message_created_at: i64,
6624) -> Result<()> {
6625    let changed = tx.execute_compat(
6626        "UPDATE conversation_tail_state
6627         SET ended_at = ?1,
6628             last_message_idx = ?2,
6629             last_message_created_at = ?3
6630         WHERE conversation_id = ?4",
6631        fparams![
6632            ended_at,
6633            last_message_idx,
6634            last_message_created_at,
6635            conversation_id
6636        ],
6637    )?;
6638    if changed == 0 {
6639        let insert_ended_at =
6640            franken_tail_state_insert_ended_at(tx, conversation_id, Some(ended_at))?;
6641        franken_insert_conversation_tail_state(
6642            tx,
6643            conversation_id,
6644            insert_ended_at,
6645            Some(last_message_idx),
6646            Some(last_message_created_at),
6647        )?;
6648    }
6649    franken_update_conversation_tail_columns(
6650        tx,
6651        conversation_id,
6652        Some(ended_at),
6653        Some(last_message_idx),
6654        Some(last_message_created_at),
6655    )?;
6656    Ok(())
6657}
6658
6659fn collect_append_only_tail_messages<'a>(
6660    conv: &'a Conversation,
6661    existing_max_idx: i64,
6662    existing_max_created_at: i64,
6663) -> Option<ExistingConversationNewMessages<'a>> {
6664    if conv.messages.is_empty() {
6665        return Some(ExistingConversationNewMessages {
6666            messages: Vec::new(),
6667            new_chars: 0,
6668            idx_collision_count: 0,
6669            first_collision_idx: None,
6670        });
6671    }
6672
6673    let mut split_idx = None;
6674    let mut prev_idx = None;
6675    for (pos, msg) in conv.messages.iter().enumerate() {
6676        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6677            return None;
6678        }
6679        prev_idx = Some(msg.idx);
6680        if split_idx.is_none() && msg.idx > existing_max_idx {
6681            split_idx = Some(pos);
6682        }
6683    }
6684    let split_idx = split_idx?;
6685
6686    let mut seen_tail_idx = HashSet::new();
6687    let mut seen_tail_replay = HashSet::new();
6688    let mut new_chars = 0i64;
6689    let mut messages = Vec::new();
6690    for msg in &conv.messages[split_idx..] {
6691        let created_at = msg.created_at?;
6692        if created_at <= existing_max_created_at {
6693            return None;
6694        }
6695
6696        if !seen_tail_idx.insert(msg.idx) {
6697            return None;
6698        }
6699
6700        let replay_fingerprint = message_replay_fingerprint(msg);
6701        if !seen_tail_replay.insert(replay_fingerprint) {
6702            return None;
6703        }
6704
6705        new_chars += msg.content.len() as i64;
6706        messages.push(msg);
6707    }
6708
6709    Some(ExistingConversationNewMessages {
6710        messages,
6711        new_chars,
6712        idx_collision_count: 0,
6713        first_collision_idx: None,
6714    })
6715}
6716
/// Absolute distance between two optional timestamps.
///
/// Returns `i64::MAX` when either side is missing, and also when the true
/// distance does not fit in an `i64`.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    left.zip(right).map_or(i64::MAX, |(lhs, rhs)| {
        // `abs_diff` cannot overflow; only the narrowing back to i64 can fail.
        i64::try_from(lhs.abs_diff(rhs)).unwrap_or(i64::MAX)
    })
}
6726
6727fn conversation_merge_evidence(
6728    incoming_exact: &HashSet<MessageMergeFingerprint>,
6729    incoming_replay: &HashSet<MessageReplayFingerprint>,
6730    existing_exact: &HashSet<MessageMergeFingerprint>,
6731    existing_replay: &HashSet<MessageReplayFingerprint>,
6732    incoming_started_at: Option<i64>,
6733    existing_started_at: Option<i64>,
6734) -> Option<ConversationMergeEvidence> {
6735    let exact_overlap = incoming_exact.intersection(existing_exact).count();
6736    let replay_overlap = incoming_replay.intersection(existing_replay).count();
6737    if exact_overlap == 0 && replay_overlap == 0 {
6738        return None;
6739    }
6740
6741    let smaller_replay_set = incoming_replay.len().min(existing_replay.len());
6742    let started_close = timestamps_within_tolerance(
6743        incoming_started_at,
6744        existing_started_at,
6745        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
6746    );
6747    let full_replay_subset_match = smaller_replay_set >= 2 && replay_overlap == smaller_replay_set;
6748
6749    let merge_allowed = if started_close {
6750        exact_overlap >= 1 || replay_overlap >= 2
6751    } else {
6752        exact_overlap >= 2 || full_replay_subset_match
6753    };
6754
6755    merge_allowed.then_some(ConversationMergeEvidence {
6756        exact_overlap,
6757        replay_overlap,
6758        smaller_replay_set,
6759        started_close,
6760        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
6761    })
6762}
6763
/// Whether two timestamps are both present and differ by at most
/// `tolerance_ms`.
///
/// A missing timestamp on either side, or a negative tolerance, yields
/// `false`.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    let (Some(lhs), Some(rhs)) = (left, right) else {
        return false;
    };
    // Compare in i128 so neither the distance nor a negative tolerance wraps.
    i128::from(lhs.abs_diff(rhs)) <= i128::from(tolerance_ms)
}
6772
6773fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
6774    if let Some(external_id) = conv.external_id.clone() {
6775        PendingConversationKey::External {
6776            source_id: conv.source_id.clone(),
6777            agent_id,
6778            external_id,
6779        }
6780    } else {
6781        PendingConversationKey::SourcePath {
6782            source_id: conv.source_id.clone(),
6783            agent_id,
6784            source_path: path_to_string(&conv.source_path),
6785            started_at: conversation_effective_started_at(conv),
6786        }
6787    }
6788}
6789
/// Message data needed for semantic embedding generation.
///
/// NOTE(review): the field notes below are inferred from names and types
/// only; confirm them against the query that populates this struct.
pub struct MessageForEmbedding {
    /// Identifier of the message (presumably the `messages` row id).
    pub message_id: i64,
    /// Message timestamp, if known (presumably milliseconds since the Unix
    /// epoch, like the other timestamps in this module — TODO confirm).
    pub created_at: Option<i64>,
    /// Identifier of the agent that owns the message's conversation
    /// (presumably an `agents` row id).
    pub agent_id: i64,
    /// Identifier of the conversation's workspace, if it has one
    /// (presumably a `workspaces` row id).
    pub workspace_id: Option<i64>,
    /// 32-bit hash standing in for the source id; the hash function is not
    /// visible here.
    pub source_id_hash: u32,
    /// Message role as stored text.
    pub role: String,
    /// Message text to embed.
    pub content: String,
}
6800
6801// =========================================================================
6802// FrankenStorage CRUD operations
6803// =========================================================================
6804
6805impl FrankenStorage {
6806    /// Ensure an agent exists in the database, returning its ID.
6807    pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
6808        let cache_key = EnsuredAgentKey::from_agent(agent);
6809        if let Some(id) = self.cached_agent_id(&cache_key) {
6810            return Ok(id);
6811        }
6812
6813        let now = Self::now_millis();
6814        self.conn.execute_compat(
6815            "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
6816             VALUES(?1, ?2, ?3, ?4, ?5, ?6)
6817             ON CONFLICT(slug) DO UPDATE SET
6818                 name = excluded.name,
6819                 version = excluded.version,
6820                 kind = excluded.kind,
6821                 updated_at = excluded.updated_at
6822             WHERE NOT (
6823                 agents.name IS excluded.name
6824                 AND agents.version IS excluded.version
6825                 AND agents.kind IS excluded.kind
6826             )",
6827            fparams![
6828                agent.slug.as_str(),
6829                agent.name.as_str(),
6830                agent.version.as_deref(),
6831                cache_key.kind.as_str(),
6832                now,
6833                now
6834            ],
6835        )?;
6836
6837        let id = self
6838            .conn
6839            .query_row_map(
6840                "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
6841                fparams![agent.slug.as_str()],
6842                |row| row.get_typed(0),
6843            )
6844            .with_context(|| format!("fetching agent id for {}", agent.slug))?;
6845        self.mark_agent_ensured(cache_key, id);
6846        Ok(id)
6847    }
6848
6849    /// Ensure a workspace exists in the database, returning its ID.
6850    pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
6851        let path_str = path.to_string_lossy().to_string();
6852        let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
6853        if let Some(id) = self.cached_workspace_id(&cache_key) {
6854            return Ok(id);
6855        }
6856
6857        if let Some(display_name) = display_name {
6858            self.conn.execute_compat(
6859                "INSERT INTO workspaces(path, display_name)
6860                 VALUES(?1, ?2)
6861                 ON CONFLICT(path) DO UPDATE SET
6862                     display_name = excluded.display_name
6863                 WHERE NOT (workspaces.display_name IS excluded.display_name)",
6864                fparams![path_str.as_str(), display_name],
6865            )?;
6866        } else {
6867            self.conn.execute_compat(
6868                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
6869                fparams![path_str.as_str()],
6870            )?;
6871        }
6872
6873        let id = self
6874            .conn
6875            .query_row_map(
6876                "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
6877                fparams![path_str.as_str()],
6878                |row| row.get_typed(0),
6879            )
6880            .with_context(|| format!("fetching workspace id for {path_str}"))?;
6881        self.mark_workspace_ensured(cache_key, id);
6882        Ok(id)
6883    }
6884
6885    /// Get current time as milliseconds since epoch.
6886    pub fn now_millis() -> i64 {
6887        SystemTime::now()
6888            .duration_since(UNIX_EPOCH)
6889            .map(|d| i64::try_from(d.as_millis()).unwrap_or(i64::MAX))
6890            .unwrap_or(0)
6891    }
6892
6893    /// Convert a millisecond timestamp to a day ID (days since 2020-01-01).
6894    pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
6895        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6896        let secs = timestamp_ms.div_euclid(1000);
6897        (secs - EPOCH_2020_SECS).div_euclid(86400)
6898    }
6899
6900    /// Convert a millisecond timestamp to an hour ID (hours since 2020-01-01 00:00 UTC).
6901    pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
6902        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6903        let secs = timestamp_ms.div_euclid(1000);
6904        (secs - EPOCH_2020_SECS).div_euclid(3600)
6905    }
6906
6907    /// Convert a day ID back to milliseconds (start of day).
6908    pub fn millis_from_day_id(day_id: i64) -> i64 {
6909        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6910        (EPOCH_2020_SECS + day_id * 86400) * 1000
6911    }
6912
6913    /// Convert an hour ID back to milliseconds (start of hour).
6914    pub fn millis_from_hour_id(hour_id: i64) -> i64 {
6915        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6916        (EPOCH_2020_SECS + hour_id * 3600) * 1000
6917    }
6918
6919    /// Get the timestamp of the last successful scan.
6920    pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
6921        let result: Result<String, _> = self.conn.query_row_map(
6922            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
6923            fparams![],
6924            |row| row.get_typed(0),
6925        );
6926        match result.optional() {
6927            Ok(Some(s)) => Ok(s.parse().ok()),
6928            Ok(None) => Ok(None),
6929            Err(e) => Err(e.into()),
6930        }
6931    }
6932
6933    /// Set the timestamp of the last successful scan (milliseconds since epoch).
6934    pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
6935        self.conn.execute_compat(
6936            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
6937            fparams![ts.to_string()],
6938        )?;
6939        Ok(())
6940    }
6941
6942    /// Get the timestamp of the last successful index completion.
6943    pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
6944        let result: Result<String, _> = self.conn.query_row_map(
6945            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
6946            fparams![],
6947            |row| row.get_typed(0),
6948        );
6949        match result.optional() {
6950            Ok(Some(s)) => Ok(s.parse().ok()),
6951            Ok(None) => Ok(None),
6952            Err(e) => Err(e.into()),
6953        }
6954    }
6955
6956    /// Set the timestamp of the last successful index completion (milliseconds since epoch).
6957    pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
6958        self.conn.execute_compat(
6959            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
6960            fparams![ts.to_string()],
6961        )?;
6962        Ok(())
6963    }
6964
6965    /// List all registered agents.
6966    pub fn list_agents(&self) -> Result<Vec<Agent>> {
6967        self.conn
6968            .query_map_collect(
6969                "SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
6970                fparams![],
6971                |row| {
6972                    let kind: String = row.get_typed(4)?;
6973                    Ok(Agent {
6974                        id: Some(row.get_typed(0)?),
6975                        slug: row.get_typed(1)?,
6976                        name: row.get_typed(2)?,
6977                        version: row.get_typed(3)?,
6978                        kind: match kind.as_str() {
6979                            "cli" => AgentKind::Cli,
6980                            "vscode" => AgentKind::VsCode,
6981                            _ => AgentKind::Hybrid,
6982                        },
6983                    })
6984                },
6985            )
6986            .with_context(|| "listing agents")
6987    }
6988
6989    /// Count all archived conversations.
6990    pub fn total_conversation_count(&self) -> Result<usize> {
6991        let count: i64 =
6992            self.conn
6993                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6994                    row.get_typed(0)
6995                })?;
6996        Ok(count.max(0) as usize)
6997    }
6998
6999    /// Count all archived messages.
7000    pub fn total_message_count(&self) -> Result<usize> {
7001        let count: i64 =
7002            self.conn
7003                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
7004                    row.get_typed(0)
7005                })?;
7006        Ok(count.max(0) as usize)
7007    }
7008
7009    /// Remove all archived conversations/messages for one agent slug.
7010    ///
7011    /// This only affects cass's local archive database. Source session files on
7012    /// disk are untouched.
7013    pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
7014        let normalized = agent_slug.trim().to_ascii_lowercase();
7015        if normalized.is_empty() {
7016            return Err(anyhow!("agent slug cannot be empty"));
7017        }
7018
7019        let Some(agent_id) = self
7020            .conn
7021            .query_row_map(
7022                "SELECT id FROM agents WHERE slug = ?1",
7023                fparams![normalized.as_str()],
7024                |row| row.get_typed::<i64>(0),
7025            )
7026            .optional()?
7027        else {
7028            return Ok(AgentArchivePurgeResult::default());
7029        };
7030
7031        let conversations_deleted: i64 = self.conn.query_row_map(
7032            "SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
7033            fparams![agent_id],
7034            |row| row.get_typed(0),
7035        )?;
7036        if conversations_deleted == 0 {
7037            return Ok(AgentArchivePurgeResult::default());
7038        }
7039
7040        let messages_deleted: i64 = self.conn.query_row_map(
7041            "SELECT COUNT(*)
7042             FROM messages
7043             WHERE conversation_id IN (
7044                 SELECT id FROM conversations WHERE agent_id = ?1
7045             )",
7046            fparams![agent_id],
7047            |row| row.get_typed(0),
7048        )?;
7049
7050        let mut tx = self.conn.transaction()?;
7051        tx.execute_compat(
7052            "DELETE FROM conversation_external_lookup
7053             WHERE conversation_id IN (
7054                 SELECT id FROM conversations WHERE agent_id = ?1
7055             )",
7056            fparams![agent_id],
7057        )?;
7058        tx.execute_compat(
7059            "DELETE FROM conversation_external_tail_lookup
7060             WHERE conversation_id IN (
7061                 SELECT id FROM conversations WHERE agent_id = ?1
7062             )",
7063            fparams![agent_id],
7064        )?;
7065        tx.execute_compat(
7066            "DELETE FROM conversations WHERE agent_id = ?1",
7067            fparams![agent_id],
7068        )?;
7069        tx.execute_compat(
7070            "DELETE FROM agents
7071             WHERE id = ?1
7072               AND NOT EXISTS (
7073                   SELECT 1 FROM conversations WHERE agent_id = ?1
7074               )",
7075            fparams![agent_id],
7076        )?;
7077        tx.commit()?;
7078
7079        Ok(AgentArchivePurgeResult {
7080            conversations_deleted: conversations_deleted.max(0) as usize,
7081            messages_deleted: messages_deleted.max(0) as usize,
7082        })
7083    }
7084
7085    /// List all registered workspaces.
7086    pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
7087        self.conn
7088            .query_map_collect(
7089                "SELECT id, path, display_name FROM workspaces ORDER BY path",
7090                fparams![],
7091                |row| {
7092                    let path_str: String = row.get_typed(1)?;
7093                    Ok(crate::model::types::Workspace {
7094                        id: Some(row.get_typed(0)?),
7095                        path: Path::new(&path_str).to_path_buf(),
7096                        display_name: row.get_typed(2)?,
7097                    })
7098                },
7099            )
7100            .with_context(|| "listing workspaces")
7101    }
7102
7103    /// List conversations with pagination.
7104    pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
7105        // Avoid the multi-table JOIN with LIMIT/OFFSET that triggers
7106        // frankensqlite's materialization fallback (see c38edcd9, 860acb12).
7107        // Use correlated subqueries for the tiny agents (~20 rows) and
7108        // workspaces (~30 rows) lookup tables and degrade NULL agent_id to
7109        // the same 'unknown' sentinel that 8a0c547c established for the
7110        // lexical rebuild path.
7111        self.conn
7112            .query_map_collect(
7113                r"SELECT c.id,
7114                         COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
7115                         (SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
7116                         c.external_id, c.title, c.source_path,
7117                         c.started_at,
7118                         COALESCE(
7119                             (SELECT ts.ended_at
7120                              FROM conversation_tail_state ts
7121                              WHERE ts.conversation_id = c.id),
7122                             c.ended_at
7123                         ),
7124                         c.approx_tokens, c.metadata_json,
7125                         c.source_id, c.origin_host, c.metadata_bin
7126                FROM conversations c
7127                ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
7128                LIMIT ?1 OFFSET ?2",
7129                fparams![limit, offset],
7130                |row| {
7131                    let workspace_path: Option<String> = row.get_typed(2)?;
7132                    let source_path: String = row.get_typed(5)?;
7133                    let raw_source_id: Option<String> = row.get_typed(10)?;
7134                    let raw_origin_host: Option<String> = row.get_typed(11)?;
7135                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7136                        raw_source_id.as_deref(),
7137                        None,
7138                        raw_origin_host.as_deref(),
7139                    );
7140                    Ok(Conversation {
7141                        id: Some(row.get_typed(0)?),
7142                        agent_slug: row.get_typed(1)?,
7143                        workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
7144                        external_id: row.get_typed(3)?,
7145                        title: row.get_typed(4)?,
7146                        source_path: Path::new(&source_path).to_path_buf(),
7147                        started_at: row.get_typed(6)?,
7148                        ended_at: row.get_typed(7)?,
7149                        approx_tokens: row.get_typed(8)?,
7150                        metadata_json: franken_read_metadata_compat(row, 9, 12),
7151                        messages: Vec::new(),
7152                        source_id,
7153                        origin_host,
7154                    })
7155                },
7156            )
7157            .with_context(|| "listing conversations")
7158    }
7159
7160    /// Build lookup maps for agents and workspaces to avoid JOINs in
7161    /// paged conversation queries.  Both tables are tiny (tens of rows)
7162    /// so this is effectively free.
7163    pub fn build_lexical_rebuild_lookups(
7164        &self,
7165    ) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
7166        let agents: HashMap<i64, String> = self
7167            .conn
7168            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
7169                Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
7170            })
7171            .with_context(|| "loading agent lookup for lexical rebuild")?
7172            .into_iter()
7173            .collect();
7174        let workspaces: HashMap<i64, PathBuf> = self
7175            .conn
7176            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
7177                let path_str: String = row.get_typed(1)?;
7178                Ok((row.get_typed::<i64>(0)?, PathBuf::from(path_str)))
7179            })
7180            .with_context(|| "loading workspace lookup for lexical rebuild")?
7181            .into_iter()
7182            .collect();
7183        Ok((agents, workspaces))
7184    }
7185
7186    /// List per-conversation message footprints in primary-key order.
7187    ///
7188    /// This deliberately avoids rebuild-path JOINs. Instead we merge ordered
7189    /// single-table reads over `conversations` and the narrow
7190    /// `conversation_tail_state` cache in Rust, then use `last_message_idx + 1`
7191    /// as a planning estimate.
7192    ///
7193    /// The planner only needs a sizing heuristic; exact message and byte
7194    /// accounting is performed later by the rebuild packet pipeline as it reads
7195    /// message content for indexing. Rows missing both tail-cache sources fall
7196    /// back to `MAX(messages.idx) + 1`, which preserves legacy upgraded
7197    /// databases without treating populated conversations as empty.
7198    pub fn list_conversation_footprints_for_lexical_rebuild(
7199        &self,
7200    ) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
7201        let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
7202            "SELECT conversation_id, last_message_idx
7203             FROM conversation_tail_state
7204             ORDER BY conversation_id ASC",
7205            fparams![],
7206            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
7207        ) {
7208            Ok(rows) => rows,
7209            Err(err) if error_indicates_missing_table(&err) => Vec::new(),
7210            Err(err) => {
7211                return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
7212            }
7213        };
7214        let tail_state_by_conversation: HashMap<i64, Option<i64>> =
7215            tail_state_rows.into_iter().collect();
7216
7217        let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
7218            "SELECT id, last_message_idx
7219             FROM conversations
7220             ORDER BY id ASC",
7221            fparams![],
7222            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
7223        ) {
7224            Ok(rows) => rows,
7225            Err(err) if error_indicates_missing_column(&err) => self
7226                .conn
7227                .query_map_collect(
7228                    "SELECT id
7229                     FROM conversations
7230                     ORDER BY id ASC",
7231                    fparams![],
7232                    |row| Ok((row.get_typed::<i64>(0)?, None)),
7233                )
7234                .with_context(|| {
7235                    "listing lexical rebuild conversation ids after missing tail column fallback"
7236                })?,
7237            Err(err) => {
7238                return Err(err)
7239                    .with_context(|| "listing lexical rebuild conversation footprint estimates");
7240            }
7241        };
7242
7243        let mut footprints = Vec::with_capacity(rows.len());
7244        let mut missing_tail_positions = HashMap::new();
7245        for (conversation_id, conversation_last_message_idx) in rows {
7246            let last_message_idx = tail_state_by_conversation
7247                .get(&conversation_id)
7248                .copied()
7249                .flatten()
7250                .or(conversation_last_message_idx);
7251            let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7252            else {
7253                missing_tail_positions.insert(conversation_id, footprints.len());
7254                footprints.push(LexicalRebuildConversationFootprintRow {
7255                    conversation_id,
7256                    message_count: 0,
7257                    message_bytes: 0,
7258                });
7259                continue;
7260            };
7261            footprints.push(lexical_rebuild_conversation_footprint_from_count(
7262                conversation_id,
7263                message_count,
7264            ));
7265        }
7266
7267        let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
7268        if !missing_tail_positions.is_empty() {
7269            self.fill_missing_lexical_rebuild_footprint_tails(
7270                &mut footprints,
7271                &missing_tail_positions,
7272            )?;
7273        }
7274        if !every_footprint_was_missing_tail {
7275            self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
7276        }
7277
7278        Ok(footprints)
7279    }
7280
7281    pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
7282        let total_conversations: i64 = self
7283            .conn
7284            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
7285                row.get_typed(0)
7286            })
7287            .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
7288        let total_conversations = usize::try_from(total_conversations.max(0)).unwrap_or(usize::MAX);
7289        if total_conversations == 0 {
7290            return Ok(true);
7291        }
7292
7293        let conversation_columns = franken_table_column_names(&self.conn, "conversations")?;
7294        let conversations_have_tail_column = conversation_columns.contains("last_message_idx");
7295        let tail_state_has_tail_column =
7296            match franken_table_column_names(&self.conn, "conversation_tail_state") {
7297                Ok(columns) => columns.contains("last_message_idx"),
7298                Err(err) if error_indicates_missing_table(&err) => false,
7299                Err(err) => {
7300                    return Err(err)
7301                        .with_context(|| "reading lexical rebuild tail-state metadata columns");
7302                }
7303            };
7304        if !conversations_have_tail_column && !tail_state_has_tail_column {
7305            return Ok(false);
7306        }
7307
7308        let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
7309            (true, true) => {
7310                "SELECT COUNT(*)
7311                 FROM conversations c
7312                 LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
7313                 WHERE c.last_message_idx IS NOT NULL
7314                    OR ts.last_message_idx IS NOT NULL"
7315            }
7316            (true, false) => {
7317                "SELECT COUNT(*)
7318                 FROM conversations
7319                 WHERE last_message_idx IS NOT NULL"
7320            }
7321            (false, true) => {
7322                "SELECT COUNT(*)
7323                 FROM conversations c
7324                 WHERE EXISTS (
7325                     SELECT 1
7326                     FROM conversation_tail_state ts
7327                     WHERE ts.conversation_id = c.id
7328                       AND ts.last_message_idx IS NOT NULL
7329                 )"
7330            }
7331            (false, false) => unreachable!("checked before covered_sql selection"),
7332        };
7333        let covered_conversations: i64 = self
7334            .conn
7335            .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
7336            .with_context(
7337                || "counting conversations covered by lexical rebuild tail footprint metadata",
7338            )?;
7339        let covered_conversations =
7340            usize::try_from(covered_conversations.max(0)).unwrap_or(usize::MAX);
7341
7342        Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
7343            total_conversations,
7344            covered_conversations,
7345        ))
7346    }
7347
7348    fn raise_lexical_rebuild_footprints_to_exact_message_counts(
7349        &self,
7350        footprints: &mut [LexicalRebuildConversationFootprintRow],
7351    ) -> Result<()> {
7352        if footprints.is_empty() {
7353            return Ok(());
7354        }
7355
7356        let positions_by_conversation: HashMap<i64, usize> = footprints
7357            .iter()
7358            .enumerate()
7359            .map(|(position, footprint)| (footprint.conversation_id, position))
7360            .collect();
7361        self.conn
7362            .query_with_params_for_each(
7363                "SELECT conversation_id, COUNT(*) AS message_count
7364                 FROM messages
7365                 GROUP BY conversation_id
7366                 ORDER BY conversation_id ASC",
7367                &[] as &[SqliteValue],
7368                |row| {
7369                    let conversation_id: i64 = row.get_typed(0)?;
7370                    let exact_count: i64 = row.get_typed(1)?;
7371                    let Some(position) = positions_by_conversation.get(&conversation_id) else {
7372                        return Ok(());
7373                    };
7374                    let exact_count = usize::try_from(exact_count.max(0)).unwrap_or(usize::MAX);
7375                    let footprint = &mut footprints[*position];
7376                    if exact_count > footprint.message_count {
7377                        footprint.message_count = exact_count;
7378                        footprint.message_bytes =
7379                            footprint.message_bytes.max(exact_count.saturating_mul(
7380                                LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
7381                            ));
7382                    }
7383                    Ok(())
7384                },
7385            )
7386            .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
7387        Ok(())
7388    }
7389
7390    fn fill_missing_lexical_rebuild_footprint_tails(
7391        &self,
7392        footprints: &mut [LexicalRebuildConversationFootprintRow],
7393        missing_tail_positions: &HashMap<i64, usize>,
7394    ) -> Result<()> {
7395        if missing_tail_positions.len() <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT {
7396            for (conversation_id, position) in missing_tail_positions {
7397                let last_message_idx: Option<i64> = self
7398                    .conn
7399                    .query_row_map(
7400                        "SELECT MAX(idx) FROM messages WHERE conversation_id = ?1",
7401                        fparams![*conversation_id],
7402                        |row| row.get_typed(0),
7403                    )
7404                    .with_context(|| {
7405                        format!(
7406                            "looking up missing lexical rebuild tail estimate for conversation {conversation_id}"
7407                        )
7408                    })?;
7409                if let Some(message_count) =
7410                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7411                {
7412                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7413                        *conversation_id,
7414                        message_count,
7415                    );
7416                }
7417            }
7418            return Ok(());
7419        }
7420
7421        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7422            footprints,
7423            missing_tail_positions,
7424            "SELECT conversation_id, MAX(idx) AS last_message_idx
7425             FROM messages INDEXED BY idx_messages_conv_idx
7426             GROUP BY conversation_id
7427             ORDER BY conversation_id ASC",
7428        )
7429        .or_else(|err| {
7430            if err
7431                .to_string()
7432                .contains("no such index: idx_messages_conv_idx")
7433            {
7434                return self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7435                    footprints,
7436                    missing_tail_positions,
7437                    "SELECT conversation_id, MAX(idx) AS last_message_idx
7438                     FROM messages
7439                     GROUP BY conversation_id
7440                     ORDER BY conversation_id ASC",
7441                );
7442            }
7443            Err(err)
7444        })
7445        .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
7446
7447        Ok(())
7448    }
7449
7450    fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7451        &self,
7452        footprints: &mut [LexicalRebuildConversationFootprintRow],
7453        missing_tail_positions: &HashMap<i64, usize>,
7454        sql: &str,
7455    ) -> Result<()> {
7456        self.conn
7457            .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
7458                let conversation_id: i64 = row.get_typed(0)?;
7459                let last_message_idx: Option<i64> = row.get_typed(1)?;
7460                let Some(position) = missing_tail_positions.get(&conversation_id) else {
7461                    return Ok(());
7462                };
7463                if let Some(message_count) =
7464                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7465                {
7466                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7467                        conversation_id,
7468                        message_count,
7469                    );
7470                }
7471                Ok(())
7472            })
7473            .with_context(|| "grouping lexical rebuild missing tail estimates")
7474    }
7475
7476    /// List conversation ids in the stable order used by lexical rebuilds.
7477    pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
7478        self.conn
7479            .query_map_collect(
7480                "SELECT id FROM conversations ORDER BY id ASC",
7481                fparams![],
7482                |row| row.get_typed(0),
7483            )
7484            .with_context(|| "listing conversation ids for lexical rebuild")
7485    }
7486    /// Legacy OFFSET-based traversal for one-time checkpoint migration only.
7487    ///
7488    /// New code must use `list_conversations_for_lexical_rebuild_after_id`
7489    /// for keyset pagination.
7490    pub fn list_conversations_for_lexical_rebuild_by_offset(
7491        &self,
7492        limit: i64,
7493        offset: i64,
7494        agent_slugs: &HashMap<i64, String>,
7495        workspace_paths: &HashMap<i64, PathBuf>,
7496    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7497        // Single-table query avoids the 3-table JOIN that triggers
7498        // frankensqlite's full-materialization fallback path.
7499        self.conn
7500            .query_map_collect(
7501                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7502                       started_at,
7503                       COALESCE(
7504                           (SELECT ts.ended_at
7505                            FROM conversation_tail_state ts
7506                            WHERE ts.conversation_id = conversations.id),
7507                           ended_at
7508                       ),
7509                       source_id, origin_host
7510                FROM conversations
7511                ORDER BY id ASC
7512                LIMIT ?1 OFFSET ?2",
7513                fparams![limit, offset],
7514                |row| {
7515                    let agent_id: Option<i64> = row.get_typed(1)?;
7516                    let workspace_id: Option<i64> = row.get_typed(2)?;
7517                    let source_path: String = row.get_typed(5)?;
7518                    let raw_source_id: Option<String> = row.get_typed(8)?;
7519                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7520                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7521                        raw_source_id.as_deref(),
7522                        None,
7523                        raw_origin_host.as_deref(),
7524                    );
7525                    Ok(LexicalRebuildConversationRow {
7526                        id: Some(row.get_typed(0)?),
7527                        agent_slug: agent_id
7528                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7529                            .unwrap_or_else(|| "unknown".to_string()),
7530                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7531                        external_id: row.get_typed(3)?,
7532                        title: row.get_typed(4)?,
7533                        source_path: Path::new(&source_path).to_path_buf(),
7534                        started_at: row.get_typed(6)?,
7535                        ended_at: row.get_typed(7)?,
7536                        source_id,
7537                        origin_host,
7538                    })
7539                },
7540            )
7541            .with_context(|| "listing conversations for lexical rebuild")
7542    }
7543
7544    /// List lexical rebuild conversations strictly after the given primary key.
7545    ///
7546    /// Keyset pagination keeps later rebuild pages as cheap as earlier ones,
7547    /// avoiding the ever-growing `OFFSET` scan cost during large rebuilds.
7548    pub fn list_conversations_for_lexical_rebuild_after_id(
7549        &self,
7550        limit: i64,
7551        after_conversation_id: i64,
7552        agent_slugs: &HashMap<i64, String>,
7553        workspace_paths: &HashMap<i64, PathBuf>,
7554    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7555        self.conn
7556            .query_map_collect(
7557                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7558                       started_at,
7559                       COALESCE(
7560                           (SELECT ts.ended_at
7561                            FROM conversation_tail_state ts
7562                            WHERE ts.conversation_id = conversations.id),
7563                           ended_at
7564                       ),
7565                       source_id, origin_host
7566                FROM conversations
7567                WHERE id > ?2
7568                ORDER BY id ASC
7569                LIMIT ?1",
7570                fparams![limit, after_conversation_id],
7571                |row| {
7572                    let agent_id: Option<i64> = row.get_typed(1)?;
7573                    let workspace_id: Option<i64> = row.get_typed(2)?;
7574                    let source_path: String = row.get_typed(5)?;
7575                    let raw_source_id: Option<String> = row.get_typed(8)?;
7576                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7577                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7578                        raw_source_id.as_deref(),
7579                        None,
7580                        raw_origin_host.as_deref(),
7581                    );
7582                    Ok(LexicalRebuildConversationRow {
7583                        id: Some(row.get_typed(0)?),
7584                        agent_slug: agent_id
7585                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7586                            .unwrap_or_else(|| "unknown".to_string()),
7587                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7588                        external_id: row.get_typed(3)?,
7589                        title: row.get_typed(4)?,
7590                        source_path: Path::new(&source_path).to_path_buf(),
7591                        started_at: row.get_typed(6)?,
7592                        ended_at: row.get_typed(7)?,
7593                        source_id,
7594                        origin_host,
7595                    })
7596                },
7597            )
7598            .with_context(|| {
7599                format!(
7600                    "listing conversations for lexical rebuild after id {after_conversation_id}"
7601                )
7602            })
7603    }
7604
7605    /// List lexical rebuild conversations inside an `(after_id, through_id]`
7606    /// primary-key window.
7607    ///
7608    /// This lets the rebuild producer respect planned shard boundaries without
7609    /// falling back to client-side trimming or multi-table joins.
7610    pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
7611        &self,
7612        limit: i64,
7613        after_conversation_id: i64,
7614        through_conversation_id: i64,
7615        agent_slugs: &HashMap<i64, String>,
7616        workspace_paths: &HashMap<i64, PathBuf>,
7617    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7618        if through_conversation_id <= after_conversation_id {
7619            return Ok(Vec::new());
7620        }
7621        self.conn
7622            .query_map_collect(
7623                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7624                       started_at,
7625                       COALESCE(
7626                           (SELECT ts.ended_at
7627                            FROM conversation_tail_state ts
7628                            WHERE ts.conversation_id = conversations.id),
7629                           ended_at
7630                       ),
7631                       source_id, origin_host
7632                FROM conversations
7633                WHERE id > ?2 AND id <= ?3
7634                ORDER BY id ASC
7635                LIMIT ?1",
7636                fparams![limit, after_conversation_id, through_conversation_id],
7637                |row| {
7638                    let agent_id: Option<i64> = row.get_typed(1)?;
7639                    let workspace_id: Option<i64> = row.get_typed(2)?;
7640                    let source_path: String = row.get_typed(5)?;
7641                    let raw_source_id: Option<String> = row.get_typed(8)?;
7642                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7643                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7644                        raw_source_id.as_deref(),
7645                        None,
7646                        raw_origin_host.as_deref(),
7647                    );
7648                    Ok(LexicalRebuildConversationRow {
7649                        id: Some(row.get_typed(0)?),
7650                        agent_slug: agent_id
7651                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7652                            .unwrap_or_else(|| "unknown".to_string()),
7653                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7654                        external_id: row.get_typed(3)?,
7655                        title: row.get_typed(4)?,
7656                        source_path: Path::new(&source_path).to_path_buf(),
7657                        started_at: row.get_typed(6)?,
7658                        ended_at: row.get_typed(7)?,
7659                        source_id,
7660                        origin_host,
7661                    })
7662                },
7663            )
7664            .with_context(|| {
7665                format!(
7666                    "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
7667                )
7668            })
7669    }
7670
7671    /// Fetch messages for a conversation.
7672    pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
7673        let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7674             FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7675             WHERE conversation_id = ?1 ORDER BY idx";
7676        let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7677             FROM messages \
7678             WHERE conversation_id = ?1 ORDER BY idx";
7679
7680        self.conn
7681            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7682                let role: String = row.get_typed(2)?;
7683                Ok(Message {
7684                    id: Some(row.get_typed(0)?),
7685                    idx: row.get_typed(1)?,
7686                    role: match role.as_str() {
7687                        "user" => MessageRole::User,
7688                        "agent" | "assistant" => MessageRole::Agent,
7689                        "tool" => MessageRole::Tool,
7690                        "system" => MessageRole::System,
7691                        other => MessageRole::Other(other.to_string()),
7692                    },
7693                    author: row.get_typed(3)?,
7694                    created_at: row.get_typed(4)?,
7695                    content: row.get_typed(5)?,
7696                    extra_json: franken_read_message_extra_compat(row, 6, 7),
7697                    snippets: Vec::new(),
7698                })
7699            })
7700            .or_else(|err| {
7701                if err
7702                    .to_string()
7703                    .contains("no such index: sqlite_autoindex_messages_1")
7704                {
7705                    return self.conn.query_map_collect(
7706                        fallback_sql,
7707                        fparams![conversation_id],
7708                        |row| {
7709                            let role: String = row.get_typed(2)?;
7710                            Ok(Message {
7711                                id: Some(row.get_typed(0)?),
7712                                idx: row.get_typed(1)?,
7713                                role: match role.as_str() {
7714                                    "user" => MessageRole::User,
7715                                    "agent" | "assistant" => MessageRole::Agent,
7716                                    "tool" => MessageRole::Tool,
7717                                    "system" => MessageRole::System,
7718                                    other => MessageRole::Other(other.to_string()),
7719                                },
7720                                author: row.get_typed(3)?,
7721                                created_at: row.get_typed(4)?,
7722                                content: row.get_typed(5)?,
7723                                extra_json: franken_read_message_extra_compat(row, 6, 7),
7724                                snippets: Vec::new(),
7725                            })
7726                        },
7727                    );
7728                }
7729                Err(err)
7730            })
7731            .with_context(|| format!("fetching messages for conversation {conversation_id}"))
7732    }
7733
7734    /// Fetch messages for lexical index rebuilds without deserializing extra metadata.
7735    ///
7736    /// Tantivy only needs message text and core envelope fields, so avoiding
7737    /// `extra_json` here prevents rebuilds from rehydrating enormous historical
7738    /// payloads that are irrelevant to lexical search.
7739    pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
7740        let hinted_sql = "SELECT id, idx, role, author, created_at, content \
7741                 FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7742                 WHERE conversation_id = ?1 ORDER BY idx";
7743        let fallback_sql = "SELECT id, idx, role, author, created_at, content \
7744                 FROM messages \
7745                 WHERE conversation_id = ?1 ORDER BY idx";
7746
7747        self.conn
7748            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7749                let role: String = row.get_typed(2)?;
7750                Ok(Message {
7751                    id: Some(row.get_typed(0)?),
7752                    idx: row.get_typed(1)?,
7753                    role: match role.as_str() {
7754                        "user" => MessageRole::User,
7755                        "agent" | "assistant" => MessageRole::Agent,
7756                        "tool" => MessageRole::Tool,
7757                        "system" => MessageRole::System,
7758                        other => MessageRole::Other(other.to_string()),
7759                    },
7760                    author: row.get_typed(3)?,
7761                    created_at: row.get_typed(4)?,
7762                    content: row.get_typed(5)?,
7763                    extra_json: serde_json::Value::Null,
7764                    snippets: Vec::new(),
7765                })
7766            })
7767            .or_else(|err| {
7768                if err
7769                    .to_string()
7770                    .contains("no such index: sqlite_autoindex_messages_1")
7771                {
7772                    return self.conn.query_map_collect(
7773                        fallback_sql,
7774                        fparams![conversation_id],
7775                        |row| {
7776                            let role: String = row.get_typed(2)?;
7777                            Ok(Message {
7778                                id: Some(row.get_typed(0)?),
7779                                idx: row.get_typed(1)?,
7780                                role: match role.as_str() {
7781                                    "user" => MessageRole::User,
7782                                    "agent" | "assistant" => MessageRole::Agent,
7783                                    "tool" => MessageRole::Tool,
7784                                    "system" => MessageRole::System,
7785                                    other => MessageRole::Other(other.to_string()),
7786                                },
7787                                author: row.get_typed(3)?,
7788                                created_at: row.get_typed(4)?,
7789                                content: row.get_typed(5)?,
7790                                extra_json: serde_json::Value::Null,
7791                                snippets: Vec::new(),
7792                            })
7793                        },
7794                    );
7795                }
7796                Err(err)
7797            })
7798            .with_context(|| {
7799                format!("fetching messages for lexical rebuild of conversation {conversation_id}")
7800            })
7801    }
7802
7803    /// Fetch messages for multiple conversations during lexical rebuilds.
7804    ///
7805    /// This preserves the lightweight lexical-rebuild projection while avoiding
7806    /// one round-trip per conversation when rebuilding large canonical indexes.
7807    pub fn fetch_messages_for_lexical_rebuild_batch(
7808        &self,
7809        conversation_ids: &[i64],
7810        max_messages: Option<usize>,
7811        max_content_bytes: Option<usize>,
7812    ) -> Result<HashMap<i64, Vec<Message>>> {
7813        if conversation_ids.is_empty() {
7814            return Ok(HashMap::new());
7815        }
7816
7817        let mut grouped: HashMap<i64, Vec<Message>> =
7818            HashMap::with_capacity(conversation_ids.len());
7819        let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
7820        let mut total_messages = 0usize;
7821        let mut total_content_bytes = 0usize;
7822
7823        // The apparent single-query shape (`WHERE conversation_id IN (...) ORDER BY ...`)
7824        // is a bad frankensqlite plan for large live databases: it can
7825        // materialize far more of `messages` than the requested conversations.
7826        // Reuse the hinted per-conversation primary-key lookup instead.
7827        for conversation_id in conversation_ids {
7828            if !fetched_conversation_ids.insert(*conversation_id) {
7829                continue;
7830            }
7831
7832            let messages = self
7833                .fetch_messages_for_lexical_rebuild(*conversation_id)
7834                .with_context(|| {
7835                    format!("fetching lexical rebuild messages for conversation {conversation_id}")
7836                })?;
7837            total_messages = total_messages.saturating_add(messages.len());
7838            if let Some(limit) = max_messages
7839                && total_messages > limit
7840            {
7841                return Err(anyhow!(
7842                    "lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
7843                    conversation_ids.len()
7844                ));
7845            }
7846
7847            let message_bytes = messages
7848                .iter()
7849                .map(|message| message.content.len())
7850                .sum::<usize>();
7851            total_content_bytes = total_content_bytes.saturating_add(message_bytes);
7852            if let Some(limit) = max_content_bytes
7853                && total_content_bytes > limit
7854            {
7855                return Err(anyhow!(
7856                    "lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
7857                    conversation_ids.len()
7858                ));
7859            }
7860
7861            if !messages.is_empty() {
7862                grouped.insert(*conversation_id, messages);
7863            }
7864        }
7865
7866        Ok(grouped)
7867    }
7868
7869    /// Stream lexical rebuild message rows in `(conversation_id, idx)` order
7870    /// without materializing the full result set.
7871    pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
7872        &self,
7873        start_conversation_id: i64,
7874        end_conversation_id: i64,
7875        mut f: F,
7876    ) -> Result<()>
7877    where
7878        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7879    {
7880        if end_conversation_id < start_conversation_id {
7881            return Ok(());
7882        }
7883
7884        let conversation_ids: Vec<i64> = self
7885            .conn
7886            .query_map_collect(
7887                "SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
7888                fparams![start_conversation_id, end_conversation_id],
7889                |row| row.get_typed(0),
7890            )
7891            .with_context(|| "listing conversation ids for streamed lexical rebuild")?;
7892
7893        for conversation_id in conversation_ids {
7894            let messages = self
7895                .fetch_messages_for_lexical_rebuild(conversation_id)
7896                .with_context(|| {
7897                    format!("streaming lexical rebuild messages for conversation {conversation_id}")
7898                })?;
7899
7900            for message in messages {
7901                let message_id = message.id.ok_or_else(|| {
7902                    anyhow!(
7903                        "lexical rebuild message missing id for conversation {conversation_id} idx {}",
7904                        message.idx
7905                    )
7906                })?;
7907                f(LexicalRebuildMessageRow {
7908                    conversation_id,
7909                    id: message_id,
7910                    idx: message.idx,
7911                    role: role_str(&message.role),
7912                    author: message.author,
7913                    created_at: message.created_at,
7914                    content: message.content,
7915                })?;
7916            }
7917        }
7918
7919        Ok(())
7920    }
7921
7922    /// Stream grouped lexical rebuild message rows in `(conversation_id, idx)`
7923    /// order by reusing the canonical per-message stream and coalescing rows
7924    /// per conversation.
7925    pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
7926        &self,
7927        start_conversation_id: i64,
7928        end_conversation_id: i64,
7929        mut f: F,
7930    ) -> Result<()>
7931    where
7932        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7933    {
7934        if end_conversation_id < start_conversation_id {
7935            return Ok(());
7936        }
7937
7938        let mut current_conversation_id: Option<i64> = None;
7939        let mut current_messages: LexicalRebuildGroupedMessageRows = SmallVec::new();
7940        let mut current_last_message_id = 0i64;
7941        let mut flush_current = |current_conversation_id: &mut Option<i64>,
7942                                 current_messages: &mut LexicalRebuildGroupedMessageRows,
7943                                 current_last_message_id: &mut i64|
7944         -> Result<()> {
7945            let Some(conversation_id) = current_conversation_id.take() else {
7946                return Ok(());
7947            };
7948            let messages = std::mem::take(current_messages);
7949            let last_message_id = std::mem::take(current_last_message_id);
7950            f(conversation_id, messages, last_message_id)
7951        };
7952
7953        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7954            start_conversation_id,
7955            end_conversation_id,
7956            |row| {
7957                if current_conversation_id != Some(row.conversation_id) {
7958                    flush_current(
7959                        &mut current_conversation_id,
7960                        &mut current_messages,
7961                        &mut current_last_message_id,
7962                    )?;
7963                    current_conversation_id = Some(row.conversation_id);
7964                }
7965                current_last_message_id = row.id;
7966                current_messages.push(LexicalRebuildGroupedMessageRow {
7967                    idx: row.idx,
7968                    is_tool_role: row.role == "tool",
7969                    created_at: row.created_at,
7970                    content: row.content,
7971                });
7972                Ok(())
7973            },
7974        )
7975        .with_context(|| "streaming grouped lexical rebuild messages")?;
7976
7977        flush_current(
7978            &mut current_conversation_id,
7979            &mut current_messages,
7980            &mut current_last_message_id,
7981        )
7982        .with_context(|| "flushing grouped lexical rebuild messages")
7983    }
7984
7985    /// Stream grouped lexical rebuild message rows from a starting conversation
7986    /// id to the end of the table.
7987    pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
7988        &self,
7989        start_conversation_id: i64,
7990        f: F,
7991    ) -> Result<()>
7992    where
7993        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7994    {
7995        self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
7996            start_conversation_id,
7997            i64::MAX,
7998            f,
7999        )
8000    }
8001
8002    /// Stream lexical rebuild message rows from a starting conversation id to
8003    /// the end of the table.
8004    pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
8005        &self,
8006        start_conversation_id: i64,
8007        f: F,
8008    ) -> Result<()>
8009    where
8010        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
8011    {
8012        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
8013            start_conversation_id,
8014            i64::MAX,
8015            f,
8016        )
8017    }
8018
8019    /// Get a source by ID.
8020    pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
8021        let result = self.conn.query_row_map(
8022            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
8023            fparams![id],
8024            |row| {
8025                let kind_str: String = row.get_typed(1)?;
8026                let config_json_str: Option<String> = row.get_typed(5)?;
8027                Ok(Source {
8028                    id: row.get_typed(0)?,
8029                    kind: SourceKind::parse(&kind_str).unwrap_or_default(),
8030                    host_label: row.get_typed(2)?,
8031                    machine_id: row.get_typed(3)?,
8032                    platform: row.get_typed(4)?,
8033                    config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
8034                    created_at: row.get_typed(6)?,
8035                    updated_at: row.get_typed(7)?,
8036                })
8037            },
8038        );
8039        Ok(result.optional()?)
8040    }
8041
8042    /// List all sources.
8043    pub fn list_sources(&self) -> Result<Vec<Source>> {
8044        self.conn
8045            .query_map_collect(
8046                "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
8047                fparams![],
8048                |row| {
8049                    let kind_str: String = row.get_typed(1)?;
8050                    let config_json_str: Option<String> = row.get_typed(5)?;
8051                    Ok(Source {
8052                        id: row.get_typed(0)?,
8053                        kind: SourceKind::parse(&kind_str).unwrap_or_default(),
8054                        host_label: row.get_typed(2)?,
8055                        machine_id: row.get_typed(3)?,
8056                        platform: row.get_typed(4)?,
8057                        config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
8058                        created_at: row.get_typed(6)?,
8059                        updated_at: row.get_typed(7)?,
8060                    })
8061                },
8062            )
8063            .with_context(|| "listing sources")
8064    }
8065
8066    /// Get IDs of all non-local sources.
8067    pub fn get_source_ids(&self) -> Result<Vec<String>> {
8068        self.conn
8069            .query_map_collect(
8070                "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
8071                fparams![],
8072                |row| row.get_typed(0),
8073            )
8074            .with_context(|| "listing source ids")
8075    }
8076
8077    /// Create or update a source.
8078    pub fn upsert_source(&self, source: &Source) -> Result<()> {
8079        self.invalidate_conversation_source_cache(source.id.as_str());
8080        let now = Self::now_millis();
8081        let kind_str = source.kind.to_string();
8082        let config_json_str = source
8083            .config_json
8084            .as_ref()
8085            .map(serde_json::to_string)
8086            .transpose()?;
8087
8088        // Re-indexing commonly reuses the same normalized source metadata
8089        // across many conversations. Skip the write entirely when the row is
8090        // already identical so we avoid needless WAL churn and timestamp bumps.
8091        self.conn.execute_compat(
8092            "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
8093             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
8094             ON CONFLICT(id) DO UPDATE SET
8095                 kind = excluded.kind,
8096                 host_label = excluded.host_label,
8097                 machine_id = excluded.machine_id,
8098                 platform = excluded.platform,
8099                 config_json = excluded.config_json,
8100                 updated_at = excluded.updated_at
8101             WHERE NOT (
8102                 sources.kind IS excluded.kind
8103                 AND sources.host_label IS excluded.host_label
8104                 AND sources.machine_id IS excluded.machine_id
8105                 AND sources.platform IS excluded.platform
8106                 AND sources.config_json IS excluded.config_json
8107             )",
8108            fparams![
8109                source.id.as_str(),
8110                kind_str.as_str(),
8111                source.host_label.as_deref(),
8112                source.machine_id.as_deref(),
8113                source.platform.as_deref(),
8114                config_json_str.as_deref(),
8115                source.created_at.unwrap_or(now),
8116                now
8117            ],
8118        )?;
8119        Ok(())
8120    }
8121
8122    fn historical_bundle_key_hash(
8123        version: u32,
8124        bundle: &HistoricalDatabaseBundle,
8125        include_bundle_stats: bool,
8126    ) -> String {
8127        let signature = if include_bundle_stats {
8128            format!(
8129                "{}:{}:{}:{}",
8130                version,
8131                bundle.root_path.display(),
8132                bundle.total_bytes,
8133                bundle.modified_at_ms
8134            )
8135        } else {
8136            format!("{}:{}", version, bundle.root_path.display())
8137        };
8138        blake3::hash(signature.as_bytes()).to_hex().to_string()
8139    }
8140
8141    fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
8142        format!(
8143            "historical_bundle_salvaged:{}",
8144            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false)
8145        )
8146    }
8147
8148    fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
8149        let signature = format!(
8150            "{}:{}:{}:{}",
8151            HISTORICAL_SALVAGE_LEDGER_VERSION,
8152            bundle.root_path.display(),
8153            bundle.total_bytes,
8154            bundle.modified_at_ms
8155        );
8156        format!(
8157            "historical_bundle_salvaged:{}",
8158            blake3::hash(signature.as_bytes()).to_hex()
8159        )
8160    }
8161
8162    fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
8163        format!(
8164            "historical_bundle_progress:{}",
8165            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false)
8166        )
8167    }
8168
8169    fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
8170        let signature = format!(
8171            "{}:{}:{}:{}",
8172            HISTORICAL_SALVAGE_PROGRESS_VERSION,
8173            bundle.root_path.display(),
8174            bundle.total_bytes,
8175            bundle.modified_at_ms
8176        );
8177        format!(
8178            "historical_bundle_progress:{}",
8179            blake3::hash(signature.as_bytes()).to_hex()
8180        )
8181    }
8182
8183    fn historical_bundle_already_imported(
8184        &self,
8185        bundle: &HistoricalDatabaseBundle,
8186    ) -> Result<bool> {
8187        for key in [
8188            Self::historical_bundle_meta_key(bundle),
8189            Self::historical_bundle_legacy_meta_key(bundle),
8190        ] {
8191            let existing: Option<String> = self
8192                .conn
8193                .query_row_map(
8194                    "SELECT value FROM meta WHERE key = ?1",
8195                    fparams![key.as_str()],
8196                    |row| row.get_typed(0),
8197                )
8198                .optional()?;
8199            if existing.is_some() {
8200                return Ok(true);
8201            }
8202        }
8203        Ok(false)
8204    }
8205
8206    pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
8207        for bundle in discover_historical_database_bundles(canonical_db_path) {
8208            if !self.historical_bundle_already_imported(&bundle)? {
8209                return Ok(true);
8210            }
8211        }
8212        Ok(false)
8213    }
8214
8215    fn load_historical_bundle_progress(
8216        &self,
8217        bundle: &HistoricalDatabaseBundle,
8218    ) -> Result<Option<HistoricalBundleProgress>> {
8219        for key in [
8220            Self::historical_bundle_progress_key(bundle),
8221            Self::historical_bundle_legacy_progress_key(bundle),
8222        ] {
8223            let raw: Option<String> = self
8224                .conn
8225                .query_row_map(
8226                    "SELECT value FROM meta WHERE key = ?1",
8227                    fparams![key.as_str()],
8228                    |row| row.get_typed(0),
8229                )
8230                .optional()?;
8231            let Some(raw) = raw else {
8232                continue;
8233            };
8234            let parsed: HistoricalBundleProgress =
8235                serde_json::from_str(&raw).with_context(|| {
8236                    format!(
8237                        "parsing historical salvage progress checkpoint for {}",
8238                        bundle.root_path.display()
8239                    )
8240                })?;
8241            if parsed.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
8242                return Ok(Some(parsed));
8243            }
8244        }
8245        Ok(None)
8246    }
8247
8248    fn record_historical_bundle_progress(
8249        &self,
8250        bundle: &HistoricalDatabaseBundle,
8251        method: &str,
8252        last_completed_source_row_id: i64,
8253        conversations_imported: usize,
8254        messages_imported: usize,
8255    ) -> Result<()> {
8256        let key = Self::historical_bundle_progress_key(bundle);
8257        let value = HistoricalBundleProgress {
8258            progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
8259            path: bundle.root_path.display().to_string(),
8260            bytes: bundle.total_bytes,
8261            modified_at_ms: bundle.modified_at_ms,
8262            method: method.to_string(),
8263            last_completed_source_row_id,
8264            conversations_imported,
8265            messages_imported,
8266            updated_at_ms: Self::now_millis(),
8267        };
8268        let value_str = serde_json::to_string(&value)?;
8269        self.conn.execute_compat(
8270            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
8271            fparams![key.as_str(), value_str.as_str()],
8272        )?;
8273        Ok(())
8274    }
8275
8276    fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
8277        for key in [
8278            Self::historical_bundle_progress_key(bundle),
8279            Self::historical_bundle_legacy_progress_key(bundle),
8280        ] {
8281            self.conn
8282                .execute_compat("DELETE FROM meta WHERE key = ?1", fparams![key.as_str()])?;
8283        }
8284        Ok(())
8285    }
8286
8287    fn record_historical_bundle_import(
8288        &self,
8289        bundle: &HistoricalDatabaseBundle,
8290        method: &str,
8291        conversations_imported: usize,
8292        messages_imported: usize,
8293    ) -> Result<()> {
8294        let key = Self::historical_bundle_meta_key(bundle);
8295        let value = serde_json::json!({
8296            "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
8297            "path": bundle.root_path.display().to_string(),
8298            "bytes": bundle.total_bytes,
8299            "modified_at_ms": bundle.modified_at_ms,
8300            "method": method,
8301            "conversations_imported": conversations_imported,
8302            "messages_imported": messages_imported,
8303            "recorded_at_ms": Self::now_millis(),
8304        });
8305        let value_str = serde_json::to_string(&value)?;
8306        self.conn.execute_compat(
8307            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
8308            fparams![key.as_str(), value_str.as_str()],
8309        )?;
8310        Ok(())
8311    }
8312
8313    fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
8314        const RETRYABLE_PATTERNS: &[&str] = &[
8315            "out of memory",
8316            "string or blob too big",
8317            "too many sql variables",
8318        ];
8319        err.chain().any(|cause| {
8320            let rendered = cause.to_string().to_ascii_lowercase();
8321            RETRYABLE_PATTERNS
8322                .iter()
8323                .any(|pattern| rendered.contains(pattern))
8324        })
8325    }
8326
8327    fn split_historical_batch_entry_messages(
8328        entry: &HistoricalBatchEntry,
8329    ) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
8330        if entry.conversation.messages.len() < 2 {
8331            return None;
8332        }
8333        let split_at = entry.conversation.messages.len() / 2;
8334        if split_at == 0 || split_at >= entry.conversation.messages.len() {
8335            return None;
8336        }
8337
8338        let mut left = entry.clone();
8339        left.conversation.messages = entry.conversation.messages[..split_at].to_vec();
8340
8341        let mut right = entry.clone();
8342        right.conversation.messages = entry.conversation.messages[split_at..].to_vec();
8343
8344        Some((left, right))
8345    }
8346
8347    fn import_historical_batch_with_retry<F>(
8348        entries: &[HistoricalBatchEntry],
8349        insert_batch: &mut F,
8350    ) -> Result<HistoricalBatchImportTotals>
8351    where
8352        F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
8353    {
8354        match insert_batch(entries) {
8355            Ok(totals) => Ok(totals),
8356            Err(err) if Self::historical_import_error_is_split_retryable(&err) => {
8357                if entries.len() > 1 {
8358                    let mid = entries.len() / 2;
8359                    tracing::warn!(
8360                        batch_entries = entries.len(),
8361                        split_left = mid,
8362                        split_right = entries.len() - mid,
8363                        error = %err,
8364                        "historical salvage batch failed; retrying in smaller sub-batches"
8365                    );
8366                    let left =
8367                        Self::import_historical_batch_with_retry(&entries[..mid], insert_batch)?;
8368                    let right =
8369                        Self::import_historical_batch_with_retry(&entries[mid..], insert_batch)?;
8370                    return Ok(HistoricalBatchImportTotals {
8371                        inserted_source_rows: left.inserted_source_rows
8372                            + right.inserted_source_rows,
8373                        inserted_messages: left.inserted_messages + right.inserted_messages,
8374                    });
8375                }
8376
8377                if let Some(entry) = entries.first()
8378                    && let Some((left, right)) = Self::split_historical_batch_entry_messages(entry)
8379                {
8380                    tracing::warn!(
8381                        source_row_id = entry.source_row_id,
8382                        message_count = entry.conversation.messages.len(),
8383                        error = %err,
8384                        "historical salvage conversation failed; retrying in smaller message slices"
8385                    );
8386                    let left_totals = Self::import_historical_batch_with_retry(
8387                        std::slice::from_ref(&left),
8388                        insert_batch,
8389                    )?;
8390                    let right_totals = Self::import_historical_batch_with_retry(
8391                        std::slice::from_ref(&right),
8392                        insert_batch,
8393                    )?;
8394                    return Ok(HistoricalBatchImportTotals {
8395                        inserted_source_rows: usize::from(
8396                            left_totals.inserted_source_rows > 0
8397                                || right_totals.inserted_source_rows > 0,
8398                        ),
8399                        inserted_messages: left_totals
8400                            .inserted_messages
8401                            .saturating_add(right_totals.inserted_messages),
8402                    });
8403                }
8404
8405                Err(err)
8406            }
8407            Err(err) => Err(err),
8408        }
8409    }
8410
8411    fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
8412        let sources: Vec<Source> = match source_conn.query_map_collect(
8413            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
8414             FROM sources",
8415            fparams![],
8416            |row| {
8417                let raw_source_id: String = row.get_typed(0)?;
8418                let kind_str: String = row.get_typed(1)?;
8419                let raw_host_label: Option<String> = row.get_typed(2)?;
8420                let config_json_raw: Option<String> = row.get_typed(5)?;
8421                let (source_id, source_kind, host_label) = normalized_storage_source_parts(
8422                    Some(raw_source_id.as_str()),
8423                    Some(kind_str.as_str()),
8424                    raw_host_label.as_deref(),
8425                );
8426                Ok(Source {
8427                    id: source_id,
8428                    kind: source_kind,
8429                    host_label,
8430                    machine_id: row.get_typed(3)?,
8431                    platform: row.get_typed(4)?,
8432                    config_json: config_json_raw.and_then(|raw| serde_json::from_str(&raw).ok()),
8433                    created_at: row.get_typed(6)?,
8434                    updated_at: row.get_typed(7)?,
8435                })
8436            },
8437        ) {
8438            Ok(rows) => rows,
8439            Err(err) => {
8440                tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
8441                return Ok(());
8442            }
8443        };
8444
8445        for source in sources {
8446            self.upsert_source(&source)?;
8447        }
8448        Ok(())
8449    }
8450
8451    fn import_historical_conversations(
8452        &self,
8453        bundle: &HistoricalDatabaseBundle,
8454        salvage_method: &str,
8455        source_conn: &FrankenConnection,
8456    ) -> Result<(usize, usize)> {
8457        let batch_limits = historical_import_batch_limits();
8458        let cache_enabled = IndexingCache::is_enabled();
8459        let mut indexing_cache = IndexingCache::new();
8460        let mut known_sources: HashSet<String> = self
8461            .list_sources()?
8462            .into_iter()
8463            .map(|source| source.id)
8464            .collect();
8465        let resume_progress = self.load_historical_bundle_progress(bundle)?;
8466        let resume_after_row_id = resume_progress
8467            .as_ref()
8468            .map(|progress| progress.last_completed_source_row_id)
8469            .filter(|row_id| *row_id > 0);
8470
8471        tracing::info!(
8472            target: "cass::historical_salvage",
8473            batch_conversations = batch_limits.conversations,
8474            batch_messages = batch_limits.messages,
8475            batch_payload_chars = batch_limits.payload_chars,
8476            cache_enabled,
8477            resume_after_row_id,
8478            "configured historical salvage batch limits"
8479        );
8480
8481        if let Some(progress) = &resume_progress {
8482            tracing::info!(
8483                target: "cass::historical_salvage",
8484                path = %bundle.root_path.display(),
8485                resume_after_row_id = progress.last_completed_source_row_id,
8486                prior_conversations_imported = progress.conversations_imported,
8487                prior_messages_imported = progress.messages_imported,
8488                "resuming historical salvage bundle from durable checkpoint"
8489            );
8490        }
8491
8492        // LEFT JOIN + COALESCE on agents so legacy source databases with NULL
8493        // agent_id (the V1 schema did not require NOT NULL) still have their
8494        // conversations imported, degrading to 'unknown' slug like the other
8495        // rebuild paths.  Using INNER JOIN here would silently drop those
8496        // conversations during historical salvage, which is data loss.
8497        let conv_sql = if resume_after_row_id.is_some() {
8498            "SELECT
8499                c.id,
8500                COALESCE(a.slug, 'unknown'),
8501                w.path,
8502                c.external_id,
8503                c.title,
8504                c.source_path,
8505                c.started_at,
8506                c.ended_at,
8507                c.approx_tokens,
8508                c.metadata_json,
8509                c.source_id,
8510                c.origin_host
8511             FROM conversations c
8512             LEFT JOIN agents a ON c.agent_id = a.id
8513             LEFT JOIN workspaces w ON c.workspace_id = w.id
8514             WHERE c.id > ?1
8515             ORDER BY c.id"
8516        } else {
8517            "SELECT
8518                c.id,
8519                COALESCE(a.slug, 'unknown'),
8520                w.path,
8521                c.external_id,
8522                c.title,
8523                c.source_path,
8524                c.started_at,
8525                c.ended_at,
8526                c.approx_tokens,
8527                c.metadata_json,
8528                c.source_id,
8529                c.origin_host
8530             FROM conversations c
8531             LEFT JOIN agents a ON c.agent_id = a.id
8532             LEFT JOIN workspaces w ON c.workspace_id = w.id
8533             ORDER BY c.id"
8534        };
8535        let conv_params: &[ParamValue] =
8536            if let Some(last_completed_source_row_id) = resume_after_row_id {
8537                &[ParamValue::from(last_completed_source_row_id)]
8538            } else {
8539                &[]
8540            };
8541
8542        #[allow(clippy::type_complexity)]
8543        let conv_rows: Vec<(
8544            i64,
8545            String,
8546            Option<String>,
8547            Option<String>,
8548            Option<String>,
8549            String,
8550            Option<i64>,
8551            Option<i64>,
8552            Option<i64>,
8553            Option<String>,
8554            Option<String>,
8555            Option<String>,
8556        )> = source_conn
8557            .query_map_collect(conv_sql, conv_params, |row| {
8558                Ok((
8559                    row.get_typed::<i64>(0)?,
8560                    row.get_typed::<String>(1)?,
8561                    row.get_typed::<Option<String>>(2)?,
8562                    row.get_typed::<Option<String>>(3)?,
8563                    row.get_typed::<Option<String>>(4)?,
8564                    row.get_typed::<String>(5)?,
8565                    row.get_typed::<Option<i64>>(6)?,
8566                    row.get_typed::<Option<i64>>(7)?,
8567                    row.get_typed::<Option<i64>>(8)?,
8568                    row.get_typed::<Option<String>>(9)?,
8569                    row.get_typed::<Option<String>>(10)?,
8570                    row.get_typed::<Option<String>>(11)?,
8571                ))
8572            })
8573            .context("querying historical conversations")?;
8574
8575        let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
8576             FROM messages
8577             WHERE conversation_id = ?1
8578             ORDER BY idx";
8579
8580        let mut imported_conversations = resume_progress
8581            .as_ref()
8582            .map(|progress| progress.conversations_imported)
8583            .unwrap_or(0);
8584        let mut imported_messages = resume_progress
8585            .as_ref()
8586            .map(|progress| progress.messages_imported)
8587            .unwrap_or(0);
8588        let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
8589        let mut pending_batch_messages = 0usize;
8590        let mut pending_batch_chars = 0usize;
8591        let mut pending_batch_first_row_id: Option<i64> = None;
8592        let mut pending_batch_last_row_id: Option<i64> = None;
8593
8594        let flush_batch = |storage: &FrankenStorage,
8595                           batch: &mut Vec<HistoricalBatchEntry>,
8596                           pending_messages: &mut usize,
8597                           pending_chars: &mut usize,
8598                           first_row_id: &mut Option<i64>,
8599                           last_row_id: &mut Option<i64>,
8600                           imported_conversations: &mut usize,
8601                           imported_messages: &mut usize|
8602         -> Result<()> {
8603            if batch.is_empty() {
8604                return Ok(());
8605            }
8606
8607            let batch_first_row_id = *first_row_id;
8608            let batch_last_row_id = *last_row_id;
8609            if historical_salvage_debug_enabled() {
8610                eprintln!(
8611                    "[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
8612                    batch_first_row_id,
8613                    batch_last_row_id,
8614                    batch.len(),
8615                    *pending_messages,
8616                    *pending_chars
8617                );
8618            }
8619            tracing::info!(
8620                target: "cass::historical_salvage",
8621                batch_conversations = batch.len(),
8622                batch_messages = *pending_messages,
8623                batch_payload_chars = *pending_chars,
8624                first_source_row_id = batch_first_row_id,
8625                last_source_row_id = batch_last_row_id,
8626                "flushing historical salvage batch"
8627            );
8628
8629            let mut insert_batch =
8630                |entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
8631                    let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
8632                        .iter()
8633                        .map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
8634                        .collect();
8635                    let outcomes = storage
8636                        .insert_conversations_batched(&borrowed_batch)
8637                        .with_context(|| {
8638                            let first_source_row_id =
8639                                entries.first().map(|entry| entry.source_row_id);
8640                            let last_source_row_id =
8641                                entries.last().map(|entry| entry.source_row_id);
8642                            format!(
8643                                "inserting historical salvage batch source rows {:?}..{:?}",
8644                                first_source_row_id, last_source_row_id
8645                            )
8646                        })?;
8647                    let mut totals = HistoricalBatchImportTotals::default();
8648                    for outcome in outcomes {
8649                        if !outcome.inserted_indices.is_empty() {
8650                            totals.inserted_source_rows += 1;
8651                            totals.inserted_messages += outcome.inserted_indices.len();
8652                        }
8653                    }
8654                    Ok(totals)
8655                };
8656            let totals =
8657                Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
8658            *imported_conversations =
8659                (*imported_conversations).saturating_add(totals.inserted_source_rows);
8660            *imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
8661            if let Some(last_completed_row_id) = batch_last_row_id {
8662                storage.record_historical_bundle_progress(
8663                    bundle,
8664                    salvage_method,
8665                    last_completed_row_id,
8666                    *imported_conversations,
8667                    *imported_messages,
8668                )?;
8669            }
8670            tracing::info!(
8671                target: "cass::historical_salvage",
8672                batch_conversations = batch.len(),
8673                batch_messages = *pending_messages,
8674                imported_conversations = *imported_conversations,
8675                imported_messages = *imported_messages,
8676                first_source_row_id = batch_first_row_id,
8677                last_source_row_id = batch_last_row_id,
8678                "historical salvage batch committed"
8679            );
8680            if historical_salvage_debug_enabled() {
8681                eprintln!(
8682                    "[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
8683                    batch_first_row_id,
8684                    batch_last_row_id,
8685                    *imported_conversations,
8686                    *imported_messages
8687                );
8688            }
8689            batch.clear();
8690            *pending_messages = 0;
8691            *pending_chars = 0;
8692            *first_row_id = None;
8693            *last_row_id = None;
8694            Ok(())
8695        };
8696
8697        for (
8698            conversation_row_id,
8699            agent_slug,
8700            workspace_path,
8701            external_id,
8702            title,
8703            source_path,
8704            started_at,
8705            ended_at,
8706            approx_tokens,
8707            metadata_json_raw,
8708            raw_source_id,
8709            raw_origin_host,
8710        ) in conv_rows
8711        {
8712            let source_id = crate::search::tantivy::normalized_index_source_id(
8713                raw_source_id.as_deref(),
8714                None,
8715                raw_origin_host.as_deref(),
8716            );
8717            let origin_host =
8718                crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());
8719
8720            let messages: Vec<Message> = source_conn
8721                .query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
8722                    let role: String = msg_row.get_typed(1)?;
8723                    Ok(Message {
8724                        id: None,
8725                        idx: msg_row.get_typed(0)?,
8726                        role: match role.as_str() {
8727                            "user" => MessageRole::User,
8728                            "agent" | "assistant" => MessageRole::Agent,
8729                            "tool" => MessageRole::Tool,
8730                            "system" => MessageRole::System,
8731                            other => MessageRole::Other(other.to_string()),
8732                        },
8733                        author: msg_row.get_typed(2)?,
8734                        created_at: msg_row.get_typed(3)?,
8735                        content: msg_row.get_typed(4)?,
8736                        extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
8737                        snippets: Vec::new(),
8738                    })
8739                })
8740                .context("collecting historical message rows")?;
8741
8742            if messages.is_empty() {
8743                continue;
8744            }
8745
8746            let conversation_message_count = messages.len();
8747            let conversation_chars = messages
8748                .iter()
8749                .map(message_payload_size_hint)
8750                .sum::<usize>();
8751
8752            let conversation = Conversation {
8753                id: None,
8754                agent_slug: agent_slug.clone(),
8755                workspace: workspace_path.map(PathBuf::from),
8756                external_id,
8757                title,
8758                source_path: PathBuf::from(source_path),
8759                started_at,
8760                ended_at,
8761                approx_tokens,
8762                metadata_json: parse_json_column(metadata_json_raw),
8763                messages,
8764                source_id,
8765                origin_host,
8766            };
8767
8768            if !known_sources.contains(&conversation.source_id) {
8769                let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
8770                    Source::local()
8771                } else {
8772                    Source {
8773                        id: conversation.source_id.clone(),
8774                        kind: SourceKind::Ssh,
8775                        host_label: conversation.origin_host.clone(),
8776                        machine_id: None,
8777                        platform: None,
8778                        config_json: None,
8779                        created_at: None,
8780                        updated_at: None,
8781                    }
8782                };
8783                self.upsert_source(&placeholder)?;
8784                known_sources.insert(conversation.source_id.clone());
8785            }
8786
8787            let agent = Agent {
8788                id: None,
8789                slug: agent_slug.clone(),
8790                name: agent_slug,
8791                version: None,
8792                kind: AgentKind::Cli,
8793            };
8794            let agent_id = if cache_enabled {
8795                indexing_cache.get_or_insert_agent(self, &agent)?
8796            } else {
8797                self.ensure_agent(&agent)?
8798            };
8799            let workspace_id = if let Some(workspace) = &conversation.workspace {
8800                if cache_enabled {
8801                    Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
8802                } else {
8803                    Some(self.ensure_workspace(workspace, None)?)
8804                }
8805            } else {
8806                None
8807            };
8808
8809            let exceeds_pending_limits = !pending_batch.is_empty()
8810                && (pending_batch.len() >= batch_limits.conversations
8811                    || pending_batch_messages.saturating_add(conversation_message_count)
8812                        > batch_limits.messages
8813                    || pending_batch_chars.saturating_add(conversation_chars)
8814                        > batch_limits.payload_chars);
8815            if exceeds_pending_limits {
8816                flush_batch(
8817                    self,
8818                    &mut pending_batch,
8819                    &mut pending_batch_messages,
8820                    &mut pending_batch_chars,
8821                    &mut pending_batch_first_row_id,
8822                    &mut pending_batch_last_row_id,
8823                    &mut imported_conversations,
8824                    &mut imported_messages,
8825                )?;
8826            }
8827
8828            if pending_batch_first_row_id.is_none() {
8829                pending_batch_first_row_id = Some(conversation_row_id);
8830            }
8831            pending_batch_last_row_id = Some(conversation_row_id);
8832            pending_batch_messages =
8833                pending_batch_messages.saturating_add(conversation_message_count);
8834            pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
8835            pending_batch.push(HistoricalBatchEntry {
8836                source_row_id: conversation_row_id,
8837                agent_id,
8838                workspace_id,
8839                conversation,
8840            });
8841
8842            if pending_batch.len() >= batch_limits.conversations
8843                || pending_batch_messages >= batch_limits.messages
8844                || pending_batch_chars >= batch_limits.payload_chars
8845            {
8846                flush_batch(
8847                    self,
8848                    &mut pending_batch,
8849                    &mut pending_batch_messages,
8850                    &mut pending_batch_chars,
8851                    &mut pending_batch_first_row_id,
8852                    &mut pending_batch_last_row_id,
8853                    &mut imported_conversations,
8854                    &mut imported_messages,
8855                )?;
8856            }
8857        }
8858
8859        flush_batch(
8860            self,
8861            &mut pending_batch,
8862            &mut pending_batch_messages,
8863            &mut pending_batch_chars,
8864            &mut pending_batch_first_row_id,
8865            &mut pending_batch_last_row_id,
8866            &mut imported_conversations,
8867            &mut imported_messages,
8868        )?;
8869
8870        if cache_enabled {
8871            let (hits, misses, hit_rate) = indexing_cache.stats();
8872            tracing::info!(
8873                target: "cass::historical_salvage",
8874                hits,
8875                misses,
8876                hit_rate = format!("{:.1}%", hit_rate * 100.0),
8877                agents = indexing_cache.agent_count(),
8878                workspaces = indexing_cache.workspace_count(),
8879                sources = known_sources.len(),
8880                "historical salvage cache stats"
8881            );
8882        }
8883
8884        Ok((imported_conversations, imported_messages))
8885    }
8886
8887    pub fn salvage_historical_databases(
8888        &self,
8889        canonical_db_path: &Path,
8890    ) -> Result<HistoricalSalvageOutcome> {
8891        let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
8892        let mut outcome = HistoricalSalvageOutcome {
8893            bundles_considered: ordered_bundles.len(),
8894            ..HistoricalSalvageOutcome::default()
8895        };
8896
8897        for bundle in ordered_bundles {
8898            if self.historical_bundle_already_imported(&bundle)? {
8899                self.clear_historical_bundle_progress(&bundle)?;
8900                continue;
8901            }
8902
8903            let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
8904                format!(
8905                    "opening historical bundle {} for salvage",
8906                    bundle.root_path.display()
8907                )
8908            }) {
8909                Ok(source) => source,
8910                Err(err) => {
8911                    tracing::warn!(
8912                        path = %bundle.root_path.display(),
8913                        error = %err,
8914                        "skipping unreadable historical cass database bundle during salvage"
8915                    );
8916                    self.clear_historical_bundle_progress(&bundle)?;
8917                    continue;
8918                }
8919            };
8920
8921            // #247 (coding_agent_session_search-r8pcy): if a per-bundle progress
8922            // checkpoint already covers the backup's entire conversation row-id
8923            // space, the bundle was effectively fully imported but the daemon was
8924            // killed (e.g. OOM) before the completion ledger marker landed.
8925            // Re-scanning it is a pure O(n) no-op — every batch commits
8926            // imported=0 while taking 5-12 min. Detect it via the high-water
8927            // checkpoint, write the ledger marker, drop the checkpoint, and skip.
8928            if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
8929                let backup_max_conversation_id: i64 = source
8930                    .conn
8931                    .query_row_map(
8932                        "SELECT COALESCE(MAX(id), 0) FROM conversations",
8933                        fparams![],
8934                        |row| row.get_typed(0),
8935                    )
8936                    .unwrap_or(0);
8937                if backup_max_conversation_id > 0
8938                    && progress.last_completed_source_row_id >= backup_max_conversation_id
8939                {
8940                    self.record_historical_bundle_import(
8941                        &bundle,
8942                        source.method,
8943                        progress.conversations_imported,
8944                        progress.messages_imported,
8945                    )?;
8946                    self.clear_historical_bundle_progress(&bundle)?;
8947                    tracing::info!(
8948                        path = %bundle.root_path.display(),
8949                        last_completed_source_row_id = progress.last_completed_source_row_id,
8950                        backup_max_conversation_id,
8951                        conversations_imported = progress.conversations_imported,
8952                        messages_imported = progress.messages_imported,
8953                        "historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
8954                    );
8955                    continue;
8956                }
8957            }
8958
8959            self.import_historical_sources(&source.conn)?;
8960            let (imported_conversations, imported_messages) =
8961                self.import_historical_conversations(&bundle, source.method, &source.conn)?;
8962            self.record_historical_bundle_import(
8963                &bundle,
8964                source.method,
8965                imported_conversations,
8966                imported_messages,
8967            )?;
8968            self.clear_historical_bundle_progress(&bundle)?;
8969
8970            outcome.bundles_imported += 1;
8971            outcome.conversations_imported += imported_conversations;
8972            outcome.messages_imported += imported_messages;
8973
8974            tracing::info!(
8975                path = %bundle.root_path.display(),
8976                bytes = bundle.total_bytes,
8977                method = source.method,
8978                imported_conversations,
8979                imported_messages,
8980                "salvaged historical cass database bundle"
8981            );
8982        }
8983
8984        Ok(outcome)
8985    }
8986
8987    /// Delete a source by ID. Returns true if a row was deleted.
8988    pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
8989        if id == LOCAL_SOURCE_ID {
8990            anyhow::bail!("cannot delete the local source");
8991        }
8992        let count = self
8993            .conn
8994            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
8995        if count > 0 {
8996            self.invalidate_conversation_source_cache(id);
8997        }
8998        Ok(count > 0)
8999    }
9000
9001    /// Insert a conversation tree (conversation + messages + snippets + FTS).
9002    pub fn insert_conversation_tree(
9003        &self,
9004        agent_id: i64,
9005        workspace_id: Option<i64>,
9006        conv: &Conversation,
9007    ) -> Result<InsertOutcome> {
9008        let normalized_conv = normalized_conversation_for_storage(conv);
9009        let conv = normalized_conv.as_ref();
9010        self.ensure_source_for_conversation(conv)?;
9011        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9012        let defer_analytics_updates = defer_analytics_updates_enabled();
9013        let conversation_key = conversation_merge_key(agent_id, conv);
9014        let mut tx = self.conn.transaction()?;
9015        let existing = franken_find_existing_conversation_with_tail_by_key(
9016            &tx,
9017            &conversation_key,
9018            Some(conv),
9019        )?;
9020        if let Some(existing) = existing {
9021            let outcome = self.franken_append_messages_with_tail_in_tx(
9022                &tx,
9023                agent_id,
9024                existing.id,
9025                conv,
9026                existing.tail_state,
9027                defer_lexical_updates,
9028                defer_analytics_updates,
9029            )?;
9030            tx.commit()?;
9031            return Ok(outcome);
9032        }
9033
9034        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
9035            &tx,
9036            agent_id,
9037            workspace_id,
9038            conv,
9039            &conversation_key,
9040        )? {
9041            ConversationInsertStatus::Inserted(conv_id) => conv_id,
9042            ConversationInsertStatus::Existing(existing_id) => {
9043                let ExistingMessageLookup {
9044                    by_idx: mut existing_messages,
9045                    replay: mut existing_replay_fingerprints,
9046                } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
9047                let ExistingConversationNewMessages {
9048                    messages: new_messages,
9049                    new_chars,
9050                    idx_collision_count,
9051                    first_collision_idx,
9052                } = collect_new_messages_for_existing_conversation(
9053                    existing_id,
9054                    conv,
9055                    &mut existing_messages,
9056                    &mut existing_replay_fingerprints,
9057                    "skipping replay-equivalent recovered message with shifted idx",
9058                );
9059                let (inserted_last_idx, inserted_last_created_at) =
9060                    borrowed_messages_tail_state(&new_messages);
9061                let mut inserted_indices = Vec::new();
9062                let mut fts_entries = Vec::new();
9063                let mut fts_pending_chars = 0usize;
9064                let mut _fts_inserted_total = 0usize;
9065                let inserted_message_ids =
9066                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
9067                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9068                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9069                    if !defer_lexical_updates {
9070                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9071                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9072                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9073                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9074                        {
9075                            flush_pending_fts_entries(
9076                                self,
9077                                &tx,
9078                                &mut fts_entries,
9079                                &mut fts_pending_chars,
9080                                &mut _fts_inserted_total,
9081                            )?;
9082                        }
9083                    }
9084                    inserted_indices.push(msg.idx);
9085                }
9086
9087                if idx_collision_count > 0 {
9088                    tracing::warn!(
9089                        conversation_id = existing_id,
9090                        collision_count = idx_collision_count,
9091                        first_idx = first_collision_idx,
9092                        source_path = %conv.source_path.display(),
9093                        "message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
9094                    );
9095                }
9096
9097                if !defer_lexical_updates {
9098                    flush_pending_fts_entries(
9099                        self,
9100                        &tx,
9101                        &mut fts_entries,
9102                        &mut fts_pending_chars,
9103                        &mut _fts_inserted_total,
9104                    )?;
9105                }
9106
9107                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
9108                franken_update_conversation_tail_state(
9109                    &tx,
9110                    existing_id,
9111                    conv_last_ts,
9112                    inserted_last_idx,
9113                    inserted_last_created_at,
9114                )?;
9115                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
9116                {
9117                    franken_update_external_conversation_tail_lookup_key(
9118                        &tx,
9119                        &lookup_key,
9120                        conv_last_ts,
9121                        inserted_last_idx,
9122                        inserted_last_created_at,
9123                    )?;
9124                }
9125
9126                if !defer_analytics_updates && !inserted_indices.is_empty() {
9127                    franken_update_daily_stats_in_tx(
9128                        self,
9129                        &tx,
9130                        &conv.agent_slug,
9131                        &conv.source_id,
9132                        conversation_effective_started_at(conv),
9133                        StatsDelta {
9134                            session_count_delta: 0,
9135                            message_count_delta: inserted_indices.len() as i64,
9136                            total_chars_delta: new_chars,
9137                        },
9138                    )?;
9139                }
9140
9141                tx.commit()?;
9142                return Ok(InsertOutcome {
9143                    conversation_id: existing_id,
9144                    conversation_inserted: false,
9145                    inserted_indices,
9146                });
9147            }
9148        };
9149        let mut fts_entries = Vec::new();
9150        let mut fts_pending_chars = 0usize;
9151        let mut _fts_inserted_total = 0usize;
9152        let mut total_chars: i64 = 0;
9153        let mut inserted_indices = Vec::new();
9154        let mut pending_messages = HashMap::new();
9155        let mut pending_replay_fingerprints = HashSet::new();
9156        let mut idx_collision_count = 0usize;
9157        let mut first_collision_idx: Option<i64> = None;
9158        let mut new_messages = Vec::new();
9159        for msg in &conv.messages {
9160            let incoming_fingerprint = message_merge_fingerprint(msg);
9161            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
9162                if existing_fingerprint != &incoming_fingerprint {
9163                    idx_collision_count = idx_collision_count.saturating_add(1);
9164                    first_collision_idx.get_or_insert(msg.idx);
9165                }
9166                continue;
9167            }
9168            let incoming_replay = message_replay_fingerprint(msg);
9169            if pending_replay_fingerprints.contains(&incoming_replay) {
9170                tracing::debug!(
9171                    conversation_id = conv_id,
9172                    idx = msg.idx,
9173                    source_path = %conv.source_path.display(),
9174                    "skipping replay-equivalent duplicate message within new conversation insert"
9175                );
9176                continue;
9177            }
9178            pending_messages.insert(msg.idx, incoming_fingerprint);
9179            pending_replay_fingerprints.insert(incoming_replay);
9180            new_messages.push(msg);
9181        }
9182        let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
9183        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9184            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9185            if !defer_lexical_updates {
9186                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9187                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9188                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9189                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9190                {
9191                    flush_pending_fts_entries(
9192                        self,
9193                        &tx,
9194                        &mut fts_entries,
9195                        &mut fts_pending_chars,
9196                        &mut _fts_inserted_total,
9197                    )?;
9198                }
9199            }
9200            total_chars += msg.content.len() as i64;
9201            inserted_indices.push(msg.idx);
9202        }
9203        if idx_collision_count > 0 {
9204            tracing::warn!(
9205                conversation_id = conv_id,
9206                collision_count = idx_collision_count,
9207                first_idx = first_collision_idx,
9208                source_path = %conv.source_path.display(),
9209                "message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
9210            );
9211        }
9212        if !defer_lexical_updates {
9213            flush_pending_fts_entries(
9214                self,
9215                &tx,
9216                &mut fts_entries,
9217                &mut fts_pending_chars,
9218                &mut _fts_inserted_total,
9219            )?;
9220        }
9221
9222        if !defer_analytics_updates {
9223            franken_update_daily_stats_in_tx(
9224                self,
9225                &tx,
9226                &conv.agent_slug,
9227                &conv.source_id,
9228                conversation_effective_started_at(conv),
9229                StatsDelta {
9230                    session_count_delta: 1,
9231                    message_count_delta: inserted_indices.len() as i64,
9232                    total_chars_delta: total_chars,
9233                },
9234            )?;
9235        }
9236
9237        tx.commit()?;
9238        Ok(InsertOutcome {
9239            conversation_id: conv_id,
9240            conversation_inserted: true,
9241            inserted_indices,
9242        })
9243    }
9244
9245    #[cfg(test)]
9246    fn insert_conversation_tree_with_profile(
9247        &self,
9248        agent_id: i64,
9249        workspace_id: Option<i64>,
9250        conv: &Conversation,
9251        profile: &mut InsertConversationTreePerfProfile,
9252    ) -> Result<InsertOutcome> {
9253        let total_start = Instant::now();
9254        let normalized_conv = normalized_conversation_for_storage(conv);
9255        let conv = normalized_conv.as_ref();
9256
9257        let source_start = Instant::now();
9258        self.ensure_source_for_conversation(conv)?;
9259        profile.source_duration += source_start.elapsed();
9260
9261        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9262        let defer_analytics_updates = defer_analytics_updates_enabled();
9263        let conversation_key = conversation_merge_key(agent_id, conv);
9264
9265        let tx_open_start = Instant::now();
9266        let mut tx = self.conn.transaction()?;
9267        profile.tx_open_duration += tx_open_start.elapsed();
9268
9269        let existing_lookup_start = Instant::now();
9270        let existing =
9271            franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
9272        profile.existing_lookup_duration += existing_lookup_start.elapsed();
9273        if let Some(existing_id) = existing {
9274            return Err(anyhow!(
9275                "profile helper expects new conversation path, found existing id {existing_id}"
9276            ));
9277        }
9278
9279        let conversation_row_start = Instant::now();
9280        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
9281            &tx,
9282            agent_id,
9283            workspace_id,
9284            conv,
9285            &conversation_key,
9286        )? {
9287            ConversationInsertStatus::Inserted(conv_id) => conv_id,
9288            ConversationInsertStatus::Existing(existing_id) => {
9289                return Err(anyhow!(
9290                    "profile helper expected inserted conversation row, reused existing id {existing_id}"
9291                ));
9292            }
9293        };
9294        profile.conversation_row_duration += conversation_row_start.elapsed();
9295
9296        let mut fts_entries = Vec::new();
9297        let mut fts_pending_chars = 0usize;
9298        let mut fts_inserted_total = 0usize;
9299        let mut total_chars: i64 = 0;
9300        let mut inserted_indices = Vec::new();
9301        let mut pending_messages = HashMap::new();
9302        let mut pending_replay_fingerprints = HashSet::new();
9303        let mut idx_collision_count = 0usize;
9304        let mut first_collision_idx: Option<i64> = None;
9305        let mut new_messages = Vec::new();
9306
9307        for msg in &conv.messages {
9308            let incoming_fingerprint = message_merge_fingerprint(msg);
9309            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
9310                if existing_fingerprint != &incoming_fingerprint {
9311                    idx_collision_count = idx_collision_count.saturating_add(1);
9312                    first_collision_idx.get_or_insert(msg.idx);
9313                }
9314                continue;
9315            }
9316
9317            let incoming_replay = message_replay_fingerprint(msg);
9318            if pending_replay_fingerprints.contains(&incoming_replay) {
9319                tracing::debug!(
9320                    conversation_id = conv_id,
9321                    idx = msg.idx,
9322                    source_path = %conv.source_path.display(),
9323                    "skipping replay-equivalent duplicate message within profiled new conversation insert"
9324                );
9325                continue;
9326            }
9327
9328            pending_messages.insert(msg.idx, incoming_fingerprint);
9329            pending_replay_fingerprints.insert(incoming_replay);
9330            new_messages.push(msg);
9331        }
9332
9333        let message_insert_start = Instant::now();
9334        let inserted_message_ids = franken_batch_insert_new_messages_with_profile(
9335            &tx,
9336            conv_id,
9337            &new_messages,
9338            &mut profile.message_insert_breakdown,
9339        )?;
9340        profile.message_insert_duration += message_insert_start.elapsed();
9341
9342        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9343            let snippet_insert_start = Instant::now();
9344            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9345            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9346
9347            if !defer_lexical_updates {
9348                let fts_entry_start = Instant::now();
9349                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9350                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9351                profile.fts_entry_duration += fts_entry_start.elapsed();
9352                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9353                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9354                {
9355                    let fts_flush_start = Instant::now();
9356                    flush_pending_fts_entries(
9357                        self,
9358                        &tx,
9359                        &mut fts_entries,
9360                        &mut fts_pending_chars,
9361                        &mut fts_inserted_total,
9362                    )?;
9363                    profile.fts_flush_duration += fts_flush_start.elapsed();
9364                }
9365            }
9366
9367            total_chars += msg.content.len() as i64;
9368            inserted_indices.push(msg.idx);
9369        }
9370
9371        if idx_collision_count > 0 {
9372            tracing::warn!(
9373                conversation_id = conv_id,
9374                collision_count = idx_collision_count,
9375                first_idx = first_collision_idx,
9376                source_path = %conv.source_path.display(),
9377                "message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
9378            );
9379        }
9380
9381        if !defer_lexical_updates {
9382            let fts_flush_start = Instant::now();
9383            flush_pending_fts_entries(
9384                self,
9385                &tx,
9386                &mut fts_entries,
9387                &mut fts_pending_chars,
9388                &mut fts_inserted_total,
9389            )?;
9390            profile.fts_flush_duration += fts_flush_start.elapsed();
9391        }
9392
9393        if !defer_analytics_updates {
9394            let analytics_start = Instant::now();
9395            franken_update_daily_stats_in_tx(
9396                self,
9397                &tx,
9398                &conv.agent_slug,
9399                &conv.source_id,
9400                conversation_effective_started_at(conv),
9401                StatsDelta {
9402                    session_count_delta: 1,
9403                    message_count_delta: inserted_indices.len() as i64,
9404                    total_chars_delta: total_chars,
9405                },
9406            )?;
9407            profile.analytics_duration += analytics_start.elapsed();
9408        }
9409
9410        let commit_start = Instant::now();
9411        tx.commit()?;
9412        profile.commit_duration += commit_start.elapsed();
9413        profile.invocations += 1;
9414        profile.messages += conv.messages.len();
9415        profile.inserted_messages += inserted_indices.len();
9416        profile.total_duration += total_start.elapsed();
9417
9418        Ok(InsertOutcome {
9419            conversation_id: conv_id,
9420            conversation_inserted: true,
9421            inserted_indices,
9422        })
9423    }
9424
    /// Test-only helper: appends the not-yet-stored messages of `conv` to a
    /// conversation that already exists, inside a transaction it opens and
    /// commits itself, while accumulating per-phase wall-clock timings into
    /// `profile`.
    ///
    /// `_workspace_id` is accepted but not used by this helper.
    ///
    /// # Errors
    /// Fails when no stored conversation matches the merge key derived from
    /// `agent_id` and `conv`, or when any storage step returns an error.
    ///
    /// # Returns
    /// An `InsertOutcome` with `conversation_inserted: false`, the id of the
    /// existing conversation, and the `idx` of every message inserted here.
    #[cfg(test)]
    fn append_existing_conversation_with_profile(
        &self,
        agent_id: i64,
        _workspace_id: Option<i64>,
        conv: &Conversation,
        profile: &mut InsertConversationTreePerfProfile,
    ) -> Result<InsertOutcome> {
        let total_start = Instant::now();
        // From here on `conv` refers to the storage-normalized conversation.
        let normalized_conv = normalized_conversation_for_storage(conv);
        let conv = normalized_conv.as_ref();

        // The source row is ensured before the transaction below is opened.
        let source_start = Instant::now();
        self.ensure_source_for_conversation(conv)?;
        profile.source_duration += source_start.elapsed();

        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
        let defer_analytics_updates = defer_analytics_updates_enabled();
        let conversation_key = conversation_merge_key(agent_id, conv);

        let tx_open_start = Instant::now();
        let mut tx = self.conn.transaction()?;
        profile.tx_open_duration += tx_open_start.elapsed();

        // This helper only handles the append path: a missing conversation is an error.
        let existing_lookup_start = Instant::now();
        let existing = franken_find_existing_conversation_with_tail_by_key(
            &tx,
            &conversation_key,
            Some(conv),
        )?;
        profile.existing_lookup_duration += existing_lookup_start.elapsed();
        let existing = existing.ok_or_else(|| {
            anyhow!("append profile helper expects existing conversation for {conversation_key:?}")
        })?;
        let existing_id = existing.id;

        // Try to build an append plan from the stored tail state alone; `None`
        // (no tail state, or the collector declined) selects the fallback below.
        let existing_idx_lookup_start = Instant::now();
        let append_tail_state = existing.tail_state;
        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
        let existing_plan = append_tail_state.as_ref().and_then(|state| {
            collect_append_only_tail_messages(
                conv,
                state.last_message_idx,
                state.last_message_created_at,
            )
        });
        let used_append_tail_plan = existing_plan.is_some();
        profile.existing_idx_lookup_duration += existing_idx_lookup_start.elapsed();

        // Fallback: look up the stored messages and filter the incoming ones
        // against them to find what is new.
        let dedupe_filter_start = Instant::now();
        let ExistingConversationNewMessages {
            messages: new_messages,
            new_chars,
            idx_collision_count,
            first_collision_idx,
        } = if let Some(existing_plan) = existing_plan {
            existing_plan
        } else {
            let ExistingMessageLookup {
                by_idx: mut existing_messages,
                replay: mut existing_replay_fingerprints,
            } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
            collect_new_messages_for_existing_conversation(
                existing_id,
                conv,
                &mut existing_messages,
                &mut existing_replay_fingerprints,
                "skipping replay-equivalent profiled append message with shifted idx",
            )
        };
        profile.dedupe_filter_duration += dedupe_filter_start.elapsed();

        let mut inserted_indices = Vec::new();
        let mut fts_entries = Vec::new();
        let mut fts_pending_chars = 0usize;
        let mut fts_inserted_total = 0usize;
        // Captured now because `new_messages` is moved into the zip below.
        let (inserted_last_idx, inserted_last_created_at) =
            borrowed_messages_tail_state(&new_messages);

        let message_insert_start = Instant::now();
        let inserted_message_ids = franken_append_insert_new_messages_with_profile(
            &tx,
            existing_id,
            &new_messages,
            &mut profile.message_insert_breakdown,
        )?;
        profile.message_insert_duration += message_insert_start.elapsed();

        // Pair each returned row id with its message to store snippets and,
        // unless lexical updates are deferred, queue an FTS entry.
        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
            let snippet_insert_start = Instant::now();
            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
            profile.snippet_insert_duration += snippet_insert_start.elapsed();

            if !defer_lexical_updates {
                let fts_entry_start = Instant::now();
                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
                profile.fts_entry_duration += fts_entry_start.elapsed();
                // Flush once either the document-count or the byte-size batch limit is hit.
                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    let fts_flush_start = Instant::now();
                    flush_pending_fts_entries(
                        self,
                        &tx,
                        &mut fts_entries,
                        &mut fts_pending_chars,
                        &mut fts_inserted_total,
                    )?;
                    profile.fts_flush_duration += fts_flush_start.elapsed();
                }
            }

            inserted_indices.push(msg.idx);
        }

        if idx_collision_count > 0 {
            tracing::warn!(
                conversation_id = existing_id,
                collision_count = idx_collision_count,
                first_idx = first_collision_idx,
                source_path = %conv.source_path.display(),
                "message idx collisions encountered while profiling append merge; retaining canonical message variants"
            );
        }

        // Final flush for whatever is still queued after the loop.
        if !defer_lexical_updates {
            let fts_flush_start = Instant::now();
            flush_pending_fts_entries(
                self,
                &tx,
                &mut fts_entries,
                &mut fts_pending_chars,
                &mut fts_inserted_total,
            )?;
            profile.fts_flush_duration += fts_flush_start.elapsed();
        }

        // Refresh the conversation's stored tail state.
        let conversation_row_start = Instant::now();
        let mut exact_append_tail_set = false;
        if used_append_tail_plan {
            // Only touch the tail when the inserted batch yielded both a last
            // idx and a last timestamp.
            if let (Some(last_message_idx), Some(last_message_created_at)) =
                (inserted_last_idx, inserted_last_created_at)
            {
                // Use the "set" helper when the stored `ended_at` is absent or
                // not later than the new last timestamp; otherwise use the
                // "update" helper.
                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
                    franken_set_conversation_tail_state_after_append(
                        &tx,
                        existing_id,
                        last_message_created_at,
                        last_message_idx,
                        last_message_created_at,
                    )?;
                    exact_append_tail_set = true;
                } else {
                    franken_update_conversation_tail_state(
                        &tx,
                        existing_id,
                        Some(last_message_created_at),
                        inserted_last_idx,
                        inserted_last_created_at,
                    )?;
                }
            }
        } else {
            // Fallback path: pass the newest `created_at` across all incoming
            // messages (not just the inserted ones).
            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
            franken_update_conversation_tail_state(
                &tx,
                existing_id,
                conv_last_ts,
                inserted_last_idx,
                inserted_last_created_at,
            )?;
        }
        franken_update_external_conversation_tail_after_append(
            &tx,
            agent_id,
            conv,
            used_append_tail_plan,
            exact_append_tail_set,
            inserted_last_idx,
            inserted_last_created_at,
        )?;
        profile.conversation_row_duration += conversation_row_start.elapsed();

        // Daily stats change only when something was inserted; the session
        // delta is 0, so only message and character totals move.
        if !defer_analytics_updates && !inserted_indices.is_empty() {
            let analytics_start = Instant::now();
            franken_update_daily_stats_in_tx(
                self,
                &tx,
                &conv.agent_slug,
                &conv.source_id,
                conversation_effective_started_at(conv),
                StatsDelta {
                    session_count_delta: 0,
                    message_count_delta: inserted_indices.len() as i64,
                    total_chars_delta: new_chars,
                },
            )?;
            profile.analytics_duration += analytics_start.elapsed();
        }

        let commit_start = Instant::now();
        tx.commit()?;
        profile.commit_duration += commit_start.elapsed();
        profile.invocations += 1;
        // `messages` counts every incoming message; `inserted_messages` only the new ones.
        profile.messages += conv.messages.len();
        profile.inserted_messages += inserted_indices.len();
        profile.total_duration += total_start.elapsed();

        Ok(InsertOutcome {
            conversation_id: existing_id,
            conversation_inserted: false,
            inserted_indices,
        })
    }
9640
    /// Append new messages to an existing conversation within an active transaction.
    ///
    /// The caller owns `tx`; this method neither opens nor commits it.
    ///
    /// * `conversation_id` - id of the already-stored conversation.
    /// * `conv` - incoming conversation whose messages are candidates for insertion.
    /// * `append_tail_state` - stored tail state, if known; used to try an
    ///   append-only plan before falling back to a lookup of stored messages.
    /// * `defer_lexical_updates` - when true, no FTS entries are built or flushed.
    /// * `defer_analytics_updates` - when true, daily stats are left untouched.
    ///
    /// Returns an `InsertOutcome` with `conversation_inserted: false` and the
    /// `idx` of each inserted message.
    #[allow(clippy::too_many_arguments)]
    fn franken_append_messages_with_tail_in_tx(
        &self,
        tx: &FrankenTransaction<'_>,
        agent_id: i64,
        conversation_id: i64,
        conv: &Conversation,
        append_tail_state: Option<ExistingConversationTailState>,
        defer_lexical_updates: bool,
        defer_analytics_updates: bool,
    ) -> Result<InsertOutcome> {
        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
        // Try to build an append plan from the tail state alone; `None` (no
        // tail state, or the collector declined) selects the fallback below.
        let append_plan = append_tail_state.as_ref().and_then(|state| {
            collect_append_only_tail_messages(
                conv,
                state.last_message_idx,
                state.last_message_created_at,
            )
        });
        let used_append_tail_plan = append_plan.is_some();
        let ExistingConversationNewMessages {
            messages: new_messages,
            new_chars,
            idx_collision_count,
            first_collision_idx,
        } = if let Some(append_plan) = append_plan {
            append_plan
        } else {
            // Fallback: look up the stored messages and filter the incoming
            // ones against them to find what is new.
            let ExistingMessageLookup {
                by_idx: mut existing_messages,
                replay: mut existing_replay_fingerprints,
            } = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
            collect_new_messages_for_existing_conversation(
                conversation_id,
                conv,
                &mut existing_messages,
                &mut existing_replay_fingerprints,
                "skipping replay-equivalent recovered message with shifted idx",
            )
        };

        let mut inserted_indices = Vec::new();
        let mut fts_entries = Vec::new();
        let mut fts_pending_chars = 0usize;
        // Written by the flush helper but never read in this function.
        let mut _fts_inserted_total = 0usize;
        // Captured now because `new_messages` is moved into the zip below.
        let (inserted_last_idx, inserted_last_created_at) =
            borrowed_messages_tail_state(&new_messages);
        let inserted_message_ids =
            franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
            franken_insert_snippets(tx, msg_id, &msg.snippets)?;
            if !defer_lexical_updates {
                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
                // Flush once either the document-count or the byte-size batch limit is hit.
                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    flush_pending_fts_entries(
                        self,
                        tx,
                        &mut fts_entries,
                        &mut fts_pending_chars,
                        &mut _fts_inserted_total,
                    )?;
                }
            }
            inserted_indices.push(msg.idx);
        }

        if idx_collision_count > 0 {
            tracing::warn!(
                conversation_id,
                collision_count = idx_collision_count,
                first_idx = first_collision_idx,
                source_path = %conv.source_path.display(),
                "message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
            );
        }

        // Final flush for whatever is still queued after the loop.
        if !defer_lexical_updates {
            flush_pending_fts_entries(
                self,
                tx,
                &mut fts_entries,
                &mut fts_pending_chars,
                &mut _fts_inserted_total,
            )?;
        }

        let mut exact_append_tail_set = false;
        if used_append_tail_plan {
            // Only touch the tail when the inserted batch yielded both a last
            // idx and a last timestamp.
            if let (Some(last_message_idx), Some(last_message_created_at)) =
                (inserted_last_idx, inserted_last_created_at)
            {
                // Use the "set" helper when the stored `ended_at` is absent or
                // not later than the new last timestamp; otherwise use the
                // "update" helper.
                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
                    franken_set_conversation_tail_state_after_append(
                        tx,
                        conversation_id,
                        last_message_created_at,
                        last_message_idx,
                        last_message_created_at,
                    )?;
                    exact_append_tail_set = true;
                } else {
                    franken_update_conversation_tail_state(
                        tx,
                        conversation_id,
                        Some(last_message_created_at),
                        inserted_last_idx,
                        inserted_last_created_at,
                    )?;
                }
            }
        } else {
            // Fallback path: pass the newest `created_at` across all incoming
            // messages (not just the inserted ones).
            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
            franken_update_conversation_tail_state(
                tx,
                conversation_id,
                conv_last_ts,
                inserted_last_idx,
                inserted_last_created_at,
            )?;
        }
        franken_update_external_conversation_tail_after_append(
            tx,
            agent_id,
            conv,
            used_append_tail_plan,
            exact_append_tail_set,
            inserted_last_idx,
            inserted_last_created_at,
        )?;

        // Daily stats change only when something was inserted; the session
        // delta is 0, so only message and character totals move.
        if !defer_analytics_updates && !inserted_indices.is_empty() {
            let message_count = inserted_indices.len() as i64;
            franken_update_daily_stats_in_tx(
                self,
                tx,
                &conv.agent_slug,
                &conv.source_id,
                conversation_effective_started_at(conv),
                StatsDelta {
                    session_count_delta: 0,
                    message_count_delta: message_count,
                    total_chars_delta: new_chars,
                },
            )?;
        }

        Ok(InsertOutcome {
            conversation_id,
            conversation_inserted: false,
            inserted_indices,
        })
    }
9797
9798    /// Rebuild the FTS5 index from scratch (chunked to avoid OOM on large databases, #110).
9799    pub fn rebuild_fts(&self) -> Result<()> {
9800        self.rebuild_fts_via_frankensqlite().map(|_| ())
9801    }
9802
9803    /// Best-effort repair for the derived SQLite FTS fallback index.
9804    ///
9805    /// The canonical archive and Tantivy index remain authoritative, so callers
9806    /// should invoke this from maintenance paths rather than ordinary opens.
9807    pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
9808        self.ensure_fts_consistency_via_frankensqlite()
9809    }
9810
9811    pub(crate) fn validate_fts_messages_integrity(&self) -> Result<()> {
9812        validate_fts_messages_integrity_for_connection(&self.conn)
9813    }
9814
9815    pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
9816        &self,
9817        archive_fingerprint: &str,
9818    ) -> Result<bool> {
9819        Ok(
9820            self.read_fts_franken_rebuild_generation()? == Some(FTS_FRANKEN_REBUILD_GENERATION)
9821                && self
9822                    .read_fts_franken_rebuild_archive_fingerprint()?
9823                    .as_deref()
9824                    == Some(archive_fingerprint),
9825        )
9826    }
9827
9828    pub(crate) fn record_search_fallback_fts_archive_fingerprint(
9829        &self,
9830        archive_fingerprint: &str,
9831    ) -> Result<()> {
9832        self.conn
9833            .execute_compat(
9834                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9835                fparams![
9836                    FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY,
9837                    archive_fingerprint.to_string()
9838                ],
9839            )
9840            .with_context(|| "recording frankensqlite FTS archive fingerprint")?;
9841        Ok(())
9842    }
9843
9844    pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
9845        &self,
9846        archive_fingerprint: &str,
9847    ) -> Result<bool> {
9848        Ok(
9849            self.read_daily_stats_health_generation()? == Some(DAILY_STATS_HEALTH_GENERATION)
9850                && self.read_daily_stats_archive_fingerprint()?.as_deref()
9851                    == Some(archive_fingerprint),
9852        )
9853    }
9854
9855    pub(crate) fn record_daily_stats_archive_fingerprint(
9856        &self,
9857        archive_fingerprint: &str,
9858    ) -> Result<()> {
9859        self.conn
9860            .execute_compat(
9861                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9862                fparams![
9863                    DAILY_STATS_HEALTH_GENERATION_META_KEY,
9864                    DAILY_STATS_HEALTH_GENERATION.to_string()
9865                ],
9866            )
9867            .with_context(|| "recording daily_stats health generation")?;
9868        self.conn
9869            .execute_compat(
9870                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9871                fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
9872            )
9873            .with_context(|| "recording daily_stats archive fingerprint")?;
9874        Ok(())
9875    }
9876
9877    fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
9878        let value: Option<String> = self
9879            .conn
9880            .query_row_map(
9881                "SELECT value FROM meta WHERE key = ?1",
9882                fparams![FTS_FRANKEN_REBUILD_META_KEY],
9883                |row| row.get_typed(0),
9884            )
9885            .optional()?;
9886        Ok(value.and_then(|v| v.parse::<i64>().ok()))
9887    }
9888
9889    fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
9890        Ok(self
9891            .conn
9892            .query_row_map(
9893                "SELECT value FROM meta WHERE key = ?1",
9894                fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
9895                |row| row.get_typed(0),
9896            )
9897            .optional()?)
9898    }
9899
9900    fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
9901        let value: Option<String> = self
9902            .conn
9903            .query_row_map(
9904                "SELECT value FROM meta WHERE key = ?1",
9905                fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
9906                |row| row.get_typed(0),
9907            )
9908            .optional()?;
9909        Ok(value.and_then(|value| value.parse::<i64>().ok()))
9910    }
9911
9912    fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
9913        Ok(self
9914            .conn
9915            .query_row_map(
9916                "SELECT value FROM meta WHERE key = ?1",
9917                fparams![DAILY_STATS_HEALTH_META_KEY],
9918                |row| row.get_typed(0),
9919            )
9920            .optional()?)
9921    }
9922
9923    fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
9924        self.conn
9925            .execute_compat(
9926                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9927                fparams![
9928                    FTS_FRANKEN_REBUILD_META_KEY,
9929                    FTS_FRANKEN_REBUILD_GENERATION.to_string()
9930                ],
9931            )
9932            .with_context(|| "recording frankensqlite FTS rebuild generation")?;
9933        Ok(())
9934    }
9935
9936    fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
9937        if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
9938            // Before triggering an expensive full rebuild, probe whether
9939            // fts_messages is already populated and consistent.  On large
9940            // databases the rebuild can take hours and OOM — skip it when
9941            // the only thing missing is the generation marker (#184).
9942            let fts_already_healthy = (|| -> Result<bool> {
9943                let fts_exists: i64 = self.conn.query_row_map(
9944                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9945                    fparams![],
9946                    |row| row.get_typed(0),
9947                )?;
9948                if fts_exists != 1 {
9949                    return Ok(false);
9950                }
9951                let total: i64 = self.conn.query_row_map(
9952                    "SELECT COUNT(*) FROM messages",
9953                    fparams![],
9954                    |row| row.get_typed(0),
9955                )?;
9956                if total == 0 {
9957                    return Ok(false);
9958                }
9959                let indexed: i64 = self.conn.query_row_map(
9960                    "SELECT COUNT(*) FROM fts_messages",
9961                    fparams![],
9962                    |row| row.get_typed(0),
9963                )?;
9964                // Consider healthy if >=90% of messages are indexed
9965                Ok(indexed > 0 && indexed * 100 >= total * 90)
9966            })()
9967            .unwrap_or(false);
9968
9969            if fts_already_healthy {
9970                tracing::info!(
9971                    target: "cass::fts_rebuild",
9972                    "FTS already populated and consistent; setting generation marker without rebuild"
9973                );
9974                self.record_fts_franken_rebuild_generation()?;
9975                self.set_fts_messages_present_cache(true);
9976            } else {
9977                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
9978                self.record_fts_franken_rebuild_generation()?;
9979                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
9980            }
9981        }
9982
9983        let inspection = (|| -> Result<(i64, bool)> {
9984            let fts_schema_rows = self.conn.query_row_map(
9985                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
9986                fparams![],
9987                |row| row.get_typed::<i64>(0),
9988            )?;
9989            let fts_queryable = fts_schema_rows == 1
9990                && self.conn.query("SELECT COUNT(*) FROM fts_messages").is_ok();
9991            Ok((fts_schema_rows, fts_queryable))
9992        })();
9993
9994        let (fts_schema_rows, fts_queryable) = match inspection {
9995            Ok(result) => result,
9996            Err(err) => {
9997                tracing::warn!(
9998                    error = %err,
9999                    "frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
10000                );
10001                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10002                self.record_fts_franken_rebuild_generation()?;
10003                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10004            }
10005        };
10006
10007        if fts_schema_rows != 1 || !fts_queryable {
10008            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10009            self.record_fts_franken_rebuild_generation()?;
10010            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10011        }
10012
10013        let total_messages =
10014            self.conn
10015                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
10016                    row.get_typed::<i64>(0)
10017                })?;
10018        let indexed_messages =
10019            self.conn
10020                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
10021                    row.get_typed::<i64>(0)
10022                })?;
10023
10024        if indexed_messages == total_messages {
10025            self.set_fts_messages_present_cache(true);
10026            return Ok(FtsConsistencyRepair::AlreadyHealthy {
10027                rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
10028            });
10029        }
10030
10031        if indexed_messages > total_messages {
10032            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10033            self.record_fts_franken_rebuild_generation()?;
10034            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
10035        }
10036
10037        let inserted_rows = self
10038            .stream_fts_rows_via_frankensqlite(true)
10039            .with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
10040        let repaired_rows =
10041            self.conn
10042                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
10043                    row.get_typed::<i64>(0)
10044                })?;
10045        if repaired_rows == total_messages {
10046            self.set_fts_messages_present_cache(true);
10047            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
10048                inserted_rows,
10049                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
10050            });
10051        }
10052
10053        // The incremental catch-up found nothing to insert, yet the gap
10054        // between total_messages (all rows, including orphans) and
10055        // indexed_messages (only rows with valid conversation_id, since the
10056        // FTS INSERT inner-joins on conversations) remains.  A full rebuild
10057        // cannot close this gap either — the orphaned messages will be
10058        // excluded again — so falling through to one would just re-do ~5 min
10059        // of work on every startup.  Accept the current state.
10060        if inserted_rows == 0 {
10061            tracing::debug!(
10062                target: "cass::fts_rebuild",
10063                indexed_messages = repaired_rows,
10064                total_messages,
10065                un_indexable_gap = total_messages.saturating_sub(repaired_rows),
10066                "FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
10067            );
10068            self.set_fts_messages_present_cache(true);
10069            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
10070                inserted_rows: 0,
10071                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
10072            });
10073        }
10074
10075        // Incremental made progress but didn't fully close the gap — something
10076        // is genuinely inconsistent, so do a full rebuild.
10077        let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
10078        self.record_fts_franken_rebuild_generation()?;
10079        Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
10080    }
10081
10082    pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
10083        self.invalidate_fts_messages_present_cache();
10084        self.conn
10085            .execute("DROP TABLE IF EXISTS fts_messages;")
10086            .with_context(|| "dropping derived fts_messages before frankensqlite rebuild")?;
10087        self.conn
10088            .execute_compat(FTS5_REGISTER_SQL, fparams![])
10089            .with_context(|| "creating derived fts_messages via frankensqlite rebuild")?;
10090        self.set_fts_messages_present_cache(true);
10091
10092        self.stream_fts_rows_via_frankensqlite(false)
10093    }
10094
    /// Stream rows from `messages` into `fts_messages` in rowid-ordered pages.
    ///
    /// When `missing_only` is true, the rowids already present in
    /// `fts_messages` are loaded up front and any message whose id is in that
    /// set is skipped; otherwise every fetched message is considered.
    /// Messages whose `conversation_id` has no entry in the conversation
    /// projection map are counted as orphans and skipped.
    ///
    /// Returns the sum of the counts reported by
    /// `franken_batch_insert_fts_on_connection` across all flushes.
    ///
    /// # Errors
    ///
    /// Propagates any error from loading the lookup maps, fetching a page of
    /// messages, or inserting a batch of FTS entries.
    fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
        // Page size is clamped to at least 1 so the pagination loop always advances.
        let batch_size = fts_rebuild_batch_size().max(1);
        let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
        let mut total_inserted: usize = 0;
        let mut total_skipped_orphans: usize = 0;
        let mut total_skipped_existing: usize = 0;
        // Pagination cursor: each page fetches rows with rowid strictly greater than this.
        let mut last_rowid: i64 = 0;
        // Lookup tables are loaded once so each message is resolved in memory.
        let conversation_by_id = self.load_fts_conversation_projection_map()?;
        let agent_slug_by_id = self.load_fts_agent_slug_map()?;
        let workspace_path_by_id = self.load_fts_workspace_path_map()?;
        // Only needed for incremental catch-up; a full rebuild starts from an empty table.
        let existing_fts_rowids = if missing_only {
            Some(self.load_fts_message_rowid_set()?)
        } else {
            None
        };
        // Pending FTS entries and the byte length of their content, used as flush thresholds.
        let mut entries = Vec::new();
        let mut pending_chars = 0usize;

        loop {
            let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
            let fetched_count = rows.len();
            if fetched_count == 0 {
                break;
            }

            // Snapshot the running totals so per-batch deltas can be logged below.
            let inserted_before_batch = total_inserted;
            let skipped_before_batch = total_skipped_orphans;
            let existing_before_batch = total_skipped_existing;

            for row in rows {
                // Advance the cursor even for rows that end up skipped.
                last_rowid = row.rowid;
                if existing_fts_rowids
                    .as_ref()
                    .is_some_and(|rowids| rowids.contains(&row.message_id))
                {
                    total_skipped_existing = total_skipped_existing.saturating_add(1);
                    continue;
                }
                // No matching conversation: the message is treated as an orphan and not indexed.
                let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
                    total_skipped_orphans = total_skipped_orphans.saturating_add(1);
                    continue;
                };
                // A missing agent id, unknown agent id, or empty slug all map to "unknown".
                let agent = conversation
                    .agent_id
                    .and_then(|agent_id| agent_slug_by_id.get(&agent_id))
                    .filter(|slug| !slug.is_empty())
                    .cloned()
                    .unwrap_or_else(|| "unknown".to_string());
                // A missing or unknown workspace falls back to an empty path.
                let workspace = conversation
                    .workspace_id
                    .and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
                    .cloned()
                    .unwrap_or_default();
                pending_chars = pending_chars.saturating_add(row.content.len());
                entries.push(FtsEntry {
                    content: row.content,
                    title: conversation.title.clone(),
                    agent,
                    workspace,
                    source_path: conversation.source_path.clone(),
                    created_at: row.created_at,
                    message_id: row.message_id,
                });
                // Flush as soon as either the document-count or content-size threshold is hit.
                if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    total_inserted = total_inserted.saturating_add(
                        franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
                    );
                    entries.clear();
                    pending_chars = 0;
                }
            }

            // Flush whatever is left from this page so nothing carries over to the next fetch.
            if !entries.is_empty() {
                total_inserted = total_inserted.saturating_add(
                    franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
                );
                entries.clear();
                pending_chars = 0;
            }

            tracing::debug!(
                target: "cass::fts_rebuild",
                batch_rows = fetched_count,
                batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
                batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
                batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
                total_inserted,
                total_skipped_orphans,
                total_skipped_existing,
                last_rowid,
                missing_only,
                "FTS streaming maintenance batch complete"
            );

            // A short page means the table is exhausted; skip the extra empty fetch.
            if fetched_count < batch_size {
                break;
            }
        }

        Ok(total_inserted)
    }
10198
10199    fn fetch_fts_rebuild_message_rows(
10200        &self,
10201        last_rowid: i64,
10202        batch_limit: i64,
10203    ) -> Result<Vec<FtsRebuildMessageRow>> {
10204        self.conn
10205            .query_map_collect(
10206                "SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
10207                 FROM messages m
10208                 WHERE m.rowid > ?1
10209                 ORDER BY m.rowid
10210                 LIMIT ?2",
10211                fparams![last_rowid, batch_limit],
10212                |row| {
10213                    Ok(FtsRebuildMessageRow {
10214                        rowid: row.get_typed(0)?,
10215                        message_id: row.get_typed(1)?,
10216                        conversation_id: row.get_typed(2)?,
10217                        content: row.get_typed::<Option<String>>(3)?.unwrap_or_default(),
10218                        created_at: row.get_typed(4)?,
10219                    })
10220                },
10221            )
10222            .with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
10223    }
10224
10225    fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
10226        let rows: Vec<i64> = self
10227            .conn
10228            .query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
10229                row.get_typed(0)
10230            })
10231            .with_context(|| "loading existing FTS message rowids")?;
10232        Ok(rows.into_iter().collect())
10233    }
10234
10235    fn load_fts_conversation_projection_map(
10236        &self,
10237    ) -> Result<HashMap<i64, FtsConversationProjection>> {
10238        let rows: Vec<(i64, FtsConversationProjection)> = self
10239            .conn
10240            .query_map_collect(
10241                "SELECT id, title, agent_id, workspace_id, source_path
10242                 FROM conversations",
10243                fparams![],
10244                |row| {
10245                    Ok((
10246                        row.get_typed(0)?,
10247                        FtsConversationProjection {
10248                            title: row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
10249                            agent_id: row.get_typed(2)?,
10250                            workspace_id: row.get_typed(3)?,
10251                            source_path: row.get_typed::<Option<String>>(4)?.unwrap_or_default(),
10252                        },
10253                    ))
10254                },
10255            )
10256            .with_context(|| "loading FTS conversation projection map")?;
10257        Ok(rows.into_iter().collect())
10258    }
10259
10260    fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
10261        let rows: Vec<(i64, String)> = self
10262            .conn
10263            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
10264                Ok((
10265                    row.get_typed(0)?,
10266                    row.get_typed::<Option<String>>(1)?
10267                        .unwrap_or_else(|| "unknown".to_string()),
10268                ))
10269            })
10270            .with_context(|| "loading FTS agent slug map")?;
10271        Ok(rows.into_iter().collect())
10272    }
10273
10274    fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
10275        let rows: Vec<(i64, String)> = self
10276            .conn
10277            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
10278                Ok((
10279                    row.get_typed(0)?,
10280                    row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
10281                ))
10282            })
10283            .with_context(|| "loading FTS workspace path map")?;
10284        Ok(rows.into_iter().collect())
10285    }
10286
10287    /// Fetch all messages for embedding generation.
10288    pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
10289        // COALESCE(c.agent_id, 0) so legacy V1 conversations with NULL
10290        // agent_id don't cause a runtime row-decode failure (agent_id in
10291        // MessageForEmbedding is i64).  saturating_u32_from_i64 downstream
10292        // turns 0 into the "unknown agent" sentinel for doc-id hashing.
10293        self.conn
10294            .query_map_collect(
10295                "SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
10296                 FROM messages m
10297                 JOIN conversations c ON m.conversation_id = c.id
10298                 ORDER BY m.id",
10299                fparams![],
10300                |row| {
10301                    let source_id: String = row.get_typed::<Option<String>>(4)?
10302                        .unwrap_or_else(|| "local".to_string());
10303                    Ok(MessageForEmbedding {
10304                        message_id: row.get_typed(0)?,
10305                        created_at: row.get_typed(1)?,
10306                        agent_id: row.get_typed(2)?,
10307                        workspace_id: row.get_typed(3)?,
10308                        source_id_hash: crc32fast::hash(source_id.as_bytes()),
10309                        role: row.get_typed(5)?,
10310                        content: row.get_typed(6)?,
10311                    })
10312                },
10313            )
10314            .with_context(|| "fetching messages for embedding")
10315    }
10316
10317    /// Get the watermark for incremental semantic embedding.
10318    pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
10319        let result: Result<String, _> = self.conn.query_row_map(
10320            "SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
10321            fparams![],
10322            |row| row.get_typed(0),
10323        );
10324        match result.optional() {
10325            Ok(Some(s)) => Ok(s.parse().ok()),
10326            Ok(None) => Ok(None),
10327            Err(e) => Err(e.into()),
10328        }
10329    }
10330
10331    /// Set the watermark for incremental semantic embedding.
10332    pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
10333        self.conn.execute_compat(
10334            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)",
10335            fparams![id.to_string()],
10336        )?;
10337        Ok(())
10338    }
10339
10340    /// Get embedding jobs for a database path.
10341    pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
10342        self.conn
10343            .query_map_collect(
10344                "SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
10345                 FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
10346                fparams![db_path],
10347                |row| {
10348                    Ok(EmbeddingJobRow {
10349                        id: row.get_typed(0)?,
10350                        db_path: row.get_typed(1)?,
10351                        model_id: row.get_typed(2)?,
10352                        status: row.get_typed(3)?,
10353                        total_docs: row.get_typed(4)?,
10354                        completed_docs: row.get_typed(5)?,
10355                        error_message: row.get_typed(6)?,
10356                        created_at: row.get_typed(7)?,
10357                        started_at: row.get_typed(8)?,
10358                        completed_at: row.get_typed(9)?,
10359                    })
10360                },
10361            )
10362            .with_context(|| format!("fetching embedding jobs for {db_path}"))
10363    }
10364
10365    /// Create or update an embedding job.
10366    pub fn upsert_embedding_job(
10367        &self,
10368        db_path: &str,
10369        model_id: &str,
10370        total_docs: i64,
10371    ) -> Result<i64> {
10372        let updated = self.conn.execute_compat(
10373            "UPDATE embedding_jobs
10374             SET total_docs = ?3
10375             WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10376            fparams![db_path, model_id, total_docs],
10377        )?;
10378        if updated == 0 {
10379            let insert_result = self.conn.execute_compat(
10380                "INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
10381                fparams![db_path, model_id, total_docs],
10382            );
10383            if let Err(err) = insert_result {
10384                if !matches!(err, frankensqlite::FrankenError::UniqueViolation { .. }) {
10385                    return Err(err.into());
10386                }
10387                self.conn.execute_compat(
10388                    "UPDATE embedding_jobs
10389                     SET total_docs = ?3
10390                     WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10391                    fparams![db_path, model_id, total_docs],
10392                )?;
10393            }
10394        }
10395        self.conn
10396            .query_row_map(
10397                "SELECT id FROM embedding_jobs
10398                 WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
10399                 ORDER BY id DESC
10400                 LIMIT 1",
10401                fparams![db_path, model_id],
10402                |row| row.get_typed(0),
10403            )
10404            .with_context(|| "resolving embedding job id after upsert")
10405    }
10406
10407    /// Mark an embedding job as started.
10408    pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
10409        self.conn.execute_compat(
10410            "UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1",
10411            fparams![job_id],
10412        )?;
10413        Ok(())
10414    }
10415
10416    /// Mark an embedding job as completed.
10417    pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
10418        self.conn.execute_compat(
10419            "UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1",
10420            fparams![job_id],
10421        )?;
10422        Ok(())
10423    }
10424
10425    /// Mark an embedding job as failed.
10426    pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
10427        self.conn.execute_compat(
10428            "UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
10429            fparams![job_id, error],
10430        )?;
10431        Ok(())
10432    }
10433
10434    /// Cancel embedding jobs for a database path.
10435    pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
10436        if let Some(mid) = model_id {
10437            Ok(self.conn.execute_compat(
10438                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10439                fparams![db_path, mid],
10440            )?)
10441        } else {
10442            Ok(self.conn.execute_compat(
10443                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
10444                fparams![db_path],
10445            )?)
10446        }
10447    }
10448
10449    /// Update embedding job progress.
10450    pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
10451        self.conn.execute_compat(
10452            "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1",
10453            fparams![job_id, completed_docs],
10454        )?;
10455        Ok(())
10456    }
10457
10458    // =====================================================================
10459    // Analytics query methods
10460    // =====================================================================
10461
10462    /// Get session count for a date range using materialized stats.
10463    /// Returns (count, is_from_cache) where is_from_cache is true if from daily_stats.
10464    ///
10465    /// Falls back to COUNT(*) query when daily_stats table is empty or stale.
10466    pub fn count_sessions_in_range(
10467        &self,
10468        start_ts_ms: Option<i64>,
10469        end_ts_ms: Option<i64>,
10470        agent_slug: Option<&str>,
10471        source_id: Option<&str>,
10472    ) -> Result<(i64, bool)> {
10473        let agent = agent_slug.unwrap_or("all");
10474        let source = source_id.unwrap_or("all");
10475
10476        // Check if we have materialized stats
10477        let stats_count: i64 = self
10478            .conn
10479            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10480                row.get_typed(0)
10481            })
10482            .unwrap_or(0);
10483
10484        if stats_count == 0 {
10485            return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
10486        }
10487
10488        // Use materialized stats
10489        let start_day = start_ts_ms.map(Self::day_id_from_millis);
10490        let end_day = end_ts_ms.map(Self::day_id_from_millis);
10491
10492        let count: i64 = match (start_day, end_day) {
10493            (Some(start), Some(end)) => self.conn.query_row_map(
10494                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10495                 WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
10496                fparams![start, end, agent, source],
10497                |row| row.get_typed(0),
10498            )?,
10499            (Some(start), None) => self.conn.query_row_map(
10500                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10501                 WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
10502                fparams![start, agent, source],
10503                |row| row.get_typed(0),
10504            )?,
10505            (None, Some(end)) => self.conn.query_row_map(
10506                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10507                 WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
10508                fparams![end, agent, source],
10509                |row| row.get_typed(0),
10510            )?,
10511            (None, None) => self.conn.query_row_map(
10512                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10513                 WHERE agent_slug = ?1 AND source_id = ?2",
10514                fparams![agent, source],
10515                |row| row.get_typed(0),
10516            )?,
10517        };
10518
10519        Ok((count, true))
10520    }
10521
10522    /// Direct COUNT(*) query as fallback when daily_stats is empty.
10523    fn count_sessions_direct(
10524        &self,
10525        start_ts_ms: Option<i64>,
10526        end_ts_ms: Option<i64>,
10527        agent_slug: Option<&str>,
10528        source_id: Option<&str>,
10529    ) -> Result<(i64, bool)> {
10530        // Build dynamic SQL with positional params.  Single-table scan of
10531        // conversations; filter on agent slug via an EXISTS subquery only
10532        // when that filter is actually requested.  This avoids the unneeded
10533        // 2-table JOIN (which also silently dropped legacy conversations
10534        // with NULL agent_id) and sidesteps frankensqlite's materialization
10535        // fallback entirely.
10536        let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
10537        let mut param_values: Vec<ParamValue> = Vec::new();
10538        let mut idx = 1;
10539
10540        if let Some(start) = start_ts_ms {
10541            sql.push_str(&format!(" AND c.started_at >= ?{idx}"));
10542            param_values.push(ParamValue::from(start));
10543            idx += 1;
10544        }
10545        if let Some(end) = end_ts_ms {
10546            sql.push_str(&format!(" AND c.started_at <= ?{idx}"));
10547            param_values.push(ParamValue::from(end));
10548            idx += 1;
10549        }
10550        if let Some(agent) = agent_slug
10551            && agent != "all"
10552        {
10553            sql.push_str(&format!(
10554                " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{idx})"
10555            ));
10556            param_values.push(ParamValue::from(agent));
10557            idx += 1;
10558        }
10559        if let Some(source) = source_id
10560            && source != "all"
10561        {
10562            sql.push_str(&format!(" AND c.source_id = ?{idx}"));
10563            param_values.push(ParamValue::from(source));
10564            let _ = idx; // suppress unused warning
10565        }
10566
10567        let count: i64 = self
10568            .conn
10569            .query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
10570        Ok((count, false))
10571    }
10572
10573    /// Get daily histogram data for a date range.
10574    pub fn get_daily_histogram(
10575        &self,
10576        start_ts_ms: i64,
10577        end_ts_ms: i64,
10578        agent_slug: Option<&str>,
10579        source_id: Option<&str>,
10580    ) -> Result<Vec<DailyCount>> {
10581        let start_day = Self::day_id_from_millis(start_ts_ms);
10582        let end_day = Self::day_id_from_millis(end_ts_ms);
10583        let agent = agent_slug.unwrap_or("all");
10584        let source = source_id.unwrap_or("all");
10585
10586        let rows = self.conn.query_map_collect(
10587            "SELECT day_id, session_count, message_count, total_chars
10588             FROM daily_stats
10589             WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
10590             ORDER BY day_id",
10591            fparams![start_day, end_day, agent, source],
10592            |row| {
10593                Ok(DailyCount {
10594                    day_id: row.get_typed(0)?,
10595                    sessions: row.get_typed(1)?,
10596                    messages: row.get_typed(2)?,
10597                    chars: row.get_typed(3)?,
10598                })
10599            },
10600        )?;
10601
10602        Ok(rows)
10603    }
10604
10605    /// Check health of daily stats table.
10606    pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
10607        let row_count: i64 =
10608            self.conn
10609                .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10610                    row.get_typed(0)
10611                })?;
10612
10613        let oldest_update: Option<i64> = self.conn.query_row_map(
10614            "SELECT MIN(last_updated) FROM daily_stats",
10615            fparams![],
10616            |row| row.get_typed(0),
10617        )?;
10618
10619        let conversation_count: i64 =
10620            self.conn
10621                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
10622                    row.get_typed(0)
10623                })?;
10624
10625        let materialized_total: i64 = self.conn.query_row_map(
10626            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10627                 WHERE agent_slug = 'all' AND source_id = 'all'",
10628            fparams![],
10629            |row| row.get_typed(0),
10630        )?;
10631
10632        Ok(DailyStatsHealth {
10633            populated: row_count > 0,
10634            row_count,
10635            oldest_update_ms: oldest_update,
10636            conversation_count,
10637            materialized_total,
10638            drift: (conversation_count - materialized_total).abs(),
10639        })
10640    }
10641
10642    /// Batch insert multiple conversations with full analytics (token usage,
10643    /// message metrics, rollups).  Frankensqlite equivalent of
10644    /// `SqliteStorage::insert_conversations_batched`.
10645    pub fn insert_conversations_batched(
10646        &self,
10647        conversations: &[(i64, Option<i64>, &Conversation)],
10648    ) -> Result<Vec<InsertOutcome>> {
10649        if conversations.is_empty() {
10650            return Ok(Vec::new());
10651        }
10652
10653        self.ensure_sources_for_batch(conversations)?;
10654
10655        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
10656        let defer_analytics_updates = defer_analytics_updates_enabled();
10657
10658        let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
10659            tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
10660            PricingTable { entries: Vec::new() }
10661        });
10662        let mut pricing_diag = PricingDiagnostics::default();
10663
10664        let mut tx = self.conn.transaction()?;
10665
10666        // Bug #167: Ensure all referenced agents, workspaces, and sources
10667        // exist inside the transaction so FK checks pass.  The caller resolves
10668        // IDs via ensure_agent / ensure_workspace / ensure_sources_for_batch
10669        // outside the transaction, but those autocommit writes may not be
10670        // visible inside the transaction snapshot in frankensqlite.  Re-verify
10671        // (and insert if missing) within the tx.
10672        ensure_agents_in_tx(&tx, conversations)?;
10673        ensure_workspaces_in_tx(&tx, conversations)?;
10674        ensure_sources_in_tx(&tx, conversations)?;
10675
10676        let mut outcomes = Vec::with_capacity(conversations.len());
10677        let mut fts_entries = Vec::new();
10678        let mut fts_pending_chars = 0usize;
10679        let mut fts_inserted_total = 0usize;
10680        let mut fts_count_total = 0usize;
10681        let mut stats = StatsAggregator::new();
10682        let mut token_stats = TokenStatsAggregator::new();
10683        let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
10684        let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
10685        let mut rollup_agg = AnalyticsRollupAggregator::new();
10686        let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
10687        let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
10688        let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
10689            HashMap::new();
10690        let mut pending_message_replay_fingerprints: HashMap<
10691            i64,
10692            HashSet<MessageReplayFingerprint>,
10693        > = HashMap::new();
10694
10695        for &(agent_id, workspace_id, raw_conv) in conversations {
10696            let normalized_conv = normalized_conversation_for_storage(raw_conv);
10697            let conv = normalized_conv.as_ref();
10698            let mut total_chars: i64 = 0;
10699            let mut inserted_indices = Vec::with_capacity(conv.messages.len());
10700            let mut inserted_messages: Vec<(i64, &Message)> =
10701                Vec::with_capacity(conv.messages.len());
10702            let mut session_count_delta = 1_i64;
10703            let conversation_key = conversation_merge_key(agent_id, conv);
10704
10705            let existing_conv_id = if let Some(existing_id) =
10706                pending_conversation_ids.get(&conversation_key)
10707            {
10708                Some(*existing_id)
10709            } else {
10710                let existing_id =
10711                    franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
10712                if let Some(existing_id) = existing_id {
10713                    pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10714                }
10715                existing_id
10716            };
10717
10718            let conv_id = if let Some(existing_id) = existing_conv_id {
10719                session_count_delta = 0;
10720                let ExistingMessageLookup {
10721                    by_idx: mut existing_messages,
10722                    replay: mut existing_replay_fingerprints,
10723                } = franken_existing_message_lookup_with_pending(
10724                    &tx,
10725                    existing_id,
10726                    &conv.messages,
10727                    &mut pending_message_fingerprints,
10728                    &mut pending_message_replay_fingerprints,
10729                )?;
10730                let ExistingConversationNewMessages {
10731                    messages: new_messages,
10732                    new_chars,
10733                    idx_collision_count,
10734                    first_collision_idx,
10735                } = collect_new_messages_for_existing_conversation(
10736                    existing_id,
10737                    conv,
10738                    &mut existing_messages,
10739                    &mut existing_replay_fingerprints,
10740                    "skipping replay-equivalent recovered message with shifted idx during batched merge",
10741                );
10742                let (inserted_last_idx, inserted_last_created_at) =
10743                    borrowed_messages_tail_state(&new_messages);
10744                let inserted_message_ids =
10745                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10746                total_chars += new_chars;
10747                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10748                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10749                    if !defer_lexical_updates {
10750                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10751                        fts_count_total += 1;
10752                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
10753                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10754                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10755                        {
10756                            flush_pending_fts_entries(
10757                                self,
10758                                &tx,
10759                                &mut fts_entries,
10760                                &mut fts_pending_chars,
10761                                &mut fts_inserted_total,
10762                            )?;
10763                        }
10764                    }
10765                    inserted_indices.push(msg.idx);
10766                    inserted_messages.push((msg_id, msg));
10767                }
10768
10769                if idx_collision_count > 0 {
10770                    tracing::warn!(
10771                        conversation_id = existing_id,
10772                        collision_count = idx_collision_count,
10773                        first_idx = first_collision_idx,
10774                        source_path = %conv.source_path.display(),
10775                        "message idx collisions encountered during batched conversation merge; retaining canonical message variants"
10776                    );
10777                }
10778
10779                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10780                franken_update_conversation_tail_state(
10781                    &tx,
10782                    existing_id,
10783                    conv_last_ts,
10784                    inserted_last_idx,
10785                    inserted_last_created_at,
10786                )?;
10787                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
10788                {
10789                    franken_update_external_conversation_tail_lookup_key(
10790                        &tx,
10791                        &lookup_key,
10792                        conv_last_ts,
10793                        inserted_last_idx,
10794                        inserted_last_created_at,
10795                    )?;
10796                }
10797
10798                pending_message_fingerprints.insert(existing_id, existing_messages);
10799                pending_message_replay_fingerprints
10800                    .insert(existing_id, existing_replay_fingerprints);
10801
10802                existing_id
10803            } else {
10804                match franken_insert_conversation_or_get_existing(
10805                    &tx,
10806                    agent_id,
10807                    workspace_id,
10808                    conv,
10809                )? {
10810                    ConversationInsertStatus::Inserted(new_conv_id) => {
10811                        pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
10812                        let pending_messages =
10813                            pending_message_fingerprints.entry(new_conv_id).or_default();
10814                        let pending_replay_fingerprints = pending_message_replay_fingerprints
10815                            .entry(new_conv_id)
10816                            .or_default();
10817                        let mut new_messages = Vec::new();
10818                        for msg in &conv.messages {
10819                            let incoming_replay = message_replay_fingerprint(msg);
10820                            if pending_messages.contains_key(&msg.idx)
10821                                || pending_replay_fingerprints.contains(&incoming_replay)
10822                            {
10823                                continue;
10824                            }
10825                            pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
10826                            pending_replay_fingerprints.insert(incoming_replay);
10827                            new_messages.push(msg);
10828                        }
10829                        let inserted_message_ids =
10830                            franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
10831                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10832                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10833                            if !defer_lexical_updates {
10834                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10835                                fts_count_total += 1;
10836                                fts_pending_chars =
10837                                    fts_pending_chars.saturating_add(msg.content.len());
10838                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10839                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10840                                {
10841                                    flush_pending_fts_entries(
10842                                        self,
10843                                        &tx,
10844                                        &mut fts_entries,
10845                                        &mut fts_pending_chars,
10846                                        &mut fts_inserted_total,
10847                                    )?;
10848                                }
10849                            }
10850                            total_chars += msg.content.len() as i64;
10851                            inserted_indices.push(msg.idx);
10852                            inserted_messages.push((msg_id, msg));
10853                        }
10854                        new_conv_id
10855                    }
10856                    ConversationInsertStatus::Existing(existing_id) => {
10857                        session_count_delta = 0;
10858                        pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10859                        let ExistingMessageLookup {
10860                            by_idx: mut existing_messages,
10861                            replay: mut existing_replay_fingerprints,
10862                        } = franken_existing_message_lookup_with_pending(
10863                            &tx,
10864                            existing_id,
10865                            &conv.messages,
10866                            &mut pending_message_fingerprints,
10867                            &mut pending_message_replay_fingerprints,
10868                        )?;
10869                        let ExistingConversationNewMessages {
10870                            messages: new_messages,
10871                            new_chars,
10872                            idx_collision_count,
10873                            first_collision_idx,
10874                        } = collect_new_messages_for_existing_conversation(
10875                            existing_id,
10876                            conv,
10877                            &mut existing_messages,
10878                            &mut existing_replay_fingerprints,
10879                            "skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
10880                        );
10881                        let (inserted_last_idx, inserted_last_created_at) =
10882                            borrowed_messages_tail_state(&new_messages);
10883                        let inserted_message_ids =
10884                            franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10885                        total_chars += new_chars;
10886                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10887                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10888                            if !defer_lexical_updates {
10889                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10890                                fts_count_total += 1;
10891                                fts_pending_chars =
10892                                    fts_pending_chars.saturating_add(msg.content.len());
10893                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10894                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10895                                {
10896                                    flush_pending_fts_entries(
10897                                        self,
10898                                        &tx,
10899                                        &mut fts_entries,
10900                                        &mut fts_pending_chars,
10901                                        &mut fts_inserted_total,
10902                                    )?;
10903                                }
10904                            }
10905                            inserted_indices.push(msg.idx);
10906                            inserted_messages.push((msg_id, msg));
10907                        }
10908
10909                        if idx_collision_count > 0 {
10910                            tracing::warn!(
10911                                conversation_id = existing_id,
10912                                collision_count = idx_collision_count,
10913                                first_idx = first_collision_idx,
10914                                source_path = %conv.source_path.display(),
10915                                "message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
10916                            );
10917                        }
10918
10919                        let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10920                        franken_update_conversation_tail_state(
10921                            &tx,
10922                            existing_id,
10923                            conv_last_ts,
10924                            inserted_last_idx,
10925                            inserted_last_created_at,
10926                        )?;
10927                        if let Some(lookup_key) =
10928                            conversation_external_lookup_key_for_conv(agent_id, conv)
10929                        {
10930                            franken_update_external_conversation_tail_lookup_key(
10931                                &tx,
10932                                &lookup_key,
10933                                conv_last_ts,
10934                                inserted_last_idx,
10935                                inserted_last_created_at,
10936                            )?;
10937                        }
10938
10939                        pending_message_fingerprints.insert(existing_id, existing_messages);
10940                        pending_message_replay_fingerprints
10941                            .insert(existing_id, existing_replay_fingerprints);
10942
10943                        existing_id
10944                    }
10945                }
10946            };
10947
10948            if !defer_analytics_updates {
10949                let delta = StatsDelta {
10950                    session_count_delta,
10951                    message_count_delta: inserted_messages.len() as i64,
10952                    total_chars_delta: total_chars,
10953                };
10954
10955                let effective_started_at = conversation_effective_started_at(conv);
10956                let day_id = effective_started_at
10957                    .map(FrankenStorage::day_id_from_millis)
10958                    .unwrap_or(0);
10959                stats.record_delta(
10960                    &conv.agent_slug,
10961                    &conv.source_id,
10962                    day_id,
10963                    delta.session_count_delta,
10964                    delta.message_count_delta,
10965                    delta.total_chars_delta,
10966                );
10967
10968                let conv_day_id = day_id;
10969                let mut session_model_family = String::from("unknown");
10970                let mut has_any_tokens = false;
10971
10972                for &(message_id, msg) in &inserted_messages {
10973                    let role_s = role_str(&msg.role);
10974                    let usage = if historical_raw_json(&msg.extra_json).is_some() {
10975                        crate::connectors::extract_tokens_for_agent(
10976                            &conv.agent_slug,
10977                            &serde_json::Value::Null,
10978                            &msg.content,
10979                            &role_s,
10980                        )
10981                    } else {
10982                        crate::connectors::extract_tokens_for_agent(
10983                            &conv.agent_slug,
10984                            &msg.extra_json,
10985                            &msg.content,
10986                            &role_s,
10987                        )
10988                    };
10989
10990                    let msg_ts = msg
10991                        .created_at
10992                        .or(conversation_effective_started_at(conv))
10993                        .unwrap_or(0);
10994                    let msg_day_id = if msg_ts > 0 {
10995                        FrankenStorage::day_id_from_millis(msg_ts)
10996                    } else {
10997                        conv_day_id
10998                    };
10999
11000                    let model_info = usage
11001                        .model_name
11002                        .as_deref()
11003                        .map(crate::connectors::normalize_model);
11004
11005                    let model_family = model_info
11006                        .as_ref()
11007                        .map(|i| i.family.clone())
11008                        .unwrap_or_else(|| "unknown".into());
11009                    let model_tier = model_info
11010                        .as_ref()
11011                        .map(|i| i.tier.clone())
11012                        .unwrap_or_else(|| "unknown".into());
11013                    let provider = usage
11014                        .provider
11015                        .clone()
11016                        .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
11017                        .unwrap_or_else(|| "unknown".into());
11018
11019                    if model_family != "unknown" {
11020                        session_model_family = model_family.clone();
11021                    }
11022
11023                    let estimated_cost = pricing_table.compute_cost(
11024                        usage.model_name.as_deref(),
11025                        msg_day_id,
11026                        usage.input_tokens,
11027                        usage.output_tokens,
11028                        usage.cache_read_tokens,
11029                        usage.cache_creation_tokens,
11030                    );
11031                    if estimated_cost.is_some() {
11032                        pricing_diag.record_priced();
11033                    } else if usage.has_token_data() {
11034                        pricing_diag.record_unpriced(usage.model_name.as_deref());
11035                    }
11036
11037                    token_stats.record(
11038                        &conv.agent_slug,
11039                        &conv.source_id,
11040                        msg_day_id,
11041                        &model_family,
11042                        &role_s,
11043                        &usage,
11044                        msg.content.len() as i64,
11045                        estimated_cost.unwrap_or(0.0),
11046                    );
11047
11048                    if usage.has_token_data() {
11049                        has_any_tokens = true;
11050                    }
11051
11052                    let content_chars = msg.content.len() as i64;
11053                    let content_tokens_est = content_chars / 4;
11054                    let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
11055                    let has_plan = has_plan_for_role(&role_s, &msg.content);
11056
11057                    token_entries.push(TokenUsageEntry {
11058                        message_id,
11059                        conversation_id: conv_id,
11060                        agent_id,
11061                        workspace_id,
11062                        source_id: conv.source_id.clone(),
11063                        timestamp_ms: msg_ts,
11064                        day_id: msg_day_id,
11065                        model_name: usage.model_name.clone(),
11066                        model_family: Some(model_family.clone()),
11067                        model_tier: Some(model_tier.clone()),
11068                        service_tier: usage.service_tier.clone(),
11069                        provider: Some(provider.clone()),
11070                        input_tokens: usage.input_tokens,
11071                        output_tokens: usage.output_tokens,
11072                        cache_read_tokens: usage.cache_read_tokens,
11073                        cache_creation_tokens: usage.cache_creation_tokens,
11074                        thinking_tokens: usage.thinking_tokens,
11075                        total_tokens: usage.total_tokens(),
11076                        estimated_cost_usd: estimated_cost,
11077                        role: role_s.to_string(),
11078                        content_chars,
11079                        has_tool_calls: usage.has_tool_calls,
11080                        tool_call_count: usage.tool_call_count,
11081                        data_source: usage.data_source.as_str().to_string(),
11082                    });
11083
11084                    let mm = MessageMetricsEntry {
11085                        message_id,
11086                        created_at_ms: msg_ts,
11087                        hour_id: msg_hour_id,
11088                        day_id: msg_day_id,
11089                        agent_slug: conv.agent_slug.clone(),
11090                        workspace_id: workspace_id.unwrap_or(0),
11091                        source_id: conv.source_id.clone(),
11092                        role: role_s.to_string(),
11093                        content_chars,
11094                        content_tokens_est,
11095                        model_name: usage.model_name.clone(),
11096                        model_family: model_family.clone(),
11097                        model_tier: model_tier.clone(),
11098                        provider,
11099                        api_input_tokens: usage.input_tokens,
11100                        api_output_tokens: usage.output_tokens,
11101                        api_cache_read_tokens: usage.cache_read_tokens,
11102                        api_cache_creation_tokens: usage.cache_creation_tokens,
11103                        api_thinking_tokens: usage.thinking_tokens,
11104                        api_service_tier: usage.service_tier.clone(),
11105                        api_data_source: usage.data_source.as_str().to_string(),
11106                        tool_call_count: usage.tool_call_count as i64,
11107                        has_tool_calls: usage.has_tool_calls,
11108                        has_plan,
11109                    };
11110                    rollup_agg.record(&mm);
11111                    metrics_entries.push(mm);
11112                }
11113
11114                if session_count_delta > 0 {
11115                    token_stats.record_session(
11116                        &conv.agent_slug,
11117                        &conv.source_id,
11118                        conv_day_id,
11119                        &session_model_family,
11120                    );
11121                }
11122
11123                if has_any_tokens {
11124                    conv_ids_to_summarize.push(conv_id);
11125                }
11126            }
11127
11128            outcomes.push(InsertOutcome {
11129                conversation_id: conv_id,
11130                conversation_inserted: session_count_delta > 0,
11131                inserted_indices,
11132            });
11133        }
11134
11135        // Batch insert all FTS entries at once
11136        if !defer_lexical_updates {
11137            flush_pending_fts_entries(
11138                self,
11139                &tx,
11140                &mut fts_entries,
11141                &mut fts_pending_chars,
11142                &mut fts_inserted_total,
11143            )?;
11144        }
11145        if !defer_lexical_updates && fts_count_total > 0 {
11146            tracing::debug!(
11147                target: "cass::perf::fts5",
11148                total = fts_count_total,
11149                inserted = fts_inserted_total,
11150                conversations = conversations.len(),
11151                "franken_batch_fts_insert_complete"
11152            );
11153        }
11154
11155        // Batched daily_stats update
11156        if !defer_analytics_updates && !stats.is_empty() {
11157            let entries = stats.expand();
11158            let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
11159            tracing::debug!(
11160                target: "cass::perf::daily_stats",
11161                raw = stats.raw_entry_count(),
11162                expanded = entries.len(),
11163                affected = affected,
11164                "franken_batched_stats_update_complete"
11165            );
11166        }
11167
11168        // Batch insert token_usage rows
11169        if !defer_analytics_updates && !token_entries.is_empty() {
11170            let token_count = token_entries.len();
11171            let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
11172            tracing::debug!(
11173                target: "cass::perf::token_usage",
11174                total = token_count,
11175                inserted = inserted,
11176                "franken_batch_token_usage_insert_complete"
11177            );
11178        }
11179
11180        // Batched token_daily_stats update
11181        if !defer_analytics_updates && !token_stats.is_empty() {
11182            let entries = token_stats.expand();
11183            let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
11184            tracing::debug!(
11185                target: "cass::perf::token_daily_stats",
11186                raw = token_stats.raw_entry_count(),
11187                expanded = entries.len(),
11188                affected = affected,
11189                "franken_batched_token_stats_update_complete"
11190            );
11191        }
11192
11193        // Batch insert message_metrics rows
11194        if !defer_analytics_updates && !metrics_entries.is_empty() {
11195            let mm_count = metrics_entries.len();
11196            let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
11197            tracing::debug!(
11198                target: "cass::perf::message_metrics",
11199                total = mm_count,
11200                inserted = inserted,
11201                "franken_batch_message_metrics_insert_complete"
11202            );
11203        }
11204
11205        // Flush usage_hourly + usage_daily rollups
11206        if !defer_analytics_updates && !rollup_agg.is_empty() {
11207            let (hourly, daily, models_daily) =
11208                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
11209            tracing::debug!(
11210                target: "cass::perf::usage_rollups",
11211                hourly_buckets = rollup_agg.hourly_entry_count(),
11212                daily_buckets = rollup_agg.daily_entry_count(),
11213                models_daily_buckets = rollup_agg.models_daily_entry_count(),
11214                hourly_affected = hourly,
11215                daily_affected = daily,
11216                models_daily_affected = models_daily,
11217                "franken_batched_usage_rollups_complete"
11218            );
11219        }
11220
11221        // Update conversation-level token summaries
11222        if !defer_analytics_updates {
11223            for conv_id in &conv_ids_to_summarize {
11224                franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
11225            }
11226        }
11227
11228        tx.commit()?;
11229
11230        pricing_diag.log_summary();
11231
11232        Ok(outcomes)
11233    }
11234}
11235
11236fn normalized_storage_source_parts(
11237    source_id: Option<&str>,
11238    origin_kind: Option<&str>,
11239    origin_host: Option<&str>,
11240) -> (String, SourceKind, Option<String>) {
11241    let host_label = crate::search::tantivy::normalized_index_origin_host(origin_host);
11242    let source_id = crate::search::tantivy::normalized_index_source_id(
11243        source_id,
11244        origin_kind,
11245        host_label.as_deref(),
11246    );
11247
11248    if source_id == LOCAL_SOURCE_ID {
11249        (source_id, SourceKind::Local, None)
11250    } else {
11251        (source_id, SourceKind::Ssh, host_label)
11252    }
11253}
11254
11255fn normalized_source_for_conversation(conv: &Conversation) -> Source {
11256    let (id, kind, host_label) = normalized_storage_source_parts(
11257        Some(conv.source_id.as_str()),
11258        None,
11259        conv.origin_host.as_deref(),
11260    );
11261    Source {
11262        id,
11263        kind,
11264        host_label,
11265        machine_id: None,
11266        platform: None,
11267        config_json: None,
11268        created_at: None,
11269        updated_at: None,
11270    }
11271}
11272
11273fn is_bootstrap_local_source(source: &Source) -> bool {
11274    source.id == LOCAL_SOURCE_ID
11275        && matches!(source.kind, SourceKind::Local)
11276        && source.host_label.is_none()
11277        && source.machine_id.is_none()
11278        && source.platform.is_none()
11279        && source.config_json.is_none()
11280        && source.created_at.is_none()
11281        && source.updated_at.is_none()
11282}
11283
11284fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
11285    let normalized_source = normalized_source_for_conversation(conv);
11286    if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
11287        Cow::Borrowed(conv)
11288    } else {
11289        let mut normalized = conv.clone();
11290        normalized.source_id = normalized_source.id;
11291        normalized.origin_host = normalized_source.host_label;
11292        Cow::Owned(normalized)
11293    }
11294}
11295
11296impl FrankenStorage {
11297    fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
11298        let source = normalized_source_for_conversation(conv);
11299        if is_bootstrap_local_source(&source) {
11300            // `open()` and schema repair always seed the canonical local source row.
11301            // Avoid an autocommit UPDATE on every local conversation insert.
11302            return Ok(());
11303        }
11304        let cache_key = EnsuredConversationSourceKey::from_source(&source);
11305        if self.conversation_source_already_ensured(&cache_key) {
11306            return Ok(());
11307        }
11308        self.upsert_source(&source)?;
11309        self.mark_conversation_source_ensured(cache_key);
11310        Ok(())
11311    }
11312
11313    fn ensure_sources_for_batch(
11314        &self,
11315        conversations: &[(i64, Option<i64>, &Conversation)],
11316    ) -> Result<()> {
11317        let mut seen = HashSet::with_capacity(conversations.len());
11318        for &(_, _, conv) in conversations {
11319            let source = normalized_source_for_conversation(conv);
11320            if seen.insert(source.id.clone()) {
11321                if is_bootstrap_local_source(&source) {
11322                    continue;
11323                }
11324                self.upsert_source(&source)?;
11325                self.mark_conversation_source_ensured(EnsuredConversationSourceKey::from_source(
11326                    &source,
11327                ));
11328            }
11329        }
11330        Ok(())
11331    }
11332}
11333
11334// =========================================================================
11335// FrankenStorage transaction helper functions
11336// =========================================================================
11337
11338/// Get last_insert_rowid from a frankensqlite transaction.
11339fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
11340    tx.last_insert_rowid()
11341        .ok()
11342        .filter(|&id| id > 0)
11343        .with_context(|| "last_insert_rowid() returned NULL or 0 after INSERT")
11344}
11345
11346/// Bug #167: Ensure all agents referenced by a batch exist within the
11347/// transaction.  The caller already resolved `agent_id` values via
11348/// `ensure_agent` outside the transaction, but those autocommit writes may
11349/// not be visible inside a frankensqlite transaction snapshot.  This function
11350/// checks each unique agent_id and creates a stub row if it's missing.
11351fn ensure_agents_in_tx(
11352    tx: &FrankenTransaction<'_>,
11353    conversations: &[(i64, Option<i64>, &Conversation)],
11354) -> Result<()> {
11355    let mut seen = HashSet::new();
11356    let now = FrankenStorage::now_millis();
11357    for &(agent_id, _, conv) in conversations {
11358        if !seen.insert(agent_id) {
11359            continue;
11360        }
11361        let exists: i64 = tx.query_row_map(
11362            "SELECT COUNT(*) FROM agents WHERE id = ?1",
11363            fparams![agent_id],
11364            |row| row.get_typed(0),
11365        )?;
11366        if exists == 0 {
11367            tracing::debug!(
11368                target: "cass::fk_guard",
11369                agent_id,
11370                slug = %conv.agent_slug,
11371                "inserting agent row inside transaction to satisfy FK constraint"
11372            );
11373            // INSERT OR IGNORE: the slug might already exist with a different
11374            // id from a concurrent writer.  If the slug row exists, the FK
11375            // constraint is already satisfied (the caller just got a stale id).
11376            tx.execute_compat(
11377                "INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
11378                 VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
11379                fparams![
11380                    agent_id,
11381                    conv.agent_slug.as_str(),
11382                    conv.agent_slug.as_str(),
11383                    now,
11384                    now
11385                ],
11386            )?;
11387        }
11388    }
11389    Ok(())
11390}
11391
11392/// Bug #167: Ensure all workspaces referenced by a batch exist within the
11393/// transaction.  Same rationale as `ensure_agents_in_tx`.
11394fn ensure_workspaces_in_tx(
11395    tx: &FrankenTransaction<'_>,
11396    conversations: &[(i64, Option<i64>, &Conversation)],
11397) -> Result<()> {
11398    let mut seen = HashSet::new();
11399    for &(_, workspace_id, conv) in conversations {
11400        let ws_id = match workspace_id {
11401            Some(id) => id,
11402            None => continue,
11403        };
11404        if !seen.insert(ws_id) {
11405            continue;
11406        }
11407        let exists: i64 = tx.query_row_map(
11408            "SELECT COUNT(*) FROM workspaces WHERE id = ?1",
11409            fparams![ws_id],
11410            |row| row.get_typed(0),
11411        )?;
11412        if exists == 0 {
11413            let path_str = conv
11414                .workspace
11415                .as_ref()
11416                .map(|p| p.to_string_lossy().to_string())
11417                .unwrap_or_default();
11418            tracing::debug!(
11419                target: "cass::fk_guard",
11420                workspace_id = ws_id,
11421                path = %path_str,
11422                "inserting workspace row inside transaction to satisfy FK constraint"
11423            );
11424            tx.execute_compat(
11425                "INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
11426                fparams![ws_id, path_str.as_str()],
11427            )?;
11428        }
11429    }
11430    Ok(())
11431}
11432
11433/// Bug #167: Ensure all sources referenced by a batch exist within the
11434/// transaction.  Same rationale as `ensure_agents_in_tx` — source_id is a
11435/// TEXT FK on the conversations table.
11436fn ensure_sources_in_tx(
11437    tx: &FrankenTransaction<'_>,
11438    conversations: &[(i64, Option<i64>, &Conversation)],
11439) -> Result<()> {
11440    let mut seen = HashSet::new();
11441    for &(_, _, conv) in conversations {
11442        let (source_id, source_kind, host_label) = normalized_storage_source_parts(
11443            Some(conv.source_id.as_str()),
11444            None,
11445            conv.origin_host.as_deref(),
11446        );
11447        if !seen.insert(source_id.clone()) {
11448            continue;
11449        }
11450        let exists: i64 = tx.query_row_map(
11451            "SELECT COUNT(*) FROM sources WHERE id = ?1",
11452            fparams![source_id.as_str()],
11453            |row| row.get_typed(0),
11454        )?;
11455        if exists == 0 {
11456            let kind_str = source_kind.to_string();
11457            let now = FrankenStorage::now_millis();
11458            tracing::debug!(
11459                target: "cass::fk_guard",
11460                source_id = %source_id,
11461                kind = kind_str.as_str(),
11462                "inserting source row inside transaction to satisfy FK constraint"
11463            );
11464            tx.execute_compat(
11465                "INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
11466                 VALUES(?1, ?2, ?3, ?4, ?5)",
11467                fparams![
11468                    source_id.as_str(),
11469                    kind_str.as_str(),
11470                    host_label.as_deref(),
11471                    now,
11472                    now
11473                ],
11474            )?;
11475        }
11476    }
11477    Ok(())
11478}
11479
11480fn env_flag_enabled(name: &str) -> bool {
11481    dotenvy::var(name)
11482        .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
11483        .unwrap_or(false)
11484}
11485
11486fn defer_storage_lexical_updates_enabled() -> bool {
11487    env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
11488}
11489
11490fn defer_analytics_updates_enabled() -> bool {
11491    env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES")
11492}
11493
/// Outcome of storing a conversation: either a fresh row or a reused one.
enum ConversationInsertStatus {
    /// A new conversation row was inserted; carries the new row id.
    Inserted(i64),
    /// A matching conversation row already existed; carries that row's id.
    Existing(i64),
}
11498
11499fn franken_find_external_conversation_tail_lookup(
11500    tx: &FrankenTransaction<'_>,
11501    lookup_key: &str,
11502) -> Result<Option<ExistingConversationWithTail>> {
11503    let params = [SqliteValue::from(lookup_key)];
11504    let row = tx
11505        .query_row_with_params(
11506            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
11507             FROM conversation_external_tail_lookup
11508             WHERE lookup_key = ?1",
11509            &params,
11510        )
11511        .optional()?;
11512    let Some(row) = row else {
11513        return Ok(None);
11514    };
11515    let id = row.get_typed(0)?;
11516    let ended_at = row.get_typed(1)?;
11517    let last_message_idx = row.get_typed(2)?;
11518    let last_message_created_at = row.get_typed(3)?;
11519    Ok(Some(ExistingConversationWithTail {
11520        id,
11521        tail_state: existing_conversation_tail_state_from_cached(
11522            last_message_idx,
11523            last_message_created_at,
11524            ended_at,
11525        ),
11526    }))
11527}
11528
11529fn franken_find_external_conversation_lookup(
11530    tx: &FrankenTransaction<'_>,
11531    lookup_key: &str,
11532) -> Result<Option<i64>> {
11533    Ok(franken_find_external_conversation_tail_lookup(tx, lookup_key)?.map(|existing| existing.id))
11534}
11535
11536fn franken_insert_external_conversation_tail_lookup_key(
11537    tx: &FrankenTransaction<'_>,
11538    lookup_key: &str,
11539    conversation_id: i64,
11540    ended_at: Option<i64>,
11541    last_message_idx: Option<i64>,
11542    last_message_created_at: Option<i64>,
11543) -> Result<()> {
11544    let params = [
11545        SqliteValue::from(lookup_key),
11546        SqliteValue::from(conversation_id),
11547        SqliteValue::from(ended_at),
11548        SqliteValue::from(last_message_idx),
11549        SqliteValue::from(last_message_created_at),
11550    ];
11551    tx.execute_with_params(
11552        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
11553             lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
11554         ) VALUES(?1, ?2, ?3, ?4, ?5)",
11555        &params,
11556    )?;
11557    Ok(())
11558}
11559
11560fn franken_insert_external_conversation_tail_lookup(
11561    tx: &FrankenTransaction<'_>,
11562    source_id: &str,
11563    agent_id: i64,
11564    external_id: &str,
11565    existing: ExistingConversationWithTail,
11566) -> Result<()> {
11567    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
11568    let ended_at = existing.tail_state.and_then(|state| state.ended_at);
11569    let last_message_idx = existing.tail_state.map(|state| state.last_message_idx);
11570    let last_message_created_at = existing
11571        .tail_state
11572        .map(|state| state.last_message_created_at);
11573    franken_insert_external_conversation_tail_lookup_key(
11574        tx,
11575        &lookup_key,
11576        existing.id,
11577        ended_at,
11578        last_message_idx,
11579        last_message_created_at,
11580    )
11581}
11582
11583fn franken_update_external_conversation_tail_lookup_key(
11584    tx: &FrankenTransaction<'_>,
11585    lookup_key: &str,
11586    ended_at_candidate: Option<i64>,
11587    last_message_idx_candidate: Option<i64>,
11588    last_message_created_at_candidate: Option<i64>,
11589) -> Result<()> {
11590    if ended_at_candidate.is_none()
11591        && last_message_idx_candidate.is_none()
11592        && last_message_created_at_candidate.is_none()
11593    {
11594        return Ok(());
11595    }
11596    tx.execute_compat(
11597        "UPDATE conversation_external_tail_lookup
11598         SET ended_at = CASE
11599                 WHEN ?1 IS NULL THEN ended_at
11600                 ELSE MAX(IFNULL(ended_at, 0), ?1)
11601             END,
11602             last_message_idx = CASE
11603                 WHEN ?2 IS NULL THEN last_message_idx
11604                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
11605                 ELSE last_message_idx
11606             END,
11607             last_message_created_at = CASE
11608                 WHEN ?3 IS NULL THEN last_message_created_at
11609                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
11610                 ELSE last_message_created_at
11611             END
11612         WHERE lookup_key = ?4",
11613        fparams![
11614            ended_at_candidate,
11615            last_message_idx_candidate,
11616            last_message_created_at_candidate,
11617            lookup_key
11618        ],
11619    )?;
11620    Ok(())
11621}
11622
11623fn franken_set_external_conversation_tail_lookup_after_append(
11624    tx: &FrankenTransaction<'_>,
11625    lookup_key: &str,
11626    ended_at: i64,
11627    last_message_idx: i64,
11628    last_message_created_at: i64,
11629) -> Result<()> {
11630    tx.execute_compat(
11631        "UPDATE conversation_external_tail_lookup
11632         SET ended_at = ?1,
11633             last_message_idx = ?2,
11634             last_message_created_at = ?3
11635         WHERE lookup_key = ?4",
11636        fparams![
11637            ended_at,
11638            last_message_idx,
11639            last_message_created_at,
11640            lookup_key
11641        ],
11642    )?;
11643    Ok(())
11644}
11645
11646fn franken_update_external_conversation_tail_after_append(
11647    tx: &FrankenTransaction<'_>,
11648    agent_id: i64,
11649    conv: &Conversation,
11650    used_append_tail_plan: bool,
11651    exact_append_set: bool,
11652    inserted_last_idx: Option<i64>,
11653    inserted_last_created_at: Option<i64>,
11654) -> Result<()> {
11655    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
11656        return Ok(());
11657    };
11658
11659    if exact_append_set
11660        && let (Some(last_message_idx), Some(last_message_created_at)) =
11661            (inserted_last_idx, inserted_last_created_at)
11662    {
11663        return franken_set_external_conversation_tail_lookup_after_append(
11664            tx,
11665            &lookup_key,
11666            last_message_created_at,
11667            last_message_idx,
11668            last_message_created_at,
11669        );
11670    }
11671
11672    let ended_at_candidate = if used_append_tail_plan {
11673        inserted_last_created_at
11674    } else {
11675        conv.messages.iter().filter_map(|m| m.created_at).max()
11676    };
11677    franken_update_external_conversation_tail_lookup_key(
11678        tx,
11679        &lookup_key,
11680        ended_at_candidate,
11681        inserted_last_idx,
11682        inserted_last_created_at,
11683    )
11684}
11685
11686fn franken_find_existing_conversation_by_key(
11687    tx: &FrankenTransaction<'_>,
11688    key: &PendingConversationKey,
11689    conv: Option<&Conversation>,
11690) -> Result<Option<i64>> {
11691    franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
11692}
11693
11694fn franken_find_existing_conversation_by_key_after_conflict(
11695    tx: &FrankenTransaction<'_>,
11696    key: &PendingConversationKey,
11697    conv: Option<&Conversation>,
11698) -> Result<Option<i64>> {
11699    franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
11700}
11701
11702fn franken_find_existing_conversation_by_key_impl(
11703    tx: &FrankenTransaction<'_>,
11704    key: &PendingConversationKey,
11705    conv: Option<&Conversation>,
11706    allow_legacy_external_scan: bool,
11707) -> Result<Option<i64>> {
11708    match key {
11709        PendingConversationKey::External {
11710            source_id,
11711            agent_id,
11712            external_id,
11713        } => {
11714            let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
11715            if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
11716                return Ok(Some(existing_id));
11717            }
11718            if !allow_legacy_external_scan {
11719                return Ok(None);
11720            }
11721
11722            let existing_id = tx
11723                .query_row_map(
11724                    "SELECT id
11725                 FROM conversations
11726                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
11727                    fparams![source_id.as_str(), *agent_id, external_id.as_str()],
11728                    |row| row.get_typed(0),
11729                )
11730                .optional()?;
11731            if let Some(existing_id) = existing_id {
11732                let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
11733                franken_insert_external_conversation_tail_lookup_key(
11734                    tx,
11735                    &lookup_key,
11736                    existing_id,
11737                    tail_state.and_then(|state| state.ended_at),
11738                    tail_state.map(|state| state.last_message_idx),
11739                    tail_state.map(|state| state.last_message_created_at),
11740                )?;
11741                Ok(Some(existing_id))
11742            } else {
11743                Ok(None)
11744            }
11745        }
11746        PendingConversationKey::SourcePath {
11747            source_id,
11748            agent_id,
11749            source_path,
11750            started_at,
11751        } => {
11752            let exact_match = tx
11753                .query_row_map(
11754                    "SELECT c.id
11755                     FROM conversations c
11756                     WHERE c.source_id = ?1
11757                       AND c.agent_id = ?2
11758                       AND c.source_path = ?3
11759                       AND ((
11760                            COALESCE(
11761                                c.started_at,
11762                                (SELECT MIN(created_at)
11763                                 FROM messages
11764                                 WHERE conversation_id = c.id
11765                                   AND created_at IS NOT NULL)
11766                            ) IS NULL
11767                            AND ?4 IS NULL
11768                       ) OR COALESCE(
11769                            c.started_at,
11770                            (SELECT MIN(created_at)
11771                             FROM messages
11772                             WHERE conversation_id = c.id
11773                               AND created_at IS NOT NULL)
11774                       ) = ?4)
11775                     ORDER BY c.id
11776                     LIMIT 1",
11777                    fparams![
11778                        source_id.as_str(),
11779                        *agent_id,
11780                        source_path.as_str(),
11781                        *started_at
11782                    ],
11783                    |row| row.get_typed(0),
11784                )
11785                .optional()?;
11786            if exact_match.is_some() {
11787                return Ok(exact_match);
11788            }
11789
11790            let Some(conv) = conv else {
11791                return Ok(None);
11792            };
11793            let incoming_fingerprints = conversation_message_fingerprints(conv);
11794            if incoming_fingerprints.is_empty() {
11795                return Ok(None);
11796            }
11797            let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
11798
11799            let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
11800                "SELECT
11801                     c.id,
11802                     COALESCE(
11803                         c.started_at,
11804                         (SELECT MIN(created_at)
11805                          FROM messages
11806                          WHERE conversation_id = c.id
11807                            AND created_at IS NOT NULL)
11808                     ) AS effective_started_at
11809                 FROM conversations c
11810                 WHERE c.source_id = ?1
11811                   AND c.agent_id = ?2
11812                   AND c.source_path = ?3
11813                 ORDER BY c.id",
11814                fparams![source_id.as_str(), *agent_id, source_path.as_str()],
11815                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
11816            )?;
11817
11818            let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
11819            for (candidate_id, candidate_started_at) in candidates {
11820                let existing_fingerprints =
11821                    franken_existing_message_fingerprints(tx, candidate_id)?;
11822                let existing_replay_fingerprints =
11823                    replay_fingerprints_from_merge_set(&existing_fingerprints);
11824                let Some(evidence) = conversation_merge_evidence(
11825                    &incoming_fingerprints,
11826                    &incoming_replay_fingerprints,
11827                    &existing_fingerprints,
11828                    &existing_replay_fingerprints,
11829                    *started_at,
11830                    candidate_started_at,
11831                ) else {
11832                    continue;
11833                };
11834
11835                let candidate_key = (
11836                    evidence.exact_overlap,
11837                    evidence.replay_overlap,
11838                    evidence.started_close,
11839                    evidence.smaller_replay_set,
11840                    std::cmp::Reverse(evidence.start_distance_ms),
11841                );
11842                let should_replace = best_candidate
11843                    .as_ref()
11844                    .map(|(_, best_evidence)| {
11845                        candidate_key
11846                            > (
11847                                best_evidence.exact_overlap,
11848                                best_evidence.replay_overlap,
11849                                best_evidence.started_close,
11850                                best_evidence.smaller_replay_set,
11851                                std::cmp::Reverse(best_evidence.start_distance_ms),
11852                            )
11853                    })
11854                    .unwrap_or(true);
11855
11856                if should_replace {
11857                    best_candidate = Some((candidate_id, evidence));
11858                }
11859            }
11860
11861            Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
11862        }
11863    }
11864}
11865
11866fn franken_insert_conversation_or_get_existing(
11867    tx: &FrankenTransaction<'_>,
11868    agent_id: i64,
11869    workspace_id: Option<i64>,
11870    conv: &Conversation,
11871) -> Result<ConversationInsertStatus> {
11872    let conversation_key = conversation_merge_key(agent_id, conv);
11873    if let Some(existing_id) =
11874        franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))?
11875    {
11876        return Ok(ConversationInsertStatus::Existing(existing_id));
11877    }
11878
11879    franken_insert_conversation_or_get_existing_after_miss(
11880        tx,
11881        agent_id,
11882        workspace_id,
11883        conv,
11884        &conversation_key,
11885    )
11886}
11887
11888fn franken_insert_conversation_or_get_existing_after_miss(
11889    tx: &FrankenTransaction<'_>,
11890    agent_id: i64,
11891    workspace_id: Option<i64>,
11892    conv: &Conversation,
11893    conversation_key: &PendingConversationKey,
11894) -> Result<ConversationInsertStatus> {
11895    match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
11896        Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
11897        Ok(None) => {
11898            // A concurrent writer won the unique-provenance race. Resolve the
11899            // canonical row so callers can merge messages into it.
11900            let existing_id =
11901                franken_find_existing_conversation_by_key_after_conflict(
11902                    tx,
11903                    conversation_key,
11904                    Some(conv),
11905                )?
11906                    .with_context(|| {
11907                        format!(
11908                            "conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
11909                            conv.source_id,
11910                            agent_id,
11911                            conv.external_id,
11912                            conv.source_path.display()
11913                        )
11914                    })?;
11915            tracing::warn!(
11916                source_id = %conv.source_id,
11917                agent_id,
11918                external_id = ?conv.external_id,
11919                existing_id,
11920                source_path = %conv.source_path.display(),
11921                "conversation INSERT: duplicate gracefully recovered, reusing existing row"
11922            );
11923            Ok(ConversationInsertStatus::Existing(existing_id))
11924        }
11925        Err(error) => {
11926            tracing::error!(
11927                source_id = %conv.source_id,
11928                agent_id,
11929                external_id = ?conv.external_id,
11930                error = %error,
11931                source_path = %conv.source_path.display(),
11932                "franken_insert_conversation failed"
11933            );
11934            Err(error)
11935        }
11936    }
11937}
11938
11939/// Insert a conversation into the DB within a frankensqlite transaction.
11940///
11941/// Uses a plain `INSERT` so the common miss path stays on the slim direct
11942/// insert lane. Duplicate provenance conflicts are converted into `Ok(None)`
11943/// so callers can recover the canonical row and merge messages into it.
11944fn franken_insert_conversation(
11945    tx: &FrankenTransaction<'_>,
11946    agent_id: i64,
11947    workspace_id: Option<i64>,
11948    conv: &Conversation,
11949) -> Result<Option<i64>> {
11950    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
11951    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
11952    let metadata_bin_bytes = metadata_bin.as_deref();
11953
11954    match tx.execute_compat(
11955        "INSERT INTO conversations(
11956            agent_id, workspace_id, source_id, external_id, title, source_path,
11957            started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
11958            last_message_idx, last_message_created_at
11959        ) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
11960        fparams![
11961            agent_id,
11962            workspace_id,
11963            conv.source_id.as_str(),
11964            conv.external_id.as_deref(),
11965            conv.title.as_deref(),
11966            path_to_string(&conv.source_path),
11967            conv.started_at,
11968            conv.ended_at,
11969            conv.approx_tokens,
11970            metadata_json_str.as_deref(),
11971            conv.origin_host.as_deref(),
11972            metadata_bin_bytes,
11973            last_message_idx,
11974            last_message_created_at
11975        ],
11976    ) {
11977        Ok(_) => {
11978            let conv_id = franken_last_rowid(tx)?;
11979            franken_insert_conversation_tail_state(
11980                tx,
11981                conv_id,
11982                conv.ended_at,
11983                last_message_idx,
11984                last_message_created_at,
11985            )?;
11986            if let Some(external_id) = conv.external_id.as_deref() {
11987                franken_insert_external_conversation_tail_lookup(
11988                    tx,
11989                    conv.source_id.as_str(),
11990                    agent_id,
11991                    external_id,
11992                    ExistingConversationWithTail {
11993                        id: conv_id,
11994                        tail_state: existing_conversation_tail_state_from_cached(
11995                            last_message_idx,
11996                            last_message_created_at,
11997                            conv.ended_at,
11998                        ),
11999                    },
12000                )?;
12001            }
12002            Ok(Some(conv_id))
12003        }
12004        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
12005            tracing::debug!(
12006                source_id = %conv.source_id,
12007                agent_id,
12008                external_id = ?conv.external_id,
12009                source_path = %conv.source_path.display(),
12010                "conversation INSERT: duplicate provenance conflict"
12011            );
12012            Ok(None)
12013        }
12014        Err(error) => Err(error.into()),
12015    }
12016}
12017
12018type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
12019
12020fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
12021    if let Some(raw) = historical_raw_json(value) {
12022        Ok((Some(Cow::Borrowed(raw)), None))
12023    } else if value.is_null() {
12024        Ok((Some(Cow::Borrowed("null")), None))
12025    } else if value.as_object().is_some_and(|object| object.is_empty()) {
12026        Ok((None, None))
12027    } else if let Some(metadata_bin) = serialize_json_to_msgpack(value) {
12028        Ok((None, Some(metadata_bin)))
12029    } else {
12030        Ok((Some(Cow::Owned(serde_json::to_string(value)?)), None))
12031    }
12032}
12033
12034fn franken_insert_new_message(
12035    tx: &FrankenTransaction<'_>,
12036    conversation_id: i64,
12037    msg: &Message,
12038) -> Result<i64> {
12039    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
12040    let extra_bin_bytes = extra_bin.as_deref();
12041
12042    tx.execute_compat(
12043        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
12044         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
12045            fparams![
12046                conversation_id,
12047                msg.idx,
12048                role_as_str(&msg.role),
12049                msg.author.as_deref(),
12050                msg.created_at,
12051                msg.content.as_str(),
12052                extra_json_str.as_deref(),
12053                extra_bin_bytes
12054        ],
12055    )?;
12056    franken_last_rowid(tx)
12057}
12058
12059type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
12060
12061fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
12062    if let Some(raw) = historical_raw_json(&msg.extra_json) {
12063        Ok((Some(Cow::Borrowed(raw)), None))
12064    } else if msg.extra_json.is_null() {
12065        Ok((None, None))
12066    } else {
12067        let extra_bin = serialize_json_to_msgpack(&msg.extra_json);
12068        if extra_bin.is_some() {
12069            Ok((None, extra_bin))
12070        } else {
12071            Ok((
12072                Some(Cow::Owned(serde_json::to_string(&msg.extra_json)?)),
12073                None,
12074            ))
12075        }
12076    }
12077}
12078
/// Batch size for proven-new message inserts.
///
/// Each row binds 8 values, so a full 100-row batch binds 800 parameters. That
/// stays under the 999 default that `SQLITE_MAX_VARIABLE_NUMBER` had before
/// SQLite 3.32.0, while still amortizing parse cost.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;
12084
/// Batch size used by `franken_append_insert_new_messages`.
///
/// Append workloads profile fastest with larger chunks on current frankensqlite.
/// After the tail-state hot table removed conversation-row rewrites from the
/// append path, 50-row chunks beat the old 20-row setting on the append-merge
/// profile. 100-row chunks slightly regress the 20-message workload.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;
12091
12092fn message_insert_batch_sql(row_count: usize) -> &'static str {
12093    static MESSAGE_INSERT_BATCH_SQL: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();
12094
12095    let max_batch_size = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
12096    let cached_sql = MESSAGE_INSERT_BATCH_SQL.get_or_init(|| {
12097        let mut sql_by_row_count = Vec::with_capacity(max_batch_size + 1);
12098        sql_by_row_count.push(String::new());
12099        for row_count in 1..=max_batch_size {
12100            let placeholders = (0..row_count)
12101                .map(|idx| {
12102                    let base = idx * 8;
12103                    format!(
12104                        "(?{},?{},?{},?{},?{},?{},?{},?{})",
12105                        base + 1,
12106                        base + 2,
12107                        base + 3,
12108                        base + 4,
12109                        base + 5,
12110                        base + 6,
12111                        base + 7,
12112                        base + 8
12113                    )
12114                })
12115                .collect::<Vec<_>>()
12116                .join(",");
12117            sql_by_row_count.push(format!(
12118                "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES {placeholders}"
12119            ));
12120        }
12121        sql_by_row_count
12122    });
12123
12124    cached_sql
12125        .get(row_count)
12126        .map(String::as_str)
12127        .expect("message insert batch size must be covered by the cached SQL table")
12128}
12129
12130fn franken_batch_insert_new_messages(
12131    tx: &FrankenTransaction<'_>,
12132    conversation_id: i64,
12133    messages: &[&Message],
12134) -> Result<Vec<i64>> {
12135    franken_batch_insert_new_messages_with_batch_size(
12136        tx,
12137        conversation_id,
12138        messages,
12139        MESSAGE_INSERT_BATCH_SIZE,
12140    )
12141}
12142
12143fn franken_append_insert_new_messages(
12144    tx: &FrankenTransaction<'_>,
12145    conversation_id: i64,
12146    messages: &[&Message],
12147) -> Result<Vec<i64>> {
12148    franken_batch_insert_new_messages_with_batch_size(
12149        tx,
12150        conversation_id,
12151        messages,
12152        APPEND_MESSAGE_INSERT_BATCH_SIZE,
12153    )
12154}
12155
12156fn franken_batch_insert_new_messages_with_batch_size(
12157    tx: &FrankenTransaction<'_>,
12158    conversation_id: i64,
12159    messages: &[&Message],
12160    batch_size: usize,
12161) -> Result<Vec<i64>> {
12162    let batch_size = batch_size.max(1);
12163    let mut inserted_ids = Vec::with_capacity(messages.len());
12164    for chunk in messages.chunks(batch_size) {
12165        if chunk.len() == 1 {
12166            inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
12167            continue;
12168        }
12169        let sql = message_insert_batch_sql(chunk.len());
12170
12171        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
12172        for msg in chunk {
12173            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
12174            param_values.push(SqliteValue::from(conversation_id));
12175            param_values.push(SqliteValue::from(msg.idx));
12176            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
12177            param_values.push(SqliteValue::from(msg.author.as_deref()));
12178            param_values.push(SqliteValue::from(msg.created_at));
12179            param_values.push(SqliteValue::from(msg.content.as_str()));
12180            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
12181            param_values.push(SqliteValue::from(extra_bin.as_deref()));
12182        }
12183
12184        tx.execute_with_params(sql, &param_values)?;
12185
12186        let last_id = franken_last_rowid(tx)?;
12187        let first_id = last_id
12188            .checked_sub((chunk.len() - 1) as i64)
12189            .with_context(|| {
12190                format!(
12191                    "inferring rowid range for {}-row message batch ending at {last_id}",
12192                    chunk.len()
12193                )
12194            })?;
12195        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
12196    }
12197
12198    Ok(inserted_ids)
12199}
12200
/// Test-only single-row message insert that records per-stage timings.
///
/// Bumps `single_row_calls` and `batch_rows` on `profile`, then adds the
/// elapsed time of payload encoding, statement execution and the rowid
/// lookup to `payload_duration`, `execute_duration` and `rowid_duration`.
/// Returns the value reported by `franken_last_rowid` after the insert.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
    profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
    profile.single_row_calls += 1;
    profile.batch_rows += 1;

    let payload_timer = Instant::now();
    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
    profile.payload_duration += payload_timer.elapsed();

    let execute_timer = Instant::now();
    tx.execute_compat(
        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            extra_json_str.as_deref(),
            extra_bin.as_deref()
        ],
    )?;
    profile.execute_duration += execute_timer.elapsed();

    let rowid_timer = Instant::now();
    let new_rowid = franken_last_rowid(tx)?;
    profile.rowid_duration += rowid_timer.elapsed();

    Ok(new_rowid)
}
12238
/// Test-only profiled message insert using the default chunk size.
///
/// Forwards to `franken_batch_insert_new_messages_with_profile_batch_size`
/// with `MESSAGE_INSERT_BATCH_SIZE` and returns its result unchanged.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let batch_size = MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        batch_size,
    )
}
12254
/// Test-only profiled message insert using the append-path chunk size.
///
/// Forwards to `franken_batch_insert_new_messages_with_profile_batch_size`
/// with `APPEND_MESSAGE_INSERT_BATCH_SIZE` and returns its result unchanged.
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let batch_size = APPEND_MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        batch_size,
    )
}
12270
/// Test-only chunked message insert that also fills in `profile`.
///
/// Follows the same steps as the unprofiled chunked insert: a zero
/// `batch_size` is treated as one, single-message chunks go through
/// `franken_insert_new_message_with_profile`, and larger chunks use one
/// multi-row statement whose ids are inferred as the contiguous range ending
/// at `franken_last_rowid`.
///
/// For multi-row chunks it bumps `batch_calls` / `batch_rows` and adds the
/// elapsed time of SQL construction, payload encoding, parameter building,
/// execution and rowid inference to the matching `*_duration` field.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    // `chunks` rejects a zero size, so clamp to at least one row per chunk.
    let rows_per_chunk = batch_size.max(1);
    let mut rowids = Vec::with_capacity(messages.len());

    for chunk in messages.chunks(rows_per_chunk) {
        // Exactly one row: the single-row helper does its own accounting.
        if let &[only] = chunk {
            let rowid =
                franken_insert_new_message_with_profile(tx, conversation_id, only, profile)?;
            rowids.push(rowid);
            continue;
        }

        let row_count = chunk.len();
        profile.batch_calls += 1;
        profile.batch_rows += row_count;

        let sql_timer = Instant::now();
        let sql = message_insert_batch_sql(row_count);
        profile.sql_build_duration += sql_timer.elapsed();

        // Eight bound values per row, in the column order of the batch SQL.
        let mut bound: Vec<SqliteValue> = Vec::with_capacity(row_count * 8);
        for &msg in chunk {
            let payload_timer = Instant::now();
            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_timer.elapsed();

            let param_timer = Instant::now();
            bound.extend([
                SqliteValue::from(conversation_id),
                SqliteValue::from(msg.idx),
                SqliteValue::from(role_as_str(&msg.role)),
                SqliteValue::from(msg.author.as_deref()),
                SqliteValue::from(msg.created_at),
                SqliteValue::from(msg.content.as_str()),
                SqliteValue::from(extra_json_str.as_deref()),
                SqliteValue::from(extra_bin.as_deref()),
            ]);
            profile.param_build_duration += param_timer.elapsed();
        }

        let execute_timer = Instant::now();
        tx.execute_with_params(sql, &bound)?;
        profile.execute_duration += execute_timer.elapsed();

        // The rowid timing covers the lookup and the id-range reconstruction.
        let rowid_timer = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        let span = (row_count - 1) as i64;
        let first_id = last_id.checked_sub(span).with_context(|| {
            format!(
                "inferring rowid range for {}-row message batch ending at {last_id}",
                row_count
            )
        })?;
        rowids.extend(first_id..=last_id);
        profile.rowid_duration += rowid_timer.elapsed();
    }

    Ok(rowids)
}
12337
12338/// Insert snippets within a frankensqlite transaction.
12339fn franken_insert_snippets(
12340    tx: &FrankenTransaction<'_>,
12341    message_id: i64,
12342    snippets: &[Snippet],
12343) -> Result<()> {
12344    for snip in snippets {
12345        let file_path_str = snip.file_path.as_ref().map(path_to_string);
12346        tx.execute_compat(
12347            "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
12348             VALUES(?1,?2,?3,?4,?5,?6)",
12349            fparams![
12350                message_id,
12351                file_path_str.as_deref(),
12352                snip.start_line,
12353                snip.end_line,
12354                snip.language.as_deref(),
12355                snip.snippet_text.as_deref()
12356            ],
12357        )?;
12358    }
12359    Ok(())
12360}
12361
12362fn franken_existing_message_fingerprints(
12363    tx: &FrankenTransaction<'_>,
12364    conversation_id: i64,
12365) -> Result<HashSet<MessageMergeFingerprint>> {
12366    let rows = tx.query_params(
12367        "SELECT idx, role, author, created_at, content
12368         FROM messages
12369         WHERE conversation_id = ?1",
12370        fparams![conversation_id],
12371    )?;
12372    let mut fingerprints = HashSet::with_capacity(rows.len());
12373    for row in rows {
12374        let role: String = row.get_typed(1)?;
12375        let content: String = row.get_typed(4)?;
12376        fingerprints.insert(MessageMergeFingerprint {
12377            idx: row.get_typed(0)?,
12378            created_at: row.get_typed(3)?,
12379            role: role_from_str(&role),
12380            author: row.get_typed(2)?,
12381            content_hash: *blake3::hash(content.as_bytes()).as_bytes(),
12382        });
12383    }
12384    Ok(fingerprints)
12385}
12386
12387struct ExistingMessageLookup {
12388    by_idx: HashMap<i64, MessageMergeFingerprint>,
12389    replay: HashSet<MessageReplayFingerprint>,
12390}
12391
12392fn franken_existing_message_lookup(
12393    tx: &FrankenTransaction<'_>,
12394    conversation_id: i64,
12395    incoming_messages: &[Message],
12396) -> Result<ExistingMessageLookup> {
12397    if incoming_messages.is_empty() {
12398        return Ok(ExistingMessageLookup {
12399            by_idx: HashMap::new(),
12400            replay: HashSet::new(),
12401        });
12402    }
12403
12404    let min_idx = incoming_messages
12405        .iter()
12406        .map(|msg| msg.idx)
12407        .min()
12408        .unwrap_or(0);
12409    let max_idx = incoming_messages
12410        .iter()
12411        .map(|msg| msg.idx)
12412        .max()
12413        .unwrap_or(min_idx);
12414    let requires_full_scan = incoming_messages.iter().any(|msg| msg.created_at.is_none());
12415    let created_bounds = incoming_messages
12416        .iter()
12417        .filter_map(|msg| msg.created_at)
12418        .fold(None, |bounds: Option<(i64, i64)>, created_at| {
12419            Some(match bounds {
12420                Some((min_created_at, max_created_at)) => (
12421                    min_created_at.min(created_at),
12422                    max_created_at.max(created_at),
12423                ),
12424                None => (created_at, created_at),
12425            })
12426        });
12427
12428    let mut indexed_by_idx = HashMap::with_capacity(incoming_messages.len());
12429    let mut indexed_replay = HashSet::with_capacity(incoming_messages.len());
12430    let mut exact_idx_match = true;
12431    for msg in incoming_messages {
12432        record_message_lookup_exact_idx_probe();
12433        let Some((role, author, created_at, content)) = tx
12434            .query_row_map(
12435                "SELECT role, author, created_at, content
12436                 FROM messages INDEXED BY sqlite_autoindex_messages_1
12437                 WHERE conversation_id = ?1 AND idx = ?2
12438                 LIMIT 1",
12439                fparams![conversation_id, msg.idx],
12440                |row| {
12441                    Ok((
12442                        row.get_typed::<String>(0)?,
12443                        row.get_typed::<Option<String>>(1)?,
12444                        row.get_typed::<Option<i64>>(2)?,
12445                        row.get_typed::<String>(3)?,
12446                    ))
12447                },
12448            )
12449            .optional()?
12450        else {
12451            exact_idx_match = false;
12452            break;
12453        };
12454        let role = role_from_str(&role);
12455        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12456        let fingerprint = MessageMergeFingerprint {
12457            idx: msg.idx,
12458            created_at,
12459            role: role.clone(),
12460            author: author.clone(),
12461            content_hash,
12462        };
12463        if fingerprint != message_merge_fingerprint(msg) {
12464            exact_idx_match = false;
12465            break;
12466        }
12467        indexed_by_idx.insert(msg.idx, fingerprint);
12468        indexed_replay.insert(MessageReplayFingerprint {
12469            created_at,
12470            role,
12471            author,
12472            content_hash,
12473        });
12474    }
12475
12476    if exact_idx_match {
12477        return Ok(ExistingMessageLookup {
12478            by_idx: indexed_by_idx,
12479            replay: indexed_replay,
12480        });
12481    }
12482
12483    let (rows, replay_full_scan) = if requires_full_scan {
12484        let rows = tx.query_params(
12485            "SELECT idx, role, author, created_at, content
12486             FROM messages INDEXED BY sqlite_autoindex_messages_1
12487             WHERE conversation_id = ?1",
12488            fparams![conversation_id],
12489        )?;
12490        record_message_lookup_full_scan_query(rows.len());
12491        (rows, true)
12492    } else if let Some((min_created_at, max_created_at)) = created_bounds {
12493        let mut rows = tx.query_params(
12494            "SELECT idx, role, author, created_at, content
12495             FROM messages INDEXED BY sqlite_autoindex_messages_1
12496             WHERE conversation_id = ?1
12497               AND idx >= ?2
12498               AND idx <= ?3",
12499            fparams![conversation_id, min_idx, max_idx],
12500        )?;
12501        rows.extend(tx.query_params(
12502            "SELECT idx, role, author, created_at, content
12503             FROM messages INDEXED BY sqlite_autoindex_messages_1
12504             WHERE conversation_id = ?1
12505               AND created_at IS NOT NULL
12506               AND created_at >= ?2
12507               AND created_at <= ?3",
12508            fparams![conversation_id, min_created_at, max_created_at],
12509        )?);
12510        record_message_lookup_bounded_queries(2, rows.len());
12511        (rows, false)
12512    } else {
12513        let rows = tx.query_params(
12514            "SELECT idx, role, author, created_at, content
12515             FROM messages INDEXED BY sqlite_autoindex_messages_1
12516             WHERE conversation_id = ?1",
12517            fparams![conversation_id],
12518        )?;
12519        record_message_lookup_full_scan_query(rows.len());
12520        (rows, true)
12521    };
12522
12523    let mut by_idx = HashMap::with_capacity(rows.len());
12524    let mut replay = HashSet::with_capacity(rows.len());
12525    for row in rows {
12526        let idx: i64 = row.get_typed(0)?;
12527        let role: String = row.get_typed(1)?;
12528        let author: Option<String> = row.get_typed(2)?;
12529        let created_at: Option<i64> = row.get_typed(3)?;
12530        let content: String = row.get_typed(4)?;
12531        let role = role_from_str(&role);
12532        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12533
12534        if idx >= min_idx && idx <= max_idx {
12535            by_idx.insert(
12536                idx,
12537                MessageMergeFingerprint {
12538                    idx,
12539                    created_at,
12540                    role: role.clone(),
12541                    author: author.clone(),
12542                    content_hash,
12543                },
12544            );
12545        }
12546
12547        let replay_matches = if replay_full_scan {
12548            true
12549        } else if let Some((min_created_at, max_created_at)) = created_bounds {
12550            created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
12551        } else {
12552            true
12553        };
12554        if replay_matches {
12555            replay.insert(MessageReplayFingerprint {
12556                created_at,
12557                role,
12558                author,
12559                content_hash,
12560            });
12561        }
12562    }
12563
12564    Ok(ExistingMessageLookup { by_idx, replay })
12565}
12566
12567fn franken_existing_message_lookup_with_pending(
12568    tx: &FrankenTransaction<'_>,
12569    conversation_id: i64,
12570    incoming_messages: &[Message],
12571    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12572    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12573) -> Result<ExistingMessageLookup> {
12574    if let (Some(by_idx), Some(replay)) = (
12575        pending_message_fingerprints.get(&conversation_id),
12576        pending_message_replay_fingerprints.get(&conversation_id),
12577    ) {
12578        if incoming_messages.iter().all(|msg| {
12579            by_idx.contains_key(&msg.idx) || replay.contains(&message_replay_fingerprint(msg))
12580        }) {
12581            return Ok(ExistingMessageLookup {
12582                by_idx: by_idx.clone(),
12583                replay: replay.clone(),
12584            });
12585        }
12586
12587        let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12588        let mut merged_by_idx = by_idx.clone();
12589        let mut merged_replay = replay.clone();
12590        merged_by_idx.extend(fresh.by_idx);
12591        merged_replay.extend(fresh.replay);
12592        pending_message_fingerprints.insert(conversation_id, merged_by_idx.clone());
12593        pending_message_replay_fingerprints.insert(conversation_id, merged_replay.clone());
12594        return Ok(ExistingMessageLookup {
12595            by_idx: merged_by_idx,
12596            replay: merged_replay,
12597        });
12598    }
12599
12600    let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12601    pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
12602    pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
12603    Ok(lookup)
12604}
12605
12606/// Batch insert FTS5 entries within a frankensqlite transaction.
12607fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
12608    if entries.is_empty() {
12609        return Ok(0);
12610    }
12611
12612    let mut inserted = 0;
12613
12614    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12615        let placeholders: String = chunk
12616            .iter()
12617            .enumerate()
12618            .map(|(i, _)| {
12619                let base = i * 7 + 1; // +1 for 1-indexed params
12620                format!(
12621                    "(?{},?{},?{},?{},?{},?{},?{})",
12622                    base,
12623                    base + 1,
12624                    base + 2,
12625                    base + 3,
12626                    base + 4,
12627                    base + 5,
12628                    base + 6
12629                )
12630            })
12631            .collect::<Vec<_>>()
12632            .join(",");
12633
12634        let sql = format!(
12635            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12636        );
12637
12638        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12639        for entry in chunk {
12640            param_values.push(SqliteValue::from(entry.message_id));
12641            param_values.push(SqliteValue::from(entry.content.as_str()));
12642            param_values.push(SqliteValue::from(entry.title.as_str()));
12643            param_values.push(SqliteValue::from(entry.agent.as_str()));
12644            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12645            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12646            param_values.push(SqliteValue::from(entry.created_at));
12647        }
12648
12649        match tx.execute_with_params(&sql, &param_values) {
12650            Ok(_) => {
12651                inserted += chunk.len();
12652            }
12653            Err(err) => {
12654                tracing::warn!(
12655                    error = %err,
12656                    chunk_docs = chunk.len(),
12657                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
12658                );
12659                return Ok(inserted);
12660            }
12661        }
12662    }
12663
12664    Ok(inserted)
12665}
12666
12667fn franken_batch_insert_fts_on_connection(
12668    conn: &FrankenConnection,
12669    entries: &[FtsEntry],
12670) -> Result<usize> {
12671    if entries.is_empty() {
12672        return Ok(0);
12673    }
12674
12675    let mut inserted = 0;
12676
12677    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12678        let placeholders: String = chunk
12679            .iter()
12680            .enumerate()
12681            .map(|(i, _)| {
12682                let base = i * 7 + 1;
12683                format!(
12684                    "(?{},?{},?{},?{},?{},?{},?{})",
12685                    base,
12686                    base + 1,
12687                    base + 2,
12688                    base + 3,
12689                    base + 4,
12690                    base + 5,
12691                    base + 6
12692                )
12693            })
12694            .collect::<Vec<_>>()
12695            .join(",");
12696
12697        let sql = format!(
12698            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12699        );
12700
12701        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12702        for entry in chunk {
12703            param_values.push(SqliteValue::from(entry.message_id));
12704            param_values.push(SqliteValue::from(entry.content.as_str()));
12705            param_values.push(SqliteValue::from(entry.title.as_str()));
12706            param_values.push(SqliteValue::from(entry.agent.as_str()));
12707            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12708            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12709            param_values.push(SqliteValue::from(entry.created_at));
12710        }
12711
12712        conn.execute_with_params(&sql, &param_values)
12713            .with_context(|| {
12714                format!(
12715                    "inserting {} rows into fts_messages during streaming FTS maintenance",
12716                    chunk.len()
12717                )
12718            })?;
12719        inserted += chunk.len();
12720    }
12721
12722    Ok(inserted)
12723}
12724
12725/// Update daily stats within a frankensqlite transaction.
12726fn franken_update_daily_stats_in_tx(
12727    storage: &FrankenStorage,
12728    tx: &FrankenTransaction<'_>,
12729    agent_slug: &str,
12730    source_id: &str,
12731    started_at: Option<i64>,
12732    delta: StatsDelta,
12733) -> Result<()> {
12734    let day_id = started_at
12735        .map(FrankenStorage::day_id_from_millis)
12736        .unwrap_or(0);
12737    let now = FrankenStorage::now_millis();
12738
12739    let targets = [
12740        DailyStatsTarget {
12741            day_id,
12742            agent_slug,
12743            source_id,
12744        },
12745        DailyStatsTarget {
12746            day_id,
12747            agent_slug: "all",
12748            source_id,
12749        },
12750        DailyStatsTarget {
12751            day_id,
12752            agent_slug,
12753            source_id: "all",
12754        },
12755        DailyStatsTarget {
12756            day_id,
12757            agent_slug: "all",
12758            source_id: "all",
12759        },
12760    ];
12761
12762    if agent_slug != "all"
12763        && source_id != "all"
12764        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
12765    {
12766        return Ok(());
12767    }
12768
12769    for target in targets {
12770        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
12771    }
12772
12773    Ok(())
12774}
12775
/// Identifies one `daily_stats` row: a day plus an agent/source pair.
///
/// Either string may be the literal `"all"` to address a roll-up row.
#[derive(Debug, Clone, Copy)]
struct DailyStatsTarget<'a> {
    /// Day bucket the row belongs to.
    day_id: i64,
    /// Agent slug, or `"all"` for the cross-agent roll-up.
    agent_slug: &'a str,
    /// Source id, or `"all"` for the cross-source roll-up.
    source_id: &'a str,
}
12782
12783fn franken_update_ensured_daily_stats_targets_in_tx(
12784    storage: &FrankenStorage,
12785    tx: &FrankenTransaction<'_>,
12786    targets: &[DailyStatsTarget<'_>; 4],
12787    now: i64,
12788    delta: StatsDelta,
12789) -> Result<bool> {
12790    let cache_keys = targets.map(|target| {
12791        EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id)
12792    });
12793    if !storage.daily_stats_keys_already_ensured(&cache_keys) {
12794        return Ok(false);
12795    }
12796
12797    let primary = targets[0];
12798    let rows_changed = tx.execute_compat(
12799        "UPDATE daily_stats
12800         SET session_count = session_count + ?4,
12801             message_count = message_count + ?5,
12802             total_chars = total_chars + ?6,
12803             last_updated = ?7
12804         WHERE day_id = ?1
12805           AND ((agent_slug = ?2 AND source_id = ?3)
12806                OR (agent_slug = 'all' AND source_id = ?3)
12807                OR (agent_slug = ?2 AND source_id = 'all')
12808                OR (agent_slug = 'all' AND source_id = 'all'))",
12809        fparams![
12810            primary.day_id,
12811            primary.agent_slug,
12812            primary.source_id,
12813            delta.session_count_delta,
12814            delta.message_count_delta,
12815            delta.total_chars_delta,
12816            now
12817        ],
12818    )?;
12819    if rows_changed == targets.len() {
12820        return Ok(true);
12821    }
12822
12823    for (target, cache_key) in targets.iter().copied().zip(cache_keys) {
12824        let exists = tx
12825            .query_row_map(
12826                "SELECT 1 FROM daily_stats
12827                 WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
12828                 LIMIT 1",
12829                fparams![target.day_id, target.agent_slug, target.source_id],
12830                |row| row.get_typed::<i64>(0),
12831            )
12832            .optional()?
12833            .is_some();
12834        if exists {
12835            continue;
12836        }
12837
12838        tx.execute_compat(
12839            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12840             VALUES(?1,?2,?3,?4,?5,?6,?7)",
12841            fparams![
12842                target.day_id,
12843                target.agent_slug,
12844                target.source_id,
12845                delta.session_count_delta,
12846                delta.message_count_delta,
12847                delta.total_chars_delta,
12848                now
12849            ],
12850        )?;
12851        storage.mark_daily_stats_key_ensured(cache_key);
12852    }
12853
12854    Ok(true)
12855}
12856
12857fn franken_apply_daily_stats_delta_in_tx(
12858    storage: &FrankenStorage,
12859    tx: &FrankenTransaction<'_>,
12860    target: DailyStatsTarget<'_>,
12861    now: i64,
12862    delta: StatsDelta,
12863) -> Result<()> {
12864    let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
12865    if storage.daily_stats_key_already_ensured(&cache_key) {
12866        let rows_changed = tx.execute_compat(
12867            "UPDATE daily_stats
12868             SET session_count = session_count + ?4,
12869                 message_count = message_count + ?5,
12870                 total_chars = total_chars + ?6,
12871                 last_updated = ?7
12872             WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
12873            fparams![
12874                target.day_id,
12875                target.agent_slug,
12876                target.source_id,
12877                delta.session_count_delta,
12878                delta.message_count_delta,
12879                delta.total_chars_delta,
12880                now
12881            ],
12882        )?;
12883        if rows_changed > 0 {
12884            return Ok(());
12885        }
12886    }
12887
12888    tx.execute_compat(
12889        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12890         VALUES(?1,?2,?3,?4,?5,?6,?7)
12891         ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12892            session_count = session_count + excluded.session_count,
12893            message_count = message_count + excluded.message_count,
12894            total_chars = total_chars + excluded.total_chars,
12895            last_updated = excluded.last_updated",
12896        fparams![
12897            target.day_id,
12898            target.agent_slug,
12899            target.source_id,
12900            delta.session_count_delta,
12901            delta.message_count_delta,
12902            delta.total_chars_delta,
12903            now
12904        ],
12905    )?;
12906    storage.mark_daily_stats_key_ensured(cache_key);
12907    Ok(())
12908}
12909
12910// -------------------------------------------------------------------------
12911// Frankensqlite batch helpers
12912// -------------------------------------------------------------------------
12913
12914/// Batch upsert daily_stats within a frankensqlite transaction.
12915fn franken_update_daily_stats_batched_in_tx(
12916    tx: &FrankenTransaction<'_>,
12917    entries: &[(i64, String, String, StatsDelta)],
12918) -> Result<usize> {
12919    if entries.is_empty() {
12920        return Ok(0);
12921    }
12922
12923    let now = FrankenStorage::now_millis();
12924    let mut total_affected = 0;
12925
12926    // Keep frankensqlite UPSERTs row-wise inside the transaction. The
12927    // multi-row VALUES ... ON CONFLICT form still falls back through
12928    // INSERT...SELECT in fsqlite-core, which rejects UPSERT/RETURNING during
12929    // real cass indexing.
12930    for (day_id, agent, source, delta) in entries {
12931        total_affected += tx.execute_compat(
12932            "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12933             VALUES(?1,?2,?3,?4,?5,?6,?7)
12934             ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12935                 session_count = session_count + excluded.session_count,
12936                 message_count = message_count + excluded.message_count,
12937                 total_chars = total_chars + excluded.total_chars,
12938                 last_updated = excluded.last_updated",
12939            fparams![
12940                *day_id,
12941                agent.as_str(),
12942                source.as_str(),
12943                delta.session_count_delta,
12944                delta.message_count_delta,
12945                delta.total_chars_delta,
12946                now
12947            ],
12948        )?;
12949    }
12950
12951    Ok(total_affected)
12952}
12953
12954/// Batch insert token_usage rows within a frankensqlite transaction.
12955///
12956/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12957/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12958/// UPSERT/OR IGNORE conflict clauses.
12959fn franken_insert_token_usage_batched_in_tx(
12960    tx: &FrankenTransaction<'_>,
12961    entries: &[TokenUsageEntry],
12962) -> Result<usize> {
12963    if entries.is_empty() {
12964        return Ok(0);
12965    }
12966
12967    let mut total_inserted = 0;
12968
12969    for e in entries {
12970        let params_vec: Vec<ParamValue> = vec![
12971            ParamValue::from(e.message_id),
12972            ParamValue::from(e.conversation_id),
12973            ParamValue::from(e.agent_id),
12974            ParamValue::from(e.workspace_id),
12975            ParamValue::from(e.source_id.clone()),
12976            ParamValue::from(e.timestamp_ms),
12977            ParamValue::from(e.day_id),
12978            ParamValue::from(e.model_name.clone()),
12979            ParamValue::from(e.model_family.clone()),
12980            ParamValue::from(e.model_tier.clone()),
12981            ParamValue::from(e.service_tier.clone()),
12982            ParamValue::from(e.provider.clone()),
12983            ParamValue::from(e.input_tokens),
12984            ParamValue::from(e.output_tokens),
12985            ParamValue::from(e.cache_read_tokens),
12986            ParamValue::from(e.cache_creation_tokens),
12987            ParamValue::from(e.thinking_tokens),
12988            ParamValue::from(e.total_tokens),
12989            ParamValue::from(e.estimated_cost_usd),
12990            ParamValue::from(e.role.clone()),
12991            ParamValue::from(e.content_chars),
12992            ParamValue::from(e.has_tool_calls as i64),
12993            ParamValue::from(e.tool_call_count as i64),
12994            ParamValue::from(e.data_source.clone()),
12995        ];
12996
12997        let values = param_slice_to_values(&params_vec);
12998        total_inserted += tx.execute_with_params(
12999            "INSERT OR IGNORE INTO token_usage (
13000                message_id, conversation_id, agent_id, workspace_id, source_id,
13001                timestamp_ms, day_id,
13002                model_name, model_family, model_tier, service_tier, provider,
13003                input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
13004                thinking_tokens, total_tokens, estimated_cost_usd,
13005                role, content_chars, has_tool_calls, tool_call_count, data_source
13006            )
13007            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
13008            &values,
13009        )?;
13010    }
13011
13012    Ok(total_inserted)
13013}
13014
13015/// Batch upsert token_daily_stats within a frankensqlite transaction.
13016fn franken_update_token_daily_stats_batched_in_tx(
13017    tx: &FrankenTransaction<'_>,
13018    entries: &[(i64, String, String, String, TokenStatsDelta)],
13019) -> Result<usize> {
13020    if entries.is_empty() {
13021        return Ok(0);
13022    }
13023
13024    let now = FrankenStorage::now_millis();
13025    let mut total_affected = 0;
13026
13027    for (day_id, agent, source, model, delta) in entries {
13028        total_affected += tx.execute_compat(
13029            "INSERT INTO token_daily_stats (
13030                day_id, agent_slug, source_id, model_family,
13031                api_call_count, user_message_count, assistant_message_count, tool_message_count,
13032                total_input_tokens, total_output_tokens, total_cache_read_tokens,
13033                total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
13034                total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
13035                last_updated
13036            )
13037            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
13038            ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
13039                api_call_count = api_call_count + excluded.api_call_count,
13040                user_message_count = user_message_count + excluded.user_message_count,
13041                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13042                tool_message_count = tool_message_count + excluded.tool_message_count,
13043                total_input_tokens = total_input_tokens + excluded.total_input_tokens,
13044                total_output_tokens = total_output_tokens + excluded.total_output_tokens,
13045                total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
13046                total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
13047                total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
13048                grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
13049                total_content_chars = total_content_chars + excluded.total_content_chars,
13050                total_tool_calls = total_tool_calls + excluded.total_tool_calls,
13051                estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
13052                session_count = session_count + excluded.session_count,
13053                last_updated = excluded.last_updated",
13054            fparams![
13055                *day_id,
13056                agent.as_str(),
13057                source.as_str(),
13058                model.as_str(),
13059                delta.api_call_count,
13060                delta.user_message_count,
13061                delta.assistant_message_count,
13062                delta.tool_message_count,
13063                delta.total_input_tokens,
13064                delta.total_output_tokens,
13065                delta.total_cache_read_tokens,
13066                delta.total_cache_creation_tokens,
13067                delta.total_thinking_tokens,
13068                delta.grand_total_tokens,
13069                delta.total_content_chars,
13070                delta.total_tool_calls,
13071                delta.estimated_cost_usd,
13072                delta.session_count,
13073                now
13074            ],
13075        )?;
13076    }
13077
13078    Ok(total_affected)
13079}
13080
13081/// Batch insert message_metrics rows within a frankensqlite transaction.
13082///
13083/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
13084/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
13085/// UPSERT/OR IGNORE conflict clauses.
13086fn franken_insert_message_metrics_batched_in_tx(
13087    tx: &FrankenTransaction<'_>,
13088    entries: &[MessageMetricsEntry],
13089) -> Result<usize> {
13090    if entries.is_empty() {
13091        return Ok(0);
13092    }
13093
13094    let mut total_inserted = 0;
13095
13096    for e in entries {
13097        let params_vec: Vec<ParamValue> = vec![
13098            ParamValue::from(e.message_id),
13099            ParamValue::from(e.created_at_ms),
13100            ParamValue::from(e.hour_id),
13101            ParamValue::from(e.day_id),
13102            ParamValue::from(e.agent_slug.clone()),
13103            ParamValue::from(e.workspace_id),
13104            ParamValue::from(e.source_id.clone()),
13105            ParamValue::from(e.role.clone()),
13106            ParamValue::from(e.content_chars),
13107            ParamValue::from(e.content_tokens_est),
13108            ParamValue::from(e.model_name.clone()),
13109            ParamValue::from(e.model_family.clone()),
13110            ParamValue::from(e.model_tier.clone()),
13111            ParamValue::from(e.provider.clone()),
13112            ParamValue::from(e.api_input_tokens),
13113            ParamValue::from(e.api_output_tokens),
13114            ParamValue::from(e.api_cache_read_tokens),
13115            ParamValue::from(e.api_cache_creation_tokens),
13116            ParamValue::from(e.api_thinking_tokens),
13117            ParamValue::from(e.api_service_tier.clone()),
13118            ParamValue::from(e.api_data_source.clone()),
13119            ParamValue::from(e.tool_call_count),
13120            ParamValue::from(e.has_tool_calls as i64),
13121            ParamValue::from(e.has_plan as i64),
13122        ];
13123
13124        let values = param_slice_to_values(&params_vec);
13125        total_inserted += tx.execute_with_params(
13126            "INSERT OR IGNORE INTO message_metrics (
13127                message_id, created_at_ms, hour_id, day_id,
13128                agent_slug, workspace_id, source_id, role,
13129                content_chars, content_tokens_est,
13130                model_name, model_family, model_tier, provider,
13131                api_input_tokens, api_output_tokens, api_cache_read_tokens,
13132                api_cache_creation_tokens, api_thinking_tokens,
13133                api_service_tier, api_data_source,
13134                tool_call_count, has_tool_calls, has_plan
13135            )
13136            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
13137            &values,
13138        )?;
13139    }
13140
13141    Ok(total_inserted)
13142}
13143
13144/// Flush one rollup table (shared logic for hourly + daily) within a frankensqlite transaction.
13145fn franken_flush_rollup_table(
13146    tx: &FrankenTransaction<'_>,
13147    table: &str,
13148    bucket_col: &str,
13149    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
13150    now: i64,
13151) -> Result<usize> {
13152    if deltas.is_empty() {
13153        return Ok(0);
13154    }
13155
13156    let mut total_affected = 0;
13157
13158    for ((bucket_id, agent, workspace_id, source), d) in deltas {
13159        let sql = format!(
13160            "INSERT INTO {table} (
13161                {bucket_col}, agent_slug, workspace_id, source_id,
13162                message_count, user_message_count, assistant_message_count,
13163                tool_call_count, plan_message_count, plan_content_tokens_est_total,
13164                plan_api_tokens_total, api_coverage_message_count,
13165                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
13166                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
13167                api_cache_read_tokens_total, api_cache_creation_tokens_total,
13168                api_thinking_tokens_total, last_updated
13169            )
13170            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
13171            ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
13172                message_count = message_count + excluded.message_count,
13173                user_message_count = user_message_count + excluded.user_message_count,
13174                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13175                tool_call_count = tool_call_count + excluded.tool_call_count,
13176                plan_message_count = plan_message_count + excluded.plan_message_count,
13177                plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
13178                plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
13179                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
13180                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
13181                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
13182                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
13183                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
13184                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
13185                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
13186                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
13187                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
13188                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
13189                last_updated = excluded.last_updated"
13190        );
13191
13192        total_affected += tx.execute_compat(
13193            &sql,
13194            fparams![
13195                *bucket_id,
13196                agent.as_str(),
13197                *workspace_id,
13198                source.as_str(),
13199                d.message_count,
13200                d.user_message_count,
13201                d.assistant_message_count,
13202                d.tool_call_count,
13203                d.plan_message_count,
13204                d.plan_content_tokens_est_total,
13205                d.plan_api_tokens_total,
13206                d.api_coverage_message_count,
13207                d.content_tokens_est_total,
13208                d.content_tokens_est_user,
13209                d.content_tokens_est_assistant,
13210                d.api_tokens_total,
13211                d.api_input_tokens_total,
13212                d.api_output_tokens_total,
13213                d.api_cache_read_tokens_total,
13214                d.api_cache_creation_tokens_total,
13215                d.api_thinking_tokens_total,
13216                now
13217            ],
13218        )?;
13219    }
13220
13221    Ok(total_affected)
13222}
13223
13224/// Flush usage_models_daily rollup within a frankensqlite transaction.
13225fn franken_flush_model_daily_rollup_table(
13226    tx: &FrankenTransaction<'_>,
13227    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
13228    now: i64,
13229) -> Result<usize> {
13230    if deltas.is_empty() {
13231        return Ok(0);
13232    }
13233
13234    let mut total_affected = 0;
13235
13236    for ((day_id, agent, workspace_id, source, model_family, model_tier), d) in deltas {
13237        total_affected += tx.execute_compat(
13238            "INSERT INTO usage_models_daily (
13239                day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
13240                message_count, user_message_count, assistant_message_count,
13241                tool_call_count, plan_message_count, api_coverage_message_count,
13242                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
13243                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
13244                api_cache_read_tokens_total, api_cache_creation_tokens_total,
13245                api_thinking_tokens_total, last_updated
13246            )
13247            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
13248            ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
13249                message_count = message_count + excluded.message_count,
13250                user_message_count = user_message_count + excluded.user_message_count,
13251                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
13252                tool_call_count = tool_call_count + excluded.tool_call_count,
13253                plan_message_count = plan_message_count + excluded.plan_message_count,
13254                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
13255                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
13256                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
13257                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
13258                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
13259                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
13260                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
13261                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
13262                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
13263                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
13264                last_updated = excluded.last_updated",
13265            fparams![
13266                *day_id,
13267                agent.as_str(),
13268                *workspace_id,
13269                source.as_str(),
13270                model_family.as_str(),
13271                model_tier.as_str(),
13272                d.message_count,
13273                d.user_message_count,
13274                d.assistant_message_count,
13275                d.tool_call_count,
13276                d.plan_message_count,
13277                d.api_coverage_message_count,
13278                d.content_tokens_est_total,
13279                d.content_tokens_est_user,
13280                d.content_tokens_est_assistant,
13281                d.api_tokens_total,
13282                d.api_input_tokens_total,
13283                d.api_output_tokens_total,
13284                d.api_cache_read_tokens_total,
13285                d.api_cache_creation_tokens_total,
13286                d.api_thinking_tokens_total,
13287                now
13288            ],
13289        )?;
13290    }
13291
13292    Ok(total_affected)
13293}
13294
13295/// Flush AnalyticsRollupAggregator deltas via frankensqlite transaction.
13296fn franken_flush_analytics_rollups_in_tx(
13297    tx: &FrankenTransaction<'_>,
13298    agg: &AnalyticsRollupAggregator,
13299) -> Result<(usize, usize, usize)> {
13300    let now = FrankenStorage::now_millis();
13301
13302    let hourly_affected =
13303        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?;
13304    let daily_affected = franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?;
13305    let models_daily_affected = franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?;
13306
13307    Ok((hourly_affected, daily_affected, models_daily_affected))
13308}
13309
13310/// Update conversation-level token summary columns via frankensqlite transaction.
13311fn franken_update_conversation_token_summaries_in_tx(
13312    tx: &FrankenTransaction<'_>,
13313    conversation_id: i64,
13314) -> Result<()> {
13315    tx.execute_compat(
13316        "UPDATE conversations SET
13317            total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
13318            total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
13319            total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
13320            total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
13321            grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
13322            estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
13323            primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
13324                             AND model_name IS NOT NULL
13325                             GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
13326            api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13327                              AND data_source = 'api'),
13328            tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
13329            user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13330                                  AND role = 'user'),
13331            assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
13332                                       AND role IN ('assistant', 'agent'))
13333         WHERE id = ?1",
13334        fparams![conversation_id],
13335    )?;
13336    Ok(())
13337}
13338
13339impl FrankenStorage {
13340    /// Rebuild token_daily_stats from the token_usage ledger.
13341    pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
13342        const CONVERSATION_BATCH_SIZE: usize = 1_000;
13343        const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;
13344
13345        let total_usage_rows: i64 =
13346            self.conn
13347                .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
13348                    row.get_typed(0)
13349                })?;
13350        tracing::info!(
13351            target: "cass::analytics",
13352            total_usage_rows,
13353            "token_daily_stats_rebuild_start"
13354        );
13355
13356        let mut tx = self.conn.transaction()?;
13357        tx.execute("DELETE FROM token_daily_stats")?;
13358
13359        let mut last_conversation_id = 0_i64;
13360        let mut rows_created = 0_usize;
13361
13362        loop {
13363            let conversation_rows = tx.query_map_collect(
13364                "SELECT c.id, c.started_at, c.source_id,
13365                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
13366                 FROM conversations c
13367                 WHERE c.id > ?1
13368                 ORDER BY c.id
13369                 LIMIT ?2",
13370                fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
13371                |row| {
13372                    Ok((
13373                        row.get_typed::<i64>(0)?,
13374                        row.get_typed::<Option<i64>>(1)?,
13375                        row.get_typed::<String>(2)?,
13376                        row.get_typed::<String>(3)?,
13377                    ))
13378                },
13379            )?;
13380            if conversation_rows.is_empty() {
13381                break;
13382            }
13383
13384            let mut aggregate = TokenStatsAggregator::new();
13385
13386            for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
13387                last_conversation_id = conversation_id;
13388                let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
13389                let mut last_token_usage_id = 0_i64;
13390                let mut session_model_family = String::from("unknown");
13391
13392                loop {
13393                    let usage_rows = tx.query_map_collect(
13394                        "SELECT id, day_id, role,
13395                                COALESCE(model_family, 'unknown'),
13396                                input_tokens, output_tokens, cache_read_tokens,
13397                                cache_creation_tokens, thinking_tokens,
13398                                has_tool_calls, tool_call_count,
13399                                content_chars, estimated_cost_usd
13400                         FROM token_usage
13401                         WHERE conversation_id = ?1
13402                           AND id > ?2
13403                         ORDER BY id
13404                         LIMIT ?3",
13405                        fparams![
13406                            conversation_id,
13407                            last_token_usage_id,
13408                            TOKEN_USAGE_BATCH_SIZE as i64
13409                        ],
13410                        |row| {
13411                            Ok((
13412                                row.get_typed::<i64>(0)?,
13413                                row.get_typed::<i64>(1)?,
13414                                row.get_typed::<String>(2)?,
13415                                row.get_typed::<String>(3)?,
13416                                row.get_typed::<Option<i64>>(4)?,
13417                                row.get_typed::<Option<i64>>(5)?,
13418                                row.get_typed::<Option<i64>>(6)?,
13419                                row.get_typed::<Option<i64>>(7)?,
13420                                row.get_typed::<Option<i64>>(8)?,
13421                                row.get_typed::<i64>(9)?,
13422                                row.get_typed::<i64>(10)?,
13423                                row.get_typed::<i64>(11)?,
13424                                row.get_typed::<Option<f64>>(12)?,
13425                            ))
13426                        },
13427                    )?;
13428                    if usage_rows.is_empty() {
13429                        break;
13430                    }
13431
13432                    for (
13433                        token_usage_id,
13434                        day_id,
13435                        role,
13436                        model_family,
13437                        input_tokens,
13438                        output_tokens,
13439                        cache_read_tokens,
13440                        cache_creation_tokens,
13441                        thinking_tokens,
13442                        has_tool_calls,
13443                        tool_call_count,
13444                        content_chars,
13445                        estimated_cost_usd,
13446                    ) in usage_rows
13447                    {
13448                        last_token_usage_id = token_usage_id;
13449                        if model_family != "unknown" {
13450                            session_model_family = model_family.clone();
13451                        }
13452                        let usage = crate::connectors::ExtractedTokenUsage {
13453                            model_name: None,
13454                            provider: None,
13455                            input_tokens,
13456                            output_tokens,
13457                            cache_read_tokens,
13458                            cache_creation_tokens,
13459                            thinking_tokens,
13460                            service_tier: None,
13461                            has_tool_calls: has_tool_calls != 0,
13462                            tool_call_count: u32::try_from(tool_call_count.max(0)).unwrap_or(0),
13463                            data_source: franken_agent_detection::TokenDataSource::Api,
13464                        };
13465                        aggregate.record(
13466                            &agent_slug,
13467                            &source_id,
13468                            day_id,
13469                            &model_family,
13470                            &role,
13471                            &usage,
13472                            content_chars,
13473                            estimated_cost_usd.unwrap_or(0.0),
13474                        );
13475                    }
13476                }
13477
13478                aggregate.record_session(
13479                    &agent_slug,
13480                    &source_id,
13481                    conversation_day_id,
13482                    &session_model_family,
13483                );
13484            }
13485
13486            let entries = aggregate.expand();
13487            rows_created = rows_created.saturating_add(entries.len());
13488            franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
13489        }
13490
13491        tx.commit()?;
13492
13493        tracing::info!(
13494            target: "cass::analytics",
13495            rows_created,
13496            "token_daily_stats_rebuild_complete"
13497        );
13498
13499        Ok(rows_created)
13500    }
13501
13502    /// Rebuild analytics tables (message_metrics + rollups) from existing
13503    /// messages in the database. Does NOT re-parse raw agent session files.
13504    pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
13505        let start = Instant::now();
13506
13507        let total_messages: i64 =
13508            self.conn
13509                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13510                    row.get_typed(0)
13511                })?;
13512        tracing::info!(
13513            target: "cass::analytics",
13514            total_messages,
13515            "analytics_rebuild_start"
13516        );
13517
13518        let mut tx = self.conn.transaction()?;
13519
13520        tx.execute("DELETE FROM message_metrics")?;
13521        tx.execute("DELETE FROM usage_hourly")?;
13522        tx.execute("DELETE FROM usage_daily")?;
13523        tx.execute("DELETE FROM usage_models_daily")?;
13524
13525        const CHUNK_SIZE: i64 = 10_000;
13526        let mut offset: i64 = 0;
13527        let mut total_inserted: usize = 0;
13528        let mut usage_hourly_rows: usize = 0;
13529        let mut usage_daily_rows: usize = 0;
13530        let mut usage_models_daily_rows: usize = 0;
13531
13532        loop {
13533            #[allow(clippy::type_complexity)]
13534            let rows: Vec<(
13535                i64,
13536                String,
13537                String,
13538                Option<serde_json::Value>,
13539                Option<i64>,
13540                Option<i64>,
13541                String,
13542                Option<i64>,
13543                String,
13544            )> = tx.query_map_collect(
13545                // Avoid the 3-table JOIN with LIMIT/OFFSET that triggers
13546                // frankensqlite's materialization fallback (see 860acb12).
13547                // Inline the agent slug lookup as a correlated subquery and
13548                // fall back to 'unknown' for NULL agent_id, matching the
13549                // FTS / lexical rebuild paths.
13550                "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
13551                        m.created_at,
13552                        c.id AS conv_id, c.started_at AS conv_started_at,
13553                        c.source_id, c.workspace_id,
13554                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
13555                 FROM messages m
13556                 JOIN conversations c ON m.conversation_id = c.id
13557                 ORDER BY m.id
13558                 LIMIT ?1 OFFSET ?2",
13559                fparams![CHUNK_SIZE, offset],
13560                |row| {
13561                    let msg_id: i64 = row.get_typed(0)?;
13562                    let role: String = row.get_typed(2)?;
13563                    let content: String = row.get_typed(3)?;
13564                    let extra_json = row
13565                        .get_typed::<Option<String>>(4)?
13566                        .and_then(|s| serde_json::from_str(&s).ok())
13567                        .or_else(|| {
13568                            row.get_typed::<Option<Vec<u8>>>(5)
13569                                .ok()
13570                                .flatten()
13571                                .and_then(|b| rmp_serde::from_slice(&b).ok())
13572                        });
13573                    let msg_ts: Option<i64> = row.get_typed(6)?;
13574                    let conv_started_at: Option<i64> = row.get_typed(8)?;
13575                    let source_id: String = row.get_typed(9)?;
13576                    let workspace_id: Option<i64> = row.get_typed(10)?;
13577                    let agent_slug: String = row.get_typed(11)?;
13578                    let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
13579
13580                    Ok((
13581                        msg_id,
13582                        role,
13583                        content,
13584                        extra_json,
13585                        Some(effective_ts),
13586                        workspace_id,
13587                        source_id,
13588                        conv_started_at,
13589                        agent_slug,
13590                    ))
13591                },
13592            )?;
13593
13594            if rows.is_empty() {
13595                break;
13596            }
13597
13598            let chunk_len = rows.len();
13599            let mut entries = Vec::with_capacity(chunk_len);
13600            let mut rollup_agg = AnalyticsRollupAggregator::new();
13601
13602            for (
13603                msg_id,
13604                role,
13605                content,
13606                extra_json,
13607                effective_ts,
13608                workspace_id,
13609                source_id,
13610                _conv_started_at,
13611                agent_slug,
13612            ) in &rows
13613            {
13614                let ts = effective_ts.unwrap_or(0);
13615                let day_id = Self::day_id_from_millis(ts);
13616                let hour_id = Self::hour_id_from_millis(ts);
13617                let content_chars = content.len() as i64;
13618                let content_tokens_est = content_chars / 4;
13619                let extra = extra_json
13620                    .as_ref()
13621                    .cloned()
13622                    .unwrap_or(serde_json::Value::Null);
13623                let usage =
13624                    crate::connectors::extract_tokens_for_agent(agent_slug, &extra, content, role);
13625                let model_info = usage
13626                    .model_name
13627                    .as_deref()
13628                    .map(crate::connectors::normalize_model);
13629                let model_family = model_info
13630                    .as_ref()
13631                    .map(|i| i.family.clone())
13632                    .unwrap_or_else(|| "unknown".into());
13633                let model_tier = model_info
13634                    .as_ref()
13635                    .map(|i| i.tier.clone())
13636                    .unwrap_or_else(|| "unknown".into());
13637                let provider = usage
13638                    .provider
13639                    .clone()
13640                    .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
13641                    .unwrap_or_else(|| "unknown".into());
13642
13643                let entry = MessageMetricsEntry {
13644                    message_id: *msg_id,
13645                    created_at_ms: ts,
13646                    hour_id,
13647                    day_id,
13648                    agent_slug: agent_slug.clone(),
13649                    workspace_id: workspace_id.unwrap_or(0),
13650                    source_id: source_id.clone(),
13651                    role: role.clone(),
13652                    content_chars,
13653                    content_tokens_est,
13654                    model_name: usage.model_name.clone(),
13655                    model_family,
13656                    model_tier,
13657                    provider,
13658                    api_input_tokens: usage.input_tokens,
13659                    api_output_tokens: usage.output_tokens,
13660                    api_cache_read_tokens: usage.cache_read_tokens,
13661                    api_cache_creation_tokens: usage.cache_creation_tokens,
13662                    api_thinking_tokens: usage.thinking_tokens,
13663                    api_service_tier: usage.service_tier,
13664                    api_data_source: usage.data_source.as_str().to_string(),
13665                    tool_call_count: usage.tool_call_count as i64,
13666                    has_tool_calls: usage.has_tool_calls,
13667                    has_plan: has_plan_for_role(role, content),
13668                };
13669                rollup_agg.record(&entry);
13670                entries.push(entry);
13671            }
13672
13673            total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
13674            let (hourly, daily, models_daily) =
13675                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
13676            usage_hourly_rows += hourly;
13677            usage_daily_rows += daily;
13678            usage_models_daily_rows += models_daily;
13679            offset += chunk_len as i64;
13680
13681            tracing::debug!(
13682                target: "cass::analytics",
13683                offset,
13684                chunk = chunk_len,
13685                inserted = entries.len(),
13686                total = total_inserted,
13687                "analytics_rebuild_chunk"
13688            );
13689
13690            if (chunk_len as i64) < CHUNK_SIZE {
13691                break;
13692            }
13693        }
13694
13695        tx.commit()?;
13696
13697        let elapsed = start.elapsed();
13698        let elapsed_ms = elapsed.as_millis() as u64;
13699        let msgs_per_sec = if elapsed_ms > 0 {
13700            (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
13701        } else {
13702            0.0
13703        };
13704
13705        tracing::info!(
13706            target: "cass::analytics",
13707            message_metrics_rows = total_inserted,
13708            usage_hourly_rows,
13709            usage_daily_rows,
13710            usage_models_daily_rows,
13711            elapsed_ms,
13712            messages_per_sec = format!("{:.0}", msgs_per_sec),
13713            "analytics_rebuild_complete"
13714        );
13715
13716        Ok(AnalyticsRebuildResult {
13717            message_metrics_rows: total_inserted,
13718            usage_hourly_rows,
13719            usage_daily_rows,
13720            usage_models_daily_rows,
13721            elapsed_ms,
13722            messages_per_sec: msgs_per_sec,
13723        })
13724    }
13725
    /// Rebuild all daily stats from scratch.
    ///
    /// Deletes every row of `daily_stats` and repopulates the table by walking
    /// `conversations` in ascending `id` order and, for each conversation, its
    /// `messages` in ascending `idx` order. All writes go through a single
    /// transaction that is committed once at the end; the paging reads are
    /// issued through `self.conn`.
    ///
    /// Attribution rules visible in this function:
    /// * each conversation adds one session to the day derived from its
    ///   `started_at` (day `0` when `started_at` is NULL);
    /// * every message is counted against its *conversation's* day, agent slug
    ///   and source id, not against a per-message timestamp;
    /// * message sizes come from `message_metrics.content_chars` when that
    ///   table has exactly as many rows as `messages` (and `messages` is not
    ///   empty), otherwise from `LENGTH(CAST(content AS BLOB))`.
    ///
    /// Both scans start from batch sizes obtained via `rebuild_batch_size_env`
    /// (defaults 1,000 conversations / 10,000 messages) and halve the batch
    /// size, down to 1, whenever a query fails with an out-of-memory error.
    ///
    /// # Returns
    /// The number of rows now in `daily_stats` and the summed `session_count`
    /// of the `('all', 'all')` rollup rows.
    ///
    /// # Errors
    /// Propagates any database error other than an out-of-memory error that
    /// can still be retried with a smaller batch. On such an early return the
    /// transaction is never committed.
    pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
        const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
        const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;

        // Both sizes are `mut` because the OOM handlers below shrink them and
        // the reduced size stays in effect for the remainder of the rebuild.
        // NOTE(review): presumably the named environment variables override the
        // defaults — confirm against `rebuild_batch_size_env`.
        let mut conversation_batch_size = rebuild_batch_size_env(
            "CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
            DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
        );
        let mut message_batch_size = rebuild_batch_size_env(
            "CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
            DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
        );

        let total_messages: i64 =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                    row.get_typed(0)
                })?;
        let message_metrics_rows: i64 =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
                    row.get_typed(0)
                })?;
        // Only trust message_metrics when its row count matches messages
        // exactly. This is a count comparison, not a per-row coverage check.
        let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;

        tracing::info!(
            target: "cass::perf::daily_stats",
            total_messages,
            message_metrics_rows,
            use_message_metrics,
            "daily_stats rebuild selected message source"
        );

        // Everything from the DELETE to the final COUNT queries runs in this
        // transaction; nothing is committed until `tx.commit()` below.
        let mut tx = self.conn.transaction()?;
        tx.execute("DELETE FROM daily_stats")?;

        // Keyset cursor over conversations plus counters that only feed the
        // progress / summary log lines.
        let mut last_conversation_id = 0_i64;
        let mut conversation_batch_count = 0_usize;
        let mut conversations_processed = 0_usize;
        let mut messages_processed = 0_usize;
        let mut message_batch_count = 0_usize;
        let mut raw_entries_flushed = 0_usize;
        let mut expanded_entries_flushed = 0_usize;
        // Both variants return (idx, size) for one conversation, paged by
        // `idx > ?2` with `LIMIT ?3`; they differ only in where the size
        // comes from.
        let message_scan_sql = if use_message_metrics {
            "SELECT m.idx, mm.content_chars
             FROM messages m
             JOIN message_metrics mm ON mm.message_id = m.id
             WHERE m.conversation_id = ?1
               AND m.idx > ?2
             ORDER BY m.conversation_id, m.idx
             LIMIT ?3"
        } else {
            "SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
             FROM messages m
             WHERE m.conversation_id = ?1
               AND m.idx > ?2
             ORDER BY m.conversation_id, m.idx
             LIMIT ?3"
        };

        loop {
            // Avoid the 2-table JOIN with LIMIT that triggers frankensqlite's
            // materialization fallback (which is what the OOM retry below is
            // defending against — see 860acb12).  Inline agent slug via
            // correlated subquery and degrade NULL agent_id to 'unknown' for
            // consistency with the lexical/FTS rebuild paths.
            let conversation_rows = match self.conn.query_with_params(
                "SELECT c.id, c.started_at,
                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
                        c.source_id
                 FROM conversations c
                 WHERE c.id > ?1
                 ORDER BY c.id
                 LIMIT ?2",
                &params_from_iter([
                    ParamValue::from(last_conversation_id),
                    ParamValue::from(conversation_batch_size as i64),
                ]),
            ) {
                Ok(rows) => rows,
                // Retry the same page (cursor untouched) with half the batch
                // size; once the size is 1 the error is returned instead.
                Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
                    let previous_batch_size = conversation_batch_size;
                    conversation_batch_size = (conversation_batch_size / 2).max(1);
                    tracing::warn!(
                        previous_batch_size,
                        conversation_batch_size,
                        last_conversation_id,
                        "daily_stats conversation scan ran out of memory; retrying with smaller batch"
                    );
                    continue;
                }
                Err(err) => return Err(err.into()),
            };
            // An empty page means the conversation cursor has passed the last id.
            if conversation_rows.is_empty() {
                break;
            }

            // Pass 1 over the page: count one session per conversation and
            // remember the attribution key needed for its messages.
            let mut aggregate = StatsAggregator::new();
            let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
                Vec::with_capacity(conversation_rows.len());
            for row in &conversation_rows {
                let conversation_id: i64 = row.get_typed(0)?;
                let started_at: Option<i64> = row.get_typed(1)?;
                let agent_slug: String = row.get_typed(2)?;
                let source_id: String = row.get_typed(3)?;
                // Rows arrive ordered by id, so this ends up as the page's max id.
                last_conversation_id = conversation_id;
                let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
                aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
                conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
                conversations_processed += 1;
            }

            conversation_batch_count += 1;
            raw_entries_flushed += aggregate.raw_entry_count();
            let entries = aggregate.expand();
            expanded_entries_flushed += entries.len();
            if !entries.is_empty() {
                franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
            }
            if conversation_batch_count.is_multiple_of(25) {
                tracing::info!(
                    target: "cass::perf::daily_stats",
                    conversations_processed,
                    batches = conversation_batch_count,
                    batch_size = conversation_batch_size,
                    last_conversation_id,
                    "daily_stats rebuild conversation scan progress"
                );
            }
            // Defensive: the page was non-empty, so this list has one entry per row.
            if conversation_batch_meta.is_empty() {
                continue;
            }

            // Pass 2: page through each conversation's messages and add their
            // counts and sizes to the conversation's (day, agent, source) key.
            for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
                // Starting at -1 with `idx > cursor` means the scan begins at
                // idx 0; rows with a negative idx would not be visited.
                let mut cursor_message_idx = -1_i64;
                loop {
                    let message_rows = match self.conn.query_with_params(
                        message_scan_sql,
                        &params_from_iter([
                            ParamValue::from(conversation_id),
                            ParamValue::from(cursor_message_idx),
                            ParamValue::from(message_batch_size as i64),
                        ]),
                    ) {
                        Ok(rows) => rows,
                        // Same halving strategy as the conversation scan; this
                        // `continue` re-runs the inner loop for the same cursor.
                        Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
                            let previous_batch_size = message_batch_size;
                            message_batch_size = (message_batch_size / 2).max(1);
                            tracing::warn!(
                                previous_batch_size,
                                message_batch_size,
                                conversation_id,
                                cursor_message_idx,
                                "daily_stats message scan ran out of memory; retrying with smaller batch"
                            );
                            continue;
                        }
                        Err(err) => return Err(err.into()),
                    };
                    // No more messages for this conversation.
                    if message_rows.is_empty() {
                        break;
                    }

                    // A fresh aggregator per page keeps each flush bounded by
                    // the page size; session delta is 0 because the session
                    // was already counted in pass 1.
                    let mut aggregate = StatsAggregator::new();
                    for row in &message_rows {
                        let message_idx: i64 = row.get_typed(0)?;
                        let content_len: i64 = row.get_typed(1)?;
                        cursor_message_idx = message_idx;
                        aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
                        messages_processed += 1;
                    }

                    message_batch_count += 1;
                    raw_entries_flushed += aggregate.raw_entry_count();
                    let entries = aggregate.expand();
                    expanded_entries_flushed += entries.len();
                    if !entries.is_empty() {
                        franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
                    }
                    if message_batch_count.is_multiple_of(50) {
                        tracing::info!(
                            target: "cass::perf::daily_stats",
                            messages_processed,
                            batches = message_batch_count,
                            batch_size = message_batch_size,
                            source = if use_message_metrics {
                                "message_metrics"
                            } else {
                                "messages"
                            },
                            conversation_id,
                            cursor_message_idx,
                            "daily_stats rebuild message scan progress"
                        );
                    }
                }
            }
        }

        // Summary figures are read through the transaction, before commit, so
        // they reflect exactly what this rebuild wrote.
        let rows_created: i64 =
            tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
                row.get_typed(0)
            })?;
        let total_sessions: i64 = tx.query_row_map(
            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| row.get_typed(0),
        )?;

        tx.commit()?;

        tracing::info!(
            target: "cass::perf::daily_stats",
            rows_created,
            total_sessions,
            conversations_processed,
            conversation_batches = conversation_batch_count,
            conversation_batch_size,
            message_batches = message_batch_count,
            message_batch_size,
            messages_processed,
            use_message_metrics,
            raw_entries_flushed,
            expanded_entries_flushed,
            "Daily stats rebuilt from conversations"
        );

        Ok(DailyStatsRebuildResult {
            rows_created,
            total_sessions,
        })
    }
13959}
13960
13961// SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
13962// All methods are available through FrankenStorage.
13963
13964// -------------------------------------------------------------------------
13965// IndexingCache (Opt 7.2) - N+1 Prevention for Agent/Workspace IDs
13966// -------------------------------------------------------------------------
13967
/// Cache for agent and workspace IDs during batch indexing.
///
/// Prevents N+1 database queries by caching the results of ensure_agent
/// and ensure_workspace calls within a batch. This is per-batch and
/// single-threaded, so no synchronization is needed.
///
/// # Usage
/// ```ignore
/// let mut cache = IndexingCache::new();
/// for conv in conversations {
///     let agent_id = cache.get_or_insert_agent(storage, &agent)?;
///     let workspace_id = cache.get_or_insert_workspace(storage, workspace, display_name)?;
///     // ... use agent_id and workspace_id
/// }
/// ```
///
/// # Rollback
/// Set environment variable `CASS_SQLITE_CACHE=0` to bypass caching
/// and use direct DB calls (useful for debugging). The lookup methods do
/// not read the variable themselves; callers are expected to consult
/// `IndexingCache::is_enabled()` and skip the cache when it returns `false`.
#[derive(Debug, Default)]
pub struct IndexingCache {
    // Agent row ids keyed by agent slug.
    agent_ids: HashMap<String, i64>,
    // Workspace row ids keyed by workspace path.
    workspace_ids: HashMap<PathBuf, i64>,
    // Number of lookups answered from the maps above.
    hits: u64,
    // Number of lookups that had to go to storage.
    misses: u64,
}
13994
/// Storage operations that [`IndexingCache`] falls back to on a cache miss.
///
/// Abstracting these behind a trait lets the cache work with any backend
/// that can resolve an agent or workspace to its numeric row id.
pub trait IndexingCacheStorage {
    /// Resolve `agent` to its database id.
    ///
    /// NOTE(review): presumably inserts the agent when it does not exist yet
    /// ("ensure" semantics) — confirm against the implementing storage.
    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
    /// Resolve the workspace at `path` to its database id, passing along an
    /// optional human-readable `display_name`.
    ///
    /// NOTE(review): presumably inserts the workspace when it does not exist
    /// yet — confirm against the implementing storage.
    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
}
13999
/// Adapter that lets [`IndexingCache`] use a `FrankenStorage` as its backing
/// store by forwarding to the storage's own `ensure_*` methods.
impl IndexingCacheStorage for FrankenStorage {
    /// Forwards to `self.ensure_agent(agent)` unchanged.
    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
        self.ensure_agent(agent)
    }

    /// Forwards to `self.ensure_workspace(path, display_name)` unchanged.
    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
        self.ensure_workspace(path, display_name)
    }
}
14009
14010// IndexingCacheStorage for SqliteStorage removed: SqliteStorage is a type alias for FrankenStorage.
14011
14012impl IndexingCache {
14013    /// Create a new empty cache.
14014    pub fn new() -> Self {
14015        Self {
14016            agent_ids: HashMap::new(),
14017            workspace_ids: HashMap::new(),
14018            hits: 0,
14019            misses: 0,
14020        }
14021    }
14022
14023    /// Check if caching is enabled via environment variable.
14024    /// Returns true unless CASS_SQLITE_CACHE is set to "0" or "false".
14025    pub fn is_enabled() -> bool {
14026        dotenvy::var("CASS_SQLITE_CACHE")
14027            .map(|v| v != "0" && v.to_lowercase() != "false")
14028            .unwrap_or(true)
14029    }
14030
14031    /// Get or insert an agent ID, using cache if available.
14032    ///
14033    /// Returns the cached ID if present, otherwise calls ensure_agent
14034    /// and caches the result.
14035    pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
14036    where
14037        S: IndexingCacheStorage + ?Sized,
14038    {
14039        if let Some(&cached) = self.agent_ids.get(&agent.slug) {
14040            self.hits += 1;
14041            return Ok(cached);
14042        }
14043
14044        self.misses += 1;
14045        let id = storage.ensure_indexing_agent(agent)?;
14046        self.agent_ids.insert(agent.slug.clone(), id);
14047        Ok(id)
14048    }
14049
14050    /// Get or insert a workspace ID, using cache if available.
14051    ///
14052    /// Returns the cached ID if present, otherwise calls ensure_workspace
14053    /// and caches the result.
14054    pub fn get_or_insert_workspace(
14055        &mut self,
14056        storage: &(impl IndexingCacheStorage + ?Sized),
14057        path: &Path,
14058        display_name: Option<&str>,
14059    ) -> Result<i64> {
14060        if let Some(&cached) = self.workspace_ids.get(path) {
14061            self.hits += 1;
14062            return Ok(cached);
14063        }
14064
14065        self.misses += 1;
14066        let id = storage.ensure_indexing_workspace(path, display_name)?;
14067        self.workspace_ids.insert(path.to_path_buf(), id);
14068        Ok(id)
14069    }
14070
14071    /// Get cache statistics: (hits, misses, hit_rate).
14072    pub fn stats(&self) -> (u64, u64, f64) {
14073        let total = self.hits + self.misses;
14074        let hit_rate = if total > 0 {
14075            self.hits as f64 / total as f64
14076        } else {
14077            0.0
14078        };
14079        (self.hits, self.misses, hit_rate)
14080    }
14081
14082    /// Clear the cache, resetting all state.
14083    pub fn clear(&mut self) {
14084        self.agent_ids.clear();
14085        self.workspace_ids.clear();
14086        self.hits = 0;
14087        self.misses = 0;
14088    }
14089
14090    /// Number of cached agents.
14091    pub fn agent_count(&self) -> usize {
14092        self.agent_ids.len()
14093    }
14094
14095    /// Number of cached workspaces.
14096    pub fn workspace_count(&self) -> usize {
14097        self.workspace_ids.len()
14098    }
14099}
14100
14101// -------------------------------------------------------------------------
14102// StatsAggregator (kzxu) - Batched Daily Stats Updates
14103// -------------------------------------------------------------------------
14104// Aggregates daily stats in memory during batch ingestion, then flushes
14105// to the database in a single batched INSERT...ON CONFLICT operation.
14106// This prevents N×4 database writes (4 permutations per conversation).
14107
/// Accumulated statistics delta for a single (day_id, agent, source) combination.
#[derive(Clone, Copy, Debug, Default)]
pub struct StatsDelta {
    /// Change in the number of sessions (conversations).
    pub session_count_delta: i64,
    /// Change in the number of messages.
    pub message_count_delta: i64,
    /// Change in the summed message size.
    pub total_chars_delta: i64,
}

/// In-memory aggregator for batched daily stats updates.
///
/// Deltas are summed per (day_id, agent, source) while a batch is being
/// processed. Calling `expand()` afterwards produces the rollup rows (up to 4
/// per raw key), ready to be flushed via
/// `SqliteStorage::update_daily_stats_batched`.
///
/// # Example
/// ```ignore
/// let mut agg = StatsAggregator::new();
/// for conv in conversations {
///     agg.record(&conv.agent_slug, source_id, day_id, msg_count, char_count);
/// }
/// let entries = agg.expand();
/// storage.update_daily_stats_batched(&entries)?;
/// ```
#[derive(Debug, Default)]
pub struct StatsAggregator {
    /// Summed deltas keyed by (day_id, agent_slug, source_id), exactly as the
    /// caller recorded them; the "all" rollups are only produced by `expand()`.
    deltas: HashMap<(i64, String, String), StatsDelta>,
}

impl StatsAggregator {
    /// Build an aggregator with nothing recorded.
    pub fn new() -> Self {
        Self::default()
    }

    /// Record one whole conversation: one session plus its messages and chars.
    ///
    /// Equivalent to `record_delta` with a session delta of 1.
    ///
    /// # Arguments
    /// * `agent_slug` - The specific agent slug (not "all")
    /// * `source_id` - The specific source ID (not "all")
    /// * `day_id` - Days since 2020-01-01 (from `SqliteStorage::day_id_from_millis`)
    /// * `message_count` - Number of messages in the conversation
    /// * `total_chars` - Total character count across all messages
    pub fn record(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        message_count: i64,
        total_chars: i64,
    ) {
        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
    }

    /// Add an arbitrary delta to the (day_id, agent_slug, source_id) key.
    ///
    /// Intended for append-only updates, where the session delta may be 0
    /// while the message/char deltas are not. A delta whose three components
    /// are all zero is dropped without creating a key.
    pub fn record_delta(
        &mut self,
        agent_slug: &str,
        source_id: &str,
        day_id: i64,
        session_count_delta: i64,
        message_count_delta: i64,
        total_chars_delta: i64,
    ) {
        let is_noop = [session_count_delta, message_count_delta, total_chars_delta]
            .iter()
            .all(|&component| component == 0);
        if is_noop {
            return;
        }

        let slot = self
            .deltas
            .entry((day_id, agent_slug.to_owned(), source_id.to_owned()))
            .or_default();
        slot.session_count_delta += session_count_delta;
        slot.message_count_delta += message_count_delta;
        slot.total_chars_delta += total_chars_delta;
    }

    /// Roll every raw delta up into its four aggregate keys:
    /// - (agent, source) - specific both
    /// - ("all", source) - all agents, specific source
    /// - (agent, "all") - specific agent, all sources
    /// - ("all", "all") - totals
    ///
    /// Deltas landing on the same key are summed. The result is sorted by
    /// (day_id, agent_slug, source_id) so batching is deterministic.
    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
        let mut rolled_up: HashMap<(i64, String, String), StatsDelta> = HashMap::new();

        for ((day_id, agent, source), raw) in &self.deltas {
            let (agent, source) = (agent.as_str(), source.as_str());
            let targets = [
                (agent, source),
                ("all", source),
                (agent, "all"),
                ("all", "all"),
            ];

            for (pos, &(a, s)) in targets.iter().enumerate() {
                // When the recorded agent/source is itself "all", some targets
                // coincide; apply the delta to each distinct key only once.
                if targets[..pos].contains(&(a, s)) {
                    continue;
                }
                let bucket = rolled_up
                    .entry((*day_id, a.to_owned(), s.to_owned()))
                    .or_default();
                bucket.session_count_delta += raw.session_count_delta;
                bucket.message_count_delta += raw.message_count_delta;
                bucket.total_chars_delta += raw.total_chars_delta;
            }
        }

        let mut rows: Vec<(i64, String, String, StatsDelta)> = rolled_up
            .into_iter()
            .map(|((day, a, s), summed)| (day, a, s, summed))
            .collect();
        rows.sort_by(|lhs, rhs| (lhs.0, &lhs.1, &lhs.2).cmp(&(rhs.0, &rhs.1, &rhs.2)));
        rows
    }

    /// `true` when no non-zero delta has been recorded.
    pub fn is_empty(&self) -> bool {
        self.deltas.is_empty()
    }

    /// Number of distinct raw (day, agent, source) keys recorded so far.
    pub fn raw_entry_count(&self) -> usize {
        self.deltas.len()
    }
}
14240
14241// -------------------------------------------------------------------------
14242// TokenStatsAggregator — Batched Token Analytics Daily Stats
14243// -------------------------------------------------------------------------
14244// Mirrors StatsAggregator pattern for token-level metrics.
14245// Aggregates token usage in memory during batch ingestion, then flushes
14246// to token_daily_stats in a single batched INSERT...ON CONFLICT operation.
14247
14248/// Accumulated token statistics delta for a single (day_id, agent, source, model_family) combination.
14249#[derive(Clone, Debug, Default)]
14250pub struct TokenStatsDelta {
14251    pub api_call_count: i64,
14252    pub user_message_count: i64,
14253    pub assistant_message_count: i64,
14254    pub tool_message_count: i64,
14255    pub total_input_tokens: i64,
14256    pub total_output_tokens: i64,
14257    pub total_cache_read_tokens: i64,
14258    pub total_cache_creation_tokens: i64,
14259    pub total_thinking_tokens: i64,
14260    pub grand_total_tokens: i64,
14261    pub total_content_chars: i64,
14262    pub total_tool_calls: i64,
14263    pub estimated_cost_usd: f64,
14264    pub session_count: i64,
14265}
14266
14267/// In-memory aggregator for batched token daily stats updates.
14268///
14269/// During batch ingestion, accumulate token deltas per (day_id, agent, source, model_family) key.
14270/// After processing, call `expand()` to generate the 5 permutation keys, then flush via
14271/// `update_token_daily_stats_batched_in_tx`.
14272#[derive(Debug, Default)]
14273pub struct TokenStatsAggregator {
14274    /// Raw deltas keyed by (day_id, agent_slug, source_id, model_family).
14275    deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
14276}
14277
14278impl TokenStatsAggregator {
14279    pub fn new() -> Self {
14280        Self {
14281            deltas: HashMap::new(),
14282        }
14283    }
14284
14285    /// Record a single message's token contribution.
14286    #[allow(clippy::too_many_arguments)]
14287    pub fn record(
14288        &mut self,
14289        agent_slug: &str,
14290        source_id: &str,
14291        day_id: i64,
14292        model_family: &str,
14293        role: &str,
14294        usage: &crate::connectors::ExtractedTokenUsage,
14295        content_chars: i64,
14296        estimated_cost_usd: f64,
14297    ) {
14298        let key = (
14299            day_id,
14300            agent_slug.to_owned(),
14301            source_id.to_owned(),
14302            model_family.to_owned(),
14303        );
14304        let delta = self.deltas.entry(key).or_default();
14305
14306        delta.api_call_count += 1;
14307        match role {
14308            "user" => delta.user_message_count += 1,
14309            "assistant" | "agent" => delta.assistant_message_count += 1,
14310            "tool" => delta.tool_message_count += 1,
14311            _ => {}
14312        }
14313
14314        delta.total_input_tokens += usage.input_tokens.unwrap_or(0);
14315        delta.total_output_tokens += usage.output_tokens.unwrap_or(0);
14316        delta.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
14317        delta.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
14318        delta.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
14319        delta.grand_total_tokens += usage.total_tokens().unwrap_or(0);
14320        delta.total_content_chars += content_chars;
14321        delta.total_tool_calls += usage.tool_call_count as i64;
14322        delta.estimated_cost_usd += estimated_cost_usd;
14323    }
14324
14325    /// Record a session count bump for a given day/agent/source/model.
14326    pub fn record_session(
14327        &mut self,
14328        agent_slug: &str,
14329        source_id: &str,
14330        day_id: i64,
14331        model_family: &str,
14332    ) {
14333        let key = (
14334            day_id,
14335            agent_slug.to_owned(),
14336            source_id.to_owned(),
14337            model_family.to_owned(),
14338        );
14339        self.deltas.entry(key).or_default().session_count += 1;
14340    }
14341
14342    /// Expand raw deltas into 5 permutation keys for the 4-dimensional composite PK:
14343    /// - (agent, source, model)  — specific all three
14344    /// - ("all", source, model)  — all agents
14345    /// - (agent, "all", model)   — all sources
14346    /// - (agent, source, "all")  — all models
14347    /// - ("all", "all", "all")   — global total
14348    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
14349        let mut expanded: HashMap<(i64, String, String, String), TokenStatsDelta> = HashMap::new();
14350
14351        for ((day_id, agent, source, model), delta) in &self.deltas {
14352            let permutations = [
14353                (agent.as_str(), source.as_str(), model.as_str()),
14354                ("all", source.as_str(), model.as_str()),
14355                (agent.as_str(), "all", model.as_str()),
14356                (agent.as_str(), source.as_str(), "all"),
14357                ("all", "all", "all"),
14358            ];
14359
14360            for idx in 0..permutations.len() {
14361                let (a, s, m) = permutations[idx];
14362                // Deduplicate if agent/source/model is already "all"
14363                if permutations[..idx].contains(&(a, s, m)) {
14364                    continue;
14365                }
14366                let key = (*day_id, a.to_owned(), s.to_owned(), m.to_owned());
14367                let entry = expanded.entry(key).or_default();
14368                entry.api_call_count += delta.api_call_count;
14369                entry.user_message_count += delta.user_message_count;
14370                entry.assistant_message_count += delta.assistant_message_count;
14371                entry.tool_message_count += delta.tool_message_count;
14372                entry.total_input_tokens += delta.total_input_tokens;
14373                entry.total_output_tokens += delta.total_output_tokens;
14374                entry.total_cache_read_tokens += delta.total_cache_read_tokens;
14375                entry.total_cache_creation_tokens += delta.total_cache_creation_tokens;
14376                entry.total_thinking_tokens += delta.total_thinking_tokens;
14377                entry.grand_total_tokens += delta.grand_total_tokens;
14378                entry.total_content_chars += delta.total_content_chars;
14379                entry.total_tool_calls += delta.total_tool_calls;
14380                entry.estimated_cost_usd += delta.estimated_cost_usd;
14381                entry.session_count += delta.session_count;
14382            }
14383        }
14384
14385        let mut out: Vec<(i64, String, String, String, TokenStatsDelta)> = expanded
14386            .into_iter()
14387            .map(|((d, a, s, m), delta)| (d, a, s, m, delta))
14388            .collect();
14389        out.sort_by(|(d1, a1, s1, m1, _), (d2, a2, s2, m2, _)| {
14390            d1.cmp(d2)
14391                .then_with(|| a1.cmp(a2))
14392                .then_with(|| s1.cmp(s2))
14393                .then_with(|| m1.cmp(m2))
14394        });
14395        out
14396    }
14397
14398    pub fn is_empty(&self) -> bool {
14399        self.deltas.is_empty()
14400    }
14401
14402    pub fn raw_entry_count(&self) -> usize {
14403        self.deltas.len()
14404    }
14405}
14406
14407// -------------------------------------------------------------------------
14408// AnalyticsRollupAggregator — Batched usage_hourly + usage_daily Updates
14409// -------------------------------------------------------------------------
14410// Accumulates per-message deltas in memory, then flushes to both
14411// usage_hourly and usage_daily in a single batched operation.
14412
/// Accumulated counters for a single rollup key.
///
/// Every field is a running sum that `AnalyticsRollupAggregator::record`
/// adds to; `Default` yields an all-zero delta.
#[derive(Debug, Default, Clone)]
pub struct UsageRollupDelta {
    /// Messages folded into this bucket.
    pub message_count: i64,
    /// Messages whose role is `"user"`.
    pub user_message_count: i64,
    /// Messages whose role is `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Sum of the per-message tool call counts.
    pub tool_call_count: i64,
    /// Messages flagged as containing a plan.
    pub plan_message_count: i64,
    /// Estimated content tokens summed over plan messages.
    pub plan_content_tokens_est_total: i64,
    /// API token totals summed over plan messages whose data source is `"api"`.
    pub plan_api_tokens_total: i64,
    /// Messages whose data source is `"api"`.
    pub api_coverage_message_count: i64,
    /// Estimated content tokens summed over all messages.
    pub content_tokens_est_total: i64,
    /// Estimated content tokens summed over user messages.
    pub content_tokens_est_user: i64,
    /// Estimated content tokens summed over assistant/agent messages.
    pub content_tokens_est_assistant: i64,
    /// Input + output + cache-read + cache-creation + thinking tokens,
    /// summed over `"api"` messages.
    pub api_tokens_total: i64,
    /// API input tokens summed over `"api"` messages.
    pub api_input_tokens_total: i64,
    /// API output tokens summed over `"api"` messages.
    pub api_output_tokens_total: i64,
    /// API cache-read tokens summed over `"api"` messages.
    pub api_cache_read_tokens_total: i64,
    /// API cache-creation tokens summed over `"api"` messages.
    pub api_cache_creation_tokens_total: i64,
    /// API thinking tokens summed over `"api"` messages.
    pub api_thinking_tokens_total: i64,
}
14434
/// One pending `message_metrics` row, queued for batch insertion.
///
/// The same value is what `AnalyticsRollupAggregator::record` folds into the
/// rollup maps.
#[derive(Clone, Debug)]
pub struct MessageMetricsEntry {
    pub message_id: i64,
    pub created_at_ms: i64,
    /// Bucket id used for the hourly rollup key.
    pub hour_id: i64,
    /// Bucket id used for the daily and per-model daily rollup keys.
    pub day_id: i64,
    /// Part of every rollup key.
    pub agent_slug: String,
    /// Part of every rollup key.
    pub workspace_id: i64,
    /// Part of every rollup key.
    pub source_id: String,
    /// Message role; the rollups distinguish `"user"` from
    /// `"assistant"` / `"agent"` by exact string comparison.
    pub role: String,
    pub content_chars: i64,
    /// Estimated token count of the message content.
    pub content_tokens_est: i64,
    pub model_name: Option<String>,
    /// Part of the per-model daily rollup key.
    pub model_family: String,
    /// Part of the per-model daily rollup key.
    pub model_tier: String,
    pub provider: String,
    /// API-reported token counts; `None` is treated as 0 when summed.
    pub api_input_tokens: Option<i64>,
    pub api_output_tokens: Option<i64>,
    pub api_cache_read_tokens: Option<i64>,
    pub api_cache_creation_tokens: Option<i64>,
    pub api_thinking_tokens: Option<i64>,
    pub api_service_tier: Option<String>,
    /// Origin of the token numbers; only the value `"api"` counts towards
    /// the API coverage and API token rollup totals.
    pub api_data_source: String,
    /// Number of tool calls attributed to this message.
    pub tool_call_count: i64,
    pub has_tool_calls: bool,
    /// Whether the message was classified as a plan.
    pub has_plan: bool,
}
14463
14464/// In-memory aggregator for batched usage_hourly and usage_daily rollup updates.
14465///
14466/// Keyed by (bucket_id, agent_slug, workspace_id, source_id).
14467/// Maintains separate hourly and daily delta maps.
14468#[derive(Debug, Default)]
14469pub struct AnalyticsRollupAggregator {
14470    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14471    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14472    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
14473}
14474
14475impl AnalyticsRollupAggregator {
14476    pub fn new() -> Self {
14477        Self::default()
14478    }
14479
14480    /// Record a single message's contribution to both hourly and daily rollups.
14481    pub fn record(&mut self, entry: &MessageMetricsEntry) {
14482        let content_est = entry.content_tokens_est;
14483        let api_total = entry.api_input_tokens.unwrap_or(0)
14484            + entry.api_output_tokens.unwrap_or(0)
14485            + entry.api_cache_read_tokens.unwrap_or(0)
14486            + entry.api_cache_creation_tokens.unwrap_or(0)
14487            + entry.api_thinking_tokens.unwrap_or(0);
14488        let is_api = entry.api_data_source == "api";
14489        let is_user = entry.role == "user";
14490        let is_assistant = entry.role == "assistant" || entry.role == "agent";
14491
14492        // Apply to both hourly and daily
14493        for (map, bucket_id) in [
14494            (&mut self.hourly, entry.hour_id),
14495            (&mut self.daily, entry.day_id),
14496        ] {
14497            let key = (
14498                bucket_id,
14499                entry.agent_slug.clone(),
14500                entry.workspace_id,
14501                entry.source_id.clone(),
14502            );
14503            let d = map.entry(key).or_default();
14504            d.message_count += 1;
14505            if is_user {
14506                d.user_message_count += 1;
14507                d.content_tokens_est_user += content_est;
14508            }
14509            if is_assistant {
14510                d.assistant_message_count += 1;
14511                d.content_tokens_est_assistant += content_est;
14512            }
14513            d.tool_call_count += entry.tool_call_count;
14514            if entry.has_plan {
14515                d.plan_message_count += 1;
14516                d.plan_content_tokens_est_total += content_est;
14517                if is_api {
14518                    d.plan_api_tokens_total += api_total;
14519                }
14520            }
14521            if is_api {
14522                d.api_coverage_message_count += 1;
14523                d.api_tokens_total += api_total;
14524                d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14525                d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14526                d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14527                d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14528                d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14529            }
14530            d.content_tokens_est_total += content_est;
14531        }
14532
14533        let model_key = (
14534            entry.day_id,
14535            entry.agent_slug.clone(),
14536            entry.workspace_id,
14537            entry.source_id.clone(),
14538            entry.model_family.clone(),
14539            entry.model_tier.clone(),
14540        );
14541        let d = self.models_daily.entry(model_key).or_default();
14542        d.message_count += 1;
14543        if is_user {
14544            d.user_message_count += 1;
14545            d.content_tokens_est_user += content_est;
14546        }
14547        if is_assistant {
14548            d.assistant_message_count += 1;
14549            d.content_tokens_est_assistant += content_est;
14550        }
14551        d.tool_call_count += entry.tool_call_count;
14552        if entry.has_plan {
14553            d.plan_message_count += 1;
14554            d.plan_content_tokens_est_total += content_est;
14555            if is_api {
14556                d.plan_api_tokens_total += api_total;
14557            }
14558        }
14559        if is_api {
14560            d.api_coverage_message_count += 1;
14561            d.api_tokens_total += api_total;
14562            d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14563            d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14564            d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14565            d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14566            d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14567        }
14568        d.content_tokens_est_total += content_est;
14569    }
14570
14571    pub fn is_empty(&self) -> bool {
14572        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
14573    }
14574
14575    pub fn hourly_entry_count(&self) -> usize {
14576        self.hourly.len()
14577    }
14578
14579    pub fn daily_entry_count(&self) -> usize {
14580        self.daily.len()
14581    }
14582
14583    pub fn models_daily_entry_count(&self) -> usize {
14584        self.models_daily.len()
14585    }
14586}
14587
14588/// Whether the current role should be considered for plan attribution.
14589///
14590/// Plan attribution v2 defaults to assistant/agent messages only.
14591fn has_plan_for_role(role: &str, content: &str) -> bool {
14592    let role = role.trim();
14593    (role.eq_ignore_ascii_case("assistant") || role.eq_ignore_ascii_case("agent"))
14594        && has_plan_heuristic(content)
14595}
14596
14597/// Heuristic to detect "plan" messages.
14598///
14599/// v2 behavior:
14600/// - Require an explicit plan marker near the top of the message.
14601/// - Require structured steps (numbered or bullets) to reduce false positives.
14602/// - Avoid classifying tool-output blobs as plans.
14603fn has_plan_heuristic(content: &str) -> bool {
14604    if content.len() < 24 {
14605        return false;
14606    }
14607
14608    let lower = content.to_lowercase();
14609
14610    // Ignore tool-output-like blobs unless they also have a strong plan header.
14611    let looks_like_tool_blob = lower.contains("```")
14612        || lower.contains("\"tool\"")
14613        || lower.contains("stdout:")
14614        || lower.contains("stderr:")
14615        || lower.contains("exit code:");
14616
14617    let mut lines: Vec<&str> = Vec::with_capacity(60);
14618    let mut in_fenced_code = false;
14619    for raw in lower.lines() {
14620        let line = raw.trim();
14621        if line.starts_with("```") {
14622            in_fenced_code = !in_fenced_code;
14623            continue;
14624        }
14625        if in_fenced_code || line.is_empty() {
14626            continue;
14627        }
14628        lines.push(line);
14629        if lines.len() >= 60 {
14630            break;
14631        }
14632    }
14633
14634    let header_pos = lines.iter().position(|line| {
14635        line.starts_with("## plan")
14636            || line.starts_with("# plan")
14637            || line.starts_with("plan:")
14638            || line.starts_with("implementation plan")
14639            || line.starts_with("next steps:")
14640            || line.starts_with("action plan:")
14641    });
14642    let preview_top = lines.iter().take(8).copied().collect::<Vec<_>>().join("\n");
14643    let header_near_top = header_pos.is_some_and(|idx| idx <= 6) || preview_top.contains("plan:");
14644
14645    if !header_near_top {
14646        return false;
14647    }
14648    if looks_like_tool_blob && header_pos.is_none() {
14649        return false;
14650    }
14651
14652    let numbered_steps = lines
14653        .iter()
14654        .filter(|line| is_numbered_step_line(line))
14655        .count();
14656    let bullet_steps = lines
14657        .iter()
14658        .filter(|line| {
14659            line.starts_with("- ")
14660                || line.starts_with("* ")
14661                || line.starts_with("+ ")
14662                || line.starts_with("- [ ] ")
14663                || line.starts_with("- [x] ")
14664        })
14665        .count();
14666
14667    numbered_steps >= 2 || (numbered_steps >= 1 && bullet_steps >= 1) || bullet_steps >= 3
14668}
14669
/// Returns `true` for lines shaped like `1. text` or `1) text`.
///
/// After leading whitespace the line must start with one to three ASCII
/// digits, immediately followed by `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let body = line.trim_start();
    let after_digits = body.trim_start_matches(|c: char| c.is_ascii_digit());
    // ASCII digits are one byte each, so the byte difference is the digit count.
    let digits = body.len() - after_digits.len();
    (1..=3).contains(&digits)
        && (after_digits.starts_with(". ") || after_digits.starts_with(") "))
}
14679
/// One pending `token_usage` row, queued for batch insertion.
///
/// NOTE(review): the code that fills and inserts these rows is outside this
/// excerpt; the field notes below restate only what the types show.
#[derive(Clone, Debug)]
pub struct TokenUsageEntry {
    pub message_id: i64,
    pub conversation_id: i64,
    pub agent_id: i64,
    /// Optional here, unlike `MessageMetricsEntry::workspace_id`.
    pub workspace_id: Option<i64>,
    pub source_id: String,
    pub timestamp_ms: i64,
    pub day_id: i64,
    pub model_name: Option<String>,
    pub model_family: Option<String>,
    pub model_tier: Option<String>,
    pub service_tier: Option<String>,
    pub provider: Option<String>,
    /// Token counts are optional; `None` presumably means "not reported".
    pub input_tokens: Option<i64>,
    pub output_tokens: Option<i64>,
    pub cache_read_tokens: Option<i64>,
    pub cache_creation_tokens: Option<i64>,
    pub thinking_tokens: Option<i64>,
    pub total_tokens: Option<i64>,
    pub estimated_cost_usd: Option<f64>,
    pub role: String,
    pub content_chars: i64,
    pub has_tool_calls: bool,
    /// Unsigned here, whereas `MessageMetricsEntry::tool_call_count` is `i64`.
    pub tool_call_count: u32,
    pub data_source: String,
}
14708
14709// -------------------------------------------------------------------------
14710// PricingTable — In-memory cache for model_pricing lookups (bead z9fse.10)
14711// -------------------------------------------------------------------------
14712
/// A single pricing rule, as loaded from the `model_pricing` table.
#[derive(Clone, Debug)]
pub struct PricingEntry {
    /// SQL `LIKE` pattern matched against model names.
    pub model_pattern: String,
    pub provider: String,
    /// USD per one million non-cache input tokens.
    pub input_cost_per_mtok: f64,
    /// USD per one million output tokens.
    pub output_cost_per_mtok: f64,
    /// USD per one million cache-read tokens; `None` means cache reads add
    /// nothing to the computed cost.
    pub cache_read_cost_per_mtok: Option<f64>,
    /// USD per one million cache-creation tokens; `None` means cache
    /// creation adds nothing to the computed cost.
    pub cache_creation_cost_per_mtok: Option<f64>,
    /// Effective date as day_id (days since 2020-01-01).
    pub effective_day_id: i64,
}
14725
/// Pricing coverage counters collected during a batch operation.
#[derive(Default, Clone, Debug)]
pub struct PricingDiagnostics {
    /// Number of items recorded as priced.
    pub priced_count: u64,
    /// Number of items recorded as unpriced.
    pub unpriced_count: u64,
    /// Unpriced model name → occurrence count. Items without a model name
    /// are tallied under `"(none)"`.
    pub unknown_models: HashMap<String, u64>,
}
14734
14735impl PricingDiagnostics {
14736    fn record_priced(&mut self) {
14737        self.priced_count += 1;
14738    }
14739
14740    fn record_unpriced(&mut self, model_name: Option<&str>) {
14741        self.unpriced_count += 1;
14742        let key = model_name.unwrap_or("(none)").to_string();
14743        *self.unknown_models.entry(key).or_insert(0) += 1;
14744    }
14745
14746    /// Log a summary of pricing coverage.
14747    pub fn log_summary(&self) {
14748        let total = self.priced_count + self.unpriced_count;
14749        if total == 0 {
14750            return;
14751        }
14752        let pct = (self.priced_count as f64 / total as f64) * 100.0;
14753        tracing::info!(
14754            target: "cass::analytics::pricing",
14755            priced = self.priced_count,
14756            unpriced = self.unpriced_count,
14757            total = total,
14758            coverage_pct = format!("{pct:.1}%"),
14759            "pricing coverage"
14760        );
14761        if !self.unknown_models.is_empty() {
14762            let mut sorted: Vec<_> = self.unknown_models.iter().collect();
14763            sorted.sort_by(|a, b| b.1.cmp(a.1));
14764            for (model, count) in sorted.iter().take(5) {
14765                tracing::debug!(
14766                    target: "cass::analytics::pricing",
14767                    model = model.as_str(),
14768                    count = count,
14769                    "unknown model (no pricing)"
14770                );
14771            }
14772        }
14773    }
14774}
14775
14776/// In-memory pricing table loaded from `model_pricing` for fast lookups.
14777#[derive(Debug, Clone)]
14778pub struct PricingTable {
14779    entries: Vec<PricingEntry>,
14780}
14781
14782impl PricingTable {
14783    /// Load all pricing entries from the database.
14784    pub fn load(conn: &FrankenConnection) -> Result<Self> {
14785        Self::franken_load(conn)
14786    }
14787
14788    /// Load all pricing entries from a frankensqlite connection.
14789    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
14790        let rows = conn.query(
14791            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
14792                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
14793             FROM model_pricing
14794             ORDER BY effective_date DESC",
14795        )?;
14796        let mut entries = Vec::with_capacity(rows.len());
14797        for row in &rows {
14798            let effective_date: String = row.get_typed(6)?;
14799            let effective_day_id = date_str_to_day_id(&effective_date)?;
14800            entries.push(PricingEntry {
14801                model_pattern: row.get_typed(0)?,
14802                provider: row.get_typed(1)?,
14803                input_cost_per_mtok: row.get_typed(2)?,
14804                output_cost_per_mtok: row.get_typed(3)?,
14805                cache_read_cost_per_mtok: row.get_typed(4)?,
14806                cache_creation_cost_per_mtok: row.get_typed(5)?,
14807                effective_day_id,
14808            });
14809        }
14810        Ok(Self { entries })
14811    }
14812
14813    /// Look up the best pricing entry for a given model name and date.
14814    ///
14815    /// Selection rules:
14816    /// 1. Pattern must match model_name (SQL LIKE semantics).
14817    /// 2. effective_day_id must be <= message_day_id.
14818    /// 3. Among matches, prefer the most recent effective_date.
14819    /// 4. Tie-break by pattern specificity (longest pattern wins).
14820    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
14821        let mut best: Option<&PricingEntry> = None;
14822
14823        for entry in &self.entries {
14824            if entry.effective_day_id > message_day_id {
14825                continue;
14826            }
14827            if !sql_like_match(model_name, &entry.model_pattern) {
14828                continue;
14829            }
14830
14831            match best {
14832                None => best = Some(entry),
14833                Some(current) => {
14834                    if entry.effective_day_id > current.effective_day_id
14835                        || (entry.effective_day_id == current.effective_day_id
14836                            && entry.model_pattern.len() > current.model_pattern.len())
14837                    {
14838                        best = Some(entry);
14839                    }
14840                }
14841            }
14842        }
14843
14844        best
14845    }
14846
14847    /// Compute estimated cost in USD for a set of token counts.
14848    ///
14849    /// Returns `None` if no pricing entry matches or if no token counts are available.
14850    pub fn compute_cost(
14851        &self,
14852        model_name: Option<&str>,
14853        message_day_id: i64,
14854        input_tokens: Option<i64>,
14855        output_tokens: Option<i64>,
14856        cache_read_tokens: Option<i64>,
14857        cache_creation_tokens: Option<i64>,
14858    ) -> Option<f64> {
14859        let model = model_name?;
14860        let pricing = self.lookup(model, message_day_id)?;
14861
14862        if input_tokens.is_none() && output_tokens.is_none() {
14863            return None;
14864        }
14865
14866        let mut cost = 0.0;
14867        let cache_read = cache_read_tokens.unwrap_or(0);
14868        let cache_creation = cache_creation_tokens.unwrap_or(0);
14869        // input_tokens includes cache tokens as a subset; subtract them
14870        // so we don't charge at both the full input rate AND the cache rate.
14871        let non_cache_input = input_tokens
14872            .unwrap_or(0)
14873            .saturating_sub(cache_read)
14874            .saturating_sub(cache_creation)
14875            .max(0);
14876        cost += non_cache_input as f64 * pricing.input_cost_per_mtok / 1_000_000.0;
14877        cost += output_tokens.unwrap_or(0) as f64 * pricing.output_cost_per_mtok / 1_000_000.0;
14878
14879        if let Some(cache_price) = pricing.cache_read_cost_per_mtok {
14880            cost += cache_read as f64 * cache_price / 1_000_000.0;
14881        }
14882        if let Some(cache_price) = pricing.cache_creation_cost_per_mtok {
14883            cost += cache_creation as f64 * cache_price / 1_000_000.0;
14884        }
14885
14886        Some(cost)
14887    }
14888
14889    /// Whether the pricing table has any entries.
14890    pub fn is_empty(&self) -> bool {
14891        self.entries.is_empty()
14892    }
14893}
14894
14895/// Convert "YYYY-MM-DD" date string to day_id (days since 2020-01-01),
14896/// matching the format produced by `day_id_from_millis`.
14897fn date_str_to_day_id(s: &str) -> Result<i64> {
14898    use chrono::NaiveDate;
14899    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
14900        Some(d) => d,
14901        None => unreachable!(),
14902    };
14903    NaiveDate::parse_from_str(s, "%Y-%m-%d")
14904        .map(|d| (d - EPOCH_2020).num_days())
14905        .with_context(|| format!("invalid effective_date '{s}'"))
14906}
14907
14908/// SQL LIKE pattern matcher (case-insensitive). `%` = any sequence, `_` = any single char.
14909fn sql_like_match(value: &str, pattern: &str) -> bool {
14910    sql_like_match_bytes(
14911        value.to_ascii_lowercase().as_bytes(),
14912        pattern.to_ascii_lowercase().as_bytes(),
14913    )
14914}
14915
/// Byte length of the UTF-8 character whose first byte is `b`.
///
/// `b` is expected to be a lead byte. Bytes in `0x80..=0xBF` (continuation
/// bytes) are not rejected; they fall into the 2-byte bucket.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        0xF0..=0xFF => 4,
    }
}
14928
14929fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
14930    if pat.is_empty() {
14931        return val.is_empty();
14932    }
14933    match pat[0] {
14934        b'%' => {
14935            let mut p = 1;
14936            while p < pat.len() && pat[p] == b'%' {
14937                p += 1;
14938            }
14939            let rest = &pat[p..];
14940            // Iterate only at UTF-8 char boundaries
14941            let mut i = 0;
14942            while i <= val.len() {
14943                if sql_like_match_bytes(&val[i..], rest) {
14944                    return true;
14945                }
14946                if i < val.len() {
14947                    i += utf8_char_len(val[i]);
14948                } else {
14949                    break;
14950                }
14951            }
14952            false
14953        }
14954        b'_' => {
14955            // Match one full UTF-8 character, not just one byte
14956            if val.is_empty() {
14957                return false;
14958            }
14959            let char_len = utf8_char_len(val[0]);
14960            val.len() >= char_len && sql_like_match_bytes(&val[char_len..], &pat[1..])
14961        }
14962        c => !val.is_empty() && val[0] == c && sql_like_match_bytes(&val[1..], &pat[1..]),
14963    }
14964}
14965
14966fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
14967    dotenvy::var(var)
14968        .ok()
14969        .and_then(|raw| raw.parse::<usize>().ok())
14970        .filter(|value| *value > 0)
14971        .unwrap_or(default)
14972}
14973
14974/// Returns true when the error chain represents a real `FrankenError::OutOfMemory`
14975/// (typed variant) or a bare "out of memory" / "not enough memory" message.
14976///
14977/// We *deliberately* do not do substring matching on the rendered chain: frankensqlite's
14978/// `FrankenError::OutOfMemory` renders as the literal "out of memory" and is also emitted
14979/// for several non-process-OOM internal conditions (VFS buffer / VDBE register allocation).
14980/// Contextual messages like "connector parse failed: not enough memory in record" must not
14981/// be promoted into the OOM-bisect/quarantine path. See `retryable_franken_anyhow` above
14982/// for the same downcast idiom.
14983fn is_out_of_memory_error<E: OutOfMemoryProbe + ?Sized>(err: &E) -> bool {
14984    err.is_out_of_memory()
14985}
14986
/// Implemented by error types that can report whether they represent an
/// out-of-memory condition.
trait OutOfMemoryProbe {
    /// Returns `true` when this error should be treated as out-of-memory.
    fn is_out_of_memory(&self) -> bool;
}
14990
14991impl OutOfMemoryProbe for anyhow::Error {
14992    fn is_out_of_memory(&self) -> bool {
14993        self.chain().any(|cause| {
14994            if cause
14995                .downcast_ref::<frankensqlite::FrankenError>()
14996                .is_some_and(|err| matches!(err, frankensqlite::FrankenError::OutOfMemory))
14997            {
14998                return true;
14999            }
15000            is_exact_out_of_memory_message(&cause.to_string())
15001        })
15002    }
15003}
15004
15005impl OutOfMemoryProbe for frankensqlite::FrankenError {
15006    fn is_out_of_memory(&self) -> bool {
15007        matches!(self, frankensqlite::FrankenError::OutOfMemory)
15008    }
15009}
15010
/// Returns `true` when `message`, ignoring surrounding whitespace and ASCII
/// case, is exactly "out of memory" or "not enough memory".
///
/// Messages that merely contain one of these phrases do not match.
fn is_exact_out_of_memory_message(message: &str) -> bool {
    let trimmed = message.trim();
    ["out of memory", "not enough memory"]
        .iter()
        .any(|phrase| trimmed.eq_ignore_ascii_case(phrase))
}
15017
15018// Second SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
15019// All methods (insert_conversation_tree, list_agents, list_conversations, etc.) are
15020// available through FrankenStorage.
15021
/// Per-day counts used for histogram display.
#[derive(Clone, Debug)]
pub struct DailyCount {
    /// Day bucket the counts belong to.
    pub day_id: i64,
    /// Session count for the day.
    pub sessions: i64,
    /// Message count for the day.
    pub messages: i64,
    /// Character count for the day.
    pub chars: i64,
}
15030
/// Outcome of an analytics rebuild operation.
///
/// NOTE(review): the rebuild code is outside this excerpt; the row fields
/// presumably count rows written to the tables they are named after.
#[derive(Clone, Debug)]
pub struct AnalyticsRebuildResult {
    pub message_metrics_rows: usize,
    pub usage_hourly_rows: usize,
    pub usage_daily_rows: usize,
    pub usage_models_daily_rows: usize,
    /// Elapsed time in milliseconds.
    pub elapsed_ms: u64,
    /// Throughput in messages per second.
    pub messages_per_sec: f64,
}
15041
/// Outcome of rebuilding the daily stats.
#[derive(Clone, Debug)]
pub struct DailyStatsRebuildResult {
    /// Number of rows created by the rebuild.
    pub rows_created: i64,
    /// Total session count reported by the rebuild.
    pub total_sessions: i64,
}
15048
/// Outcome of purging archived data for a single agent.
///
/// `Default` is the "nothing deleted" result.
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub struct AgentArchivePurgeResult {
    /// Number of conversations deleted.
    pub conversations_deleted: usize,
    /// Number of messages deleted.
    pub messages_deleted: usize,
}
15055
/// Health snapshot of the daily stats table.
///
/// NOTE(review): the code that fills this struct is outside this excerpt;
/// the field notes below are read off the names and should be confirmed
/// against the producer.
#[derive(Clone, Debug)]
pub struct DailyStatsHealth {
    /// Whether the table is populated.
    pub populated: bool,
    /// Row count of the table.
    pub row_count: i64,
    /// Oldest update timestamp in milliseconds, if any.
    pub oldest_update_ms: Option<i64>,
    /// Conversation count — presumably the source-of-truth figure.
    pub conversation_count: i64,
    /// Total according to the materialized stats.
    pub materialized_total: i64,
    /// Presumably the gap between `conversation_count` and
    /// `materialized_total` — TODO confirm sign convention.
    pub drift: i64,
}
15066
15067// -------------------------------------------------------------------------
15068// FTS5 Batch Insert (P2 Opt 2.1)
15069// -------------------------------------------------------------------------
15070
/// Rows per FTS5 insert batch.
///
/// Each row binds 7 parameters (rowid + 6 columns). With SQLite's default
/// `SQLITE_MAX_VARIABLE_NUMBER` of 999 a single statement can carry at most
/// 999 / 7 = 142 rows; 100 leaves a safety margin and keeps memory use low.
const FTS5_BATCH_SIZE: usize = 100;
15075
/// One message row as read for an FTS rebuild.
///
/// NOTE(review): the query that produces these rows is outside this excerpt.
#[derive(Clone, Debug)]
struct FtsRebuildMessageRow {
    rowid: i64,
    message_id: i64,
    conversation_id: i64,
    content: String,
    /// Message timestamp, when one is recorded.
    created_at: Option<i64>,
}
15084
/// The conversation columns that are projected into FTS rows.
///
/// NOTE(review): the code that builds and consumes this projection is
/// outside this excerpt.
#[derive(Clone, Debug)]
struct FtsConversationProjection {
    title: String,
    agent_id: Option<i64>,
    workspace_id: Option<i64>,
    source_path: String,
}
15092
/// One document waiting to be inserted into the FTS5 index.
#[derive(Clone, Debug)]
pub struct FtsEntry {
    /// Message text.
    pub content: String,
    /// Conversation title; empty when the conversation has none.
    pub title: String,
    /// Agent slug of the owning conversation.
    pub agent: String,
    /// Workspace path as a string; empty when the conversation has none.
    pub workspace: String,
    /// Source path of the owning conversation, as a string.
    pub source_path: String,
    /// Message timestamp, falling back to the conversation start time.
    pub created_at: Option<i64>,
    /// Id of the message this entry indexes.
    pub message_id: i64,
}
15104
15105impl FtsEntry {
15106    /// Create an FTS entry from a message and conversation.
15107    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
15108        FtsEntry {
15109            content: msg.content.clone(),
15110            title: conv.title.clone().unwrap_or_default(),
15111            agent: conv.agent_slug.clone(),
15112            workspace: conv
15113                .workspace
15114                .as_ref()
15115                .map(|p| p.to_string_lossy().into_owned())
15116                .unwrap_or_default(),
15117            source_path: path_to_string(&conv.source_path),
15118            created_at: msg.created_at.or(conv.started_at),
15119            message_id,
15120        }
15121    }
15122}
15123
/// Document-count cap for a pending `FtsEntry` batch.
/// NOTE(review): the code that enforces this cap is outside this excerpt.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
/// Character-count cap for a pending `FtsEntry` batch (1024 * 1024).
/// NOTE(review): the code that enforces this cap is outside this excerpt.
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1 << 20;
15126
/// Default number of rows per batch for the FTS rebuild INSERT (Bug #168).
///
/// When `fts_messages` is empty but `messages` holds 100K+ rows, a single
/// unbounded INSERT-SELECT runs out of memory. Capping each batch keeps peak
/// memory bounded. Set `CASS_FTS_REBUILD_BATCH_SIZE` to override for tuning.
const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
15132
15133/// Read the FTS rebuild batch size from the environment, falling back to the
15134/// compiled-in default.
15135fn fts_rebuild_batch_size() -> usize {
15136    dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
15137        .ok()
15138        .and_then(|v| v.parse::<usize>().ok())
15139        .filter(|&n| n > 0)
15140        .unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
15141}
15142
15143fn flush_pending_fts_entries(
15144    storage: &FrankenStorage,
15145    tx: &FrankenTransaction<'_>,
15146    entries: &mut Vec<FtsEntry>,
15147    pending_chars: &mut usize,
15148    inserted_total: &mut usize,
15149) -> Result<()> {
15150    if entries.is_empty() {
15151        return Ok(());
15152    }
15153
15154    if storage.fts_messages_present_cached(tx) {
15155        *inserted_total += franken_batch_insert_fts(tx, entries)?;
15156    }
15157    entries.clear();
15158    *pending_chars = 0;
15159    Ok(())
15160}
15161
/// Render a path as an owned `String`, replacing any non-UTF-8 sequences
/// with the Unicode replacement character (lossy conversion).
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    let path: &Path = p.as_ref();
    String::from(path.to_string_lossy())
}
15165
15166fn role_str(role: &MessageRole) -> String {
15167    role_as_str(role).to_owned()
15168}
15169
15170fn role_as_str(role: &MessageRole) -> &str {
15171    match role {
15172        MessageRole::User => "user",
15173        MessageRole::Agent => "agent",
15174        MessageRole::Tool => "tool",
15175        MessageRole::System => "system",
15176        MessageRole::Other(v) => v.as_str(),
15177    }
15178}
15179
15180fn agent_kind_str(kind: AgentKind) -> String {
15181    match kind {
15182        AgentKind::Cli => "cli".into(),
15183        AgentKind::VsCode => "vscode".into(),
15184        AgentKind::Hybrid => "hybrid".into(),
15185    }
15186}
15187
15188// =============================================================================
15189// Tests (bead yln.4)
15190// =============================================================================
15191
15192#[cfg(test)]
15193mod tests {
15194    use super::*;
15195    use serial_test::serial;
15196    use tempfile::TempDir;
15197
    /// Test-only guard that puts one process environment variable back to its
    /// recorded state when dropped.
    struct EnvGuard {
        /// Name of the environment variable being guarded.
        key: &'static str,
        /// Value to re-set on drop; `None` means the variable is removed.
        previous: Option<String>,
    }
15202
15203    impl Drop for EnvGuard {
15204        fn drop(&mut self) {
15205            if let Some(value) = &self.previous {
15206                // SAFETY: test helper restores prior process env for isolation.
15207                unsafe {
15208                    std::env::set_var(self.key, value);
15209                }
15210            } else {
15211                // SAFETY: test helper restores prior process env for isolation.
15212                unsafe {
15213                    std::env::remove_var(self.key);
15214                }
15215            }
15216        }
15217    }
15218
15219    fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
15220        let previous = dotenvy::var(key).ok();
15221        // SAFETY: test helper toggles a process-local env var for isolation.
15222        unsafe {
15223            std::env::set_var(key, value.as_ref());
15224        }
15225        EnvGuard { key, previous }
15226    }
15227
15228    #[test]
15229    fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
15230        let dir = TempDir::new().unwrap();
15231        let canonical = dir.path().join("agent_search.db");
15232        let scratch = dir.path().join("scratch.db");
15233
15234        assert_eq!(
15235            doctor_mutation_lock_path_for_db_open(&canonical),
15236            Some(dir.path().join("doctor/locks/doctor-repair.lock"))
15237        );
15238        assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch), None);
15239    }
15240
15241    #[test]
15242    fn doctor_lock_metadata_pid_detection_is_exact() {
15243        let current = std::process::id();
15244
15245        assert!(doctor_lock_metadata_pid_is_current_process(&format!(
15246            "schema_version=1\npid={current}\nmode=safe_auto_run\n"
15247        )));
15248        assert!(!doctor_lock_metadata_pid_is_current_process(
15249            "schema_version=1\npid=not-a-pid\n"
15250        ));
15251        assert!(!doctor_lock_metadata_pid_is_current_process(&format!(
15252            "pid={}\n",
15253            current.saturating_add(1)
15254        )));
15255    }
15256
    /// A read-only open of the canonical DB must fail while the doctor
    /// mutation lock file is exclusively locked and its metadata names pid 1.
    #[test]
    #[cfg(not(windows))]
    fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Open and close the storage once before the lock is taken.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        // Take the exclusive lock first, then replace the file contents with
        // metadata naming pid 1 (presumably never this test process).
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
        lock_file.sync_all().unwrap();

        // The open is attempted with a 25 ms timeout and must return an error.
        let err =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect_err("active doctor mutation lock must block canonical DB opens");
        let message = err.to_string();
        assert!(
            message.contains("doctor mutation lock") && message.contains("active"),
            "error should identify the active doctor mutation lock: {message}"
        );

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
15293
    /// A read-only open of the canonical DB must succeed while the doctor
    /// mutation lock is held and its metadata names the current process.
    #[test]
    fn doctor_storage_open_allows_current_doctor_process_probe() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Open and close the storage once before the lock is taken.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        // Take the exclusive lock first, then replace the file contents with
        // metadata carrying this process's own pid.
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
        lock_file.sync_all().unwrap();

        // On Windows the test additionally enters the explicit bypass guard
        // before opening (NOTE(review): presumably because the locked file's
        // metadata cannot be read there — confirm).
        #[cfg(windows)]
        let _bypass = enter_doctor_mutation_db_open_bypass();

        let conn =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect(
                    "doctor process must be able to run post-repair read probes under its own lock",
                );
        drop(conn);

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
15330
15331    #[test]
15332    fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
15333        let mut attempts = Vec::new();
15334
15335        let selected = disable_autocommit_retain(|pragma| {
15336            attempts.push(pragma);
15337            if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
15338                Err("compat namespace unavailable")
15339            } else {
15340                Ok(())
15341            }
15342        })
15343        .expect("canonical pragma should disable autocommit retain");
15344
15345        assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
15346        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
15347    }
15348
15349    #[test]
15350    fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
15351        let mut attempts = Vec::new();
15352
15353        let err = disable_autocommit_retain(|pragma| {
15354            attempts.push(pragma);
15355            Err("unsupported pragma")
15356        })
15357        .expect_err("unsupported autocommit retain controls should fail closed");
15358
15359        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
15360        let message = err.to_string();
15361        assert!(
15362            message.contains("refusing to keep a long-lived MVCC connection"),
15363            "error should force callers away from unbounded snapshot retention: {message}"
15364        );
15365        assert!(
15366            message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
15367                && message.contains("PRAGMA autocommit_retain = OFF;"),
15368            "error should preserve attempted PRAGMAs for diagnostics: {message}"
15369        );
15370    }
15371
15372    /// Open a rusqlite connection on `db_path` for the narrow purpose of
15373    /// injecting (or inspecting the raw projection of) sqlite_master
15374    /// corruption patterns in test fixtures. Frankensqlite intentionally does
15375    /// not support `PRAGMA writable_schema` writes or raw inserts to
15376    /// sqlite_master (see AGENTS.md: "PRAGMA writable_schema: Not supported for
15377    /// write operations"), so these fixtures retain rusqlite as the standard-
15378    /// SQLite interop layer. All callers are in this test module and run under
15379    /// #[cfg(test)]; no production code path touches rusqlite here.
15380    fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
15381        rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
15382    }
15383
    /// Seeds a database at `db_path` by writing rows directly through a raw
    /// `FrankenConnection`: the schema from `HISTORICAL_RECOVERY_CORE_SCHEMA`,
    /// one fixed `codex` agent row, then for each input conversation an
    /// optional workspace row, the conversation row, and its message rows.
    ///
    /// Conversation ids are the 1-based position in `conversations`; message
    /// ids come from a single counter shared across all conversations.
    /// Every failure panics via `unwrap`.
    fn seed_historical_db_direct(
        db_path: &Path,
        conversations: &[crate::model::types::Conversation],
    ) {
        // Make sure the parent directory exists before opening the database.
        if let Some(parent) = db_path.parent() {
            fs::create_dir_all(parent).unwrap();
        }

        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
        conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
        // Single agent row (id 1) that every seeded conversation references.
        conn.execute_compat(
            "INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
        )
        .unwrap();

        // Message ids keep increasing across conversations.
        let mut next_message_id = 1_i64;
        for (conv_index, conv) in conversations.iter().enumerate() {
            let conversation_id = i64::try_from(conv_index + 1).unwrap();
            // A workspace row is inserted only when the conversation has one;
            // it reuses the conversation id as the workspace id.
            let workspace_id = conv.workspace.as_ref().map(|workspace| {
                let workspace_id = conversation_id;
                let workspace_path = workspace.to_string_lossy().into_owned();
                conn.execute_compat(
                    "INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
                    fparams![
                        workspace_id,
                        workspace_path.as_str(),
                        workspace_path.as_str()
                    ],
                )
                .unwrap();
                workspace_id
            });
            let source_path = conv.source_path.to_string_lossy().into_owned();
            let metadata_json = conv.metadata_json.to_string();
            conn.execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host
                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
                fparams![
                    conversation_id,
                    1_i64,
                    workspace_id,
                    conv.source_id.as_str(),
                    conv.external_id.as_deref(),
                    conv.title.as_deref(),
                    source_path.as_str(),
                    conv.started_at,
                    conv.ended_at,
                    conv.approx_tokens,
                    metadata_json.as_str(),
                    conv.origin_host.as_deref()
                ],
            )
            .unwrap();

            // Messages are written in the order they appear on the conversation.
            for msg in &conv.messages {
                let extra_json = msg.extra_json.to_string();
                let role = role_str(&msg.role);
                conn.execute_compat(
                    "INSERT INTO messages(
                        id, conversation_id, idx, role, author, created_at, content, extra_json
                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
                    fparams![
                        next_message_id,
                        conversation_id,
                        msg.idx,
                        role.as_str(),
                        msg.author.as_deref(),
                        msg.created_at,
                        msg.content.as_str(),
                        extra_json.as_str()
                    ],
                )
                .unwrap();
                next_message_id += 1;
            }
        }
    }
15465
15466    // =========================================================================
15467    // User data file protection tests (bead yln.4)
15468    // =========================================================================
15469
15470    #[test]
15471    fn is_user_data_file_detects_bookmarks() {
15472        assert!(is_user_data_file(Path::new("/data/bookmarks.db")));
15473        assert!(is_user_data_file(Path::new("bookmarks.db")));
15474    }
15475
15476    #[test]
15477    fn is_user_data_file_detects_tui_state() {
15478        assert!(is_user_data_file(Path::new("/data/tui_state.json")));
15479    }
15480
15481    #[test]
15482    fn is_user_data_file_detects_sources_toml() {
15483        assert!(is_user_data_file(Path::new("/config/sources.toml")));
15484    }
15485
15486    #[test]
15487    fn is_user_data_file_detects_env() {
15488        assert!(is_user_data_file(Path::new(".env")));
15489    }
15490
15491    #[test]
15492    fn is_user_data_file_rejects_other_files() {
15493        assert!(!is_user_data_file(Path::new("index.db")));
15494        assert!(!is_user_data_file(Path::new("conversations.db")));
15495        assert!(!is_user_data_file(Path::new("random.txt")));
15496    }
15497
15498    // =========================================================================
15499    // Backup creation tests (bead yln.4)
15500    // =========================================================================
15501
15502    #[test]
15503    fn create_backup_returns_none_for_nonexistent() {
15504        let dir = TempDir::new().unwrap();
15505        let db_path = dir.path().join("nonexistent.db");
15506        let result = create_backup(&db_path).unwrap();
15507        assert!(result.is_none());
15508    }
15509
15510    #[test]
15511    fn create_backup_creates_named_file() {
15512        let dir = TempDir::new().unwrap();
15513        let db_path = dir.path().join("test.db");
15514        std::fs::write(&db_path, b"test data").unwrap();
15515
15516        let backup_path = create_backup(&db_path).unwrap();
15517        assert!(backup_path.is_some());
15518        let backup = backup_path.unwrap();
15519        assert!(backup.exists());
15520        assert!(
15521            backup
15522                .file_name()
15523                .unwrap()
15524                .to_str()
15525                .unwrap()
15526                .contains("backup")
15527        );
15528    }
15529
15530    #[test]
15531    fn create_backup_paths_are_unique() {
15532        let dir = TempDir::new().unwrap();
15533        let db_path = dir.path().join("test.db");
15534        std::fs::write(&db_path, b"test data").unwrap();
15535
15536        let first = create_backup(&db_path).unwrap().unwrap();
15537        let second = create_backup(&db_path).unwrap().unwrap();
15538
15539        assert_ne!(first, second);
15540        assert!(first.exists());
15541        assert!(second.exists());
15542    }
15543
    /// Inserts one two-message conversation, then inspects the `EXPLAIN`
    /// output of the per-conversation message fetch: the listing must contain
    /// a `SeekGE` entry and must not contain `SorterOpen`.
    #[test]
    fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
        use std::path::PathBuf;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        let storage = SqliteStorage::open(&db_path).unwrap();

        let agent = Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent).unwrap();
        // Fixture conversation with two messages at idx 0 and idx 1.
        let conversation = Conversation {
            id: None,
            agent_slug: "claude_code".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("conv-1".into()),
            title: Some("Lexical rebuild".into()),
            source_path: PathBuf::from("/tmp/conv-1.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: None,
            metadata_json: serde_json::Value::Null,
            messages: vec![
                Message {
                    id: None,
                    idx: 0,
                    role: MessageRole::User,
                    author: Some("user".into()),
                    created_at: Some(1_700_000_000_010),
                    content: "first".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: Vec::new(),
                },
                Message {
                    id: None,
                    idx: 1,
                    role: MessageRole::Agent,
                    author: Some("assistant".into()),
                    created_at: Some(1_700_000_000_020),
                    content: "second".into(),
                    extra_json: serde_json::Value::Null,
                    snippets: Vec::new(),
                },
            ],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
        // Look the row id back up by the external id used in the fixture.
        let conversation_id = storage
            .conn
            .query_row_map(
                "SELECT id FROM conversations WHERE external_id = ?1",
                fparams!["conv-1"],
                |row| row.get_typed::<i64>(0),
            )
            .unwrap();

        // Column 1 of each EXPLAIN row is collected as the opcode name.
        let opcodes: Vec<String> = storage
            .conn
            .query_map_collect(
                "EXPLAIN \
                 SELECT id, idx, role, author, created_at, content \
                 FROM messages \
                 WHERE conversation_id = ?1 ORDER BY idx",
                fparams![conversation_id],
                |row| row.get_typed(1),
            )
            .unwrap();

        assert!(
            opcodes.iter().any(|opcode| opcode == "SeekGE"),
            "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
        );
        assert!(
            !opcodes.iter().any(|opcode| opcode == "SorterOpen"),
            "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
        );
    }
15630
15631    #[test]
15632    fn schema_check_rebuild_classification_ignores_transient_errors() {
15633        assert!(!schema_check_error_requires_rebuild(
15634            &frankensqlite::FrankenError::Busy
15635        ));
15636        assert!(!schema_check_error_requires_rebuild(
15637            &frankensqlite::FrankenError::DatabaseLocked {
15638                path: PathBuf::from("/tmp/test.db"),
15639            }
15640        ));
15641        assert!(!schema_check_error_requires_rebuild(
15642            &frankensqlite::FrankenError::CannotOpen {
15643                path: PathBuf::from("/tmp/test.db"),
15644            }
15645        ));
15646        assert!(!schema_check_error_requires_rebuild(
15647            &frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"))
15648        ));
15649    }
15650
15651    #[test]
15652    fn schema_check_rebuild_classification_keeps_corruption_errors() {
15653        assert!(schema_check_error_requires_rebuild(
15654            &frankensqlite::FrankenError::DatabaseCorrupt {
15655                detail: "bad header".to_string(),
15656            }
15657        ));
15658        assert!(schema_check_error_requires_rebuild(
15659            &frankensqlite::FrankenError::WalCorrupt {
15660                detail: "bad wal".to_string(),
15661            }
15662        ));
15663        assert!(schema_check_error_requires_rebuild(
15664            &frankensqlite::FrankenError::NotADatabase {
15665                path: PathBuf::from("/tmp/test.db"),
15666            }
15667        ));
15668        assert!(schema_check_error_requires_rebuild(
15669            &frankensqlite::FrankenError::ShortRead {
15670                expected: 4096,
15671                actual: 64,
15672            }
15673        ));
15674    }
15675
15676    #[test]
15677    fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
15678        let retryable_errors = [
15679            frankensqlite::FrankenError::Busy,
15680            frankensqlite::FrankenError::BusyRecovery,
15681            frankensqlite::FrankenError::BusySnapshot {
15682                conflicting_pages: "1,2".to_string(),
15683            },
15684            frankensqlite::FrankenError::DatabaseLocked {
15685                path: PathBuf::from("/tmp/test.db"),
15686            },
15687            frankensqlite::FrankenError::LockFailed {
15688                detail: "fcntl lock still held".to_string(),
15689            },
15690            frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
15691            frankensqlite::FrankenError::SerializationFailure { page: 11 },
15692            frankensqlite::FrankenError::Internal("database is locked".to_string()),
15693        ];
15694
15695        for err in retryable_errors {
15696            assert!(
15697                backup_vacuum_error_requires_consistent_retry(&err),
15698                "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
15699            );
15700        }
15701
15702        assert!(!backup_vacuum_error_requires_consistent_retry(
15703            &frankensqlite::FrankenError::NotADatabase {
15704                path: PathBuf::from("/tmp/test.db")
15705            }
15706        ));
15707        assert!(!backup_vacuum_error_requires_consistent_retry(
15708            &frankensqlite::FrankenError::DatabaseCorrupt {
15709                detail: "bad header".to_string()
15710            }
15711        ));
15712    }
15713
15714    #[test]
15715    fn create_backup_uses_hidden_vacuum_stage_path() {
15716        let backup_path = PathBuf::from("/tmp/test.db.backup.123.456.0");
15717        let stage_path = vacuum_stage_backup_path(&backup_path);
15718        let stage_name = stage_path
15719            .file_name()
15720            .and_then(|name| name.to_str())
15721            .unwrap_or_default();
15722
15723        assert!(stage_name.starts_with('.'));
15724        assert!(stage_name.ends_with(".vacuum-in-progress"));
15725        assert!(
15726            !is_backup_root_name(stage_name, "test.db.backup."),
15727            "incomplete VACUUM output must not be discoverable as a backup root"
15728        );
15729    }
15730
15731    #[test]
15732    fn create_backup_preserves_content() {
15733        let dir = TempDir::new().unwrap();
15734        let db_path = dir.path().join("test.db");
15735        let original_content = b"test database content 12345";
15736        std::fs::write(&db_path, original_content).unwrap();
15737
15738        let backup_path = create_backup(&db_path).unwrap().unwrap();
15739        let backup_content = std::fs::read(&backup_path).unwrap();
15740        assert_eq!(backup_content, original_content);
15741    }
15742
15743    #[test]
15744    fn create_backup_copies_sidecars_when_present() {
15745        let dir = TempDir::new().unwrap();
15746        let db_path = dir.path().join("test.db");
15747        std::fs::write(&db_path, b"db").unwrap();
15748        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15749        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15750
15751        let backup_path = create_backup(&db_path).unwrap().unwrap();
15752
15753        assert_eq!(
15754            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15755            b"wal"
15756        );
15757        assert_eq!(
15758            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15759            b"shm"
15760        );
15761    }
15762
15763    #[test]
15764    #[cfg(unix)]
15765    fn create_backup_rejects_symlink_root_during_raw_fallback() {
15766        use std::os::unix::fs::symlink;
15767
15768        let dir = TempDir::new().unwrap();
15769        let outside_db = dir.path().join("outside.db");
15770        let db_path = dir.path().join("test.db");
15771        std::fs::write(&outside_db, b"not sqlite").unwrap();
15772        symlink(&outside_db, &db_path).unwrap();
15773
15774        let err = create_backup(&db_path).unwrap_err();
15775
15776        assert!(
15777            err.to_string().contains("bundle symlink"),
15778            "unexpected error: {err:#}"
15779        );
15780        assert_eq!(std::fs::read(&outside_db).unwrap(), b"not sqlite");
15781        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15782            .unwrap()
15783            .filter_map(|entry| entry.ok())
15784            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15785            .filter(|name| name.starts_with("test.db.backup."))
15786            .collect();
15787        assert!(
15788            backup_roots.is_empty(),
15789            "symlinked backup source must not publish backup roots: {backup_roots:?}"
15790        );
15791    }
15792
15793    #[test]
15794    #[cfg(unix)]
15795    fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
15796        use std::os::unix::fs::symlink;
15797
15798        let dir = TempDir::new().unwrap();
15799        let db_path = dir.path().join("test.db");
15800        let outside_wal = dir.path().join("outside.wal");
15801        let wal_path = database_sidecar_path(&db_path, "-wal");
15802        std::fs::write(&db_path, b"not sqlite").unwrap();
15803        std::fs::write(&outside_wal, b"outside wal").unwrap();
15804        symlink(&outside_wal, &wal_path).unwrap();
15805
15806        let err = create_backup(&db_path).unwrap_err();
15807
15808        assert!(
15809            err.to_string().contains("bundle symlink"),
15810            "unexpected error: {err:#}"
15811        );
15812        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15813        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15814            .unwrap()
15815            .filter_map(|entry| entry.ok())
15816            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15817            .filter(|name| name.starts_with("test.db.backup."))
15818            .collect();
15819        assert!(
15820            backup_roots.is_empty(),
15821            "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
15822        );
15823    }
15824
15825    // =========================================================================
15826    // Backup cleanup tests (bead yln.4)
15827    // =========================================================================
15828
15829    #[test]
15830    fn cleanup_old_backups_keeps_recent() {
15831        let dir = TempDir::new().unwrap();
15832        let db_path = dir.path().join("test.db");
15833
15834        // Create 5 backup files with different timestamps
15835        for i in 0..5 {
15836            let backup_name = format!("test.db.backup.{}", 1000 + i);
15837            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15838        }
15839
15840        cleanup_old_backups(&db_path, 3).unwrap();
15841
15842        // Count remaining backup files
15843        let backups: Vec<_> = std::fs::read_dir(dir.path())
15844            .unwrap()
15845            .filter_map(|e| e.ok())
15846            .filter(|e| e.file_name().to_str().unwrap_or("").contains("backup"))
15847            .collect();
15848
15849        assert_eq!(backups.len(), 3);
15850    }
15851
15852    #[test]
15853    fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
15854        let dir = TempDir::new().unwrap();
15855        let db_path = dir.path().join("test.db");
15856
15857        for i in 0..3 {
15858            let backup_name = format!("test.db.backup.{}", 1000 + i);
15859            let backup_path = dir.path().join(&backup_name);
15860            std::fs::write(&backup_path, format!("backup {i}")).unwrap();
15861            std::fs::write(format!("{}-wal", backup_path.display()), b"wal").unwrap();
15862            std::fs::write(format!("{}-shm", backup_path.display()), b"shm").unwrap();
15863            std::thread::sleep(std::time::Duration::from_millis(20));
15864        }
15865
15866        cleanup_old_backups(&db_path, 2).unwrap();
15867
15868        let mut roots = Vec::new();
15869        let mut wals = Vec::new();
15870        let mut shms = Vec::new();
15871        for entry in std::fs::read_dir(dir.path())
15872            .unwrap()
15873            .filter_map(|e| e.ok())
15874        {
15875            let name = entry.file_name().to_string_lossy().into_owned();
15876            if name.ends_with("-wal") {
15877                wals.push(name);
15878            } else if name.ends_with("-shm") {
15879                shms.push(name);
15880            } else if name.contains("backup") {
15881                roots.push(name);
15882            }
15883        }
15884
15885        assert_eq!(roots.len(), 2, "should keep two backup roots");
15886        assert_eq!(
15887            wals.len(),
15888            2,
15889            "should keep WAL sidecars only for retained backups"
15890        );
15891        assert_eq!(
15892            shms.len(),
15893            2,
15894            "should keep SHM sidecars only for retained backups"
15895        );
15896    }
15897
15898    #[test]
15899    fn move_database_bundle_moves_database_and_sidecars() {
15900        let dir = TempDir::new().unwrap();
15901        let db_path = dir.path().join("test.db");
15902        let backup_path = dir.path().join("test.db.corrupt");
15903
15904        std::fs::write(&db_path, b"db").unwrap();
15905        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15906        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15907
15908        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15909        assert_eq!(
15910            moved,
15911            DatabaseBundleMoveResult {
15912                database: true,
15913                wal: true,
15914                shm: true
15915            }
15916        );
15917        assert!(moved.moved_any());
15918
15919        assert!(!db_path.exists());
15920        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15921        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15922
15923        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15924        assert_eq!(
15925            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15926            b"wal"
15927        );
15928        assert_eq!(
15929            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15930            b"shm"
15931        );
15932    }
15933
15934    #[test]
15935    fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
15936        let dir = TempDir::new().unwrap();
15937        let db_path = dir.path().join("test.db");
15938        let backup_path = dir.path().join("test.db.corrupt");
15939
15940        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15941        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15942
15943        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15944        assert_eq!(
15945            moved,
15946            DatabaseBundleMoveResult {
15947                database: false,
15948                wal: true,
15949                shm: true
15950            }
15951        );
15952        assert!(moved.moved_any());
15953        assert!(!db_path.exists());
15954        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15955        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15956        assert_eq!(
15957            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15958            b"wal"
15959        );
15960        assert_eq!(
15961            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15962            b"shm"
15963        );
15964    }
15965
15966    #[test]
15967    #[cfg(unix)]
15968    fn move_database_bundle_moves_dangling_symlink_database_root() {
15969        use std::os::unix::fs::symlink;
15970
15971        let dir = TempDir::new().unwrap();
15972        let db_path = dir.path().join("test.db");
15973        let backup_path = dir.path().join("test.db.corrupt");
15974        let missing_target = dir.path().join("missing-target.db");
15975
15976        symlink(&missing_target, &db_path).unwrap();
15977
15978        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15979
15980        assert_eq!(
15981            moved,
15982            DatabaseBundleMoveResult {
15983                database: true,
15984                wal: false,
15985                shm: false
15986            }
15987        );
15988        assert!(std::fs::symlink_metadata(&db_path).is_err());
15989        assert!(
15990            std::fs::symlink_metadata(&backup_path)
15991                .unwrap()
15992                .file_type()
15993                .is_symlink()
15994        );
15995        assert!(!missing_target.exists());
15996    }
15997
15998    #[test]
15999    #[cfg(unix)]
16000    fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
16001        use std::os::unix::fs::symlink;
16002
16003        let dir = TempDir::new().unwrap();
16004        let db_path = dir.path().join("test.db");
16005        let backup_path = dir.path().join("test.db.corrupt");
16006        let missing_wal_target = dir.path().join("missing-wal");
16007        let missing_shm_target = dir.path().join("missing-shm");
16008        let wal_path = database_sidecar_path(&db_path, "-wal");
16009        let shm_path = database_sidecar_path(&db_path, "-shm");
16010
16011        symlink(&missing_wal_target, &wal_path).unwrap();
16012        symlink(&missing_shm_target, &shm_path).unwrap();
16013
16014        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16015
16016        assert_eq!(
16017            moved,
16018            DatabaseBundleMoveResult {
16019                database: false,
16020                wal: true,
16021                shm: true
16022            }
16023        );
16024        assert!(std::fs::symlink_metadata(&wal_path).is_err());
16025        assert!(std::fs::symlink_metadata(&shm_path).is_err());
16026        assert!(
16027            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-wal"))
16028                .unwrap()
16029                .file_type()
16030                .is_symlink()
16031        );
16032        assert!(
16033            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-shm"))
16034                .unwrap()
16035                .file_type()
16036                .is_symlink()
16037        );
16038        assert!(!missing_wal_target.exists());
16039        assert!(!missing_shm_target.exists());
16040    }
16041
16042    #[test]
16043    fn copy_database_bundle_copies_database_and_sidecars() {
16044        let dir = TempDir::new().unwrap();
16045        let db_path = dir.path().join("test.db");
16046        let copied_path = dir.path().join("copy.db");
16047
16048        std::fs::write(&db_path, b"db").unwrap();
16049        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16050        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16051
16052        copy_database_bundle(&db_path, &copied_path).unwrap();
16053
16054        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
16055        assert_eq!(
16056            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
16057            b"wal"
16058        );
16059        assert_eq!(
16060            std::fs::read(database_sidecar_path(&copied_path, "-shm")).unwrap(),
16061            b"shm"
16062        );
16063        assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
16064    }
16065
16066    #[test]
16067    fn copy_database_bundle_creates_destination_parent() {
16068        let dir = TempDir::new().unwrap();
16069        let db_path = dir.path().join("test.db");
16070        let copied_path = dir.path().join("nested/copies/copy.db");
16071
16072        std::fs::write(&db_path, b"db").unwrap();
16073        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16074
16075        copy_database_bundle(&db_path, &copied_path).unwrap();
16076
16077        assert!(copied_path.parent().unwrap().is_dir());
16078        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
16079        assert_eq!(
16080            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
16081            b"wal"
16082        );
16083    }
16084
16085    #[test]
16086    #[cfg(unix)]
16087    fn copy_database_bundle_rejects_symlink_source_root() {
16088        use std::os::unix::fs::symlink;
16089
16090        let dir = TempDir::new().unwrap();
16091        let outside_db = dir.path().join("outside.db");
16092        let db_path = dir.path().join("test.db");
16093        let copied_path = dir.path().join("copy.db");
16094
16095        std::fs::write(&outside_db, b"outside").unwrap();
16096        symlink(&outside_db, &db_path).unwrap();
16097
16098        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
16099
16100        assert!(
16101            err.to_string().contains("bundle symlink"),
16102            "unexpected error: {err:#}"
16103        );
16104        assert!(!copied_path.exists());
16105        assert_eq!(std::fs::read(&outside_db).unwrap(), b"outside");
16106    }
16107
16108    #[test]
16109    #[cfg(unix)]
16110    fn copy_database_bundle_rejects_symlink_sidecar() {
16111        use std::os::unix::fs::symlink;
16112
16113        let dir = TempDir::new().unwrap();
16114        let db_path = dir.path().join("test.db");
16115        let copied_path = dir.path().join("copy.db");
16116        let outside_wal = dir.path().join("outside.wal");
16117        let wal_path = database_sidecar_path(&db_path, "-wal");
16118
16119        std::fs::write(&db_path, b"db").unwrap();
16120        std::fs::write(&outside_wal, b"outside wal").unwrap();
16121        symlink(&outside_wal, &wal_path).unwrap();
16122
16123        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
16124
16125        assert!(
16126            err.to_string().contains("bundle symlink"),
16127            "unexpected error: {err:#}"
16128        );
16129        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
16130        assert!(!copied_path.exists());
16131        assert!(!database_sidecar_path(&copied_path, "-wal").exists());
16132    }
16133
16134    #[test]
16135    fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
16136        let dir = TempDir::new().unwrap();
16137        let db_path = dir.path().join("test.db");
16138        let backup_path = dir.path().join("nested/backups/test.db.corrupt");
16139
16140        std::fs::write(&db_path, b"db").unwrap();
16141        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16142        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16143
16144        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
16145        assert_eq!(
16146            moved,
16147            DatabaseBundleMoveResult {
16148                database: true,
16149                wal: true,
16150                shm: true
16151            }
16152        );
16153        assert!(backup_path.parent().unwrap().is_dir());
16154        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
16155        assert_eq!(
16156            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
16157            b"wal"
16158        );
16159        assert_eq!(
16160            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
16161            b"shm"
16162        );
16163    }
16164
16165    #[test]
16166    fn remove_database_files_removes_orphan_sidecars_without_main_db() {
16167        let dir = TempDir::new().unwrap();
16168        let db_path = dir.path().join("test.db");
16169
16170        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
16171        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
16172
16173        remove_database_files(&db_path).unwrap();
16174
16175        assert!(!db_path.exists());
16176        assert!(!database_sidecar_path(&db_path, "-wal").exists());
16177        assert!(!database_sidecar_path(&db_path, "-shm").exists());
16178    }
16179
16180    #[test]
16181    fn cleanup_old_backups_ignores_backup_named_directories() {
16182        let dir = TempDir::new().unwrap();
16183        let db_path = dir.path().join("test.db");
16184
16185        for i in 0..3 {
16186            let backup_name = format!("test.db.backup.{}", 1000 + i);
16187            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
16188        }
16189        std::fs::create_dir(dir.path().join("test.db.backup.directory")).unwrap();
16190
16191        cleanup_old_backups(&db_path, 2).unwrap();
16192
16193        let mut backup_files = Vec::new();
16194        let mut backup_dirs = Vec::new();
16195        for entry in std::fs::read_dir(dir.path())
16196            .unwrap()
16197            .filter_map(|e| e.ok())
16198        {
16199            let name = entry.file_name().to_string_lossy().into_owned();
16200            if !name.starts_with("test.db.backup.") {
16201                continue;
16202            }
16203            if entry.path().is_dir() {
16204                backup_dirs.push(name);
16205            } else {
16206                backup_files.push(name);
16207            }
16208        }
16209
16210        assert_eq!(
16211            backup_files.len(),
16212            2,
16213            "only real backup files count toward retention"
16214        );
16215        assert_eq!(
16216            backup_dirs.len(),
16217            1,
16218            "backup-named directories should be ignored"
16219        );
16220    }
16221
16222    // =========================================================================
16223    // Storage open/create tests (bead yln.4)
16224    // =========================================================================
16225
16226    #[test]
16227    fn open_creates_new_database() {
16228        let dir = TempDir::new().unwrap();
16229        let db_path = dir.path().join("new.db");
16230        assert!(!db_path.exists());
16231
16232        let storage = SqliteStorage::open(&db_path).unwrap();
16233        assert!(db_path.exists());
16234        storage.close().unwrap();
16235    }
16236
16237    #[test]
16238    fn open_readonly_fails_for_nonexistent() {
16239        let dir = TempDir::new().unwrap();
16240        let db_path = dir.path().join("nonexistent.db");
16241        let result = SqliteStorage::open_readonly(&db_path);
16242        assert!(result.is_err());
16243    }
16244
16245    #[test]
16246    fn open_readonly_succeeds_for_existing() {
16247        let dir = TempDir::new().unwrap();
16248        let db_path = dir.path().join("existing.db");
16249
16250        // Create first
16251        let _storage = SqliteStorage::open(&db_path).unwrap();
16252        drop(_storage);
16253
16254        // Now open readonly
16255        let storage = SqliteStorage::open_readonly(&db_path).unwrap();
16256        assert!(storage.schema_version().is_ok());
16257    }
16258
16259    #[test]
16260    fn reopen_existing_current_schema_is_idempotent() {
16261        let dir = TempDir::new().unwrap();
16262        let db_path = dir.path().join("existing.db");
16263
16264        // First open creates and migrates to current schema.
16265        {
16266            let storage = SqliteStorage::open(&db_path).unwrap();
16267            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
16268        }
16269
16270        // Re-open should not fail on current schema.
16271        let reopened = SqliteStorage::open(&db_path).unwrap();
16272        assert_eq!(
16273            reopened.schema_version().unwrap(),
16274            CURRENT_SCHEMA_VERSION,
16275            "reopening current schema DB should be idempotent"
16276        );
16277    }
16278
16279    #[test]
16280    fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
16281        let dir = TempDir::new().unwrap();
16282        let db_path = dir.path().join("existing.db");
16283
16284        // Create DB at current schema.
16285        {
16286            let storage = SqliteStorage::open(&db_path).unwrap();
16287            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
16288        }
16289
16290        // Should open normally, not require rebuild.
16291        let reopened = SqliteStorage::open_or_rebuild(&db_path)
16292            .expect("current schema DB should open without rebuild");
16293        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
16294    }
16295
16296    #[test]
16297    fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
16298        let dir = TempDir::new().unwrap();
16299        let db_path = dir.path().join("db_dir");
16300        std::fs::create_dir(&db_path).unwrap();
16301
16302        let result = SqliteStorage::open_or_rebuild(&db_path);
16303
16304        assert!(
16305            matches!(
16306                result,
16307                Err(MigrationError::Database(_)) | Err(MigrationError::Io(_))
16308            ),
16309            "non-database path should report the underlying open error without rebuild"
16310        );
16311
16312        assert!(
16313            db_path.is_dir(),
16314            "non-database directory must be left in place"
16315        );
16316    }
16317
16318    // =========================================================================
16319    // Schema version tests (bead yln.4)
16320    // =========================================================================
16321
16322    #[test]
16323    fn schema_version_returns_current() {
16324        let dir = TempDir::new().unwrap();
16325        let db_path = dir.path().join("test.db");
16326        let storage = SqliteStorage::open(&db_path).unwrap();
16327        let version = storage.schema_version().unwrap();
16328        assert!(version >= 5, "Schema version should be at least 5");
16329    }
16330
16331    // =========================================================================
16332    // Current analytics/schema smoke test (bead z9fse.11)
16333    // =========================================================================
16334
16335    #[test]
16336    fn migration_v13_creates_analytics_tables() {
16337        let dir = TempDir::new().unwrap();
16338        let db_path = dir.path().join("test.db");
16339        let storage = SqliteStorage::open(&db_path).unwrap();
16340
16341        // Schema version should be current.
16342        let version = storage.schema_version().unwrap();
16343        assert_eq!(
16344            version, CURRENT_SCHEMA_VERSION,
16345            "Schema version must match CURRENT_SCHEMA_VERSION after migration"
16346        );
16347
16348        let conn = storage.raw();
16349
16350        // Helper: collect column names from PRAGMA table_info
16351        fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
16352            conn.query_map_collect(
16353                &format!("PRAGMA table_info({})", table),
16354                fparams![],
16355                |row: &FrankenRow| row.get_typed(1),
16356            )
16357            .unwrap()
16358        }
16359
16360        // Helper: collect index names from PRAGMA index_list
16361        fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
16362            conn.query_map_collect(
16363                &format!("PRAGMA index_list({})", table),
16364                fparams![],
16365                |row: &FrankenRow| row.get_typed(1),
16366            )
16367            .unwrap()
16368        }
16369
16370        // Verify message_metrics table exists with expected columns
16371        let mm_cols = col_names(conn, "message_metrics");
16372        for expected in &[
16373            "message_id",
16374            "hour_id",
16375            "day_id",
16376            "content_tokens_est",
16377            "model_name",
16378            "model_family",
16379            "model_tier",
16380            "provider",
16381            "api_input_tokens",
16382            "has_plan",
16383            "agent_slug",
16384            "role",
16385            "api_data_source",
16386        ] {
16387            assert!(
16388                mm_cols.contains(&expected.to_string()),
16389                "message_metrics missing column: {expected}"
16390            );
16391        }
16392
16393        // Verify usage_hourly table
16394        let uh_cols = col_names(conn, "usage_hourly");
16395        for expected in &[
16396            "hour_id",
16397            "plan_message_count",
16398            "plan_content_tokens_est_total",
16399            "plan_api_tokens_total",
16400            "api_coverage_message_count",
16401            "content_tokens_est_user",
16402            "api_thinking_tokens_total",
16403        ] {
16404            assert!(
16405                uh_cols.contains(&expected.to_string()),
16406                "usage_hourly missing column: {expected}"
16407            );
16408        }
16409
16410        // Verify usage_daily table
16411        let ud_cols = col_names(conn, "usage_daily");
16412        for expected in &[
16413            "day_id",
16414            "plan_content_tokens_est_total",
16415            "plan_api_tokens_total",
16416            "api_thinking_tokens_total",
16417            "content_tokens_est_assistant",
16418            "message_count",
16419        ] {
16420            assert!(
16421                ud_cols.contains(&expected.to_string()),
16422                "usage_daily missing column: {expected}"
16423            );
16424        }
16425
16426        // Verify usage_models_daily table
16427        let umd_cols = col_names(conn, "usage_models_daily");
16428        for expected in &[
16429            "day_id",
16430            "model_family",
16431            "model_tier",
16432            "message_count",
16433            "api_tokens_total",
16434            "api_coverage_message_count",
16435        ] {
16436            assert!(
16437                umd_cols.contains(&expected.to_string()),
16438                "usage_models_daily missing column: {expected}"
16439            );
16440        }
16441
16442        // Verify indexes on message_metrics
16443        let mm_idxs = idx_names(conn, "message_metrics");
16444        assert!(
16445            mm_idxs.iter().any(|n| n.contains("idx_mm_hour")),
16446            "message_metrics must have hour index"
16447        );
16448        assert!(
16449            mm_idxs.iter().any(|n| n.contains("idx_mm_agent_day")),
16450            "message_metrics must have agent+day index"
16451        );
16452        assert!(
16453            mm_idxs
16454                .iter()
16455                .any(|n| n.contains("idx_mm_model_family_day")),
16456            "message_metrics must have model_family+day index"
16457        );
16458
16459        // Verify indexes on usage_hourly
16460        let uh_idxs = idx_names(conn, "usage_hourly");
16461        assert!(
16462            uh_idxs.iter().any(|n| n.contains("idx_uh_agent")),
16463            "usage_hourly must have agent index"
16464        );
16465
16466        // Verify indexes on usage_daily
16467        let ud_idxs = idx_names(conn, "usage_daily");
16468        assert!(
16469            ud_idxs.iter().any(|n| n.contains("idx_ud_agent")),
16470            "usage_daily must have agent index"
16471        );
16472
16473        // Verify indexes on usage_models_daily
16474        let umd_idxs = idx_names(conn, "usage_models_daily");
16475        assert!(
16476            umd_idxs.iter().any(|n| n.contains("idx_umd_model_day")),
16477            "usage_models_daily must have model+day index"
16478        );
16479
16480        let conversation_cols = col_names(conn, "conversations");
16481        assert!(
16482            conversation_cols.contains(&"last_message_idx".to_string())
16483                && conversation_cols.contains(&"last_message_created_at".to_string()),
16484            "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
16485        );
16486        let fts_schema_rows: i64 = conn
16487            .query_row_map(
16488                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
16489                fparams![],
16490                |row: &FrankenRow| row.get_typed(0),
16491            )
16492            .unwrap();
16493        assert_eq!(
16494            fts_schema_rows, 0,
16495            "fresh schema should not create and immediately drop derived fts_messages"
16496        );
16497        let integrity: Vec<String> = conn
16498            .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
16499                row.get_typed(0)
16500            })
16501            .unwrap();
16502        assert_eq!(
16503            integrity,
16504            vec!["ok".to_string()],
16505            "fresh schema must pass SQLite integrity_check"
16506        );
16507    }
16508
16509    #[test]
16510    fn hour_id_round_trip() {
16511        // 2026-02-06 12:00:00 UTC
16512        let ts_ms = 1_770_508_800_000_i64;
16513        let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
16514        let day_id = SqliteStorage::day_id_from_millis(ts_ms);
16515
16516        // hour_id should be 24x day_id (approximately)
16517        assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");
16518
16519        // Round-trip: millis_from_hour_id should give start of that hour
16520        let back = SqliteStorage::millis_from_hour_id(hour_id);
16521        assert!(
16522            back <= ts_ms && ts_ms - back < 3_600_000,
16523            "Round-trip should land within the same hour"
16524        );
16525    }
16526
16527    #[test]
16528    fn day_and_hour_ids_floor_negative_millis() {
16529        // One millisecond before the Unix epoch should still floor into the
16530        // previous second/hour/day rather than truncating toward zero.
16531        let ts_ms = -1_i64;
16532        let expected_secs = -1_i64;
16533        let epoch_2020_secs = 1_577_836_800_i64;
16534
16535        assert_eq!(
16536            SqliteStorage::day_id_from_millis(ts_ms),
16537            (expected_secs - epoch_2020_secs).div_euclid(86_400)
16538        );
16539        assert_eq!(
16540            SqliteStorage::hour_id_from_millis(ts_ms),
16541            (expected_secs - epoch_2020_secs).div_euclid(3_600)
16542        );
16543    }
16544
    /// Builds a database by hand that records `schema_version = '10'`, then
    /// opens it through `SqliteStorage::open` and checks that the reported
    /// schema version is `CURRENT_SCHEMA_VERSION` and that the four analytics
    /// tables exist.
    #[test]
    fn migration_v13_from_v10() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("test.db");

        // Open at v10 first by faking it
        {
            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
            conn.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
            conn.execute_batch(
                "CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);",
            )
            .unwrap();
            conn.execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
                .unwrap();
            // Apply V1-V10 so schema is correct. Keep each historical DDL batch
            // in autocommit mode; the fixture is testing cass migration
            // transition behavior, not frankensqlite's handling of a giant
            // synthetic legacy-DDL transaction.
            // NOTE(review): MIGRATION_V3 is not applied in the sequence below
            // even though the comment above says V1-V10 — confirm that the
            // omission is intentional.
            conn.execute_batch(MIGRATION_V1).unwrap();
            conn.execute_batch(MIGRATION_V2).unwrap();
            conn.execute_batch(MIGRATION_V4).unwrap();
            conn.execute_batch(MIGRATION_V5).unwrap();
            conn.execute_batch(MIGRATION_V6).unwrap();
            conn.execute_batch(MIGRATION_V7).unwrap();
            conn.execute_batch(MIGRATION_V8).unwrap();
            conn.execute_batch(MIGRATION_V9).unwrap();
            conn.execute_batch(MIGRATION_V10).unwrap();
            // Write the version marker again after the DDL batches have run.
            // NOTE(review): presumably guards against a batch touching `meta`;
            // verify against the migration scripts.
            conn.execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
                .unwrap();
        }
        // NOTE(review): helper is defined outside this block; by its name it
        // creates the FTS schema through rusqlite — confirm what it sets up.
        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();

        // Now open with SqliteStorage — should auto-migrate to current schema
        let storage = SqliteStorage::open(&db_path).unwrap();
        let version = storage.schema_version().unwrap();
        assert_eq!(
            version, CURRENT_SCHEMA_VERSION,
            "Should have migrated from v10 to the current schema"
        );

        // Verify new tables exist
        let count: i64 = storage
            .raw()
            .query_row_map(
                "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
                &[],
                |row: &FrankenRow| row.get_typed::<i64>(0),
            )
            .unwrap();
        assert_eq!(count, 4, "All 4 analytics tables should exist");
    }
16597
16598    // =========================================================================
16599    // Analytics ingest integration test (bead z9fse.2)
16600    // =========================================================================
16601
16602    #[test]
16603    fn analytics_ingest_populates_metrics_and_rollups() {
16604        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
16605        use std::path::PathBuf;
16606
16607        let dir = TempDir::new().unwrap();
16608        let db_path = dir.path().join("test.db");
16609        let storage = SqliteStorage::open(&db_path).unwrap();
16610
16611        // Register agent + workspace
16612        let agent = Agent {
16613            id: None,
16614            slug: "claude_code".into(),
16615            name: "Claude Code".into(),
16616            version: Some("1.0".into()),
16617            kind: AgentKind::Cli,
16618        };
16619        let agent_id = storage.ensure_agent(&agent).unwrap();
16620
        // Create a synthetic conversation with 3 messages at a known timestamp
        // 2026-02-08 11:50:00 UTC → day_id = 2230, hour_id = 53531
16623        let ts_ms = 1_770_551_400_000_i64;
16624        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
16625        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
16626
16627        // Include a JSON usage block on the assistant message (like Claude Code data)
16628        let usage_json = serde_json::json!({
16629            "message": {
16630                "model": "claude-opus-4-6",
16631                "usage": {
16632                    "input_tokens": 100,
16633                    "output_tokens": 50,
16634                    "cache_read_input_tokens": 200,
16635                    "cache_creation_input_tokens": 30,
16636                    "service_tier": "standard"
16637                }
16638            }
16639        });
16640
16641        let conv = Conversation {
16642            id: None,
16643            agent_slug: "claude_code".into(),
16644            workspace: None,
16645            external_id: Some("test-conv-1".into()),
16646            title: Some("Test conversation".into()),
16647            source_path: PathBuf::from("/tmp/test.jsonl"),
16648            started_at: Some(ts_ms),
16649            ended_at: Some(ts_ms + 60_000),
16650            approx_tokens: None,
16651            metadata_json: serde_json::Value::Null,
16652            messages: vec![
16653                Message {
16654                    id: None,
16655                    idx: 0,
16656                    role: MessageRole::User,
16657                    author: None,
16658                    created_at: Some(ts_ms),
16659                    content: "Hello, can you help me with a plan?".into(),
16660                    extra_json: serde_json::Value::Null,
16661                    snippets: vec![],
16662                },
16663                Message {
16664                    id: None,
16665                    idx: 1,
16666                    role: MessageRole::Agent,
16667                    author: None,
16668                    created_at: Some(ts_ms + 30_000),
16669                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
16670                    extra_json: usage_json,
16671                    snippets: vec![],
16672                },
16673                Message {
16674                    id: None,
16675                    idx: 2,
16676                    role: MessageRole::User,
16677                    author: None,
16678                    created_at: Some(ts_ms + 60_000),
16679                    content: "Great, let's proceed!".into(),
16680                    extra_json: serde_json::Value::Null,
16681                    snippets: vec![],
16682                },
16683            ],
16684            source_id: "local".into(),
16685            origin_host: None,
16686        };
16687
16688        let outcomes = storage
16689            .insert_conversations_batched(&[(agent_id, None, &conv)])
16690            .unwrap();
16691        assert_eq!(outcomes.len(), 1);
16692        assert_eq!(outcomes[0].inserted_indices.len(), 3);
16693
16694        let conn = storage.raw();
16695
16696        // Verify message_metrics rows
16697        let mm_count: i64 = conn
16698            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
16699                row.get_typed::<i64>(0)
16700            })
16701            .unwrap();
16702        assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");
16703
16704        // Verify hour_id and day_id are correct
16705        #[allow(clippy::type_complexity)]
16706        let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
16707            .query_map_collect(
16708                "SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
16709                fparams![],
16710                |row: &FrankenRow| {
16711                    Ok((
16712                        row.get_typed(0)?,
16713                        row.get_typed(1)?,
16714                        row.get_typed(2)?,
16715                        row.get_typed(3)?,
16716                        row.get_typed(4)?,
16717                        row.get_typed(5)?,
16718                        row.get_typed(6)?,
16719                        row.get_typed(7)?,
16720                        row.get_typed(8)?,
16721                    ))
16722                },
16723            )
16724            .unwrap();
16725
16726        assert_eq!(rows.len(), 3);
16727        // All messages in the same hour/day
16728        assert_eq!(rows[0].0, expected_hour);
16729        assert_eq!(rows[0].1, expected_day);
16730        // First message is user
16731        assert_eq!(rows[0].2, "user");
16732        // Second message (assistant) should have has_plan=1 (contains "## Plan" + numbered steps)
16733        assert_eq!(
16734            rows[1].4, 1,
16735            "Assistant message with plan should have has_plan=1"
16736        );
16737        // Second message should have api data source
16738        assert_eq!(
16739            rows[1].5, "api",
16740            "Claude Code assistant message should have api data source"
16741        );
16742        // First and third (user) messages should be estimated
16743        assert_eq!(rows[0].5, "estimated");
16744        assert_eq!(rows[2].5, "estimated");
16745        assert_eq!(rows[1].6, "claude");
16746        assert_eq!(rows[1].7, "opus");
16747        assert_eq!(rows[1].8, "anthropic");
16748        assert_eq!(rows[0].6, "unknown");
16749        // content_tokens_est = chars / 4
16750        let user_chars = "Hello, can you help me with a plan?".len() as i64;
16751        assert_eq!(rows[0].3, user_chars / 4);
16752
16753        // Verify usage_hourly rollup
16754        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
16755            i64,
16756            i64,
16757            i64,
16758            i64,
16759            i64,
16760            i64,
16761            i64,
16762        ) = conn
16763            .query_row_map(
16764                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
16765                        plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
16766                 FROM usage_hourly WHERE hour_id = ?",
16767                fparams![expected_hour],
16768                |row: &FrankenRow| {
16769                    Ok((
16770                        row.get_typed(0)?,
16771                        row.get_typed(1)?,
16772                        row.get_typed(2)?,
16773                        row.get_typed(3)?,
16774                        row.get_typed(4)?,
16775                        row.get_typed(5)?,
16776                        row.get_typed(6)?,
16777                    ))
16778                },
16779            )
16780            .unwrap();
16781        assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
16782        assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
16783        assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
16784        assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
16785        assert!(
16786            uh_plan_content > 0,
16787            "Hourly rollup should include plan content tokens"
16788        );
16789        assert!(
16790            uh_plan_api > 0,
16791            "Hourly rollup should include plan API tokens"
16792        );
16793        assert_eq!(
16794            uh_api_cov, 1,
16795            "Hourly rollup should have 1 API-covered message"
16796        );
16797
16798        // Verify usage_daily rollup matches hourly (same day)
16799        let (ud_msg, ud_api_cov): (i64, i64) = conn
16800            .query_row_map(
16801                "SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
16802                fparams![expected_day],
16803                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
16804            )
16805            .unwrap();
16806        assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
16807        assert_eq!(
16808            ud_api_cov, 1,
16809            "Daily api_coverage should be 1 (only assistant msg has real API data)"
16810        );
16811
16812        // Verify the API input tokens from message_metrics (only API-sourced)
16813        let api_only_input: i64 = conn
16814            .query_row_map(
16815                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
16816                fparams![expected_day],
16817                |row: &FrankenRow| row.get_typed::<i64>(0),
16818            )
16819            .unwrap();
16820        assert_eq!(
16821            api_only_input, 100,
16822            "Only API-sourced input tokens should be 100"
16823        );
16824
16825        // Verify rollups match summed message_metrics
16826        let mm_total_content_est: i64 = conn
16827            .query_row_map(
16828                "SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
16829                fparams![expected_day],
16830                |row| row.get_typed::<i64>(0),
16831            )
16832            .unwrap();
16833        let mm_plan_content_est: i64 = conn
16834            .query_row_map(
16835                "SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
16836                fparams![expected_day],
16837                |row: &FrankenRow| row.get_typed::<i64>(0),
16838            )
16839            .unwrap();
16840        let mm_plan_api_total: i64 = conn
16841            .query_row_map(
16842                "SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
16843                 FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
16844                fparams![expected_day],
16845                |row: &FrankenRow| row.get_typed::<i64>(0),
16846            )
16847            .unwrap();
16848        let ud_content_est: i64 = conn
16849            .query_row_map(
16850                "SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
16851                fparams![expected_day],
16852                |row| row.get_typed::<i64>(0),
16853            )
16854            .unwrap();
16855        let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
16856            .query_row_map(
16857                "SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
16858                fparams![expected_day],
16859                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
16860            )
16861            .unwrap();
16862        assert_eq!(
16863            mm_total_content_est, ud_content_est,
16864            "Daily rollup content_tokens_est_total must equal SUM of message_metrics"
16865        );
16866        assert_eq!(
16867            mm_plan_content_est, ud_plan_content_est,
16868            "Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
16869        );
16870        assert_eq!(
16871            mm_plan_api_total, ud_plan_api_total,
16872            "Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
16873        );
16874
16875        // Verify model rollup rows
16876        let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
16877            i64,
16878            i64,
16879            i64,
16880            i64,
16881            i64,
16882        ) = conn
16883            .query_row_map(
16884                "SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
16885                 FROM usage_models_daily
16886                 WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
16887                fparams![expected_day],
16888                |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?, row.get_typed(3)?, row.get_typed(4)?)),
16889            )
16890            .unwrap();
16891        assert_eq!(claude_msg, 1);
16892        assert_eq!(claude_user, 0);
16893        assert_eq!(claude_asst, 1);
16894        assert_eq!(claude_api_total, 380);
16895        assert_eq!(claude_api_cov, 1);
16896
16897        let unknown_msg: i64 = conn
16898            .query_row_map(
16899                "SELECT message_count FROM usage_models_daily
16900                 WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
16901                fparams![expected_day],
16902                |row| row.get_typed(0),
16903            )
16904            .unwrap();
16905        assert_eq!(
16906            unknown_msg, 2,
16907            "user messages should land in unknown model bucket"
16908        );
16909    }
16910
16911    #[test]
16912    fn has_plan_heuristic_detects_plans() {
16913        assert!(has_plan_heuristic(
16914            "## Plan\n\n1. First step\n2. Second step"
16915        ));
16916        assert!(has_plan_heuristic(
16917            "# Plan\nHere is what we will do:\n1. Step one\n2. Step two"
16918        ));
16919        assert!(has_plan_heuristic(
16920            "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests"
16921        ));
16922        assert!(has_plan_heuristic(
16923            "Next steps:\n1. Update schema\n2. Rebuild rollups"
16924        ));
16925        assert!(!has_plan_heuristic("Hello world"));
16926        assert!(!has_plan_heuristic("Short"));
16927        assert!(!has_plan_heuristic(
16928            "This is a regular message without plans"
16929        ));
16930        assert!(!has_plan_heuristic(
16931            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```"
16932        ));
16933    }
16934
16935    #[test]
16936    fn has_plan_for_role_only_counts_assistant_messages() {
16937        let plan_text = "## Plan\n1. First\n2. Second";
16938        assert!(has_plan_for_role("assistant", plan_text));
16939        assert!(has_plan_for_role("agent", plan_text));
16940        assert!(has_plan_for_role("Assistant", plan_text));
16941        assert!(!has_plan_for_role("user", plan_text));
16942        assert!(!has_plan_for_role("tool", plan_text));
16943    }
16944
16945    #[test]
16946    fn api_rollups_require_api_data_source() {
16947        let mut agg = AnalyticsRollupAggregator::new();
16948
16949        let estimated_plan = MessageMetricsEntry {
16950            message_id: 1,
16951            created_at_ms: 0,
16952            hour_id: 1,
16953            day_id: 1,
16954            agent_slug: "codex".into(),
16955            workspace_id: 0,
16956            source_id: "local".into(),
16957            role: "assistant".into(),
16958            content_chars: 120,
16959            content_tokens_est: 30,
16960            model_name: None,
16961            model_family: "unknown".into(),
16962            model_tier: "unknown".into(),
16963            provider: "unknown".into(),
16964            api_input_tokens: Some(100),
16965            api_output_tokens: Some(50),
16966            api_cache_read_tokens: Some(0),
16967            api_cache_creation_tokens: Some(0),
16968            api_thinking_tokens: Some(0),
16969            api_service_tier: None,
16970            api_data_source: "estimated".into(),
16971            tool_call_count: 0,
16972            has_tool_calls: false,
16973            has_plan: true,
16974        };
16975        agg.record(&estimated_plan);
16976
16977        let api_plan = MessageMetricsEntry {
16978            message_id: 2,
16979            created_at_ms: 0,
16980            hour_id: 1,
16981            day_id: 1,
16982            agent_slug: "codex".into(),
16983            workspace_id: 0,
16984            source_id: "local".into(),
16985            role: "assistant".into(),
16986            content_chars: 80,
16987            content_tokens_est: 20,
16988            model_name: None,
16989            model_family: "unknown".into(),
16990            model_tier: "unknown".into(),
16991            provider: "unknown".into(),
16992            api_input_tokens: Some(40),
16993            api_output_tokens: Some(10),
16994            api_cache_read_tokens: Some(0),
16995            api_cache_creation_tokens: Some(0),
16996            api_thinking_tokens: Some(0),
16997            api_service_tier: None,
16998            api_data_source: "api".into(),
16999            tool_call_count: 0,
17000            has_tool_calls: false,
17001            has_plan: true,
17002        };
17003        agg.record(&api_plan);
17004
17005        let key = (1_i64, "codex".to_string(), 0_i64, "local".to_string());
17006        let hourly = agg.hourly.get(&key).expect("hourly rollup key must exist");
17007        let daily = agg.daily.get(&key).expect("daily rollup key must exist");
17008        let model_key = (
17009            1_i64,
17010            "codex".to_string(),
17011            0_i64,
17012            "local".to_string(),
17013            "unknown".to_string(),
17014            "unknown".to_string(),
17015        );
17016        let models_daily = agg
17017            .models_daily
17018            .get(&model_key)
17019            .expect("model rollup key must exist");
17020
17021        // Content rollup includes both plan messages.
17022        assert_eq!(hourly.plan_message_count, 2);
17023        assert_eq!(hourly.plan_content_tokens_est_total, 50);
17024        // API plan tokens must include only api_data_source='api' rows.
17025        assert_eq!(hourly.plan_api_tokens_total, 50);
17026        assert_eq!(daily.plan_api_tokens_total, 50);
17027        assert_eq!(models_daily.plan_api_tokens_total, 50);
17028        // Overall API totals must also exclude estimated rows.
17029        assert_eq!(hourly.api_tokens_total, 50);
17030        assert_eq!(hourly.api_input_tokens_total, 40);
17031        assert_eq!(hourly.api_output_tokens_total, 10);
17032        assert_eq!(hourly.api_coverage_message_count, 1);
17033        assert_eq!(daily.api_tokens_total, 50);
17034        assert_eq!(models_daily.api_tokens_total, 50);
17035    }
17036
17037    #[test]
17038    fn has_plan_heuristic_curated_corpus_thresholds() {
17039        // Cross-agent-style positives.
17040        let positives = [
17041            "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
17042            "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
17043            "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
17044            "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
17045            "# Plan\n1. Gather requirements\n2. Ship changes",
17046            "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
17047        ];
17048
17049        // Typical false positives we want to avoid.
17050        let negatives = [
17051            "The plan is to move fast and fix things later.",
17052            "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
17053            "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
17054            "I can help with that request. Let me know if you want details.",
17055            "Here is a list:\n- apples\n- oranges",
17056            "Status update: completed tasks and blockers below.",
17057        ];
17058
17059        let tp = positives
17060            .iter()
17061            .filter(|msg| has_plan_heuristic(msg))
17062            .count();
17063        let fp = negatives
17064            .iter()
17065            .filter(|msg| has_plan_heuristic(msg))
17066            .count();
17067
17068        let recall = tp as f64 / positives.len() as f64;
17069        let false_positive_rate = fp as f64 / negatives.len() as f64;
17070
17071        assert!(
17072            recall >= 0.80,
17073            "plan heuristic recall too low: got {recall:.2}"
17074        );
17075        assert!(
17076            false_positive_rate <= 0.20,
17077            "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
17078        );
17079    }
17080
17081    #[test]
17082    fn rebuild_analytics_repopulates_from_messages() {
17083        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17084        use std::path::PathBuf;
17085
17086        let dir = TempDir::new().unwrap();
17087        let db_path = dir.path().join("test.db");
17088        let storage = SqliteStorage::open(&db_path).unwrap();
17089
17090        // Register agent
17091        let agent = Agent {
17092            id: None,
17093            slug: "claude_code".into(),
17094            name: "Claude Code".into(),
17095            version: Some("1.0".into()),
17096            kind: AgentKind::Cli,
17097        };
17098        let agent_id = storage.ensure_agent(&agent).unwrap();
17099
17100        // 2026-02-06 10:30:00 UTC
17101        let ts_ms = 1_770_551_400_000_i64;
17102        let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
17103        let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);
17104
17105        let usage_json = serde_json::json!({
17106            "message": {
17107                "model": "claude-opus-4-6",
17108                "usage": {
17109                    "input_tokens": 100,
17110                    "output_tokens": 50,
17111                    "cache_read_input_tokens": 200,
17112                    "cache_creation_input_tokens": 30,
17113                    "service_tier": "standard"
17114                }
17115            }
17116        });
17117
17118        let conv = Conversation {
17119            id: None,
17120            agent_slug: "claude_code".into(),
17121            workspace: None,
17122            external_id: Some("test-rebuild-1".into()),
17123            title: Some("Test conversation".into()),
17124            source_path: PathBuf::from("/tmp/test.jsonl"),
17125            started_at: Some(ts_ms),
17126            ended_at: Some(ts_ms + 60_000),
17127            approx_tokens: None,
17128            metadata_json: serde_json::Value::Null,
17129            messages: vec![
17130                Message {
17131                    id: None,
17132                    idx: 0,
17133                    role: MessageRole::User,
17134                    author: None,
17135                    created_at: Some(ts_ms),
17136                    content: "Hello, can you help me with a plan?".into(),
17137                    extra_json: serde_json::Value::Null,
17138                    snippets: vec![],
17139                },
17140                Message {
17141                    id: None,
17142                    idx: 1,
17143                    role: MessageRole::Agent,
17144                    author: None,
17145                    created_at: Some(ts_ms + 30_000),
17146                    content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
17147                    extra_json: usage_json,
17148                    snippets: vec![],
17149                },
17150                Message {
17151                    id: None,
17152                    idx: 2,
17153                    role: MessageRole::User,
17154                    author: None,
17155                    created_at: Some(ts_ms + 60_000),
17156                    content: "Great, let's proceed!".into(),
17157                    extra_json: serde_json::Value::Null,
17158                    snippets: vec![],
17159                },
17160            ],
17161            source_id: "local".into(),
17162            origin_host: None,
17163        };
17164
17165        storage
17166            .insert_conversations_batched(&[(agent_id, None, &conv)])
17167            .unwrap();
17168
17169        // Save original analytics state
17170        let conn = storage.raw();
17171        let orig_mm: i64 = conn
17172            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17173                row.get_typed(0)
17174            })
17175            .unwrap();
17176        let orig_hourly: i64 = conn
17177            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
17178                row.get_typed(0)
17179            })
17180            .unwrap();
17181        let orig_daily: i64 = conn
17182            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
17183                row.get_typed(0)
17184            })
17185            .unwrap();
17186        let orig_models_daily: i64 = conn
17187            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
17188                row.get_typed(0)
17189            })
17190            .unwrap();
17191        let orig_api_input: i64 = conn
17192            .query_row_map(
17193                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
17194                &[],
17195                |row: &FrankenRow| row.get_typed(0),
17196            )
17197            .unwrap();
17198
17199        assert_eq!(orig_mm, 3);
17200        assert!(orig_hourly > 0);
17201        assert!(orig_daily > 0);
17202        assert!(orig_models_daily > 0);
17203
17204        // Destroy analytics tables (simulate corruption)
17205        conn.execute("DELETE FROM message_metrics").unwrap();
17206        conn.execute("DELETE FROM usage_hourly").unwrap();
17207        conn.execute("DELETE FROM usage_daily").unwrap();
17208        conn.execute("DELETE FROM usage_models_daily").unwrap();
17209
17210        // Verify they're empty
17211        let zero: i64 = conn
17212            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17213                row.get_typed(0)
17214            })
17215            .unwrap();
17216        assert_eq!(zero, 0);
17217
17218        // Rebuild analytics
17219        let result = storage.rebuild_analytics().unwrap();
17220
17221        assert_eq!(result.message_metrics_rows, 3);
17222        assert!(result.usage_hourly_rows > 0);
17223        assert!(result.usage_daily_rows > 0);
17224        assert!(result.usage_models_daily_rows > 0);
17225        assert!(
17226            result.elapsed_ms < 10_000,
17227            "Rebuild should be fast for 3 msgs"
17228        );
17229
17230        // Verify rebuilt data matches
17231        let conn = storage.raw();
17232        let rebuilt_mm: i64 = conn
17233            .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
17234                row.get_typed(0)
17235            })
17236            .unwrap();
17237        assert_eq!(
17238            rebuilt_mm, orig_mm,
17239            "Rebuilt message_metrics count should match"
17240        );
17241
17242        let rebuilt_hourly: i64 = conn
17243            .query_row_map("SELECT COUNT(*) FROM usage_hourly", &[], |row| {
17244                row.get_typed(0)
17245            })
17246            .unwrap();
17247        assert_eq!(
17248            rebuilt_hourly, orig_hourly,
17249            "Rebuilt hourly rows should match"
17250        );
17251
17252        let rebuilt_daily: i64 = conn
17253            .query_row_map("SELECT COUNT(*) FROM usage_daily", &[], |row| {
17254                row.get_typed(0)
17255            })
17256            .unwrap();
17257        assert_eq!(rebuilt_daily, orig_daily, "Rebuilt daily rows should match");
17258
17259        let rebuilt_models_daily: i64 = conn
17260            .query_row_map("SELECT COUNT(*) FROM usage_models_daily", &[], |row| {
17261                row.get_typed(0)
17262            })
17263            .unwrap();
17264        assert_eq!(
17265            rebuilt_models_daily, orig_models_daily,
17266            "Rebuilt model rollup rows should match"
17267        );
17268
17269        // Verify API token data preserved through rebuild
17270        let rebuilt_api_input: i64 = conn
17271            .query_row_map(
17272                "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'",
17273                &[],
17274                |row: &FrankenRow| row.get_typed(0),
17275            )
17276            .unwrap();
17277        assert_eq!(
17278            rebuilt_api_input, orig_api_input,
17279            "Rebuilt API input tokens should match original"
17280        );
17281
17282        // Verify rollups have correct data
17283        let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
17284            i64,
17285            i64,
17286            i64,
17287            i64,
17288            i64,
17289            i64,
17290        ) = conn
17291            .query_row_map(
17292                "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
17293                        plan_content_tokens_est_total, plan_api_tokens_total
17294                 FROM usage_hourly WHERE hour_id = ?",
17295                fparams![expected_hour],
17296                |row: &FrankenRow| {
17297                    Ok((
17298                        row.get_typed(0)?,
17299                        row.get_typed(1)?,
17300                        row.get_typed(2)?,
17301                        row.get_typed(3)?,
17302                        row.get_typed(4)?,
17303                        row.get_typed(5)?,
17304                    ))
17305                },
17306            )
17307            .unwrap();
17308        assert_eq!(uh_msg, 3);
17309        assert_eq!(uh_user, 2);
17310        assert_eq!(uh_asst, 1);
17311        assert_eq!(uh_plan, 1);
17312        assert!(uh_plan_content > 0);
17313        assert!(uh_plan_api > 0);
17314
17315        let ud_msg: i64 = conn
17316            .query_row_map(
17317                "SELECT message_count FROM usage_daily WHERE day_id = ?",
17318                fparams![expected_day],
17319                |row| row.get_typed(0),
17320            )
17321            .unwrap();
17322        assert_eq!(ud_msg, 3);
17323    }
17324
17325    #[test]
17326    fn insert_conversations_batched_flushes_large_fts_batches() {
17327        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17328        use std::path::PathBuf;
17329
17330        let dir = TempDir::new().unwrap();
17331        let db_path = dir.path().join("test.db");
17332        let storage = SqliteStorage::open(&db_path).unwrap();
17333        // V14 drops fts_messages during migration; cass normally recreates it
17334        // during startup via `ensure_search_fallback_fts_consistency`. Tests
17335        // that inspect fts_messages directly need to run the same repair pass
17336        // to exercise the "insert flushes FTS" contract.
17337        storage
17338            .ensure_search_fallback_fts_consistency()
17339            .expect("ensure FTS consistency before insert");
17340
17341        let agent = Agent {
17342            id: None,
17343            slug: "codex".into(),
17344            name: "Codex".into(),
17345            version: Some("0.2.3".into()),
17346            kind: AgentKind::Cli,
17347        };
17348        let agent_id = storage.ensure_agent(&agent).unwrap();
17349
17350        let content = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
17351        let messages: Vec<_> = (0_i64..2)
17352            .map(|i| Message {
17353                id: None,
17354                idx: i,
17355                role: MessageRole::Agent,
17356                author: None,
17357                created_at: Some(1_700_000_000_000 + i),
17358                content: format!("{i}-{content}"),
17359                extra_json: serde_json::Value::Null,
17360                snippets: Vec::new(),
17361            })
17362            .collect();
17363        let conv = Conversation {
17364            id: None,
17365            agent_slug: "codex".into(),
17366            workspace: Some(PathBuf::from("/tmp/workspace")),
17367            external_id: Some("fts-large-batch".into()),
17368            title: Some("FTS Large Batch".into()),
17369            source_path: PathBuf::from("/tmp/rollout.jsonl"),
17370            started_at: Some(1_700_000_000_000),
17371            ended_at: Some(1_700_000_000_999),
17372            approx_tokens: None,
17373            metadata_json: serde_json::Value::Null,
17374            messages,
17375            source_id: "local".into(),
17376            origin_host: None,
17377        };
17378
17379        let outcomes = storage
17380            .insert_conversations_batched(&[(agent_id, None, &conv)])
17381            .unwrap();
17382        assert_eq!(outcomes.len(), 1);
17383        assert_eq!(outcomes[0].inserted_indices.len(), conv.messages.len());
17384
17385        let message_count: i64 = storage
17386            .conn
17387            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
17388                row.get_typed(0)
17389            })
17390            .unwrap();
17391        let fts_count: i64 = storage
17392            .conn
17393            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
17394                row.get_typed(0)
17395            })
17396            .unwrap();
17397
17398        assert_eq!(message_count, conv.messages.len() as i64);
17399        assert_eq!(fts_count, conv.messages.len() as i64);
17400    }
17401
17402    fn make_profiled_storage_remote_conversation(
17403        external_id: i64,
17404        msg_count: usize,
17405    ) -> Conversation {
17406        Conversation {
17407            id: None,
17408            agent_slug: "codex".into(),
17409            workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
17410            external_id: Some(format!("profiled-storage-remote-{external_id}")),
17411            title: Some(format!(
17412                "Profiled storage remote conversation {external_id}"
17413            )),
17414            source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
17415            started_at: Some(10_000 + external_id * 100),
17416            ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
17417            approx_tokens: Some(msg_count as i64 * 32),
17418            metadata_json: serde_json::json!({ "bench": true }),
17419            messages: (0..msg_count)
17420                .map(|idx| Message {
17421                    id: None,
17422                    idx: idx as i64,
17423                    role: if idx % 2 == 0 {
17424                        MessageRole::User
17425                    } else {
17426                        MessageRole::Agent
17427                    },
17428                    author: Some("tester".into()),
17429                    created_at: Some(20_000 + external_id * 100 + idx as i64),
17430                    content: format!(
17431                        "profiled storage remote content ext={external_id} idx={idx} {}",
17432                        "x".repeat(64)
17433                    ),
17434                    extra_json: serde_json::json!({ "idx": idx }),
17435                    snippets: Vec::new(),
17436                })
17437                .collect(),
17438            source_id: "profiled-storage-remote-source".into(),
17439            origin_host: Some("builder-profile".into()),
17440        }
17441    }
17442
17443    fn make_profiled_append_remote_merge_conversation(
17444        external_id: i64,
17445        msg_count: usize,
17446    ) -> Conversation {
17447        let base_ts = 100_000 + external_id * 1_000;
17448        Conversation {
17449            id: None,
17450            agent_slug: "codex".into(),
17451            workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
17452            external_id: Some(format!("profiled-append-remote-{external_id}")),
17453            title: Some(format!("Profiled append remote conversation {external_id}")),
17454            source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
17455            started_at: Some(base_ts),
17456            ended_at: Some(base_ts + msg_count as i64),
17457            approx_tokens: Some(msg_count as i64 * 50),
17458            metadata_json: serde_json::json!({ "bench": true }),
17459            messages: (0..msg_count)
17460                .map(|idx| Message {
17461                    id: None,
17462                    idx: idx as i64,
17463                    role: if idx % 2 == 0 {
17464                        MessageRole::User
17465                    } else {
17466                        MessageRole::Agent
17467                    },
17468                    author: Some(format!("model-{}", external_id % 5)),
17469                    created_at: Some(base_ts + idx as i64),
17470                    content: format!(
17471                        "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
17472                        external_id, idx
17473                    ),
17474                    extra_json: serde_json::json!({ "bench": true }),
17475                    snippets: Vec::new(),
17476                })
17477                .collect(),
17478            source_id: "profiled-append-remote-source".into(),
17479            origin_host: Some("builder-profile".into()),
17480        }
17481    }
17482
17483    #[test]
17484    fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
17485        let dir = TempDir::new().unwrap();
17486        let db_path = dir.path().join("batched-message-ids.db");
17487        let storage = SqliteStorage::open(&db_path).unwrap();
17488        let agent_id = storage
17489            .ensure_agent(&Agent {
17490                id: None,
17491                slug: "codex".into(),
17492                name: "Codex".into(),
17493                version: None,
17494                kind: AgentKind::Cli,
17495            })
17496            .unwrap();
17497        let workspace_id = storage
17498            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17499            .unwrap();
17500        let mut conv = make_profiled_storage_remote_conversation(42, 5);
17501        for (idx, msg) in conv.messages.iter_mut().enumerate() {
17502            msg.snippets.push(Snippet {
17503                id: None,
17504                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
17505                start_line: Some((idx + 1) as i64),
17506                end_line: Some((idx + 2) as i64),
17507                language: Some("rust".into()),
17508                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
17509            });
17510        }
17511        let outcome = storage
17512            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
17513            .unwrap();
17514
17515        let message_count: i64 = storage
17516            .conn
17517            .query_row_map(
17518                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17519                fparams![outcome.conversation_id],
17520                |row| row.get_typed(0),
17521            )
17522            .unwrap();
17523        let joined_snippet_count: i64 = storage
17524            .conn
17525            .query_row_map(
17526                "SELECT COUNT(*)
17527                 FROM snippets s
17528                 JOIN messages m ON s.message_id = m.id
17529                 WHERE m.conversation_id = ?1",
17530                fparams![outcome.conversation_id],
17531                |row| row.get_typed(0),
17532            )
17533            .unwrap();
17534
17535        assert_eq!(message_count, conv.messages.len() as i64);
17536        assert_eq!(joined_snippet_count, conv.messages.len() as i64);
17537    }
17538
17539    #[test]
17540    fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
17541        let dir = TempDir::new().unwrap();
17542        let db_path = dir.path().join("batched-append-message-ids.db");
17543        let storage = SqliteStorage::open(&db_path).unwrap();
17544        let agent_id = storage
17545            .ensure_agent(&Agent {
17546                id: None,
17547                slug: "codex".into(),
17548                name: "Codex".into(),
17549                version: None,
17550                kind: AgentKind::Cli,
17551            })
17552            .unwrap();
17553        let workspace_id = storage
17554            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17555            .unwrap();
17556
17557        let mut initial = make_profiled_storage_remote_conversation(77, 2);
17558        for (idx, msg) in initial.messages.iter_mut().enumerate() {
17559            msg.snippets.push(Snippet {
17560                id: None,
17561                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
17562                start_line: Some((idx + 1) as i64),
17563                end_line: Some((idx + 2) as i64),
17564                language: Some("rust".into()),
17565                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
17566            });
17567        }
17568        let first = storage
17569            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17570            .unwrap();
17571        assert_eq!(first.inserted_indices, vec![0, 1]);
17572
17573        let mut appended = make_profiled_storage_remote_conversation(77, 5);
17574        for (idx, msg) in appended.messages.iter_mut().enumerate() {
17575            msg.snippets.push(Snippet {
17576                id: None,
17577                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
17578                start_line: Some((idx + 10) as i64),
17579                end_line: Some((idx + 11) as i64),
17580                language: Some("rust".into()),
17581                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
17582            });
17583        }
17584        let second = storage
17585            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17586            .unwrap();
17587        assert_eq!(second.conversation_id, first.conversation_id);
17588        assert_eq!(second.inserted_indices, vec![2, 3, 4]);
17589
17590        let message_count: i64 = storage
17591            .conn
17592            .query_row_map(
17593                "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
17594                fparams![first.conversation_id],
17595                |row| row.get_typed(0),
17596            )
17597            .unwrap();
17598        let joined_snippets: Vec<(i64, String)> = storage
17599            .conn
17600            .query_map_collect(
17601                "SELECT m.idx, s.file_path
17602                 FROM snippets s
17603                 JOIN messages m ON s.message_id = m.id
17604                 WHERE m.conversation_id = ?1
17605                 ORDER BY m.idx, s.id",
17606                fparams![first.conversation_id],
17607                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17608            )
17609            .unwrap();
17610
17611        assert_eq!(message_count, 5);
17612        assert_eq!(
17613            joined_snippets,
17614            vec![
17615                (0, "src/append_initial_0.rs".to_string()),
17616                (1, "src/append_initial_1.rs".to_string()),
17617                (2, "src/append_full_2.rs".to_string()),
17618                (3, "src/append_full_3.rs".to_string()),
17619                (4, "src/append_full_4.rs".to_string()),
17620            ]
17621        );
17622    }
17623
17624    #[test]
17625    fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
17626        let dir = TempDir::new().unwrap();
17627        let db_path = dir.path().join("external-lookup-rehydrate.db");
17628        let storage = SqliteStorage::open(&db_path).unwrap();
17629        let agent_id = storage
17630            .ensure_agent(&Agent {
17631                id: None,
17632                slug: "codex".into(),
17633                name: "Codex".into(),
17634                version: None,
17635                kind: AgentKind::Cli,
17636            })
17637            .unwrap();
17638        let workspace_id = storage
17639            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
17640            .unwrap();
17641
17642        let initial = make_profiled_storage_remote_conversation(88, 2);
17643        let first = storage
17644            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
17645            .unwrap();
17646        let external_id = initial.external_id.as_deref().unwrap();
17647        let lookup_key =
17648            conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
17649        let lookup_id: i64 = storage
17650            .conn
17651            .query_row_map(
17652                "SELECT conversation_id
17653                 FROM conversation_external_tail_lookup
17654                 WHERE lookup_key = ?1",
17655                fparams![lookup_key.as_str()],
17656                |row| row.get_typed(0),
17657            )
17658            .unwrap();
17659        assert_eq!(lookup_id, first.conversation_id);
17660
17661        storage
17662            .conn
17663            .execute_compat(
17664                "DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
17665                fparams![lookup_key.as_str()],
17666            )
17667            .unwrap();
17668
17669        let appended = make_profiled_storage_remote_conversation(88, 4);
17670        let second = storage
17671            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
17672            .unwrap();
17673        assert_eq!(second.conversation_id, first.conversation_id);
17674        assert_eq!(second.inserted_indices, vec![2, 3]);
17675
17676        let conversation_count: i64 = storage
17677            .conn
17678            .query_row_map(
17679                "SELECT COUNT(*)
17680                 FROM conversations
17681                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
17682                fparams![initial.source_id.as_str(), agent_id, external_id],
17683                |row| row.get_typed(0),
17684            )
17685            .unwrap();
17686        let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
17687            .conn
17688            .query_row_map(
17689                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
17690                 FROM conversation_external_tail_lookup
17691                 WHERE lookup_key = ?1",
17692                fparams![lookup_key.as_str()],
17693                |row| {
17694                    Ok((
17695                        row.get_typed(0)?,
17696                        row.get_typed(1)?,
17697                        row.get_typed(2)?,
17698                        row.get_typed(3)?,
17699                    ))
17700                },
17701            )
17702            .unwrap();
17703        let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
17704            .conn
17705            .query_row_map(
17706                "SELECT ended_at, last_message_idx, last_message_created_at
17707                 FROM conversation_tail_state
17708                 WHERE conversation_id = ?1",
17709                fparams![first.conversation_id],
17710                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
17711            )
17712            .unwrap();
17713        assert_eq!(conversation_count, 1);
17714        assert_eq!(
17715            restored_lookup,
17716            (
17717                first.conversation_id,
17718                tail_state.0,
17719                tail_state.1,
17720                tail_state.2
17721            )
17722        );
17723        assert_eq!(
17724            tail_state,
17725            (
17726                appended.messages[3].created_at,
17727                Some(3),
17728                appended.messages[3].created_at
17729            )
17730        );
17731    }
17732
17733    #[test]
17734    fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
17735        let dir = TempDir::new().unwrap();
17736        let db_path = dir.path().join("test.db");
17737        let storage = SqliteStorage::open(&db_path).unwrap();
17738        let agent_id = storage
17739            .ensure_agent(&Agent {
17740                id: None,
17741                slug: "codex".into(),
17742                name: "Codex".into(),
17743                version: None,
17744                kind: AgentKind::Cli,
17745            })
17746            .unwrap();
17747        let workspace = PathBuf::from("/ws/profiled-storage-remote");
17748        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17749
17750        storage
17751            .insert_conversation_tree(
17752                agent_id,
17753                Some(workspace_id),
17754                &make_profiled_storage_remote_conversation(0, 3),
17755            )
17756            .unwrap();
17757        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17758
17759        storage
17760            .insert_conversation_tree(
17761                agent_id,
17762                Some(workspace_id),
17763                &make_profiled_storage_remote_conversation(1, 2),
17764            )
17765            .unwrap();
17766
17767        let row_count: i64 = storage
17768            .conn
17769            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
17770                row.get_typed(0)
17771            })
17772            .unwrap();
17773        let (session_count, message_count): (i64, i64) = storage
17774            .conn
17775            .query_row_map(
17776                "SELECT session_count, message_count
17777                 FROM daily_stats
17778                 WHERE agent_slug = 'all' AND source_id = 'all'",
17779                fparams![],
17780                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
17781            )
17782            .unwrap();
17783
17784        assert_eq!(row_count, 4);
17785        assert_eq!(session_count, 1);
17786        assert_eq!(message_count, 2);
17787    }
17788
17789    #[test]
17790    #[serial]
17791    fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
17792        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17793
17794        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17795            let dir = TempDir::new().unwrap();
17796            let db_path = dir.path().join(format!("profile-{msg_count}.db"));
17797            let storage = SqliteStorage::open(&db_path).unwrap();
17798            let agent_id = storage
17799                .ensure_agent(&Agent {
17800                    id: None,
17801                    slug: "codex".into(),
17802                    name: "Codex".into(),
17803                    version: None,
17804                    kind: AgentKind::Cli,
17805                })
17806                .unwrap();
17807            let workspace = PathBuf::from("/ws/profiled-storage-remote");
17808            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17809
17810            storage
17811                .insert_conversation_tree(
17812                    agent_id,
17813                    Some(workspace_id),
17814                    &make_profiled_storage_remote_conversation(0, msg_count),
17815                )
17816                .unwrap();
17817
17818            let mut profile = InsertConversationTreePerfProfile::default();
17819            for external_id in 1..=iterations {
17820                storage
17821                    .insert_conversation_tree_with_profile(
17822                        agent_id,
17823                        Some(workspace_id),
17824                        &make_profiled_storage_remote_conversation(external_id as i64, msg_count),
17825                        &mut profile,
17826                    )
17827                    .unwrap();
17828            }
17829
17830            let accounted_duration = profile.source_duration
17831                + profile.tx_open_duration
17832                + profile.existing_lookup_duration
17833                + profile.conversation_row_duration
17834                + profile.message_insert_duration
17835                + profile.snippet_insert_duration
17836                + profile.fts_entry_duration
17837                + profile.fts_flush_duration
17838                + profile.analytics_duration
17839                + profile.commit_duration;
17840            assert_eq!(profile.invocations, iterations);
17841            assert_eq!(profile.messages, iterations * msg_count);
17842            assert_eq!(profile.inserted_messages, iterations * msg_count);
17843            assert!(
17844                profile.total_duration >= accounted_duration,
17845                "accounted stage durations cannot exceed total duration"
17846            );
17847
17848            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
17849        }
17850    }
17851
17852    #[test]
17853    #[serial]
17854    fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
17855        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17856
17857        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17858            let dir = TempDir::new().unwrap();
17859            let db_path = dir.path().join(format!("append-profile-{msg_count}.db"));
17860            let storage = SqliteStorage::open(&db_path).unwrap();
17861            let agent_id = storage
17862                .ensure_agent(&Agent {
17863                    id: None,
17864                    slug: "codex".into(),
17865                    name: "Codex".into(),
17866                    version: None,
17867                    kind: AgentKind::Cli,
17868                })
17869                .unwrap();
17870            let workspace = PathBuf::from("/ws/profiled-append-remote");
17871            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17872
17873            for external_id in 0..iterations {
17874                storage
17875                    .insert_conversation_tree(
17876                        agent_id,
17877                        Some(workspace_id),
17878                        &make_profiled_append_remote_merge_conversation(
17879                            external_id as i64,
17880                            msg_count,
17881                        ),
17882                    )
17883                    .unwrap();
17884            }
17885
17886            let mut profile = InsertConversationTreePerfProfile::default();
17887            for external_id in 0..iterations {
17888                storage
17889                    .append_existing_conversation_with_profile(
17890                        agent_id,
17891                        Some(workspace_id),
17892                        &make_profiled_append_remote_merge_conversation(
17893                            external_id as i64,
17894                            msg_count * 2,
17895                        ),
17896                        &mut profile,
17897                    )
17898                    .unwrap();
17899            }
17900
17901            let accounted_duration = profile.source_duration
17902                + profile.tx_open_duration
17903                + profile.existing_lookup_duration
17904                + profile.existing_idx_lookup_duration
17905                + profile.existing_replay_lookup_duration
17906                + profile.dedupe_filter_duration
17907                + profile.conversation_row_duration
17908                + profile.message_insert_duration
17909                + profile.snippet_insert_duration
17910                + profile.fts_entry_duration
17911                + profile.fts_flush_duration
17912                + profile.analytics_duration
17913                + profile.commit_duration;
17914            assert_eq!(profile.invocations, iterations);
17915            assert_eq!(profile.messages, iterations * msg_count * 2);
17916            assert_eq!(profile.inserted_messages, iterations * msg_count);
17917            assert!(
17918                profile.total_duration >= accounted_duration,
17919                "accounted append stage durations cannot exceed total duration"
17920            );
17921
17922            profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
17923        }
17924    }
17925
    /// Seeds two agents, two conversations, three messages and their `message_metrics`
    /// rows with raw SQL, empties `daily_stats`, and then checks that
    /// `rebuild_daily_stats` reports two sessions, that `daily_stats_health` shows no
    /// drift, and that the `all`/`all` aggregate row counts three messages.
    #[test]
    fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("test.db");
        let storage = SqliteStorage::open(&db_path).unwrap();
        // Every fixture row below is timestamped relative to this instant, and all
        // metrics rows share the day/hour bucket derived from it.
        let started_at = 1_700_000_000_000_i64;
        let day_id = FrankenStorage::day_id_from_millis(started_at);
        let hour_id = FrankenStorage::hour_id_from_millis(started_at);

        // Agents 1 ("codex") and 2 ("claude"), inserted with explicit ids so the
        // conversation rows below can reference them.
        storage
            .conn
            .execute_compat(
                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
                fparams![1_i64, "codex", "Codex", "cli"],
            )
            .unwrap();
        storage
            .conn
            .execute_compat(
                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
                fparams![2_i64, "claude", "Claude", "cli"],
            )
            .unwrap();

        // Conversation 1 belongs to agent 1, conversation 2 to agent 2; both are local
        // and start at the same instant.
        storage
            .conn
            .execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
                fparams![
                    1_i64,
                    1_i64,
                    LOCAL_SOURCE_ID,
                    "daily-a",
                    "Daily A",
                    "/tmp/daily-a.jsonl",
                    started_at,
                    started_at + 200,
                    "{}"
                ],
            )
            .unwrap();
        storage
            .conn
            .execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
                fparams![
                    2_i64,
                    2_i64,
                    LOCAL_SOURCE_ID,
                    "daily-b",
                    "Daily B",
                    "/tmp/daily-b.jsonl",
                    started_at,
                    started_at + 300,
                    "{}"
                ],
            )
            .unwrap();

        // Messages 1 and 2 go to conversation 1, message 3 to conversation 2.
        storage
            .conn
            .execute_compat(
                "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
                fparams![1_i64, 1_i64, 0_i64, "user", started_at, "hello"],
            )
            .unwrap();
        storage
            .conn
            .execute_compat(
                "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
                fparams![2_i64, 1_i64, 1_i64, "assistant", started_at + 100, "response"],
            )
            .unwrap();
        storage
            .conn
            .execute_compat(
                "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
                fparams![3_i64, 2_i64, 0_i64, "user", started_at + 50, "abc"],
            )
            .unwrap();

        // One metrics row per message. `content_len` matches the length of the message
        // content inserted above ("hello" = 5, "response" = 8, "abc" = 3).
        for (message_id, agent_slug, role, content_len) in [
            (1_i64, "codex", "user", 5_i64),
            (2_i64, "codex", "assistant", 8_i64),
            (3_i64, "claude", "user", 3_i64),
        ] {
            // The 24 parameters are bound positionally, in the same order as the
            // column list in the INSERT statement.
            storage
                .conn
                .execute_compat(
                    "INSERT INTO message_metrics (
                        message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
                        role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
                        api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
                        api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
                        model_name, model_family, model_tier, provider
                     ) VALUES (
                        ?1, ?2, ?3, ?4, ?5, ?6, ?7,
                        ?8, ?9, ?10, ?11, ?12,
                        ?13, ?14, ?15,
                        ?16, ?17, ?18, ?19, ?20,
                        ?21, ?22, ?23, ?24
                     )",
                    fparams![
                        message_id,
                        started_at,
                        hour_id,
                        day_id,
                        agent_slug,
                        0_i64,
                        LOCAL_SOURCE_ID,
                        role,
                        content_len,
                        content_len / 4,
                        0_i64,
                        0_i64,
                        0_i64,
                        0_i64,
                        0_i64,
                        "",
                        "estimated",
                        0_i64,
                        0_i64,
                        0_i64,
                        "",
                        "unknown",
                        "unknown",
                        "unknown"
                    ],
                )
                .unwrap();
        }

        // Start the rebuild from an empty materialized table.
        storage.conn.execute("DELETE FROM daily_stats").unwrap();

        let rebuilt = storage.rebuild_daily_stats().unwrap();
        assert_eq!(rebuilt.total_sessions, 2);

        // The health report must agree with the two seeded conversations.
        let health = storage.daily_stats_health().unwrap();
        assert_eq!(health.conversation_count, 2);
        assert_eq!(health.materialized_total, 2);
        assert_eq!(health.drift, 0);

        // The `all`/`all` aggregate row must count all three seeded messages.
        let total_messages: i64 = storage
            .conn
            .query_row_map(
                "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(total_messages, 3);
    }
18092
    /// Checks that `total_chars` in the `all`/`all` `daily_stats` row equals the UTF-8
    /// byte length of a message containing multi-byte characters, both after an inline
    /// `franken_update_daily_stats_in_tx` update and after the table is wiped and
    /// rebuilt with `rebuild_daily_stats` while a `message_metrics` row is present.
    #[test]
    fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("test.db");
        let storage = SqliteStorage::open(&db_path).unwrap();

        // Mixed ASCII / multi-byte content: `str::len` returns the byte length, which
        // differs from the character count for this string.
        let content = "ASCII🙂é漢字";
        let expected_bytes = content.len() as i64;
        let started_at = 1_704_067_200_000_i64;
        let day_id = FrankenStorage::day_id_from_millis(started_at);
        let hour_id = FrankenStorage::hour_id_from_millis(started_at);

        // Seed one agent, one conversation (no `ended_at`) and one message via raw SQL.
        storage
            .conn
            .execute_compat(
                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
                fparams![1_i64, "tester", "Tester", "cli"],
            )
            .unwrap();
        storage
            .conn
            .execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
                fparams![
                    1_i64,
                    1_i64,
                    LOCAL_SOURCE_ID,
                    "unicode-metrics",
                    "Unicode Metrics",
                    "/tmp/unicode-metrics.jsonl",
                    started_at,
                    "{}"
                ],
            )
            .unwrap();
        storage
            .conn
            .execute_compat(
                "INSERT INTO messages (
                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
                fparams![1_i64, 1_i64, 0_i64, "user", started_at, content],
            )
            .unwrap();
        // Metrics row for the message; `content_chars` is seeded with the byte length.
        // Parameters are bound positionally in the order of the column list.
        storage
            .conn
            .execute_compat(
                "INSERT INTO message_metrics (
                    message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
                    role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
                    api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
                    api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
                    model_name, model_family, model_tier, provider
                 ) VALUES (
                    ?1, ?2, ?3, ?4, ?5, ?6, ?7,
                    ?8, ?9, ?10, ?11, ?12,
                    ?13, ?14, ?15,
                    ?16, ?17, ?18, ?19, ?20,
                    ?21, ?22, ?23, ?24
                 )",
                fparams![
                    1_i64,
                    started_at,
                    hour_id,
                    day_id,
                    "tester",
                    0_i64,
                    LOCAL_SOURCE_ID,
                    "user",
                    expected_bytes,
                    expected_bytes / 4,
                    0_i64,
                    0_i64,
                    0_i64,
                    0_i64,
                    0_i64,
                    "",
                    "estimated",
                    0_i64,
                    0_i64,
                    0_i64,
                    "",
                    "unknown",
                    "unknown",
                    "unknown"
                ],
            )
            .unwrap();

        // Inline path: apply a one-session / one-message delta inside a transaction.
        let mut tx = storage.conn.transaction().unwrap();
        franken_update_daily_stats_in_tx(
            &storage,
            &tx,
            "tester",
            LOCAL_SOURCE_ID,
            Some(started_at),
            StatsDelta {
                session_count_delta: 1,
                message_count_delta: 1,
                total_chars_delta: expected_bytes,
            },
        )
        .unwrap();
        tx.commit().unwrap();

        let inline_total: i64 = storage
            .conn
            .query_row_map(
                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(inline_total, expected_bytes);

        // Rebuild path: wipe the materialized rows and recompute them from scratch.
        storage.conn.execute("DELETE FROM daily_stats").unwrap();

        let rebuilt = storage.rebuild_daily_stats().unwrap();
        assert_eq!(rebuilt.total_sessions, 1);

        // The rebuilt aggregate must report the same byte total as the inline update.
        let rebuilt_total: i64 = storage
            .conn
            .query_row_map(
                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
                fparams![],
                |row| row.get_typed(0),
            )
            .unwrap();
        assert_eq!(rebuilt_total, expected_bytes);
    }
18227
18228    #[test]
18229    fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
18230        let dir = TempDir::new().unwrap();
18231        let db_path = dir.path().join("test.db");
18232        let storage = SqliteStorage::open(&db_path).unwrap();
18233
18234        let content = "fallback🙂é漢字";
18235        let expected_bytes = content.len() as i64;
18236        let started_at = 1_704_067_200_000_i64;
18237        storage
18238            .conn
18239            .execute_compat(
18240                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
18241                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
18242                fparams![1_i64, "tester", "Tester", "cli"],
18243            )
18244            .unwrap();
18245        storage
18246            .conn
18247            .execute_compat(
18248                "INSERT INTO conversations (
18249                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
18250                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
18251                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
18252                fparams![
18253                    1_i64,
18254                    1_i64,
18255                    LOCAL_SOURCE_ID,
18256                    "unicode-fallback",
18257                    "Unicode Fallback",
18258                    "/tmp/unicode-fallback.jsonl",
18259                    started_at,
18260                    "{}"
18261                ],
18262            )
18263            .unwrap();
18264        storage
18265            .conn
18266            .execute_compat(
18267                "INSERT INTO messages (
18268                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
18269                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
18270                fparams![1_i64, 1_i64, 0_i64, "assistant", started_at, content],
18271            )
18272            .unwrap();
18273
18274        let mut tx = storage.conn.transaction().unwrap();
18275        franken_update_daily_stats_in_tx(
18276            &storage,
18277            &tx,
18278            "tester",
18279            LOCAL_SOURCE_ID,
18280            Some(started_at),
18281            StatsDelta {
18282                session_count_delta: 1,
18283                message_count_delta: 1,
18284                total_chars_delta: expected_bytes,
18285            },
18286        )
18287        .unwrap();
18288        tx.commit().unwrap();
18289
18290        storage.conn.execute("DELETE FROM daily_stats").unwrap();
18291
18292        let rebuilt = storage.rebuild_daily_stats().unwrap();
18293        assert_eq!(rebuilt.total_sessions, 1);
18294
18295        let rebuilt_total: i64 = storage
18296            .conn
18297            .query_row_map(
18298                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
18299                fparams![],
18300                |row| row.get_typed(0),
18301            )
18302            .unwrap();
18303        assert_eq!(rebuilt_total, expected_bytes);
18304    }
18305
18306    #[test]
18307    fn insert_conversations_batched_appends_duplicate_external_id() {
18308        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18309        use std::path::PathBuf;
18310
18311        let dir = TempDir::new().unwrap();
18312        let db_path = dir.path().join("test.db");
18313        let storage = SqliteStorage::open(&db_path).unwrap();
18314
18315        let agent = Agent {
18316            id: None,
18317            slug: "codex".into(),
18318            name: "Codex".into(),
18319            version: Some("0.2.3".into()),
18320            kind: AgentKind::Cli,
18321        };
18322        let agent_id = storage.ensure_agent(&agent).unwrap();
18323
18324        let base_conv = |messages: Vec<Message>| Conversation {
18325            id: None,
18326            agent_slug: "codex".into(),
18327            workspace: Some(PathBuf::from("/tmp/workspace")),
18328            external_id: Some("shared-session".into()),
18329            title: Some("Shared Session".into()),
18330            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18331            started_at: Some(1_700_000_000_000),
18332            ended_at: Some(1_700_000_000_999),
18333            approx_tokens: None,
18334            metadata_json: serde_json::Value::Null,
18335            messages,
18336            source_id: "local".into(),
18337            origin_host: None,
18338        };
18339
18340        let conv_a = base_conv(vec![
18341            Message {
18342                id: None,
18343                idx: 0,
18344                role: MessageRole::User,
18345                author: None,
18346                created_at: Some(1_700_000_000_000),
18347                content: "first".into(),
18348                extra_json: serde_json::Value::Null,
18349                snippets: Vec::new(),
18350            },
18351            Message {
18352                id: None,
18353                idx: 1,
18354                role: MessageRole::Agent,
18355                author: None,
18356                created_at: Some(1_700_000_000_100),
18357                content: "second".into(),
18358                extra_json: serde_json::Value::Null,
18359                snippets: Vec::new(),
18360            },
18361        ]);
18362        let conv_b = base_conv(vec![
18363            Message {
18364                id: None,
18365                idx: 0,
18366                role: MessageRole::User,
18367                author: None,
18368                created_at: Some(1_700_000_000_000),
18369                content: "first".into(),
18370                extra_json: serde_json::Value::Null,
18371                snippets: Vec::new(),
18372            },
18373            Message {
18374                id: None,
18375                idx: 1,
18376                role: MessageRole::Agent,
18377                author: None,
18378                created_at: Some(1_700_000_000_100),
18379                content: "second".into(),
18380                extra_json: serde_json::Value::Null,
18381                snippets: Vec::new(),
18382            },
18383            Message {
18384                id: None,
18385                idx: 2,
18386                role: MessageRole::User,
18387                author: None,
18388                created_at: Some(1_700_000_000_200),
18389                content: "third".into(),
18390                extra_json: serde_json::Value::Null,
18391                snippets: Vec::new(),
18392            },
18393            Message {
18394                id: None,
18395                idx: 3,
18396                role: MessageRole::Agent,
18397                author: None,
18398                created_at: Some(1_700_000_000_300),
18399                content: "fourth".into(),
18400                extra_json: serde_json::Value::Null,
18401                snippets: Vec::new(),
18402            },
18403        ]);
18404
18405        let outcomes = storage
18406            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
18407            .unwrap();
18408        assert_eq!(outcomes.len(), 2);
18409        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
18410        assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
18411        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
18412
18413        let conversation_count: i64 = storage
18414            .conn
18415            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18416                row.get_typed(0)
18417            })
18418            .unwrap();
18419        let conversation_count_not_indexed: i64 = storage
18420            .conn
18421            .query_row_map(
18422                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18423                fparams![],
18424                |row| row.get_typed(0),
18425            )
18426            .unwrap();
18427        let conversation_count_source_index: i64 = storage
18428            .conn
18429            .query_row_map(
18430                "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
18431                fparams![],
18432                |row| row.get_typed(0),
18433            )
18434            .unwrap();
18435        let message_count: i64 = storage
18436            .conn
18437            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18438                row.get_typed(0)
18439            })
18440            .unwrap();
18441        let reopened_storage = SqliteStorage::open(&db_path).unwrap();
18442        let reopened_conversation_count: i64 = reopened_storage
18443            .conn
18444            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18445                row.get_typed(0)
18446            })
18447            .unwrap();
18448        let reopened_conversation_count_not_indexed: i64 = reopened_storage
18449            .conn
18450            .query_row_map(
18451                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18452                fparams![],
18453                |row| row.get_typed(0),
18454            )
18455            .unwrap();
18456        let reopened_conversation_ids: Vec<i64> = reopened_storage
18457            .conn
18458            .query_map_collect(
18459                "SELECT id FROM conversations ORDER BY id",
18460                fparams![],
18461                |row| row.get_typed(0),
18462            )
18463            .unwrap();
18464        let reopened_conversation_ids_not_indexed: Vec<i64> = reopened_storage
18465            .conn
18466            .query_map_collect(
18467                "SELECT id FROM conversations NOT INDEXED ORDER BY id",
18468                fparams![],
18469                |row| row.get_typed(0),
18470            )
18471            .unwrap();
18472        let reopened_conversation_ids_source_index: Vec<i64> = reopened_storage
18473            .conn
18474            .query_map_collect(
18475                "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
18476                fparams![],
18477                |row| row.get_typed(0),
18478            )
18479            .unwrap();
18480
18481        assert_eq!(reopened_conversation_ids, vec![outcomes[0].conversation_id]);
18482        assert_eq!(
18483            reopened_conversation_ids_not_indexed,
18484            vec![outcomes[0].conversation_id]
18485        );
18486        assert_eq!(
18487            reopened_conversation_ids_source_index,
18488            vec![outcomes[0].conversation_id]
18489        );
18490        assert_eq!(reopened_conversation_count, 1);
18491        assert_eq!(reopened_conversation_count_not_indexed, 1);
18492        assert_eq!(conversation_count_not_indexed, 1);
18493        assert_eq!(conversation_count_source_index, 1);
18494        assert_eq!(conversation_count, 1);
18495        assert_eq!(message_count, 4);
18496    }
18497
18498    #[test]
18499    fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
18500        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18501        use std::path::PathBuf;
18502
18503        let dir = TempDir::new().unwrap();
18504        let db_path = dir.path().join("test.db");
18505        let storage = SqliteStorage::open(&db_path).unwrap();
18506
18507        let agent = Agent {
18508            id: None,
18509            slug: "codex".into(),
18510            name: "Codex".into(),
18511            version: Some("0.2.3".into()),
18512            kind: AgentKind::Cli,
18513        };
18514        let agent_id = storage.ensure_agent(&agent).unwrap();
18515
18516        let conv = Conversation {
18517            id: None,
18518            agent_slug: "codex".into(),
18519            workspace: Some(PathBuf::from("/tmp/workspace")),
18520            external_id: Some("recover-duplicate".into()),
18521            title: Some("Recover Duplicate".into()),
18522            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18523            started_at: Some(1_700_000_000_000),
18524            ended_at: Some(1_700_000_000_100),
18525            approx_tokens: None,
18526            metadata_json: serde_json::Value::Null,
18527            messages: vec![Message {
18528                id: None,
18529                idx: 0,
18530                role: MessageRole::User,
18531                author: None,
18532                created_at: Some(1_700_000_000_000),
18533                content: "hello".into(),
18534                extra_json: serde_json::Value::Null,
18535                snippets: Vec::new(),
18536            }],
18537            source_id: "local".into(),
18538            origin_host: None,
18539        };
18540
18541        let tx = storage.conn.transaction().unwrap();
18542        let inserted_id = franken_insert_conversation(&tx, agent_id, None, &conv)
18543            .unwrap()
18544            .expect("first insert should succeed");
18545
18546        let conversation_key = conversation_merge_key(agent_id, &conv);
18547        let resolved = franken_insert_conversation_or_get_existing_after_miss(
18548            &tx,
18549            agent_id,
18550            None,
18551            &conv,
18552            &conversation_key,
18553        )
18554        .unwrap();
18555
18556        assert!(
18557            matches!(
18558                resolved,
18559                ConversationInsertStatus::Existing(existing_id)
18560                    if existing_id.cmp(&inserted_id).is_eq()
18561            ),
18562            "expected existing conversation id {inserted_id}"
18563        );
18564
18565        let conversation_count: i64 = tx
18566            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18567                row.get_typed(0)
18568            })
18569            .unwrap();
18570        assert_eq!(conversation_count, 1);
18571    }
18572
18573    #[test]
18574    fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
18575        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18576        use std::path::PathBuf;
18577
18578        let dir = TempDir::new().unwrap();
18579        let db_path = dir.path().join("test.db");
18580        let storage = SqliteStorage::open(&db_path).unwrap();
18581
18582        let agent = Agent {
18583            id: None,
18584            slug: "codex".into(),
18585            name: "Codex".into(),
18586            version: Some("0.2.3".into()),
18587            kind: AgentKind::Cli,
18588        };
18589        let agent_id = storage.ensure_agent(&agent).unwrap();
18590
18591        let base_conv = |messages: Vec<Message>| Conversation {
18592            id: None,
18593            agent_slug: "codex".into(),
18594            workspace: Some(PathBuf::from("/tmp/workspace")),
18595            external_id: Some("shared-session-gap".into()),
18596            title: Some("Shared Session Gap".into()),
18597            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18598            started_at: Some(1_700_000_000_000),
18599            ended_at: Some(1_700_000_000_999),
18600            approx_tokens: None,
18601            metadata_json: serde_json::Value::Null,
18602            messages,
18603            source_id: "local".into(),
18604            origin_host: None,
18605        };
18606
18607        let conv_a = base_conv(vec![
18608            Message {
18609                id: None,
18610                idx: 2,
18611                role: MessageRole::User,
18612                author: None,
18613                created_at: Some(1_700_000_000_200),
18614                content: "third".into(),
18615                extra_json: serde_json::Value::Null,
18616                snippets: Vec::new(),
18617            },
18618            Message {
18619                id: None,
18620                idx: 3,
18621                role: MessageRole::Agent,
18622                author: None,
18623                created_at: Some(1_700_000_000_300),
18624                content: "fourth".into(),
18625                extra_json: serde_json::Value::Null,
18626                snippets: Vec::new(),
18627            },
18628        ]);
18629        let conv_b = base_conv(vec![
18630            Message {
18631                id: None,
18632                idx: 0,
18633                role: MessageRole::User,
18634                author: None,
18635                created_at: Some(1_700_000_000_000),
18636                content: "first".into(),
18637                extra_json: serde_json::Value::Null,
18638                snippets: Vec::new(),
18639            },
18640            Message {
18641                id: None,
18642                idx: 1,
18643                role: MessageRole::Agent,
18644                author: None,
18645                created_at: Some(1_700_000_000_100),
18646                content: "second".into(),
18647                extra_json: serde_json::Value::Null,
18648                snippets: Vec::new(),
18649            },
18650            Message {
18651                id: None,
18652                idx: 3,
18653                role: MessageRole::Agent,
18654                author: None,
18655                created_at: Some(1_700_000_000_300),
18656                content: "fourth".into(),
18657                extra_json: serde_json::Value::Null,
18658                snippets: Vec::new(),
18659            },
18660        ]);
18661
18662        let outcomes = storage
18663            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
18664            .unwrap();
18665        assert_eq!(outcomes.len(), 2);
18666        assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
18667        assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
18668        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
18669
18670        let stored_indices: Vec<i64> = storage
18671            .conn
18672            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18673                row.get_typed(0)
18674            })
18675            .unwrap();
18676        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
18677    }
18678
18679    #[test]
18680    fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
18681        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18682        use std::path::PathBuf;
18683
18684        let dir = TempDir::new().unwrap();
18685        let db_path = dir.path().join("test.db");
18686        let storage = SqliteStorage::open(&db_path).unwrap();
18687
18688        let agent = Agent {
18689            id: None,
18690            slug: "codex".into(),
18691            name: "Codex".into(),
18692            version: Some("0.2.3".into()),
18693            kind: AgentKind::Cli,
18694        };
18695        let agent_id = storage.ensure_agent(&agent).unwrap();
18696
18697        let make_message = |idx: i64, content: &str| Message {
18698            id: None,
18699            idx,
18700            role: if idx == 0 {
18701                MessageRole::User
18702            } else {
18703                MessageRole::Agent
18704            },
18705            author: None,
18706            created_at: Some(1_700_000_000_000 + idx),
18707            content: content.into(),
18708            extra_json: serde_json::Value::Null,
18709            snippets: Vec::new(),
18710        };
18711
18712        let base_conv = |messages: Vec<Message>| Conversation {
18713            id: None,
18714            agent_slug: "codex".into(),
18715            workspace: Some(PathBuf::from("/tmp/workspace")),
18716            external_id: Some("partial-cache-session".into()),
18717            title: Some("Partial cache session".into()),
18718            source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
18719            started_at: Some(1_700_000_000_000),
18720            ended_at: Some(1_700_000_000_100),
18721            approx_tokens: None,
18722            metadata_json: serde_json::Value::Null,
18723            messages,
18724            source_id: "local".into(),
18725            origin_host: None,
18726        };
18727
18728        let canonical = base_conv(vec![
18729            make_message(0, "canonical zero"),
18730            make_message(20, "canonical twenty"),
18731        ]);
18732        storage
18733            .insert_conversation_tree(agent_id, None, &canonical)
18734            .unwrap();
18735
18736        let exact_prefix = base_conv(vec![make_message(0, "canonical zero")]);
18737        let conflicting_tail = base_conv(vec![make_message(20, "conflicting twenty")]);
18738
18739        let outcomes = storage
18740            .insert_conversations_batched(&[
18741                (agent_id, None, &exact_prefix),
18742                (agent_id, None, &conflicting_tail),
18743            ])
18744            .unwrap();
18745
18746        assert_eq!(outcomes.len(), 2);
18747        assert!(outcomes[0].inserted_indices.is_empty());
18748        assert!(
18749            outcomes[1].inserted_indices.is_empty(),
18750            "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
18751        );
18752
18753        let stored_messages: Vec<(i64, String)> = storage
18754            .conn
18755            .query_map_collect(
18756                "SELECT idx, content FROM messages ORDER BY idx",
18757                fparams![],
18758                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18759            )
18760            .unwrap();
18761        assert_eq!(
18762            stored_messages,
18763            vec![
18764                (0, "canonical zero".to_string()),
18765                (20, "canonical twenty".to_string()),
18766            ]
18767        );
18768    }
18769
18770    #[test]
18771    fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
18772        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18773        use std::path::PathBuf;
18774
18775        const MESSAGE_COUNT: i64 = 64;
18776
18777        let dir = TempDir::new().unwrap();
18778        let db_path = dir.path().join("test.db");
18779        let storage = SqliteStorage::open(&db_path).unwrap();
18780
18781        let agent = Agent {
18782            id: None,
18783            slug: "codex".into(),
18784            name: "Codex".into(),
18785            version: Some("0.2.3".into()),
18786            kind: AgentKind::Cli,
18787        };
18788        let agent_id = storage.ensure_agent(&agent).unwrap();
18789
18790        let messages: Vec<Message> = (0..MESSAGE_COUNT)
18791            .map(|idx| Message {
18792                id: None,
18793                idx,
18794                role: if idx % 2 == 0 {
18795                    MessageRole::User
18796                } else {
18797                    MessageRole::Agent
18798                },
18799                author: None,
18800                created_at: Some(1_700_000_000_000 + idx),
18801                content: format!("message {idx}"),
18802                extra_json: serde_json::Value::Null,
18803                snippets: Vec::new(),
18804            })
18805            .collect();
18806
18807        let conversation = Conversation {
18808            id: None,
18809            agent_slug: "codex".into(),
18810            workspace: Some(PathBuf::from("/tmp/workspace")),
18811            external_id: Some("large-reprocess-session".into()),
18812            title: Some("Large Reprocess Session".into()),
18813            source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
18814            started_at: Some(1_700_000_000_000),
18815            ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
18816            approx_tokens: None,
18817            metadata_json: serde_json::Value::Null,
18818            messages,
18819            source_id: "local".into(),
18820            origin_host: None,
18821        };
18822
18823        let first = storage
18824            .insert_conversations_batched(&[(agent_id, None, &conversation)])
18825            .unwrap();
18826        let second = storage
18827            .insert_conversations_batched(&[(agent_id, None, &conversation)])
18828            .unwrap();
18829
18830        assert_eq!(first.len(), 1);
18831        assert_eq!(second.len(), 1);
18832        assert_eq!(first[0].inserted_indices.len(), MESSAGE_COUNT as usize);
18833        assert!(
18834            second[0].inserted_indices.is_empty(),
18835            "full reprocessing of a large conversation must not attempt duplicate idx inserts"
18836        );
18837        assert_eq!(first[0].conversation_id, second[0].conversation_id);
18838
18839        let conversation_count: i64 = storage
18840            .conn
18841            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18842                row.get_typed(0)
18843            })
18844            .unwrap();
18845        let message_count: i64 = storage
18846            .conn
18847            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18848                row.get_typed(0)
18849            })
18850            .unwrap();
18851
18852        assert_eq!(conversation_count, 1);
18853        assert_eq!(message_count, MESSAGE_COUNT);
18854    }
18855
18856    #[test]
18857    fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
18858        use crate::connectors::{NormalizedConversation, NormalizedMessage};
18859        use crate::indexer::persist::map_to_internal;
18860        use crate::model::types::{Agent, AgentKind};
18861        use frankensqlite::compat::{ConnectionExt, RowExt};
18862        use rand::RngExt;
18863        use rayon::prelude::*;
18864
18865        fn retryable_franken_error(err: &anyhow::Error) -> bool {
18866            err.downcast_ref::<frankensqlite::FrankenError>()
18867                .or_else(|| {
18868                    err.root_cause()
18869                        .downcast_ref::<frankensqlite::FrankenError>()
18870                })
18871                .is_some_and(|inner| {
18872                    matches!(
18873                        inner,
18874                        frankensqlite::FrankenError::Busy
18875                            | frankensqlite::FrankenError::BusyRecovery
18876                            | frankensqlite::FrankenError::BusySnapshot { .. }
18877                            | frankensqlite::FrankenError::WriteConflict { .. }
18878                            | frankensqlite::FrankenError::SerializationFailure { .. }
18879                    )
18880                })
18881        }
18882
18883        fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
18884        where
18885            F: FnMut() -> anyhow::Result<T>,
18886        {
18887            let mut rng = rand::rng();
18888            let mut backoff_ms = 4_u64;
18889            for attempt in 0..=24 {
18890                match f() {
18891                    Ok(value) => return Ok(value),
18892                    Err(err) if attempt < 24 && retryable_franken_error(&err) => {
18893                        let sleep_ms = backoff_ms + rng.random_range(0..=backoff_ms);
18894                        std::thread::sleep(Duration::from_millis(sleep_ms));
18895                        backoff_ms = (backoff_ms * 2).min(512);
18896                    }
18897                    Err(err) => return Err(err),
18898                }
18899            }
18900            unreachable!("retry loop must return on success or final failure")
18901        }
18902
18903        let dir = TempDir::new().unwrap();
18904        let db_path = dir.path().join("parallel_insert_conversation_tree.db");
18905        let seed = FrankenStorage::open(&db_path).unwrap();
18906        drop(seed);
18907
18908        let conversations: Vec<NormalizedConversation> = (0..10)
18909            .map(|i| NormalizedConversation {
18910                agent_slug: format!("agent-{}", i % 3),
18911                external_id: Some(format!("conv-{i}")),
18912                title: Some(format!("Conversation {i}")),
18913                workspace: Some(PathBuf::from(format!("/ws/{i}"))),
18914                source_path: PathBuf::from(format!("/log/{i}.jsonl")),
18915                started_at: Some(1_000 + i * 100),
18916                ended_at: Some(1_000 + i * 100 + 50),
18917                metadata: serde_json::json!({}),
18918                messages: (0..3)
18919                    .map(|j| NormalizedMessage {
18920                        idx: j,
18921                        role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
18922                        author: Some("tester".into()),
18923                        created_at: Some(1_000 + i * 100 + j * 10),
18924                        content: format!("parallel-distinct-test conv={i} msg={j}"),
18925                        extra: serde_json::json!({}),
18926                        snippets: vec![],
18927                        invocations: Vec::new(),
18928                    })
18929                    .collect(),
18930            })
18931            .collect();
18932
18933        let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
18934            .par_chunks(3)
18935            .map(|chunk| {
18936                let storage = FrankenStorage::open_writer(&db_path).unwrap();
18937                let mut agent_cache: HashMap<String, i64> = HashMap::new();
18938                let mut workspace_cache: HashMap<PathBuf, i64> = HashMap::new();
18939                let mut chunk_outcomes = Vec::with_capacity(chunk.len());
18940
18941                for conv in chunk {
18942                    let agent_slug = conv.agent_slug.clone();
18943                    let workspace = conv.workspace.clone();
18944                    let external_id = conv.external_id.clone().expect("external id");
18945                    let internal = map_to_internal(conv);
18946                    let outcome = with_retry(|| {
18947                        let agent_id = if let Some(id) = agent_cache.get(&agent_slug) {
18948                            *id
18949                        } else {
18950                            let agent = Agent {
18951                                id: None,
18952                                slug: agent_slug.clone(),
18953                                name: agent_slug.clone(),
18954                                version: None,
18955                                kind: AgentKind::Cli,
18956                            };
18957                            let id = storage.ensure_agent(&agent)?;
18958                            agent_cache.insert(agent_slug.clone(), id);
18959                            id
18960                        };
18961                        let workspace_id = if let Some(path) = &workspace {
18962                            if let Some(id) = workspace_cache.get(path) {
18963                                Some(*id)
18964                            } else {
18965                                let id = storage.ensure_workspace(path, None)?;
18966                                workspace_cache.insert(path.clone(), id);
18967                                Some(id)
18968                            }
18969                        } else {
18970                            None
18971                        };
18972                        storage.insert_conversation_tree(agent_id, workspace_id, &internal)
18973                    })
18974                    .unwrap();
18975                    chunk_outcomes.push((
18976                        external_id,
18977                        outcome.conversation_id,
18978                        outcome.inserted_indices,
18979                    ));
18980                }
18981
18982                storage.close().unwrap();
18983                chunk_outcomes
18984            })
18985            .flatten()
18986            .collect();
18987        outcomes.sort_by(|left, right| left.0.cmp(&right.0));
18988
18989        assert!(
18990            outcomes
18991                .iter()
18992                .all(|(_, _, inserted_indices)| inserted_indices == &vec![0, 1, 2]),
18993            "unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
18994        );
18995
18996        let distinct_ids: HashSet<i64> = outcomes
18997            .iter()
18998            .map(|(_, conversation_id, _)| *conversation_id)
18999            .collect();
19000        assert_eq!(
19001            distinct_ids.len(),
19002            conversations.len(),
19003            "unique external ids must produce distinct conversation ids: {outcomes:?}"
19004        );
19005
19006        let reader = FrankenStorage::open(&db_path).unwrap();
19007        let stored_rows: Vec<(i64, String)> = reader
19008            .raw()
19009            .query_map_collect(
19010                "SELECT id, external_id FROM conversations ORDER BY id",
19011                &[],
19012                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
19013            )
19014            .unwrap();
19015        let stored_count: i64 = reader
19016            .raw()
19017            .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
19018                row.get_typed(0)
19019            })
19020            .unwrap();
19021
19022        assert_eq!(
19023            stored_count as usize,
19024            conversations.len(),
19025            "parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
19026        );
19027        assert_eq!(
19028            stored_rows.len(),
19029            conversations.len(),
19030            "parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
19031        );
19032    }
19033
19034    #[test]
19035    fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
19036        use crate::connectors::{NormalizedConversation, NormalizedMessage};
19037        use crate::indexer::persist::map_to_internal;
19038        use crate::model::types::{Agent, AgentKind};
19039        use std::path::PathBuf;
19040
19041        let dir = TempDir::new().unwrap();
19042        let db_path = dir.path().join("test.db");
19043        let storage = SqliteStorage::open(&db_path).unwrap();
19044
19045        let agent = Agent {
19046            id: None,
19047            slug: "codex".into(),
19048            name: "Codex".into(),
19049            version: Some("0.2.3".into()),
19050            kind: AgentKind::Cli,
19051        };
19052        let agent_id = storage.ensure_agent(&agent).unwrap();
19053
19054        let base_conv = |messages: Vec<NormalizedMessage>| NormalizedConversation {
19055            agent_slug: "codex".into(),
19056            workspace: Some(PathBuf::from("/tmp/workspace")),
19057            external_id: Some("tree-gap-session".into()),
19058            title: Some("Tree Gap Session".into()),
19059            source_path: PathBuf::from("/tmp/tree.jsonl"),
19060            started_at: Some(1_700_000_000_000),
19061            ended_at: Some(1_700_000_000_999),
19062            metadata: serde_json::Value::Null,
19063            messages,
19064        };
19065
19066        let conv_a = map_to_internal(&base_conv(vec![
19067            NormalizedMessage {
19068                idx: 2,
19069                role: "user".into(),
19070                author: None,
19071                created_at: Some(1_700_000_000_200),
19072                content: "third".into(),
19073                extra: serde_json::Value::Null,
19074                snippets: Vec::new(),
19075                invocations: Vec::new(),
19076            },
19077            NormalizedMessage {
19078                idx: 3,
19079                role: "assistant".into(),
19080                author: None,
19081                created_at: Some(1_700_000_000_300),
19082                content: "fourth".into(),
19083                extra: serde_json::Value::Null,
19084                snippets: Vec::new(),
19085                invocations: Vec::new(),
19086            },
19087        ]));
19088        let conv_b = map_to_internal(&base_conv(vec![
19089            NormalizedMessage {
19090                idx: 0,
19091                role: "user".into(),
19092                author: None,
19093                created_at: Some(1_700_000_000_000),
19094                content: "first".into(),
19095                extra: serde_json::Value::Null,
19096                snippets: Vec::new(),
19097                invocations: Vec::new(),
19098            },
19099            NormalizedMessage {
19100                idx: 1,
19101                role: "assistant".into(),
19102                author: None,
19103                created_at: Some(1_700_000_000_100),
19104                content: "second".into(),
19105                extra: serde_json::Value::Null,
19106                snippets: Vec::new(),
19107                invocations: Vec::new(),
19108            },
19109            NormalizedMessage {
19110                idx: 3,
19111                role: "assistant".into(),
19112                author: None,
19113                created_at: Some(1_700_000_000_300),
19114                content: "fourth".into(),
19115                extra: serde_json::Value::Null,
19116                snippets: Vec::new(),
19117                invocations: Vec::new(),
19118            },
19119        ]));
19120
19121        let first = storage
19122            .insert_conversation_tree(agent_id, None, &conv_a)
19123            .unwrap();
19124        let second = storage
19125            .insert_conversation_tree(agent_id, None, &conv_b)
19126            .unwrap();
19127
19128        assert_eq!(first.inserted_indices, vec![2, 3]);
19129        assert_eq!(second.inserted_indices, vec![0, 1]);
19130        assert_eq!(first.conversation_id, second.conversation_id);
19131
19132        let stored_indices: Vec<i64> = storage
19133            .conn
19134            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
19135                row.get_typed(0)
19136            })
19137            .unwrap();
19138        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
19139    }
19140
19141    #[test]
19142    fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
19143        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19144        use std::path::PathBuf;
19145
19146        let dir = TempDir::new().unwrap();
19147        let db_path = dir.path().join("test.db");
19148        let storage = SqliteStorage::open(&db_path).unwrap();
19149
19150        let agent = Agent {
19151            id: None,
19152            slug: "codex".into(),
19153            name: "Codex".into(),
19154            version: Some("0.2.3".into()),
19155            kind: AgentKind::Cli,
19156        };
19157        let agent_id = storage.ensure_agent(&agent).unwrap();
19158
19159        let conversation = Conversation {
19160            id: None,
19161            agent_slug: "codex".into(),
19162            workspace: Some(PathBuf::from("/tmp/workspace")),
19163            external_id: Some("duplicate-new-session".into()),
19164            title: Some("Duplicate New Session".into()),
19165            source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
19166            started_at: Some(1_700_000_000_000),
19167            ended_at: Some(1_700_000_000_999),
19168            approx_tokens: None,
19169            metadata_json: serde_json::Value::Null,
19170            messages: vec![
19171                Message {
19172                    id: None,
19173                    idx: 0,
19174                    role: MessageRole::User,
19175                    author: None,
19176                    created_at: Some(1_700_000_000_000),
19177                    content: "first canonical".into(),
19178                    extra_json: serde_json::Value::Null,
19179                    snippets: Vec::new(),
19180                },
19181                Message {
19182                    id: None,
19183                    idx: 0,
19184                    role: MessageRole::User,
19185                    author: None,
19186                    created_at: Some(1_700_000_000_001),
19187                    content: "duplicate idx should be skipped".into(),
19188                    extra_json: serde_json::Value::Null,
19189                    snippets: Vec::new(),
19190                },
19191                Message {
19192                    id: None,
19193                    idx: 1,
19194                    role: MessageRole::Agent,
19195                    author: None,
19196                    created_at: Some(1_700_000_000_100),
19197                    content: "second".into(),
19198                    extra_json: serde_json::Value::Null,
19199                    snippets: Vec::new(),
19200                },
19201            ],
19202            source_id: "local".into(),
19203            origin_host: None,
19204        };
19205
19206        let outcome = storage
19207            .insert_conversation_tree(agent_id, None, &conversation)
19208            .unwrap();
19209
19210        assert_eq!(outcome.inserted_indices, vec![0, 1]);
19211
19212        let stored_messages: Vec<(i64, String)> = storage
19213            .conn
19214            .query_map_collect(
19215                "SELECT idx, content FROM messages ORDER BY idx",
19216                fparams![],
19217                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
19218            )
19219            .unwrap();
19220        assert_eq!(
19221            stored_messages,
19222            vec![
19223                (0, "first canonical".to_string()),
19224                (1, "second".to_string())
19225            ]
19226        );
19227    }
19228
19229    #[test]
19230    fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
19231        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19232        use std::path::PathBuf;
19233
19234        let dir = TempDir::new().unwrap();
19235        let db_path = dir.path().join("test.db");
19236        let storage = SqliteStorage::open(&db_path).unwrap();
19237
19238        let agent = Agent {
19239            id: None,
19240            slug: "codex".into(),
19241            name: "Codex".into(),
19242            version: Some("0.2.3".into()),
19243            kind: AgentKind::Cli,
19244        };
19245        let agent_id = storage.ensure_agent(&agent).unwrap();
19246
19247        let base_conv = |messages: Vec<Message>| Conversation {
19248            id: None,
19249            agent_slug: "codex".into(),
19250            workspace: Some(PathBuf::from("/tmp/workspace")),
19251            external_id: None,
19252            title: Some("Source Path Merge".into()),
19253            source_path: PathBuf::from("/tmp/shared-session.jsonl"),
19254            started_at: Some(1_700_000_000_000),
19255            ended_at: Some(1_700_000_000_999),
19256            approx_tokens: None,
19257            metadata_json: serde_json::Value::Null,
19258            messages,
19259            source_id: "local".into(),
19260            origin_host: None,
19261        };
19262
19263        let first = storage
19264            .insert_conversation_tree(
19265                agent_id,
19266                None,
19267                &base_conv(vec![
19268                    Message {
19269                        id: None,
19270                        idx: 0,
19271                        role: MessageRole::User,
19272                        author: None,
19273                        created_at: Some(1_700_000_000_000),
19274                        content: "first".into(),
19275                        extra_json: serde_json::Value::Null,
19276                        snippets: Vec::new(),
19277                    },
19278                    Message {
19279                        id: None,
19280                        idx: 1,
19281                        role: MessageRole::Agent,
19282                        author: None,
19283                        created_at: Some(1_700_000_000_100),
19284                        content: "second".into(),
19285                        extra_json: serde_json::Value::Null,
19286                        snippets: Vec::new(),
19287                    },
19288                ]),
19289            )
19290            .unwrap();
19291
19292        let second = storage
19293            .insert_conversation_tree(
19294                agent_id,
19295                None,
19296                &base_conv(vec![
19297                    Message {
19298                        id: None,
19299                        idx: 1,
19300                        role: MessageRole::Agent,
19301                        author: None,
19302                        created_at: Some(1_700_000_000_100),
19303                        content: "second".into(),
19304                        extra_json: serde_json::Value::Null,
19305                        snippets: Vec::new(),
19306                    },
19307                    Message {
19308                        id: None,
19309                        idx: 2,
19310                        role: MessageRole::User,
19311                        author: None,
19312                        created_at: Some(1_700_000_000_200),
19313                        content: "third".into(),
19314                        extra_json: serde_json::Value::Null,
19315                        snippets: Vec::new(),
19316                    },
19317                ]),
19318            )
19319            .unwrap();
19320
19321        assert_eq!(first.conversation_id, second.conversation_id);
19322        assert_eq!(first.inserted_indices, vec![0, 1]);
19323        assert_eq!(second.inserted_indices, vec![2]);
19324
19325        let stored_indices: Vec<i64> = storage
19326            .conn
19327            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
19328                row.get_typed(0)
19329            })
19330            .unwrap();
19331        assert_eq!(stored_indices, vec![0, 1, 2]);
19332    }
19333
19334    #[test]
19335    fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
19336        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19337        use std::path::PathBuf;
19338
19339        let dir = TempDir::new().unwrap();
19340        let db_path = dir.path().join("test.db");
19341        let storage = SqliteStorage::open(&db_path).unwrap();
19342
19343        let agent = Agent {
19344            id: None,
19345            slug: "codex".into(),
19346            name: "Codex".into(),
19347            version: Some("0.2.3".into()),
19348            kind: AgentKind::Cli,
19349        };
19350        let agent_id = storage.ensure_agent(&agent).unwrap();
19351
19352        let base_conv = |started_at: Option<i64>, messages: Vec<Message>| Conversation {
19353            id: None,
19354            agent_slug: "codex".into(),
19355            workspace: Some(PathBuf::from("/tmp/workspace")),
19356            external_id: None,
19357            title: Some("Drift Merge".into()),
19358            source_path: PathBuf::from("/tmp/drift-session.jsonl"),
19359            started_at,
19360            ended_at: Some(1_700_000_000_999),
19361            approx_tokens: None,
19362            metadata_json: serde_json::Value::Null,
19363            messages,
19364            source_id: "local".into(),
19365            origin_host: None,
19366        };
19367
19368        let first = storage
19369            .insert_conversation_tree(
19370                agent_id,
19371                None,
19372                &base_conv(
19373                    Some(1_700_000_000_000),
19374                    vec![
19375                        Message {
19376                            id: None,
19377                            idx: 0,
19378                            role: MessageRole::User,
19379                            author: None,
19380                            created_at: Some(1_700_000_000_000),
19381                            content: "first".into(),
19382                            extra_json: serde_json::Value::Null,
19383                            snippets: Vec::new(),
19384                        },
19385                        Message {
19386                            id: None,
19387                            idx: 1,
19388                            role: MessageRole::Agent,
19389                            author: None,
19390                            created_at: Some(1_700_000_000_100),
19391                            content: "second".into(),
19392                            extra_json: serde_json::Value::Null,
19393                            snippets: Vec::new(),
19394                        },
19395                    ],
19396                ),
19397            )
19398            .unwrap();
19399
19400        let second = storage
19401            .insert_conversation_tree(
19402                agent_id,
19403                None,
19404                &base_conv(
19405                    Some(1_700_000_004_000),
19406                    vec![
19407                        Message {
19408                            id: None,
19409                            idx: 1,
19410                            role: MessageRole::Agent,
19411                            author: None,
19412                            created_at: Some(1_700_000_000_100),
19413                            content: "second".into(),
19414                            extra_json: serde_json::Value::Null,
19415                            snippets: Vec::new(),
19416                        },
19417                        Message {
19418                            id: None,
19419                            idx: 2,
19420                            role: MessageRole::User,
19421                            author: None,
19422                            created_at: Some(1_700_000_004_200),
19423                            content: "third".into(),
19424                            extra_json: serde_json::Value::Null,
19425                            snippets: Vec::new(),
19426                        },
19427                    ],
19428                ),
19429            )
19430            .unwrap();
19431
19432        assert_eq!(first.conversation_id, second.conversation_id);
19433        assert_eq!(second.inserted_indices, vec![2]);
19434    }
19435
19436    #[test]
19437    fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
19438        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19439        use std::path::PathBuf;
19440
19441        let dir = TempDir::new().unwrap();
19442        let db_path = dir.path().join("test.db");
19443        let storage = SqliteStorage::open(&db_path).unwrap();
19444
19445        let agent = Agent {
19446            id: None,
19447            slug: "codex".into(),
19448            name: "Codex".into(),
19449            version: Some("0.2.3".into()),
19450            kind: AgentKind::Cli,
19451        };
19452        let agent_id = storage.ensure_agent(&agent).unwrap();
19453
19454        let make_conv = |started_at: i64, idx: i64, content: &str| Conversation {
19455            id: None,
19456            agent_slug: "codex".into(),
19457            workspace: Some(PathBuf::from("/tmp/workspace")),
19458            external_id: None,
19459            title: Some("Partial overlap".into()),
19460            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19461            started_at: Some(started_at),
19462            ended_at: Some(started_at + 500),
19463            approx_tokens: None,
19464            metadata_json: serde_json::Value::Null,
19465            messages: vec![Message {
19466                id: None,
19467                idx,
19468                role: MessageRole::User,
19469                author: None,
19470                created_at: Some(started_at),
19471                content: content.into(),
19472                extra_json: serde_json::Value::Null,
19473                snippets: Vec::new(),
19474            }],
19475            source_id: "local".into(),
19476            origin_host: None,
19477        };
19478
19479        storage
19480            .insert_conversation_tree(
19481                agent_id,
19482                None,
19483                &Conversation {
19484                    messages: vec![
19485                        Message {
19486                            id: None,
19487                            idx: 0,
19488                            role: MessageRole::User,
19489                            author: None,
19490                            created_at: Some(1_700_000_000_000),
19491                            content: "shared opener".into(),
19492                            extra_json: serde_json::Value::Null,
19493                            snippets: Vec::new(),
19494                        },
19495                        Message {
19496                            id: None,
19497                            idx: 1,
19498                            role: MessageRole::Agent,
19499                            author: None,
19500                            created_at: Some(1_700_000_000_100),
19501                            content: "first session unique".into(),
19502                            extra_json: serde_json::Value::Null,
19503                            snippets: Vec::new(),
19504                        },
19505                    ],
19506                    ..make_conv(1_700_000_000_000, 0, "unused")
19507                },
19508            )
19509            .unwrap();
19510        storage
19511            .insert_conversation_tree(
19512                agent_id,
19513                None,
19514                &make_conv(1_700_000_900_000, 0, "shared opener"),
19515            )
19516            .unwrap();
19517
19518        let conversation_count: i64 = storage
19519            .conn
19520            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19521                row.get_typed(0)
19522            })
19523            .unwrap();
19524        assert_eq!(conversation_count, 2);
19525    }
19526
19527    #[test]
19528    fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
19529        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19530        use std::path::PathBuf;
19531
19532        let dir = TempDir::new().unwrap();
19533        let db_path = dir.path().join("test.db");
19534        let storage = SqliteStorage::open(&db_path).unwrap();
19535
19536        let agent = Agent {
19537            id: None,
19538            slug: "codex".into(),
19539            name: "Codex".into(),
19540            version: Some("0.2.3".into()),
19541            kind: AgentKind::Cli,
19542        };
19543        let agent_id = storage.ensure_agent(&agent).unwrap();
19544
19545        let make_conv = |started_at: i64, created_at: i64, content: &str| Conversation {
19546            id: None,
19547            agent_slug: "codex".into(),
19548            workspace: Some(PathBuf::from("/tmp/workspace")),
19549            external_id: None,
19550            title: Some("Same Path Different Session".into()),
19551            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19552            started_at: Some(started_at),
19553            ended_at: Some(started_at + 500),
19554            approx_tokens: None,
19555            metadata_json: serde_json::Value::Null,
19556            messages: vec![Message {
19557                id: None,
19558                idx: 0,
19559                role: MessageRole::User,
19560                author: None,
19561                created_at: Some(created_at),
19562                content: content.into(),
19563                extra_json: serde_json::Value::Null,
19564                snippets: Vec::new(),
19565            }],
19566            source_id: "local".into(),
19567            origin_host: None,
19568        };
19569
19570        storage
19571            .insert_conversation_tree(
19572                agent_id,
19573                None,
19574                &make_conv(1_700_000_000_000, 1_700_000_000_000, "first session"),
19575            )
19576            .unwrap();
19577        storage
19578            .insert_conversation_tree(
19579                agent_id,
19580                None,
19581                &make_conv(1_700_000_900_000, 1_700_000_900_000, "second session"),
19582            )
19583            .unwrap();
19584
19585        let conversation_count: i64 = storage
19586            .conn
19587            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19588                row.get_typed(0)
19589            })
19590            .unwrap();
19591        assert_eq!(conversation_count, 2);
19592    }
19593
19594    #[test]
19595    fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
19596        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19597        use std::path::PathBuf;
19598
19599        let dir = TempDir::new().unwrap();
19600        let db_path = dir.path().join("test.db");
19601        let storage = SqliteStorage::open(&db_path).unwrap();
19602
19603        let agent = Agent {
19604            id: None,
19605            slug: "codex".into(),
19606            name: "Codex".into(),
19607            version: Some("0.2.3".into()),
19608            kind: AgentKind::Cli,
19609        };
19610        let agent_id = storage.ensure_agent(&agent).unwrap();
19611
19612        let make_conv = |started_at: i64, messages: Vec<Message>| Conversation {
19613            id: None,
19614            agent_slug: "codex".into(),
19615            workspace: Some(PathBuf::from("/tmp/workspace")),
19616            external_id: None,
19617            title: Some("Shifted replay".into()),
19618            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
19619            started_at: Some(started_at),
19620            ended_at: Some(started_at + 500),
19621            approx_tokens: None,
19622            metadata_json: serde_json::Value::Null,
19623            messages,
19624            source_id: "local".into(),
19625            origin_host: None,
19626        };
19627
19628        let first = storage
19629            .insert_conversation_tree(
19630                agent_id,
19631                None,
19632                &make_conv(
19633                    1_700_000_000_000,
19634                    vec![
19635                        Message {
19636                            id: None,
19637                            idx: 0,
19638                            role: MessageRole::User,
19639                            author: None,
19640                            created_at: Some(1_700_000_000_000),
19641                            content: "first".into(),
19642                            extra_json: serde_json::Value::Null,
19643                            snippets: Vec::new(),
19644                        },
19645                        Message {
19646                            id: None,
19647                            idx: 1,
19648                            role: MessageRole::Agent,
19649                            author: None,
19650                            created_at: Some(1_700_000_000_100),
19651                            content: "second".into(),
19652                            extra_json: serde_json::Value::Null,
19653                            snippets: Vec::new(),
19654                        },
19655                    ],
19656                ),
19657            )
19658            .unwrap();
19659
19660        let second = storage
19661            .insert_conversation_tree(
19662                agent_id,
19663                None,
19664                &make_conv(
19665                    1_700_000_900_000,
19666                    vec![
19667                        Message {
19668                            id: None,
19669                            idx: 10,
19670                            role: MessageRole::User,
19671                            author: None,
19672                            created_at: Some(1_700_000_000_000),
19673                            content: "first".into(),
19674                            extra_json: serde_json::Value::Null,
19675                            snippets: Vec::new(),
19676                        },
19677                        Message {
19678                            id: None,
19679                            idx: 11,
19680                            role: MessageRole::Agent,
19681                            author: None,
19682                            created_at: Some(1_700_000_000_100),
19683                            content: "second".into(),
19684                            extra_json: serde_json::Value::Null,
19685                            snippets: Vec::new(),
19686                        },
19687                        Message {
19688                            id: None,
19689                            idx: 12,
19690                            role: MessageRole::User,
19691                            author: None,
19692                            created_at: Some(1_700_000_000_200),
19693                            content: "third".into(),
19694                            extra_json: serde_json::Value::Null,
19695                            snippets: Vec::new(),
19696                        },
19697                    ],
19698                ),
19699            )
19700            .unwrap();
19701
19702        assert_eq!(first.conversation_id, second.conversation_id);
19703        assert_eq!(second.inserted_indices, vec![12]);
19704
19705        let stored_indices: Vec<i64> = storage
19706            .conn
19707            .query_map_collect(
19708                "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
19709                fparams![first.conversation_id],
19710                |row| row.get_typed(0),
19711            )
19712            .unwrap();
19713        assert_eq!(stored_indices, vec![0, 1, 12]);
19714    }
19715
19716    #[test]
19717    fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
19718        use crate::model::types::{Conversation, Message, MessageRole};
19719        use std::path::PathBuf;
19720
19721        fn base_conv(source_path: &str, messages: Vec<Message>) -> Conversation {
19722            Conversation {
19723                id: None,
19724                agent_slug: "codex".into(),
19725                workspace: Some(PathBuf::from("/tmp/workspace")),
19726                external_id: None,
19727                title: Some("Recovered".into()),
19728                source_path: PathBuf::from(source_path),
19729                started_at: Some(1_700_000_000_000),
19730                ended_at: Some(1_700_000_000_999),
19731                approx_tokens: None,
19732                metadata_json: serde_json::Value::Null,
19733                messages,
19734                source_id: "local".into(),
19735                origin_host: None,
19736            }
19737        }
19738
19739        let dir = TempDir::new().unwrap();
19740        let canonical_db = dir.path().join("agent_search.db");
19741        let storage = SqliteStorage::open(&canonical_db).unwrap();
19742
19743        let overlapping_a = base_conv(
19744            "/tmp/shared-history.jsonl",
19745            vec![
19746                Message {
19747                    id: None,
19748                    idx: 0,
19749                    role: MessageRole::User,
19750                    author: None,
19751                    created_at: Some(1_700_000_000_000),
19752                    content: "first".into(),
19753                    extra_json: serde_json::Value::Null,
19754                    snippets: Vec::new(),
19755                },
19756                Message {
19757                    id: None,
19758                    idx: 1,
19759                    role: MessageRole::Agent,
19760                    author: None,
19761                    created_at: Some(1_700_000_000_100),
19762                    content: "second".into(),
19763                    extra_json: serde_json::Value::Null,
19764                    snippets: Vec::new(),
19765                },
19766            ],
19767        );
19768        let overlapping_b = base_conv(
19769            "/tmp/shared-history.jsonl",
19770            vec![
19771                Message {
19772                    id: None,
19773                    idx: 1,
19774                    role: MessageRole::Agent,
19775                    author: None,
19776                    created_at: Some(1_700_000_000_100),
19777                    content: "second".into(),
19778                    extra_json: serde_json::Value::Null,
19779                    snippets: Vec::new(),
19780                },
19781                Message {
19782                    id: None,
19783                    idx: 2,
19784                    role: MessageRole::User,
19785                    author: None,
19786                    created_at: Some(1_700_000_000_200),
19787                    content: "third".into(),
19788                    extra_json: serde_json::Value::Null,
19789                    snippets: Vec::new(),
19790                },
19791            ],
19792        );
19793        let unique = Conversation {
19794            source_path: PathBuf::from("/tmp/unique-history.jsonl"),
19795            messages: vec![Message {
19796                id: None,
19797                idx: 0,
19798                role: MessageRole::User,
19799                author: None,
19800                created_at: Some(1_700_000_001_000),
19801                content: "unique".into(),
19802                extra_json: serde_json::Value::Null,
19803                snippets: Vec::new(),
19804            }],
19805            started_at: Some(1_700_000_001_000),
19806            ended_at: Some(1_700_000_001_100),
19807            ..base_conv("/tmp/unique-history.jsonl", Vec::new())
19808        };
19809
19810        seed_historical_db_direct(
19811            &dir.path()
19812                .join("backups/agent_search.db.20260322T020200.bak"),
19813            std::slice::from_ref(&overlapping_a),
19814        );
19815        seed_historical_db_direct(
19816            &dir.path().join("agent_search.corrupt.20260324_212907"),
19817            &[overlapping_b, unique],
19818        );
19819
19820        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19821        assert_eq!(first.bundles_considered, 2);
19822        assert_eq!(first.bundles_imported, 2);
19823        assert_eq!(first.messages_imported, 4);
19824
19825        let conversations = storage.list_conversations(10, 0).unwrap();
19826        assert_eq!(conversations.len(), 2);
19827
19828        let shared_id = conversations
19829            .iter()
19830            .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
19831            .and_then(|conv| conv.id)
19832            .unwrap();
19833        let shared_indices: Vec<i64> = storage
19834            .fetch_messages(shared_id)
19835            .unwrap()
19836            .into_iter()
19837            .map(|msg| msg.idx)
19838            .collect();
19839        assert_eq!(shared_indices, vec![0, 1, 2]);
19840
19841        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19842        assert_eq!(second.bundles_imported, 0);
19843        assert_eq!(second.messages_imported, 0);
19844    }
19845
19846    #[test]
19847    fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
19848        use crate::model::types::{Conversation, Message, MessageRole};
19849        use std::path::PathBuf;
19850
19851        let dir = TempDir::new().unwrap();
19852        let canonical_db = dir.path().join("agent_search.db");
19853        let storage = SqliteStorage::open(&canonical_db).unwrap();
19854
19855        let host_only_remote = Conversation {
19856            id: None,
19857            agent_slug: "codex".into(),
19858            workspace: Some(PathBuf::from("/tmp/workspace")),
19859            external_id: None,
19860            title: Some("Recovered Host Only Remote".into()),
19861            source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
19862            started_at: Some(1_700_000_000_000),
19863            ended_at: Some(1_700_000_000_999),
19864            approx_tokens: None,
19865            metadata_json: serde_json::Value::Null,
19866            messages: vec![Message {
19867                id: None,
19868                idx: 0,
19869                role: MessageRole::User,
19870                author: None,
19871                created_at: Some(1_700_000_000_000),
19872                content: "host-only remote".into(),
19873                extra_json: serde_json::Value::Null,
19874                snippets: Vec::new(),
19875            }],
19876            source_id: "   ".into(),
19877            origin_host: Some("builder-5".into()),
19878        };
19879
19880        let historical_db = dir
19881            .path()
19882            .join("backups/agent_search.db.20260322T020200.bak");
19883        seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
19884
19885        let historical_conn =
19886            FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
19887        historical_conn
19888            .execute_compat(
19889                "INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
19890                fparams!["   ", "ssh", "builder-5", 0_i64, 0_i64],
19891            )
19892            .unwrap();
19893        historical_conn
19894            .execute_compat(
19895                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
19896                fparams!["   ", "builder-5", "/tmp/host-only-history.jsonl"],
19897            )
19898            .unwrap();
19899        historical_conn
19900            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
19901            .unwrap();
19902        drop(historical_conn);
19903
19904        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19905        assert_eq!(first.bundles_imported, 1);
19906        assert_eq!(first.messages_imported, 1);
19907
19908        let source_ids = storage.get_source_ids().unwrap();
19909        assert_eq!(source_ids, vec!["builder-5".to_string()]);
19910
19911        let conversations = storage.list_conversations(10, 0).unwrap();
19912        assert_eq!(conversations.len(), 1);
19913        assert_eq!(conversations[0].source_id, "builder-5");
19914        assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
19915    }
19916
19917    #[test]
19918    fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
19919        use crate::model::types::{Conversation, Message, MessageRole};
19920        use std::path::PathBuf;
19921
19922        let mut attempts: Vec<Vec<usize>> = Vec::new();
19923        let entry = HistoricalBatchEntry {
19924            source_row_id: 77,
19925            agent_id: 1,
19926            workspace_id: None,
19927            conversation: Conversation {
19928                id: None,
19929                agent_slug: "gemini".into(),
19930                workspace: Some(PathBuf::from("/tmp/workspace")),
19931                external_id: Some("conv-77".into()),
19932                title: Some("Large recovered conversation".into()),
19933                source_path: PathBuf::from("/tmp/history.jsonl"),
19934                started_at: Some(1_700_000_000_000),
19935                ended_at: Some(1_700_000_000_999),
19936                approx_tokens: None,
19937                metadata_json: serde_json::Value::Null,
19938                messages: (0..4)
19939                    .map(|idx| Message {
19940                        id: None,
19941                        idx,
19942                        role: MessageRole::User,
19943                        author: None,
19944                        created_at: Some(1_700_000_000_000 + idx),
19945                        content: format!("message-{idx}"),
19946                        extra_json: serde_json::Value::Null,
19947                        snippets: Vec::new(),
19948                    })
19949                    .collect(),
19950                source_id: LOCAL_SOURCE_ID.into(),
19951                origin_host: None,
19952            },
19953        };
19954
19955        let totals = SqliteStorage::import_historical_batch_with_retry(
19956            std::slice::from_ref(&entry),
19957            &mut |batch| {
19958                attempts.push(
19959                    batch
19960                        .iter()
19961                        .map(|entry| entry.conversation.messages.len())
19962                        .collect(),
19963                );
19964                let total_messages: usize = batch
19965                    .iter()
19966                    .map(|entry| entry.conversation.messages.len())
19967                    .sum();
19968                if total_messages > 1 {
19969                    Err(anyhow!("out of memory"))
19970                } else {
19971                    Ok(HistoricalBatchImportTotals {
19972                        inserted_source_rows: batch.len(),
19973                        inserted_messages: total_messages,
19974                    })
19975                }
19976            },
19977        )
19978        .unwrap();
19979
19980        assert_eq!(
19981            totals,
19982            HistoricalBatchImportTotals {
19983                inserted_source_rows: 1,
19984                inserted_messages: 4,
19985            }
19986        );
19987        assert_eq!(attempts.first().cloned(), Some(vec![4]));
19988        assert!(
19989            attempts.iter().filter(|sizes| sizes == &&vec![1]).count() >= 4,
19990            "expected recursive fallback to reach one-message slices"
19991        );
19992    }
19993
19994    #[test]
19995    fn salvage_historical_databases_resumes_from_progress_checkpoint() {
19996        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19997        use std::path::PathBuf;
19998
19999        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
20000            Conversation {
20001                id: None,
20002                agent_slug: "codex".into(),
20003                workspace: Some(PathBuf::from("/tmp/workspace")),
20004                external_id: Some(format!("conv-{idx_seed}")),
20005                title: Some(format!("Recovered {idx_seed}")),
20006                source_path: PathBuf::from(source_path),
20007                started_at: Some(1_700_000_000_000 + idx_seed),
20008                ended_at: Some(1_700_000_000_100 + idx_seed),
20009                approx_tokens: None,
20010                metadata_json: serde_json::Value::Null,
20011                messages: vec![Message {
20012                    id: None,
20013                    idx: 0,
20014                    role: MessageRole::User,
20015                    author: None,
20016                    created_at: Some(1_700_000_000_000 + idx_seed),
20017                    content: format!("message-{idx_seed}"),
20018                    extra_json: serde_json::Value::Null,
20019                    snippets: Vec::new(),
20020                }],
20021                source_id: LOCAL_SOURCE_ID.into(),
20022                origin_host: None,
20023            }
20024        }
20025
20026        let dir = TempDir::new().unwrap();
20027        let canonical_db = dir.path().join("agent_search.db");
20028        let backup_db = dir
20029            .path()
20030            .join("backups/agent_search.db.20260322T020200.bak");
20031        let storage = SqliteStorage::open(&canonical_db).unwrap();
20032        let conv_a = make_conv("/tmp/one.jsonl", 1);
20033        let conv_b = make_conv("/tmp/two.jsonl", 2);
20034        let conv_c = make_conv("/tmp/three.jsonl", 3);
20035        seed_historical_db_direct(
20036            &backup_db,
20037            &[conv_a.clone(), conv_b.clone(), conv_c.clone()],
20038        );
20039
20040        let agent = Agent {
20041            id: None,
20042            slug: "codex".into(),
20043            name: "Codex".into(),
20044            version: Some("0.2.3".into()),
20045            kind: AgentKind::Cli,
20046        };
20047        let agent_id = storage.ensure_agent(&agent).unwrap();
20048        storage
20049            .insert_conversation_tree(agent_id, None, &conv_a)
20050            .unwrap();
20051
20052        let bundle = discover_historical_database_bundles(&canonical_db)
20053            .into_iter()
20054            .find(|bundle| bundle.root_path == backup_db)
20055            .unwrap();
20056        let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
20057            .unwrap()
20058            .query_row_map(
20059                "SELECT id FROM conversations WHERE source_path = ?1",
20060                fparams!["/tmp/one.jsonl"],
20061                |row| row.get_typed(0),
20062            )
20063            .unwrap();
20064        storage
20065            .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
20066            .unwrap();
20067
20068        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
20069        assert_eq!(outcome.bundles_imported, 1);
20070        assert_eq!(outcome.conversations_imported, 52);
20071        assert_eq!(outcome.messages_imported, 101);
20072        assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);
20073
20074        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
20075        let progress_left: Option<String> = storage
20076            .conn
20077            .query_row_map(
20078                "SELECT value FROM meta WHERE key = ?1",
20079                fparams![progress_key.as_str()],
20080                |row| row.get_typed(0),
20081            )
20082            .optional()
20083            .unwrap();
20084        assert!(
20085            progress_left.is_none(),
20086            "completed salvage should clear bundle progress"
20087        );
20088
20089        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
20090        assert_eq!(second.bundles_imported, 0);
20091        assert_eq!(second.messages_imported, 0);
20092    }
20093
20094    #[test]
20095    fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
20096        // Regression for issue #247 (coding_agent_session_search-r8pcy): a bundle
20097        // whose progress checkpoint already covers the backup's entire conversation
20098        // row-id space (daemon OOM-killed after the last batch committed but before
20099        // the completion ledger marker landed) must be ledgered + skipped, not
20100        // re-scanned O(n) with imported=0 every batch.
20101        use crate::model::types::{Conversation, Message, MessageRole};
20102        use std::path::PathBuf;
20103
20104        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
20105            Conversation {
20106                id: None,
20107                agent_slug: "codex".into(),
20108                workspace: Some(PathBuf::from("/tmp/workspace")),
20109                external_id: Some(format!("conv-{idx_seed}")),
20110                title: Some(format!("Recovered {idx_seed}")),
20111                source_path: PathBuf::from(source_path),
20112                started_at: Some(1_700_000_000_000 + idx_seed),
20113                ended_at: Some(1_700_000_000_100 + idx_seed),
20114                approx_tokens: None,
20115                metadata_json: serde_json::Value::Null,
20116                messages: vec![Message {
20117                    id: None,
20118                    idx: 0,
20119                    role: MessageRole::User,
20120                    author: None,
20121                    created_at: Some(1_700_000_000_000 + idx_seed),
20122                    content: format!("message-{idx_seed}"),
20123                    extra_json: serde_json::Value::Null,
20124                    snippets: Vec::new(),
20125                }],
20126                source_id: LOCAL_SOURCE_ID.into(),
20127                origin_host: None,
20128            }
20129        }
20130
20131        let dir = TempDir::new().unwrap();
20132        let canonical_db = dir.path().join("agent_search.db");
20133        let backup_db = dir
20134            .path()
20135            .join("backups/agent_search.db.20260322T020200.bak");
20136        let storage = SqliteStorage::open(&canonical_db).unwrap();
20137        seed_historical_db_direct(
20138            &backup_db,
20139            &[
20140                make_conv("/tmp/one.jsonl", 1),
20141                make_conv("/tmp/two.jsonl", 2),
20142                make_conv("/tmp/three.jsonl", 3),
20143            ],
20144        );
20145
20146        let bundle = discover_historical_database_bundles(&canonical_db)
20147            .into_iter()
20148            .find(|bundle| bundle.root_path == backup_db)
20149            .unwrap();
20150
20151        // Checkpoint high-water mark == backup's max conversation id.
20152        let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
20153            .unwrap()
20154            .query_row_map(
20155                "SELECT COALESCE(MAX(id), 0) FROM conversations",
20156                fparams![],
20157                |row| row.get_typed(0),
20158            )
20159            .unwrap();
20160        assert!(backup_max_id > 0, "seeded backup should have conversations");
20161        storage
20162            .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
20163            .unwrap();
20164
20165        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
20166        assert_eq!(
20167            outcome.bundles_imported, 0,
20168            "fully-checkpointed bundle must not be re-scanned"
20169        );
20170        assert_eq!(outcome.conversations_imported, 0);
20171        assert_eq!(outcome.messages_imported, 0);
20172        assert_eq!(
20173            storage.list_conversations(10, 0).unwrap().len(),
20174            0,
20175            "skip path must not import anything"
20176        );
20177        assert!(
20178            storage.historical_bundle_already_imported(&bundle).unwrap(),
20179            "skipped bundle must be ledgered as salvaged so future runs short-circuit"
20180        );
20181
20182        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
20183        let progress_left: Option<String> = storage
20184            .conn
20185            .query_row_map(
20186                "SELECT value FROM meta WHERE key = ?1",
20187                fparams![progress_key.as_str()],
20188                |row| row.get_typed(0),
20189            )
20190            .optional()
20191            .unwrap();
20192        assert!(
20193            progress_left.is_none(),
20194            "skip path must clear the bundle progress checkpoint"
20195        );
20196    }
20197
20198    #[test]
20199    fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
20200        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20201        use std::path::PathBuf;
20202
20203        let dir = TempDir::new().unwrap();
20204        let db_path = dir.path().join("agent_search.db");
20205        let storage = SqliteStorage::open(&db_path).unwrap();
20206        let agent = Agent {
20207            id: None,
20208            slug: "codex".into(),
20209            name: "Codex".into(),
20210            version: Some("0.2.3".into()),
20211            kind: AgentKind::Cli,
20212        };
20213        let agent_id = storage.ensure_agent(&agent).unwrap();
20214
20215        let make_conv = |source_path: &str, started_at: i64| Conversation {
20216            id: None,
20217            agent_slug: "codex".into(),
20218            workspace: Some(PathBuf::from("/tmp/workspace")),
20219            external_id: Some(source_path.to_string()),
20220            title: Some(source_path.to_string()),
20221            source_path: PathBuf::from(source_path),
20222            started_at: Some(started_at),
20223            ended_at: Some(started_at + 1),
20224            approx_tokens: None,
20225            metadata_json: serde_json::Value::Null,
20226            messages: vec![Message {
20227                id: None,
20228                idx: 0,
20229                role: MessageRole::User,
20230                author: None,
20231                created_at: Some(started_at),
20232                content: format!("message for {source_path}"),
20233                extra_json: serde_json::Value::Null,
20234                snippets: Vec::new(),
20235            }],
20236            source_id: LOCAL_SOURCE_ID.into(),
20237            origin_host: None,
20238        };
20239
20240        let conv_a = make_conv("/tmp/a.jsonl", 3_000);
20241        let conv_b = make_conv("/tmp/b.jsonl", 1_000);
20242        let conv_c = make_conv("/tmp/c.jsonl", 2_000);
20243
20244        storage
20245            .insert_conversation_tree(agent_id, None, &conv_a)
20246            .unwrap();
20247        storage
20248            .insert_conversation_tree(agent_id, None, &conv_b)
20249            .unwrap();
20250        storage
20251            .insert_conversation_tree(agent_id, None, &conv_c)
20252            .unwrap();
20253
20254        let user_order: Vec<PathBuf> = storage
20255            .list_conversations(10, 0)
20256            .unwrap()
20257            .into_iter()
20258            .map(|conv| conv.source_path)
20259            .collect();
20260        assert_eq!(
20261            user_order,
20262            vec![
20263                PathBuf::from("/tmp/a.jsonl"),
20264                PathBuf::from("/tmp/c.jsonl"),
20265                PathBuf::from("/tmp/b.jsonl"),
20266            ]
20267        );
20268
20269        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20270        let rebuild_order: Vec<PathBuf> = storage
20271            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
20272            .unwrap()
20273            .into_iter()
20274            .map(|conv| conv.source_path)
20275            .collect();
20276        assert_eq!(
20277            rebuild_order,
20278            vec![
20279                PathBuf::from("/tmp/a.jsonl"),
20280                PathBuf::from("/tmp/b.jsonl"),
20281                PathBuf::from("/tmp/c.jsonl"),
20282            ]
20283        );
20284
20285        let first_page = storage
20286            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
20287            .unwrap();
20288        let first_page_paths: Vec<PathBuf> = first_page
20289            .iter()
20290            .map(|conv| conv.source_path.clone())
20291            .collect();
20292        assert_eq!(
20293            first_page_paths,
20294            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
20295        );
20296
20297        let second_page = storage
20298            .list_conversations_for_lexical_rebuild_after_id(
20299                2,
20300                first_page
20301                    .last()
20302                    .and_then(|conv| conv.id)
20303                    .expect("first page should include an id"),
20304                &agent_slugs,
20305                &workspace_paths,
20306            )
20307            .unwrap();
20308        let second_page_paths: Vec<PathBuf> = second_page
20309            .iter()
20310            .map(|conv| conv.source_path.clone())
20311            .collect();
20312        assert_eq!(second_page_paths, vec![PathBuf::from("/tmp/c.jsonl")]);
20313
20314        let bounded_page = storage
20315            .list_conversations_for_lexical_rebuild_after_id_through_id(
20316                10,
20317                0,
20318                first_page
20319                    .last()
20320                    .and_then(|conv| conv.id)
20321                    .expect("first page should include an id"),
20322                &agent_slugs,
20323                &workspace_paths,
20324            )
20325            .unwrap();
20326        let bounded_paths: Vec<PathBuf> = bounded_page
20327            .iter()
20328            .map(|conv| conv.source_path.clone())
20329            .collect();
20330        assert_eq!(
20331            bounded_paths,
20332            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
20333        );
20334    }
20335
20336    #[test]
20337    fn keyset_traversal_handles_sparse_holey_conversation_ids() {
20338        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20339        use std::path::PathBuf;
20340
20341        let dir = TempDir::new().unwrap();
20342        let db_path = dir.path().join("agent_search.db");
20343        let storage = SqliteStorage::open(&db_path).unwrap();
20344        let agent = Agent {
20345            id: None,
20346            slug: "codex".into(),
20347            name: "Codex".into(),
20348            version: Some("0.2.3".into()),
20349            kind: AgentKind::Cli,
20350        };
20351        let agent_id = storage.ensure_agent(&agent).unwrap();
20352
20353        let make_conv = |label: &str, ts: i64| Conversation {
20354            id: None,
20355            agent_slug: "codex".into(),
20356            workspace: Some(PathBuf::from("/tmp/workspace")),
20357            external_id: Some(label.to_string()),
20358            title: Some(label.to_string()),
20359            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20360            started_at: Some(ts),
20361            ended_at: Some(ts + 1),
20362            approx_tokens: None,
20363            metadata_json: serde_json::Value::Null,
20364            messages: vec![Message {
20365                id: None,
20366                idx: 0,
20367                role: MessageRole::User,
20368                author: None,
20369                created_at: Some(ts),
20370                content: format!("msg for {label}"),
20371                extra_json: serde_json::Value::Null,
20372                snippets: Vec::new(),
20373            }],
20374            source_id: LOCAL_SOURCE_ID.into(),
20375            origin_host: None,
20376        };
20377
20378        for i in 0..6 {
20379            storage
20380                .insert_conversation_tree(
20381                    agent_id,
20382                    None,
20383                    &make_conv(&format!("conv-{i}"), 1000 + i),
20384                )
20385                .unwrap();
20386        }
20387
20388        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20389        storage
20390            .conn
20391            .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
20392            .unwrap();
20393        storage
20394            .conn
20395            .execute_compat(
20396                "DELETE FROM messages WHERE conversation_id IN (2, 4)",
20397                fparams![],
20398            )
20399            .unwrap();
20400        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20401
20402        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20403
20404        let page1 = storage
20405            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
20406            .unwrap();
20407        assert_eq!(page1.len(), 2);
20408        let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
20409        assert_eq!(page1_ids, vec![1, 3]);
20410
20411        let page2 = storage
20412            .list_conversations_for_lexical_rebuild_after_id(
20413                2,
20414                *page1_ids.last().unwrap(),
20415                &agent_slugs,
20416                &workspace_paths,
20417            )
20418            .unwrap();
20419        assert_eq!(page2.len(), 2);
20420        let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
20421        assert_eq!(page2_ids, vec![5, 6]);
20422
20423        let page3 = storage
20424            .list_conversations_for_lexical_rebuild_after_id(
20425                2,
20426                *page2_ids.last().unwrap(),
20427                &agent_slugs,
20428                &workspace_paths,
20429            )
20430            .unwrap();
20431        assert!(page3.is_empty());
20432
20433        let all_ids: Vec<i64> = page1_ids.iter().chain(page2_ids.iter()).copied().collect();
20434        assert_eq!(all_ids, vec![1, 3, 5, 6]);
20435    }
20436
20437    #[test]
20438    fn keyset_traversal_through_id_with_sparse_ranges() {
20439        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20440        use std::path::PathBuf;
20441
20442        let dir = TempDir::new().unwrap();
20443        let db_path = dir.path().join("agent_search.db");
20444        let storage = SqliteStorage::open(&db_path).unwrap();
20445        let agent = Agent {
20446            id: None,
20447            slug: "codex".into(),
20448            name: "Codex".into(),
20449            version: Some("0.2.3".into()),
20450            kind: AgentKind::Cli,
20451        };
20452        let agent_id = storage.ensure_agent(&agent).unwrap();
20453
20454        let make_conv = |label: &str, ts: i64| Conversation {
20455            id: None,
20456            agent_slug: "codex".into(),
20457            workspace: Some(PathBuf::from("/tmp/workspace")),
20458            external_id: Some(label.to_string()),
20459            title: Some(label.to_string()),
20460            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20461            started_at: Some(ts),
20462            ended_at: Some(ts + 1),
20463            approx_tokens: None,
20464            metadata_json: serde_json::Value::Null,
20465            messages: vec![Message {
20466                id: None,
20467                idx: 0,
20468                role: MessageRole::User,
20469                author: None,
20470                created_at: Some(ts),
20471                content: format!("msg for {label}"),
20472                extra_json: serde_json::Value::Null,
20473                snippets: Vec::new(),
20474            }],
20475            source_id: LOCAL_SOURCE_ID.into(),
20476            origin_host: None,
20477        };
20478
20479        for i in 0..10 {
20480            storage
20481                .insert_conversation_tree(
20482                    agent_id,
20483                    None,
20484                    &make_conv(&format!("conv-{i}"), 1000 + i),
20485                )
20486                .unwrap();
20487        }
20488
20489        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20490        storage
20491            .conn
20492            .execute_compat(
20493                "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
20494                fparams![],
20495            )
20496            .unwrap();
20497        storage
20498            .conn
20499            .execute_compat(
20500                "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
20501                fparams![],
20502            )
20503            .unwrap();
20504        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20505
20506        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20507
20508        let through_5 = storage
20509            .list_conversations_for_lexical_rebuild_after_id_through_id(
20510                100,
20511                0,
20512                5,
20513                &agent_slugs,
20514                &workspace_paths,
20515            )
20516            .unwrap();
20517        let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
20518        assert_eq!(through_5_ids, vec![1, 2, 4]);
20519
20520        let after_4_through_10 = storage
20521            .list_conversations_for_lexical_rebuild_after_id_through_id(
20522                100,
20523                4,
20524                10,
20525                &agent_slugs,
20526                &workspace_paths,
20527            )
20528            .unwrap();
20529        let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
20530        assert_eq!(ids, vec![6, 9, 10]);
20531
20532        let after_10 = storage
20533            .list_conversations_for_lexical_rebuild_after_id_through_id(
20534                100,
20535                10,
20536                20,
20537                &agent_slugs,
20538                &workspace_paths,
20539            )
20540            .unwrap();
20541        assert!(after_10.is_empty());
20542    }
20543
20544    #[test]
20545    fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
20546     {
20547        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20548        use std::path::PathBuf;
20549
20550        let dir = TempDir::new().unwrap();
20551        let db_path = dir.path().join("agent_search.db");
20552        let storage = SqliteStorage::open(&db_path).unwrap();
20553        let agent = Agent {
20554            id: None,
20555            slug: "codex".into(),
20556            name: "Codex".into(),
20557            version: Some("0.2.3".into()),
20558            kind: AgentKind::Cli,
20559        };
20560        let agent_id = storage.ensure_agent(&agent).unwrap();
20561
20562        let insert = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
20563            storage
20564                .insert_conversation_tree(
20565                    agent_id,
20566                    None,
20567                    &Conversation {
20568                        id: None,
20569                        agent_slug: "codex".into(),
20570                        workspace: Some(PathBuf::from("/tmp/workspace")),
20571                        external_id: Some(external_id.to_string()),
20572                        title: Some(external_id.to_string()),
20573                        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
20574                        started_at: Some(base_ts),
20575                        ended_at: Some(base_ts + 100),
20576                        approx_tokens: None,
20577                        metadata_json: serde_json::Value::Null,
20578                        messages,
20579                        source_id: LOCAL_SOURCE_ID.into(),
20580                        origin_host: None,
20581                    },
20582                )
20583                .unwrap()
20584                .conversation_id
20585        };
20586
20587        let ascii_id = insert(
20588            "footprint-ascii",
20589            1_700_000_000_000,
20590            vec![
20591                Message {
20592                    id: None,
20593                    idx: 0,
20594                    role: MessageRole::User,
20595                    author: None,
20596                    created_at: Some(1_700_000_000_001),
20597                    content: "abc".into(),
20598                    extra_json: serde_json::Value::Null,
20599                    snippets: Vec::new(),
20600                },
20601                Message {
20602                    id: None,
20603                    idx: 1,
20604                    role: MessageRole::Agent,
20605                    author: None,
20606                    created_at: Some(1_700_000_000_002),
20607                    content: "defg".into(),
20608                    extra_json: serde_json::Value::Null,
20609                    snippets: Vec::new(),
20610                },
20611            ],
20612        );
20613        let empty_id = insert("footprint-empty", 1_700_000_001_000, Vec::new());
20614        let utf8_id = insert(
20615            "footprint-utf8",
20616            1_700_000_002_000,
20617            vec![Message {
20618                id: None,
20619                idx: 0,
20620                role: MessageRole::Tool,
20621                author: None,
20622                created_at: Some(1_700_000_002_001),
20623                content: "hé🙂".into(),
20624                extra_json: serde_json::Value::Null,
20625                snippets: Vec::new(),
20626            }],
20627        );
20628        let sparse_id = insert(
20629            "footprint-sparse",
20630            1_700_000_003_000,
20631            vec![Message {
20632                id: None,
20633                idx: 10,
20634                role: MessageRole::User,
20635                author: None,
20636                created_at: Some(1_700_000_003_010),
20637                content: "sparse".into(),
20638                extra_json: serde_json::Value::Null,
20639                snippets: Vec::new(),
20640            }],
20641        );
20642        storage
20643            .conn
20644            .execute_compat(
20645                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20646                fparams![utf8_id],
20647            )
20648            .unwrap();
20649
20650        let footprints = storage
20651            .list_conversation_footprints_for_lexical_rebuild()
20652            .unwrap();
20653        assert_eq!(
20654            footprints,
20655            vec![
20656                LexicalRebuildConversationFootprintRow {
20657                    conversation_id: ascii_id,
20658                    message_count: 2,
20659                    message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20660                },
20661                LexicalRebuildConversationFootprintRow {
20662                    conversation_id: empty_id,
20663                    message_count: 0,
20664                    message_bytes: 0,
20665                },
20666                LexicalRebuildConversationFootprintRow {
20667                    conversation_id: utf8_id,
20668                    message_count: 1,
20669                    message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20670                },
20671                LexicalRebuildConversationFootprintRow {
20672                    conversation_id: sparse_id,
20673                    message_count: 11,
20674                    message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20675                },
20676            ]
20677        );
20678    }
20679
20680    #[test]
20681    fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
20682        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20683        use std::path::PathBuf;
20684
20685        let dir = TempDir::new().unwrap();
20686        let db_path = dir.path().join("agent_search.db");
20687        let storage = SqliteStorage::open(&db_path).unwrap();
20688        let agent = Agent {
20689            id: None,
20690            slug: "codex".into(),
20691            name: "Codex".into(),
20692            version: Some("0.2.3".into()),
20693            kind: AgentKind::Cli,
20694        };
20695        let agent_id = storage.ensure_agent(&agent).unwrap();
20696        let conversation_id = storage
20697            .insert_conversation_tree(
20698                agent_id,
20699                None,
20700                &Conversation {
20701                    id: None,
20702                    agent_slug: "codex".into(),
20703                    workspace: Some(PathBuf::from("/tmp/workspace")),
20704                    external_id: Some("footprint-missing-tail".to_string()),
20705                    title: Some("footprint-missing-tail".to_string()),
20706                    source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
20707                    started_at: Some(1_700_000_000_000),
20708                    ended_at: Some(1_700_000_000_100),
20709                    approx_tokens: None,
20710                    metadata_json: serde_json::Value::Null,
20711                    messages: vec![Message {
20712                        id: None,
20713                        idx: 10,
20714                        role: MessageRole::User,
20715                        author: None,
20716                        created_at: Some(1_700_000_000_010),
20717                        content: "legacy sparse tail".into(),
20718                        extra_json: serde_json::Value::Null,
20719                        snippets: Vec::new(),
20720                    }],
20721                    source_id: LOCAL_SOURCE_ID.into(),
20722                    origin_host: None,
20723                },
20724            )
20725            .unwrap()
20726            .conversation_id;
20727
20728        storage
20729            .conn
20730            .execute_compat(
20731                "UPDATE conversations
20732                 SET last_message_idx = NULL, last_message_created_at = NULL
20733                 WHERE id = ?1",
20734                fparams![conversation_id],
20735            )
20736            .unwrap();
20737        storage
20738            .conn
20739            .execute_compat(
20740                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20741                fparams![conversation_id],
20742            )
20743            .unwrap();
20744
20745        let footprints = storage
20746            .list_conversation_footprints_for_lexical_rebuild()
20747            .unwrap();
20748
20749        assert_eq!(
20750            footprints,
20751            vec![LexicalRebuildConversationFootprintRow {
20752                conversation_id,
20753                message_count: 11,
20754                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20755            }],
20756            "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
20757        );
20758    }
20759
20760    #[test]
20761    fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
20762        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20763        use std::path::PathBuf;
20764
20765        let dir = TempDir::new().unwrap();
20766        let db_path = dir.path().join("agent_search.db");
20767        let storage = SqliteStorage::open(&db_path).unwrap();
20768        let agent = Agent {
20769            id: None,
20770            slug: "codex".into(),
20771            name: "Codex".into(),
20772            version: Some("0.2.3".into()),
20773            kind: AgentKind::Cli,
20774        };
20775        let agent_id = storage.ensure_agent(&agent).unwrap();
20776        let conversation_id = storage
20777            .insert_conversation_tree(
20778                agent_id,
20779                None,
20780                &Conversation {
20781                    id: None,
20782                    agent_slug: "codex".into(),
20783                    workspace: Some(PathBuf::from("/tmp/workspace")),
20784                    external_id: Some("footprint-stale-tail".to_string()),
20785                    title: Some("footprint-stale-tail".to_string()),
20786                    source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
20787                    started_at: Some(1_700_000_000_000),
20788                    ended_at: Some(1_700_000_000_100),
20789                    approx_tokens: None,
20790                    metadata_json: serde_json::Value::Null,
20791                    messages: (0..3)
20792                        .map(|idx| Message {
20793                            id: None,
20794                            idx,
20795                            role: MessageRole::User,
20796                            author: None,
20797                            created_at: Some(1_700_000_000_010 + idx),
20798                            content: format!("message {idx}"),
20799                            extra_json: serde_json::Value::Null,
20800                            snippets: Vec::new(),
20801                        })
20802                        .collect(),
20803                    source_id: LOCAL_SOURCE_ID.into(),
20804                    origin_host: None,
20805                },
20806            )
20807            .unwrap()
20808            .conversation_id;
20809
20810        storage
20811            .conn
20812            .execute_compat(
20813                "UPDATE conversations
20814                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20815                 WHERE id = ?1",
20816                fparams![conversation_id],
20817            )
20818            .unwrap();
20819        storage
20820            .conn
20821            .execute_compat(
20822                "UPDATE conversation_tail_state
20823                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20824                 WHERE conversation_id = ?1",
20825                fparams![conversation_id],
20826            )
20827            .unwrap();
20828
20829        let footprints = storage
20830            .list_conversation_footprints_for_lexical_rebuild()
20831            .unwrap();
20832
20833        assert_eq!(
20834            footprints,
20835            vec![LexicalRebuildConversationFootprintRow {
20836                conversation_id,
20837                message_count: 3,
20838                message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20839            }],
20840            "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
20841        );
20842    }
20843
20844    #[test]
20845    fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
20846        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20847        use std::path::PathBuf;
20848
20849        let dir = TempDir::new().unwrap();
20850        let db_path = dir.path().join("agent_search.db");
20851        let storage = SqliteStorage::open(&db_path).unwrap();
20852        let agent = Agent {
20853            id: None,
20854            slug: "codex".into(),
20855            name: "Codex".into(),
20856            version: Some("0.2.3".into()),
20857            kind: AgentKind::Cli,
20858        };
20859        let agent_id = storage.ensure_agent(&agent).unwrap();
20860        let conversation_id = storage
20861            .insert_conversation_tree(
20862                agent_id,
20863                None,
20864                &Conversation {
20865                    id: None,
20866                    agent_slug: "codex".into(),
20867                    workspace: Some(PathBuf::from("/tmp/workspace")),
20868                    external_id: Some("footprint-missing-tail-table".to_string()),
20869                    title: Some("footprint-missing-tail-table".to_string()),
20870                    source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
20871                    started_at: Some(1_700_000_000_000),
20872                    ended_at: Some(1_700_000_000_100),
20873                    approx_tokens: None,
20874                    metadata_json: serde_json::Value::Null,
20875                    messages: vec![Message {
20876                        id: None,
20877                        idx: 10,
20878                        role: MessageRole::User,
20879                        author: None,
20880                        created_at: Some(1_700_000_000_010),
20881                        content: "legacy sparse tail without hot table".into(),
20882                        extra_json: serde_json::Value::Null,
20883                        snippets: Vec::new(),
20884                    }],
20885                    source_id: LOCAL_SOURCE_ID.into(),
20886                    origin_host: None,
20887                },
20888            )
20889            .unwrap()
20890            .conversation_id;
20891
20892        storage
20893            .conn
20894            .execute_compat(
20895                "UPDATE conversations
20896                 SET last_message_idx = NULL, last_message_created_at = NULL
20897                 WHERE id = ?1",
20898                fparams![conversation_id],
20899            )
20900            .unwrap();
20901        storage
20902            .conn
20903            .execute_compat("DROP TABLE conversation_tail_state", fparams![])
20904            .unwrap();
20905
20906        let footprints = storage
20907            .list_conversation_footprints_for_lexical_rebuild()
20908            .unwrap();
20909
20910        assert_eq!(
20911            footprints,
20912            vec![LexicalRebuildConversationFootprintRow {
20913                conversation_id,
20914                message_count: 11,
20915                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20916            }],
20917            "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
20918        );
20919    }
20920
20921    #[test]
20922    fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
20923        let fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
20924            .join("tests")
20925            .join("fixtures")
20926            .join("search_demo_data")
20927            .join("agent_search.db");
20928        let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();
20929
20930        let footprints = storage
20931            .list_conversation_footprints_for_lexical_rebuild()
20932            .unwrap();
20933
20934        assert!(
20935            !footprints.is_empty(),
20936            "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
20937        );
20938        assert!(
20939            footprints
20940                .iter()
20941                .all(|footprint| footprint.message_count > 0),
20942            "legacy fixture conversations should derive message counts from messages when tail caches are absent"
20943        );
20944    }
20945
20946    #[test]
20947    fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
20948        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20949        use std::path::PathBuf;
20950
20951        let dir = TempDir::new().unwrap();
20952        let db_path = dir.path().join("agent_search.db");
20953        let storage = SqliteStorage::open(&db_path).unwrap();
20954        let agent = Agent {
20955            id: None,
20956            slug: "codex".into(),
20957            name: "Codex".into(),
20958            version: Some("0.2.3".into()),
20959            kind: AgentKind::Cli,
20960        };
20961        let agent_id = storage.ensure_agent(&agent).unwrap();
20962        let conversation = Conversation {
20963            id: None,
20964            agent_slug: "codex".into(),
20965            workspace: Some(PathBuf::from("/tmp/workspace")),
20966            external_id: Some("legacy-blank-source".into()),
20967            title: Some("Legacy blank source".into()),
20968            source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
20969            started_at: Some(1_700_000_000_000),
20970            ended_at: Some(1_700_000_000_100),
20971            approx_tokens: None,
20972            metadata_json: serde_json::Value::Null,
20973            messages: vec![Message {
20974                id: None,
20975                idx: 0,
20976                role: MessageRole::User,
20977                author: None,
20978                created_at: Some(1_700_000_000_000),
20979                content: "hello".into(),
20980                extra_json: serde_json::Value::Null,
20981                snippets: Vec::new(),
20982            }],
20983            source_id: LOCAL_SOURCE_ID.into(),
20984            origin_host: None,
20985        };
20986
20987        let conversation_id = storage
20988            .insert_conversation_tree(agent_id, None, &conversation)
20989            .unwrap()
20990            .conversation_id;
20991        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20992        storage
20993            .conn
20994            .execute_compat(
20995                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
20996                fparams!["   ", "dev@laptop", conversation_id],
20997            )
20998            .unwrap();
20999        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
21000
21001        let listed = storage.list_conversations(10, 0).unwrap();
21002        assert_eq!(listed.len(), 1);
21003        assert_eq!(listed[0].source_id, "dev@laptop");
21004        assert_eq!(listed[0].origin_host.as_deref(), Some("dev@laptop"));
21005
21006        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
21007        let rebuild_listed = storage
21008            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
21009            .unwrap();
21010        assert_eq!(rebuild_listed.len(), 1);
21011        assert_eq!(rebuild_listed[0].source_id, "dev@laptop");
21012        assert_eq!(rebuild_listed[0].origin_host.as_deref(), Some("dev@laptop"));
21013    }
21014
21015    #[test]
21016    fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
21017        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21018        use std::path::PathBuf;
21019
21020        let dir = TempDir::new().unwrap();
21021        let canonical_db = dir.path().join("agent_search.db");
21022        let source_db = dir
21023            .path()
21024            .join("backups/agent_search.db.20260322T020200.bak");
21025
21026        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
21027
21028        let source = SqliteStorage::open(&source_db).unwrap();
21029        let agent = Agent {
21030            id: None,
21031            slug: "codex".into(),
21032            name: "Codex".into(),
21033            version: Some("0.2.3".into()),
21034            kind: AgentKind::Cli,
21035        };
21036        let agent_id = source.ensure_agent(&agent).unwrap();
21037        let conversation = Conversation {
21038            id: None,
21039            agent_slug: "codex".into(),
21040            workspace: Some(PathBuf::from("/tmp/workspace")),
21041            external_id: Some("seed-conv".into()),
21042            title: Some("Historical seed".into()),
21043            source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
21044            started_at: Some(1_700_000_000_000),
21045            ended_at: Some(1_700_000_000_100),
21046            approx_tokens: Some(42),
21047            metadata_json: serde_json::json!({"seed": true}),
21048            messages: vec![Message {
21049                id: None,
21050                idx: 0,
21051                role: MessageRole::Agent,
21052                author: Some("assistant".into()),
21053                created_at: Some(1_700_000_000_050),
21054                content: "seeded message".into(),
21055                extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
21056                snippets: Vec::new(),
21057            }],
21058            source_id: LOCAL_SOURCE_ID.into(),
21059            origin_host: None,
21060        };
21061        source
21062            .insert_conversation_tree(agent_id, None, &conversation)
21063            .unwrap();
21064        source.set_last_scan_ts(123).unwrap();
21065        source.set_last_indexed_at(456).unwrap();
21066        source.set_last_embedded_message_id(789).unwrap();
21067        source
21068            .conn
21069            .execute_compat(
21070                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
21071                fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
21072            )
21073            .unwrap();
21074        drop(source);
21075
21076        #[cfg(not(windows))]
21077        {
21078            // Legacy "duplicate FTS" fixture reconstruction.
21079            //
21080            // Post-V14 migration cass drops the V13-era fts_messages virtual table
21081            // and recreates it lazily, so a freshly-opened canonical DB has zero
21082            // fts_messages entries in sqlite_master. To reproduce the historical
21083            // failure mode this test exercises — a legacy v13 bundle with a
21084            // duplicated CREATE VIRTUAL TABLE row — we have to inject *both*
21085            // entries: the original V13-era contentless row and the buggy duplicate
21086            // row. Before V14 existed the original was already present after
21087            // migration and only the duplicate needed manual injection.
21088            let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
21089            let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
21090            let legacy = rusqlite_test_fixture_conn(&source_db);
21091            legacy
21092                .execute_batch(
21093                    "UPDATE meta SET value = '13' WHERE key = 'schema_version';
21094                     DELETE FROM _schema_migrations WHERE version = 14;
21095                     PRAGMA writable_schema = ON;",
21096                )
21097                .unwrap();
21098            legacy
21099                .execute(
21100                    "DELETE FROM meta WHERE key = ?1",
21101                    [FTS_FRANKEN_REBUILD_META_KEY],
21102                )
21103                .unwrap();
21104            // Inject the V13 original first.
21105            legacy
21106                .execute(
21107                    "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
21108                     VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
21109                    [legacy_v13_fts_sql],
21110                )
21111                .unwrap();
21112            // Then the duplicate that's the real subject of the fixup logic.
21113            legacy
21114                .execute(
21115                    "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
21116                     VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
21117                    [duplicate_legacy_fts_sql],
21118                )
21119                .unwrap();
21120            legacy
21121                .execute_batch("PRAGMA writable_schema = OFF;")
21122                .unwrap();
21123            drop(legacy);
21124
21125            // Verify fixture with rusqlite+writable_schema to see raw
21126            // sqlite_master rows (frankensqlite deduplicates schema entries).
21127            {
21128                let verify = rusqlite_test_fixture_conn(&source_db);
21129                verify
21130                    .execute_batch("PRAGMA writable_schema = ON;")
21131                    .unwrap();
21132                let fts_entries: i64 = verify
21133                    .query_row(
21134                        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21135                        [],
21136                        |row| row.get(0),
21137                    )
21138                    .unwrap();
21139                assert_eq!(
21140                    fts_entries, 2,
21141                    "test fixture should reproduce the duplicate legacy fts_messages rows"
21142                );
21143                let msg_count: i64 = verify
21144                    .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
21145                    .unwrap();
21146                assert_eq!(msg_count, 1);
21147            }
21148        }
21149
21150        let fresh = SqliteStorage::open(&canonical_db).unwrap();
21151        drop(fresh);
21152
21153        let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
21154            .unwrap()
21155            .unwrap();
21156        assert_eq!(outcome.bundles_imported, 1);
21157        assert_eq!(outcome.conversations_imported, 1);
21158        assert_eq!(outcome.messages_imported, 1);
21159
21160        let readonly = open_franken_with_flags(
21161            &canonical_db.to_string_lossy(),
21162            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21163        )
21164        .unwrap();
21165        let readonly_message_count: i64 = readonly
21166            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21167                row.get_typed(0)
21168            })
21169            .unwrap();
21170        assert_eq!(readonly_message_count, 1);
21171
21172        let seeded = SqliteStorage::open(&canonical_db).unwrap();
21173        assert_eq!(
21174            seeded
21175                .count_sessions_in_range(None, None, None, None)
21176                .unwrap()
21177                .0,
21178            1
21179        );
21180        let message_count: i64 = seeded
21181            .conn
21182            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21183                row.get_typed(0)
21184            })
21185            .unwrap();
21186        assert_eq!(message_count, 1);
21187        assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
21188        assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);
21189
21190        let last_indexed: Option<String> = seeded
21191            .conn
21192            .query_row_map(
21193                "SELECT value FROM meta WHERE key = 'last_indexed_at'",
21194                fparams![],
21195                |row| row.get_typed(0),
21196            )
21197            .optional()
21198            .unwrap();
21199        assert!(last_indexed.is_none());
21200
21201        let salvage_keys: Vec<String> = seeded
21202            .conn
21203            .query_map_collect(
21204                "SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
21205                fparams![],
21206                |row| row.get_typed(0),
21207            )
21208            .unwrap();
21209        assert_eq!(salvage_keys.len(), 1);
21210
21211        let reopened_readonly = open_franken_with_flags(
21212            &canonical_db.to_string_lossy(),
21213            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21214        )
21215        .unwrap();
21216        let reopened_fts_entries: i64 = reopened_readonly
21217            .query_row_map(
21218                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21219                fparams![],
21220                |row| row.get_typed(0),
21221            )
21222            .unwrap();
21223        assert_eq!(
21224            reopened_fts_entries, 1,
21225            "seeded canonical db should keep a single stock-SQLite fts_messages schema row"
21226        );
21227        let reopened_message_count: i64 = reopened_readonly
21228            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
21229                row.get_typed(0)
21230            })
21231            .unwrap();
21232        assert_eq!(reopened_message_count, 1);
21233
21234        let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
21235        assert_eq!(
21236            franken_seeded.schema_version().unwrap(),
21237            CURRENT_SCHEMA_VERSION
21238        );
21239        // Post-V14 fts_messages is recreated lazily. `FrankenStorage::open`
21240        // alone doesn't re-register the virtual table for the frankensqlite
21241        // query engine — the consistency pass does, and this is exactly what
21242        // normal cass startup runs before the first search. Invoke it
21243        // explicitly so the query below exercises the expected post-repair
21244        // state rather than the between-steps state.
21245        franken_seeded
21246            .ensure_search_fallback_fts_consistency()
21247            .expect("ensure FTS consistency after seed");
21248        let post_franken_schema_rows: i64 = franken_seeded
21249            .raw()
21250            .query_row_map(
21251                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
21252                fparams![],
21253                |row| row.get_typed(0),
21254            )
21255            .unwrap();
21256        assert_eq!(post_franken_schema_rows, 1);
21257        let fts_probe = franken_seeded
21258            .raw()
21259            .query("SELECT COUNT(*) FROM fts_messages");
21260        assert!(
21261            fts_probe.is_ok(),
21262            "expected post-seed FTS to be queryable, got {fts_probe:?}"
21263        );
21264    }
21265
21266    #[test]
21267    fn failed_baseline_seed_preserves_existing_canonical_bundle() {
21268        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21269        use std::path::PathBuf;
21270
21271        let dir = TempDir::new().unwrap();
21272        let canonical_db = dir.path().join("agent_search.db");
21273        let source_db = dir
21274            .path()
21275            .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
21276
21277        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
21278
21279        let canonical = SqliteStorage::open(&canonical_db).unwrap();
21280        canonical
21281            .conn
21282            .execute_compat(
21283                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
21284                fparams!["sentinel", "keep-me"],
21285            )
21286            .unwrap();
21287        drop(canonical);
21288
21289        let source = SqliteStorage::open(&source_db).unwrap();
21290        let agent = Agent {
21291            id: None,
21292            slug: "codex".into(),
21293            name: "Codex".into(),
21294            version: Some("0.2.3".into()),
21295            kind: AgentKind::Cli,
21296        };
21297        let agent_id = source.ensure_agent(&agent).unwrap();
21298        let conversation = Conversation {
21299            id: None,
21300            agent_slug: "codex".into(),
21301            workspace: Some(PathBuf::from("/tmp/workspace")),
21302            external_id: Some("bad-seed-conv".into()),
21303            title: Some("Bad seed".into()),
21304            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
21305            started_at: Some(1_700_000_000_000),
21306            ended_at: Some(1_700_000_000_100),
21307            approx_tokens: Some(42),
21308            metadata_json: serde_json::json!({"seed": "bad"}),
21309            messages: vec![Message {
21310                id: None,
21311                idx: 0,
21312                role: MessageRole::Agent,
21313                author: Some("assistant".into()),
21314                created_at: Some(1_700_000_000_050),
21315                content: "this seed should fail".into(),
21316                extra_json: serde_json::Value::Null,
21317                snippets: Vec::new(),
21318            }],
21319            source_id: LOCAL_SOURCE_ID.into(),
21320            origin_host: None,
21321        };
21322        source
21323            .insert_conversation_tree(agent_id, None, &conversation)
21324            .unwrap();
21325        drop(source);
21326
21327        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
21328        legacy
21329            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
21330            .unwrap();
21331        drop(legacy);
21332
21333        let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
21334        assert!(
21335            err.to_string()
21336                .contains("schema_version 12 is too old for baseline import"),
21337            "unexpected seed error: {err:#}"
21338        );
21339
21340        let reopened = SqliteStorage::open(&canonical_db).unwrap();
21341        let sentinel: Option<String> = reopened
21342            .conn
21343            .query_row_map(
21344                "SELECT value FROM meta WHERE key = 'sentinel'",
21345                fparams![],
21346                |row| row.get_typed(0),
21347            )
21348            .optional()
21349            .unwrap();
21350        assert_eq!(sentinel.as_deref(), Some("keep-me"));
21351
21352        let conversation_count: i64 = reopened
21353            .conn
21354            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
21355                row.get_typed(0)
21356            })
21357            .unwrap();
21358        assert_eq!(conversation_count, 0);
21359
21360        let readonly = open_franken_with_flags(
21361            &canonical_db.to_string_lossy(),
21362            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21363        )
21364        .unwrap();
21365        let readonly_conversation_count: i64 = readonly
21366            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
21367                row.get_typed(0)
21368            })
21369            .unwrap();
21370        assert_eq!(readonly_conversation_count, 0);
21371    }
21372
21373    #[test]
21374    fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
21375        let dir = TempDir::new().unwrap();
21376        let db_path = dir.path().join("test.db");
21377        let storage = SqliteStorage::open(&db_path).unwrap();
21378
21379        let agent = Agent {
21380            id: None,
21381            slug: "codex".into(),
21382            name: "Codex".into(),
21383            version: Some("0.2.3".into()),
21384            kind: AgentKind::Cli,
21385        };
21386        let agent_id = storage.ensure_agent(&agent).unwrap();
21387
21388        let conversation = Conversation {
21389            id: None,
21390            agent_slug: "codex".into(),
21391            workspace: Some(PathBuf::from("/tmp/workspace")),
21392            external_id: Some("lexical-rebuild-test".into()),
21393            title: Some("Lexical rebuild".into()),
21394            source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
21395            started_at: Some(1_700_000_000_000),
21396            ended_at: Some(1_700_000_000_100),
21397            approx_tokens: Some(42),
21398            metadata_json: serde_json::Value::Null,
21399            messages: vec![Message {
21400                id: None,
21401                idx: 0,
21402                role: MessageRole::Agent,
21403                author: Some("assistant".into()),
21404                created_at: Some(1_700_000_000_050),
21405                content: "indexed text".into(),
21406                extra_json: serde_json::json!({
21407                    "usage": { "total_tokens": 1234 },
21408                    "irrelevant_blob": "still preserved in canonical storage"
21409                }),
21410                snippets: Vec::new(),
21411            }],
21412            source_id: LOCAL_SOURCE_ID.into(),
21413            origin_host: None,
21414        };
21415
21416        let inserted = storage
21417            .insert_conversation_tree(agent_id, None, &conversation)
21418            .unwrap();
21419        let conversation_id = inserted.conversation_id;
21420
21421        let stored = storage.fetch_messages(conversation_id).unwrap();
21422        assert_eq!(stored.len(), 1);
21423        assert!(!stored[0].extra_json.is_null());
21424
21425        let lexical = storage
21426            .fetch_messages_for_lexical_rebuild(conversation_id)
21427            .unwrap();
21428        assert_eq!(lexical.len(), 1);
21429        assert_eq!(lexical[0].content, "indexed text");
21430        assert_eq!(lexical[0].author.as_deref(), Some("assistant"));
21431        assert!(lexical[0].extra_json.is_null());
21432    }
21433
21434    #[test]
21435    fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
21436        let dir = TempDir::new().unwrap();
21437        let db_path = dir.path().join("test.db");
21438        let storage = SqliteStorage::open(&db_path).unwrap();
21439
21440        let agent = Agent {
21441            id: None,
21442            slug: "codex".into(),
21443            name: "Codex".into(),
21444            version: Some("0.2.3".into()),
21445            kind: AgentKind::Cli,
21446        };
21447        let agent_id = storage.ensure_agent(&agent).unwrap();
21448
21449        let first = Conversation {
21450            id: None,
21451            agent_slug: "codex".into(),
21452            workspace: Some(PathBuf::from("/tmp/workspace")),
21453            external_id: Some("lexical-batch-1".into()),
21454            title: Some("Lexical batch 1".into()),
21455            source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
21456            started_at: Some(1_700_000_000_000),
21457            ended_at: Some(1_700_000_000_100),
21458            approx_tokens: Some(42),
21459            metadata_json: serde_json::Value::Null,
21460            messages: vec![
21461                Message {
21462                    id: None,
21463                    idx: 0,
21464                    role: MessageRole::User,
21465                    author: Some("user".into()),
21466                    created_at: Some(1_700_000_000_010),
21467                    content: "first-a".into(),
21468                    extra_json: serde_json::json!({"opaque": true}),
21469                    snippets: Vec::new(),
21470                },
21471                Message {
21472                    id: None,
21473                    idx: 1,
21474                    role: MessageRole::Agent,
21475                    author: Some("assistant".into()),
21476                    created_at: Some(1_700_000_000_020),
21477                    content: "first-b".into(),
21478                    extra_json: serde_json::json!({"opaque": true}),
21479                    snippets: Vec::new(),
21480                },
21481            ],
21482            source_id: LOCAL_SOURCE_ID.into(),
21483            origin_host: None,
21484        };
21485
21486        let second = Conversation {
21487            id: None,
21488            agent_slug: "codex".into(),
21489            workspace: Some(PathBuf::from("/tmp/workspace")),
21490            external_id: Some("lexical-batch-2".into()),
21491            title: Some("Lexical batch 2".into()),
21492            source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
21493            started_at: Some(1_700_000_000_200),
21494            ended_at: Some(1_700_000_000_300),
21495            approx_tokens: Some(84),
21496            metadata_json: serde_json::Value::Null,
21497            messages: vec![Message {
21498                id: None,
21499                idx: 0,
21500                role: MessageRole::Tool,
21501                author: Some("tool".into()),
21502                created_at: Some(1_700_000_000_210),
21503                content: "second-a".into(),
21504                extra_json: serde_json::json!({"opaque": true}),
21505                snippets: Vec::new(),
21506            }],
21507            source_id: LOCAL_SOURCE_ID.into(),
21508            origin_host: None,
21509        };
21510        let third = Conversation {
21511            external_id: Some("lexical-batch-3".into()),
21512            title: Some("Lexical batch 3".into()),
21513            source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
21514            messages: vec![Message {
21515                id: None,
21516                idx: 0,
21517                role: MessageRole::System,
21518                author: Some("system".into()),
21519                created_at: Some(1_700_000_000_410),
21520                content: "third-a".into(),
21521                extra_json: serde_json::json!({"opaque": true}),
21522                snippets: Vec::new(),
21523            }],
21524            ..second.clone()
21525        };
21526
21527        let first_id = storage
21528            .insert_conversation_tree(agent_id, None, &first)
21529            .unwrap()
21530            .conversation_id;
21531        let second_id = storage
21532            .insert_conversation_tree(agent_id, None, &second)
21533            .unwrap()
21534            .conversation_id;
21535        let third_id = storage
21536            .insert_conversation_tree(agent_id, None, &third)
21537            .unwrap()
21538            .conversation_id;
21539
21540        let lexical = storage
21541            .fetch_messages_for_lexical_rebuild_batch(&[third_id, first_id], None, None)
21542            .unwrap();
21543
21544        let first_messages = lexical.get(&first_id).expect("first conversation");
21545        assert_eq!(first_messages.len(), 2);
21546        assert_eq!(first_messages[0].content, "first-a");
21547        assert_eq!(first_messages[1].content, "first-b");
21548        assert!(
21549            first_messages
21550                .iter()
21551                .all(|message| message.extra_json.is_null())
21552        );
21553
21554        assert!(
21555            !lexical.contains_key(&second_id),
21556            "batch fetch must exclude conversations not requested by the caller"
21557        );
21558
21559        let third_messages = lexical.get(&third_id).expect("third conversation");
21560        assert_eq!(third_messages.len(), 1);
21561        assert_eq!(third_messages[0].content, "third-a");
21562        assert!(third_messages[0].extra_json.is_null());
21563    }
21564
21565    #[test]
21566    fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
21567        let dir = TempDir::new().unwrap();
21568        let db_path = dir.path().join("test.db");
21569        let storage = SqliteStorage::open(&db_path).unwrap();
21570
21571        let agent = Agent {
21572            id: None,
21573            slug: "codex".into(),
21574            name: "Codex".into(),
21575            version: Some("0.2.3".into()),
21576            kind: AgentKind::Cli,
21577        };
21578        let agent_id = storage.ensure_agent(&agent).unwrap();
21579
21580        let conversation = Conversation {
21581            id: None,
21582            agent_slug: "codex".into(),
21583            workspace: Some(PathBuf::from("/tmp/workspace")),
21584            external_id: Some("lexical-batch-guard".into()),
21585            title: Some("Lexical batch guard".into()),
21586            source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
21587            started_at: Some(1_700_000_000_000),
21588            ended_at: Some(1_700_000_000_100),
21589            approx_tokens: Some(42),
21590            metadata_json: serde_json::Value::Null,
21591            messages: vec![
21592                Message {
21593                    id: None,
21594                    idx: 0,
21595                    role: MessageRole::User,
21596                    author: Some("user".into()),
21597                    created_at: Some(1_700_000_000_010),
21598                    content: "123456".into(),
21599                    extra_json: serde_json::Value::Null,
21600                    snippets: Vec::new(),
21601                },
21602                Message {
21603                    id: None,
21604                    idx: 1,
21605                    role: MessageRole::Agent,
21606                    author: Some("assistant".into()),
21607                    created_at: Some(1_700_000_000_020),
21608                    content: "abcdef".into(),
21609                    extra_json: serde_json::Value::Null,
21610                    snippets: Vec::new(),
21611                },
21612            ],
21613            source_id: LOCAL_SOURCE_ID.into(),
21614            origin_host: None,
21615        };
21616
21617        let conversation_id = storage
21618            .insert_conversation_tree(agent_id, None, &conversation)
21619            .unwrap()
21620            .conversation_id;
21621
21622        let error = storage
21623            .fetch_messages_for_lexical_rebuild_batch(&[conversation_id], Some(10), Some(8))
21624            .expect_err("guardrail should reject oversized batch content");
21625
21626        let message = format!("{error:#}");
21627        assert!(
21628            message.contains("content-byte guardrail"),
21629            "expected guardrail reason in error, got {message}"
21630        );
21631    }
21632
21633    #[test]
21634    fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
21635        let dir = TempDir::new().unwrap();
21636        let db_path = dir.path().join("manual-rows.db");
21637        let storage = FrankenStorage::open(&db_path).unwrap();
21638        let conn = storage.raw();
21639
21640        conn.execute(
21641            "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
21642             VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
21643        )
21644        .unwrap();
21645        conn.execute(
21646            "INSERT INTO conversations
21647             (id, agent_id, external_id, title, source_path, source_id, started_at)
21648             VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
21649        )
21650        .unwrap();
21651        conn.execute(
21652            "INSERT INTO messages
21653             (id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
21654             VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
21655        )
21656        .unwrap();
21657
21658        let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
21659        assert_eq!(lexical.len(), 1);
21660        assert_eq!(lexical[0].content, "manual body");
21661
21662        let full = storage.fetch_messages(1).unwrap();
21663        assert_eq!(full.len(), 1);
21664        assert_eq!(full[0].content, "manual body");
21665        assert_eq!(full[0].author.as_deref(), Some("tester"));
21666        assert_eq!(full[0].extra_json, serde_json::json!({ "k": 1 }));
21667    }
21668
21669    #[test]
21670    fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
21671        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21672        use std::path::PathBuf;
21673
21674        let dir = TempDir::new().unwrap();
21675        let db_path = dir.path().join("agent_search.db");
21676        let storage = SqliteStorage::open(&db_path).unwrap();
21677
21678        let agent = Agent {
21679            id: None,
21680            slug: "claude_code".into(),
21681            name: "Claude Code".into(),
21682            version: None,
21683            kind: AgentKind::Cli,
21684        };
21685        let agent_id = storage.ensure_agent(&agent).unwrap();
21686
21687        for (external_id, base_ts) in [
21688            ("conv-1", 1_700_000_000_000_i64),
21689            ("conv-2", 1_700_000_001_000_i64),
21690        ] {
21691            let conversation = Conversation {
21692                id: None,
21693                agent_slug: "claude_code".into(),
21694                workspace: Some(PathBuf::from("/tmp/workspace")),
21695                external_id: Some(external_id.to_string()),
21696                title: Some("Lexical rebuild".into()),
21697                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21698                started_at: Some(base_ts),
21699                ended_at: Some(base_ts + 100),
21700                approx_tokens: None,
21701                metadata_json: serde_json::Value::Null,
21702                messages: vec![
21703                    Message {
21704                        id: None,
21705                        idx: 0,
21706                        role: MessageRole::User,
21707                        author: Some("user".into()),
21708                        created_at: Some(base_ts + 10),
21709                        content: format!("{external_id}-first"),
21710                        extra_json: serde_json::Value::Null,
21711                        snippets: Vec::new(),
21712                    },
21713                    Message {
21714                        id: None,
21715                        idx: 1,
21716                        role: MessageRole::Agent,
21717                        author: Some("assistant".into()),
21718                        created_at: Some(base_ts + 20),
21719                        content: format!("{external_id}-second"),
21720                        extra_json: serde_json::Value::Null,
21721                        snippets: Vec::new(),
21722                    },
21723                ],
21724                source_id: LOCAL_SOURCE_ID.into(),
21725                origin_host: None,
21726            };
21727            storage
21728                .insert_conversation_tree(agent_id, None, &conversation)
21729                .unwrap();
21730        }
21731
21732        let conversation_ids: Vec<i64> = storage
21733            .conn
21734            .query_map_collect(
21735                "SELECT id FROM conversations ORDER BY id",
21736                fparams![],
21737                |row| row.get_typed(0),
21738            )
21739            .unwrap();
21740        assert_eq!(conversation_ids.len(), 2);
21741
21742        let plan_details: Vec<String> = storage
21743            .conn
21744            .query_map_collect(
21745                "EXPLAIN QUERY PLAN \
21746                 SELECT conversation_id, id, idx, role, author, created_at, content \
21747                 FROM messages \
21748                 WHERE conversation_id IN (?1, ?2) \
21749                 ORDER BY conversation_id ASC, idx ASC",
21750                fparams![conversation_ids[0], conversation_ids[1]],
21751                |row| row.get_typed(3),
21752            )
21753            .unwrap();
21754
21755        assert!(
21756            plan_details
21757                .iter()
21758                .any(|detail| detail.contains("sqlite_autoindex_messages_1")),
21759            "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
21760        );
21761        assert!(
21762            !plan_details
21763                .iter()
21764                .any(|detail| detail.contains("TEMP B-TREE")),
21765            "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
21766        );
21767    }
21768
21769    #[test]
21770    fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
21771        let dir = TempDir::new().unwrap();
21772        let db_path = dir.path().join("test.db");
21773        let storage = SqliteStorage::open(&db_path).unwrap();
21774
21775        let agent = Agent {
21776            id: None,
21777            slug: "codex".into(),
21778            name: "Codex".into(),
21779            version: Some("0.2.3".into()),
21780            kind: AgentKind::Cli,
21781        };
21782        let agent_id = storage.ensure_agent(&agent).unwrap();
21783
21784        let first = Conversation {
21785            id: None,
21786            agent_slug: "codex".into(),
21787            workspace: Some(PathBuf::from("/tmp/workspace")),
21788            external_id: Some("lexical-stream-1".into()),
21789            title: Some("Lexical stream 1".into()),
21790            source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
21791            started_at: Some(1_700_000_000_000),
21792            ended_at: Some(1_700_000_000_100),
21793            approx_tokens: Some(42),
21794            metadata_json: serde_json::Value::Null,
21795            messages: vec![
21796                Message {
21797                    id: None,
21798                    idx: 0,
21799                    role: MessageRole::User,
21800                    author: Some("user".into()),
21801                    created_at: Some(1_700_000_000_010),
21802                    content: "first-a".into(),
21803                    extra_json: serde_json::json!({"opaque": true}),
21804                    snippets: Vec::new(),
21805                },
21806                Message {
21807                    id: None,
21808                    idx: 1,
21809                    role: MessageRole::Agent,
21810                    author: Some("assistant".into()),
21811                    created_at: Some(1_700_000_000_020),
21812                    content: "first-b".into(),
21813                    extra_json: serde_json::json!({"opaque": true}),
21814                    snippets: Vec::new(),
21815                },
21816            ],
21817            source_id: LOCAL_SOURCE_ID.into(),
21818            origin_host: None,
21819        };
21820
21821        let second = Conversation {
21822            id: None,
21823            agent_slug: "codex".into(),
21824            workspace: Some(PathBuf::from("/tmp/workspace")),
21825            external_id: Some("lexical-stream-2".into()),
21826            title: Some("Lexical stream 2".into()),
21827            source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
21828            started_at: Some(1_700_000_000_200),
21829            ended_at: Some(1_700_000_000_300),
21830            approx_tokens: Some(84),
21831            metadata_json: serde_json::Value::Null,
21832            messages: vec![Message {
21833                id: None,
21834                idx: 0,
21835                role: MessageRole::Tool,
21836                author: Some("tool".into()),
21837                created_at: Some(1_700_000_000_210),
21838                content: "second-a".into(),
21839                extra_json: serde_json::json!({"opaque": true}),
21840                snippets: Vec::new(),
21841            }],
21842            source_id: LOCAL_SOURCE_ID.into(),
21843            origin_host: None,
21844        };
21845
21846        let first_id = storage
21847            .insert_conversation_tree(agent_id, None, &first)
21848            .unwrap()
21849            .conversation_id;
21850        let second_id = storage
21851            .insert_conversation_tree(agent_id, None, &second)
21852            .unwrap()
21853            .conversation_id;
21854
21855        let mut streamed = Vec::new();
21856        storage
21857            .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
21858                streamed.push((
21859                    row.conversation_id,
21860                    row.idx,
21861                    row.role,
21862                    row.author,
21863                    row.content,
21864                ));
21865                Ok(())
21866            })
21867            .unwrap();
21868
21869        assert_eq!(
21870            streamed,
21871            vec![
21872                (
21873                    first_id,
21874                    0,
21875                    "user".to_string(),
21876                    Some("user".to_string()),
21877                    "first-a".to_string(),
21878                ),
21879                (
21880                    first_id,
21881                    1,
21882                    "agent".to_string(),
21883                    Some("assistant".to_string()),
21884                    "first-b".to_string(),
21885                ),
21886                (
21887                    second_id,
21888                    0,
21889                    "tool".to_string(),
21890                    Some("tool".to_string()),
21891                    "second-a".to_string(),
21892                ),
21893            ]
21894        );
21895    }
21896
21897    #[test]
21898    fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
21899        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21900        use std::path::PathBuf;
21901
21902        let dir = TempDir::new().unwrap();
21903        let db_path = dir.path().join("agent_search.db");
21904        let storage = SqliteStorage::open(&db_path).unwrap();
21905
21906        let agent = Agent {
21907            id: None,
21908            slug: "claude_code".into(),
21909            name: "Claude Code".into(),
21910            version: Some("1.2.3".into()),
21911            kind: AgentKind::Cli,
21912        };
21913        let agent_id = storage.ensure_agent(&agent).unwrap();
21914
21915        let first = Conversation {
21916            id: None,
21917            agent_slug: "claude_code".into(),
21918            workspace: Some(PathBuf::from("/tmp/workspace")),
21919            external_id: Some("lexical-range-1".into()),
21920            title: Some("Lexical range 1".into()),
21921            source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
21922            started_at: Some(1_700_000_000_000),
21923            ended_at: Some(1_700_000_000_100),
21924            approx_tokens: Some(42),
21925            metadata_json: serde_json::Value::Null,
21926            messages: vec![Message {
21927                id: None,
21928                idx: 0,
21929                role: MessageRole::User,
21930                author: Some("user".into()),
21931                created_at: Some(1_700_000_000_010),
21932                content: "first-only".into(),
21933                extra_json: serde_json::json!({"opaque": true}),
21934                snippets: Vec::new(),
21935            }],
21936            source_id: LOCAL_SOURCE_ID.into(),
21937            origin_host: None,
21938        };
21939
21940        let second = Conversation {
21941            id: None,
21942            agent_slug: "claude_code".into(),
21943            workspace: Some(PathBuf::from("/tmp/workspace")),
21944            external_id: Some("lexical-range-2".into()),
21945            title: Some("Lexical range 2".into()),
21946            source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
21947            started_at: Some(1_700_000_000_200),
21948            ended_at: Some(1_700_000_000_300),
21949            approx_tokens: Some(84),
21950            metadata_json: serde_json::Value::Null,
21951            messages: vec![Message {
21952                id: None,
21953                idx: 0,
21954                role: MessageRole::Tool,
21955                author: Some("tool".into()),
21956                created_at: Some(1_700_000_000_210),
21957                content: "second-should-not-appear".into(),
21958                extra_json: serde_json::json!({"opaque": true}),
21959                snippets: Vec::new(),
21960            }],
21961            source_id: LOCAL_SOURCE_ID.into(),
21962            origin_host: None,
21963        };
21964
21965        let first_id = storage
21966            .insert_conversation_tree(agent_id, None, &first)
21967            .unwrap()
21968            .conversation_id;
21969        let second_id = storage
21970            .insert_conversation_tree(agent_id, None, &second)
21971            .unwrap()
21972            .conversation_id;
21973
21974        let mut streamed = Vec::new();
21975        storage
21976            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21977                first_id,
21978                first_id,
21979                |row| {
21980                    streamed.push((row.conversation_id, row.idx, row.content));
21981                    Ok(())
21982                },
21983            )
21984            .unwrap();
21985
21986        assert_eq!(streamed, vec![(first_id, 0, "first-only".to_string())]);
21987        assert!(
21988            streamed
21989                .iter()
21990                .all(|(conversation_id, _, _)| *conversation_id != second_id),
21991            "upper bound should exclude later conversation ids"
21992        );
21993    }
21994
21995    #[test]
21996    fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
21997        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21998        use std::path::PathBuf;
21999
22000        let dir = TempDir::new().unwrap();
22001        let db_path = dir.path().join("agent_search.db");
22002        let storage = SqliteStorage::open(&db_path).unwrap();
22003
22004        let claude_agent_id = storage
22005            .ensure_agent(&Agent {
22006                id: None,
22007                slug: "claude_code".into(),
22008                name: "Claude Code".into(),
22009                version: None,
22010                kind: AgentKind::Cli,
22011            })
22012            .unwrap();
22013        let aider_agent_id = storage
22014            .ensure_agent(&Agent {
22015                id: None,
22016                slug: "aider".into(),
22017                name: "Aider".into(),
22018                version: None,
22019                kind: AgentKind::Cli,
22020            })
22021            .unwrap();
22022
22023        type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
22024
22025        let mut expected = Vec::new();
22026        let mut first_conversation_id = None;
22027        let mut last_conversation_id = None;
22028        let mut insert_conversation =
22029            |agent_id: i64,
22030             external_id: &str,
22031             title: &str,
22032             source_path: &str,
22033             started_at: i64,
22034             message_specs: Vec<MessageSpec>| {
22035                let conversation = Conversation {
22036                    id: None,
22037                    agent_slug: if agent_id == aider_agent_id {
22038                        "aider".into()
22039                    } else {
22040                        "claude_code".into()
22041                    },
22042                    workspace: Some(PathBuf::from("/tmp/workspace")),
22043                    external_id: Some(external_id.to_string()),
22044                    title: Some(title.to_string()),
22045                    source_path: PathBuf::from(source_path),
22046                    started_at: Some(started_at),
22047                    ended_at: Some(started_at + 100),
22048                    approx_tokens: None,
22049                    metadata_json: serde_json::Value::Null,
22050                    messages: message_specs
22051                        .iter()
22052                        .map(|(idx, role, author, created_at, content)| Message {
22053                            id: None,
22054                            idx: *idx,
22055                            role: role.clone(),
22056                            author: author.clone(),
22057                            created_at: *created_at,
22058                            content: content.clone(),
22059                            extra_json: serde_json::Value::Null,
22060                            snippets: Vec::new(),
22061                        })
22062                        .collect(),
22063                    source_id: LOCAL_SOURCE_ID.into(),
22064                    origin_host: None,
22065                };
22066                let conversation_id = storage
22067                    .insert_conversation_tree(agent_id, None, &conversation)
22068                    .unwrap()
22069                    .conversation_id;
22070                if first_conversation_id.is_none() {
22071                    first_conversation_id = Some(conversation_id);
22072                }
22073                last_conversation_id = Some(conversation_id);
22074                expected.extend(message_specs.into_iter().map(
22075                    |(idx, role, author, created_at, content)| {
22076                        (
22077                            conversation_id,
22078                            idx,
22079                            match role {
22080                                MessageRole::User => "user".to_string(),
22081                                MessageRole::Agent => "agent".to_string(),
22082                                MessageRole::Tool => "tool".to_string(),
22083                                MessageRole::System => "system".to_string(),
22084                                MessageRole::Other(other) => other,
22085                            },
22086                            author,
22087                            created_at,
22088                            content,
22089                        )
22090                    },
22091                ));
22092            };
22093
22094        for (label, base_ts) in [
22095            ("alpha", 1_700_000_000_000_i64),
22096            ("beta", 1_700_000_001_000_i64),
22097            ("gamma", 1_700_000_002_000_i64),
22098            ("delta", 1_700_000_003_000_i64),
22099            ("epsilon", 1_700_000_004_000_i64),
22100        ] {
22101            insert_conversation(
22102                claude_agent_id,
22103                &format!("lexical-{label}"),
22104                &format!("Lexical {label}"),
22105                &format!("/tmp/{label}.jsonl"),
22106                base_ts,
22107                vec![
22108                    (
22109                        0,
22110                        MessageRole::User,
22111                        None,
22112                        Some(base_ts + 10),
22113                        format!("{label}_content"),
22114                    ),
22115                    (
22116                        1,
22117                        MessageRole::Agent,
22118                        None,
22119                        Some(base_ts + 20),
22120                        format!("{label}_content_response"),
22121                    ),
22122                ],
22123            );
22124        }
22125
22126        insert_conversation(
22127            aider_agent_id,
22128            "lexical-aider-history",
22129            "Aider Chat: coding_agent_session_search",
22130            "/tmp/.aider.chat.history.md",
22131            1_764_619_673_394,
22132            vec![
22133                (
22134                    0,
22135                    MessageRole::System,
22136                    Some("system".to_string()),
22137                    None,
22138                    "# aider chat started at 2025-12-01 20:07:47".to_string(),
22139                ),
22140                (
22141                    1,
22142                    MessageRole::User,
22143                    Some("user".to_string()),
22144                    None,
22145                    "/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
22146                ),
22147            ],
22148        );
22149        insert_conversation(
22150            aider_agent_id,
22151            "lexical-aider-fixture",
22152            "Aider Chat: aider",
22153            "/tmp/tests/fixtures/aider/.aider.chat.history.md",
22154            1_764_621_401_399,
22155            vec![
22156                (
22157                    0,
22158                    MessageRole::User,
22159                    Some("user".to_string()),
22160                    None,
22161                    "/add src/main.rs".to_string(),
22162                ),
22163                (
22164                    1,
22165                    MessageRole::Agent,
22166                    Some("assistant".to_string()),
22167                    None,
22168                    "Added src/main.rs to the chat.
22169
22170#### /add src/main.rs"
22171                        .to_string(),
22172                ),
22173                (
22174                    2,
22175                    MessageRole::User,
22176                    Some("user".to_string()),
22177                    None,
22178                    "Please refactor.".to_string(),
22179                ),
22180                (
22181                    3,
22182                    MessageRole::Agent,
22183                    Some("assistant".to_string()),
22184                    None,
22185                    "Sure, here is the code.".to_string(),
22186                ),
22187            ],
22188        );
22189
22190        let mut streamed = Vec::new();
22191        storage
22192            .stream_messages_for_lexical_rebuild_between_conversation_ids(
22193                first_conversation_id.unwrap(),
22194                last_conversation_id.unwrap(),
22195                |row| {
22196                    streamed.push((
22197                        row.conversation_id,
22198                        row.idx,
22199                        row.role,
22200                        row.author,
22201                        row.created_at,
22202                        row.content,
22203                    ));
22204                    Ok(())
22205                },
22206            )
22207            .unwrap();
22208
22209        assert_eq!(streamed, expected);
22210    }
22211
22212    #[test]
22213    fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
22214        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22215        use std::path::PathBuf;
22216
22217        let dir = TempDir::new().unwrap();
22218        let db_path = dir.path().join("agent_search.db");
22219        let storage = SqliteStorage::open(&db_path).unwrap();
22220
22221        let agent = Agent {
22222            id: None,
22223            slug: "claude_code".into(),
22224            name: "Claude Code".into(),
22225            version: None,
22226            kind: AgentKind::Cli,
22227        };
22228        let agent_id = storage.ensure_agent(&agent).unwrap();
22229
22230        for (external_id, base_ts) in [
22231            ("conv-1", 1_700_000_000_000_i64),
22232            ("conv-2", 1_700_000_001_000_i64),
22233        ] {
22234            let conversation = Conversation {
22235                id: None,
22236                agent_slug: "claude_code".into(),
22237                workspace: Some(PathBuf::from("/tmp/workspace")),
22238                external_id: Some(external_id.to_string()),
22239                title: Some("Lexical rebuild".into()),
22240                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
22241                started_at: Some(base_ts),
22242                ended_at: Some(base_ts + 100),
22243                approx_tokens: None,
22244                metadata_json: serde_json::Value::Null,
22245                messages: vec![
22246                    Message {
22247                        id: None,
22248                        idx: 0,
22249                        role: MessageRole::User,
22250                        author: Some("user".into()),
22251                        created_at: Some(base_ts + 10),
22252                        content: format!("{external_id}-first"),
22253                        extra_json: serde_json::Value::Null,
22254                        snippets: Vec::new(),
22255                    },
22256                    Message {
22257                        id: None,
22258                        idx: 1,
22259                        role: MessageRole::Agent,
22260                        author: Some("assistant".into()),
22261                        created_at: Some(base_ts + 20),
22262                        content: format!("{external_id}-second"),
22263                        extra_json: serde_json::Value::Null,
22264                        snippets: Vec::new(),
22265                    },
22266                ],
22267                source_id: LOCAL_SOURCE_ID.into(),
22268                origin_host: None,
22269            };
22270            storage
22271                .insert_conversation_tree(agent_id, None, &conversation)
22272                .unwrap();
22273        }
22274
22275        let first_id: i64 = storage
22276            .conn
22277            .query_row_map(
22278                "SELECT id FROM conversations ORDER BY id LIMIT 1",
22279                fparams![],
22280                |row| row.get_typed(0),
22281            )
22282            .unwrap();
22283        let last_id: i64 = storage
22284            .conn
22285            .query_row_map(
22286                "SELECT id FROM conversations ORDER BY id DESC LIMIT 1",
22287                fparams![],
22288                |row| row.get_typed(0),
22289            )
22290            .unwrap();
22291
22292        let conversation_plan_details: Vec<String> = storage
22293            .conn
22294            .query_map_collect(
22295                "EXPLAIN QUERY PLAN                  SELECT id FROM conversations                  WHERE id >= ?1 AND id <= ?2                  ORDER BY id ASC",
22296                fparams![first_id, last_id],
22297                |row| row.get_typed(3),
22298            )
22299            .unwrap();
22300        assert!(
22301            !conversation_plan_details
22302                .iter()
22303                .any(|detail| detail.contains("TEMP B-TREE")),
22304            "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
22305        );
22306
22307        let message_plan_details: Vec<String> = storage
22308            .conn
22309            .query_map_collect(
22310                "EXPLAIN QUERY PLAN                  SELECT id, idx, role, author, created_at, content                  FROM messages INDEXED BY sqlite_autoindex_messages_1                  WHERE conversation_id = ?1                  ORDER BY idx",
22311                fparams![first_id],
22312                |row| row.get_typed(3),
22313            )
22314            .unwrap();
22315        assert!(
22316            message_plan_details
22317                .iter()
22318                .any(|detail| detail.contains("sqlite_autoindex_messages_1")
22319                    || detail.contains("idx_messages_conv_idx")),
22320            "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
22321        );
22322        assert!(
22323            !message_plan_details
22324                .iter()
22325                .any(|detail| detail.contains("TEMP B-TREE")),
22326            "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
22327        );
22328    }
22329
22330    #[test]
22331    fn discover_historical_database_bundles_prefers_larger_archives_first() {
22332        let dir = TempDir::new().unwrap();
22333        let canonical_db = dir.path().join("agent_search.db");
22334        fs::write(&canonical_db, b"canonical").unwrap();
22335
22336        let smaller = dir.path().join("agent_search.corrupt.small");
22337        fs::write(&smaller, vec![0_u8; 32]).unwrap();
22338
22339        let backups_dir = dir.path().join("backups");
22340        fs::create_dir_all(&backups_dir).unwrap();
22341        let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
22342        fs::write(&larger, vec![0_u8; 128]).unwrap();
22343
22344        let bundles = discover_historical_database_bundles(&canonical_db);
22345        let ordered_paths: Vec<PathBuf> =
22346            bundles.into_iter().map(|bundle| bundle.root_path).collect();
22347
22348        assert_eq!(ordered_paths, vec![larger, smaller]);
22349    }
22350
22351    #[test]
22352    fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
22353        let dir = TempDir::new().unwrap();
22354        let canonical_db = dir.path().join("agent_search.db");
22355        fs::write(&canonical_db, b"canonical").unwrap();
22356
22357        let larger_corrupt = dir.path().join("agent_search.corrupt.20260324_212907");
22358        fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();
22359
22360        let backups_dir = dir.path().join("backups");
22361        fs::create_dir_all(&backups_dir).unwrap();
22362        let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
22363        let conn = FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
22364        conn.execute_batch(
22365            "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
22366             CREATE TABLE messages (
22367                 id INTEGER PRIMARY KEY,
22368                 conversation_id INTEGER NOT NULL,
22369                 idx INTEGER NOT NULL,
22370                 content TEXT
22371             );
22372             INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
22373             INSERT INTO messages(id, conversation_id, idx, content)
22374             VALUES (1, 1, 0, 'seed');",
22375        )
22376        .unwrap();
22377        drop(conn);
22378
22379        let bundles = discover_historical_database_bundles(&canonical_db);
22380        let ordered_paths: Vec<PathBuf> = bundles
22381            .iter()
22382            .map(|bundle| bundle.root_path.clone())
22383            .collect();
22384
22385        assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
22386        assert!(bundles[0].supports_direct_readonly);
22387        assert!(!bundles[1].supports_direct_readonly);
22388    }
22389
22390    #[test]
22391    fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
22392        let dir = TempDir::new().unwrap();
22393        let canonical_db = dir.path().join("agent_search.db");
22394        let storage = SqliteStorage::open(&canonical_db).unwrap();
22395
22396        let quarantined = dir.path().join("agent_search.corrupt.20260324_212907");
22397        fs::write(&quarantined, b"not a sqlite database").unwrap();
22398
22399        let discovered: Vec<PathBuf> = discover_historical_database_bundles(&canonical_db)
22400            .into_iter()
22401            .map(|bundle| bundle.root_path)
22402            .collect();
22403        assert_eq!(discovered, vec![quarantined]);
22404
22405        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
22406        assert_eq!(outcome.bundles_considered, 1);
22407        assert_eq!(outcome.bundles_imported, 0);
22408        assert_eq!(outcome.conversations_imported, 0);
22409        assert_eq!(outcome.messages_imported, 0);
22410        assert!(storage.list_conversations(10, 0).unwrap().is_empty());
22411    }
22412
22413    #[test]
22414    fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
22415        let dir = TempDir::new().unwrap();
22416        let canonical_db = dir.path().join("agent_search.db");
22417        fs::write(&canonical_db, b"canonical").unwrap();
22418
22419        let repair_lab_dir = dir.path().join("repair-lab").join("live-copy");
22420        fs::create_dir_all(&repair_lab_dir).unwrap();
22421        let repair_lab_db = repair_lab_dir.join("agent_search.db");
22422        fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
22423        fs::write(
22424            repair_lab_dir.join("agent_search.rebuild-test.db"),
22425            vec![0_u8; 192],
22426        )
22427        .unwrap();
22428
22429        let snapshots_dir = dir.path().join("snapshots").join("20260324T013201Z");
22430        fs::create_dir_all(&snapshots_dir).unwrap();
22431        let snapshot_db = snapshots_dir.join("agent_search.db");
22432        fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();
22433
22434        let bundles = discover_historical_database_bundles(&canonical_db);
22435        let ordered_paths: Vec<PathBuf> =
22436            bundles.into_iter().map(|bundle| bundle.root_path).collect();
22437
22438        assert!(ordered_paths.contains(&repair_lab_db));
22439        assert!(ordered_paths.contains(&snapshot_db));
22440        assert!(
22441            !ordered_paths
22442                .iter()
22443                .any(|path| path.file_name().and_then(|name| name.to_str())
22444                    == Some("agent_search.rebuild-test.db"))
22445        );
22446    }
22447
22448    #[test]
22449    fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
22450        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22451
22452        let dir = TempDir::new().unwrap();
22453        let canonical_db = dir.path().join("agent_search.db");
22454        fs::write(&canonical_db, b"canonical").unwrap();
22455
22456        let replay_dir = dir
22457            .path()
22458            .join("repair-lab")
22459            .join("replay-20260324T070101Z");
22460        fs::create_dir_all(&replay_dir).unwrap();
22461        let replay_db = replay_dir.join("agent_search.db");
22462        let replay_storage = SqliteStorage::open(&replay_db).unwrap();
22463        let agent = Agent {
22464            id: None,
22465            slug: "codex".into(),
22466            name: "Codex".into(),
22467            version: Some("0.2.3".into()),
22468            kind: AgentKind::Cli,
22469        };
22470        let agent_id = replay_storage.ensure_agent(&agent).unwrap();
22471        let conversation = Conversation {
22472            id: None,
22473            agent_slug: "codex".into(),
22474            workspace: Some(PathBuf::from("/tmp/workspace")),
22475            external_id: Some("replay-conv".into()),
22476            title: Some("Replay bundle".into()),
22477            source_path: PathBuf::from("/tmp/replay.jsonl"),
22478            started_at: Some(1_700_000_000_000),
22479            ended_at: Some(1_700_000_000_100),
22480            approx_tokens: Some(42),
22481            metadata_json: serde_json::Value::Null,
22482            messages: vec![Message {
22483                id: None,
22484                idx: 0,
22485                role: MessageRole::Agent,
22486                author: Some("assistant".into()),
22487                created_at: Some(1_700_000_000_050),
22488                content: "replay message".into(),
22489                extra_json: serde_json::Value::Null,
22490                snippets: Vec::new(),
22491            }],
22492            source_id: LOCAL_SOURCE_ID.into(),
22493            origin_host: None,
22494        };
22495        replay_storage
22496            .insert_conversation_tree(agent_id, None, &conversation)
22497            .unwrap();
22498        drop(replay_storage);
22499
22500        let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
22501        replay_legacy
22502            .execute_batch(
22503                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
22504                 DELETE FROM _schema_migrations WHERE version = 14;
22505                 PRAGMA writable_schema = ON;",
22506            )
22507            .unwrap();
22508        replay_legacy
22509            .execute(
22510                "DELETE FROM meta WHERE key = ?1",
22511                [FTS_FRANKEN_REBUILD_META_KEY],
22512            )
22513            .unwrap();
22514        #[cfg(not(windows))]
22515        {
22516            let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
22517            replay_legacy
22518                .execute(
22519                    "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22520                     VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22521                    [duplicate_legacy_fts_sql],
22522                )
22523                .unwrap();
22524        }
22525        replay_legacy
22526            .execute_batch("PRAGMA writable_schema = OFF;")
22527            .unwrap();
22528        drop(replay_legacy);
22529
22530        let backups_dir = dir.path().join("backups");
22531        fs::create_dir_all(&backups_dir).unwrap();
22532        let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
22533        let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
22534        let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
22535        clean_storage
22536            .insert_conversation_tree(clean_agent_id, None, &conversation)
22537            .unwrap();
22538        drop(clean_storage);
22539
22540        let bundles = discover_historical_database_bundles(&canonical_db);
22541        let ordered_paths: Vec<PathBuf> = bundles
22542            .iter()
22543            .map(|bundle| bundle.root_path.clone())
22544            .collect();
22545
22546        assert_eq!(ordered_paths[0], clean_backup);
22547        assert_eq!(ordered_paths[1], replay_db);
22548        assert_eq!(
22549            bundles[0].probe.schema_version,
22550            Some(CURRENT_SCHEMA_VERSION)
22551        );
22552        // Post-V14 cass drops the fts_messages virtual table during migration
22553        // and recreates it lazily on first open, so a freshly-migrated "clean"
22554        // backup has zero fts_messages rows in sqlite_master. The bundle is
22555        // still ranked as healthy by `bundle_health_rank` because 0 rows is a
22556        // legitimate lazy-FTS state (see comment there).
22557        assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
22558        // `fts_queryable` mirrors a direct rusqlite probe; with 0 sqlite_master
22559        // rows the table isn't queryable until lazy repair runs.
22560        assert!(!bundles[0].probe.fts_queryable);
22561        assert_eq!(bundles[1].probe.schema_version, Some(13));
22562        // The replay bundle had V14 run (dropping fts_messages → 0 rows), then
22563        // the test rolls meta.schema_version back to 13 and deletes the V14
22564        // marker. On Unix CI we also inject a duplicate sqlite_master row to
22565        // exercise the malformed-bundle probe path that depends on sqlite3.
22566        let expected_fts_schema_rows = if cfg!(windows) { Some(0) } else { Some(1) };
22567        assert_eq!(bundles[1].probe.fts_schema_rows, expected_fts_schema_rows);
22568    }
22569
22570    #[test]
22571    fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
22572        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22573
22574        let dir = TempDir::new().unwrap();
22575        let db_path = dir.path().join("fts-catchup.db");
22576        let storage = SqliteStorage::open(&db_path).unwrap();
22577        let agent = Agent {
22578            id: None,
22579            slug: "codex".into(),
22580            name: "Codex".into(),
22581            version: Some("0.2.3".into()),
22582            kind: AgentKind::Cli,
22583        };
22584        let agent_id = storage.ensure_agent(&agent).unwrap();
22585        let conversation = Conversation {
22586            id: None,
22587            agent_slug: "codex".into(),
22588            workspace: Some(PathBuf::from("/tmp/workspace")),
22589            external_id: Some("fts-catchup".into()),
22590            title: Some("FTS catchup".into()),
22591            source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
22592            started_at: Some(1_700_000_000_000),
22593            ended_at: Some(1_700_000_000_100),
22594            approx_tokens: Some(42),
22595            metadata_json: serde_json::Value::Null,
22596            messages: vec![Message {
22597                id: None,
22598                idx: 0,
22599                role: MessageRole::User,
22600                author: Some("user".into()),
22601                created_at: Some(1_700_000_000_050),
22602                content: "initial message".into(),
22603                extra_json: serde_json::Value::Null,
22604                snippets: Vec::new(),
22605            }],
22606            source_id: LOCAL_SOURCE_ID.into(),
22607            origin_host: None,
22608        };
22609        storage
22610            .insert_conversation_tree(agent_id, None, &conversation)
22611            .unwrap();
22612        drop(storage);
22613
22614        rebuild_fts_via_rusqlite(&db_path).unwrap();
22615
22616        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22617        let conversation_id: i64 = conn
22618            .query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
22619                row.get_typed(0)
22620            })
22621            .unwrap();
22622        conn.execute_compat(
22623            "INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
22624             VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
22625            fparams![conversation_id],
22626        )
22627        .unwrap();
22628        drop(conn);
22629
22630        let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
22631        assert_eq!(
22632            repair,
22633            FtsConsistencyRepair::IncrementalCatchUp {
22634                inserted_rows: 1,
22635                total_rows: 2
22636            }
22637        );
22638
22639        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22640        let auth_rows: i64 = conn
22641            .query_row_map(
22642                "SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
22643                fparams![],
22644                |row| row.get_typed(0),
22645            )
22646            .unwrap();
22647        assert_eq!(auth_rows, 1);
22648    }
22649
22650    #[test]
22651    fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
22652        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22653
22654        let dir = TempDir::new().unwrap();
22655        let db_path = dir.path().join("fts-duplicate-rebuild.db");
22656
22657        let storage = SqliteStorage::open(&db_path).unwrap();
22658        let agent = Agent {
22659            id: None,
22660            slug: "codex".into(),
22661            name: "Codex".into(),
22662            version: Some("0.2.3".into()),
22663            kind: AgentKind::Cli,
22664        };
22665        let agent_id = storage.ensure_agent(&agent).unwrap();
22666        let conversation = Conversation {
22667            id: None,
22668            agent_slug: "codex".into(),
22669            workspace: Some(PathBuf::from("/ws")),
22670            external_id: Some("retro".into()),
22671            title: Some("retro".into()),
22672            source_path: PathBuf::from("/tmp/retro.jsonl"),
22673            started_at: Some(42),
22674            ended_at: Some(42),
22675            approx_tokens: None,
22676            metadata_json: serde_json::Value::Null,
22677            messages: vec![Message {
22678                id: None,
22679                idx: 0,
22680                role: MessageRole::User,
22681                author: None,
22682                created_at: Some(42),
22683                content: "retro investigation".into(),
22684                extra_json: serde_json::Value::Null,
22685                snippets: Vec::new(),
22686            }],
22687            source_id: LOCAL_SOURCE_ID.into(),
22688            origin_host: None,
22689        };
22690        storage
22691            .insert_conversation_tree(agent_id, None, &conversation)
22692            .unwrap();
22693        drop(storage);
22694        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
22695
22696        let conn = rusqlite_test_fixture_conn(&db_path);
22697        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
22698        conn.execute(
22699            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22700             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22701            ["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
22702        )
22703        .unwrap();
22704        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
22705        let duplicate_rows: i64 = conn
22706            .query_row(
22707                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
22708                [],
22709                |row| row.get(0),
22710            )
22711            .unwrap();
22712        assert_eq!(duplicate_rows, 2);
22713        drop(conn);
22714
22715        let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
22716        assert_eq!(inserted, 1);
22717
22718        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22719        let schema_rows = franken_fts_schema_rows(&conn).unwrap();
22720        assert_eq!(
22721            schema_rows, 1,
22722            "DROP TABLE should leave one clean FTS schema"
22723        );
22724        let match_count: i64 = conn
22725            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
22726                row.get_typed(0)
22727            })
22728            .unwrap();
22729        assert_eq!(match_count, 1);
22730    }
22731
22732    // =========================================================================
22733    // Agent storage tests (bead yln.4)
22734    // =========================================================================
22735
22736    #[test]
22737    fn ensure_agent_creates_new() {
22738        let dir = TempDir::new().unwrap();
22739        let db_path = dir.path().join("test.db");
22740        let storage = SqliteStorage::open(&db_path).unwrap();
22741
22742        let agent = Agent {
22743            id: None,
22744            slug: "test_agent".into(),
22745            name: "Test Agent".into(),
22746            version: Some("1.0".into()),
22747            kind: AgentKind::Cli,
22748        };
22749
22750        let id = storage.ensure_agent(&agent).unwrap();
22751        assert!(id > 0);
22752    }
22753
22754    #[test]
22755    fn ensure_agent_returns_existing_id() {
22756        let dir = TempDir::new().unwrap();
22757        let db_path = dir.path().join("test.db");
22758        let storage = SqliteStorage::open(&db_path).unwrap();
22759
22760        let agent = Agent {
22761            id: None,
22762            slug: "codex".into(),
22763            name: "Codex".into(),
22764            version: None,
22765            kind: AgentKind::Cli,
22766        };
22767
22768        let id1 = storage.ensure_agent(&agent).unwrap();
22769        let id2 = storage.ensure_agent(&agent).unwrap();
22770        assert_eq!(id1, id2);
22771    }
22772
22773    #[test]
22774    fn ensure_agent_unchanged_preserves_updated_at() {
22775        let dir = TempDir::new().unwrap();
22776        let db_path = dir.path().join("test.db");
22777        let storage = SqliteStorage::open(&db_path).unwrap();
22778
22779        let agent = Agent {
22780            id: None,
22781            slug: "codex".into(),
22782            name: "Codex".into(),
22783            version: Some("1.0".into()),
22784            kind: AgentKind::Cli,
22785        };
22786
22787        storage.ensure_agent(&agent).unwrap();
22788        let initial_updated_at: i64 = storage
22789            .conn
22790            .query_row_map(
22791                "SELECT updated_at FROM agents WHERE slug = ?1",
22792                fparams![agent.slug.as_str()],
22793                |row| row.get_typed(0),
22794            )
22795            .unwrap();
22796        std::thread::sleep(std::time::Duration::from_millis(5));
22797
22798        storage.ensure_agent(&agent).unwrap();
22799        let fetched_updated_at: i64 = storage
22800            .conn
22801            .query_row_map(
22802                "SELECT updated_at FROM agents WHERE slug = ?1",
22803                fparams![agent.slug.as_str()],
22804                |row| row.get_typed(0),
22805            )
22806            .unwrap();
22807
22808        assert_eq!(fetched_updated_at, initial_updated_at);
22809    }
22810
22811    #[test]
22812    fn ensure_agent_changed_metadata_updates_cached_slug() {
22813        let dir = TempDir::new().unwrap();
22814        let db_path = dir.path().join("test.db");
22815        let storage = SqliteStorage::open(&db_path).unwrap();
22816
22817        let mut agent = Agent {
22818            id: None,
22819            slug: "codex".into(),
22820            name: "Codex".into(),
22821            version: Some("1.0".into()),
22822            kind: AgentKind::Cli,
22823        };
22824
22825        let id1 = storage.ensure_agent(&agent).unwrap();
22826        agent.name = "Codex CLI".into();
22827        agent.version = Some("1.1".into());
22828        let id2 = storage.ensure_agent(&agent).unwrap();
22829
22830        let fetched: (String, Option<String>) = storage
22831            .conn
22832            .query_row_map(
22833                "SELECT name, version FROM agents WHERE slug = ?1",
22834                fparams![agent.slug.as_str()],
22835                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
22836            )
22837            .unwrap();
22838
22839        assert_eq!(id1, id2);
22840        assert_eq!(fetched, ("Codex CLI".into(), Some("1.1".into())));
22841    }
22842
22843    #[test]
22844    fn list_agents_returns_inserted() {
22845        let dir = TempDir::new().unwrap();
22846        let db_path = dir.path().join("test.db");
22847        let storage = SqliteStorage::open(&db_path).unwrap();
22848
22849        let agent = Agent {
22850            id: None,
22851            slug: "new_agent".into(),
22852            name: "New Agent".into(),
22853            version: None,
22854            kind: AgentKind::VsCode,
22855        };
22856        storage.ensure_agent(&agent).unwrap();
22857
22858        let agents = storage.list_agents().unwrap();
22859        assert!(agents.iter().any(|a| a.slug == "new_agent"));
22860    }
22861
22862    // =========================================================================
22863    // Workspace storage tests (bead yln.4)
22864    // =========================================================================
22865
22866    #[test]
22867    fn ensure_workspace_creates_new() {
22868        let dir = TempDir::new().unwrap();
22869        let db_path = dir.path().join("test.db");
22870        let storage = SqliteStorage::open(&db_path).unwrap();
22871
22872        let id = storage
22873            .ensure_workspace(Path::new("/home/user/project"), Some("My Project"))
22874            .unwrap();
22875        assert!(id > 0);
22876    }
22877
22878    #[test]
22879    fn ensure_workspace_returns_existing() {
22880        let dir = TempDir::new().unwrap();
22881        let db_path = dir.path().join("test.db");
22882        let storage = SqliteStorage::open(&db_path).unwrap();
22883
22884        let path = Path::new("/home/user/myproject");
22885        let id1 = storage.ensure_workspace(path, None).unwrap();
22886        let id2 = storage.ensure_workspace(path, None).unwrap();
22887        assert_eq!(id1, id2);
22888    }
22889
22890    #[test]
22891    fn ensure_workspace_changed_display_name_updates_cached_path() {
22892        let dir = TempDir::new().unwrap();
22893        let db_path = dir.path().join("test.db");
22894        let storage = SqliteStorage::open(&db_path).unwrap();
22895
22896        let path = Path::new("/home/user/myproject");
22897        let id1 = storage.ensure_workspace(path, Some("Before")).unwrap();
22898        let id2 = storage.ensure_workspace(path, Some("After")).unwrap();
22899
22900        let display_name: Option<String> = storage
22901            .conn
22902            .query_row_map(
22903                "SELECT display_name FROM workspaces WHERE path = ?1",
22904                fparams![path.to_string_lossy().as_ref()],
22905                |row| row.get_typed(0),
22906            )
22907            .unwrap();
22908
22909        assert_eq!(id1, id2);
22910        assert_eq!(display_name.as_deref(), Some("After"));
22911    }
22912
22913    #[test]
22914    fn list_workspaces_returns_inserted() {
22915        let dir = TempDir::new().unwrap();
22916        let db_path = dir.path().join("test.db");
22917        let storage = SqliteStorage::open(&db_path).unwrap();
22918
22919        storage
22920            .ensure_workspace(Path::new("/test/workspace"), Some("Test WS"))
22921            .unwrap();
22922
22923        let workspaces = storage.list_workspaces().unwrap();
22924        assert!(
22925            workspaces
22926                .iter()
22927                .any(|w| w.path.to_str() == Some("/test/workspace"))
22928        );
22929    }
22930
22931    // =========================================================================
22932    // Source storage tests (bead yln.4)
22933    // =========================================================================
22934
22935    #[test]
22936    fn upsert_source_creates_new() {
22937        let dir = TempDir::new().unwrap();
22938        let db_path = dir.path().join("test.db");
22939        let storage = SqliteStorage::open(&db_path).unwrap();
22940
22941        let source = Source {
22942            id: "test-laptop".into(),
22943            kind: SourceKind::Ssh,
22944            host_label: Some("test.local".into()),
22945            machine_id: Some("test-machine-id".into()),
22946            platform: None,
22947            config_json: None,
22948            created_at: Some(SqliteStorage::now_millis()),
22949            updated_at: None,
22950        };
22951
22952        storage.upsert_source(&source).unwrap();
22953        let fetched = storage.get_source("test-laptop").unwrap();
22954        assert!(fetched.is_some());
22955        assert_eq!(fetched.unwrap().host_label, Some("test.local".into()));
22956    }
22957
22958    #[test]
22959    fn upsert_source_updates_existing() {
22960        let dir = TempDir::new().unwrap();
22961        let db_path = dir.path().join("test.db");
22962        let storage = SqliteStorage::open(&db_path).unwrap();
22963
22964        let source1 = Source {
22965            id: "my-source".into(),
22966            kind: SourceKind::Ssh,
22967            host_label: Some("Original Label".into()),
22968            machine_id: None,
22969            platform: None,
22970            config_json: None,
22971            created_at: Some(SqliteStorage::now_millis()),
22972            updated_at: None,
22973        };
22974        storage.upsert_source(&source1).unwrap();
22975
22976        let source2 = Source {
22977            id: "my-source".into(),
22978            kind: SourceKind::Ssh,
22979            host_label: Some("Updated Label".into()),
22980            machine_id: None,
22981            platform: Some("linux".into()),
22982            config_json: None,
22983            created_at: Some(SqliteStorage::now_millis()),
22984            updated_at: Some(SqliteStorage::now_millis()),
22985        };
22986        storage.upsert_source(&source2).unwrap();
22987
22988        let fetched = storage.get_source("my-source").unwrap().unwrap();
22989        assert_eq!(fetched.host_label, Some("Updated Label".into()));
22990        assert!(fetched.platform.is_some());
22991    }
22992
22993    #[test]
22994    fn upsert_source_unchanged_preserves_updated_at() {
22995        let dir = TempDir::new().unwrap();
22996        let db_path = dir.path().join("test.db");
22997        let storage = SqliteStorage::open(&db_path).unwrap();
22998
22999        let source = Source {
23000            id: "stable-source".into(),
23001            kind: SourceKind::Ssh,
23002            host_label: Some("builder.local".into()),
23003            machine_id: None,
23004            platform: Some("linux".into()),
23005            config_json: Some(serde_json::json!({"role": "bench"})),
23006            created_at: None,
23007            updated_at: None,
23008        };
23009
23010        storage.upsert_source(&source).unwrap();
23011        let initial = storage.get_source("stable-source").unwrap().unwrap();
23012        std::thread::sleep(std::time::Duration::from_millis(5));
23013
23014        storage.upsert_source(&source).unwrap();
23015        let fetched = storage.get_source("stable-source").unwrap().unwrap();
23016
23017        assert_eq!(fetched.created_at, initial.created_at);
23018        assert_eq!(fetched.updated_at, initial.updated_at);
23019        assert_eq!(fetched.host_label, initial.host_label);
23020        assert_eq!(fetched.platform, initial.platform);
23021        assert_eq!(fetched.config_json, initial.config_json);
23022    }
23023
23024    #[test]
23025    fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
23026        let dir = TempDir::new().unwrap();
23027        let db_path = dir.path().join("test.db");
23028        let storage = SqliteStorage::open(&db_path).unwrap();
23029
23030        let conversation = Conversation {
23031            id: None,
23032            agent_slug: "codex".into(),
23033            workspace: Some(PathBuf::from("/ws/cache-recreate")),
23034            external_id: Some("cache-recreate".into()),
23035            title: Some("Cache Recreate".into()),
23036            source_path: PathBuf::from("/log/cache-recreate.jsonl"),
23037            started_at: Some(1_700_000_000_000),
23038            ended_at: Some(1_700_000_000_001),
23039            approx_tokens: Some(16),
23040            metadata_json: serde_json::json!({}),
23041            messages: vec![Message {
23042                id: None,
23043                idx: 0,
23044                role: MessageRole::User,
23045                author: Some("tester".into()),
23046                created_at: Some(1_700_000_000_000),
23047                content: "cache recreate".into(),
23048                extra_json: serde_json::json!({}),
23049                snippets: Vec::new(),
23050            }],
23051            source_id: "cache-remote-source".into(),
23052            origin_host: Some("builder-cache".into()),
23053        };
23054
23055        storage
23056            .ensure_source_for_conversation(&conversation)
23057            .unwrap();
23058        assert!(storage.get_source("cache-remote-source").unwrap().is_some());
23059
23060        let deleted = storage.delete_source("cache-remote-source", false).unwrap();
23061        assert!(deleted);
23062        assert!(storage.get_source("cache-remote-source").unwrap().is_none());
23063
23064        storage
23065            .ensure_source_for_conversation(&conversation)
23066            .unwrap();
23067        let recreated = storage.get_source("cache-remote-source").unwrap();
23068        assert!(recreated.is_some());
23069        assert_eq!(
23070            recreated.unwrap().host_label.as_deref(),
23071            Some("builder-cache")
23072        );
23073    }
23074
23075    #[test]
23076    fn delete_source_removes_entry() {
23077        let dir = TempDir::new().unwrap();
23078        let db_path = dir.path().join("test.db");
23079        let storage = SqliteStorage::open(&db_path).unwrap();
23080
23081        let source = Source {
23082            id: "to-delete".into(),
23083            kind: SourceKind::Local,
23084            host_label: None,
23085            machine_id: None,
23086            platform: None,
23087            config_json: None,
23088            created_at: Some(SqliteStorage::now_millis()),
23089            updated_at: None,
23090        };
23091        storage.upsert_source(&source).unwrap();
23092
23093        let deleted = storage.delete_source("to-delete", false).unwrap();
23094        assert!(deleted);
23095
23096        let fetched = storage.get_source("to-delete").unwrap();
23097        assert!(fetched.is_none());
23098    }
23099
23100    #[test]
23101    fn delete_source_cannot_delete_local() {
23102        let dir = TempDir::new().unwrap();
23103        let db_path = dir.path().join("test.db");
23104        let storage = SqliteStorage::open(&db_path).unwrap();
23105
23106        let result = storage.delete_source(LOCAL_SOURCE_ID, false);
23107        assert!(result.is_err());
23108    }
23109
23110    #[test]
23111    fn list_sources_includes_local() {
23112        let dir = TempDir::new().unwrap();
23113        let db_path = dir.path().join("test.db");
23114        let storage = SqliteStorage::open(&db_path).unwrap();
23115
23116        let sources = storage.list_sources().unwrap();
23117        assert!(sources.iter().any(|s| s.id == LOCAL_SOURCE_ID));
23118    }
23119
23120    #[test]
23121    fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
23122        let dir = TempDir::new().unwrap();
23123        let db_path = dir.path().join("test.db");
23124        let storage = SqliteStorage::open(&db_path).unwrap();
23125
23126        let agent_id = storage
23127            .ensure_agent(&Agent {
23128                id: None,
23129                slug: "codex".into(),
23130                name: "Codex".into(),
23131                version: None,
23132                kind: AgentKind::Cli,
23133            })
23134            .unwrap();
23135
23136        let conversation = Conversation {
23137            id: None,
23138            agent_slug: "codex".into(),
23139            workspace: None,
23140            external_id: Some("blank-local-source".into()),
23141            title: Some("Blank local source".into()),
23142            source_path: dir.path().join("blank-local.jsonl"),
23143            started_at: Some(1_700_000_000_000),
23144            ended_at: Some(1_700_000_000_001),
23145            approx_tokens: None,
23146            metadata_json: serde_json::Value::Null,
23147            messages: vec![Message {
23148                id: None,
23149                idx: 0,
23150                role: MessageRole::User,
23151                author: None,
23152                created_at: Some(1_700_000_000_000),
23153                content: "hello".into(),
23154                extra_json: serde_json::Value::Null,
23155                snippets: Vec::new(),
23156            }],
23157            source_id: "   ".into(),
23158            origin_host: None,
23159        };
23160
23161        storage
23162            .insert_conversation_tree(agent_id, None, &conversation)
23163            .unwrap();
23164
23165        assert!(storage.get_source("   ").unwrap().is_none());
23166        let source = storage
23167            .get_source(LOCAL_SOURCE_ID)
23168            .unwrap()
23169            .expect("local source row should exist");
23170        assert_eq!(source.kind, SourceKind::Local);
23171        assert_eq!(source.host_label, None);
23172
23173        let conversations = storage.list_conversations(10, 0).unwrap();
23174        assert_eq!(conversations.len(), 1);
23175        assert_eq!(conversations[0].source_id, LOCAL_SOURCE_ID);
23176        assert_eq!(conversations[0].origin_host, None);
23177    }
23178
23179    #[test]
23180    fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
23181        let dir = TempDir::new().unwrap();
23182        let db_path = dir.path().join("test.db");
23183        let storage = SqliteStorage::open(&db_path).unwrap();
23184
23185        let agent_id = storage
23186            .ensure_agent(&Agent {
23187                id: None,
23188                slug: "codex".into(),
23189                name: "Codex".into(),
23190                version: None,
23191                kind: AgentKind::Cli,
23192            })
23193            .unwrap();
23194
23195        let bootstrap_updated_at: i64 = storage
23196            .conn
23197            .query_row_map(
23198                "SELECT updated_at FROM sources WHERE id = ?1",
23199                fparams![LOCAL_SOURCE_ID],
23200                |row| row.get_typed(0),
23201            )
23202            .unwrap();
23203
23204        let make_conversation = |external_id: &str, suffix: &str| Conversation {
23205            id: None,
23206            agent_slug: "codex".into(),
23207            workspace: None,
23208            external_id: Some(external_id.into()),
23209            title: Some(format!("Local source {suffix}")),
23210            source_path: dir.path().join(format!("local-{suffix}.jsonl")),
23211            started_at: Some(1_700_000_000_000),
23212            ended_at: Some(1_700_000_000_001),
23213            approx_tokens: None,
23214            metadata_json: serde_json::Value::Null,
23215            messages: vec![Message {
23216                id: None,
23217                idx: 0,
23218                role: MessageRole::User,
23219                author: None,
23220                created_at: Some(1_700_000_000_000),
23221                content: format!("hello-{suffix}"),
23222                extra_json: serde_json::Value::Null,
23223                snippets: Vec::new(),
23224            }],
23225            source_id: LOCAL_SOURCE_ID.into(),
23226            origin_host: None,
23227        };
23228
23229        std::thread::sleep(std::time::Duration::from_millis(5));
23230        storage
23231            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-1", "one"))
23232            .unwrap();
23233        let after_first_insert: i64 = storage
23234            .conn
23235            .query_row_map(
23236                "SELECT updated_at FROM sources WHERE id = ?1",
23237                fparams![LOCAL_SOURCE_ID],
23238                |row| row.get_typed(0),
23239            )
23240            .unwrap();
23241
23242        std::thread::sleep(std::time::Duration::from_millis(5));
23243        storage
23244            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-2", "two"))
23245            .unwrap();
23246        let after_second_insert: i64 = storage
23247            .conn
23248            .query_row_map(
23249                "SELECT updated_at FROM sources WHERE id = ?1",
23250                fparams![LOCAL_SOURCE_ID],
23251                |row| row.get_typed(0),
23252            )
23253            .unwrap();
23254
23255        assert_eq!(after_first_insert, bootstrap_updated_at);
23256        assert_eq!(after_second_insert, bootstrap_updated_at);
23257    }
23258
23259    #[test]
23260    fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
23261        let dir = TempDir::new().unwrap();
23262        let db_path = dir.path().join("test.db");
23263        let storage = SqliteStorage::open(&db_path).unwrap();
23264
23265        let agent_id = storage
23266            .ensure_agent(&Agent {
23267                id: None,
23268                slug: "codex".into(),
23269                name: "Codex".into(),
23270                version: None,
23271                kind: AgentKind::Cli,
23272            })
23273            .unwrap();
23274
23275        let conversation = Conversation {
23276            id: None,
23277            agent_slug: "codex".into(),
23278            workspace: None,
23279            external_id: Some("blank-remote-source".into()),
23280            title: Some("Blank remote source".into()),
23281            source_path: dir.path().join("blank-remote.jsonl"),
23282            started_at: Some(1_700_000_000_000),
23283            ended_at: Some(1_700_000_000_001),
23284            approx_tokens: None,
23285            metadata_json: serde_json::Value::Null,
23286            messages: vec![Message {
23287                id: None,
23288                idx: 0,
23289                role: MessageRole::User,
23290                author: None,
23291                created_at: Some(1_700_000_000_000),
23292                content: "hello".into(),
23293                extra_json: serde_json::Value::Null,
23294                snippets: Vec::new(),
23295            }],
23296            source_id: "   ".into(),
23297            origin_host: Some("user@work-laptop".into()),
23298        };
23299
23300        storage
23301            .insert_conversation_tree(agent_id, None, &conversation)
23302            .unwrap();
23303
23304        assert!(storage.get_source("   ").unwrap().is_none());
23305        let source = storage
23306            .get_source("user@work-laptop")
23307            .unwrap()
23308            .expect("normalized remote source row should exist");
23309        assert_eq!(source.kind, SourceKind::Ssh);
23310        assert_eq!(source.host_label.as_deref(), Some("user@work-laptop"));
23311
23312        let conversations = storage.list_conversations(10, 0).unwrap();
23313        assert_eq!(conversations.len(), 1);
23314        assert_eq!(conversations[0].source_id, "user@work-laptop");
23315        assert_eq!(
23316            conversations[0].origin_host.as_deref(),
23317            Some("user@work-laptop")
23318        );
23319    }
23320
23321    #[test]
23322    fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
23323        let dir = TempDir::new().unwrap();
23324        let db_path = dir.path().join("test.db");
23325        let storage = SqliteStorage::open(&db_path).unwrap();
23326
23327        let agent_id = storage
23328            .ensure_agent(&Agent {
23329                id: None,
23330                slug: "codex".into(),
23331                name: "Codex".into(),
23332                version: None,
23333                kind: AgentKind::Cli,
23334            })
23335            .unwrap();
23336
23337        let conversation = Conversation {
23338            id: None,
23339            agent_slug: "codex".into(),
23340            workspace: None,
23341            external_id: Some("batched-blank-remote-source".into()),
23342            title: Some("Batched blank remote source".into()),
23343            source_path: dir.path().join("batched-blank-remote.jsonl"),
23344            started_at: Some(1_700_000_000_000),
23345            ended_at: Some(1_700_000_000_001),
23346            approx_tokens: None,
23347            metadata_json: serde_json::Value::Null,
23348            messages: vec![Message {
23349                id: None,
23350                idx: 0,
23351                role: MessageRole::User,
23352                author: None,
23353                created_at: Some(1_700_000_000_000),
23354                content: "hello".into(),
23355                extra_json: serde_json::Value::Null,
23356                snippets: Vec::new(),
23357            }],
23358            source_id: "   ".into(),
23359            origin_host: Some("user@batch-host".into()),
23360        };
23361
23362        storage
23363            .insert_conversations_batched(&[(agent_id, None, &conversation)])
23364            .unwrap();
23365
23366        assert!(storage.get_source("   ").unwrap().is_none());
23367        let source = storage
23368            .get_source("user@batch-host")
23369            .unwrap()
23370            .expect("normalized batched remote source row should exist");
23371        assert_eq!(source.kind, SourceKind::Ssh);
23372        assert_eq!(source.host_label.as_deref(), Some("user@batch-host"));
23373
23374        let conversations = storage.list_conversations(10, 0).unwrap();
23375        assert_eq!(conversations.len(), 1);
23376        assert_eq!(conversations[0].source_id, "user@batch-host");
23377        assert_eq!(
23378            conversations[0].origin_host.as_deref(),
23379            Some("user@batch-host")
23380        );
23381    }
23382
23383    #[test]
23384    fn get_source_ids_excludes_local() {
23385        let dir = TempDir::new().unwrap();
23386        let db_path = dir.path().join("test.db");
23387        let storage = SqliteStorage::open(&db_path).unwrap();
23388
23389        // Add a non-local source
23390        let source = Source {
23391            id: "remote-1".into(),
23392            kind: SourceKind::Ssh,
23393            host_label: Some("server".into()),
23394            machine_id: None,
23395            platform: None,
23396            config_json: None,
23397            created_at: Some(SqliteStorage::now_millis()),
23398            updated_at: None,
23399        };
23400        storage.upsert_source(&source).unwrap();
23401
23402        let ids = storage.get_source_ids().unwrap();
23403        assert!(!ids.contains(&LOCAL_SOURCE_ID.to_string()));
23404        assert!(ids.contains(&"remote-1".to_string()));
23405    }
23406
23407    // =========================================================================
23408    // Scan timestamp tests (bead yln.4)
23409    // =========================================================================
23410
23411    #[test]
23412    fn get_last_scan_ts_returns_none_initially() {
23413        let dir = TempDir::new().unwrap();
23414        let db_path = dir.path().join("test.db");
23415        let storage = SqliteStorage::open(&db_path).unwrap();
23416
23417        let ts = storage.get_last_scan_ts().unwrap();
23418        assert!(ts.is_none());
23419    }
23420
23421    #[test]
23422    fn set_and_get_last_scan_ts() {
23423        let dir = TempDir::new().unwrap();
23424        let db_path = dir.path().join("test.db");
23425        let storage = SqliteStorage::open(&db_path).unwrap();
23426
23427        let expected_ts = 1700000000000_i64;
23428        storage.set_last_scan_ts(expected_ts).unwrap();
23429
23430        let actual_ts = storage.get_last_scan_ts().unwrap();
23431        assert_eq!(actual_ts, Some(expected_ts));
23432    }
23433
23434    // =========================================================================
23435    // now_millis utility test (bead yln.4)
23436    // =========================================================================
23437
23438    #[test]
23439    fn now_millis_returns_reasonable_value() {
23440        let ts = SqliteStorage::now_millis();
23441        // Should be after Jan 1, 2020 (approx 1577836800000)
23442        assert!(ts > 1577836800000);
23443        // Should be before Jan 1, 2100 (approx 4102444800000)
23444        assert!(ts < 4102444800000);
23445    }
23446
23447    // =========================================================================
23448    // Binary Metadata Serialization Tests (Opt 3.1)
23449    // =========================================================================
23450
23451    #[test]
23452    fn msgpack_roundtrip_basic_object() {
23453        let value = serde_json::json!({
23454            "key": "value",
23455            "number": 42,
23456            "nested": { "inner": true }
23457        });
23458
23459        let bytes = serialize_json_to_msgpack(&value).expect("should serialize");
23460        let recovered = deserialize_msgpack_to_json(&bytes);
23461
23462        assert_eq!(value, recovered);
23463    }
23464
23465    #[test]
23466    fn msgpack_returns_none_for_null() {
23467        let value = serde_json::Value::Null;
23468        assert!(serialize_json_to_msgpack(&value).is_none());
23469    }
23470
23471    #[test]
23472    fn message_insert_stores_null_extra_json_as_sql_null() {
23473        let dir = TempDir::new().unwrap();
23474        let db_path = dir.path().join("test.db");
23475        let storage = SqliteStorage::open(&db_path).unwrap();
23476        let agent_id = storage
23477            .ensure_agent(&Agent {
23478                id: None,
23479                slug: "codex".into(),
23480                name: "Codex".into(),
23481                version: None,
23482                kind: AgentKind::Cli,
23483            })
23484            .unwrap();
23485        let conversation = Conversation {
23486            id: None,
23487            agent_slug: "codex".into(),
23488            workspace: None,
23489            external_id: Some("null-extra-json".into()),
23490            title: Some("Null extra_json".into()),
23491            source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
23492            started_at: Some(1_700_000_000_000),
23493            ended_at: Some(1_700_000_000_001),
23494            approx_tokens: None,
23495            metadata_json: serde_json::Value::Null,
23496            messages: vec![Message {
23497                id: None,
23498                idx: 0,
23499                role: MessageRole::User,
23500                author: None,
23501                created_at: Some(1_700_000_000_000),
23502                content: "null metadata message".into(),
23503                extra_json: serde_json::Value::Null,
23504                snippets: Vec::new(),
23505            }],
23506            source_id: LOCAL_SOURCE_ID.into(),
23507            origin_host: None,
23508        };
23509
23510        let conversation_id = storage
23511            .insert_conversation_tree(agent_id, None, &conversation)
23512            .unwrap()
23513            .conversation_id;
23514
23515        let (extra_json, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23516            .conn
23517            .query_row_map(
23518                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23519                fparams![conversation_id],
23520                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23521            )
23522            .unwrap();
23523        assert!(extra_json.is_none());
23524        assert!(extra_bin.is_none());
23525
23526        let stored = storage.fetch_messages(conversation_id).unwrap();
23527        assert!(stored[0].extra_json.is_null());
23528    }
23529
23530    #[test]
23531    fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
23532        let dir = TempDir::new().unwrap();
23533        let db_path = dir.path().join("test.db");
23534        let storage = SqliteStorage::open(&db_path).unwrap();
23535        let agent_id = storage
23536            .ensure_agent(&Agent {
23537                id: None,
23538                slug: "codex".into(),
23539                name: "Codex".into(),
23540                version: None,
23541                kind: AgentKind::Cli,
23542            })
23543            .unwrap();
23544        let extra_json = serde_json::json!({ "idx": 7, "kind": "profile" });
23545        let conversation = Conversation {
23546            id: None,
23547            agent_slug: "codex".into(),
23548            workspace: None,
23549            external_id: Some("msgpack-extra-json".into()),
23550            title: Some("MessagePack extra_json".into()),
23551            source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
23552            started_at: Some(1_700_000_000_000),
23553            ended_at: Some(1_700_000_000_001),
23554            approx_tokens: None,
23555            metadata_json: serde_json::Value::Null,
23556            messages: vec![Message {
23557                id: None,
23558                idx: 0,
23559                role: MessageRole::User,
23560                author: None,
23561                created_at: Some(1_700_000_000_000),
23562                content: "msgpack metadata message".into(),
23563                extra_json: extra_json.clone(),
23564                snippets: Vec::new(),
23565            }],
23566            source_id: LOCAL_SOURCE_ID.into(),
23567            origin_host: None,
23568        };
23569
23570        let conversation_id = storage
23571            .insert_conversation_tree(agent_id, None, &conversation)
23572            .unwrap()
23573            .conversation_id;
23574
23575        let (extra_json_text, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23576            .conn
23577            .query_row_map(
23578                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23579                fparams![conversation_id],
23580                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23581            )
23582            .unwrap();
23583        assert!(extra_json_text.is_none());
23584        assert!(extra_bin.is_some());
23585
23586        let stored = storage.fetch_messages(conversation_id).unwrap();
23587        assert_eq!(stored[0].extra_json, extra_json);
23588    }
23589
23590    #[test]
23591    fn conversation_insert_preserves_null_metadata_json_as_json_null() {
23592        let dir = TempDir::new().unwrap();
23593        let db_path = dir.path().join("test.db");
23594        let storage = SqliteStorage::open(&db_path).unwrap();
23595        let agent_id = storage
23596            .ensure_agent(&Agent {
23597                id: None,
23598                slug: "codex".into(),
23599                name: "Codex".into(),
23600                version: None,
23601                kind: AgentKind::Cli,
23602            })
23603            .unwrap();
23604        let conversation = Conversation {
23605            id: None,
23606            agent_slug: "codex".into(),
23607            workspace: None,
23608            external_id: Some("null-conversation-metadata".into()),
23609            title: Some("Null conversation metadata".into()),
23610            source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
23611            started_at: Some(1_700_000_000_000),
23612            ended_at: Some(1_700_000_000_001),
23613            approx_tokens: None,
23614            metadata_json: serde_json::Value::Null,
23615            messages: vec![Message {
23616                id: None,
23617                idx: 0,
23618                role: MessageRole::User,
23619                author: None,
23620                created_at: Some(1_700_000_000_000),
23621                content: "null conversation metadata message".into(),
23622                extra_json: serde_json::Value::Null,
23623                snippets: Vec::new(),
23624            }],
23625            source_id: LOCAL_SOURCE_ID.into(),
23626            origin_host: None,
23627        };
23628
23629        storage
23630            .insert_conversation_tree(agent_id, None, &conversation)
23631            .unwrap();
23632
23633        let (metadata_json, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23634            .conn
23635            .query_row_map(
23636                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23637                fparams!["null-conversation-metadata"],
23638                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23639            )
23640            .unwrap();
23641        assert_eq!(metadata_json.as_deref(), Some("null"));
23642        assert!(metadata_bin.is_none());
23643
23644        let listed = storage.list_conversations(10, 0).unwrap();
23645        assert!(listed[0].metadata_json.is_null());
23646    }
23647
23648    #[test]
23649    fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
23650        let dir = TempDir::new().unwrap();
23651        let db_path = dir.path().join("test.db");
23652        let storage = SqliteStorage::open(&db_path).unwrap();
23653        let agent_id = storage
23654            .ensure_agent(&Agent {
23655                id: None,
23656                slug: "codex".into(),
23657                name: "Codex".into(),
23658                version: None,
23659                kind: AgentKind::Cli,
23660            })
23661            .unwrap();
23662        let metadata_json = serde_json::json!({ "bench": true, "source": "profile" });
23663        let conversation = Conversation {
23664            id: None,
23665            agent_slug: "codex".into(),
23666            workspace: None,
23667            external_id: Some("msgpack-conversation-metadata".into()),
23668            title: Some("MessagePack conversation metadata".into()),
23669            source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
23670            started_at: Some(1_700_000_000_000),
23671            ended_at: Some(1_700_000_000_001),
23672            approx_tokens: None,
23673            metadata_json: metadata_json.clone(),
23674            messages: vec![Message {
23675                id: None,
23676                idx: 0,
23677                role: MessageRole::User,
23678                author: None,
23679                created_at: Some(1_700_000_000_000),
23680                content: "msgpack conversation metadata message".into(),
23681                extra_json: serde_json::Value::Null,
23682                snippets: Vec::new(),
23683            }],
23684            source_id: LOCAL_SOURCE_ID.into(),
23685            origin_host: None,
23686        };
23687
23688        storage
23689            .insert_conversation_tree(agent_id, None, &conversation)
23690            .unwrap();
23691
23692        let (metadata_text, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23693            .conn
23694            .query_row_map(
23695                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23696                fparams!["msgpack-conversation-metadata"],
23697                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23698            )
23699            .unwrap();
23700        assert!(metadata_text.is_none());
23701        assert!(metadata_bin.is_some());
23702
23703        let listed = storage.list_conversations(10, 0).unwrap();
23704        assert_eq!(listed[0].metadata_json, metadata_json);
23705    }
23706
23707    #[test]
23708    fn msgpack_returns_none_for_empty_object() {
23709        let value = serde_json::json!({});
23710        assert!(serialize_json_to_msgpack(&value).is_none());
23711    }
23712
23713    #[test]
23714    fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
23715        let raw = format!("{{\"blob\":\"{}\"}}", "x".repeat(1_000_000));
23716
23717        let value = parse_historical_json_column(Some(raw.clone()));
23718
23719        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23720        assert_eq!(json_value_size_hint(&value), raw.len());
23721    }
23722
23723    #[test]
23724    fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
23725        let raw = String::from("{\"ok\":true,\"n\":1}");
23726
23727        let value = parse_historical_json_column(Some(raw.clone()));
23728
23729        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23730    }
23731
23732    #[test]
23733    fn msgpack_serializes_non_empty_array() {
23734        let value = serde_json::json!([1, 2, 3]);
23735        let bytes = serialize_json_to_msgpack(&value).expect("should serialize array");
23736        let recovered = deserialize_msgpack_to_json(&bytes);
23737        assert_eq!(value, recovered);
23738    }
23739
23740    #[test]
23741    fn msgpack_smaller_than_json() {
23742        let value = serde_json::json!({
23743            "field_name_one": "some_value",
23744            "field_name_two": 123456,
23745            "field_name_three": [1, 2, 3, 4, 5],
23746            "field_name_four": { "nested": true }
23747        });
23748
23749        let json_bytes = serde_json::to_vec(&value).unwrap();
23750        let msgpack_bytes = serialize_json_to_msgpack(&value).unwrap();
23751
23752        // MessagePack should be smaller due to more compact encoding
23753        assert!(
23754            msgpack_bytes.len() < json_bytes.len(),
23755            "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
23756            msgpack_bytes.len(),
23757            json_bytes.len()
23758        );
23759    }
23760
23761    #[test]
23762    fn migration_v7_adds_binary_columns() {
23763        let dir = TempDir::new().unwrap();
23764        let db_path = dir.path().join("test.db");
23765        let storage = SqliteStorage::open(&db_path).unwrap();
23766
23767        // Verify metadata_bin column exists
23768        let has_metadata_bin = storage
23769            .raw()
23770            .query("PRAGMA table_info(conversations)")
23771            .unwrap()
23772            .iter()
23773            .any(|row| row.get_typed::<String>(1).unwrap() == "metadata_bin");
23774        assert!(
23775            has_metadata_bin,
23776            "conversations should have metadata_bin column"
23777        );
23778
23779        // Verify extra_bin column exists
23780        let has_extra_bin = storage
23781            .raw()
23782            .query("PRAGMA table_info(messages)")
23783            .unwrap()
23784            .iter()
23785            .any(|row| row.get_typed::<String>(1).unwrap() == "extra_bin");
23786        assert!(has_extra_bin, "messages should have extra_bin column");
23787    }
23788
23789    #[test]
23790    fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
23791        let dir = TempDir::new().unwrap();
23792        let db_path = dir.path().join("append-tail-state-cache.db");
23793        let storage = SqliteStorage::open(&db_path).unwrap();
23794        let agent_id = storage
23795            .ensure_agent(&Agent {
23796                id: None,
23797                slug: "codex".into(),
23798                name: "Codex".into(),
23799                version: None,
23800                kind: AgentKind::Cli,
23801            })
23802            .unwrap();
23803        let workspace = PathBuf::from("/ws/profiled-append-remote");
23804        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
23805
23806        let initial = make_profiled_append_remote_merge_conversation(11, 5);
23807        let insert_outcome = storage
23808            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
23809            .unwrap();
23810        let conversation_id = insert_outcome.conversation_id;
23811
23812        let initial_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23813            .raw()
23814            .query_row_map(
23815                "SELECT ended_at, last_message_idx, last_message_created_at
23816                 FROM conversation_tail_state
23817                 WHERE conversation_id = ?1",
23818                fparams![conversation_id],
23819                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23820            )
23821            .unwrap();
23822        assert_eq!(initial_tail, (Some(111_005), Some(4), Some(111_004)));
23823
23824        storage
23825            .raw()
23826            .execute_compat(
23827                "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
23828                fparams![111_999_i64, conversation_id],
23829            )
23830            .unwrap();
23831        storage
23832            .raw()
23833            .execute_compat(
23834                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
23835                fparams![conversation_id],
23836            )
23837            .unwrap();
23838
23839        let appended = make_profiled_append_remote_merge_conversation(11, 10);
23840        let append_outcome = storage
23841            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
23842            .unwrap();
23843        assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);
23844
23845        let final_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23846            .raw()
23847            .query_row_map(
23848                "SELECT ended_at, last_message_idx, last_message_created_at
23849                 FROM conversation_tail_state
23850                 WHERE conversation_id = ?1",
23851                fparams![conversation_id],
23852                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23853            )
23854            .unwrap();
23855        assert_eq!(final_tail, (Some(111_999), Some(9), Some(111_009)));
23856    }
23857
23858    #[test]
23859    fn msgpack_deserialize_empty_returns_default() {
23860        let recovered = deserialize_msgpack_to_json(&[]);
23861        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23862    }
23863
23864    #[test]
23865    fn msgpack_deserialize_garbage_returns_default() {
23866        // Use truncated msgpack data that will fail to parse
23867        // 0x85 indicates a fixmap with 5 elements, but we don't provide them
23868        let recovered = deserialize_msgpack_to_json(&[0x85]);
23869        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23870    }
23871
23872    #[test]
23873    fn stats_aggregator_collects_and_expands() {
23874        let mut agg = StatsAggregator::new();
23875        assert!(agg.is_empty());
23876
23877        // Record some stats
23878        // Day 100, agent "claude", source "local"
23879        agg.record("claude", "local", 100, 5, 500);
23880        // Day 100, agent "codex", source "local"
23881        agg.record("codex", "local", 100, 3, 300);
23882        // Day 101, agent "claude", source "local"
23883        agg.record("claude", "local", 101, 2, 200);
23884
23885        assert!(!agg.is_empty());
23886        assert_eq!(agg.raw_entry_count(), 3);
23887
23888        let entries = agg.expand();
23889        // Each raw entry expands to 4 permutations.
23890        // But (all, local) and (all, all) will aggregate.
23891        //
23892        // Raw:
23893        // 1. (100, claude, local) -> 1 sess, 5 msgs, 500 chars
23894        // 2. (100, codex, local)  -> 1 sess, 3 msgs, 300 chars
23895        // 3. (101, claude, local) -> 1 sess, 2 msgs, 200 chars
23896        //
23897        // Expanded 1 (day 100):
23898        // - (100, claude, local): 1 sess, 5 msgs, 500 chars
23899        // - (100, all, local):    1 (from claude) + 1 (from codex) = 2 sess, 8 msgs, 800 chars
23900        // - (100, claude, all):   1 sess, 5 msgs, 500 chars
23901        // - (100, codex, local):  1 sess, 3 msgs, 300 chars
23902        // - (100, codex, all):    1 sess, 3 msgs, 300 chars
23903        // - (100, all, all):      2 sess, 8 msgs, 800 chars
23904        //
23905        // Expanded 3 (day 101):
23906        // - (101, claude, local): 1 sess, 2 msgs, 200 chars
23907        // - (101, all, local):    1 sess, 2 msgs, 200 chars
23908        // - (101, claude, all):   1 sess, 2 msgs, 200 chars
23909        // - (101, all, all):      1 sess, 2 msgs, 200 chars
23910        //
23911        // Total unique keys in expanded map:
23912        // Day 100: (claude, local), (codex, local), (all, local), (claude, all), (codex, all), (all, all) = 6
23913        // Day 101: (claude, local), (all, local), (claude, all), (all, all) = 4
23914        // Total = 10 entries
23915
23916        assert_eq!(entries.len(), 10);
23917
23918        // Verify totals for day 100, all/all
23919        let day100_all = entries
23920            .iter()
23921            .find(|(d, a, s, _)| *d == 100 && a == "all" && s == "all")
23922            .unwrap();
23923        assert_eq!(day100_all.3.session_count_delta, 2);
23924        assert_eq!(day100_all.3.message_count_delta, 8);
23925        assert_eq!(day100_all.3.total_chars_delta, 800);
23926    }
23927
23928    // =========================================================================
23929    // LazyFrankenDb tests (bd-1ueu)
23930    // =========================================================================
23931
23932    #[test]
23933    fn lazy_franken_db_not_open_before_get() {
23934        let dir = TempDir::new().unwrap();
23935        let db_path = dir.path().join("lazy_test.db");
23936
23937        // Create a real DB so the path exists
23938        let _storage = SqliteStorage::open(&db_path).unwrap();
23939
23940        let lazy = LazyFrankenDb::new(db_path);
23941        assert!(
23942            !lazy.is_open(),
23943            "LazyFrankenDb must not open on construction"
23944        );
23945    }
23946
23947    #[test]
23948    fn lazy_franken_db_opens_on_first_get() {
23949        let dir = TempDir::new().unwrap();
23950        let db_path = dir.path().join("lazy_test.db");
23951
23952        // Create a real DB so the path exists
23953        let _storage = SqliteStorage::open(&db_path).unwrap();
23954        drop(_storage);
23955
23956        let lazy = LazyFrankenDb::new(db_path);
23957        assert!(!lazy.is_open());
23958
23959        let conn = lazy.get("test").expect("should open successfully");
23960        let count: i64 = conn
23961            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |r| {
23962                r.get_typed(0)
23963            })
23964            .unwrap();
23965        assert_eq!(count, 0);
23966        drop(conn);
23967
23968        assert!(lazy.is_open(), "LazyFrankenDb must be open after get()");
23969    }
23970
23971    #[test]
23972    fn lazy_franken_db_reuses_connection() {
23973        let dir = TempDir::new().unwrap();
23974        let db_path = dir.path().join("lazy_test.db");
23975        let _storage = SqliteStorage::open(&db_path).unwrap();
23976        drop(_storage);
23977
23978        let lazy = LazyFrankenDb::new(db_path);
23979
23980        // First access opens
23981        {
23982            let conn = lazy.get("first").unwrap();
23983            conn.execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
23984                .unwrap();
23985        }
23986
23987        // Second access reuses (table still exists)
23988        {
23989            let conn = lazy.get("second").unwrap();
23990            let count: i64 = conn
23991                .query_row_map("SELECT COUNT(*) FROM test_tbl", fparams![], |r| {
23992                    r.get_typed(0)
23993                })
23994                .unwrap();
23995            assert_eq!(count, 0);
23996        }
23997    }
23998
23999    #[test]
24000    fn lazy_franken_db_not_found_error() {
24001        let dir = TempDir::new().unwrap();
24002        let db_path = dir.path().join("nonexistent.db");
24003
24004        let lazy = LazyFrankenDb::new(db_path);
24005        let result = lazy.get("test");
24006        assert!(result.is_err());
24007        assert!(
24008            matches!(result.unwrap_err(), LazyDbError::NotFound(_)),
24009            "should return NotFound for missing DB"
24010        );
24011    }
24012
24013    #[test]
24014    fn lazy_franken_db_path_accessor() {
24015        let path = PathBuf::from("/tmp/test_lazy.db");
24016        let lazy = LazyFrankenDb::new(path.clone());
24017        assert_eq!(lazy.path(), path.as_path());
24018    }
24019
24020    // =========================================================================
24021    // Pricing / cost estimation tests (bead z9fse.10)
24022    // =========================================================================
24023
24024    #[test]
24025    fn sql_like_match_basic_patterns() {
24026        assert!(sql_like_match("claude-opus-4-20250101", "claude-opus-4%"));
24027        assert!(sql_like_match("claude-opus-4", "claude-opus-4%"));
24028        assert!(!sql_like_match("claude-sonnet-4", "claude-opus-4%"));
24029
24030        // Middle wildcard (gemini pattern)
24031        assert!(sql_like_match("gemini-2.0-flash-001", "gemini-2%flash%"));
24032        assert!(sql_like_match("gemini-2-flash", "gemini-2%flash%"));
24033        assert!(!sql_like_match("gemini-2-pro", "gemini-2%flash%"));
24034
24035        // Exact match
24036        assert!(sql_like_match("hello", "hello"));
24037        assert!(!sql_like_match("hello!", "hello"));
24038
24039        // Underscore wildcard
24040        assert!(sql_like_match("gpt-4o", "gpt-4_"));
24041        assert!(!sql_like_match("gpt-4oo", "gpt-4_"));
24042
24043        // Case insensitive
24044        assert!(sql_like_match("Claude-Opus-4", "claude-opus-4%"));
24045    }
24046
24047    #[test]
24048    fn date_str_to_day_id_converts_correctly() {
24049        // 2025-10-01 is 2100 days after 2020-01-01
24050        assert_eq!(date_str_to_day_id("2025-10-01").unwrap(), 2100);
24051        // 2024-04-01 is 1552 days after 2020-01-01
24052        assert_eq!(date_str_to_day_id("2024-04-01").unwrap(), 1552);
24053        assert!(date_str_to_day_id("invalid").is_err());
24054    }
24055
24056    #[test]
24057    fn pricing_table_lookup_selects_matching_entry() {
24058        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24059        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
24060        let table = PricingTable {
24061            entries: vec![
24062                PricingEntry {
24063                    model_pattern: "claude-opus-4%".into(),
24064                    provider: "anthropic".into(),
24065                    input_cost_per_mtok: 15.0,
24066                    output_cost_per_mtok: 75.0,
24067                    cache_read_cost_per_mtok: Some(1.5),
24068                    cache_creation_cost_per_mtok: Some(18.75),
24069                    effective_day_id: effective_day,
24070                },
24071                PricingEntry {
24072                    model_pattern: "claude-sonnet-4%".into(),
24073                    provider: "anthropic".into(),
24074                    input_cost_per_mtok: 3.0,
24075                    output_cost_per_mtok: 15.0,
24076                    cache_read_cost_per_mtok: Some(0.3),
24077                    cache_creation_cost_per_mtok: Some(3.75),
24078                    effective_day_id: effective_day,
24079                },
24080            ],
24081        };
24082
24083        let result = table.lookup("claude-opus-4-20260101", lookup_day);
24084        assert!(result.is_some());
24085        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
24086
24087        let result = table.lookup("claude-sonnet-4-latest", lookup_day);
24088        assert!(result.is_some());
24089        assert_eq!(result.unwrap().input_cost_per_mtok, 3.0);
24090
24091        assert!(table.lookup("unknown-model", lookup_day).is_none());
24092    }
24093
24094    #[test]
24095    fn pricing_table_lookup_respects_effective_date() {
24096        let effective_day_1 = date_str_to_day_id("2025-10-01").unwrap();
24097        let effective_day_2 = date_str_to_day_id("2026-01-01").unwrap();
24098        let table = PricingTable {
24099            entries: vec![
24100                PricingEntry {
24101                    model_pattern: "claude-opus-4%".into(),
24102                    provider: "anthropic".into(),
24103                    input_cost_per_mtok: 15.0,
24104                    output_cost_per_mtok: 75.0,
24105                    cache_read_cost_per_mtok: None,
24106                    cache_creation_cost_per_mtok: None,
24107                    effective_day_id: effective_day_1,
24108                },
24109                PricingEntry {
24110                    model_pattern: "claude-opus-4%".into(),
24111                    provider: "anthropic".into(),
24112                    input_cost_per_mtok: 12.0,
24113                    output_cost_per_mtok: 60.0,
24114                    cache_read_cost_per_mtok: None,
24115                    cache_creation_cost_per_mtok: None,
24116                    effective_day_id: effective_day_2,
24117                },
24118            ],
24119        };
24120
24121        // Before price drop
24122        let result = table.lookup("claude-opus-4", date_str_to_day_id("2025-11-01").unwrap());
24123        assert!(result.is_some());
24124        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
24125
24126        // After price drop
24127        let result = table.lookup("claude-opus-4", date_str_to_day_id("2026-02-01").unwrap());
24128        assert!(result.is_some());
24129        assert_eq!(result.unwrap().input_cost_per_mtok, 12.0);
24130
24131        // Before all pricing
24132        assert!(
24133            table
24134                .lookup("claude-opus-4", date_str_to_day_id("2024-01-01").unwrap())
24135                .is_none()
24136        );
24137    }
24138
24139    #[test]
24140    fn pricing_table_lookup_specificity_tiebreak() {
24141        let effective_day = date_str_to_day_id("2025-01-01").unwrap();
24142        let lookup_day = date_str_to_day_id("2026-01-01").unwrap();
24143        let table = PricingTable {
24144            entries: vec![
24145                PricingEntry {
24146                    model_pattern: "gpt-4%".into(),
24147                    provider: "openai".into(),
24148                    input_cost_per_mtok: 10.0,
24149                    output_cost_per_mtok: 30.0,
24150                    cache_read_cost_per_mtok: None,
24151                    cache_creation_cost_per_mtok: None,
24152                    effective_day_id: effective_day,
24153                },
24154                PricingEntry {
24155                    model_pattern: "gpt-4-turbo%".into(),
24156                    provider: "openai".into(),
24157                    input_cost_per_mtok: 5.0,
24158                    output_cost_per_mtok: 15.0,
24159                    cache_read_cost_per_mtok: None,
24160                    cache_creation_cost_per_mtok: None,
24161                    effective_day_id: effective_day,
24162                },
24163            ],
24164        };
24165
24166        // Longer pattern wins for specific model
24167        let result = table.lookup("gpt-4-turbo-2025", lookup_day);
24168        assert!(result.is_some());
24169        assert_eq!(result.unwrap().input_cost_per_mtok, 5.0);
24170
24171        // Shorter pattern matches broader model
24172        let result = table.lookup("gpt-4o", lookup_day);
24173        assert!(result.is_some());
24174        assert_eq!(result.unwrap().input_cost_per_mtok, 10.0);
24175    }
24176
24177    #[test]
24178    fn pricing_table_compute_cost_basic() {
24179        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24180        let table = PricingTable {
24181            entries: vec![PricingEntry {
24182                model_pattern: "claude-opus-4%".into(),
24183                provider: "anthropic".into(),
24184                input_cost_per_mtok: 15.0,
24185                output_cost_per_mtok: 75.0,
24186                cache_read_cost_per_mtok: Some(1.5),
24187                cache_creation_cost_per_mtok: Some(18.75),
24188                effective_day_id: effective_day,
24189            }],
24190        };
24191
24192        let cost = table.compute_cost(
24193            Some("claude-opus-4-latest"),
24194            date_str_to_day_id("2026-02-06").unwrap(),
24195            Some(1000),
24196            Some(500),
24197            None,
24198            None,
24199        );
24200        assert!(cost.is_some());
24201        // 1000 * 15.0 / 1M + 500 * 75.0 / 1M = 0.015 + 0.0375 = 0.0525
24202        assert!((cost.unwrap() - 0.0525).abs() < 1e-10);
24203    }
24204
24205    #[test]
24206    fn pricing_table_compute_cost_with_cache() {
24207        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24208        let table = PricingTable {
24209            entries: vec![PricingEntry {
24210                model_pattern: "claude-opus-4%".into(),
24211                provider: "anthropic".into(),
24212                input_cost_per_mtok: 15.0,
24213                output_cost_per_mtok: 75.0,
24214                cache_read_cost_per_mtok: Some(1.5),
24215                cache_creation_cost_per_mtok: Some(18.75),
24216                effective_day_id: effective_day,
24217            }],
24218        };
24219
24220        let cost = table.compute_cost(
24221            Some("claude-opus-4-latest"),
24222            date_str_to_day_id("2026-02-06").unwrap(),
24223            Some(1_000_000),
24224            Some(100_000),
24225            Some(500_000),
24226            Some(200_000),
24227        );
24228        assert!(cost.is_some());
24229        // input excludes cache tokens to avoid double-charging them at both the
24230        // full input rate and the cache-specific rates.
24231        // non-cache input: 300K * 15/1M = 4.5, output: 100K * 75/1M = 7.5
24232        // cache_read: 500K * 1.5/1M = 0.75, cache_creation: 200K * 18.75/1M = 3.75
24233        // total = 16.5
24234        assert!((cost.unwrap() - 16.5).abs() < 1e-10);
24235    }
24236
24237    #[test]
24238    fn pricing_table_compute_cost_returns_none_for_unknown_model() {
24239        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
24240        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
24241        let table = PricingTable {
24242            entries: vec![PricingEntry {
24243                model_pattern: "claude-opus-4%".into(),
24244                provider: "anthropic".into(),
24245                input_cost_per_mtok: 15.0,
24246                output_cost_per_mtok: 75.0,
24247                cache_read_cost_per_mtok: None,
24248                cache_creation_cost_per_mtok: None,
24249                effective_day_id: effective_day,
24250            }],
24251        };
24252
24253        assert!(
24254            table
24255                .compute_cost(
24256                    Some("unknown-model"),
24257                    lookup_day,
24258                    Some(1000),
24259                    Some(500),
24260                    None,
24261                    None
24262                )
24263                .is_none()
24264        );
24265        assert!(
24266            table
24267                .compute_cost(None, lookup_day, Some(1000), Some(500), None, None)
24268                .is_none()
24269        );
24270        assert!(
24271            table
24272                .compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None)
24273                .is_none()
24274        );
24275    }
24276
24277    #[test]
24278    fn pricing_table_load_from_db() {
24279        let dir = TempDir::new().unwrap();
24280        let db_path = dir.path().join("test.db");
24281        let storage = SqliteStorage::open(&db_path).unwrap();
24282
24283        let table = PricingTable::load(&storage.conn).unwrap();
24284        assert!(!table.is_empty());
24285
24286        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
24287
24288        let opus = table.lookup("claude-opus-4-latest", lookup_day);
24289        assert!(opus.is_some());
24290        assert_eq!(opus.unwrap().input_cost_per_mtok, 15.0);
24291
24292        let flash = table.lookup("gemini-2.0-flash-001", lookup_day);
24293        assert!(flash.is_some());
24294        assert_eq!(flash.unwrap().input_cost_per_mtok, 0.075);
24295    }
24296
24297    #[test]
24298    fn pricing_table_load_rejects_invalid_effective_date() {
24299        let dir = TempDir::new().unwrap();
24300        let db_path = dir.path().join("test.db");
24301        let storage = SqliteStorage::open(&db_path).unwrap();
24302
24303        storage
24304            .conn
24305            .execute_compat(
24306                "INSERT INTO model_pricing (
24307                    model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
24308                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
24309                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
24310                fparams![
24311                    "broken-model%",
24312                    "test",
24313                    1.0_f64,
24314                    2.0_f64,
24315                    Option::<f64>::None,
24316                    Option::<f64>::None,
24317                    "not-a-date"
24318                ],
24319            )
24320            .unwrap();
24321
24322        let err = PricingTable::load(&storage.conn).unwrap_err();
24323        assert!(err.to_string().contains("invalid effective_date"));
24324    }
24325
24326    #[test]
24327    fn pricing_diagnostics_tracks_coverage() {
24328        let mut diag = PricingDiagnostics::default();
24329        diag.record_priced();
24330        diag.record_priced();
24331        diag.record_unpriced(Some("custom-model-v1"));
24332        diag.record_unpriced(Some("custom-model-v1"));
24333        diag.record_unpriced(None);
24334
24335        assert_eq!(diag.priced_count, 2);
24336        assert_eq!(diag.unpriced_count, 3);
24337        assert_eq!(diag.unknown_models.len(), 2);
24338        assert_eq!(diag.unknown_models["custom-model-v1"], 2);
24339        assert_eq!(diag.unknown_models["(none)"], 1);
24340    }
24341
24342    // =========================================================================
24343    // FrankenStorage migration tests (bead 2j6p6)
24344    // =========================================================================
24345
24346    /// Helper: create a FrankenStorage wrapping an in-memory connection and
24347    /// run migrations. This exercises the same code path as `open()` but avoids
24348    /// frankensqlite's file-based autoindex renaming limitation (V5 uses
24349    /// ALTER TABLE RENAME which triggers sqlite_autoindex lookup issues on
24350    /// file-based pagers).
24351    fn franken_storage_in_memory() -> FrankenStorage {
24352        let conn = FrankenConnection::open(":memory:").unwrap();
24353        let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));
24354        storage.run_migrations().unwrap();
24355        storage.apply_config().unwrap();
24356        storage
24357    }
24358
24359    #[test]
24360    fn franken_migrations_create_all_tables() {
24361        let storage = franken_storage_in_memory();
24362
24363        // Should be at CURRENT_SCHEMA_VERSION.
24364        let version = storage.schema_version().unwrap();
24365        assert_eq!(
24366            version, CURRENT_SCHEMA_VERSION,
24367            "fresh FrankenStorage should be at current schema version"
24368        );
24369
24370        // Core tables from V1 should exist.
24371        let rows = storage
24372            .raw()
24373            .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
24374            .unwrap();
24375        let table_names: Vec<String> = rows
24376            .iter()
24377            .filter_map(|r| r.get_typed::<String>(0).ok())
24378            .collect();
24379
24380        for required in [
24381            "meta",
24382            "agents",
24383            "workspaces",
24384            "conversations",
24385            "messages",
24386            "snippets",
24387            "tags",
24388            "conversation_tags",
24389        ] {
24390            assert!(
24391                table_names.contains(&required.to_string()),
24392                "missing table: {required}"
24393            );
24394        }
24395
24396        // V4 sources table.
24397        assert!(
24398            table_names.contains(&"sources".to_string()),
24399            "missing sources table"
24400        );
24401
24402        // V8 daily_stats table.
24403        assert!(
24404            table_names.contains(&"daily_stats".to_string()),
24405            "missing daily_stats table"
24406        );
24407
24408        // V9 embedding_jobs table.
24409        assert!(
24410            table_names.contains(&"embedding_jobs".to_string()),
24411            "missing embedding_jobs table"
24412        );
24413
24414        // V11 message_metrics, usage_hourly, usage_daily tables.
24415        for analytics_table in ["message_metrics", "usage_hourly", "usage_daily"] {
24416            assert!(
24417                table_names.contains(&analytics_table.to_string()),
24418                "missing table: {analytics_table}"
24419            );
24420        }
24421        assert!(
24422            table_names.contains(&"conversation_tail_state".to_string()),
24423            "missing conversation_tail_state table"
24424        );
24425        assert!(
24426            table_names.contains(&"conversation_external_lookup".to_string()),
24427            "missing conversation_external_lookup table"
24428        );
24429        assert!(
24430            table_names.contains(&"conversation_external_tail_lookup".to_string()),
24431            "missing conversation_external_tail_lookup table"
24432        );
24433
24434        // Fresh frankensqlite databases should record the combined V13 base
24435        // schema plus every additive post-V13 migration.
24436        let rows = storage
24437            .raw()
24438            .query("SELECT COUNT(*) FROM _schema_migrations;")
24439            .unwrap();
24440        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
24441        assert_eq!(
24442            count,
24443            (13..=CURRENT_SCHEMA_VERSION).count() as i64,
24444            "_schema_migrations should record the V13 base schema and post-V13 migrations"
24445        );
24446
24447        // The latest applied migration should be the current schema version.
24448        let rows = storage
24449            .raw()
24450            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24451            .unwrap();
24452        let versions: Vec<i64> = rows
24453            .iter()
24454            .map(|row| row.get_typed(0))
24455            .collect::<std::result::Result<_, _>>()
24456            .unwrap();
24457        assert_eq!(
24458            versions,
24459            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24460            "_schema_migrations should contain v13 through current"
24461        );
24462    }
24463
24464    #[test]
24465    fn franken_migrations_idempotent() {
24466        let storage = franken_storage_in_memory();
24467        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24468
24469        // Re-running migrations on the same connection is a no-op.
24470        storage.run_migrations().unwrap();
24471        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24472    }
24473
24474    #[test]
24475    fn migration_v20_backfills_conversation_external_tail_lookup() {
24476        let storage = franken_storage_in_memory();
24477        let agent_id = storage
24478            .ensure_agent(&Agent {
24479                id: None,
24480                slug: "codex".into(),
24481                name: "Codex".into(),
24482                version: None,
24483                kind: AgentKind::Cli,
24484            })
24485            .unwrap();
24486        let workspace_id = storage
24487            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
24488            .unwrap();
24489        let mut conv = make_profiled_storage_remote_conversation(1919, 2);
24490        conv.source_id = "profiled-storage-remote-source-東京".into();
24491        conv.external_id = Some("profiled-storage-remote-☃-1919".into());
24492        let outcome = storage
24493            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
24494            .unwrap();
24495        let external_id = conv.external_id.as_deref().unwrap();
24496        let lookup_key = conversation_external_lookup_key(&conv.source_id, agent_id, external_id);
24497
24498        storage
24499            .raw()
24500            .execute("DELETE FROM conversation_external_tail_lookup")
24501            .unwrap();
24502        storage
24503            .raw()
24504            .execute("DELETE FROM _schema_migrations WHERE version = 20")
24505            .unwrap();
24506        storage
24507            .raw()
24508            .execute_compat(
24509                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
24510                fparams!["19"],
24511            )
24512            .unwrap();
24513
24514        storage.run_migrations().unwrap();
24515
24516        let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
24517            .raw()
24518            .query_row_map(
24519                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
24520                 FROM conversation_external_tail_lookup
24521                 WHERE lookup_key = ?1",
24522                fparams![lookup_key.as_str()],
24523                |row| {
24524                    Ok((
24525                        row.get_typed(0)?,
24526                        row.get_typed(1)?,
24527                        row.get_typed(2)?,
24528                        row.get_typed(3)?,
24529                    ))
24530                },
24531            )
24532            .unwrap();
24533        assert_eq!(
24534            backfilled,
24535            (
24536                outcome.conversation_id,
24537                conv.ended_at,
24538                Some(1),
24539                conv.messages[1].created_at
24540            )
24541        );
24542        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24543    }
24544
24545    #[test]
24546    fn migration_v15_creates_lazy_tail_state_cache() {
24547        let conn = FrankenConnection::open(":memory:").unwrap();
24548        conn.execute_batch(
24549            "CREATE TABLE conversations (
24550                 id INTEGER PRIMARY KEY,
24551                 ended_at INTEGER
24552             );
24553             CREATE TABLE messages (
24554                 id INTEGER PRIMARY KEY,
24555                 conversation_id INTEGER NOT NULL,
24556                 idx INTEGER NOT NULL,
24557                 created_at INTEGER
24558             );
24559             INSERT INTO conversations(id, ended_at) VALUES
24560                 (1, 1710000000300),
24561                 (2, NULL);
24562             INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
24563                 (10, 1, 0, 1710000000100),
24564                 (11, 1, 1, 1710000000200),
24565                 (12, 2, 0, 1710000000400);",
24566        )
24567        .unwrap();
24568
24569        conn.execute(
24570            "CREATE TABLE _schema_migrations (
24571                version INTEGER PRIMARY KEY,
24572                name TEXT NOT NULL,
24573                applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
24574             );",
24575        )
24576        .unwrap();
24577
24578        assert!(
24579            apply_conversation_tail_state_cache_migration(&conn).unwrap(),
24580            "v15 migration should apply once"
24581        );
24582        assert!(
24583            !apply_conversation_tail_state_cache_migration(&conn).unwrap(),
24584            "v15 migration should be idempotent once recorded"
24585        );
24586
24587        let columns = conn.query("PRAGMA table_info(conversations);").unwrap();
24588        let column_names: HashSet<String> = columns
24589            .iter()
24590            .map(|row| row.get_typed(1))
24591            .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
24592            .unwrap();
24593        assert!(column_names.contains("last_message_idx"));
24594        assert!(column_names.contains("last_message_created_at"));
24595
24596        let tail_rows: i64 = conn
24597            .query("SELECT COUNT(*) FROM conversation_tail_state;")
24598            .unwrap()
24599            .first()
24600            .unwrap()
24601            .get_typed(0)
24602            .unwrap();
24603        assert_eq!(
24604            tail_rows, 0,
24605            "v15 should create the cache without an open-time message scan"
24606        );
24607
24608        let applied: i64 = conn
24609            .query("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;")
24610            .unwrap()
24611            .first()
24612            .unwrap()
24613            .get_typed(0)
24614            .unwrap();
24615        assert_eq!(applied, 1);
24616    }
24617
24618    #[test]
24619    fn schema_repair_adds_missing_conversations_token_columns() {
24620        let conn = FrankenConnection::open(":memory:").unwrap();
24621        conn.execute_batch(
24622            "CREATE TABLE conversations (
24623                 id INTEGER PRIMARY KEY,
24624                 agent_id INTEGER NOT NULL,
24625                 source_path TEXT NOT NULL
24626             );",
24627        )
24628        .unwrap();
24629        let storage = FrankenStorage::new(conn, std::path::PathBuf::from(":memory:"));
24630
24631        storage.repair_missing_conversation_token_columns().unwrap();
24632        storage.repair_missing_conversation_token_columns().unwrap();
24633
24634        let columns = franken_table_column_names(&storage.conn, "conversations").unwrap();
24635        for &(column_name, _) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
24636            assert!(
24637                columns.contains(column_name),
24638                "schema repair should add conversations.{column_name}"
24639            );
24640        }
24641    }
24642
24643    #[test]
24644    fn franken_meta_schema_version_in_sync() {
24645        let storage = franken_storage_in_memory();
24646
24647        // meta.schema_version should be kept in sync.
24648        let rows = storage
24649            .raw()
24650            .query("SELECT value FROM meta WHERE key = 'schema_version';")
24651            .unwrap();
24652        let meta_version: String = rows.first().unwrap().get_typed(0).unwrap();
24653        assert_eq!(
24654            meta_version,
24655            CURRENT_SCHEMA_VERSION.to_string(),
24656            "meta.schema_version should match CURRENT_SCHEMA_VERSION"
24657        );
24658    }
24659
24660    #[test]
24661    fn franken_transition_from_meta_version() {
24662        let dir = TempDir::new().unwrap();
24663        let db_path = dir.path().join("test_transition.db");
24664
24665        // Simulate an existing database created by SqliteStorage at version 10.
24666        // We create just enough schema to test the transition.
24667        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24668        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
24669            .unwrap();
24670        conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
24671            .unwrap();
24672        // Create a dummy conversations table so transition doesn't think it's corrupted.
24673        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
24674            .unwrap();
24675        drop(conn);
24676
24677        // Now run the transition function.
24678        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24679        transition_from_meta_version(&conn).unwrap();
24680
24681        // The frankensqlite path uses a combined V13 base migration, so a
24682        // legacy V10 marker is bridged to V13 and later idempotent repair fills
24683        // in any missing V11-V13 objects.
24684        let rows = conn
24685            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24686            .unwrap();
24687        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24688        assert_eq!(
24689            versions,
24690            (1..=13).collect::<Vec<i64>>(),
24691            "transition should bridge legacy V10 databases through the combined V13 base marker"
24692        );
24693    }
24694
24695    #[test]
24696    fn franken_transition_from_current_meta_backfills_current_schema_marker() {
24697        let dir = TempDir::new().unwrap();
24698        let db_path = dir.path().join("test_current_transition.db");
24699
24700        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24701        conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
24702            .unwrap();
24703        conn.execute_compat(
24704            "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
24705            &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
24706        )
24707        .unwrap();
24708        conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
24709            .unwrap();
24710        drop(conn);
24711
24712        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24713        transition_from_meta_version(&conn).unwrap();
24714
24715        let rows = conn
24716            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24717            .unwrap();
24718        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24719        assert_eq!(
24720            versions,
24721            (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24722            "current meta schema marker should backfill every known migration"
24723        );
24724    }
24725
24726    #[test]
24727    fn franken_transition_skips_when_already_done() {
24728        let dir = TempDir::new().unwrap();
24729        let db_path = dir.path().join("test_transition_skip.db");
24730
24731        // Create a DB that already has _schema_migrations.
24732        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24733        conn.execute(
24734            "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
24735        ).unwrap();
24736        conn.execute("INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');")
24737            .unwrap();
24738
24739        // Transition should be a no-op.
24740        transition_from_meta_version(&conn).unwrap();
24741
24742        // Should still have exactly 1 entry.
24743        let rows = conn
24744            .query("SELECT COUNT(*) FROM _schema_migrations;")
24745            .unwrap();
24746        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
24747        assert_eq!(
24748            count, 1,
24749            "transition should not re-run on already-transitioned DB"
24750        );
24751    }
24752
24753    #[test]
24754    fn franken_transition_fresh_db_is_noop() {
24755        let dir = TempDir::new().unwrap();
24756        let db_path = dir.path().join("test_fresh_noop.db");
24757
24758        // Empty database — no meta table, no tables at all.
24759        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24760        transition_from_meta_version(&conn).unwrap();
24761
24762        // _schema_migrations should NOT have been created.
24763        let res = conn.query("SELECT * FROM \"_schema_migrations\";");
24764        assert!(
24765            res.is_err(),
24766            "transition should not create _schema_migrations on fresh DB"
24767        );
24768    }
24769
24770    #[test]
24771    fn franken_transition_with_fts_virtual_table_succeeds() {
24772        let dir = TempDir::new().unwrap();
24773        let db_path = dir.path().join("test_transition_with_fts.db");
24774
24775        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24776        conn.execute_batch(
24777            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
24778             INSERT INTO meta(key, value) VALUES('schema_version', '13');
24779             CREATE TABLE conversations (id INTEGER PRIMARY KEY);
24780             CREATE VIRTUAL TABLE fts_messages USING fts5(
24781                 content,
24782                 title,
24783                 agent,
24784                 workspace,
24785                 source_path,
24786                 created_at,
24787                 content='',
24788                 tokenize='porter unicode61'
24789             );",
24790        )
24791        .unwrap();
24792        drop(conn);
24793
24794        let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
24795        transition_from_meta_version(&conn).unwrap();
24796
24797        let rows = conn
24798            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24799            .unwrap();
24800        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24801        assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
24802    }
24803
24804    #[test]
24805    fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
24806        let dir = TempDir::new().unwrap();
24807        let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");
24808
24809        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24810        conn.execute_batch(
24811            "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
24812             INSERT INTO meta(key, value) VALUES('schema_version', '13');
24813             CREATE TABLE agents (
24814                 id INTEGER PRIMARY KEY,
24815                 slug TEXT NOT NULL
24816             );
24817             CREATE TABLE workspaces (
24818                 id INTEGER PRIMARY KEY,
24819                 path TEXT NOT NULL
24820             );
24821             CREATE TABLE sources (
24822                 id TEXT PRIMARY KEY,
24823                 kind TEXT NOT NULL,
24824                 host_label TEXT,
24825                 machine_id TEXT,
24826                 platform TEXT,
24827                 config_json TEXT,
24828                 created_at INTEGER NOT NULL,
24829                 updated_at INTEGER NOT NULL
24830             );
24831             CREATE TABLE conversations (
24832                 id INTEGER PRIMARY KEY,
24833                 agent_id INTEGER NOT NULL,
24834                 workspace_id INTEGER,
24835                 source_id TEXT NOT NULL DEFAULT 'local',
24836                 external_id TEXT,
24837                 title TEXT,
24838                 source_path TEXT NOT NULL,
24839                 started_at INTEGER,
24840                 ended_at INTEGER
24841             );
24842             CREATE TABLE messages (
24843                 id INTEGER PRIMARY KEY,
24844                 conversation_id INTEGER NOT NULL,
24845                 idx INTEGER NOT NULL,
24846                 role TEXT NOT NULL,
24847                 author TEXT,
24848                 created_at INTEGER,
24849                 content TEXT NOT NULL,
24850                 extra_json TEXT,
24851                 extra_bin BLOB
24852             );
24853             INSERT INTO agents(id, slug) VALUES (1, 'codex');
24854             INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
24855             INSERT INTO sources(id, kind, host_label, created_at, updated_at)
24856             VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
24857             INSERT INTO conversations(
24858                 id,
24859                 agent_id,
24860                 workspace_id,
24861                 source_id,
24862                 external_id,
24863                 title,
24864                 source_path,
24865                 started_at
24866             )
24867             VALUES (
24868                 1,
24869                 1,
24870                 1,
24871                 'local',
24872                 'legacy-session',
24873                 'legacy session',
24874                 '/tmp/legacy.jsonl',
24875                 1710000000000
24876             );
24877             INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
24878             VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
24879             CREATE VIRTUAL TABLE fts_messages USING fts5(
24880                 content,
24881                 title,
24882                 agent,
24883                 workspace,
24884                 source_path,
24885                 created_at,
24886                 message_id,
24887                 content='',
24888                 tokenize='porter unicode61'
24889             );",
24890        )
24891        .unwrap();
24892        drop(conn);
24893
24894        let storage = FrankenStorage::open(&db_path).unwrap();
24895        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24896
24897        let rows = storage
24898            .raw()
24899            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24900            .unwrap();
24901        let versions: Vec<i64> = rows.iter().filter_map(|r| r.get_typed(0).ok()).collect();
24902        assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
24903    }
24904
24905    #[test]
24906    fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
24907        let dir = TempDir::new().unwrap();
24908        let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");
24909
24910        let storage = FrankenStorage::open(&db_path).unwrap();
24911        let agent = Agent {
24912            id: None,
24913            slug: "codex".into(),
24914            name: "Codex".into(),
24915            version: None,
24916            kind: AgentKind::Cli,
24917        };
24918        let agent_id = storage.ensure_agent(&agent).unwrap();
24919        let conversation = Conversation {
24920            id: None,
24921            agent_slug: "codex".into(),
24922            workspace: Some(PathBuf::from("/tmp/workspace")),
24923            external_id: Some("dup-fts-schema".into()),
24924            title: Some("Duplicate FTS schema".into()),
24925            source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
24926            started_at: Some(1_700_000_000_000),
24927            ended_at: Some(1_700_000_000_100),
24928            approx_tokens: Some(42),
24929            metadata_json: serde_json::Value::Null,
24930            messages: vec![Message {
24931                id: None,
24932                idx: 0,
24933                role: MessageRole::User,
24934                author: Some("user".into()),
24935                created_at: Some(1_700_000_000_050),
24936                content: "message that should remain queryable".into(),
24937                extra_json: serde_json::Value::Null,
24938                snippets: Vec::new(),
24939            }],
24940            source_id: LOCAL_SOURCE_ID.into(),
24941            origin_host: None,
24942        };
24943        storage
24944            .insert_conversation_tree(agent_id, None, &conversation)
24945            .unwrap();
24946        drop(storage);
24947        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
24948
24949        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
24950        let conn = rusqlite_test_fixture_conn(&db_path);
24951        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
24952        conn.execute(
24953            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
24954             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
24955            [duplicate_legacy_fts_sql],
24956        )
24957        .unwrap();
24958        conn.execute(
24959            "DELETE FROM meta WHERE key = ?1",
24960            [FTS_FRANKEN_REBUILD_META_KEY],
24961        )
24962        .unwrap();
24963        // Simulate a pre-fix upgraded database that has never gone through the
24964        // authoritative frankensqlite FTS rebuild generation yet.
24965        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
24966
24967        let duplicate_rows: i64 = conn
24968            .query_row(
24969                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
24970                [],
24971                |row| row.get(0),
24972            )
24973            .unwrap();
24974        assert_eq!(duplicate_rows, 2);
24975        drop(conn);
24976
24977        let reopened = FrankenStorage::open(&db_path).unwrap();
24978        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24979        let generation_rows: Vec<String> = reopened
24980            .raw()
24981            .query_map_collect(
24982                "SELECT value FROM meta WHERE key = ?1",
24983                fparams![FTS_FRANKEN_REBUILD_META_KEY],
24984                |row| row.get_typed(0),
24985            )
24986            .unwrap();
24987        assert_eq!(
24988            generation_rows.len(),
24989            0,
24990            "canonical open should not eagerly rewrite FTS repair metadata"
24991        );
24992        reopened.ensure_search_fallback_fts_consistency().unwrap();
24993        let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
24994        assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);
24995
24996        let total_messages: i64 = reopened
24997            .raw()
24998            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
24999                row.get_typed(0)
25000            })
25001            .unwrap();
25002        let total_fts_rows: i64 = reopened
25003            .raw()
25004            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
25005                row.get_typed(0)
25006            })
25007            .unwrap();
25008        assert_eq!(total_fts_rows, total_messages);
25009    }
25010
25011    #[test]
25012    fn fts_messages_integrity_reports_missing_shadow_tables() {
25013        let dir = TempDir::new().unwrap();
25014        let healthy_db_path = dir.path().join("healthy_fts.db");
25015
25016        {
25017            let storage = FrankenStorage::open(&healthy_db_path).unwrap();
25018            storage.ensure_search_fallback_fts_consistency().unwrap();
25019            storage
25020                .validate_fts_messages_integrity()
25021                .expect("freshly materialized fts_messages should pass integrity validation");
25022        }
25023
25024        let corrupt_db_path = dir.path().join("test_corrupt_fts_missing_shadows.db");
25025        {
25026            let conn = rusqlite_test_fixture_conn(&corrupt_db_path);
25027            conn.execute("CREATE TABLE schema_anchor(id INTEGER PRIMARY KEY)", [])
25028                .unwrap();
25029            let orphaned_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
25030            conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
25031            conn.execute(
25032                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
25033                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
25034                [orphaned_fts_sql],
25035            )
25036            .unwrap();
25037            conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
25038        }
25039
25040        let open_err = FrankenConnection::open(corrupt_db_path.to_string_lossy().to_string())
25041            .expect_err("orphaned fts_messages schema should fail during connection open");
25042        let integrity = fts_messages_integrity_error_from_message(open_err.to_string())
25043            .expect("open-time FTS corruption should map to the typed FTS integrity kind");
25044        assert_eq!(integrity.missing_shadow_tables(), &["fts_messages_content"]);
25045        let rendered = integrity.to_string();
25046        assert!(
25047            rendered.contains("fts_messages")
25048                && rendered.contains("required FTS5 shadow tables")
25049                && rendered.contains("fts_messages_content"),
25050            "error should be an operator-facing FTS corruption diagnosis: {rendered}"
25051        );
25052    }
25053
25054    #[test]
25055    fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
25056        let dir = TempDir::new().unwrap();
25057        let db_path = dir.path().join("fresh-franken-storage-open.db");
25058
25059        let storage = FrankenStorage::open(&db_path).unwrap();
25060        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25061
25062        // The FTS5 virtual table is no longer created eagerly by the
25063        // migration runner (V14 drops the old internal-content table and the
25064        // current contentless table is recreated lazily — see MIGRATION_V14).
25065        // Invoke the repair path to match normal cass startup, then assert
25066        // there is exactly one fts_messages entry in sqlite_schema (no
25067        // duplicates).
25068        storage
25069            .ensure_search_fallback_fts_consistency()
25070            .expect("ensure FTS consistency after fresh open");
25071        drop(storage);
25072
25073        let c_reader = FrankenConnection::open(db_path.to_string_lossy().into_owned())
25074            .expect("open DB via frankensqlite for sqlite_master inspection");
25075        assert_eq!(
25076            franken_fts_schema_rows(&c_reader).unwrap(),
25077            1,
25078            "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
25079        );
25080        drop(c_reader);
25081
25082        let storage = FrankenStorage::open(&db_path).unwrap();
25083        assert!(
25084            storage
25085                .raw()
25086                .query("SELECT COUNT(*) FROM fts_messages")
25087                .is_ok(),
25088            "fts_messages must be queryable through frankensqlite after open"
25089        );
25090    }
25091
25092    #[test]
25093    fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
25094        let dir = TempDir::new().unwrap();
25095        let db_path = dir.path().join("test_repair_missing_analytics.db");
25096
25097        {
25098            let storage = FrankenStorage::open(&db_path).unwrap();
25099            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25100        }
25101
25102        {
25103            let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
25104            for table in &[
25105                "usage_models_daily",
25106                "usage_daily",
25107                "usage_hourly",
25108                "message_metrics",
25109                "token_daily_stats",
25110                "token_usage",
25111                "model_pricing",
25112                "embedding_jobs",
25113                "daily_stats",
25114            ] {
25115                conn.execute(&format!("DROP TABLE IF EXISTS {table}"))
25116                    .unwrap();
25117            }
25118            conn.execute_compat(
25119                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
25120                &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
25121            )
25122            .unwrap();
25123        }
25124
25125        let repaired = FrankenStorage::open(&db_path).unwrap();
25126        assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
25127
25128        let analytics_count: i64 = repaired
25129            .raw()
25130            .query_row_map(
25131                "SELECT COUNT(*) FROM sqlite_master
25132                 WHERE type='table'
25133                   AND name IN (
25134                     'daily_stats',
25135                     'embedding_jobs',
25136                     'token_usage',
25137                     'token_daily_stats',
25138                     'model_pricing',
25139                     'message_metrics',
25140                     'usage_hourly',
25141                     'usage_daily',
25142                     'usage_models_daily'
25143                   )",
25144                &[],
25145                |row| row.get_typed(0),
25146            )
25147            .unwrap();
25148        assert_eq!(
25149            analytics_count, 9,
25150            "open() should recreate the missing analytics tables even when schema_version already says current"
25151        );
25152    }
25153
25154    #[test]
25155    fn current_schema_repair_batches_cover_every_required_probe() {
25156        let missing_tables: Vec<&'static str> = REQUIRED_CURRENT_SCHEMA_TABLE_PROBES
25157            .iter()
25158            .map(|(table_name, _)| *table_name)
25159            .collect();
25160
25161        let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();
25162        let covered_tables: HashSet<&'static str> = batches
25163            .iter()
25164            .flat_map(|batch| batch.tables.iter().copied())
25165            .collect();
25166
25167        for table_name in missing_tables {
25168            assert!(
25169                covered_tables.contains(table_name),
25170                "missing repair coverage for {table_name}"
25171            );
25172        }
25173    }
25174
25175    #[test]
25176    fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
25177        for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
25178            assert!(
25179                !batch.sql.contains("CREATE TABLE IF NOT EXISTS meta"),
25180                "repair batch {} should not recreate meta",
25181                batch.name
25182            );
25183            assert!(
25184                !batch.sql.contains("CREATE TABLE IF NOT EXISTS agents"),
25185                "repair batch {} should not recreate agents",
25186                batch.name
25187            );
25188            assert!(
25189                !batch.sql.contains("CREATE TABLE IF NOT EXISTS workspaces"),
25190                "repair batch {} should not recreate workspaces",
25191                batch.name
25192            );
25193            assert!(
25194                !batch
25195                    .sql
25196                    .contains("CREATE TABLE IF NOT EXISTS conversations"),
25197                "repair batch {} should not recreate conversations",
25198                batch.name
25199            );
25200            assert!(
25201                !batch.sql.contains("CREATE TABLE IF NOT EXISTS messages"),
25202                "repair batch {} should not recreate messages",
25203                batch.name
25204            );
25205            assert!(
25206                !batch.sql.contains("CREATE TABLE IF NOT EXISTS snippets"),
25207                "repair batch {} should not recreate snippets",
25208                batch.name
25209            );
25210            assert!(
25211                !batch.sql.contains("CREATE VIRTUAL TABLE fts_messages"),
25212                "repair batch {} should not recreate FTS tables",
25213                batch.name
25214            );
25215            assert!(
25216                !batch.sql.contains("DROP TABLE"),
25217                "repair batch {} should never drop tables",
25218                batch.name
25219            );
25220        }
25221    }
25222
25223    #[test]
25224    fn build_cass_migrations_applies_combined_v13() {
25225        let conn = FrankenConnection::open(":memory:").unwrap();
25226        let base_result = build_cass_migrations_before_tail_cache()
25227            .run(&conn)
25228            .unwrap();
25229        assert!(apply_conversation_tail_state_cache_migration(&conn).unwrap());
25230        let post_result = build_cass_migrations_after_tail_cache().run(&conn).unwrap();
25231
25232        assert!(base_result.was_fresh);
25233        let mut applied = base_result.applied;
25234        applied.push(15);
25235        applied.extend(post_result.applied);
25236        assert_eq!(
25237            applied,
25238            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
25239            "should apply combined V13 plus additive post-V13 migrations"
25240        );
25241        let current: i64 = conn
25242            .query("SELECT MAX(version) FROM _schema_migrations;")
25243            .unwrap()
25244            .first()
25245            .unwrap()
25246            .get_typed(0)
25247            .unwrap();
25248        assert_eq!(current, CURRENT_SCHEMA_VERSION);
25249    }
25250
25251    #[test]
25252    fn franken_insert_conversations_batched_populates_analytics_rollups() {
25253        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25254        use frankensqlite::compat::{ConnectionExt, RowExt};
25255        use std::path::PathBuf;
25256
25257        let dir = TempDir::new().unwrap();
25258        let db_path = dir.path().join("franken-index.db");
25259        let storage = FrankenStorage::open(&db_path).unwrap();
25260
25261        let agent = Agent {
25262            id: None,
25263            slug: "claude_code".into(),
25264            name: "Claude Code".into(),
25265            version: Some("1.0".into()),
25266            kind: AgentKind::Cli,
25267        };
25268        let agent_id = storage.ensure_agent(&agent).unwrap();
25269
25270        let ts_ms = 1_770_551_400_000_i64;
25271        let usage_json = serde_json::json!({
25272            "message": {
25273                "model": "claude-opus-4-6",
25274                "usage": {
25275                    "input_tokens": 100,
25276                    "output_tokens": 50,
25277                    "cache_read_input_tokens": 25,
25278                    "cache_creation_input_tokens": 10,
25279                    "service_tier": "standard"
25280                }
25281            }
25282        });
25283
25284        let conv = Conversation {
25285            id: None,
25286            agent_slug: "claude_code".into(),
25287            workspace: Some(PathBuf::from("/tmp/workspace")),
25288            external_id: Some("franken-batch-upsert".into()),
25289            title: Some("Franken batch upsert".into()),
25290            source_path: PathBuf::from("/tmp/franken.jsonl"),
25291            started_at: Some(ts_ms),
25292            ended_at: Some(ts_ms + 60_000),
25293            approx_tokens: None,
25294            metadata_json: serde_json::Value::Null,
25295            messages: vec![
25296                Message {
25297                    id: None,
25298                    idx: 0,
25299                    role: MessageRole::User,
25300                    author: None,
25301                    created_at: Some(ts_ms),
25302                    content: "Please make a plan.".into(),
25303                    extra_json: serde_json::Value::Null,
25304                    snippets: vec![],
25305                },
25306                Message {
25307                    id: None,
25308                    idx: 1,
25309                    role: MessageRole::Agent,
25310                    author: None,
25311                    created_at: Some(ts_ms + 30_000),
25312                    content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
25313                    extra_json: usage_json,
25314                    snippets: vec![],
25315                },
25316            ],
25317            source_id: "local".into(),
25318            origin_host: None,
25319        };
25320
25321        let outcomes = storage
25322            .insert_conversations_batched(&[(agent_id, None, &conv)])
25323            .unwrap();
25324        assert_eq!(outcomes.len(), 1);
25325        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
25326
25327        let conn = storage.raw();
25328        let daily_stats_rows: i64 = conn
25329            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
25330                row.get_typed(0)
25331            })
25332            .unwrap();
25333        let token_daily_rows: i64 = conn
25334            .query_row_map(
25335                "SELECT COUNT(*) FROM token_daily_stats",
25336                fparams![],
25337                |row| row.get_typed(0),
25338            )
25339            .unwrap();
25340        let usage_daily_rows: i64 = conn
25341            .query_row_map("SELECT COUNT(*) FROM usage_daily", fparams![], |row| {
25342                row.get_typed(0)
25343            })
25344            .unwrap();
25345        let model_daily_rows: i64 = conn
25346            .query_row_map(
25347                "SELECT COUNT(*) FROM usage_models_daily",
25348                fparams![],
25349                |row| row.get_typed(0),
25350            )
25351            .unwrap();
25352
25353        assert!(daily_stats_rows > 0, "daily_stats should be populated");
25354        assert!(
25355            token_daily_rows > 0,
25356            "token_daily_stats should be populated"
25357        );
25358        assert!(usage_daily_rows > 0, "usage_daily should be populated");
25359        assert!(
25360            model_daily_rows > 0,
25361            "usage_models_daily should be populated"
25362        );
25363    }
25364
25365    // =========================================================================
25366    // FrankenConnectionManager tests (bead 3rlf8)
25367    // =========================================================================
25368
25369    #[test]
25370    fn connection_manager_creates_readers() {
25371        let dir = TempDir::new().unwrap();
25372        let db_path = dir.path().join("cm.db");
25373
25374        // Create the DB first
25375        let fs = FrankenStorage::open(&db_path).unwrap();
25376        drop(fs);
25377
25378        let config = ConnectionManagerConfig {
25379            reader_count: 3,
25380            max_writers: 2,
25381        };
25382        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25383        assert_eq!(mgr.reader_count(), 3);
25384        assert_eq!(mgr.max_writers(), 2);
25385    }
25386
25387    #[test]
25388    fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
25389        let dir = TempDir::new().unwrap();
25390        let db_path = dir.path().join("cm.db");
25391
25392        let fs = FrankenStorage::open(&db_path).unwrap();
25393        drop(fs);
25394
25395        let mgr = std::sync::Arc::new(
25396            FrankenConnectionManager::new(
25397                &db_path,
25398                ConnectionManagerConfig {
25399                    reader_count: 0,
25400                    max_writers: 0,
25401                },
25402            )
25403            .unwrap(),
25404        );
25405        assert_eq!(mgr.reader_count(), 1);
25406        assert_eq!(mgr.max_writers(), 1);
25407
25408        let (tx, rx) = std::sync::mpsc::channel();
25409        let mgr_for_thread = std::sync::Arc::clone(&mgr);
25410        std::thread::spawn(move || {
25411            let result = mgr_for_thread.writer().map(|mut guard| {
25412                guard.mark_committed();
25413            });
25414            tx.send(result.is_ok()).expect("writer result send");
25415        });
25416
25417        assert!(
25418            rx.recv_timeout(Duration::from_secs(10)).unwrap(),
25419            "writer acquisition should not block forever when configured with zero writer slots"
25420        );
25421    }
25422
25423    #[test]
25424    fn connection_manager_reader_round_robin() {
25425        let dir = TempDir::new().unwrap();
25426        let db_path = dir.path().join("cm.db");
25427
25428        let fs = FrankenStorage::open(&db_path).unwrap();
25429        drop(fs);
25430
25431        let config = ConnectionManagerConfig {
25432            reader_count: 2,
25433            max_writers: 1,
25434        };
25435        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25436
25437        // Reader index should advance (round-robin)
25438        let idx_before = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
25439        let _r1 = mgr.reader();
25440        let idx_after = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
25441        assert_eq!(idx_after, idx_before + 1, "reader index should advance");
25442    }
25443
25444    #[test]
25445    fn connection_manager_writer_reads_and_writes() {
25446        use frankensqlite::compat::RowExt;
25447
25448        let dir = TempDir::new().unwrap();
25449        let db_path = dir.path().join("cm.db");
25450
25451        let fs = FrankenStorage::open(&db_path).unwrap();
25452        drop(fs);
25453
25454        let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();
25455
25456        // Acquire writer and insert data
25457        {
25458            let mut guard = mgr.writer().unwrap();
25459            guard
25460                .storage()
25461                .raw()
25462                .execute("CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)")
25463                .unwrap();
25464            guard
25465                .storage()
25466                .raw()
25467                .execute("INSERT INTO cm_test (val) VALUES ('hello')")
25468                .unwrap();
25469            guard.mark_committed();
25470        }
25471
25472        // Verify via reader (returns MutexGuard<SendFrankenConnection>)
25473        let reader_guard = mgr.reader();
25474        let rows = reader_guard.query("SELECT val FROM cm_test").unwrap();
25475        assert_eq!(rows.len(), 1);
25476        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "hello");
25477    }
25478
25479    #[test]
25480    fn connection_manager_writer_guard_drops_releases_slot() {
25481        let dir = TempDir::new().unwrap();
25482        let db_path = dir.path().join("cm.db");
25483
25484        let fs = FrankenStorage::open(&db_path).unwrap();
25485        drop(fs);
25486
25487        let config = ConnectionManagerConfig {
25488            reader_count: 1,
25489            max_writers: 1,
25490        };
25491        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25492
25493        // Acquire and release writer
25494        {
25495            let mut guard = mgr.writer().unwrap();
25496            guard.mark_committed();
25497        }
25498
25499        // Should be able to acquire again (slot released)
25500        let mut guard2 = mgr.writer().unwrap();
25501        guard2.mark_committed();
25502    }
25503
25504    #[test]
25505    fn connection_manager_concurrent_writer_works() {
25506        use frankensqlite::compat::RowExt;
25507
25508        let dir = TempDir::new().unwrap();
25509        let db_path = dir.path().join("cm.db");
25510
25511        let fs = FrankenStorage::open(&db_path).unwrap();
25512        drop(fs);
25513
25514        let config = ConnectionManagerConfig {
25515            reader_count: 1,
25516            max_writers: 2,
25517        };
25518        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25519
25520        {
25521            let mut guard = mgr.concurrent_writer().unwrap();
25522            guard
25523                .storage()
25524                .raw()
25525                .execute("CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)")
25526                .unwrap();
25527            guard
25528                .storage()
25529                .raw()
25530                .execute("INSERT INTO cm_conc (val) VALUES ('concurrent')")
25531                .unwrap();
25532            guard.mark_committed();
25533        }
25534
25535        let reader_guard = mgr.reader();
25536        let rows = reader_guard.query("SELECT val FROM cm_conc").unwrap();
25537        assert_eq!(rows.len(), 1);
25538        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "concurrent");
25539    }
25540
25541    #[test]
25542    fn connection_manager_default_config() {
25543        let config = ConnectionManagerConfig::default();
25544        assert_eq!(config.reader_count, 4);
25545        assert!(config.max_writers > 0);
25546    }
25547
25548    #[test]
25549    fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
25550        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25551        use std::path::PathBuf;
25552
25553        fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
25554            let agent = Agent {
25555                id: None,
25556                slug: agent_slug.into(),
25557                name: agent_slug.into(),
25558                version: None,
25559                kind: AgentKind::Cli,
25560            };
25561            let agent_id = storage.ensure_agent(&agent).unwrap();
25562            let conversation = Conversation {
25563                id: None,
25564                agent_slug: agent_slug.into(),
25565                workspace: Some(PathBuf::from("/tmp/workspace")),
25566                external_id: Some(format!("{agent_slug}-{marker}")),
25567                title: Some(format!("{agent_slug} {marker}")),
25568                source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
25569                started_at: Some(1_700_000_000_000),
25570                ended_at: Some(1_700_000_000_100),
25571                approx_tokens: None,
25572                metadata_json: serde_json::Value::Null,
25573                messages: vec![
25574                    Message {
25575                        id: None,
25576                        idx: 0,
25577                        role: MessageRole::User,
25578                        author: Some("user".into()),
25579                        created_at: Some(1_700_000_000_010),
25580                        content: format!("{agent_slug} {marker} user"),
25581                        extra_json: serde_json::Value::Null,
25582                        snippets: Vec::new(),
25583                    },
25584                    Message {
25585                        id: None,
25586                        idx: 1,
25587                        role: MessageRole::Agent,
25588                        author: Some("assistant".into()),
25589                        created_at: Some(1_700_000_000_020),
25590                        content: format!("{agent_slug} {marker} assistant"),
25591                        extra_json: serde_json::Value::Null,
25592                        snippets: Vec::new(),
25593                    },
25594                ],
25595                source_id: LOCAL_SOURCE_ID.into(),
25596                origin_host: None,
25597            };
25598            storage
25599                .insert_conversation_tree(agent_id, None, &conversation)
25600                .unwrap();
25601        }
25602
25603        let dir = TempDir::new().unwrap();
25604        let db_path = dir.path().join("agent_search.db");
25605        let storage = FrankenStorage::open(&db_path).unwrap();
25606
25607        seed_conversation(&storage, "openclaw", "purge-target");
25608        seed_conversation(&storage, "codex", "keep-target");
25609
25610        let purge = storage.purge_agent_archive_data("openclaw").unwrap();
25611        assert_eq!(purge.conversations_deleted, 1);
25612        assert_eq!(purge.messages_deleted, 2);
25613
25614        storage.rebuild_fts().unwrap();
25615        storage.rebuild_analytics().unwrap();
25616        storage.rebuild_daily_stats().unwrap();
25617        storage.rebuild_token_daily_stats().unwrap();
25618
25619        let agents = storage.list_agents().unwrap();
25620        assert_eq!(agents.len(), 1);
25621        assert_eq!(agents[0].slug, "codex");
25622        assert_eq!(storage.total_conversation_count().unwrap(), 1);
25623        assert_eq!(storage.total_message_count().unwrap(), 2);
25624
25625        let fts_rows: i64 = storage
25626            .raw()
25627            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
25628                row.get_typed(0)
25629            })
25630            .unwrap();
25631        assert_eq!(fts_rows, 2);
25632
25633        let total_daily_sessions: i64 = storage
25634            .raw()
25635            .query_row_map(
25636                "SELECT COALESCE(SUM(session_count), 0)
25637                 FROM daily_stats
25638                 WHERE agent_slug = 'all' AND source_id = 'all'",
25639                fparams![],
25640                |row| row.get_typed(0),
25641            )
25642            .unwrap();
25643        assert_eq!(total_daily_sessions, 1);
25644
25645        let openclaw_token_rows: i64 = storage
25646            .raw()
25647            .query_row_map(
25648                "SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'",
25649                fparams![],
25650                |row| row.get_typed(0),
25651            )
25652            .unwrap();
25653        assert_eq!(openclaw_token_rows, 0);
25654    }
25655
25656    /// Regression for cass#202: a `Connection` dropped mid-transaction can
25657    /// leave child rows persisted without a matching parent. The next indexer
25658    /// pass then trips `FOREIGN KEY constraint failed` on every write, the
25659    /// session never gets marked indexed, and the pending backlog grows
25660    /// without bound. `cleanup_orphan_fk_rows` is the indexer-startup
25661    /// self-heal that breaks the cycle.
25662    #[test]
25663    fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
25664        let dir = TempDir::new().unwrap();
25665        let db_path = dir.path().join("orphan_fk_self_heal.db");
25666        let storage = FrankenStorage::open(&db_path).unwrap();
25667
25668        // Plant orphan rows directly: rows whose FK parent does not exist.
25669        // FK enforcement is temporarily off so the planted rows can land.
25670        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25671
25672        // Seed a real conversation so a subset of children DO have valid
25673        // parents — we want the cleanup to be precise, not a table-flush.
25674        storage
25675            .raw()
25676            .execute_compat(
25677                "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
25678                 VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
25679                fparams![],
25680            )
25681            .unwrap();
25682        storage
25683            .raw()
25684            .execute_compat(
25685                "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
25686                 VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
25687                fparams![],
25688            )
25689            .unwrap();
25690        storage
25691            .raw()
25692            .execute_compat(
25693                "INSERT INTO messages(id, conversation_id, idx, role, content) \
25694                 VALUES(1, 1, 0, 'user', 'real message')",
25695                fparams![],
25696            )
25697            .unwrap();
25698
25699        // Plant orphan messages referencing conversation_id=99999 (does not exist)
25700        // and conversation_id=0 (the specific shape reported in #202). Distinct
25701        // (conversation_id, idx) pairs are required by the UNIQUE constraint.
25702        for (mid, cid, idx) in [(101_i64, 99_999_i64, 0_i64), (102, 0, 0), (103, 0, 1)] {
25703            storage
25704                .raw()
25705                .execute_compat(
25706                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25707                     VALUES(?1, ?2, ?3, 'user', 'orphan message')",
25708                    fparams![mid, cid, idx],
25709                )
25710                .unwrap();
25711        }
25712
25713        // Rows below are not directly orphaned because their immediate
25714        // `messages` parent exists, but that parent is itself orphaned. The
25715        // cleanup deletes them explicitly before deleting orphan messages so the
25716        // FK cascade engine does not have to run one delete program per orphan.
25717        for message_id in [1_i64, 101_i64, 102_i64] {
25718            storage
25719                .raw()
25720                .execute_compat(
25721                    "INSERT INTO message_metrics(
25722                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25723                         role, content_chars, content_tokens_est
25724                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
25725                    fparams![message_id],
25726                )
25727                .unwrap();
25728            storage
25729                .raw()
25730                .execute_compat(
25731                    "INSERT INTO token_usage(
25732                         message_id, conversation_id, agent_id, timestamp_ms, day_id,
25733                         role, content_chars
25734                     ) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
25735                    fparams![message_id],
25736                )
25737                .unwrap();
25738        }
25739
25740        // Plant a directly-orphan snippet — message_id=99999 does not exist
25741        // anywhere, so this exercises the snippets DELETE path rather than
25742        // riding on the cascade from the orphan-message DELETE.
25743        storage
25744            .raw()
25745            .execute_compat(
25746                "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
25747                 VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
25748                fparams![],
25749            )
25750            .unwrap();
25751
25752        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25753
25754        // Sanity: the planted orphans are visible.
25755        let messages_before: i64 = storage
25756            .raw()
25757            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25758                row.get_typed(0)
25759            })
25760            .unwrap();
25761        assert_eq!(messages_before, 4); // 1 real + 3 orphans
25762        let snippets_before: i64 = storage
25763            .raw()
25764            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25765                row.get_typed(0)
25766            })
25767            .unwrap();
25768        assert_eq!(snippets_before, 1);
25769        let metrics_before: i64 = storage
25770            .raw()
25771            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25772                row.get_typed(0)
25773            })
25774            .unwrap();
25775        assert_eq!(metrics_before, 3);
25776        let token_usage_before: i64 = storage
25777            .raw()
25778            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25779                row.get_typed(0)
25780            })
25781            .unwrap();
25782        assert_eq!(token_usage_before, 3);
25783
25784        // Run the self-heal.
25785        let report = storage.cleanup_orphan_fk_rows().unwrap();
25786
25787        // 3 orphan messages + 1 directly-orphan snippet = 4 primary orphans
25788        // reported. Dependent message_metrics/token_usage rows for orphan
25789        // messages are pruned too, but they are not double-counted because the
25790        // orphan message is the root row that made them invalid.
25791        let messages_after: i64 = storage
25792            .raw()
25793            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25794                row.get_typed(0)
25795            })
25796            .unwrap();
25797        assert_eq!(messages_after, 1, "real message must be preserved");
25798        let snippets_after: i64 = storage
25799            .raw()
25800            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25801                row.get_typed(0)
25802            })
25803            .unwrap();
25804        assert_eq!(snippets_after, 0);
25805        let metrics_after: i64 = storage
25806            .raw()
25807            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25808                row.get_typed(0)
25809            })
25810            .unwrap();
25811        assert_eq!(metrics_after, 1, "real message metric must be preserved");
25812        let token_usage_after: i64 = storage
25813            .raw()
25814            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25815                row.get_typed(0)
25816            })
25817            .unwrap();
25818        assert_eq!(token_usage_after, 1, "real token row must be preserved");
25819
25820        assert_eq!(report.total, 4, "report total: {:?}", report);
25821        let messages_count = report
25822            .per_table
25823            .iter()
25824            .find(|(t, _)| *t == "messages")
25825            .map(|(_, c)| *c);
25826        assert_eq!(messages_count, Some(3));
25827        let snippets_count = report
25828            .per_table
25829            .iter()
25830            .find(|(t, _)| *t == "snippets")
25831            .map(|(_, c)| *c);
25832        assert_eq!(snippets_count, Some(1));
25833
25834        // Second invocation on a now-clean DB must be a no-op.
25835        let second = storage.cleanup_orphan_fk_rows().unwrap();
25836        assert_eq!(second.total, 0);
25837        assert!(second.per_table.is_empty());
25838    }
25839
25840    #[test]
25841    fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
25842        let dir = TempDir::new().unwrap();
25843        let db_path = dir.path().join("orphan_fk_chunked_self_heal.db");
25844        let storage = FrankenStorage::open(&db_path).unwrap();
25845        let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;
25846
25847        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25848        {
25849            let mut tx = storage.raw().transaction().unwrap();
25850            for idx in 0..orphan_count {
25851                let message_id = 10_000_i64 + i64::try_from(idx).unwrap();
25852                let conversation_id = 20_000_i64 + i64::try_from(idx).unwrap();
25853                tx.execute_compat(
25854                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25855                     VALUES(?1, ?2, 0, 'user', 'orphan message')",
25856                    fparams![message_id, conversation_id],
25857                )
25858                .unwrap();
25859                tx.execute_compat(
25860                    "INSERT INTO message_metrics(
25861                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25862                         role, content_chars, content_tokens_est
25863                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
25864                    fparams![message_id],
25865                )
25866                .unwrap();
25867            }
25868            tx.commit().unwrap();
25869        }
25870        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25871
25872        let report = storage.cleanup_orphan_fk_rows().unwrap();
25873
25874        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25875        let messages_count = report
25876            .per_table
25877            .iter()
25878            .find(|(table, _)| *table == "messages")
25879            .map(|(_, count)| *count);
25880        assert_eq!(messages_count, Some(i64::try_from(orphan_count).unwrap()));
25881        let messages_after: i64 = storage
25882            .raw()
25883            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25884                row.get_typed(0)
25885            })
25886            .unwrap();
25887        assert_eq!(messages_after, 0);
25888        let metrics_after: i64 = storage
25889            .raw()
25890            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25891                row.get_typed(0)
25892            })
25893            .unwrap();
25894        assert_eq!(metrics_after, 0);
25895    }
25896
25897    #[test]
25898    fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
25899        let dir = TempDir::new().unwrap();
25900        let db_path = dir.path().join("direct_orphan_fk_paged_self_heal.db");
25901        let storage = FrankenStorage::open(&db_path).unwrap();
25902        let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;
25903
25904        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25905        {
25906            let mut tx = storage.raw().transaction().unwrap();
25907            for idx in 0..orphan_count {
25908                let message_id = 50_000_i64 + i64::try_from(idx).unwrap();
25909                tx.execute_compat(
25910                    "INSERT INTO message_metrics(
25911                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25912                         role, content_chars, content_tokens_est
25913                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
25914                    fparams![message_id],
25915                )
25916                .unwrap();
25917            }
25918            tx.commit().unwrap();
25919        }
25920        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25921
25922        let report = storage.cleanup_orphan_fk_rows().unwrap();
25923
25924        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25925        let metrics_count = report
25926            .per_table
25927            .iter()
25928            .filter(|(table, _)| *table == "message_metrics")
25929            .map(|(_, count)| *count)
25930            .sum::<i64>();
25931        assert_eq!(metrics_count, i64::try_from(orphan_count).unwrap());
25932        assert_eq!(
25933            report
25934                .per_table
25935                .iter()
25936                .filter(|(table, _)| *table == "message_metrics")
25937                .count(),
25938            1,
25939            "paged cleanup should aggregate report entries by table: {report:?}"
25940        );
25941        let metrics_after: i64 = storage
25942            .raw()
25943            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25944                row.get_typed(0)
25945            })
25946            .unwrap();
25947        assert_eq!(metrics_after, 0);
25948    }
25949}