Skip to main content

coding_agent_search/storage/
sqlite.rs

1//! `SQLite` backend: schema, pragmas, and migrations.
2
3use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole, Snippet};
4use crate::sources::provenance::{LOCAL_SOURCE_ID, Source, SourceKind};
5use anyhow::{Context, Result, anyhow, bail};
6use frankensqlite::{
7    Connection as FrankenConnection, Row as FrankenRow, SqliteValue,
8    compat::{
9        ConnectionExt as FrankenConnectionExt, OpenFlags as FrankenOpenFlags,
10        OptionalExtension as FrankenOptionalExtension, ParamValue, RowExt as FrankenRowExt,
11        Transaction as FrankenTransaction, TransactionExt as FrankenTransactionExt,
12        open_with_flags as open_franken_with_flags, param_slice_to_values, params_from_iter,
13    },
14    migrate::MigrationRunner,
15};
16use serde::{Deserialize, Serialize};
17use smallvec::SmallVec;
18use std::borrow::Cow;
19use std::collections::{HashMap, HashSet};
20use std::fs;
21use std::io::{BufRead, BufReader, Write};
22use std::process::{Command, Stdio};
23use std::sync::{
24    Arc,
25    atomic::{AtomicBool, AtomicI8, AtomicI64, AtomicU64, AtomicUsize, Ordering},
26};
27
/// Frankensqlite parameter list builder.
///
/// Expands to a `&[ParamValue]` slice:
/// - `fparams![]` yields an empty slice;
/// - `fparams![a, b, ...]` converts each expression with `ParamValue::from`,
///   preserving argument order.  A trailing comma is accepted.
macro_rules! fparams {
    // No arguments: an empty parameter slice.
    () => {
        &[] as &[ParamValue]
    };
    // One or more comma-separated expressions, optionally followed by a trailing comma.
    ($($val:expr),+ $(,)?) => {
        &[$(ParamValue::from($val)),+] as &[ParamValue]
    };
}
37use std::path::{Path, PathBuf};
38use std::time::{Duration, Instant, SystemTime, UNIX_EPOCH};
39use thiserror::Error;
40use tracing::info;
41
/// How long `LazyFrankenDb::get` waits for the doctor mutation lock before
/// giving up on opening the database.
const DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT: Duration = Duration::from_secs(30);
/// Upper bound (64 KiB) on how many bytes of the doctor lock file are read
/// when looking for its `pid=` metadata line.
const DOCTOR_MUTATION_LOCK_MAX_METADATA_READ: u64 = 64 * 1024;
44
45// -------------------------------------------------------------------------
46// Lazy FrankenSQLite Connection (bd-1ueu)
47// -------------------------------------------------------------------------
48// Defers opening the database until first use, cutting startup cost for
49// commands that may not need the DB at all.  Thread-safe via parking_lot
50// Mutex; logs the reason and duration of the open on first access.
51
/// Error from lazy database initialization.
#[derive(Debug, Error)]
pub enum LazyDbError {
    /// The database path did not exist when the lazy open was attempted.
    #[error("Database not found at {0}")]
    NotFound(PathBuf),
    /// The open itself failed.  Also used (with a `FrankenError::Internal`
    /// source) when the doctor mutation lock could not be acquired or when a
    /// timed open did not complete in time.
    #[error("Failed to open FrankenSQLite database at {path}: {source}")]
    FrankenOpenFailed {
        /// Path of the database that could not be opened.
        path: PathBuf,
        /// Underlying frankensqlite error.
        source: frankensqlite::FrankenError,
    },
}
63
64// -------------------------------------------------------------------------
65// LazyFrankenDb — lazy wrapper around FrankenConnection
66// -------------------------------------------------------------------------
67
/// Wrapper around `FrankenConnection` that implements `Send`.
///
/// `FrankenConnection` is `!Send` because it uses `Rc` internally.
/// However, the `Rc` values are entirely self-contained within the Connection
/// and are not shared externally.  When wrapped in a `Mutex`,
/// exclusive access is guaranteed, making cross-thread transfer safe.
///
/// Tuple layout: `.0` is the connection; `.1` and `.2` carry the index-writer
/// checkpoint page count and busy timeout in milliseconds (see
/// `new_with_index_writer_state`).  `new` fills both with their `UNSET_*`
/// sentinel constants.
pub struct SendFrankenConnection(FrankenConnection, i64, u64);

// SAFETY: Rc fields inside FrankenConnection are not cloned or shared externally.
// The Mutex<Option<SendFrankenConnection>> ensures exclusive access.
// NOTE(review): this relies on every holder keeping the wrapper behind a mutex
// (or otherwise on one thread at a time); the type itself does not enforce it.
unsafe impl Send for SendFrankenConnection {}
79
80impl SendFrankenConnection {
81    pub(crate) fn new(conn: FrankenConnection) -> Self {
82        Self(
83            conn,
84            UNSET_INDEX_WRITER_CHECKPOINT_PAGES,
85            UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS,
86        )
87    }
88
89    pub(crate) fn new_with_index_writer_state(
90        conn: FrankenConnection,
91        checkpoint_pages: i64,
92        busy_timeout_ms: u64,
93    ) -> Self {
94        Self(conn, checkpoint_pages, busy_timeout_ms)
95    }
96
97    pub(crate) fn into_parts(self) -> (FrankenConnection, i64, u64) {
98        (self.0, self.1, self.2)
99    }
100}
101
102impl std::ops::Deref for SendFrankenConnection {
103    type Target = FrankenConnection;
104    fn deref(&self) -> &FrankenConnection {
105        &self.0
106    }
107}
108
/// Lazy-opening wrapper for `FrankenConnection` (frankensqlite).
///
/// Constructing a `LazyFrankenDb` is cheap (no I/O).  The underlying
/// `FrankenConnection` is opened on the first call to [`LazyFrankenDb::get`]
/// (or [`LazyFrankenDb::get_with_timeout`]).
/// Subsequent calls return the cached connection.
pub struct LazyFrankenDb {
    /// Location of the database file; fixed at construction.
    path: PathBuf,
    /// `None` until the first successful open, then the cached connection.
    conn: parking_lot::Mutex<Option<SendFrankenConnection>>,
}
118
/// RAII guard that dereferences to the inner `FrankenConnection`.
///
/// Wraps the mutex guard of the lazy connection slot, so the slot stays locked
/// for as long as this value is alive.
pub struct LazyFrankenDbGuard<'a>(parking_lot::MutexGuard<'a, Option<SendFrankenConnection>>);
121
122impl std::fmt::Debug for LazyFrankenDbGuard<'_> {
123    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
124        f.debug_tuple("LazyFrankenDbGuard")
125            .field(&self.0.is_some())
126            .finish()
127    }
128}
129
130impl std::ops::Deref for LazyFrankenDbGuard<'_> {
131    type Target = FrankenConnection;
132    fn deref(&self) -> &FrankenConnection {
133        self.0
134            .as_ref()
135            .expect("LazyFrankenDb connection must be initialized before access")
136    }
137}
138
139impl LazyFrankenDb {
140    /// Create a lazy handle pointing at `path`.  No I/O is performed.
141    pub fn new(path: PathBuf) -> Self {
142        Self {
143            path,
144            conn: parking_lot::Mutex::new(None),
145        }
146    }
147
148    /// Resolve path from optional CLI overrides.
149    ///
150    /// Uses `data_dir / agent_search.db` as fallback.
151    pub fn from_overrides(data_dir: &Option<PathBuf>, db_override: Option<PathBuf>) -> Self {
152        let data_dir = data_dir.clone().unwrap_or_else(crate::default_data_dir);
153        let path = db_override.unwrap_or_else(|| data_dir.join("agent_search.db"));
154        Self::new(path)
155    }
156
157    /// Get the connection, opening the database on first access.
158    ///
159    /// `reason` is logged alongside the open duration so callers can
160    /// identify which command triggered the open.
161    pub fn get(&self, reason: &str) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
162        let mut guard = self.conn.lock();
163        if guard.is_none() {
164            if !self.path.exists() {
165                return Err(LazyDbError::NotFound(self.path.clone()));
166            }
167            let start = Instant::now();
168            let _doctor_guard = acquire_doctor_mutation_db_open_guard(
169                &self.path,
170                DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT,
171            )
172            .map_err(|err| LazyDbError::FrankenOpenFailed {
173                path: self.path.clone(),
174                source: frankensqlite::FrankenError::Internal(err.to_string()),
175            })?;
176            let conn =
177                FrankenConnection::open(self.path.to_string_lossy().into_owned()).map_err(|e| {
178                    LazyDbError::FrankenOpenFailed {
179                        path: self.path.clone(),
180                        source: e,
181                    }
182                })?;
183            let elapsed_ms = start.elapsed().as_millis();
184            info!(
185                path = %self.path.display(),
186                elapsed_ms = elapsed_ms,
187                reason = reason,
188                "lazily opened FrankenSQLite database"
189            );
190            *guard = Some(SendFrankenConnection::new(conn));
191        }
192        Ok(LazyFrankenDbGuard(guard))
193    }
194
195    /// Get the connection with a timeout, opening the database on first access.
196    ///
197    /// Like [`get`] but spawns the open in a background thread and waits up to
198    /// `timeout` for it to complete. Returns `LazyDbError::FrankenOpenFailed`
199    /// with a descriptive message if the timeout elapses. Fix for #128.
200    pub fn get_with_timeout(
201        &self,
202        reason: &str,
203        timeout: Duration,
204    ) -> std::result::Result<LazyFrankenDbGuard<'_>, LazyDbError> {
205        let mut guard = self.conn.lock();
206        if guard.is_none() {
207            if !self.path.exists() {
208                return Err(LazyDbError::NotFound(self.path.clone()));
209            }
210            let start = Instant::now();
211            let path_owned = self.path.to_string_lossy().into_owned();
212            let path_for_guard = self.path.clone();
213            let (tx, rx) = std::sync::mpsc::channel();
214            std::thread::spawn(move || {
215                let _doctor_guard =
216                    match acquire_doctor_mutation_db_open_guard(&path_for_guard, timeout) {
217                        Ok(guard) => guard,
218                        Err(err) => {
219                            let _ = tx
220                                .send(Err(frankensqlite::FrankenError::Internal(err.to_string())));
221                            return;
222                        }
223                    };
224                let _ =
225                    tx.send(FrankenConnection::open(path_owned).map(SendFrankenConnection::new));
226            });
227            let conn = rx
228                .recv_timeout(timeout)
229                .map_err(|_| LazyDbError::FrankenOpenFailed {
230                    path: self.path.clone(),
231                    source: frankensqlite::FrankenError::Internal(format!(
232                        "database open timed out after {}s (possible corruption or lock contention)",
233                        timeout.as_secs()
234                    )),
235                })?
236                .map_err(|e| LazyDbError::FrankenOpenFailed {
237                    path: self.path.clone(),
238                    source: e,
239                })?;
240            let elapsed_ms = start.elapsed().as_millis();
241            info!(
242                path = %self.path.display(),
243                elapsed_ms = elapsed_ms,
244                reason = reason,
245                "lazily opened FrankenSQLite database (with timeout)"
246            );
247            *guard = Some(conn);
248        }
249        Ok(LazyFrankenDbGuard(guard))
250    }
251
252    /// Path to the database file (even if not yet opened).
253    pub fn path(&self) -> &Path {
254        &self.path
255    }
256
257    /// Whether the connection has been opened.
258    pub fn is_open(&self) -> bool {
259        self.conn.lock().is_some()
260    }
261}
262
/// State of the retry-jitter generator; advanced by a fixed increment on every
/// `next_franken_retry_jitter_ms` call.
static FRANKEN_RETRY_JITTER_STATE: AtomicU64 = AtomicU64::new(0x9e37_79b9_7f4a_7c15);
/// Number of live `DoctorMutationDbOpenBypassGuard`s; the doctor lock is
/// skipped on open while this is non-zero.
static DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH: AtomicUsize = AtomicUsize::new(0);
/// Gate for the message-lookup counters below; they only advance while this is `true`.
static MESSAGE_LOOKUP_TRACE_ENABLED: AtomicBool = AtomicBool::new(false);
/// Count of recorded exact-index probes.
static MESSAGE_LOOKUP_EXACT_IDX_PROBES: AtomicU64 = AtomicU64::new(0);
/// Count of recorded bounded lookup queries.
static MESSAGE_LOOKUP_BOUNDED_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Count of recorded full-scan queries.
static MESSAGE_LOOKUP_FULL_SCAN_QUERIES: AtomicU64 = AtomicU64::new(0);
/// Total rows reported by bounded and full-scan query recordings.
static MESSAGE_LOOKUP_ROWS_MATERIALIZED: AtomicU64 = AtomicU64::new(0);
270
/// Point-in-time copy of the global message-lookup trace counters.
///
/// Produced by `message_lookup_trace_snapshot`; two snapshots can be diffed
/// with `saturating_sub` to get the activity between them.
#[derive(Debug, Clone, Copy, Default, Serialize)]
pub(crate) struct MessageLookupTraceCounters {
    /// Recorded exact-index probes.
    pub exact_idx_probes: u64,
    /// Recorded bounded lookup queries.
    pub bounded_lookup_queries: u64,
    /// Recorded full-scan queries.
    pub full_scan_queries: u64,
    /// Rows reported by bounded and full-scan query recordings.
    pub rows_materialized: u64,
}
278
279impl MessageLookupTraceCounters {
280    pub(crate) fn saturating_sub(self, before: Self) -> Self {
281        Self {
282            exact_idx_probes: self
283                .exact_idx_probes
284                .saturating_sub(before.exact_idx_probes),
285            bounded_lookup_queries: self
286                .bounded_lookup_queries
287                .saturating_sub(before.bounded_lookup_queries),
288            full_scan_queries: self
289                .full_scan_queries
290                .saturating_sub(before.full_scan_queries),
291            rows_materialized: self
292                .rows_materialized
293                .saturating_sub(before.rows_materialized),
294        }
295    }
296
297    pub(crate) fn lookups_against_global(self) -> u64 {
298        self.exact_idx_probes.saturating_add(self.rows_materialized)
299    }
300}
301
302pub(crate) fn set_message_lookup_trace_enabled(enabled: bool) -> bool {
303    MESSAGE_LOOKUP_TRACE_ENABLED.swap(enabled, Ordering::Relaxed)
304}
305
306pub(crate) fn message_lookup_trace_snapshot() -> MessageLookupTraceCounters {
307    MessageLookupTraceCounters {
308        exact_idx_probes: MESSAGE_LOOKUP_EXACT_IDX_PROBES.load(Ordering::Relaxed),
309        bounded_lookup_queries: MESSAGE_LOOKUP_BOUNDED_QUERIES.load(Ordering::Relaxed),
310        full_scan_queries: MESSAGE_LOOKUP_FULL_SCAN_QUERIES.load(Ordering::Relaxed),
311        rows_materialized: MESSAGE_LOOKUP_ROWS_MATERIALIZED.load(Ordering::Relaxed),
312    }
313}
314
315fn record_message_lookup_exact_idx_probe() {
316    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
317        MESSAGE_LOOKUP_EXACT_IDX_PROBES.fetch_add(1, Ordering::Relaxed);
318    }
319}
320
321fn record_message_lookup_bounded_queries(query_count: u64, rows: usize) {
322    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
323        MESSAGE_LOOKUP_BOUNDED_QUERIES.fetch_add(query_count, Ordering::Relaxed);
324        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
325    }
326}
327
328fn record_message_lookup_full_scan_query(rows: usize) {
329    if MESSAGE_LOOKUP_TRACE_ENABLED.load(Ordering::Relaxed) {
330        MESSAGE_LOOKUP_FULL_SCAN_QUERIES.fetch_add(1, Ordering::Relaxed);
331        MESSAGE_LOOKUP_ROWS_MATERIALIZED.fetch_add(rows as u64, Ordering::Relaxed);
332    }
333}
334
335pub(crate) struct DoctorMutationDbOpenBypassGuard;
336
337impl Drop for DoctorMutationDbOpenBypassGuard {
338    fn drop(&mut self) {
339        DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_sub(1, Ordering::SeqCst);
340    }
341}
342
343pub(crate) fn enter_doctor_mutation_db_open_bypass() -> DoctorMutationDbOpenBypassGuard {
344    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.fetch_add(1, Ordering::SeqCst);
345    DoctorMutationDbOpenBypassGuard
346}
347
348fn doctor_mutation_db_open_bypass_active() -> bool {
349    DOCTOR_MUTATION_DB_OPEN_BYPASS_DEPTH.load(Ordering::SeqCst) > 0
350}
351
352fn next_franken_retry_jitter_ms(max_inclusive: u64) -> u64 {
353    let mut value = FRANKEN_RETRY_JITTER_STATE.fetch_add(0x9e37_79b9_7f4a_7c15, Ordering::Relaxed);
354    value ^= value >> 30;
355    value = value.wrapping_mul(0xbf58_476d_1ce4_e5b9);
356    value ^= value >> 27;
357    value = value.wrapping_mul(0x94d0_49bb_1331_11eb);
358    value ^= value >> 31;
359    value % max_inclusive.saturating_add(1)
360}
361
362/// Sleep with jittered exponential backoff to avoid lock-step retry storms
363/// when many threads hit the same transient SQLite/frankensqlite contention.
364pub(crate) fn sleep_with_franken_retry_backoff(
365    backoff: &mut Duration,
366    remaining: Duration,
367    max_backoff: Duration,
368) {
369    let capped = (*backoff).min(remaining);
370    let extra_budget = remaining.saturating_sub(capped).min(capped);
371    let extra_ms = extra_budget.as_millis().min(u128::from(u64::MAX)) as u64;
372    let sleep_for = if extra_ms == 0 {
373        capped
374    } else {
375        capped
376            .saturating_add(Duration::from_millis(next_franken_retry_jitter_ms(
377                extra_ms,
378            )))
379            .min(remaining)
380    };
381    std::thread::sleep(sleep_for);
382    *backoff = backoff.saturating_mul(2).min(max_backoff);
383}
384
385struct DoctorMutationDbOpenGuard(Option<fs::File>);
386
387impl Drop for DoctorMutationDbOpenGuard {
388    fn drop(&mut self) {
389        if let Some(file) = self.0.as_ref() {
390            let _ = fs2::FileExt::unlock(file);
391        }
392    }
393}
394
/// Location of the doctor repair lock that guards opening `db_path`.
///
/// Returns `None` unless the file is named exactly `agent_search.db` and has a
/// parent directory; otherwise the lock lives at
/// `<parent>/doctor/locks/doctor-repair.lock`.
fn doctor_mutation_lock_path_for_db_open(db_path: &Path) -> Option<PathBuf> {
    let file_name = db_path.file_name().and_then(|name| name.to_str());
    if !matches!(file_name, Some("agent_search.db")) {
        return None;
    }

    let mut lock_path = db_path.parent()?.to_path_buf();
    lock_path.extend(["doctor", "locks", "doctor-repair.lock"]);
    Some(lock_path)
}
408
/// Whether the lock metadata text contains a `pid=<n>` line naming this process.
///
/// Lines without `=` are ignored; key and value are trimmed before comparison,
/// and values that do not parse as `u32` never match.
fn doctor_lock_metadata_pid_is_current_process(raw: &str) -> bool {
    let own_pid = std::process::id();
    raw.lines()
        .filter_map(|line| line.split_once('='))
        .filter(|(key, _)| key.trim() == "pid")
        .any(|(_, value)| matches!(value.trim().parse::<u32>(), Ok(pid) if pid == own_pid))
}
421
422fn doctor_lock_file_pid_is_current_process(file: &fs::File) -> bool {
423    use std::io::Read as _;
424
425    let Ok(mut file) = file.try_clone() else {
426        return false;
427    };
428    let mut raw = String::new();
429    let _ = std::io::Read::take(&mut file, DOCTOR_MUTATION_LOCK_MAX_METADATA_READ)
430        .read_to_string(&mut raw);
431    doctor_lock_metadata_pid_is_current_process(&raw)
432}
433
/// Take the shared doctor mutation lock before opening `db_path`.
///
/// Returns a guard holding no lock when any of these hold:
/// - `db_path` has no associated lock path (see
///   `doctor_mutation_lock_path_for_db_open`);
/// - a doctor-lock bypass guard is currently alive;
/// - the lock file's metadata names the current process.
///
/// Otherwise the lock directory is created if needed and a shared lock is
/// attempted repeatedly with jittered backoff until `timeout` has elapsed.
///
/// # Errors
///
/// Fails if the lock directory or file cannot be created/opened, if the lock
/// is still held elsewhere once `timeout` has elapsed, or if locking fails for
/// any reason other than contention.
fn acquire_doctor_mutation_db_open_guard(
    db_path: &Path,
    timeout: Duration,
) -> Result<DoctorMutationDbOpenGuard> {
    let Some(lock_path) = doctor_mutation_lock_path_for_db_open(db_path) else {
        return Ok(DoctorMutationDbOpenGuard(None));
    };
    if doctor_mutation_db_open_bypass_active() {
        return Ok(DoctorMutationDbOpenGuard(None));
    }

    if let Some(parent) = lock_path.parent() {
        fs::create_dir_all(parent).with_context(|| {
            format!(
                "creating doctor mutation lock directory {} before opening {}",
                parent.display(),
                db_path.display()
            )
        })?;
    }

    let deadline = Instant::now() + timeout;
    let mut backoff = Duration::from_millis(4);
    loop {
        // The lock file is (re)opened on every attempt; `truncate(false)`
        // keeps any metadata already written to it.
        let file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .with_context(|| {
                format!(
                    "opening doctor mutation lock {} before opening {}",
                    lock_path.display(),
                    db_path.display()
                )
            })?;

        // The lock metadata names this process: proceed without locking.
        if doctor_lock_file_pid_is_current_process(&file) {
            return Ok(DoctorMutationDbOpenGuard(None));
        }

        match fs2::FileExt::try_lock_shared(&file) {
            Ok(()) => return Ok(DoctorMutationDbOpenGuard(Some(file))),
            // Contention: retry until the deadline, then give up.
            Err(err) if err.kind() == std::io::ErrorKind::WouldBlock => {
                let now = Instant::now();
                if now >= deadline {
                    return Err(anyhow!(
                        "doctor mutation lock {} is active while opening {}; refusing to open during repair after waiting {}ms",
                        lock_path.display(),
                        db_path.display(),
                        timeout.as_millis()
                    ));
                }
                let remaining = deadline.saturating_duration_since(now);
                sleep_with_franken_retry_backoff(
                    &mut backoff,
                    remaining,
                    Duration::from_millis(128),
                );
            }
            // Any other locking failure is not retried.
            Err(err) => {
                return Err(anyhow!(
                    "failed to acquire shared doctor mutation lock {} before opening {}: {}",
                    lock_path.display(),
                    db_path.display(),
                    err
                ));
            }
        }
    }
}
506
507pub(crate) fn open_franken_storage_with_timeout(
508    path: &Path,
509    timeout: Duration,
510) -> Result<FrankenStorage> {
511    if !path.exists() {
512        return Err(anyhow!("Database not found at {}", path.display()));
513    }
514
515    let deadline = Instant::now() + timeout;
516    let mut backoff = Duration::from_millis(4);
517    loop {
518        match FrankenStorage::open(path) {
519            Ok(storage) => return Ok(storage),
520            Err(err) if retryable_franken_anyhow(&err) => {
521                let now = Instant::now();
522                if now >= deadline {
523                    return Err(err);
524                }
525                let remaining = deadline.saturating_duration_since(now);
526                sleep_with_franken_retry_backoff(
527                    &mut backoff,
528                    remaining,
529                    Duration::from_millis(128),
530                );
531            }
532            Err(err) => return Err(err),
533        }
534    }
535}
536
/// Open the database at `path` only if it is already at the current schema version.
///
/// Returns `Ok(None)` when `path` does not exist or when the stored
/// `schema_version` is missing, unreadable, or different from
/// `CURRENT_SCHEMA_VERSION`; in the mismatch case the connection is closed
/// before returning.  On a match, the storage is transitioned, repaired and
/// configured before being returned.
///
/// # Errors
///
/// Propagates failures from opening the raw connection and from the
/// transition / repair / configuration steps.
pub(crate) fn open_current_schema_storage_with_timeout(
    path: &Path,
    timeout: Duration,
) -> Result<Option<FrankenStorage>> {
    if !path.exists() {
        return Ok(None);
    }

    let mut storage = FrankenStorage::new(
        open_franken_raw_connection_with_timeout(path, timeout)?,
        path.to_path_buf(),
    );
    storage.apply_open_stage_busy_timeout();

    // Any failure along this chain (query error, no row, wrong type,
    // non-numeric text) collapses to `None`, i.e. "not the current schema".
    let version = storage
        .raw()
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .ok()
        .and_then(|rows| rows.first().cloned())
        .and_then(|row| row.get_typed::<String>(0).ok())
        .and_then(|raw| raw.parse::<i64>().ok());

    if version != Some(CURRENT_SCHEMA_VERSION) {
        // Prefer closing without a checkpoint; fall back to a best-effort
        // close if that fails.
        if let Err(close_err) = storage.close_without_checkpoint_in_place() {
            tracing::debug!(
                error = %close_err,
                db_path = %path.display(),
                "open_current_schema_storage_with_timeout: close_without_checkpoint_in_place failed; falling back to best-effort close"
            );
            storage.close_best_effort_in_place();
        }
        return Ok(None);
    }

    transition_from_meta_version(&storage.conn)?;
    storage.repair_missing_current_schema_objects()?;
    storage.apply_config()?;
    Ok(Some(storage))
}
576
577pub(crate) fn open_franken_readonly_storage_with_timeout(
578    path: &Path,
579    timeout: Duration,
580) -> Result<FrankenStorage> {
581    if !path.exists() {
582        return Err(anyhow!("Database not found at {}", path.display()));
583    }
584
585    let deadline = Instant::now() + timeout;
586    let mut backoff = Duration::from_millis(4);
587    loop {
588        match FrankenStorage::open_readonly(path) {
589            Ok(storage) => return Ok(storage),
590            Err(err) if retryable_franken_anyhow(&err) => {
591                let now = Instant::now();
592                if now >= deadline {
593                    return Err(err);
594                }
595                let remaining = deadline.saturating_duration_since(now);
596                sleep_with_franken_retry_backoff(
597                    &mut backoff,
598                    remaining,
599                    Duration::from_millis(128),
600                );
601            }
602            Err(err) => return Err(err),
603        }
604    }
605}
606
607pub(crate) fn open_franken_raw_connection_with_timeout(
608    path: &Path,
609    timeout: Duration,
610) -> Result<FrankenConnection> {
611    if !path.exists() {
612        return Err(anyhow!("Database not found at {}", path.display()));
613    }
614
615    let path_str = path.to_string_lossy().to_string();
616    let deadline = Instant::now() + timeout;
617    let mut backoff = Duration::from_millis(4);
618    loop {
619        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
620        match FrankenConnection::open(&path_str)
621            .with_context(|| format!("opening raw frankensqlite db at {}", path.display()))
622        {
623            Ok(conn) => return Ok(conn),
624            Err(err) if retryable_franken_anyhow(&err) => {
625                let now = Instant::now();
626                if now >= deadline {
627                    return Err(err);
628                }
629                let remaining = deadline.saturating_duration_since(now);
630                sleep_with_franken_retry_backoff(
631                    &mut backoff,
632                    remaining,
633                    Duration::from_millis(128),
634                );
635            }
636            Err(err) => return Err(err),
637        }
638    }
639}
640
641pub(crate) fn open_franken_raw_readonly_connection_with_timeout(
642    path: &Path,
643    timeout: Duration,
644) -> Result<FrankenConnection> {
645    if !path.exists() {
646        return Err(anyhow!("Database not found at {}", path.display()));
647    }
648
649    let path_str = path.to_string_lossy().to_string();
650    let deadline = Instant::now() + timeout;
651    let mut backoff = Duration::from_millis(4);
652    loop {
653        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
654        match open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
655            .with_context(|| {
656                format!(
657                    "opening raw frankensqlite db readonly at {}",
658                    path.display()
659                )
660            }) {
661            Ok(conn) => return Ok(conn),
662            Err(err) if retryable_franken_anyhow(&err) => {
663                let now = Instant::now();
664                if now >= deadline {
665                    return Err(err);
666                }
667                let remaining = deadline.saturating_duration_since(now);
668                sleep_with_franken_retry_backoff(
669                    &mut backoff,
670                    remaining,
671                    Duration::from_millis(128),
672                );
673            }
674            Err(err) => return Err(err),
675        }
676    }
677}
678
679pub(crate) fn retryable_franken_error(err: &frankensqlite::FrankenError) -> bool {
680    matches!(
681        err,
682        frankensqlite::FrankenError::Busy
683            | frankensqlite::FrankenError::BusyRecovery
684            | frankensqlite::FrankenError::BusySnapshot { .. }
685            | frankensqlite::FrankenError::DatabaseLocked { .. }
686            | frankensqlite::FrankenError::LockFailed { .. }
687            | frankensqlite::FrankenError::WriteConflict { .. }
688            | frankensqlite::FrankenError::SerializationFailure { .. }
689    ) || retryable_storage_error_message(&err.to_string())
690}
691
/// Whether an error message mentions a transient locking/contention condition.
///
/// Matching is ASCII case-insensitive and substring based.
pub(crate) fn retryable_storage_error_message(message: &str) -> bool {
    const RETRYABLE_MARKERS: [&str; 6] = [
        "busy",
        "locked",
        "locking",
        "contention",
        "temporarily unavailable",
        "would block",
    ];

    let normalized = message.to_ascii_lowercase();
    RETRYABLE_MARKERS
        .iter()
        .any(|&marker| normalized.contains(marker))
}
701
702pub(crate) fn retryable_franken_anyhow(err: &anyhow::Error) -> bool {
703    err.chain().any(|cause| {
704        cause
705            .downcast_ref::<frankensqlite::FrankenError>()
706            .is_some_and(retryable_franken_error)
707            || retryable_storage_error_message(&cause.to_string())
708    })
709}
710
711impl Drop for LazyFrankenDb {
712    fn drop(&mut self) {
713        let Some(mut conn) = self.conn.get_mut().take() else {
714            return;
715        };
716        conn.0.close_best_effort_in_place();
717    }
718}
719
720// -------------------------------------------------------------------------
721// FrankenSQLite Connection Manager (bead 3rlf8)
722// -------------------------------------------------------------------------
723// Multi-connection management: reader pool + concurrent writer connections.
724// Replaces the LazyFrankenDb single-connection bottleneck for high-throughput
725// scenarios (indexer parallel writes, concurrent TUI reads + indexer writes).
726
/// Configuration for the [`FrankenConnectionManager`].
///
/// `FrankenConnectionManager::new` raises either value to at least 1.
#[derive(Debug, Clone)]
pub struct ConnectionManagerConfig {
    /// Number of pre-opened reader connections (default: 4).
    pub reader_count: usize,
    /// Maximum concurrent writer connections (default: available parallelism,
    /// or 4 when that cannot be determined).
    pub max_writers: usize,
}
735
736impl Default for ConnectionManagerConfig {
737    fn default() -> Self {
738        let cpus = std::thread::available_parallelism()
739            .map(|n| n.get())
740            .unwrap_or(4);
741        Self {
742            reader_count: 4,
743            max_writers: cpus,
744        }
745    }
746}
747
/// Multi-connection manager for frankensqlite.
///
/// Provides:
/// - A pool of pre-opened reader connections (round-robin, Mutex-protected)
/// - Controlled creation of writer connections with token-based limits
/// - RAII guards that auto-rollback uncommitted transactions on drop
///
/// Thread-safe: reader connections are wrapped in Mutex (FrankenConnection is !Sync).
/// Writer connections are created per-request (each thread gets its own).
pub struct FrankenConnectionManager {
    /// Database file every reader and writer connection is opened against.
    db_path: PathBuf,
    /// Pre-opened reader connections, each behind its own mutex.
    readers: Vec<parking_lot::Mutex<SendFrankenConnection>>,
    /// Monotonic counter used to pick the next reader round-robin.
    reader_idx: std::sync::atomic::AtomicUsize,
    /// Token-based writer limit: channel pre-filled with `max_writers` tokens.
    /// `recv()` = acquire slot, `send()` = release slot.
    writer_tokens: (
        crossbeam_channel::Sender<()>,
        crossbeam_channel::Receiver<()>,
    ),
    /// Effective configuration (values after clamping to at least 1).
    config: ConnectionManagerConfig,
}

// SAFETY: FrankenConnectionManager is Send+Sync because:
// - readers wrapped in Mutex<SendFrankenConnection> (exclusive access)
// - writer_tokens uses crossbeam (Send+Sync)
// - db_path is PathBuf (Send+Sync)
// NOTE(review): soundness depends on `SendFrankenConnection`'s own `Send`
// justification holding — confirm when changing either type.
unsafe impl Send for FrankenConnectionManager {}
unsafe impl Sync for FrankenConnectionManager {}
776
777impl FrankenConnectionManager {
778    /// Create a new connection manager.
779    ///
780    /// Opens `config.reader_count` reader connections immediately.
781    /// Writer connections are created on demand (up to `config.max_writers`).
782    pub fn new(db_path: impl Into<PathBuf>, config: ConnectionManagerConfig) -> Result<Self> {
783        let db_path = db_path.into();
784        let path_str = db_path.to_string_lossy().to_string();
785
786        let reader_count = config.reader_count.max(1);
787        let mut readers = Vec::with_capacity(reader_count);
788        for _ in 0..reader_count {
789            let conn = FrankenConnection::open(&path_str)
790                .with_context(|| format!("opening reader connection at {}", db_path.display()))?;
791            // Apply read-tuned config (no migration, no write PRAGMAs)
792            let _ = conn.execute("PRAGMA busy_timeout = 5000;"); // match writer config
793            let _ = conn.execute("PRAGMA cache_size = -16384;"); // 16MB reader cache
794            readers.push(parking_lot::Mutex::new(SendFrankenConnection::new(conn)));
795        }
796
797        let max_writers = config.max_writers.max(1);
798
799        // Pre-fill bounded channel with tokens (acts as counting semaphore).
800        // A zero-capacity channel with no initial tokens would make the first
801        // writer acquisition block forever.
802        let (tx, rx) = crossbeam_channel::bounded(max_writers);
803        for _ in 0..max_writers {
804            tx.send(())
805                .map_err(|_| anyhow!("writer token channel closed during initialization"))?;
806        }
807
808        Ok(Self {
809            db_path,
810            readers,
811            reader_idx: std::sync::atomic::AtomicUsize::new(0),
812            writer_tokens: (tx, rx),
813            config: ConnectionManagerConfig {
814                reader_count,
815                max_writers,
816            },
817        })
818    }
819
820    /// Get a reader connection (round-robin from the pool).
821    ///
822    /// Returns a mutex guard wrapping the connection. The guard prevents
823    /// concurrent access to the same connection (FrankenConnection is !Sync).
824    pub fn reader(&self) -> parking_lot::MutexGuard<'_, SendFrankenConnection> {
825        let idx = self
826            .reader_idx
827            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
828        self.readers[idx % self.readers.len()].lock()
829    }
830
831    /// Acquire a writer connection.
832    ///
833    /// Opens a new frankensqlite connection with full config (no migration).
834    /// Blocks if `max_writers` connections are already in use.
835    /// The returned [`WriterGuard`] auto-rolls back on drop.
836    pub fn writer(&self) -> Result<WriterGuard<'_>> {
837        self.writer_tokens
838            .1
839            .recv()
840            .map_err(|_| anyhow!("writer token channel closed"))?;
841        let path_str = self.db_path.to_string_lossy().to_string();
842        let conn = match FrankenConnection::open(&path_str) {
843            Ok(c) => c,
844            Err(e) => {
845                let _ = self.writer_tokens.0.send(());
846                return Err(anyhow::Error::from(e).context(format!(
847                    "opening writer connection at {}",
848                    self.db_path.display()
849                )));
850            }
851        };
852        let storage = FrankenStorage::new(conn, self.db_path.clone());
853        if let Err(e) = storage.apply_config() {
854            let _ = self.writer_tokens.0.send(());
855            return Err(e);
856        }
857        Ok(WriterGuard {
858            storage,
859            mgr: self,
860            committed: false,
861        })
862    }
863
864    /// Acquire a concurrent writer connection (BEGIN CONCURRENT via MVCC).
865    ///
866    /// Similar to [`writer`] but tuned for the parallel indexer write pool.
867    /// Uses reduced cache size and is designed for short-lived batch inserts.
868    pub fn concurrent_writer(&self) -> Result<WriterGuard<'_>> {
869        self.writer_tokens
870            .1
871            .recv()
872            .map_err(|_| anyhow!("writer token channel closed"))?;
873        let path_str = self.db_path.to_string_lossy().to_string();
874        let conn = match FrankenConnection::open(&path_str) {
875            Ok(c) => c,
876            Err(e) => {
877                let _ = self.writer_tokens.0.send(());
878                return Err(anyhow::Error::from(e).context(format!(
879                    "opening concurrent writer at {}",
880                    self.db_path.display()
881                )));
882            }
883        };
884        let storage = FrankenStorage::new(conn, self.db_path.clone());
885        if let Err(e) = storage.apply_config() {
886            let _ = self.writer_tokens.0.send(());
887            return Err(e);
888        }
889        // Reduced cache for concurrent writers (they're short-lived)
890        let _ = storage.raw().execute("PRAGMA cache_size = -4096;");
891        Ok(WriterGuard {
892            storage,
893            mgr: self,
894            committed: false,
895        })
896    }
897
898    /// Database path managed by this pool.
899    pub fn db_path(&self) -> &Path {
900        &self.db_path
901    }
902
903    /// Number of reader connections in the pool.
904    pub fn reader_count(&self) -> usize {
905        self.readers.len()
906    }
907
908    /// Maximum concurrent writers allowed.
909    pub fn max_writers(&self) -> usize {
910        self.config.max_writers
911    }
912}
913
914impl Drop for FrankenConnectionManager {
915    fn drop(&mut self) {
916        for reader in &mut self.readers {
917            reader.get_mut().0.close_best_effort_in_place();
918        }
919    }
920}
921
922/// RAII guard for a writer connection.
923///
924/// Provides access to a [`FrankenStorage`] for write operations.
925/// Releases the writer semaphore slot when dropped.
926pub struct WriterGuard<'a> {
927    storage: FrankenStorage,
928    mgr: &'a FrankenConnectionManager,
929    committed: bool,
930}
931
932impl<'a> WriterGuard<'a> {
933    /// Access the underlying storage for read/write operations.
934    pub fn storage(&self) -> &FrankenStorage {
935        &self.storage
936    }
937
938    /// Mark this writer as successfully committed.
939    ///
940    /// Call after your transaction's `commit()` succeeds. Prevents the drop
941    /// guard from attempting a rollback.
942    pub fn mark_committed(&mut self) {
943        self.committed = true;
944    }
945}
946
947impl Drop for WriterGuard<'_> {
948    fn drop(&mut self) {
949        if !self.committed {
950            // Best-effort rollback — connection may already be in autocommit
951            let _ = self.storage.raw().execute("ROLLBACK;");
952        }
953        self.storage.close_best_effort_in_place();
954        // Release writer token
955        let _ = self.mgr.writer_tokens.0.send(());
956    }
957}
958
959// -------------------------------------------------------------------------
960// Binary Metadata Serialization (Opt 3.1)
961// -------------------------------------------------------------------------
962// MessagePack provides 50-70% storage reduction vs JSON and faster parsing.
963// New rows use binary columns; existing JSON is read on fallback.
964
965/// Serialize a JSON value to MessagePack bytes.
966/// Returns None for null/empty values to save storage.
967fn serialize_json_to_msgpack(value: &serde_json::Value) -> Option<Vec<u8>> {
968    if value.is_null() || value.as_object().is_some_and(|o| o.is_empty()) {
969        return None;
970    }
971    rmp_serde::to_vec(value).ok()
972}
973
974/// Deserialize MessagePack bytes to a JSON value.
975/// Returns default Value::Object({}) on error or empty input.
976fn deserialize_msgpack_to_json(bytes: &[u8]) -> serde_json::Value {
977    if bytes.is_empty() {
978        return serde_json::Value::Object(serde_json::Map::new());
979    }
980    rmp_serde::from_slice(bytes).unwrap_or_else(|e| {
981        tracing::debug!(
982            error = %e,
983            bytes_len = bytes.len(),
984            "Failed to deserialize metadata - returning empty object"
985        );
986        serde_json::Value::Object(serde_json::Map::new())
987    })
988}
989
990/// Read metadata from a frankensqlite Row, preferring binary (msgpack) over JSON.
991fn franken_read_metadata_compat(
992    row: &FrankenRow,
993    json_idx: usize,
994    bin_idx: usize,
995) -> serde_json::Value {
996    // Try binary column first (new format)
997    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
998        && !bytes.is_empty()
999    {
1000        return deserialize_msgpack_to_json(&bytes);
1001    }
1002
1003    // Fall back to JSON column (old format or migration in progress)
1004    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1005        return serde_json::from_str(&json_str)
1006            .unwrap_or_else(|_| serde_json::Value::Object(serde_json::Map::new()));
1007    }
1008
1009    serde_json::Value::Object(serde_json::Map::new())
1010}
1011
1012fn franken_read_message_extra_compat(
1013    row: &FrankenRow,
1014    json_idx: usize,
1015    bin_idx: usize,
1016) -> serde_json::Value {
1017    if let Ok(Some(bytes)) = row.get_typed::<Option<Vec<u8>>>(bin_idx)
1018        && !bytes.is_empty()
1019    {
1020        return deserialize_msgpack_to_json(&bytes);
1021    }
1022
1023    if let Ok(Some(json_str)) = row.get_typed::<Option<String>>(json_idx) {
1024        return serde_json::from_str(&json_str).unwrap_or(serde_json::Value::Null);
1025    }
1026
1027    serde_json::Value::Null
1028}
1029
1030// -------------------------------------------------------------------------
1031// Migration Error Types (P1.5)
1032// -------------------------------------------------------------------------
1033
1034/// Error type for schema migration operations.
1035#[derive(Debug, Error)]
1036pub enum MigrationError {
1037    /// The schema requires a full rebuild. The database has been backed up.
1038    #[error("Rebuild required: {reason}")]
1039    RebuildRequired {
1040        reason: String,
1041        backup_path: Option<std::path::PathBuf>,
1042    },
1043
1044    /// A database error occurred during migration.
1045    #[error("Database error: {0}")]
1046    Database(#[from] frankensqlite::FrankenError),
1047
1048    /// An I/O error occurred during backup.
1049    #[error("I/O error: {0}")]
1050    Io(#[from] std::io::Error),
1051
1052    /// Other migration error.
1053    #[error("{0}")]
1054    Other(String),
1055}
1056
1057impl From<anyhow::Error> for MigrationError {
1058    fn from(e: anyhow::Error) -> Self {
1059        MigrationError::Other(e.to_string())
1060    }
1061}
1062
/// Maximum number of backup files to retain.
const MAX_BACKUPS: usize = 3;
/// PRAGMA run on the read-only backup connection right before `VACUUM INTO`
/// (see `vacuum_into_backup_stage`). SQLite interprets `busy_timeout` in
/// milliseconds, so this waits up to 30 seconds on a locked database.
const BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA: &str = "PRAGMA busy_timeout = 30000;";
1066
/// File names holding user-authored state; these must NEVER be deleted during
/// a rebuild.
const USER_DATA_FILES: &[&str] = &["bookmarks.db", "tui_state.json", "sources.toml", ".env"];

/// Report whether `path` names a user-authored file that a rebuild must keep.
///
/// Only the final path component is compared, and the match is exact. Paths
/// without a final component, or whose name is not valid UTF-8, are never
/// treated as user data.
pub fn is_user_data_file(path: &Path) -> bool {
    let Some(file_name) = path.file_name().and_then(|n| n.to_str()) else {
        return false;
    };
    USER_DATA_FILES
        .iter()
        .any(|protected| *protected == file_name)
}
1077
/// SQL to register the FTS5 virtual table on a frankensqlite connection.
///
/// FrankenSQLite skips virtual-table entries (rootpage=0) when loading
/// `sqlite_master` from a stock-SQLite database.  Executing this CREATE
/// triggers the legacy FTS5 fallback path and materialises the table so
/// subsequent FTS queries work.
///
/// The statement is safe to repeat (`IF NOT EXISTS`). It declares
/// `fts_messages` as a contentless table (`content=''`) with the `porter`
/// tokenizer; `created_at` is stored `UNINDEXED`.
pub const FTS5_REGISTER_SQL: &str = "\
    CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(\
        content, title, agent, workspace, source_path, \
        created_at UNINDEXED, \
        content='', tokenize='porter'\
    )";
1090
// Bookkeeping keys and generation stamps for the FTS rebuild and the daily
// stats health check.
// NOTE(review): the code that reads and writes these lives outside this
// section; presumably the keys index a key/value metadata table and the
// generation numbers are bumped to force a one-time redo — confirm at the
// use sites.
const FTS_FRANKEN_REBUILD_META_KEY: &str = "fts_frankensqlite_rebuild_generation";
const FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY: &str = "fts_frankensqlite_archive_fingerprint";
const FTS_FRANKEN_REBUILD_GENERATION: i64 = 1;
const DAILY_STATS_HEALTH_META_KEY: &str = "daily_stats_archive_fingerprint";
const DAILY_STATS_HEALTH_GENERATION_META_KEY: &str = "daily_stats_health_generation";
const DAILY_STATS_HEALTH_GENERATION: i64 = 1;
1097
/// SQL to clear all rows from the contentless `fts_messages` table.
///
/// Contentless FTS5 tables reject ordinary `DELETE FROM ...` statements, so
/// this uses the FTS5 `'delete-all'` special command, written as an INSERT
/// into the column named after the table.
pub const FTS5_DELETE_ALL_SQL: &str =
    "INSERT INTO fts_messages(fts_messages) VALUES('delete-all');";
1103
/// Test helper: make sure the database at `db_path` has a fresh FTS schema.
///
/// Despite the legacy name, this runs entirely through `FrankenStorage`. It
/// performs a full FTS rebuild, which also repopulates rows, so "fresh FTS"
/// keeps its historical meaning: the schema exists and is consistent with
/// the message rows. The rebuilt row count is discarded.
#[cfg(test)]
pub(crate) fn materialize_fresh_fts_schema_via_rusqlite(db_path: &Path) -> Result<()> {
    let open_failure = || {
        format!(
            "opening frankensqlite db at {} for FTS materialization",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_failure)?;
    storage.rebuild_fts_via_frankensqlite()?;
    Ok(())
}
1119
/// Test helper: rebuild the FTS index for the database at `db_path`.
///
/// Opens the database through `FrankenStorage`, rebuilds the FTS table, then
/// records the rebuild generation. Returns the count reported by the rebuild.
#[cfg(test)]
pub(crate) fn rebuild_fts_via_rusqlite(db_path: &Path) -> Result<usize> {
    let open_failure = || {
        format!(
            "opening frankensqlite db at {} for FTS rebuild",
            db_path.display()
        )
    };
    let storage = FrankenStorage::open(db_path).with_context(open_failure)?;

    let inserted_rows = storage.rebuild_fts_via_frankensqlite()?;
    storage.record_fts_franken_rebuild_generation()?;
    Ok(inserted_rows)
}
1132
1133pub(crate) fn ensure_fts_consistency_via_rusqlite(db_path: &Path) -> Result<FtsConsistencyRepair> {
1134    // Delegates to the FrankenStorage-native path. The function name retains
1135    // the `_via_rusqlite` suffix only for backwards compatibility with the
1136    // few test-site callers; all operations now run through frankensqlite.
1137    let storage = FrankenStorage::open(db_path).with_context(|| {
1138        format!(
1139            "opening frankensqlite db at {} for FTS consistency check",
1140            db_path.display()
1141        )
1142    })?;
1143    storage.ensure_search_fallback_fts_consistency()
1144}
1145
1146/// Create a uniquely named backup of the database file.
1147///
1148/// Returns the path to the backup file, or None if the source doesn't exist.
1149pub fn create_backup(db_path: &Path) -> Result<Option<std::path::PathBuf>, MigrationError> {
1150    if !bundle_path_exists(db_path)? {
1151        return Ok(None);
1152    }
1153
1154    if !copyable_bundle_file_exists(db_path)? {
1155        return Ok(None);
1156    }
1157    let _ = copyable_bundle_sidecar_sources(db_path)?;
1158
1159    let backup_path = unique_backup_path(db_path);
1160    let vacuum_stage_path = vacuum_stage_backup_path(&backup_path);
1161
1162    // Try to use SQLite's VACUUM INTO command first, which safely handles WAL files
1163    // and produces a clean, minimized backup.
1164    match vacuum_into_backup_stage(db_path, &vacuum_stage_path) {
1165        Ok(()) => {
1166            fs::rename(&vacuum_stage_path, &backup_path)?;
1167        }
1168        Err(err) if backup_vacuum_error_requires_consistent_retry(&err) => {
1169            tracing::warn!(
1170                db_path = %db_path.display(),
1171                error = %err,
1172                "create_backup: VACUUM INTO hit transient contention; refusing raw WAL bundle copy"
1173            );
1174            return Err(MigrationError::Database(err));
1175        }
1176        Err(err) => {
1177            tracing::warn!(
1178                db_path = %db_path.display(),
1179                error = %err,
1180                "create_backup: VACUUM INTO failed; falling back to raw evidence copy"
1181            );
1182        }
1183    }
1184
1185    if backup_path.exists() {
1186        sync_file_if_exists(&backup_path)?;
1187        if let Some(parent) = backup_path.parent() {
1188            sync_parent_directory(parent)?;
1189        }
1190        return Ok(Some(backup_path));
1191    }
1192
1193    // Fallback to a raw evidence copy if VACUUM INTO failed (e.g., older SQLite
1194    // or corruption). Keep this on the same symlink-safe bundle path as
1195    // historical seeding so a malformed archive root cannot make us copy an
1196    // arbitrary symlink target or publish a partial sidecar backup.
1197    copy_database_bundle(db_path, &backup_path)?;
1198
1199    Ok(Some(backup_path))
1200}
1201
1202fn vacuum_into_backup_stage(
1203    db_path: &Path,
1204    stage_path: &Path,
1205) -> std::result::Result<(), frankensqlite::FrankenError> {
1206    let mut conn = open_franken_with_flags(
1207        &db_path.to_string_lossy(),
1208        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
1209    )?;
1210    let result = (|| {
1211        conn.execute(BACKUP_VACUUM_BUSY_TIMEOUT_PRAGMA)?;
1212        let path_str = stage_path.to_string_lossy();
1213        conn.execute_compat("VACUUM INTO ?", fparams![path_str.as_ref()])?;
1214        Ok(())
1215    })();
1216    if let Err(close_err) = conn.close_in_place() {
1217        tracing::warn!(
1218            error = %close_err,
1219            db_path = %db_path.display(),
1220            "create_backup: close_in_place failed after VACUUM INTO; falling back to best-effort close"
1221        );
1222        conn.close_best_effort_in_place();
1223    }
1224    result
1225}
1226
/// Decide whether a failed backup `VACUUM INTO` must surface as an error
/// instead of falling back to a raw file copy.
///
/// Delegates to `retryable_franken_error`: `create_backup` treats errors it
/// classifies as retryable as transient contention and refuses to copy the
/// raw bundle.
fn backup_vacuum_error_requires_consistent_retry(err: &frankensqlite::FrankenError) -> bool {
    retryable_franken_error(err)
}
1230
/// Which parts of a database bundle were actually moved by
/// `move_database_bundle`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DatabaseBundleMoveResult {
    /// The main database file was moved.
    pub database: bool,
    /// The `-wal` sidecar was moved.
    pub wal: bool,
    /// The `-shm` sidecar was moved.
    pub shm: bool,
}

impl DatabaseBundleMoveResult {
    /// `true` when at least one of the three files was moved.
    pub fn moved_any(&self) -> bool {
        [self.database, self.wal, self.shm]
            .into_iter()
            .any(|moved| moved)
    }
}
1243
/// Build the path of a database sidecar by appending `suffix` (for example
/// `"-wal"` or `"-shm"`) to the full text of `path`.
///
/// The suffix is appended to the raw OS string rather than to a lossy UTF-8
/// rendering. A lossy conversion would replace any non-UTF-8 byte in the
/// path with U+FFFD and make the result point at a different file.
fn database_sidecar_path(path: &Path, suffix: &str) -> PathBuf {
    let mut raw: std::ffi::OsString = path.as_os_str().to_os_string();
    raw.push(suffix);
    PathBuf::from(raw)
}
1247
1248/// Move a database file and its WAL/SHM sidecars to a new basename.
1249///
1250/// This is used for non-destructive quarantine of a corrupted bundle before a
1251/// rebuild. If the main database file is already missing but orphaned sidecars
1252/// remain, those sidecars are still moved so a fresh database can be created
1253/// without inheriting stale WAL state.
1254pub(crate) fn move_database_bundle(
1255    source_root: &Path,
1256    destination_root: &Path,
1257) -> std::io::Result<DatabaseBundleMoveResult> {
1258    let mut moved = DatabaseBundleMoveResult::default();
1259    if let Some(parent) = destination_root.parent() {
1260        fs::create_dir_all(parent)?;
1261        sync_parent_directory(parent)?;
1262    }
1263
1264    if bundle_path_exists(source_root)? {
1265        fs::rename(source_root, destination_root)?;
1266        moved.database = true;
1267    }
1268
1269    let wal_source = database_sidecar_path(source_root, "-wal");
1270    if bundle_path_exists(&wal_source)? {
1271        fs::rename(&wal_source, database_sidecar_path(destination_root, "-wal"))?;
1272        moved.wal = true;
1273    }
1274
1275    let shm_source = database_sidecar_path(source_root, "-shm");
1276    if bundle_path_exists(&shm_source)? {
1277        fs::rename(&shm_source, database_sidecar_path(destination_root, "-shm"))?;
1278        moved.shm = true;
1279    }
1280
1281    if moved.moved_any() {
1282        if let Some(parent) = source_root.parent() {
1283            sync_parent_directory(parent)?;
1284        }
1285        if let Some(parent) = destination_root.parent() {
1286            sync_parent_directory(parent)?;
1287        }
1288    }
1289
1290    Ok(moved)
1291}
1292
/// Report whether anything exists at `path`, without following symlinks.
///
/// A dangling symlink therefore counts as existing. `NotFound` maps to
/// `Ok(false)`; every other metadata error is returned to the caller.
fn bundle_path_exists(path: &Path) -> std::io::Result<bool> {
    if let Err(err) = fs::symlink_metadata(path) {
        return if err.kind() == std::io::ErrorKind::NotFound {
            Ok(false)
        } else {
            Err(err)
        };
    }
    Ok(true)
}
1300
1301fn copy_database_bundle(source_root: &Path, destination_root: &Path) -> Result<()> {
1302    if let Some(parent) = destination_root.parent() {
1303        fs::create_dir_all(parent).with_context(|| {
1304            format!(
1305                "creating destination directory for database bundle copy: {}",
1306                parent.display()
1307            )
1308        })?;
1309        sync_parent_directory(parent)
1310            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1311    }
1312
1313    if !copyable_bundle_file_exists(source_root)? {
1314        bail!(
1315            "database bundle root is missing before copy: {}",
1316            source_root.display()
1317        );
1318    }
1319
1320    let sidecars = copyable_bundle_sidecar_sources(source_root)?;
1321
1322    fs::copy(source_root, destination_root).with_context(|| {
1323        format!(
1324            "copying database bundle {} -> {}",
1325            source_root.display(),
1326            destination_root.display()
1327        )
1328    })?;
1329    sync_file_if_exists(destination_root).with_context(|| {
1330        format!(
1331            "syncing copied database bundle {}",
1332            destination_root.display()
1333        )
1334    })?;
1335
1336    for (source_sidecar, suffix) in sidecars {
1337        let destination_sidecar = database_sidecar_path(destination_root, suffix);
1338        fs::copy(&source_sidecar, &destination_sidecar).with_context(|| {
1339            format!(
1340                "copying database bundle sidecar {} -> {}",
1341                source_sidecar.display(),
1342                destination_sidecar.display()
1343            )
1344        })?;
1345        sync_file_if_exists(&destination_sidecar).with_context(|| {
1346            format!(
1347                "syncing copied database bundle sidecar {}",
1348                destination_sidecar.display()
1349            )
1350        })?;
1351    }
1352
1353    if let Some(parent) = destination_root.parent() {
1354        sync_parent_directory(parent)
1355            .with_context(|| format!("syncing destination directory {}", parent.display()))?;
1356    }
1357
1358    Ok(())
1359}
1360
1361fn copyable_bundle_sidecar_sources(source_root: &Path) -> Result<Vec<(PathBuf, &'static str)>> {
1362    let mut sidecars = Vec::new();
1363    for suffix in ["-wal", "-shm"] {
1364        let source_sidecar = database_sidecar_path(source_root, suffix);
1365        if copyable_bundle_file_exists(&source_sidecar)? {
1366            sidecars.push((source_sidecar, suffix));
1367        }
1368    }
1369    Ok(sidecars)
1370}
1371
1372fn copyable_bundle_file_exists(path: &Path) -> Result<bool> {
1373    match fs::symlink_metadata(path) {
1374        Ok(metadata) => {
1375            let file_type = metadata.file_type();
1376            if file_type.is_symlink() {
1377                bail!(
1378                    "refusing to copy database bundle symlink: {}",
1379                    path.display()
1380                );
1381            }
1382            if !file_type.is_file() {
1383                bail!(
1384                    "refusing to copy non-file database bundle path: {}",
1385                    path.display()
1386                );
1387            }
1388            Ok(true)
1389        }
1390        Err(err) if err.kind() == std::io::ErrorKind::NotFound => Ok(false),
1391        Err(err) => Err(err).with_context(|| {
1392            format!(
1393                "checking database bundle path before copy: {}",
1394                path.display()
1395            )
1396        }),
1397    }
1398}
1399
1400/// Helper to safely remove a database file and its potential WAL/SHM sidecars.
1401pub(crate) fn remove_database_files(path: &Path) -> std::io::Result<()> {
1402    let mut removed_any = false;
1403
1404    match fs::remove_file(path) {
1405        Ok(()) => removed_any = true,
1406        Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1407        Err(err) => return Err(err),
1408    }
1409
1410    // Best-effort removal of sidecar files (ignore errors if they don't exist)
1411    for suffix in ["-wal", "-shm"] {
1412        match fs::remove_file(database_sidecar_path(path, suffix)) {
1413            Ok(()) => removed_any = true,
1414            Err(err) if err.kind() == std::io::ErrorKind::NotFound => {}
1415            Err(err) => return Err(err),
1416        }
1417    }
1418
1419    if removed_any && let Some(parent) = path.parent() {
1420        sync_parent_directory(parent)?;
1421    }
1422
1423    Ok(())
1424}
1425
/// Flush a directory to disk so that renames, creations and deletions inside
/// it are persisted. `path` is the directory itself.
#[cfg(not(windows))]
fn sync_parent_directory(path: &Path) -> std::io::Result<()> {
    let directory = fs::File::open(path)?;
    directory.sync_all()
}

/// Windows variant: a no-op that always succeeds.
#[cfg(windows)]
fn sync_parent_directory(_path: &Path) -> std::io::Result<()> {
    Ok(())
}
1435
/// Flush `path` to disk when it exists; do nothing (and succeed) otherwise.
fn sync_file_if_exists(path: &Path) -> std::io::Result<()> {
    if !path.exists() {
        return Ok(());
    }
    let file = fs::File::open(path)?;
    file.sync_all()
}
1442
1443/// Remove old backup files, keeping only the most recent `keep_count`.
1444pub fn cleanup_old_backups(db_path: &Path, keep_count: usize) -> Result<(), std::io::Error> {
1445    let parent = match db_path.parent() {
1446        Some(p) => p,
1447        None => return Ok(()),
1448    };
1449
1450    let db_name = db_path.file_name().and_then(|n| n.to_str()).unwrap_or("db");
1451
1452    let prefix = format!("{}.backup.", db_name);
1453
1454    // Collect backup files matching the pattern
1455    let mut backups: Vec<(std::path::PathBuf, SystemTime)> = Vec::new();
1456
1457    if let Ok(entries) = fs::read_dir(parent) {
1458        for entry in entries.flatten() {
1459            let path = entry.path();
1460            if let Some(name) = path.file_name().and_then(|n| n.to_str())
1461                && is_backup_root_name(name, &prefix)
1462                && let Ok(meta) = fs::metadata(&path)
1463                && meta.is_file()
1464                && let Ok(mtime) = meta.modified()
1465            {
1466                backups.push((path, mtime));
1467            }
1468        }
1469    }
1470
1471    // Sort by modification time, newest first
1472    backups.sort_by_key(|entry| std::cmp::Reverse(entry.1));
1473
1474    // Delete oldest backups beyond keep_count
1475    for (path, _) in backups.into_iter().skip(keep_count) {
1476        let _ = fs::remove_file(&path);
1477
1478        // Also try to cleanup potential sidecars from fs::copy fallback
1479        let _ = fs::remove_file(database_sidecar_path(&path, "-wal"));
1480        let _ = fs::remove_file(database_sidecar_path(&path, "-shm"));
1481    }
1482
1483    Ok(())
1484}
1485
/// A historical database bundle (backup, quarantined copy, snapshot, ...)
/// found near the canonical database by
/// `discover_historical_database_bundles`.
#[derive(Debug, Clone)]
pub(crate) struct HistoricalDatabaseBundle {
    // Path of the bundle's main database file.
    root_path: PathBuf,
    // NOTE(review): presumably the main file plus `-wal`/`-shm` sizes as
    // computed by `bundle_total_bytes` — confirm in the discovery code.
    total_bytes: u64,
    // Modification time of `root_path` in milliseconds since the Unix epoch
    // (from `file_mtime_ms`, which yields 0 when it cannot be read).
    modified_at_ms: i64,
    // NOTE(review): presumably whether the bundle can be opened read-only in
    // place rather than via a copy — confirm at the use site.
    supports_direct_readonly: bool,
    // Probe results gathered for this bundle.
    probe: HistoricalBundleProbe,
}
1494
/// Lightweight facts probed from a historical bundle.
///
/// `Default` gives "nothing known": both options `None`, `fts_queryable`
/// false, `max_message_id` 0.
// NOTE(review): the probing code is outside this section; the field notes
// below are inferred from the names — verify against it.
#[derive(Debug, Clone, Copy, Default)]
struct HistoricalBundleProbe {
    // Schema version recorded in the bundle, if it could be read.
    schema_version: Option<i64>,
    // Presumably the count of FTS-related schema rows, if readable.
    fts_schema_rows: Option<i64>,
    // Presumably whether an FTS query against the bundle succeeded.
    fts_queryable: bool,
    // Presumably the largest message id present in the bundle.
    max_message_id: i64,
}
1502
/// Test-only snapshot of database health indicators.
///
/// Compiled only under `cfg(test)`; `dead_code` is allowed because not every
/// test build reads every field.
// NOTE(review): the code that fills this struct is outside this section; the
// field meanings are taken from their names — confirm there.
#[cfg(test)]
#[allow(dead_code)]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) struct SqliteDatabaseHealthProbe {
    pub schema_version: Option<i64>,
    pub quick_check_ok: bool,
    pub fts_schema_rows: i64,
    pub fts_queryable: bool,
    pub message_count: i64,
    pub max_message_id: i64,
}
1514
/// Outcome of an FTS consistency check, as returned by
/// `ensure_fts_consistency_via_rusqlite`.
// NOTE(review): the logic that picks a variant is outside this section; the
// variant descriptions follow the names and should be confirmed there.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub(crate) enum FtsConsistencyRepair {
    /// No repair was needed; `rows` is presumably the current FTS row count.
    AlreadyHealthy {
        rows: usize,
    },
    /// Missing rows were appended without a full rebuild.
    IncrementalCatchUp {
        inserted_rows: usize,
        total_rows: usize,
    },
    /// The FTS table was rebuilt from scratch.
    Rebuilt {
        inserted_rows: usize,
    },
}
1528
/// Running totals for a historical salvage pass.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct HistoricalSalvageOutcome {
    /// Bundles that were looked at.
    pub bundles_considered: usize,
    /// Bundles that were imported.
    pub bundles_imported: usize,
    /// Conversations imported across all bundles.
    pub conversations_imported: usize,
    /// Messages imported across all bundles.
    pub messages_imported: usize,
}

impl HistoricalSalvageOutcome {
    /// Add every counter of `other` onto the matching counter of `self`.
    pub(crate) fn accumulate(&mut self, other: Self) {
        let Self {
            bundles_considered,
            bundles_imported,
            conversations_imported,
            messages_imported,
        } = other;

        self.bundles_considered += bundles_considered;
        self.bundles_imported += bundles_imported;
        self.conversations_imported += conversations_imported;
        self.messages_imported += messages_imported;
    }
}
1545
/// An open connection to a historical bundle, with a record of how it was
/// opened.
#[derive(Debug)]
struct HistoricalReadConnection {
    // Connection used to read the bundle.
    conn: FrankenConnection,
    // Static label naming the open strategy that produced `conn`.
    method: &'static str,
    // NOTE(review): presumably the path of the database file `conn` reads,
    // which may be a temporary copy rather than the original — confirm at
    // the construction site.
    root_path: PathBuf,
    // NOTE(review): presumably held only to keep a temporary directory alive
    // for as long as `conn` is open (a `TempDir` deletes its directory when
    // dropped) — confirm at the construction site.
    _tempdir: Option<tempfile::TempDir>,
}
1553
/// Bare schema for the core tables used during historical recovery:
/// `sources`, `agents`, `workspaces`, `conversations`, `messages` and
/// `snippets`.
///
/// Apart from the primary keys, no column carries a constraint, index or
/// foreign key.
// NOTE(review): presumably kept constraint-free so rows salvaged from damaged
// databases can be loaded without being rejected — confirm at the use site.
const HISTORICAL_RECOVERY_CORE_SCHEMA: &str = r"
CREATE TABLE sources (
    id TEXT PRIMARY KEY,
    kind TEXT,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE agents (
    id INTEGER PRIMARY KEY,
    slug TEXT,
    name TEXT,
    version TEXT,
    kind TEXT,
    created_at INTEGER,
    updated_at INTEGER
);
CREATE TABLE workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT,
    display_name TEXT
);
CREATE TABLE conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER,
    workspace_id INTEGER,
    source_id TEXT,
    external_id TEXT,
    title TEXT,
    source_path TEXT,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
CREATE TABLE messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER,
    idx INTEGER,
    role TEXT,
    author TEXT,
    created_at INTEGER,
    content TEXT,
    extra_json TEXT,
    extra_bin BLOB
);
CREATE TABLE snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);
";
// Format versions for the salvage ledger and the per-bundle progress record.
// NOTE(review): presumably compared against persisted values so stale files
// are ignored after a format change — confirm at the use sites.
const HISTORICAL_SALVAGE_LEDGER_VERSION: u32 = 2;
const HISTORICAL_SALVAGE_PROGRESS_VERSION: u32 = 1;
// Five minutes, expressed in milliseconds.
// NOTE(review): presumably the allowed start-time difference when merging
// conversations that share a source path — confirm at the use site.
const SOURCE_PATH_MERGE_START_TOLERANCE_MS: i64 = 5 * 60 * 1000;
1631
/// Serializable progress record for the import of one historical bundle.
// NOTE(review): the code that persists and resumes from this record is
// outside this section; the field notes below are inferred from names and
// types — confirm there.
#[derive(Debug, Clone, Serialize, Deserialize)]
struct HistoricalBundleProgress {
    // Format version of this record.
    progress_version: u32,
    // Bundle path, stored as a string.
    path: String,
    // Bundle size in bytes and modification time in milliseconds; presumably
    // used to detect that the bundle changed since the record was written.
    bytes: u64,
    modified_at_ms: i64,
    // Name of the method used to open the bundle.
    method: String,
    // Presumably the resume point: the last source row id fully imported.
    last_completed_source_row_id: i64,
    // Running import counters.
    conversations_imported: usize,
    messages_imported: usize,
    // Presumably when this record was last written, in epoch milliseconds.
    updated_at_ms: i64,
}
1644
/// One conversation queued for a historical import batch.
// NOTE(review): whether the ids refer to the source bundle or the target
// database is not visible in this section — confirm at the construction site.
#[derive(Debug, Clone)]
struct HistoricalBatchEntry {
    // Row id of the conversation in the source bundle.
    source_row_id: i64,
    // Agent the conversation belongs to.
    agent_id: i64,
    // Workspace the conversation belongs to, if any.
    workspace_id: Option<i64>,
    // The conversation to import.
    conversation: Conversation,
}
1652
/// Counters describing what one historical import batch inserted.
/// `Default` is all zeros.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
struct HistoricalBatchImportTotals {
    // Number of source rows inserted by the batch.
    inserted_source_rows: usize,
    // Number of messages inserted by the batch.
    inserted_messages: usize,
}
1658
1659fn historical_bundle_root_paths(db_path: &Path) -> Vec<PathBuf> {
1660    let mut roots = Vec::new();
1661    let Some(parent) = db_path.parent() else {
1662        return roots;
1663    };
1664    let db_name = db_path
1665        .file_name()
1666        .and_then(|n| n.to_str())
1667        .unwrap_or("agent_search.db");
1668    let db_stem = db_path
1669        .file_stem()
1670        .and_then(|n| n.to_str())
1671        .unwrap_or("agent_search");
1672
1673    let mut push_root = |path: PathBuf| {
1674        if path == db_path {
1675            return;
1676        }
1677        if !roots.iter().any(|existing| existing == &path) {
1678            roots.push(path);
1679        }
1680    };
1681
1682    if let Ok(entries) = fs::read_dir(parent) {
1683        for entry in entries.flatten() {
1684            let path = entry.path();
1685            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1686                continue;
1687            };
1688            if has_db_sidecar_suffix(name) {
1689                continue;
1690            }
1691            if name.starts_with(&format!("{db_name}.backup."))
1692                || name.starts_with(&format!("{db_stem}.corrupt."))
1693            {
1694                push_root(path);
1695            }
1696        }
1697    }
1698
1699    let backups_dir = parent.join("backups");
1700    if let Ok(entries) = fs::read_dir(backups_dir) {
1701        for entry in entries.flatten() {
1702            let path = entry.path();
1703            let Some(name) = path.file_name().and_then(|n| n.to_str()) else {
1704                continue;
1705            };
1706            if has_db_sidecar_suffix(name) {
1707                continue;
1708            }
1709            if name.starts_with(&format!("{db_name}.")) && name.ends_with(".bak") {
1710                push_root(path);
1711            }
1712        }
1713    }
1714
1715    push_named_database_children(&mut roots, db_path, &parent.join("repair-lab"), db_name);
1716    push_named_database_children(&mut roots, db_path, &parent.join("snapshots"), db_name);
1717
1718    roots
1719}
1720
/// Appends `<dir>/<child>/<db_name>` to `roots` for every child entry of `dir`.
///
/// A candidate is skipped when it equals `canonical_db_path`, does not exist
/// on disk, or is already present in `roots`. An unreadable or missing `dir`
/// leaves `roots` untouched.
fn push_named_database_children(
    roots: &mut Vec<PathBuf>,
    canonical_db_path: &Path,
    dir: &Path,
    db_name: &str,
) {
    let Ok(children) = fs::read_dir(dir) else {
        return;
    };
    for child in children.flatten() {
        let nested_db = child.path().join(db_name);
        if nested_db == canonical_db_path {
            continue;
        }
        if !nested_db.exists() || roots.contains(&nested_db) {
            continue;
        }
        roots.push(nested_db);
    }
}
1739
/// Returns the modification time of `path` in milliseconds since the Unix epoch.
///
/// Yields `0` when the metadata or modification time cannot be read, or when
/// the timestamp lies before the epoch.
fn file_mtime_ms(path: &Path) -> i64 {
    let Ok(modified) = fs::metadata(path).and_then(|meta| meta.modified()) else {
        return 0;
    };
    match modified.duration_since(UNIX_EPOCH) {
        Ok(since_epoch) => since_epoch.as_millis() as i64,
        Err(_) => 0,
    }
}
1748
1749fn bundle_total_bytes(root_path: &Path) -> u64 {
1750    let mut total = fs::metadata(root_path).map(|meta| meta.len()).unwrap_or(0);
1751    for suffix in ["-wal", "-shm"] {
1752        let sidecar = database_sidecar_path(root_path, suffix);
1753        total = total.saturating_add(fs::metadata(sidecar).map(|meta| meta.len()).unwrap_or(0));
1754    }
1755    total
1756}
1757
1758pub(crate) fn discover_historical_database_bundles(
1759    db_path: &Path,
1760) -> Vec<HistoricalDatabaseBundle> {
1761    let mut bundles: Vec<_> = historical_bundle_root_paths(db_path)
1762        .into_iter()
1763        .filter(|root| root.exists())
1764        .map(|root_path| {
1765            let modified_at_ms = file_mtime_ms(&root_path);
1766            let total_bytes = bundle_total_bytes(&root_path);
1767            let supports_direct_readonly = historical_bundle_supports_direct_readonly(&root_path);
1768            let probe = probe_historical_bundle(&root_path);
1769            HistoricalDatabaseBundle {
1770                modified_at_ms,
1771                total_bytes,
1772                supports_direct_readonly,
1773                root_path,
1774                probe,
1775            }
1776        })
1777        .filter(|bundle| bundle.total_bytes > 0)
1778        .collect();
1779
1780    fn bundle_priority(path: &Path) -> i32 {
1781        let path_str = path.to_string_lossy();
1782        if path_str.contains("/repair-lab/replay-") {
1783            return 5;
1784        }
1785        if path_str.contains("/repair-lab/") {
1786            return 4;
1787        }
1788        if path_str.contains("/snapshots/") {
1789            return 3;
1790        }
1791        if path_str.contains(".corrupt.") || path_str.contains("failed-baseline-seed") {
1792            return 0;
1793        }
1794        1
1795    }
1796
1797    fn bundle_health_rank(bundle: &HistoricalDatabaseBundle) -> i32 {
1798        // Classify FTS health. The probe only sets `fts_queryable = true`
1799        // when `fts_schema_rows == Some(1)` (see
1800        // `historical_bundle_fts_queryable_via_frankensqlite`), so we have
1801        // two legitimate "clean" shapes for a bundle:
1802        //
1803        //   * `fts_schema_rows == Some(1) && fts_queryable` — a pre-V14
1804        //     bundle where the FTS virtual table was eagerly created by
1805        //     migration and is queryable right now.
1806        //
1807        //   * `fts_schema_rows == Some(0) && schema_version == Some(V14+)` —
1808        //     a modern bundle where `MIGRATION_V14` dropped fts_messages on
1809        //     purpose and cass recreates it lazily via
1810        //     `ensure_search_fallback_fts_consistency` on the first open.
1811        //     Gating on `schema_version == CURRENT_SCHEMA_VERSION` is critical
1812        //     so an incomplete pre-V14 bundle with 0 fts rows is not promoted
1813        //     alongside real lazy-V14+ bundles. A `None` schema_version
1814        //     (schema marker unreadable) is excluded for the same reason.
1815        //
1816        // Everything else — `Some(1)` without queryability, `Some(n)` for
1817        // n >= 2 (duplicated CREATE VIRTUAL TABLE rows from a broken legacy
1818        // rebuild), `None` entirely, or `Some(0)` on a non-current schema —
1819        // is not "fts clean".
1820        let fts_clean = match bundle.probe.fts_schema_rows {
1821            Some(1) => bundle.probe.fts_queryable,
1822            Some(0) => bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION),
1823            _ => false,
1824        };
1825
1826        let clean_schema14_fts =
1827            bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION) && fts_clean;
1828        if clean_schema14_fts {
1829            return 5;
1830        }
1831
1832        if fts_clean {
1833            return 4;
1834        }
1835
1836        if bundle.probe.schema_version == Some(CURRENT_SCHEMA_VERSION)
1837            && bundle.supports_direct_readonly
1838        {
1839            return 3;
1840        }
1841
1842        if bundle.supports_direct_readonly {
1843            return 2;
1844        }
1845
1846        1
1847    }
1848
1849    bundles.sort_by(|left, right| {
1850        bundle_health_rank(right)
1851            .cmp(&bundle_health_rank(left))
1852            .then_with(|| right.probe.max_message_id.cmp(&left.probe.max_message_id))
1853            .then_with(|| bundle_priority(&right.root_path).cmp(&bundle_priority(&left.root_path)))
1854            .then_with(|| {
1855                right
1856                    .supports_direct_readonly
1857                    .cmp(&left.supports_direct_readonly)
1858            })
1859            .then_with(|| right.total_bytes.cmp(&left.total_bytes))
1860            .then_with(|| right.modified_at_ms.cmp(&left.modified_at_ms))
1861            .then_with(|| right.root_path.cmp(&left.root_path))
1862    });
1863    bundles
1864}
1865
1866fn probe_historical_bundle(root_path: &Path) -> HistoricalBundleProbe {
1867    let Ok(conn) = open_historical_bundle_readonly(root_path) else {
1868        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or_default();
1869    };
1870
1871    let schema_version = read_meta_schema_version(&conn).ok().flatten();
1872    let fts_schema_rows: Option<i64> = conn
1873        .query_row_map(
1874            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
1875            fparams![],
1876            |row| row.get_typed(0),
1877        )
1878        .ok();
1879    let fts_queryable =
1880        historical_bundle_fts_queryable_via_frankensqlite(root_path, fts_schema_rows);
1881    let max_message_id: i64 = conn
1882        .query_row_map(
1883            "SELECT COALESCE(MAX(id), 0) FROM messages",
1884            fparams![],
1885            |row| row.get_typed(0),
1886        )
1887        .unwrap_or(0);
1888
1889    let probe = HistoricalBundleProbe {
1890        schema_version,
1891        fts_schema_rows,
1892        fts_queryable,
1893        max_message_id,
1894    };
1895
1896    if probe.schema_version.is_none()
1897        && probe.fts_schema_rows.is_none()
1898        && probe.max_message_id == 0
1899    {
1900        return probe_historical_bundle_via_sqlite3_metadata(root_path).unwrap_or(probe);
1901    }
1902
1903    probe
1904}
1905
1906fn probe_historical_bundle_via_sqlite3_metadata(root_path: &Path) -> Option<HistoricalBundleProbe> {
1907    let bundle_uri = format!("file:{}?immutable=1", root_path.to_string_lossy());
1908    let output = Command::new("sqlite3")
1909        .arg("-batch")
1910        .arg("-noheader")
1911        .arg(&bundle_uri)
1912        .arg(
1913            "PRAGMA writable_schema=ON;
1914             SELECT COALESCE((SELECT value FROM meta WHERE key = 'schema_version'), '');
1915             SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages';
1916             SELECT COALESCE(MAX(id), 0) FROM messages;",
1917        )
1918        .output()
1919        .ok()?;
1920    if !output.status.success() {
1921        return None;
1922    }
1923
1924    let stdout = String::from_utf8(output.stdout).ok()?;
1925    let mut lines = stdout.lines();
1926    let schema_version = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
1927    let fts_schema_rows = lines.next().and_then(|raw| raw.trim().parse::<i64>().ok());
1928    let max_message_id = lines
1929        .next()
1930        .and_then(|raw| raw.trim().parse::<i64>().ok())
1931        .unwrap_or(0);
1932
1933    Some(HistoricalBundleProbe {
1934        schema_version,
1935        fts_schema_rows,
1936        fts_queryable: false,
1937        max_message_id,
1938    })
1939}
1940
1941fn historical_bundle_fts_queryable_via_frankensqlite(
1942    root_path: &Path,
1943    fts_schema_rows: Option<i64>,
1944) -> bool {
1945    matches!(fts_schema_rows, Some(1))
1946        && FrankenStorage::open_readonly(root_path)
1947            .map(|storage| {
1948                storage
1949                    .raw()
1950                    .query("SELECT COUNT(*) FROM fts_messages")
1951                    .is_ok()
1952            })
1953            .unwrap_or(false)
1954}
1955
1956fn historical_bundle_supports_direct_readonly(root_path: &Path) -> bool {
1957    open_historical_bundle_readonly(root_path)
1958        .and_then(|conn| historical_bundle_has_queryable_core_tables(&conn))
1959        .is_ok()
1960}
1961
1962fn historical_table_exists(conn: &FrankenConnection, table: &str) -> Result<bool> {
1963    let found: Option<i64> = conn
1964        .query_row_map(
1965            "SELECT 1 FROM sqlite_master WHERE type = 'table' AND name = ?1 LIMIT 1",
1966            fparams![table],
1967            |row| row.get_typed(0),
1968        )
1969        .optional()
1970        .with_context(|| format!("checking for historical table {table}"))?;
1971    Ok(found.is_some())
1972}
1973
1974fn probe_historical_table_reads(conn: &FrankenConnection, table: &str) -> Result<()> {
1975    if !historical_table_exists(conn, table)? {
1976        return Err(anyhow!(
1977            "historical database missing required table {table}"
1978        ));
1979    }
1980
1981    let sql = format!("SELECT rowid FROM {table} LIMIT 1");
1982    let _: Option<i64> = conn
1983        .query_row_map(&sql, fparams![], |row| row.get_typed(0))
1984        .optional()
1985        .with_context(|| format!("probing rows from historical table {table}"))?;
1986    Ok(())
1987}
1988
1989fn historical_bundle_has_queryable_core_tables(conn: &FrankenConnection) -> Result<()> {
1990    probe_historical_table_reads(conn, "conversations")?;
1991    probe_historical_table_reads(conn, "messages")?;
1992    Ok(())
1993}
1994
1995fn open_historical_bundle_readonly(root_path: &Path) -> Result<FrankenConnection> {
1996    let path_str = root_path.to_string_lossy();
1997    let flags = FrankenOpenFlags::SQLITE_OPEN_READ_ONLY;
1998    let conn = open_franken_with_flags(&path_str, flags)
1999        .with_context(|| format!("opening historical database {}", root_path.display()))?;
2000    Ok(conn)
2001}
2002
/// Returns `true` when `line` is an `INSERT` into one of the core tables that
/// the recovery import replays.
///
/// Accepted shapes are `INSERT INTO <q>table<q>…` and
/// `INSERT OR IGNORE INTO <q>table<q>…`, where `<q>` is a single or double
/// quote (the same one on both sides) and `table` is one of `sources`,
/// `agents`, `workspaces`, `conversations`, `messages` or `snippets`.
///
/// This runs once per line of `sqlite3 .recover` output, so it matches by
/// stripping prefixes instead of formatting candidate strings per call.
fn is_recoverable_insert_line(line: &str) -> bool {
    const RECOVERABLE_TABLES: [&str; 6] = [
        "sources",
        "agents",
        "workspaces",
        "conversations",
        "messages",
        "snippets",
    ];

    // The two verbs cannot both match (they diverge after "INSERT ").
    let Some(target) = line
        .strip_prefix("INSERT OR IGNORE INTO ")
        .or_else(|| line.strip_prefix("INSERT INTO "))
    else {
        return false;
    };

    let mut chars = target.chars();
    let quote = match chars.next() {
        Some(q @ ('\'' | '"')) => q,
        _ => return false,
    };
    let after_quote = chars.as_str();

    // The closing quote must directly follow the table name and match the
    // opening one, so `'messages_fts'` and `'messages"` are both rejected.
    RECOVERABLE_TABLES.iter().any(|table| {
        after_quote
            .strip_prefix(table)
            .is_some_and(|tail| tail.starts_with(quote))
    })
}
2020
2021fn recover_historical_bundle_via_sqlite3(
2022    bundle: &HistoricalDatabaseBundle,
2023) -> Result<HistoricalReadConnection> {
2024    let tempdir = tempfile::TempDir::new().context("creating temporary salvage directory")?;
2025    let recovered_db = tempdir.path().join("historical-recovered.db");
2026    let temp_conn = FrankenConnection::open(recovered_db.to_string_lossy().as_ref())
2027        .with_context(|| format!("creating recovered database {}", recovered_db.display()))?;
2028    temp_conn
2029        .execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA)
2030        .with_context(|| format!("initializing recovered schema {}", recovered_db.display()))?;
2031    drop(temp_conn);
2032
2033    let bundle_uri = format!("file:{}?immutable=1", bundle.root_path.to_string_lossy());
2034    let mut recover = Command::new("sqlite3")
2035        .arg(&bundle_uri)
2036        .arg(".recover")
2037        .stdout(Stdio::piped())
2038        .spawn()
2039        .with_context(|| {
2040            format!(
2041                "launching sqlite3 .recover for historical bundle {}",
2042                bundle.root_path.display()
2043            )
2044        })?;
2045    let recover_stdout = recover
2046        .stdout
2047        .take()
2048        .context("capturing sqlite3 .recover stdout")?;
2049
2050    let mut importer = Command::new("sqlite3")
2051        .arg(&recovered_db)
2052        .stdin(Stdio::piped())
2053        .spawn()
2054        .with_context(|| {
2055            format!(
2056                "launching sqlite3 importer for recovered bundle {}",
2057                recovered_db.display()
2058            )
2059        })?;
2060
2061    {
2062        let importer_stdin = importer
2063            .stdin
2064            .as_mut()
2065            .context("opening sqlite3 importer stdin")?;
2066        importer_stdin
2067            .write_all(b"BEGIN;\n")
2068            .context("starting recovery import transaction")?;
2069
2070        let reader = BufReader::new(recover_stdout);
2071        for line in reader.lines() {
2072            let line = line.context("reading sqlite3 .recover output")?;
2073            if is_recoverable_insert_line(&line) {
2074                importer_stdin
2075                    .write_all(line.as_bytes())
2076                    .context("writing recovered INSERT")?;
2077                importer_stdin
2078                    .write_all(b"\n")
2079                    .context("writing recovered INSERT newline")?;
2080            }
2081        }
2082
2083        importer_stdin
2084            .write_all(b"COMMIT;\n")
2085            .context("committing recovery import transaction")?;
2086    }
2087
2088    let recover_status = recover
2089        .wait()
2090        .context("waiting for sqlite3 .recover process")?;
2091    if !recover_status.success() {
2092        anyhow::bail!(
2093            "sqlite3 .recover exited with status {} for {}",
2094            recover_status,
2095            bundle.root_path.display()
2096        );
2097    }
2098
2099    let importer_status = importer
2100        .wait()
2101        .context("waiting for sqlite3 recovery importer")?;
2102    if !importer_status.success() {
2103        anyhow::bail!(
2104            "sqlite3 recovery importer exited with status {} for {}",
2105            importer_status,
2106            recovered_db.display()
2107        );
2108    }
2109
2110    let conn = open_historical_bundle_readonly(&recovered_db)?;
2111    historical_bundle_has_queryable_core_tables(&conn)?;
2112    Ok(HistoricalReadConnection {
2113        conn,
2114        method: "sqlite3-recover",
2115        root_path: recovered_db,
2116        _tempdir: Some(tempdir),
2117    })
2118}
2119
2120fn open_historical_bundle_for_salvage(
2121    bundle: &HistoricalDatabaseBundle,
2122) -> Result<HistoricalReadConnection> {
2123    match open_historical_bundle_readonly(&bundle.root_path) {
2124        Ok(conn) => {
2125            if historical_bundle_has_queryable_core_tables(&conn).is_ok() {
2126                return Ok(HistoricalReadConnection {
2127                    conn,
2128                    method: "direct-readonly",
2129                    root_path: bundle.root_path.clone(),
2130                    _tempdir: None,
2131                });
2132            }
2133        }
2134        Err(err) => {
2135            tracing::warn!(
2136                path = %bundle.root_path.display(),
2137                error = %err,
2138                "historical bundle direct open failed; falling back to sqlite3 .recover"
2139            );
2140        }
2141    }
2142
2143    recover_historical_bundle_via_sqlite3(bundle)
2144}
2145
2146fn historical_bundle_counts(conn: &FrankenConnection) -> Result<(usize, usize)> {
2147    let conversations: i64 =
2148        conn.query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
2149            row.get_typed(0)
2150        })?;
2151    let messages: i64 = conn.query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
2152        row.get_typed(0)
2153    })?;
2154    Ok((
2155        usize::try_from(conversations.max(0)).unwrap_or(usize::MAX),
2156        usize::try_from(messages.max(0)).unwrap_or(usize::MAX),
2157    ))
2158}
2159
2160fn clear_seeded_runtime_meta(conn: &FrankenConnection) -> Result<()> {
2161    conn.execute(
2162        "DELETE FROM meta
2163         WHERE key LIKE 'historical_bundle_salvaged:%'
2164            OR key IN ('last_scan_ts', 'last_indexed_at', 'last_embedded_message_id')",
2165    )?;
2166    Ok(())
2167}
2168
2169fn record_historical_bundle_import(
2170    conn: &FrankenConnection,
2171    bundle: &HistoricalDatabaseBundle,
2172    method: &str,
2173    conversations_imported: usize,
2174    messages_imported: usize,
2175) -> Result<()> {
2176    let key = FrankenStorage::historical_bundle_meta_key(bundle);
2177    let value = serde_json::json!({
2178        "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
2179        "path": bundle.root_path.display().to_string(),
2180        "bytes": bundle.total_bytes,
2181        "modified_at_ms": bundle.modified_at_ms,
2182        "method": method,
2183        "conversations_imported": conversations_imported,
2184        "messages_imported": messages_imported,
2185        "recorded_at_ms": FrankenStorage::now_millis(),
2186    });
2187    let value_str = serde_json::to_string(&value)?;
2188    conn.execute_compat(
2189        "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
2190        fparams![key, value_str],
2191    )?;
2192    Ok(())
2193}
2194
2195fn finalize_seeded_canonical_bundle_via_rusqlite(
2196    canonical_db_path: &Path,
2197    bundle: &HistoricalDatabaseBundle,
2198    conversations_imported: usize,
2199    messages_imported: usize,
2200) -> Result<()> {
2201    let _fts_repair =
2202        ensure_fts_consistency_via_rusqlite(canonical_db_path).with_context(|| {
2203            format!(
2204                "repairing staged canonical FTS consistency before finalization: {}",
2205                canonical_db_path.display()
2206            )
2207        })?;
2208
2209    let path_str = canonical_db_path.to_string_lossy();
2210    let conn = FrankenConnection::open(path_str.as_ref()).with_context(|| {
2211        format!(
2212            "opening seeded canonical database for post-seed finalization: {}",
2213            canonical_db_path.display()
2214        )
2215    })?;
2216    conn.execute("PRAGMA busy_timeout = 30000;")
2217        .with_context(|| {
2218            format!(
2219                "configuring busy timeout for seeded canonical database {}",
2220                canonical_db_path.display()
2221            )
2222        })?;
2223    let schema_version = read_meta_schema_version(&conn)?;
2224
2225    if let Some(version) = schema_version
2226        && version < CURRENT_SCHEMA_VERSION
2227        && version != 13
2228    {
2229        anyhow::bail!(
2230            "seeded canonical bundle schema_version {version} is too old for baseline import and cannot be finalized automatically"
2231        );
2232    }
2233
2234    clear_seeded_runtime_meta(&conn)?;
2235
2236    conn.execute_compat(
2237        "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1)",
2238        fparams![CURRENT_SCHEMA_VERSION.to_string()],
2239    )?;
2240
2241    conn.execute_compat(
2242        "INSERT OR IGNORE INTO _schema_migrations(version, name) VALUES(?1, 'fts_contentless')",
2243        fparams![CURRENT_SCHEMA_VERSION],
2244    )?;
2245    record_historical_bundle_import(
2246        &conn,
2247        bundle,
2248        "baseline-bulk-sql-copy",
2249        conversations_imported,
2250        messages_imported,
2251    )?;
2252    Ok(())
2253}
2254
2255fn read_meta_schema_version(conn: &FrankenConnection) -> Result<Option<i64>> {
2256    let version: Option<String> = conn
2257        .query_row_map(
2258            "SELECT value FROM meta WHERE key = 'schema_version'",
2259            fparams![],
2260            |row| row.get_typed(0),
2261        )
2262        .optional()?;
2263    Ok(version.and_then(|raw| raw.parse::<i64>().ok()))
2264}
2265
/// Test helper: counts the `sqlite_master` rows named `fts_messages`.
#[cfg(test)]
fn franken_fts_schema_rows(conn: &FrankenConnection) -> Result<i64> {
    const COUNT_FTS_ROWS_SQL: &str =
        "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'";

    let counted = conn.query_row_map(COUNT_FTS_ROWS_SQL, fparams![], |row| row.get_typed(0));
    counted.context("counting sqlite_master rows for fts_messages via frankensqlite")
}
2275
/// Test helper: `true` when a `COUNT(*)` over `fts_messages` executes.
#[cfg(test)]
fn franken_fts_limit_probe(conn: &FrankenConnection) -> bool {
    let attempt = conn.query("SELECT COUNT(*) FROM fts_messages");
    attempt.is_ok()
}
2280
/// Test-only health probe of the database at `db_path`.
///
/// Reads the schema version, runs `PRAGMA quick_check(1)`, and inspects the
/// FTS table. Message statistics are only queried when the quick check
/// reports `ok`; otherwise both are reported as `0`.
///
/// # Errors
/// Fails when the database cannot be opened or any executed query fails.
#[cfg(test)]
#[allow(dead_code)]
pub(crate) fn probe_database_health_via_frankensqlite(
    db_path: &Path,
) -> Result<SqliteDatabaseHealthProbe> {
    let db_location = db_path.to_string_lossy();
    let conn = FrankenConnection::open(db_location.as_ref()).with_context(|| {
        format!(
            "opening frankensqlite db at {} for database health probe",
            db_path.display()
        )
    })?;
    conn.execute_batch("PRAGMA busy_timeout = 30000;")
        .with_context(|| {
            format!(
                "configuring busy timeout for database health probe at {}",
                db_path.display()
            )
        })?;

    let schema_version = read_meta_schema_version(&conn)?;
    let quick_check_status: String = conn
        .query_row_map("PRAGMA quick_check(1)", fparams![], |row| row.get_typed(0))
        .with_context(|| format!("running PRAGMA quick_check(1) for {}", db_path.display()))?;
    let quick_check_ok = quick_check_status.trim().eq_ignore_ascii_case("ok");
    let fts_schema_rows = franken_fts_schema_rows(&conn)?;
    let fts_queryable = fts_schema_rows == 1 && franken_fts_limit_probe(&conn);

    // A failed quick check means the message table is not trusted: skip the
    // statistics queries and report zeros.
    let (message_count, max_message_id) = if quick_check_ok {
        let counted: i64 = conn
            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                row.get_typed(0)
            })
            .context("counting messages during frankensqlite database health probe")?;
        let highest_id: i64 = conn
            .query_row_map(
                "SELECT COALESCE(MAX(id), 0) FROM messages",
                fparams![],
                |row| row.get_typed(0),
            )
            .context("reading max message id during frankensqlite database health probe")?;
        (counted, highest_id)
    } else {
        (0, 0)
    };

    Ok(SqliteDatabaseHealthProbe {
        schema_version,
        quick_check_ok,
        fts_schema_rows,
        fts_queryable,
        message_count,
        max_message_id,
    })
}
2342
/// A historical bundle copied into a temporary staging directory, ready to be
/// finalized and promoted to the canonical database path.
struct StagedHistoricalSeed {
    /// Staging directory created inside the canonical database's parent
    /// directory; promotion also parks the pre-seed canonical backup in it.
    tempdir: tempfile::TempDir,
    /// Path of the staged database copy (`baseline-seed-output.db`) inside
    /// `tempdir`.
    db_path: PathBuf,
}
2347
2348fn stage_historical_bundle_for_seed(
2349    canonical_db_path: &Path,
2350    source_root_path: &Path,
2351) -> Result<StagedHistoricalSeed> {
2352    let canonical_parent = canonical_db_path.parent().unwrap_or_else(|| Path::new("."));
2353    fs::create_dir_all(canonical_parent).with_context(|| {
2354        format!(
2355            "creating canonical database directory before bulk historical seed import: {}",
2356            canonical_parent.display()
2357        )
2358    })?;
2359    let tempdir = tempfile::TempDir::new_in(canonical_parent)
2360        .context("creating temporary baseline seed directory")?;
2361    let staged_seed_db = tempdir.path().join("baseline-seed-output.db");
2362    copy_database_bundle(source_root_path, &staged_seed_db)?;
2363
2364    Ok(StagedHistoricalSeed {
2365        tempdir,
2366        db_path: staged_seed_db,
2367    })
2368}
2369
2370fn promote_staged_historical_seed(
2371    canonical_db_path: &Path,
2372    staged_seed: &StagedHistoricalSeed,
2373) -> Result<()> {
2374    let canonical_backup = staged_seed
2375        .tempdir
2376        .path()
2377        .join("pre-seed-canonical-backup.db");
2378    let had_canonical = canonical_db_path.exists()
2379        || database_sidecar_path(canonical_db_path, "-wal").exists()
2380        || database_sidecar_path(canonical_db_path, "-shm").exists();
2381
2382    if had_canonical {
2383        move_database_bundle(canonical_db_path, &canonical_backup).with_context(|| {
2384            format!(
2385                "backing up canonical database before promoting staged historical seed import: {}",
2386                canonical_db_path.display()
2387            )
2388        })?;
2389    }
2390
2391    if let Err(err) =
2392        move_database_bundle(&staged_seed.db_path, canonical_db_path).with_context(|| {
2393            format!(
2394                "promoting staged historical seed database bundle {} into canonical path {}",
2395                staged_seed.db_path.display(),
2396                canonical_db_path.display()
2397            )
2398        })
2399    {
2400        if had_canonical {
2401            let _ = move_database_bundle(&canonical_backup, canonical_db_path);
2402        }
2403        return Err(err);
2404    }
2405
2406    Ok(())
2407}
2408
/// Seeds the canonical database from the best usable historical bundle.
///
/// Bundles are tried in the order produced by
/// `discover_historical_database_bundles`. For each candidate the bundle is
/// opened (directly or via recovery), counted, staged next to the canonical
/// path, finalized, and promoted. The first candidate that completes all
/// steps wins.
///
/// Returns `Ok(None)` when there were no candidates, `Ok(Some(outcome))` on
/// success, and the most recent recorded failure when every candidate failed.
///
/// # Errors
/// Besides the "all candidates failed" case, a failure to open a candidate or
/// to count its rows is returned immediately (see the review note below).
pub(crate) fn seed_canonical_from_best_historical_bundle(
    canonical_db_path: &Path,
) -> Result<Option<HistoricalSalvageOutcome>> {
    let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
    // Remembers the latest per-candidate failure so it can be reported if no
    // candidate succeeds.
    let mut last_seed_error: Option<anyhow::Error> = None;
    for bundle in ordered_bundles {
        // Bundles with a known schema version below 13 are never used as a
        // baseline; an unknown version (`None`) is allowed through.
        if let Some(version) = bundle.probe.schema_version
            && version < 13
        {
            let err = anyhow!(
                "historical bundle {} schema_version {version} is too old for baseline import",
                bundle.root_path.display()
            );
            tracing::warn!(
                path = %bundle.root_path.display(),
                schema_version = version,
                "historical bundle is too old for baseline seed import"
            );
            last_seed_error = Some(err);
            continue;
        }

        // NOTE(review): unlike the staging/finalize/promote steps below, a
        // failure here (and in `historical_bundle_counts`) aborts the whole
        // search via `?` instead of moving on to the next candidate — confirm
        // this is intentional.
        let source = open_historical_bundle_for_salvage(&bundle).with_context(|| {
            format!(
                "opening historical seed bundle {} for baseline import",
                bundle.root_path.display()
            )
        })?;
        let (conversations_imported, messages_imported) = historical_bundle_counts(&source.conn)?;

        // Stage from `source.root_path`, which is the recovered copy when the
        // bundle had to go through `sqlite3 .recover`.
        let staged_seed = match stage_historical_bundle_for_seed(
            canonical_db_path,
            &source.root_path,
        ) {
            Ok(staged_seed) => staged_seed,
            Err(err) => {
                tracing::warn!(
                    path = %bundle.root_path.display(),
                    error = %err,
                    "bulk baseline seed staging from historical bundle failed; trying next candidate"
                );
                last_seed_error = Some(err);
                continue;
            }
        };

        // Finalization runs against the staged copy, so a failure leaves the
        // canonical database untouched.
        if let Err(err) = finalize_seeded_canonical_bundle_via_rusqlite(
            &staged_seed.db_path,
            &bundle,
            conversations_imported,
            messages_imported,
        ) {
            tracing::warn!(
                path = %bundle.root_path.display(),
                error = %err,
                "finalizing staged historical seed import failed; trying next candidate"
            );
            last_seed_error = Some(err);
            continue;
        }

        if let Err(err) = promote_staged_historical_seed(canonical_db_path, &staged_seed) {
            tracing::warn!(
                path = %bundle.root_path.display(),
                error = %err,
                "promoting staged historical seed import failed; trying next candidate"
            );
            last_seed_error = Some(err);
            continue;
        }

        tracing::info!(
            path = %bundle.root_path.display(),
            conversations_imported,
            messages_imported,
            "seeded empty canonical database from largest healthy historical bundle"
        );

        // NOTE(review): `bundles_considered` is reported as 0 even though at
        // least one bundle was examined — verify against how callers
        // interpret this field.
        return Ok(Some(HistoricalSalvageOutcome {
            bundles_considered: 0,
            bundles_imported: 1,
            conversations_imported,
            messages_imported,
        }));
    }
    if let Some(err) = last_seed_error {
        return Err(err);
    }
    Ok(None)
}
2499
2500fn parse_json_column(value: Option<String>) -> serde_json::Value {
2501    value
2502        .and_then(|raw| serde_json::from_str(&raw).ok())
2503        .unwrap_or(serde_json::Value::Null)
2504}
2505
/// Sole key of the wrapper object that carries an unparsed historical JSON
/// string (built by `wrap_historical_raw_json`, read by `historical_raw_json`).
const HISTORICAL_RAW_JSON_SENTINEL_KEY: &str = "__cass_historical_raw_json__";
2507
2508fn wrap_historical_raw_json(raw: String) -> serde_json::Value {
2509    serde_json::json!({ HISTORICAL_RAW_JSON_SENTINEL_KEY: raw })
2510}
2511
2512fn historical_raw_json(value: &serde_json::Value) -> Option<&str> {
2513    match value {
2514        serde_json::Value::Object(map) if map.len() == 1 => map
2515            .get(HISTORICAL_RAW_JSON_SENTINEL_KEY)
2516            .and_then(serde_json::Value::as_str),
2517        _ => None,
2518    }
2519}
2520
2521fn parse_historical_json_column(value: Option<String>) -> serde_json::Value {
2522    match value {
2523        Some(raw) if raw.trim().is_empty() => serde_json::Value::Null,
2524        Some(raw) => wrap_historical_raw_json(raw),
2525        None => serde_json::Value::Null,
2526    }
2527}
2528
/// Returns `true` when the `CASS_DEBUG_HISTORICAL_SALVAGE` environment
/// variable is set, regardless of its value.
fn historical_salvage_debug_enabled() -> bool {
    matches!(
        std::env::var_os("CASS_DEBUG_HISTORICAL_SALVAGE"),
        Some(_)
    )
}
2532
/// Size limits for one historical import batch.
///
/// Built by `historical_import_batch_limits`, which picks CPU-count-based
/// defaults and lets `CASS_HISTORICAL_IMPORT_BATCH_*` environment variables
/// override each field.
///
/// NOTE(review): how the limits are enforced is outside this chunk; the field
/// descriptions are inferred from the names and environment keys — confirm.
#[derive(Debug, Clone, Copy)]
struct HistoricalImportBatchLimits {
    /// Presumably the maximum number of conversations per batch.
    conversations: usize,
    /// Presumably the maximum number of messages per batch.
    messages: usize,
    /// Presumably the maximum accumulated payload size per batch.
    payload_chars: usize,
}
2539
2540fn env_positive_usize(key: &str) -> Option<usize> {
2541    dotenvy::var(key)
2542        .ok()
2543        .and_then(|value| value.parse::<usize>().ok())
2544        .filter(|value| *value > 0)
2545}
2546
2547fn historical_import_batch_limits() -> HistoricalImportBatchLimits {
2548    let cpu_count = std::thread::available_parallelism()
2549        .map(std::num::NonZeroUsize::get)
2550        .unwrap_or(1);
2551
2552    let default_limits = if cpu_count >= 32 {
2553        HistoricalImportBatchLimits {
2554            conversations: 128,
2555            messages: 16_384,
2556            payload_chars: 12_000_000,
2557        }
2558    } else {
2559        HistoricalImportBatchLimits {
2560            conversations: 32,
2561            messages: 4_096,
2562            payload_chars: 3_000_000,
2563        }
2564    };
2565
2566    HistoricalImportBatchLimits {
2567        conversations: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CONVERSATIONS")
2568            .unwrap_or(default_limits.conversations),
2569        messages: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_MESSAGES")
2570            .unwrap_or(default_limits.messages),
2571        payload_chars: env_positive_usize("CASS_HISTORICAL_IMPORT_BATCH_CHARS")
2572            .unwrap_or(default_limits.payload_chars),
2573    }
2574}
2575
2576fn json_value_size_hint(value: &serde_json::Value) -> usize {
2577    if let Some(raw) = historical_raw_json(value) {
2578        return raw.len();
2579    }
2580    match value {
2581        serde_json::Value::Null => 0,
2582        other => serde_json::to_string(other)
2583            .map(|raw| raw.len())
2584            .unwrap_or(0),
2585    }
2586}
2587
2588fn message_payload_size_hint(message: &Message) -> usize {
2589    message
2590        .content
2591        .len()
2592        .saturating_add(json_value_size_hint(&message.extra_json))
2593}
2594
/// Decide whether `name` is a backup root file for the given `prefix`.
///
/// The name must start with `prefix` and must not be a `-wal` / `-shm`
/// sidecar. Other sidecar suffixes (for example lock files) are accepted here.
fn is_backup_root_name(name: &str, prefix: &str) -> bool {
    if !name.starts_with(prefix) {
        return false;
    }
    let is_wal_or_shm = name.ends_with("-wal") || name.ends_with("-shm");
    !is_wal_or_shm
}
2598
2599// Suffixes that mark sqlite sidecar files we must never re-open as a DB root.
2600// Includes the standard -wal/-shm pair plus frankensqlite's Windows advisory-
2601// lock sidecars (-lock-shared/-lock-reserved/-lock-pending). Used by directory
2602// enumeration paths in `historical_bundle_root_paths`; deliberately NOT used
2603// by `is_backup_root_name`, because the existing backup-rotation cleanup must
2604// continue to sweep up any pre-existing orphan lock sidecars.
/// Return `true` when `name` ends with one of the known database sidecar
/// suffixes (`-wal`, `-shm`, or a `-lock-*` advisory-lock suffix).
fn has_db_sidecar_suffix(name: &str) -> bool {
    const SIDECAR_SUFFIXES: [&str; 5] = [
        "-wal",
        "-shm",
        "-lock-shared",
        "-lock-reserved",
        "-lock-pending",
    ];
    for suffix in &SIDECAR_SUFFIXES {
        if name.ends_with(*suffix) {
            return true;
        }
    }
    false
}
2615
/// Public schema version constant for external checks.
pub const CURRENT_SCHEMA_VERSION: i64 = 20;
/// Oldest schema version that `check_schema_compatibility` classifies as
/// `NeedsMigration`; positive versions below it are classified as
/// `NeedsRebuild`.
const MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION: i64 = 13;
2619
/// Result of checking schema compatibility.
///
/// Produced by `check_schema_compatibility`.
#[derive(Debug, Clone)]
pub enum SchemaCheck {
    /// Schema is up to date, no migration needed.
    Compatible,
    /// Schema needs migration but can be done incrementally.
    ///
    /// Also reported for a database that contains no tables at all.
    NeedsMigration,
    /// Schema is incompatible and needs a full rebuild (with reason).
    ///
    /// The payload is a human-readable explanation.
    NeedsRebuild(String),
}
2630
2631fn schema_check_error_requires_rebuild(err: &frankensqlite::FrankenError) -> bool {
2632    // Only on-disk corruption classes justify destructive rebuild.
2633    // Locking, open, and generic I/O failures are often transient and must
2634    // surface as errors rather than deleting the database under the caller.
2635    matches!(
2636        err,
2637        frankensqlite::FrankenError::DatabaseCorrupt { .. }
2638            | frankensqlite::FrankenError::WalCorrupt { .. }
2639            | frankensqlite::FrankenError::NotADatabase { .. }
2640            | frankensqlite::FrankenError::ShortRead { .. }
2641    )
2642}
2643
/// Build a sibling path for a backup of `path` that is unique per call.
///
/// The file name becomes `<name>.backup.<pid>.<nanos-since-epoch>.<nonce>`,
/// where `<name>` falls back to `db` when `path` has no UTF-8 file name and
/// the timestamp falls back to `0` if the clock reads before the Unix epoch.
fn unique_backup_path(path: &Path) -> PathBuf {
    // Process-wide counter that separates calls landing on the same timestamp.
    static NEXT_NONCE: std::sync::atomic::AtomicU64 = std::sync::atomic::AtomicU64::new(0);

    let nanos_since_epoch = SystemTime::now()
        .duration_since(UNIX_EPOCH)
        .map_or(0, |elapsed| elapsed.as_nanos());
    let sequence = NEXT_NONCE.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
    let base_name = path
        .file_name()
        .and_then(|name| name.to_str())
        .unwrap_or("db");
    let pid = std::process::id();

    let backup_name = format!("{base_name}.backup.{pid}.{nanos_since_epoch}.{sequence}");
    path.with_file_name(backup_name)
}
2661
/// Derive the hidden staging path used while a backup is being vacuumed.
///
/// The result is a sibling of `backup_path` named
/// `.<name>.vacuum-in-progress`; `<name>` falls back to `db.backup` when the
/// input has no UTF-8 file name.
fn vacuum_stage_backup_path(backup_path: &Path) -> PathBuf {
    let base_name = backup_path.file_name().and_then(|name| name.to_str());
    let staged_name = format!(".{}.vacuum-in-progress", base_name.unwrap_or("db.backup"));
    backup_path.with_file_name(staged_name)
}
2669
2670/// Check schema compatibility without modifying the database.
2671///
2672/// Opens the database read-only and checks the schema version.
2673fn check_schema_compatibility(
2674    path: &Path,
2675) -> std::result::Result<SchemaCheck, frankensqlite::FrankenError> {
2676    let mut conn = open_franken_with_flags(
2677        &path.to_string_lossy(),
2678        FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
2679    )?;
2680
2681    let result = (|| {
2682        // Check if meta table exists
2683        let meta_exists: i32 = conn.query_row_map(
2684            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name='meta'",
2685            fparams![],
2686            |row| row.get_typed(0),
2687        )?;
2688
2689        if meta_exists == 0 {
2690            // No meta table - could be empty or very old schema, needs rebuild
2691            // But first check if there are any tables at all
2692            let table_count: i32 = conn.query_row_map(
2693                "SELECT COUNT(*) FROM sqlite_master WHERE type='table'",
2694                fparams![],
2695                |row| row.get_typed(0),
2696            )?;
2697
2698            if table_count == 0 {
2699                // Empty database, will be initialized fresh
2700                return Ok(SchemaCheck::NeedsMigration);
2701            }
2702
2703            // Has tables but no meta - very old or corrupted
2704            return Ok(SchemaCheck::NeedsRebuild(
2705                "Database missing schema version metadata".to_string(),
2706            ));
2707        }
2708
2709        // Get the schema version
2710        let version: Option<i64> = conn
2711            .query_row_map(
2712                "SELECT value FROM meta WHERE key = 'schema_version'",
2713                fparams![],
2714                |row| Ok(row.get_typed::<String>(0)?.parse().ok()),
2715            )
2716            .ok()
2717            .flatten();
2718
2719        match version {
2720            Some(v) if v == SCHEMA_VERSION => Ok(SchemaCheck::Compatible),
2721            Some(v) if (MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION..SCHEMA_VERSION).contains(&v) => {
2722                Ok(SchemaCheck::NeedsMigration)
2723            }
2724            Some(v) if v > 0 && v < MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION => {
2725                Ok(SchemaCheck::NeedsRebuild(format!(
2726                    "Schema version {} is too old for in-place migration; supported upgrade path starts at version {}",
2727                    v, MIN_IN_PLACE_MIGRATION_SCHEMA_VERSION
2728                )))
2729            }
2730            Some(v) => {
2731                // v > SCHEMA_VERSION - database is from a newer version
2732                Ok(SchemaCheck::NeedsRebuild(format!(
2733                    "Schema version {} is newer than supported version {}",
2734                    v, SCHEMA_VERSION
2735                )))
2736            }
2737            None => Ok(SchemaCheck::NeedsRebuild(
2738                "Schema version not found or invalid".to_string(),
2739            )),
2740        }
2741    })();
2742
2743    if let Err(close_err) = conn.close_in_place() {
2744        tracing::warn!(
2745            error = %close_err,
2746            db_path = %path.display(),
2747            "check_schema_compatibility: close_in_place failed; falling back to best-effort close"
2748        );
2749        conn.close_best_effort_in_place();
2750    }
2751
2752    result
2753}
2754
/// Module-private alias of [`CURRENT_SCHEMA_VERSION`].
const SCHEMA_VERSION: i64 = CURRENT_SCHEMA_VERSION;
2756
/// Test-only V1 schema SQL: enables foreign keys and creates the `meta`,
/// `agents`, `workspaces`, `conversations`, `messages`, `snippets`, `tags`
/// and `conversation_tags` tables plus two lookup indexes.
#[cfg(test)]
const MIGRATION_V1: &str = r"
PRAGMA foreign_keys = ON;

CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    UNIQUE(agent_id, external_id)
);

CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

CREATE INDEX IF NOT EXISTS idx_conversations_agent_started
    ON conversations(agent_id, started_at DESC);

CREATE INDEX IF NOT EXISTS idx_messages_conv_idx
    ON messages(conversation_id, idx);

";
2836
/// Test-only V2 SQL: creates the `fts_messages` FTS5 table (porter tokenizer)
/// if absent and fills it from `messages` joined with their conversation,
/// agent and (optional) workspace.
#[cfg(test)]
const MIGRATION_V2: &str = r"
CREATE VIRTUAL TABLE IF NOT EXISTS fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
2863
/// Test-only V3 SQL: drops `fts_messages`, recreates it with the same column
/// set as V2, and repopulates it from the relational tables.
#[cfg(test)]
#[allow(dead_code)]
const MIGRATION_V3: &str = r"
DROP TABLE IF EXISTS fts_messages;
CREATE VIRTUAL TABLE fts_messages USING fts5(
    content,
    title,
    agent,
    workspace,
    source_path,
    created_at UNINDEXED,
    message_id UNINDEXED,
    tokenize='porter'
);
INSERT INTO fts_messages(content, title, agent, workspace, source_path, created_at, message_id)
SELECT
    m.content,
    c.title,
    a.slug,
    w.path,
    c.source_path,
    m.created_at,
    m.id
FROM messages m
JOIN conversations c ON m.conversation_id = c.id
JOIN agents a ON c.agent_id = a.id
LEFT JOIN workspaces w ON c.workspace_id = w.id;
";
2892
/// Test-only V4 SQL: creates the `sources` table and inserts the default
/// `'local'` source row if it is not already present.
#[cfg(test)]
const MIGRATION_V4: &str = r"
-- Sources table for tracking where conversations come from
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,           -- source_id (e.g., 'local', 'work-laptop')
    kind TEXT NOT NULL,            -- 'local', 'ssh', etc.
    host_label TEXT,               -- display label
    machine_id TEXT,               -- optional stable machine id
    platform TEXT,                 -- 'macos', 'linux', 'windows'
    config_json TEXT,              -- JSON blob for extra config (SSH params, path rewrites)
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

-- Bootstrap: Insert the default 'local' source
INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
2911
/// Test-only V5 SQL: rebuilds `conversations` with `source_id` and
/// `origin_host` columns and a `UNIQUE(source_id, agent_id, external_id)`
/// constraint, copying existing rows with `source_id = 'local'`, then
/// recreates the conversation indexes.
#[cfg(test)]
const MIGRATION_V5: &str = r"
-- Add provenance columns to conversations table
-- SQLite cannot alter unique constraints, so we need to recreate the table

-- Create new table with provenance columns and updated unique constraint
CREATE TABLE conversations_new (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    UNIQUE(source_id, agent_id, external_id)
);

-- Copy data from old table (all existing conversations get source_id='local')
INSERT INTO conversations_new (id, agent_id, workspace_id, source_id, external_id, title,
                               source_path, started_at, ended_at, approx_tokens, metadata_json, origin_host)
SELECT id, agent_id, workspace_id, 'local', external_id, title,
       source_path, started_at, ended_at, approx_tokens, metadata_json, NULL
FROM conversations;

-- Drop old table and rename new
DROP TABLE conversations;
ALTER TABLE conversations_new RENAME TO conversations;

-- Recreate indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
";
2949
/// Test-only V6 SQL: adds an index on `conversations(source_path)`.
#[cfg(test)]
const MIGRATION_V6: &str = r"
-- Optimize lookup by source_path (used by TUI detail view)
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
";
2955
/// Test-only V7 SQL: adds the `metadata_bin` (conversations) and `extra_bin`
/// (messages) BLOB columns.
#[cfg(test)]
const MIGRATION_V7: &str = r"
-- Add binary columns for MessagePack serialization (Opt 3.1)
-- Binary format is 50-70% smaller than JSON and faster to parse
ALTER TABLE conversations ADD COLUMN metadata_bin BLOB;
ALTER TABLE messages ADD COLUMN extra_bin BLOB;
";
2963
/// Test-only V8 SQL: creates the `daily_stats` rollup table keyed by
/// `(day_id, agent_slug, source_id)` and its two secondary indexes.
#[cfg(test)]
const MIGRATION_V8: &str = r"
-- Opt 3.2: Daily stats materialized table for O(1) time-range histograms
-- Provides fast aggregated queries for stats/dashboard without full table scans

CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,              -- Days since 2020-01-01 (Unix epoch + offset)
    agent_slug TEXT NOT NULL,             -- 'all' for totals, or specific agent slug
    source_id TEXT NOT NULL DEFAULT 'all', -- 'all' for totals, or specific source
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
2983
/// Test-only V9 SQL: creates the `embedding_jobs` table and a partial unique
/// index that allows at most one `pending`/`running` job per
/// `(db_path, model_id)`.
#[cfg(test)]
const MIGRATION_V9: &str = r"
-- Background embedding jobs tracking table
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

-- Only one pending or running job per (db_path, model_id) at a time.
-- Multiple completed/failed/cancelled jobs are allowed for history.
CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
3006
/// Test-only V10 SQL: creates the `token_usage` ledger, the
/// `token_daily_stats` rollup and the seeded `model_pricing` table, then adds
/// token-summary columns to `conversations`.
#[cfg(test)]
const MIGRATION_V10: &str = r"
-- Token analytics: per-message token usage ledger
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',

    -- Timing
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,

    -- Model identification
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,

    -- Token counts (nullable — not all agents provide all fields)
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,

    -- Cost estimation
    estimated_cost_usd REAL,

    -- Message context
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,

    -- Data quality
    data_source TEXT NOT NULL DEFAULT 'api',

    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

-- Token analytics: pre-aggregated daily rollups
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',

    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,

    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,

    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,

    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,

    session_count INTEGER NOT NULL DEFAULT 0,

    last_updated INTEGER NOT NULL,

    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

-- Model pricing lookup table
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

-- Seed with current pricing (as of 2026-02)
INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Extend conversations table with token summary columns
ALTER TABLE conversations ADD COLUMN total_input_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_output_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_read_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN total_cache_creation_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN grand_total_tokens INTEGER;
ALTER TABLE conversations ADD COLUMN estimated_cost_usd REAL;
ALTER TABLE conversations ADD COLUMN primary_model TEXT;
ALTER TABLE conversations ADD COLUMN api_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN tool_call_count INTEGER;
ALTER TABLE conversations ADD COLUMN user_message_count INTEGER;
ALTER TABLE conversations ADD COLUMN assistant_message_count INTEGER;
";
3130
/// V14 migration SQL: drops the `fts_messages` table; this SQL does not
/// recreate it (see the embedded SQL comment for the rationale).
const MIGRATION_V14: &str = r"
-- Switch FTS5 from internal-content to contentless mode (CASS #163).
-- Drop the old V13 internal-content fts_messages first so that
-- sqlite_schema does not contain two conflicting CREATE VIRTUAL TABLE
-- entries, which makes the database completely unreadable.
-- The current contentless table is recreated lazily after open() only when the
-- frankensqlite FTS consistency check finds it missing or malformed.
DROP TABLE IF EXISTS fts_messages;
";
3140
/// V15 SQL fragment: creates the `conversation_tail_state` table if it does
/// not exist. The column set matches the `CREATE TABLE` in `MIGRATION_V18`.
const MIGRATION_V15_TAIL_STATE_TABLE: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);
";
3151
/// V16 migration SQL: drops the `idx_messages_conv_idx` index, which
/// duplicates the index implied by `UNIQUE(conversation_id, idx)`.
const MIGRATION_V16: &str = r"
-- UNIQUE(conversation_id, idx) already creates sqlite_autoindex_messages_1,
-- which covers the same lookup/order key as idx_messages_conv_idx. Keeping both
-- doubles message insert index maintenance on the hot indexing path.
DROP INDEX IF EXISTS idx_messages_conv_idx;
";
3158
/// V17 migration SQL: drops the `idx_messages_created` index.
const MIGRATION_V17: &str = r"
-- Drop the global messages(created_at) secondary index from the ingest hot
-- path. Search/time filters are served by the derived search layer and
-- conversation/analytics indexes, while this index is maintained on every
-- message insert.
DROP INDEX IF EXISTS idx_messages_created;
";
3166
/// V18 migration SQL: ensures `conversation_tail_state` exists and backfills
/// it from every `conversations` row that has at least one non-NULL tail
/// column (`ended_at`, `last_message_idx`, `last_message_created_at`).
const MIGRATION_V18: &str = r"
-- Move append-tail state out of the wide, indexed conversations row. The hot
-- append path updates this cache for every appended conversation; keeping it in
-- a tiny rowid table avoids rewriting the large conversation record.
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    -- Deliberately no FOREIGN KEY: this hot row is maintained by insert/append
    -- paths, and FK metadata keeps frankensqlite off the direct rowid update path.
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_tail_state (
    conversation_id, ended_at, last_message_idx, last_message_created_at
)
SELECT id, ended_at, last_message_idx, last_message_created_at
FROM conversations
WHERE ended_at IS NOT NULL
   OR last_message_idx IS NOT NULL
   OR last_message_created_at IS NOT NULL;
";
3189
/// V19 migration SQL: creates `conversation_external_lookup` and backfills
/// one row per conversation with a non-NULL `external_id`.
///
/// The key is built as
/// `<len(source_id)>:<source_id>:<agent_id>:<len(external_id)>:<external_id>`.
const MIGRATION_V19: &str = r"
-- Materialize external conversation provenance into one compact lookup key.
-- This keeps the hot append/new-conversation probe on a single primary-key
-- lookup instead of a composite conversations-table predicate.
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
3208
/// V20 migration SQL: creates `conversation_external_tail_lookup` and
/// backfills it for every conversation with a non-NULL `external_id`.
///
/// The lookup key uses the same construction as `MIGRATION_V19`; the tail
/// columns are copied from `conversation_tail_state` and are NULL when no
/// tail-state row exists for the conversation.
const MIGRATION_V20: &str = r"
-- Fuse external conversation lookup with append-tail state. Append-heavy
-- workloads can resolve both the conversation id and tail plan from one
-- primary-key probe.
CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    (SELECT ts.ended_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_idx
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id),
    (SELECT ts.last_message_created_at
     FROM conversation_tail_state ts
     WHERE ts.conversation_id = c.id)
FROM conversations c
WHERE c.external_id IS NOT NULL;
";
3245
/// Row from the embedding_jobs table.
///
/// Field names and nullability follow the `embedding_jobs` DDL in
/// `MIGRATION_V9`; timestamp columns are stored as TEXT there.
#[derive(Debug, Clone)]
pub struct EmbeddingJobRow {
    /// Auto-incremented primary key.
    pub id: i64,
    /// Database path the job belongs to.
    pub db_path: String,
    /// Embedding model identifier.
    pub model_id: String,
    /// Job status text; the DDL defaults it to `'pending'` and its comments
    /// also mention `running`, `completed`, `failed` and `cancelled`.
    pub status: String,
    /// Total document count (DDL default `0`).
    pub total_docs: i64,
    /// Completed document count (DDL default `0`).
    pub completed_docs: i64,
    /// Error text, if any was recorded.
    pub error_message: Option<String>,
    /// Creation timestamp as text (DDL default `datetime('now')`).
    pub created_at: String,
    /// Start timestamp as text; `None` while the column is NULL.
    pub started_at: Option<String>,
    /// Completion timestamp as text; `None` while the column is NULL.
    pub completed_at: Option<String>,
}
3260
/// Lightweight conversation projection used while rebuilding the lexical index.
///
/// This intentionally omits `metadata_json` / `metadata_bin` and other bulky
/// fields because Tantivy only needs the stable envelope plus provenance
/// identifiers. Reading full metadata here can force frankensqlite to traverse
/// large overflow chains before the first lexical checkpoint is committed.
#[derive(Debug, Clone)]
pub struct LexicalRebuildConversationRow {
    /// Conversation row id, when available.
    // NOTE(review): the producer that would leave this `None` is not visible here.
    pub id: Option<i64>,
    /// Slug of the owning agent.
    pub agent_slug: String,
    /// Workspace path, if the conversation has one.
    pub workspace: Option<PathBuf>,
    /// External identifier, if any.
    pub external_id: Option<String>,
    /// Conversation title, if any.
    pub title: Option<String>,
    /// Path the conversation was read from.
    pub source_path: PathBuf,
    /// Start timestamp, if recorded. NOTE(review): unit not shown here — confirm.
    pub started_at: Option<i64>,
    /// End timestamp, if recorded. NOTE(review): unit not shown here — confirm.
    pub ended_at: Option<i64>,
    /// Provenance: id of the source the conversation came from.
    pub source_id: String,
    /// Provenance: originating host, if recorded.
    pub origin_host: Option<String>,
}
3280
/// Lightweight per-conversation footprint used to pre-plan lexical rebuild
/// shard boundaries without re-reading full message bodies in the hot path.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct LexicalRebuildConversationFootprintRow {
    /// Conversation the footprint describes.
    pub conversation_id: i64,
    /// Number of messages attributed to the conversation.
    pub message_count: usize,
    /// Byte footprint of those messages. When built by
    /// `lexical_rebuild_conversation_footprint_from_count` this is an
    /// estimate (`message_count` × a fixed per-message size), not a measurement.
    pub message_bytes: usize,
}
3289
/// Assumed size of one message (4 KiB) used by
/// `lexical_rebuild_conversation_footprint_from_count` to estimate
/// `message_bytes` from a message count.
pub(crate) const LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE: usize = 4 * 1024;
/// Largest number of conversations that may lack tail metadata while the
/// coverage is still considered sufficient.
const LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT: usize = 64;

/// Decide whether tail metadata covers enough conversations.
///
/// Coverage is sufficient when there are no conversations at all, or when
/// the number of uncovered conversations does not exceed
/// `LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT`. A covered count
/// larger than the total is clamped to the total.
fn lexical_rebuild_tail_metadata_coverage_is_sufficient(
    total_conversations: usize,
    covered_conversations: usize,
) -> bool {
    if total_conversations == 0 {
        return true;
    }
    let covered = covered_conversations.min(total_conversations);
    let uncovered = total_conversations.saturating_sub(covered);
    uncovered <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT
}
3301
/// Convert a zero-based last-message index into a message count (`idx + 1`).
///
/// Returns `None` when the index is absent or negative, or when the count
/// would overflow `u64` or not fit in `usize`.
fn lexical_rebuild_message_count_from_tail_idx(last_message_idx: Option<i64>) -> Option<usize> {
    last_message_idx
        .and_then(|idx| u64::try_from(idx).ok())
        .and_then(|idx| idx.checked_add(1))
        .and_then(|high_water| usize::try_from(high_water).ok())
}
3307
3308fn lexical_rebuild_conversation_footprint_from_count(
3309    conversation_id: i64,
3310    message_count: usize,
3311) -> LexicalRebuildConversationFootprintRow {
3312    LexicalRebuildConversationFootprintRow {
3313        conversation_id,
3314        message_count,
3315        message_bytes: message_count
3316            .saturating_mul(LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE),
3317    }
3318}
3319
/// Lightweight message projection used by the streaming lexical rebuild path.
///
/// The field names match columns of the `messages` table as declared in
/// `MIGRATION_V1`; `extra_json` is not part of this projection.
#[derive(Debug, Clone)]
pub struct LexicalRebuildMessageRow {
    /// Id of the conversation the message belongs to.
    pub conversation_id: i64,
    /// Message row id.
    pub id: i64,
    /// Position of the message within its conversation.
    pub idx: i64,
    /// Role text as stored.
    pub role: String,
    /// Author, if recorded.
    pub author: Option<String>,
    /// Creation timestamp, if recorded. NOTE(review): unit not shown here — confirm.
    pub created_at: Option<i64>,
    /// Message body.
    pub content: String,
}
3331
/// Even lighter message projection used only by the grouped lexical rebuild
/// stream hot path. It keeps just the per-message fields the rebuild consumes
/// and tracks the final message id at conversation scope instead.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct LexicalRebuildGroupedMessageRow {
    /// Position of the message within its conversation.
    pub idx: i64,
    /// Whether the message's role is the tool role.
    // NOTE(review): the role-to-flag mapping lives in the producer, outside this view.
    pub is_tool_role: bool,
    /// Creation timestamp, if recorded.
    pub created_at: Option<i64>,
    /// Message body.
    pub content: String,
}
3342
/// Message rows of one grouped conversation; the first 32 rows are stored
/// inline by `SmallVec` before it spills to the heap.
pub type LexicalRebuildGroupedMessageRows = SmallVec<[LexicalRebuildGroupedMessageRow; 32]>;

/// Compatibility alias retained while call sites finish converging on `FrankenStorage`.
pub type SqliteStorage = FrankenStorage;
3347
/// Primary frankensqlite-backed storage backend.
///
/// NOTE(review): the field notes below are inferred from names and types in
/// this section only; the methods that use them are outside this view.
pub struct FrankenStorage {
    /// The frankensqlite connection owned by this storage handle.
    conn: FrankenConnection,
    /// Database path associated with this handle (presumably the path `conn`
    /// was opened from — confirm).
    db_path: PathBuf,
    /// Flag for the ephemeral-writer preflight (presumably set once it has
    /// succeeded — confirm).
    ephemeral_writer_preflight_verified: AtomicBool,
    /// Index-writer checkpoint page setting; presumably holds
    /// `UNSET_INDEX_WRITER_CHECKPOINT_PAGES` until configured — confirm.
    index_writer_checkpoint_pages: AtomicI64,
    /// Index-writer busy timeout in milliseconds; presumably holds
    /// `UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS` until configured — confirm.
    index_writer_busy_timeout_ms: AtomicU64,
    /// Slot for a reusable ephemeral writer connection (see
    /// `CachedEphemeralWriter`), guarded by a mutex.
    cached_ephemeral_writer: parking_lot::Mutex<CachedEphemeralWriter>,
    /// Map from agent key to an `i64` (presumably the agent row id — confirm).
    ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
    /// Map from workspace key to an `i64` (presumably the workspace row id — confirm).
    ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
    /// Set of conversation-source keys already handled.
    ensured_conversation_sources: Arc<parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>>,
    /// Set of daily-stats keys already handled.
    ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
    /// Cached presence state of `fts_messages`; presumably one of the
    /// `FTS_MESSAGES_PRESENT_*` values — confirm.
    fts_messages_present_cache: AtomicI8,
}
3362
/// Keep ordinary storage commits from tripping over frequent auto-checkpoints
/// while still bounding WAL growth. Bulk index paths may override this through
/// their explicit checkpoint policy.
const DEFAULT_WAL_AUTOCHECKPOINT_PAGES: i64 = 4096;
/// Sentinel (`i64::MIN`) meaning "no index-writer checkpoint page value set".
// NOTE(review): presumably the initial value of
// `FrankenStorage::index_writer_checkpoint_pages` — confirm.
const UNSET_INDEX_WRITER_CHECKPOINT_PAGES: i64 = i64::MIN;
/// Sentinel (`0`) meaning "no index-writer busy timeout set".
// NOTE(review): presumably the initial value of
// `FrankenStorage::index_writer_busy_timeout_ms` — confirm.
const UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS: u64 = 0;
/// `fts_messages` presence state: not determined yet.
// NOTE(review): the three FTS_MESSAGES_PRESENT_* values are presumably what
// `FrankenStorage::fts_messages_present_cache` stores — confirm.
const FTS_MESSAGES_PRESENT_UNKNOWN: i8 = 0;
/// `fts_messages` presence state: table is absent.
const FTS_MESSAGES_PRESENT_ABSENT: i8 = 1;
/// `fts_messages` presence state: table is present.
const FTS_MESSAGES_PRESENT_PRESENT: i8 = 2;
3372
/// State of the ephemeral-writer slot held in
/// `FrankenStorage::cached_ephemeral_writer`.
// NOTE(review): the transitions between states are implemented outside this
// section; the variant notes below are read from the names — confirm.
enum CachedEphemeralWriter {
    /// No writer connection has been stored yet.
    Uninitialized,
    /// A writer connection is stored and available (boxed).
    Cached(Box<SendFrankenConnection>),
    /// The writer connection has been taken out and is currently in use.
    InUse,
}
3378
3379#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3380struct EnsuredAgentKey {
3381    slug: String,
3382    name: String,
3383    version: Option<String>,
3384    kind: String,
3385}
3386
3387impl EnsuredAgentKey {
3388    fn from_agent(agent: &Agent) -> Self {
3389        Self {
3390            slug: agent.slug.clone(),
3391            name: agent.name.clone(),
3392            version: agent.version.clone(),
3393            kind: agent_kind_str(agent.kind.clone()),
3394        }
3395    }
3396}
3397
/// Cache key identifying a workspace by its path and optional display name.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredWorkspaceKey {
    path: String,
    display_name: Option<String>,
}

impl EnsuredWorkspaceKey {
    /// Take ownership of `path` and copy `display_name` (when given) into an owned key.
    fn new(path: String, display_name: Option<&str>) -> Self {
        let display_name = display_name.map(String::from);
        Self { path, display_name }
    }
}
3412
3413#[derive(Debug, Clone, PartialEq, Eq, Hash)]
3414struct EnsuredConversationSourceKey {
3415    id: String,
3416    kind: SourceKind,
3417    host_label: Option<String>,
3418}
3419
3420impl EnsuredConversationSourceKey {
3421    fn from_source(source: &Source) -> Self {
3422        Self {
3423            id: source.id.clone(),
3424            kind: source.kind,
3425            host_label: source.host_label.clone(),
3426        }
3427    }
3428}
3429
/// Cache key for a daily-stats entry: day id, agent slug, and source id.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct EnsuredDailyStatsKey {
    day_id: i64,
    agent_slug: String,
    source_id: String,
}

impl EnsuredDailyStatsKey {
    /// Build an owned key by copying the borrowed slug and source id.
    fn new(day_id: i64, agent_slug: &str, source_id: &str) -> Self {
        let agent_slug = String::from(agent_slug);
        let source_id = String::from(source_id);
        Self {
            day_id,
            agent_slug,
            source_id,
        }
    }
}
3446
/// PRAGMA spellings that turn `autocommit_retain` off, in the order
/// `disable_autocommit_retain` tries them; the first one that executes successfully wins.
const AUTOCOMMIT_RETAIN_OFF_PRAGMAS: [&str; 2] = [
    "PRAGMA fsqlite.autocommit_retain = OFF;",
    "PRAGMA autocommit_retain = OFF;",
];
3451
3452fn disable_autocommit_retain<E>(
3453    mut execute: impl FnMut(&'static str) -> std::result::Result<(), E>,
3454) -> Result<&'static str>
3455where
3456    E: std::fmt::Display,
3457{
3458    let mut failures = Vec::new();
3459    for pragma in AUTOCOMMIT_RETAIN_OFF_PRAGMAS {
3460        match execute(pragma) {
3461            Ok(()) => return Ok(pragma),
3462            Err(err) => {
3463                let error = err.to_string();
3464                tracing::debug!(
3465                    %pragma,
3466                    error = %error,
3467                    "autocommit_retain PRAGMA variant not supported"
3468                );
3469                failures.push(format!("{pragma}: {error}"));
3470            }
3471        }
3472    }
3473
3474    Err(anyhow!(
3475        "failed to disable autocommit_retain on frankensqlite connection; \
3476         refusing to keep a long-lived MVCC connection that may accumulate \
3477         unbounded write snapshots. Upgrade frankensqlite to a version that \
3478         supports one of these PRAGMAs or use a short-lived connection path. \
3479         attempts: {}",
3480        failures.join("; ")
3481    ))
3482}
3483
3484impl FrankenStorage {
3485    fn new(conn: FrankenConnection, db_path: PathBuf) -> Self {
3486        Self::new_with_shared_caches(
3487            conn,
3488            db_path,
3489            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3490            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3491            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3492            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3493        )
3494    }
3495
3496    fn new_with_shared_caches(
3497        conn: FrankenConnection,
3498        db_path: PathBuf,
3499        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3500        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3501        ensured_conversation_sources: Arc<
3502            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3503        >,
3504        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3505    ) -> Self {
3506        Self {
3507            conn,
3508            db_path,
3509            ephemeral_writer_preflight_verified: AtomicBool::new(false),
3510            index_writer_checkpoint_pages: AtomicI64::new(UNSET_INDEX_WRITER_CHECKPOINT_PAGES),
3511            index_writer_busy_timeout_ms: AtomicU64::new(UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS),
3512            cached_ephemeral_writer: parking_lot::Mutex::new(CachedEphemeralWriter::Uninitialized),
3513            ensured_agents,
3514            ensured_workspaces,
3515            ensured_conversation_sources,
3516            ensured_daily_stats_keys,
3517            fts_messages_present_cache: AtomicI8::new(FTS_MESSAGES_PRESENT_UNKNOWN),
3518        }
3519    }
3520
3521    fn apply_open_stage_busy_timeout(&self) {
3522        if let Err(err) = self.conn.execute("PRAGMA busy_timeout = 5000;") {
3523            tracing::debug!(
3524                error = %err,
3525                "failed to apply open-stage busy_timeout before migrations"
3526            );
3527        }
3528    }
3529
3530    /// Open a frankensqlite connection, run migrations, and apply config.
3531    ///
3532    /// This initializes canonical schema state only. Derived fallback search
3533    /// structures like the in-database `fts_messages` table are repaired
3534    /// separately so ordinary opens never block on heavyweight maintenance.
3535    pub fn open(path: &Path) -> Result<Self> {
3536        if let Some(parent) = path.parent() {
3537            fs::create_dir_all(parent)
3538                .with_context(|| format!("creating db directory {}", parent.display()))?;
3539        }
3540
3541        let path_str = path.to_string_lossy().to_string();
3542        let _doctor_guard =
3543            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3544        let conn = FrankenConnection::open(&path_str)
3545            .with_context(|| format!("opening frankensqlite db at {}", path.display()))?;
3546        let storage = Self::new(conn, path.to_path_buf());
3547        storage.apply_open_stage_busy_timeout();
3548        storage.run_migrations()?;
3549        storage.repair_missing_current_schema_objects()?;
3550        storage.apply_config()?;
3551        Ok(storage)
3552    }
3553
3554    /// Open a writer connection that skips migration (assumes DB already migrated).
3555    ///
3556    /// Used by the BEGIN CONCURRENT parallel writer pool: each writer needs its
3557    /// own connection with config applied, but migrations have already been run
3558    /// by the primary connection.
3559    pub fn open_writer(path: &Path) -> Result<Self> {
3560        Self::open_writer_with_shared_caches(
3561            path,
3562            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3563            Arc::new(parking_lot::Mutex::new(HashMap::new())),
3564            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3565            Arc::new(parking_lot::Mutex::new(HashSet::new())),
3566        )
3567    }
3568
3569    fn open_writer_with_shared_caches(
3570        path: &Path,
3571        ensured_agents: Arc<parking_lot::Mutex<HashMap<EnsuredAgentKey, i64>>>,
3572        ensured_workspaces: Arc<parking_lot::Mutex<HashMap<EnsuredWorkspaceKey, i64>>>,
3573        ensured_conversation_sources: Arc<
3574            parking_lot::Mutex<HashSet<EnsuredConversationSourceKey>>,
3575        >,
3576        ensured_daily_stats_keys: Arc<parking_lot::Mutex<HashSet<EnsuredDailyStatsKey>>>,
3577    ) -> Result<Self> {
3578        let path_str = path.to_string_lossy().to_string();
3579        let _doctor_guard =
3580            acquire_doctor_mutation_db_open_guard(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)?;
3581        let conn = FrankenConnection::open(&path_str)
3582            .with_context(|| format!("opening frankensqlite writer at {}", path.display()))?;
3583        let storage = Self::new_with_shared_caches(
3584            conn,
3585            path.to_path_buf(),
3586            ensured_agents,
3587            ensured_workspaces,
3588            ensured_conversation_sources,
3589            ensured_daily_stats_keys,
3590        );
3591        storage.apply_config()?;
3592        Ok(storage)
3593    }
3594
3595    pub(crate) fn acquire_cached_ephemeral_writer(&self) -> Result<(Self, bool)> {
3596        let mut cached = self.cached_ephemeral_writer.lock();
3597        match std::mem::replace(&mut *cached, CachedEphemeralWriter::InUse) {
3598            CachedEphemeralWriter::Cached(conn) => {
3599                let (conn, checkpoint_pages, busy_timeout_ms) = (*conn).into_parts();
3600                let writer = Self::new_with_shared_caches(
3601                    conn,
3602                    self.db_path.clone(),
3603                    Arc::clone(&self.ensured_agents),
3604                    Arc::clone(&self.ensured_workspaces),
3605                    Arc::clone(&self.ensured_conversation_sources),
3606                    Arc::clone(&self.ensured_daily_stats_keys),
3607                );
3608                writer
3609                    .index_writer_checkpoint_pages
3610                    .store(checkpoint_pages, Ordering::Relaxed);
3611                writer
3612                    .index_writer_busy_timeout_ms
3613                    .store(busy_timeout_ms, Ordering::Relaxed);
3614                Ok((writer, true))
3615            }
3616            CachedEphemeralWriter::Uninitialized => {
3617                drop(cached);
3618                match Self::open_writer_with_shared_caches(
3619                    &self.db_path,
3620                    Arc::clone(&self.ensured_agents),
3621                    Arc::clone(&self.ensured_workspaces),
3622                    Arc::clone(&self.ensured_conversation_sources),
3623                    Arc::clone(&self.ensured_daily_stats_keys),
3624                ) {
3625                    Ok(writer) => Ok((writer, true)),
3626                    Err(err) => {
3627                        let mut cached = self.cached_ephemeral_writer.lock();
3628                        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3629                            *cached = CachedEphemeralWriter::Uninitialized;
3630                        }
3631                        Err(err)
3632                    }
3633                }
3634            }
3635            CachedEphemeralWriter::InUse => {
3636                *cached = CachedEphemeralWriter::InUse;
3637                drop(cached);
3638                Ok((
3639                    Self::open_writer_with_shared_caches(
3640                        &self.db_path,
3641                        Arc::clone(&self.ensured_agents),
3642                        Arc::clone(&self.ensured_workspaces),
3643                        Arc::clone(&self.ensured_conversation_sources),
3644                        Arc::clone(&self.ensured_daily_stats_keys),
3645                    )?,
3646                    false,
3647                ))
3648            }
3649        }
3650    }
3651
3652    pub(crate) fn release_cached_ephemeral_writer(&self, writer: Self) {
3653        let checkpoint_pages = writer.index_writer_checkpoint_pages.load(Ordering::Relaxed);
3654        let busy_timeout_ms = writer.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
3655        let conn = writer.into_raw();
3656        let mut cached = self.cached_ephemeral_writer.lock();
3657        debug_assert!(
3658            matches!(&*cached, CachedEphemeralWriter::InUse),
3659            "cached ephemeral writer state should be in-use when releasing"
3660        );
3661        *cached = CachedEphemeralWriter::Cached(Box::new(
3662            SendFrankenConnection::new_with_index_writer_state(
3663                conn,
3664                checkpoint_pages,
3665                busy_timeout_ms,
3666            ),
3667        ));
3668    }
3669
3670    pub(crate) fn discard_cached_ephemeral_writer(&self, mut writer: Self) {
3671        writer.close_best_effort_in_place();
3672        let mut cached = self.cached_ephemeral_writer.lock();
3673        if matches!(&*cached, CachedEphemeralWriter::InUse) {
3674            *cached = CachedEphemeralWriter::Uninitialized;
3675        }
3676    }
3677
3678    fn cached_agent_id(&self, key: &EnsuredAgentKey) -> Option<i64> {
3679        self.ensured_agents.lock().get(key).copied()
3680    }
3681
3682    fn mark_agent_ensured(&self, key: EnsuredAgentKey, id: i64) {
3683        self.ensured_agents.lock().insert(key, id);
3684    }
3685
3686    fn cached_workspace_id(&self, key: &EnsuredWorkspaceKey) -> Option<i64> {
3687        self.ensured_workspaces.lock().get(key).copied()
3688    }
3689
3690    fn mark_workspace_ensured(&self, key: EnsuredWorkspaceKey, id: i64) {
3691        self.ensured_workspaces.lock().insert(key, id);
3692    }
3693
3694    fn conversation_source_already_ensured(&self, key: &EnsuredConversationSourceKey) -> bool {
3695        self.ensured_conversation_sources.lock().contains(key)
3696    }
3697
3698    fn mark_conversation_source_ensured(&self, key: EnsuredConversationSourceKey) {
3699        self.ensured_conversation_sources.lock().insert(key);
3700    }
3701
3702    fn daily_stats_key_already_ensured(&self, key: &EnsuredDailyStatsKey) -> bool {
3703        self.ensured_daily_stats_keys.lock().contains(key)
3704    }
3705
3706    fn daily_stats_keys_already_ensured(&self, keys: &[EnsuredDailyStatsKey; 4]) -> bool {
3707        let ensured = self.ensured_daily_stats_keys.lock();
3708        keys.iter().all(|key| ensured.contains(key))
3709    }
3710
3711    fn mark_daily_stats_key_ensured(&self, key: EnsuredDailyStatsKey) {
3712        self.ensured_daily_stats_keys.lock().insert(key);
3713    }
3714
3715    fn fts_messages_present_cached(&self, tx: &FrankenTransaction<'_>) -> bool {
3716        match self.fts_messages_present_cache.load(Ordering::Acquire) {
3717            FTS_MESSAGES_PRESENT_PRESENT => return true,
3718            FTS_MESSAGES_PRESENT_ABSENT => return false,
3719            _ => {}
3720        }
3721
3722        let present = tx
3723            .query_row_map(
3724                "SELECT COUNT(*) FROM sqlite_master
3725                 WHERE name = 'fts_messages'
3726                   AND rootpage > 0",
3727                fparams![],
3728                |row| row.get_typed::<i64>(0),
3729            )
3730            .map(|count| count > 0)
3731            .unwrap_or_else(|err| {
3732                tracing::debug!(
3733                    error = %err,
3734                    "failed to probe fts_messages presence; skipping db-resident FTS maintenance"
3735                );
3736                false
3737            });
3738        self.set_fts_messages_present_cache(present);
3739        present
3740    }
3741
3742    fn set_fts_messages_present_cache(&self, present: bool) {
3743        self.fts_messages_present_cache.store(
3744            if present {
3745                FTS_MESSAGES_PRESENT_PRESENT
3746            } else {
3747                FTS_MESSAGES_PRESENT_ABSENT
3748            },
3749            Ordering::Release,
3750        );
3751    }
3752
3753    fn invalidate_fts_messages_present_cache(&self) {
3754        self.fts_messages_present_cache
3755            .store(FTS_MESSAGES_PRESENT_UNKNOWN, Ordering::Release);
3756    }
3757
3758    fn invalidate_conversation_source_cache(&self, source_id: &str) {
3759        self.ensured_conversation_sources
3760            .lock()
3761            .retain(|key| key.id != source_id);
3762    }
3763
3764    fn close_cached_ephemeral_writer_best_effort_in_place(&mut self) {
3765        let cached = self.cached_ephemeral_writer.get_mut();
3766        if let CachedEphemeralWriter::Cached(conn) =
3767            std::mem::replace(cached, CachedEphemeralWriter::Uninitialized)
3768        {
3769            let mut conn = conn;
3770            conn.0.close_best_effort_in_place();
3771        }
3772    }
3773
3774    fn close_cached_ephemeral_writer_without_checkpoint_in_place(&mut self) -> Result<()> {
3775        let cached = self.cached_ephemeral_writer.get_mut();
3776        match std::mem::replace(cached, CachedEphemeralWriter::Uninitialized) {
3777            CachedEphemeralWriter::Cached(mut conn) => conn
3778                .0
3779                .close_without_checkpoint_in_place()
3780                .with_context(|| "closing cached frankensqlite writer without final checkpoint"),
3781            CachedEphemeralWriter::Uninitialized | CachedEphemeralWriter::InUse => Ok(()),
3782        }
3783    }
3784
3785    /// Open in read-only mode using frankensqlite compat flags.
3786    pub fn open_readonly(path: &Path) -> Result<Self> {
3787        Self::open_readonly_with_doctor_lock_timeout(path, DOCTOR_MUTATION_DB_OPEN_LOCK_TIMEOUT)
3788    }
3789
3790    /// Open in read-only mode with an explicit doctor mutation-lock timeout.
3791    ///
3792    /// This is primarily useful for probes that need to prove a reader would
3793    /// not enter the archive while `cass doctor --fix` owns the repair lock.
3794    pub fn open_readonly_with_doctor_lock_timeout(path: &Path, timeout: Duration) -> Result<Self> {
3795        let path_str = path.to_string_lossy().to_string();
3796        let _doctor_guard = acquire_doctor_mutation_db_open_guard(path, timeout)?;
3797        let conn = open_franken_with_flags(&path_str, FrankenOpenFlags::SQLITE_OPEN_READ_ONLY)
3798            .with_context(|| format!("opening frankensqlite db readonly at {}", path.display()))?;
3799        let storage = Self::new(conn, path.to_path_buf());
3800        storage.apply_readonly_config()?;
3801        Ok(storage)
3802    }
3803
3804    pub fn close(self) -> Result<()> {
3805        let mut this = self;
3806        this.close_cached_ephemeral_writer_best_effort_in_place();
3807        this.conn
3808            .close()
3809            .with_context(|| "closing frankensqlite connection")
3810    }
3811
3812    pub fn close_without_checkpoint(self) -> Result<()> {
3813        let mut this = self;
3814        this.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3815        this.conn
3816            .close_without_checkpoint()
3817            .with_context(|| "closing frankensqlite connection without final checkpoint")
3818    }
3819
    /// Best-effort close through a mutable reference: first any parked writer connection,
    /// then the primary connection. Reports nothing to the caller.
    pub fn close_best_effort_in_place(&mut self) {
        self.close_cached_ephemeral_writer_best_effort_in_place();
        self.conn.close_best_effort_in_place();
    }
3824
3825    pub fn close_without_checkpoint_in_place(&mut self) -> Result<()> {
3826        self.close_cached_ephemeral_writer_without_checkpoint_in_place()?;
3827        self.conn
3828            .close_without_checkpoint_in_place()
3829            .with_context(|| "closing frankensqlite connection without final checkpoint")
3830    }
3831
    /// Borrow the underlying frankensqlite connection.
    ///
    /// The storage keeps ownership; use `into_raw` to take the connection out instead.
    pub fn raw(&self) -> &FrankenConnection {
        &self.conn
    }
3836
3837    /// Consume the storage wrapper and return the underlying frankensqlite
3838    /// connection after migrations/repair have already been applied.
3839    pub fn into_raw(self) -> FrankenConnection {
3840        let mut this = self;
3841        this.close_cached_ephemeral_writer_best_effort_in_place();
3842        this.conn
3843    }
3844
3845    /// Apply connection PRAGMAs for parity with SqliteStorage's `apply_pragmas()`.
3846    ///
3847    /// Frankensqlite supports all PRAGMAs cass uses (journal_mode, synchronous,
3848    /// cache_size, foreign_keys, busy_timeout). Its default journal_mode is already
3849    /// WAL and default synchronous is NORMAL, matching cass's requirements.
3850    ///
3851    pub fn apply_config(&self) -> Result<()> {
3852        // journal_mode: frankensqlite defaults to WAL, same as cass.
3853        // synchronous: frankensqlite defaults to NORMAL, same as cass.
3854        // Both are set explicitly for clarity.
3855        self.conn
3856            .execute("PRAGMA journal_mode = WAL;")
3857            .with_context(|| "setting journal_mode")?;
3858        self.conn
3859            .execute("PRAGMA synchronous = NORMAL;")
3860            .with_context(|| "setting synchronous")?;
3861
3862        // cache_size: 64MB (negative value = KiB).
3863        self.conn
3864            .execute("PRAGMA cache_size = -65536;")
3865            .with_context(|| "setting cache_size")?;
3866
3867        // foreign_keys: enable constraint enforcement.
3868        self.conn
3869            .execute("PRAGMA foreign_keys = ON;")
3870            .with_context(|| "setting foreign_keys")?;
3871
3872        // busy_timeout: 5 seconds (in milliseconds).
3873        self.conn
3874            .execute("PRAGMA busy_timeout = 5000;")
3875            .with_context(|| "setting busy_timeout")?;
3876
3877        // temp_store = MEMORY and mmap_size are C SQLite performance knobs.
3878        // In frankensqlite's architecture (in-memory MVCC engine with pager
3879        // backend), temp_store is always memory-resident and mmap_size does not
3880        // apply. Skipped intentionally — these are no-ops or errors.
3881
3882        // wal_autocheckpoint: use a bounded cadence that avoids checkpointing
3883        // inside common append batches without deferring checkpoints forever.
3884        let checkpoint_pragma =
3885            format!("PRAGMA wal_autocheckpoint = {DEFAULT_WAL_AUTOCHECKPOINT_PAGES};");
3886        let _ = self.conn.execute(&checkpoint_pragma);
3887        self.index_writer_checkpoint_pages
3888            .store(DEFAULT_WAL_AUTOCHECKPOINT_PAGES, Ordering::Relaxed);
3889        // Explicitly enable concurrent writer mode for BEGIN/transaction paths.
3890        // Try both namespace variants for compatibility across fsqlite builds.
3891        let _ = self.conn.execute("PRAGMA fsqlite.concurrent_mode = ON;");
3892        let _ = self.conn.execute("PRAGMA concurrent_mode = ON;");
3893        // Frankensqlite retained autocommit currently mis-serves same-connection
3894        // read-after-write queries on cass's storage paths; keep it off here
3895        // until the upstream visibility bug is fixed.
3896        //
3897        // CASS #163 item 3: If neither PRAGMA variant succeeds, the MVCC engine
3898        // will accumulate write snapshots for the lifetime of the connection,
3899        // causing unbounded memory growth on long-lived watch-mode handles.
3900        // Log at warn level so the failure is visible instead of silently
3901        // swallowed, and set a flag for callers that need to periodically
3902        // recycle the connection.
3903        let autocommit_pragma =
3904            disable_autocommit_retain(|pragma| self.conn.execute(pragma).map(|_| ()))?;
3905        tracing::debug!(
3906            pragma = autocommit_pragma,
3907            "disabled frankensqlite autocommit_retain for storage connection"
3908        );
3909
3910        Ok(())
3911    }
3912
3913    fn apply_readonly_config(&self) -> Result<()> {
3914        self.conn
3915            .execute("PRAGMA query_only = 1;")
3916            .with_context(|| "setting query_only")?;
3917        self.conn
3918            .execute("PRAGMA busy_timeout = 5000;")
3919            .with_context(|| "setting busy_timeout")?;
3920        self.conn
3921            .execute("PRAGMA cache_size = -65536;")
3922            .with_context(|| "setting cache_size")?;
3923        self.conn
3924            .execute("PRAGMA foreign_keys = ON;")
3925            .with_context(|| "setting foreign_keys")?;
3926        Ok(())
3927    }
3928
3929    /// Run all schema migrations, handling transition from meta table versioning.
3930    ///
3931    /// The existing `SqliteStorage` tracks schema version in a `meta` table entry.
3932    /// The new `MigrationRunner` uses a `_schema_migrations` table. This method:
3933    /// 1. Transitions existing databases from meta table → `_schema_migrations`
3934    /// 2. Runs pending migrations via `MigrationRunner`
3935    /// 3. Syncs `meta.schema_version` for backward compatibility
3936    ///
3937    /// # Fresh vs existing databases
3938    ///
3939    /// Fresh databases use a single combined migration (`MIGRATION_FRESH_SCHEMA`)
3940    /// that creates the complete V13 schema directly. This avoids the incremental
3941    /// V5 migration which uses `DROP TABLE` — an operation that triggers a known
3942    /// frankensqlite autoindex limitation.
3943    ///
3944    /// Existing databases (transitioned from SqliteStorage) are typically at
3945    /// V13 or newer already; additive post-V13 migrations are applied normally.
3946    pub fn run_migrations(&self) -> Result<()> {
3947        transition_from_meta_version(&self.conn)?;
3948
3949        let base_result = build_cass_migrations_before_tail_cache()
3950            .run(&self.conn)
3951            .with_context(|| "running base schema migrations")?;
3952
3953        let mut applied = base_result.applied;
3954        if apply_conversation_tail_state_cache_migration(&self.conn)
3955            .with_context(|| "running conversation tail-state cache migration")?
3956        {
3957            applied.push(15);
3958        }
3959
3960        let post_result = build_cass_migrations_after_tail_cache()
3961            .run(&self.conn)
3962            .with_context(|| "running post-tail-cache schema migrations")?;
3963        applied.extend(post_result.applied);
3964
3965        let current = self.schema_version()?;
3966        if !applied.is_empty() {
3967            info!(
3968                applied = ?applied,
3969                current,
3970                was_fresh = base_result.was_fresh,
3971                "frankensqlite schema migrations applied"
3972            );
3973        }
3974
3975        // Keep meta.schema_version in sync for backward compatibility.
3976        self.sync_meta_schema_version(current)?;
3977
3978        Ok(())
3979    }
3980
3981    /// Some historical canonical rebuild paths produced databases whose
3982    /// version markers claim the current schema while post-V10 analytics
3983    /// tables were never materialized. Detect that drift and backfill the
3984    /// idempotent table/index set from the combined schema migration.
3985    fn repair_missing_current_schema_objects(&self) -> Result<()> {
3986        let mut missing_tables = Vec::new();
3987        for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
3988            if let Err(err) = self.conn.query(probe_sql) {
3989                if error_indicates_missing_table(&err) {
3990                    missing_tables.push(table_name);
3991                    continue;
3992                }
3993                return Err(err).with_context(|| {
3994                    format!("probing required schema table {table_name} for completeness")
3995                });
3996            }
3997        }
3998
3999        if !missing_tables.is_empty() {
4000            info!(
4001                missing_tables = ?missing_tables,
4002                "repairing missing current-schema tables on an already-versioned cass database"
4003            );
4004
4005            for batch in current_schema_repair_batches_for_missing_tables(&missing_tables)? {
4006                self.conn
4007                    .execute_batch(batch.sql)
4008                    .with_context(|| format!("repairing current-schema batch {}", batch.name))?;
4009            }
4010
4011            for &(table_name, probe_sql) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES {
4012                if !missing_tables.contains(&table_name) {
4013                    continue;
4014                }
4015                self.conn
4016                    .query(probe_sql)
4017                    .with_context(|| format!("verifying repaired schema table {table_name}"))?;
4018            }
4019        }
4020        self.repair_missing_conversation_token_columns()?;
4021        Ok(())
4022    }
4023
4024    fn repair_missing_conversation_token_columns(&self) -> Result<()> {
4025        let columns = franken_table_column_names(&self.conn, "conversations")
4026            .with_context(|| "inspecting conversations columns for token-summary repair")?;
4027        let mut missing_columns = Vec::new();
4028        for &(column_name, column_type) in REQUIRED_CONVERSATION_TOKEN_COLUMNS {
4029            if columns.contains(column_name) {
4030                continue;
4031            }
4032            let sql = format!("ALTER TABLE conversations ADD COLUMN {column_name} {column_type};");
4033            self.conn.execute(&sql).with_context(|| {
4034                format!("adding missing conversations.{column_name} token-summary column")
4035            })?;
4036            missing_columns.push(column_name);
4037        }
4038        if !missing_columns.is_empty() {
4039            tracing::warn!(
4040                target: "cass::schema_repair",
4041                db_path = %self.db_path.display(),
4042                missing_columns = ?missing_columns,
4043                "cass#222: repaired missing conversations token-summary columns"
4044            );
4045        }
4046        Ok(())
4047    }
4048
4049    /// Detect and remove orphan rows whose FK parent has gone missing.
4050    ///
4051    /// A `Connection` dropped mid-transaction (the `drop_close` warning emitted
4052    /// by frankensqlite's `Drop` impl) can leave child rows persisted without a
4053    /// matching parent — `messages` referencing a `conversation_id` that does
4054    /// not exist, `message_metrics`/`token_usage`/`snippets` referencing a
4055    /// `message_id` that does not exist, etc. With `PRAGMA foreign_keys = ON`,
4056    /// every subsequent indexer pass then trips `FOREIGN KEY constraint failed`
4057    /// on the next write, the session never gets marked indexed, and the
4058    /// pending backlog grows without bound (issue #202).
4059    ///
4060    /// This pass runs at indexer startup as defense in depth: it scans each
4061    /// child table for rows whose parent row has gone missing and removes them
4062    /// in bounded committed chunks, breaking the failure cycle even when the
4063    /// underlying transaction-discipline bug has not been fully root-caused.
4064    /// The pass is idempotent (a clean database is a no-op), and emits a
4065    /// `WARN` after successful cleanup so the upstream `drop_close` condition
4066    /// stays visible.
4067    pub(crate) fn cleanup_orphan_fk_rows(&self) -> Result<OrphanFkCleanupReport> {
4068        let mut report = OrphanFkCleanupReport::default();
4069        let orphan_message_ids = match collect_orphan_message_ids(&self.conn) {
4070            Ok(ids) => ids,
4071            Err(err) if error_indicates_missing_table(&err) => {
4072                tracing::debug!(
4073                    target: "cass::fk_repair",
4074                    child_table = "messages",
4075                    error = %err,
4076                    "skipping orphan-message probe (table or column unavailable)"
4077                );
4078                Vec::new()
4079            }
4080            Err(err) => return Err(err),
4081        };
4082        if !orphan_message_ids.is_empty() {
4083            report.record("messages", orphan_message_ids.len() as i64);
4084        }
4085
4086        if !orphan_message_ids.is_empty() {
4087            delete_orphan_message_ids_bisecting_oom(&self.conn, &orphan_message_ids)
4088                .context("deleting orphan message rows and dependent children")?;
4089        }
4090
4091        for entry in ORPHAN_DIRECT_CHILD_TABLES {
4092            loop {
4093                let ids = match collect_direct_orphan_id_page(&self.conn, entry) {
4094                    Ok(ids) => ids,
4095                    Err(err)
4096                        if error_indicates_missing_table(&err)
4097                            || error_indicates_missing_column(&err) =>
4098                    {
4099                        // Tolerant probe: a missing child/parent table or FK
4100                        // column on older schemas means there is nothing to
4101                        // clean up for this table.
4102                        tracing::debug!(
4103                            target: "cass::fk_repair",
4104                            child_table = entry.child_table,
4105                            error = %err,
4106                            "skipping orphan probe (table or column unavailable)"
4107                        );
4108                        break;
4109                    }
4110                    Err(err) => {
4111                        return Err(err).with_context(|| {
4112                            format!("probing orphan rows in {}", entry.child_table)
4113                        });
4114                    }
4115                };
4116                if ids.is_empty() {
4117                    break;
4118                }
4119
4120                let deleted = delete_direct_orphan_ids_bisecting_oom(&self.conn, entry, &ids)
4121                    .with_context(|| format!("deleting orphan rows from {}", entry.child_table))?;
4122                if deleted == 0 {
4123                    break;
4124                }
4125                report.record(
4126                    entry.child_table,
4127                    i64::try_from(deleted).unwrap_or(i64::MAX),
4128                );
4129            }
4130        }
4131
4132        if report.total == 0 {
4133            return Ok(report);
4134        }
4135
4136        // WARN only fires after a successful commit so the message accurately
4137        // reflects what actually happened on disk. db_path is included so logs
4138        // from concurrent indexers against different databases stay
4139        // disambiguated.
4140        tracing::warn!(
4141            target: "cass::fk_repair",
4142            db_path = %self.db_path.display(),
4143            total_orphans = report.total,
4144            per_table = ?report.per_table,
4145            "cass#202: removed orphan rows left behind by interrupted index transactions"
4146        );
4147
4148        Ok(report)
4149    }
4150
4151    /// Return the current schema version from `_schema_migrations`.
4152    pub fn schema_version(&self) -> Result<i64> {
4153        let rows = self
4154            .conn
4155            .query("SELECT MAX(version) FROM _schema_migrations;")
4156            .with_context(|| "reading schema version from _schema_migrations")?;
4157
4158        if let Some(row) = rows.first()
4159            && let Ok(v) = row.get_typed::<Option<i64>>(0)
4160        {
4161            return Ok(v.unwrap_or(0));
4162        }
4163        Ok(0)
4164    }
4165
4166    /// Keep `meta.schema_version` in sync for backward compatibility with `SqliteStorage`.
4167    fn sync_meta_schema_version(&self, version: i64) -> Result<()> {
4168        // The meta table is created by V1 migration. If it doesn't exist yet,
4169        // there's nothing to sync.
4170        if self.conn.query("SELECT key FROM meta LIMIT 1;").is_err() {
4171            return Ok(());
4172        }
4173
4174        // Only write if the version needs updating to avoid write lock contention
4175        if let Ok(rows) = self
4176            .conn
4177            .query("SELECT value FROM meta WHERE key = 'schema_version';")
4178            && let Some(row) = rows.first()
4179            && let Ok(val) = row.get_typed::<String>(0)
4180            && val == version.to_string()
4181        {
4182            return Ok(()); // Already up to date
4183        }
4184
4185        self.conn
4186            .execute_compat(
4187                "INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', ?1);",
4188                &[ParamValue::from(version.to_string())],
4189            )
4190            .with_context(|| "syncing meta schema_version")?;
4191
4192        Ok(())
4193    }
4194
4195    /// Resolve the database file path for this connection.
4196    pub fn database_path(&self) -> Result<PathBuf> {
4197        Ok(self.db_path.clone())
4198    }
4199
4200    pub(crate) fn ephemeral_writer_preflight_verified(&self) -> bool {
4201        self.ephemeral_writer_preflight_verified
4202            .load(Ordering::Relaxed)
4203    }
4204
4205    pub(crate) fn mark_ephemeral_writer_preflight_verified(&self) {
4206        self.ephemeral_writer_preflight_verified
4207            .store(true, Ordering::Relaxed);
4208    }
4209
4210    pub(crate) fn index_writer_checkpoint_pages(&self) -> Option<i64> {
4211        let pages = self.index_writer_checkpoint_pages.load(Ordering::Relaxed);
4212        (pages != UNSET_INDEX_WRITER_CHECKPOINT_PAGES).then_some(pages)
4213    }
4214
4215    pub(crate) fn mark_index_writer_checkpoint_pages(&self, pages: i64) {
4216        self.index_writer_checkpoint_pages
4217            .store(pages, Ordering::Relaxed);
4218    }
4219
4220    pub(crate) fn index_writer_busy_timeout_ms(&self) -> Option<u64> {
4221        let timeout_ms = self.index_writer_busy_timeout_ms.load(Ordering::Relaxed);
4222        (timeout_ms != UNSET_INDEX_WRITER_BUSY_TIMEOUT_MS).then_some(timeout_ms)
4223    }
4224
4225    pub(crate) fn mark_index_writer_busy_timeout_ms(&self, timeout_ms: u64) {
4226        self.index_writer_busy_timeout_ms
4227            .store(timeout_ms, Ordering::Relaxed);
4228    }
4229
4230    /// Open database with migration, backing up if schema is incompatible.
4231    pub fn open_or_rebuild(path: &Path) -> std::result::Result<Self, MigrationError> {
4232        if let Some(parent) = path.parent() {
4233            fs::create_dir_all(parent)?;
4234        }
4235
4236        if path.exists() {
4237            let check_result = check_schema_compatibility(path);
4238            match check_result {
4239                Ok(SchemaCheck::Compatible) | Ok(SchemaCheck::NeedsMigration) => {
4240                    // Continue with normal open
4241                }
4242                Ok(SchemaCheck::NeedsRebuild(reason)) => {
4243                    let backup_path = create_backup(path)?;
4244                    cleanup_old_backups(path, MAX_BACKUPS)?;
4245                    remove_database_files(path)?;
4246                    return Err(MigrationError::RebuildRequired {
4247                        reason,
4248                        backup_path,
4249                    });
4250                }
4251                Err(err) if schema_check_error_requires_rebuild(&err) => {
4252                    let backup_path = create_backup(path)?;
4253                    cleanup_old_backups(path, MAX_BACKUPS)?;
4254                    remove_database_files(path)?;
4255                    return Err(MigrationError::RebuildRequired {
4256                        reason: format!("Database appears corrupted: {err}"),
4257                        backup_path,
4258                    });
4259                }
4260                Err(err) => return Err(MigrationError::Database(err)),
4261            }
4262        }
4263
4264        let storage = Self::open(path).map_err(|e| MigrationError::Other(e.to_string()))?;
4265        Ok(storage)
4266    }
4267}
4268
4269// -------------------------------------------------------------------------
4270// Frankensqlite migration helpers
4271// -------------------------------------------------------------------------
4272
4273/// Build the `MigrationRunner` for the frankensqlite migration path.
4274///
4275/// Uses a single combined migration (version 13) that creates the complete
4276/// final schema in one step. This avoids the V5 `DROP TABLE conversations`
4277/// operation which triggers a known frankensqlite limitation: autoindex entries
4278/// in sqlite_master are not properly cleaned up during DROP TABLE, causing
4279/// "sqlite_master entry not found" errors.
4280///
4281/// For existing databases transitioned from SqliteStorage, the transition
4282/// function backfills `_schema_migrations`; post-V13 additive migrations then
4283/// run normally.
4284fn build_cass_migrations_before_tail_cache() -> MigrationRunner {
4285    MigrationRunner::new()
4286        .add(13, "full_schema_v13", MIGRATION_FRESH_SCHEMA)
4287        .add(14, "fts_contentless", MIGRATION_V14)
4288}
4289
4290fn build_cass_migrations_after_tail_cache() -> MigrationRunner {
4291    MigrationRunner::new()
4292        .add(16, "drop_redundant_message_conv_idx", MIGRATION_V16)
4293        .add(17, "drop_message_created_idx", MIGRATION_V17)
4294        .add(18, "conversation_tail_state_hot_table", MIGRATION_V18)
4295        .add(19, "conversation_external_lookup", MIGRATION_V19)
4296        .add(20, "conversation_external_tail_lookup", MIGRATION_V20)
4297}
4298
4299fn schema_migration_is_applied(conn: &FrankenConnection, version: i64) -> Result<bool> {
4300    let rows = conn
4301        .query_with_params(
4302            "SELECT 1 FROM _schema_migrations WHERE version = ?1 LIMIT 1;",
4303            &[SqliteValue::from(version)],
4304        )
4305        .with_context(|| format!("checking schema migration version {version}"))?;
4306    Ok(!rows.is_empty())
4307}
4308
4309fn apply_conversation_tail_state_cache_migration(conn: &FrankenConnection) -> Result<bool> {
4310    conn.execute("BEGIN IMMEDIATE;")
4311        .with_context(|| "starting v15 conversation tail-state migration transaction")?;
4312
4313    let result = (|| -> Result<bool> {
4314        if schema_migration_is_applied(conn, 15)? {
4315            conn.execute("COMMIT;")
4316                .with_context(|| "committing already-applied v15 migration transaction")?;
4317            return Ok(false);
4318        }
4319
4320        let started = Instant::now();
4321        let conversation_columns = franken_table_column_names(conn, "conversations")
4322            .with_context(|| "inspecting conversations columns before v15 migration")?;
4323        if !conversation_columns.contains("last_message_idx") {
4324            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_idx INTEGER;")
4325                .with_context(|| "adding v15 conversations.last_message_idx column")?;
4326        }
4327        if !conversation_columns.contains("last_message_created_at") {
4328            conn.execute("ALTER TABLE conversations ADD COLUMN last_message_created_at INTEGER;")
4329                .with_context(|| "adding v15 conversations.last_message_created_at column")?;
4330        }
4331        conn.execute_batch(MIGRATION_V15_TAIL_STATE_TABLE)
4332            .with_context(|| "applying v15 conversation tail-state table schema")?;
4333        conn.execute_compat(
4334            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
4335            fparams![15_i64, "conversation_tail_state_cache"],
4336        )
4337        .with_context(|| "recording v15 conversation tail-state migration")?;
4338        conn.execute("COMMIT;")
4339            .with_context(|| "committing v15 conversation tail-state migration")?;
4340        info!(
4341            elapsed_ms = started.elapsed().as_millis(),
4342            "applied v15 conversation tail-state cache migration"
4343        );
4344        Ok(true)
4345    })();
4346
4347    if result.is_err() {
4348        let _ = conn.execute("ROLLBACK;");
4349    }
4350
4351    result
4352}
4353
4354fn franken_table_column_names(
4355    conn: &FrankenConnection,
4356    table_name: &str,
4357) -> Result<HashSet<String>> {
4358    if !table_name
4359        .chars()
4360        .all(|c| c.is_ascii_alphanumeric() || c == '_')
4361    {
4362        return Err(anyhow!(
4363            "unsafe table name for PRAGMA table_info: {table_name}"
4364        ));
4365    }
4366
4367    conn.query_map_collect(
4368        &format!("PRAGMA table_info({table_name})"),
4369        fparams![],
4370        |row: &FrankenRow| row.get_typed::<String>(1),
4371    )
4372    .with_context(|| format!("reading PRAGMA table_info({table_name})"))
4373    .map(|columns| columns.into_iter().collect())
4374}
4375
/// Combined V13 schema for fresh databases.
///
/// Creates the complete final schema in a single migration, avoiding the
/// incremental V5 `DROP TABLE conversations` which triggers a frankensqlite
/// autoindex limitation. All columns from V1-V13 are included in their
/// respective CREATE TABLE statements; `conversations` additionally carries
/// the V15 `last_message_idx` / `last_message_created_at` columns (the inline
/// SQL comment on those columns explains why).
///
/// Every statement is written to be re-runnable: tables and indexes use
/// `IF NOT EXISTS`, and the seed rows (`sources` 'local', `model_pricing`)
/// use `INSERT OR IGNORE`.
///
/// Table creation order respects foreign key references: agents, workspaces
/// and sources are created before conversations, conversations before
/// messages, and messages before the tables that reference it (snippets,
/// token_usage, message_metrics).
const MIGRATION_FRESH_SCHEMA: &str = r"
-- Core tables (V1)
CREATE TABLE IF NOT EXISTS meta (
    key TEXT PRIMARY KEY,
    value TEXT NOT NULL
);

CREATE TABLE IF NOT EXISTS agents (
    id INTEGER PRIMARY KEY,
    slug TEXT NOT NULL UNIQUE,
    name TEXT NOT NULL,
    version TEXT,
    kind TEXT NOT NULL,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

CREATE TABLE IF NOT EXISTS workspaces (
    id INTEGER PRIMARY KEY,
    path TEXT NOT NULL UNIQUE,
    display_name TEXT
);

-- Sources (V4)
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);

-- Conversations: V1 base + V5 provenance + V7 metadata_bin + V10 token summary
CREATE TABLE IF NOT EXISTS conversations (
    id INTEGER PRIMARY KEY,
    agent_id INTEGER NOT NULL REFERENCES agents(id),
    workspace_id INTEGER REFERENCES workspaces(id),
    source_id TEXT NOT NULL DEFAULT 'local' REFERENCES sources(id),
    external_id TEXT,
    title TEXT,
    source_path TEXT NOT NULL,
    started_at INTEGER,
    ended_at INTEGER,
    approx_tokens INTEGER,
    metadata_json TEXT,
    origin_host TEXT,
    metadata_bin BLOB,
    total_input_tokens INTEGER,
    total_output_tokens INTEGER,
    total_cache_read_tokens INTEGER,
    total_cache_creation_tokens INTEGER,
    grand_total_tokens INTEGER,
    estimated_cost_usd REAL,
    primary_model TEXT,
    api_call_count INTEGER,
    tool_call_count INTEGER,
    user_message_count INTEGER,
    assistant_message_count INTEGER,
    -- V15 columns are included in the fresh schema so fresh DB creation does
    -- not need ALTER TABLE on conversations. That ALTER path can duplicate
    -- provenance autoindex state in frankensqlite when the named unique
    -- provenance index already exists.
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

-- Named unique index avoids autoindex issues if table is ever recreated
CREATE UNIQUE INDEX IF NOT EXISTS idx_conversations_provenance
    ON conversations(source_id, agent_id, external_id);

-- Messages: V1 base + V7 extra_bin
CREATE TABLE IF NOT EXISTS messages (
    id INTEGER PRIMARY KEY,
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    idx INTEGER NOT NULL,
    role TEXT NOT NULL,
    author TEXT,
    created_at INTEGER,
    content TEXT NOT NULL,
    extra_json TEXT,
    extra_bin BLOB,
    UNIQUE(conversation_id, idx)
);

CREATE TABLE IF NOT EXISTS snippets (
    id INTEGER PRIMARY KEY,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    file_path TEXT,
    start_line INTEGER,
    end_line INTEGER,
    language TEXT,
    snippet_text TEXT
);

CREATE TABLE IF NOT EXISTS tags (
    id INTEGER PRIMARY KEY,
    name TEXT NOT NULL UNIQUE
);

CREATE TABLE IF NOT EXISTS conversation_tags (
    conversation_id INTEGER NOT NULL REFERENCES conversations(id) ON DELETE CASCADE,
    tag_id INTEGER NOT NULL REFERENCES tags(id) ON DELETE CASCADE,
    PRIMARY KEY (conversation_id, tag_id)
);

-- Daily stats (V8)
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

-- Embedding jobs (V9)
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');

-- Token usage ledger (V10)
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

-- Token daily stats (V10)
CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

-- Model pricing (V10)
CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');

-- Message metrics: V11 base + V12 model dimensions
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

-- Hourly rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

-- Daily rollups: V11 base + V13 plan columns
CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

-- Model daily rollups (V12)
CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

-- All indexes
CREATE INDEX IF NOT EXISTS idx_conversations_agent_started ON conversations(agent_id, started_at DESC);
CREATE INDEX IF NOT EXISTS idx_conversations_source_id ON conversations(source_id);
CREATE INDEX IF NOT EXISTS idx_conversations_source_path ON conversations(source_path);
CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
4745
/// One named batch of schema-repair SQL together with a list of table names.
///
/// NOTE(review): the field descriptions below are inferred from the field
/// names and the neighbouring `CURRENT_SCHEMA_REPAIR_*_SQL` constants —
/// confirm against the code that builds and consumes these batches.
#[derive(Clone, Copy)]
struct SchemaRepairBatch {
    /// Short label for the batch (presumably used in logs and error context).
    name: &'static str,
    /// Table names associated with the batch (presumably the tables whose
    /// absence triggers it — TODO confirm).
    tables: &'static [&'static str],
    /// SQL text belonging to the batch.
    sql: &'static str,
}
4752
/// Repair SQL for the `sources` table.
///
/// Creates the table when it is absent (`IF NOT EXISTS`) and seeds the
/// `'local'` source row with `INSERT OR IGNORE`, so running it against a
/// database that already has both is a no-op. The column list matches the
/// `sources` definition in `MIGRATION_FRESH_SCHEMA`.
const CURRENT_SCHEMA_REPAIR_SOURCES_SQL: &str = r"
CREATE TABLE IF NOT EXISTS sources (
    id TEXT PRIMARY KEY,
    kind TEXT NOT NULL,
    host_label TEXT,
    machine_id TEXT,
    platform TEXT,
    config_json TEXT,
    created_at INTEGER NOT NULL,
    updated_at INTEGER NOT NULL
);

INSERT OR IGNORE INTO sources (id, kind, host_label, created_at, updated_at)
VALUES ('local', 'local', NULL, strftime('%s','now')*1000, strftime('%s','now')*1000);
";
4768
/// Repair SQL for the `daily_stats` table and its two indexes
/// (`idx_daily_stats_agent`, `idx_daily_stats_source`).
///
/// Every statement uses `IF NOT EXISTS`, so objects that are already present
/// are left untouched. No rows are inserted.
const CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    session_count INTEGER NOT NULL DEFAULT 0,
    message_count INTEGER NOT NULL DEFAULT 0,
    total_chars INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id)
);

CREATE INDEX IF NOT EXISTS idx_daily_stats_agent ON daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_daily_stats_source ON daily_stats(source_id, day_id);
";
4784
/// Repair SQL for the `conversation_external_lookup` table.
///
/// Creates the table when it is absent, then (re)populates it with
/// `INSERT OR REPLACE` from every `conversations` row whose `external_id` is
/// not NULL. The lookup key is the length-prefixed concatenation
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`,
/// mapped to the conversation's `id`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_external_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL
);

INSERT OR REPLACE INTO conversation_external_lookup (lookup_key, conversation_id)
SELECT
    CAST(length(source_id) AS TEXT) || ':' || source_id || ':' ||
    CAST(agent_id AS TEXT) || ':' ||
    CAST(length(external_id) AS TEXT) || ':' || external_id,
    id
FROM conversations
WHERE external_id IS NOT NULL;
";
4800
/// Repair SQL for the `conversation_external_tail_lookup` table.
///
/// Ensures both `conversation_tail_state` and
/// `conversation_external_tail_lookup` exist, then (re)populates the lookup
/// table with `INSERT OR REPLACE` from every `conversations` row whose
/// `external_id` is not NULL. The key uses the length-prefixed format
/// `<length(source_id)>:<source_id>:<agent_id>:<length(external_id)>:<external_id>`.
/// Tail columns come from a `LEFT JOIN` on `conversation_tail_state`, so a
/// conversation without a tail-state row is inserted with NULL
/// `ended_at` / `last_message_idx` / `last_message_created_at`.
const CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL: &str = r"
CREATE TABLE IF NOT EXISTS conversation_tail_state (
    conversation_id INTEGER PRIMARY KEY,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

CREATE TABLE IF NOT EXISTS conversation_external_tail_lookup (
    lookup_key TEXT PRIMARY KEY,
    conversation_id INTEGER NOT NULL,
    ended_at INTEGER,
    last_message_idx INTEGER,
    last_message_created_at INTEGER
);

INSERT OR REPLACE INTO conversation_external_tail_lookup (
    lookup_key,
    conversation_id,
    ended_at,
    last_message_idx,
    last_message_created_at
)
SELECT
    CAST(length(c.source_id) AS TEXT) || ':' || c.source_id || ':' ||
    CAST(c.agent_id AS TEXT) || ':' ||
    CAST(length(c.external_id) AS TEXT) || ':' || c.external_id,
    c.id,
    ts.ended_at,
    ts.last_message_idx,
    ts.last_message_created_at
FROM conversations c
LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
WHERE c.external_id IS NOT NULL;
";
4836
/// Repair SQL for the `embedding_jobs` table.
///
/// Creates the table when absent, plus a partial unique index on
/// `(db_path, model_id)` that only covers rows whose `status` is `'pending'`
/// or `'running'`.
const CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS embedding_jobs (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    db_path TEXT NOT NULL,
    model_id TEXT NOT NULL,
    status TEXT NOT NULL DEFAULT 'pending',
    total_docs INTEGER NOT NULL DEFAULT 0,
    completed_docs INTEGER NOT NULL DEFAULT 0,
    error_message TEXT,
    created_at TEXT NOT NULL DEFAULT (datetime('now')),
    started_at TEXT,
    completed_at TEXT
);

CREATE UNIQUE INDEX IF NOT EXISTS idx_embedding_jobs_active
ON embedding_jobs(db_path, model_id)
WHERE status IN ('pending', 'running');
";
4855
/// Repair SQL for the token-analytics tables.
///
/// Creates `token_usage`, `token_daily_stats` and `model_pricing` (and their
/// indexes) when absent. `model_pricing` is seeded with `INSERT OR IGNORE`, so
/// rows that already exist for a `(model_pattern, effective_date)` key are
/// left untouched.
const CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS token_usage (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
    message_id INTEGER NOT NULL REFERENCES messages(id) ON DELETE CASCADE,
    conversation_id INTEGER NOT NULL,
    agent_id INTEGER NOT NULL,
    workspace_id INTEGER,
    source_id TEXT NOT NULL DEFAULT 'local',
    timestamp_ms INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    model_name TEXT,
    model_family TEXT,
    model_tier TEXT,
    service_tier TEXT,
    provider TEXT,
    input_tokens INTEGER,
    output_tokens INTEGER,
    cache_read_tokens INTEGER,
    cache_creation_tokens INTEGER,
    thinking_tokens INTEGER,
    total_tokens INTEGER,
    estimated_cost_usd REAL,
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    data_source TEXT NOT NULL DEFAULT 'api',
    UNIQUE(message_id)
);

CREATE INDEX IF NOT EXISTS idx_token_usage_day ON token_usage(day_id, agent_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_conv ON token_usage(conversation_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_model ON token_usage(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_workspace ON token_usage(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_token_usage_timestamp ON token_usage(timestamp_ms);

CREATE TABLE IF NOT EXISTS token_daily_stats (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    source_id TEXT NOT NULL DEFAULT 'all',
    model_family TEXT NOT NULL DEFAULT 'all',
    api_call_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_message_count INTEGER NOT NULL DEFAULT 0,
    total_input_tokens INTEGER NOT NULL DEFAULT 0,
    total_output_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_read_tokens INTEGER NOT NULL DEFAULT 0,
    total_cache_creation_tokens INTEGER NOT NULL DEFAULT 0,
    total_thinking_tokens INTEGER NOT NULL DEFAULT 0,
    grand_total_tokens INTEGER NOT NULL DEFAULT 0,
    total_content_chars INTEGER NOT NULL DEFAULT 0,
    total_tool_calls INTEGER NOT NULL DEFAULT 0,
    estimated_cost_usd REAL NOT NULL DEFAULT 0.0,
    session_count INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL,
    PRIMARY KEY (day_id, agent_slug, source_id, model_family)
);

CREATE INDEX IF NOT EXISTS idx_token_daily_stats_agent ON token_daily_stats(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_token_daily_stats_model ON token_daily_stats(model_family, day_id);

CREATE TABLE IF NOT EXISTS model_pricing (
    model_pattern TEXT NOT NULL,
    provider TEXT NOT NULL,
    input_cost_per_mtok REAL NOT NULL,
    output_cost_per_mtok REAL NOT NULL,
    cache_read_cost_per_mtok REAL,
    cache_creation_cost_per_mtok REAL,
    effective_date TEXT NOT NULL,
    PRIMARY KEY (model_pattern, effective_date)
);

INSERT OR IGNORE INTO model_pricing VALUES
    ('claude-opus-4%', 'anthropic', 15.0, 75.0, 1.5, 18.75, '2025-10-01'),
    ('claude-sonnet-4%', 'anthropic', 3.0, 15.0, 0.3, 3.75, '2025-10-01'),
    ('claude-haiku-4%', 'anthropic', 0.80, 4.0, 0.08, 1.0, '2025-10-01'),
    ('gpt-4o%', 'openai', 2.50, 10.0, NULL, NULL, '2025-01-01'),
    ('gpt-4-turbo%', 'openai', 10.0, 30.0, NULL, NULL, '2024-04-01'),
    ('gpt-4.1%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o3%', 'openai', 2.0, 8.0, NULL, NULL, '2025-04-01'),
    ('o4-mini%', 'openai', 1.10, 4.40, NULL, NULL, '2025-04-01'),
    ('gemini-2%flash%', 'google', 0.075, 0.30, NULL, NULL, '2025-01-01'),
    ('gemini-2%pro%', 'google', 1.25, 10.0, NULL, NULL, '2025-01-01');
";
4941
/// Repair SQL for the per-message metrics table and its rollups.
///
/// Creates `message_metrics`, `usage_hourly`, `usage_daily` and
/// `usage_models_daily` when absent, followed by the secondary indexes on each
/// of them. Every statement is `IF NOT EXISTS`, so existing objects are left
/// as they are; no rows are inserted.
const CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL: &str = r"
CREATE TABLE IF NOT EXISTS message_metrics (
    message_id INTEGER PRIMARY KEY REFERENCES messages(id) ON DELETE CASCADE,
    created_at_ms INTEGER NOT NULL,
    hour_id INTEGER NOT NULL,
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    role TEXT NOT NULL,
    content_chars INTEGER NOT NULL,
    content_tokens_est INTEGER NOT NULL,
    api_input_tokens INTEGER,
    api_output_tokens INTEGER,
    api_cache_read_tokens INTEGER,
    api_cache_creation_tokens INTEGER,
    api_thinking_tokens INTEGER,
    api_service_tier TEXT,
    api_data_source TEXT NOT NULL DEFAULT 'estimated',
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    has_tool_calls INTEGER NOT NULL DEFAULT 0,
    has_plan INTEGER NOT NULL DEFAULT 0,
    model_name TEXT,
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    provider TEXT NOT NULL DEFAULT 'unknown'
);

CREATE TABLE IF NOT EXISTS usage_hourly (
    hour_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (hour_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    plan_content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    plan_api_tokens_total INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id)
);

CREATE TABLE IF NOT EXISTS usage_models_daily (
    day_id INTEGER NOT NULL,
    agent_slug TEXT NOT NULL,
    workspace_id INTEGER NOT NULL DEFAULT 0,
    source_id TEXT NOT NULL DEFAULT 'local',
    model_family TEXT NOT NULL DEFAULT 'unknown',
    model_tier TEXT NOT NULL DEFAULT 'unknown',
    message_count INTEGER NOT NULL DEFAULT 0,
    user_message_count INTEGER NOT NULL DEFAULT 0,
    assistant_message_count INTEGER NOT NULL DEFAULT 0,
    tool_call_count INTEGER NOT NULL DEFAULT 0,
    plan_message_count INTEGER NOT NULL DEFAULT 0,
    api_coverage_message_count INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_total INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_user INTEGER NOT NULL DEFAULT 0,
    content_tokens_est_assistant INTEGER NOT NULL DEFAULT 0,
    api_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_input_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_output_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_read_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_cache_creation_tokens_total INTEGER NOT NULL DEFAULT 0,
    api_thinking_tokens_total INTEGER NOT NULL DEFAULT 0,
    last_updated INTEGER NOT NULL DEFAULT 0,
    PRIMARY KEY (day_id, agent_slug, workspace_id, source_id, model_family, model_tier)
);

CREATE INDEX IF NOT EXISTS idx_mm_hour ON message_metrics(hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_day ON message_metrics(day_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_hour ON message_metrics(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_agent_day ON message_metrics(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_workspace_hour ON message_metrics(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_source_hour ON message_metrics(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_mm_model_family_day ON message_metrics(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_mm_provider_day ON message_metrics(provider, day_id);
CREATE INDEX IF NOT EXISTS idx_uh_agent ON usage_hourly(agent_slug, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_workspace ON usage_hourly(workspace_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_uh_source ON usage_hourly(source_id, hour_id);
CREATE INDEX IF NOT EXISTS idx_ud_agent ON usage_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_workspace ON usage_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_ud_source ON usage_daily(source_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_model_day ON usage_models_daily(model_family, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_agent_day ON usage_models_daily(agent_slug, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_workspace_day ON usage_models_daily(workspace_id, day_id);
CREATE INDEX IF NOT EXISTS idx_umd_source_day ON usage_models_daily(source_id, day_id);
";
5067
/// Registry of current-schema repair batches.
///
/// Each entry pairs a batch `name` with the `tables` it provides and the `sql`
/// script that creates them. `current_schema_repair_batches_for_missing_tables`
/// walks this slice in declaration order and selects every batch that lists at
/// least one missing table, so a batch covering several tables is selected as
/// a whole even when only one of its tables is missing.
const CURRENT_SCHEMA_REPAIR_BATCHES: &[SchemaRepairBatch] = &[
    SchemaRepairBatch {
        name: "sources",
        tables: &["sources"],
        sql: CURRENT_SCHEMA_REPAIR_SOURCES_SQL,
    },
    SchemaRepairBatch {
        name: "daily_stats",
        tables: &["daily_stats"],
        sql: CURRENT_SCHEMA_REPAIR_DAILY_STATS_SQL,
    },
    SchemaRepairBatch {
        name: "conversation_external_lookup",
        tables: &["conversation_external_lookup"],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_LOOKUP_SQL,
    },
    // Two tables share one script because the lookup backfill joins against
    // `conversation_tail_state`.
    SchemaRepairBatch {
        name: "conversation_external_tail_lookup",
        tables: &[
            "conversation_tail_state",
            "conversation_external_tail_lookup",
        ],
        sql: CURRENT_SCHEMA_REPAIR_CONVERSATION_EXTERNAL_TAIL_LOOKUP_SQL,
    },
    SchemaRepairBatch {
        name: "embedding_jobs",
        tables: &["embedding_jobs"],
        sql: CURRENT_SCHEMA_REPAIR_EMBEDDING_JOBS_SQL,
    },
    SchemaRepairBatch {
        name: "token_analytics",
        tables: &["token_usage", "token_daily_stats", "model_pricing"],
        sql: CURRENT_SCHEMA_REPAIR_TOKEN_ANALYTICS_SQL,
    },
    SchemaRepairBatch {
        name: "message_rollups",
        tables: &[
            "message_metrics",
            "usage_hourly",
            "usage_daily",
            "usage_models_daily",
        ],
        sql: CURRENT_SCHEMA_REPAIR_MESSAGE_METRICS_SQL,
    },
];
5113
5114fn current_schema_repair_batches_for_missing_tables(
5115    missing_tables: &[&'static str],
5116) -> Result<Vec<&'static SchemaRepairBatch>> {
5117    let missing_set: HashSet<&'static str> = missing_tables.iter().copied().collect();
5118    let mut selected_batches = Vec::new();
5119    let mut covered_tables = HashSet::new();
5120
5121    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
5122        if !batch
5123            .tables
5124            .iter()
5125            .any(|table_name| missing_set.contains(table_name))
5126        {
5127            continue;
5128        }
5129        selected_batches.push(batch);
5130        covered_tables.extend(batch.tables.iter().copied());
5131    }
5132
5133    for &table_name in missing_tables {
5134        if !covered_tables.contains(table_name) {
5135            return Err(anyhow!(
5136                "no current-schema repair batch registered for missing table {table_name}"
5137            ));
5138        }
5139    }
5140
5141    Ok(selected_batches)
5142}
5143
/// Migration name lookup for backfilling `_schema_migrations` during transition.
///
/// Entries are `(version, name)` pairs listed in ascending version order;
/// `transition_from_meta_version` relies on that ordering to stop iterating at
/// the first version above the backfill target.
const MIGRATION_NAMES: [(i64, &str); 20] = [
    (1, "core_tables"),
    (2, "fts_messages"),
    (3, "fts_messages_rebuild"),
    (4, "sources"),
    (5, "provenance_columns"),
    (6, "source_path_index"),
    (7, "msgpack_columns"),
    (8, "daily_stats"),
    (9, "embedding_jobs"),
    (10, "token_analytics"),
    (11, "message_metrics"),
    (12, "model_dimensions"),
    (13, "plan_token_rollups"),
    (14, "fts_contentless"),
    (15, "conversation_tail_state_cache"),
    (16, "drop_redundant_message_conv_idx"),
    (17, "drop_message_created_idx"),
    (18, "conversation_tail_state_hot_table"),
    (19, "conversation_external_lookup"),
    (20, "conversation_external_tail_lookup"),
];
5167
5168/// Transitions an existing database from `meta` table schema versioning to the
5169/// `_schema_migrations` table used by `MigrationRunner`.
5170///
5171/// The existing `SqliteStorage` tracks schema version as a string value in
5172/// `meta WHERE key = 'schema_version'`. The bead spec references
5173/// `PRAGMA user_version`, but the actual cass code uses the `meta` table.
5174/// This function handles the real code path.
5175///
5176/// Behavior:
5177/// - If `_schema_migrations` already exists → skip (already transitioned)
5178/// - If `meta` table has `schema_version > 0` → create `_schema_migrations`
5179///   and backfill entries for versions `1..=current_version`
5180/// - Legacy V10-V12 databases are represented as V13 in `_schema_migrations`
5181///   because frankensqlite uses one combined V13 base migration instead of
5182///   replaying the old incremental V11-V13 steps.
5183/// - If `meta` table missing or `schema_version = 0` with no tables → fresh DB,
5184///   let `MigrationRunner` handle it
5185/// - If `schema_version = 0` but tables exist → corrupted state, log warning
5186fn transition_from_meta_version(conn: &FrankenConnection) -> Result<()> {
5187    // Avoid sqlite_master enumeration here. Databases with FTS virtual tables
5188    // can trigger frankensqlite parse-recovery on sqlite_master reads, which is
5189    // enough to break the transition on otherwise-healthy legacy cass DBs.
5190    if conn
5191        .query("SELECT version FROM \"_schema_migrations\";")
5192        .is_ok()
5193    {
5194        return Ok(());
5195    }
5196
5197    // Check if the meta table exists.
5198    if conn.query("SELECT key FROM meta;").is_err() {
5199        // No meta table → fresh database, let MigrationRunner handle it.
5200        return Ok(());
5201    }
5202
5203    // Read the current schema version from the meta table.
5204    let rows = conn
5205        .query("SELECT value FROM meta WHERE key = 'schema_version';")
5206        .with_context(|| "reading schema_version from meta")?;
5207
5208    let current_version: i64 = rows
5209        .first()
5210        .and_then(|row| row.get_typed::<String>(0).ok())
5211        .and_then(|s| s.parse().ok())
5212        .unwrap_or(0);
5213
5214    if current_version == 0 {
5215        // Check if tables actually exist (corrupted state: tables present but version=0).
5216        if conn.query("SELECT id FROM conversations LIMIT 1;").is_err() {
5217            // Truly fresh DB (meta table exists but empty/reset). Let MigrationRunner handle it.
5218            return Ok(());
5219        }
5220
5221        // Tables exist but version=0: corrupted state. Log and skip transition;
5222        // MigrationRunner will fail on "table already exists" and surface the error.
5223        info!("meta.schema_version=0 but tables exist; skipping transition (corrupted state)");
5224        return Ok(());
5225    }
5226
5227    // Create _schema_migrations and backfill entries for all applied versions.
5228    info!(
5229        current_version,
5230        "transitioning schema tracking from meta table to _schema_migrations"
5231    );
5232
5233    conn.execute(
5234        "CREATE TABLE IF NOT EXISTS _schema_migrations (\
5235            version INTEGER PRIMARY KEY, \
5236            name TEXT NOT NULL, \
5237            applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))\
5238        );",
5239    )
5240    .with_context(|| "creating _schema_migrations table for transition")?;
5241
5242    let backfill_through_version = if (10..13).contains(&current_version) {
5243        13
5244    } else {
5245        current_version
5246    };
5247
5248    for &(version, name) in &MIGRATION_NAMES {
5249        if version > backfill_through_version {
5250            break;
5251        }
5252        conn.execute_compat(
5253            "INSERT INTO _schema_migrations (version, name) VALUES (?1, ?2);",
5254            &[ParamValue::from(version), ParamValue::from(name)],
5255        )
5256        .with_context(|| format!("backfilling _schema_migrations version {version}"))?;
5257    }
5258
5259    info!(
5260        current_version,
5261        backfill_through_version,
5262        "schema version transition complete: backfilled legacy meta schema versions"
5263    );
5264
5265    Ok(())
5266}
5267
/// `(table name, probe query)` pairs for tables the current schema requires.
///
/// Each probe is a single-row `SELECT ... LIMIT 1` against the named table.
/// The table names match those listed in `CURRENT_SCHEMA_REPAIR_BATCHES`.
// NOTE(review): presumably a failing probe marks the table as missing and
// feeds the repair-batch selection — confirm against the caller.
const REQUIRED_CURRENT_SCHEMA_TABLE_PROBES: &[(&str, &str)] = &[
    ("sources", "SELECT id FROM sources LIMIT 1;"),
    ("daily_stats", "SELECT day_id FROM daily_stats LIMIT 1;"),
    (
        "conversation_external_lookup",
        "SELECT lookup_key FROM conversation_external_lookup LIMIT 1;",
    ),
    (
        "conversation_tail_state",
        "SELECT conversation_id FROM conversation_tail_state LIMIT 1;",
    ),
    (
        "conversation_external_tail_lookup",
        "SELECT lookup_key, last_message_idx FROM conversation_external_tail_lookup LIMIT 1;",
    ),
    ("embedding_jobs", "SELECT id FROM embedding_jobs LIMIT 1;"),
    ("token_usage", "SELECT id FROM token_usage LIMIT 1;"),
    (
        "token_daily_stats",
        "SELECT day_id FROM token_daily_stats LIMIT 1;",
    ),
    (
        "model_pricing",
        "SELECT model_pattern FROM model_pricing LIMIT 1;",
    ),
    (
        "message_metrics",
        "SELECT message_id FROM message_metrics LIMIT 1;",
    ),
    ("usage_hourly", "SELECT hour_id FROM usage_hourly LIMIT 1;"),
    ("usage_daily", "SELECT day_id FROM usage_daily LIMIT 1;"),
    (
        "usage_models_daily",
        "SELECT day_id FROM usage_models_daily LIMIT 1;",
    ),
];
5304
/// `(column name, SQL type)` pairs for token-summary columns.
// NOTE(review): judging by the name these are columns expected on the
// `conversations` table, presumably added when missing — confirm against the
// code that consumes this list.
const REQUIRED_CONVERSATION_TOKEN_COLUMNS: &[(&str, &str)] = &[
    ("total_input_tokens", "INTEGER"),
    ("total_output_tokens", "INTEGER"),
    ("total_cache_read_tokens", "INTEGER"),
    ("total_cache_creation_tokens", "INTEGER"),
    ("grand_total_tokens", "INTEGER"),
    ("estimated_cost_usd", "REAL"),
    ("primary_model", "TEXT"),
    ("api_call_count", "INTEGER"),
    ("tool_call_count", "INTEGER"),
    ("user_message_count", "INTEGER"),
    ("assistant_message_count", "INTEGER"),
];
5318
/// Reports whether the rendered error text contains `no such table`,
/// compared ASCII case-insensitively.
fn error_indicates_missing_table(err: &impl std::fmt::Display) -> bool {
    const NEEDLE: &[u8] = b"no such table";
    let rendered = err.to_string();
    // The needle is pure ASCII, so a byte-window scan that ignores ASCII case
    // matches exactly what a lowercase-then-`contains` check would.
    rendered
        .as_bytes()
        .windows(NEEDLE.len())
        .any(|window| window.eq_ignore_ascii_case(NEEDLE))
}
5324
/// Reports whether the rendered error text contains `no such column`,
/// compared ASCII case-insensitively.
fn error_indicates_missing_column(err: &impl std::fmt::Display) -> bool {
    const NEEDLE: &[u8] = b"no such column";
    let rendered = err.to_string();
    // The needle is pure ASCII, so a byte-window scan that ignores ASCII case
    // matches exactly what a lowercase-then-`contains` check would.
    rendered
        .as_bytes()
        .windows(NEEDLE.len())
        .any(|window| window.eq_ignore_ascii_case(NEEDLE))
}
5330
/// Number of ids handled per batch during orphan-FK cleanup: it is the size of
/// each `IN (...)` placeholder list in chunked deletes and the `LIMIT` bound
/// passed to the orphan-id page queries.
const ORPHAN_FK_ID_CHUNK_SIZE: usize = 256;
5332
5333fn collect_orphan_message_ids(conn: &FrankenConnection) -> Result<Vec<i64>> {
5334    let min_conversation_id = conn
5335        .query_map_collect(
5336            "SELECT conversation_id
5337             FROM messages
5338             ORDER BY conversation_id ASC
5339             LIMIT 1",
5340            fparams![],
5341            |row| row.get_typed(0),
5342        )
5343        .context("finding minimum message conversation id for orphan FK cleanup")?
5344        .into_iter()
5345        .next();
5346    let Some(min_conversation_id) = min_conversation_id else {
5347        return Ok(Vec::new());
5348    };
5349    let max_conversation_id: i64 = conn
5350        .query_row_map(
5351            "SELECT conversation_id
5352             FROM messages
5353             ORDER BY conversation_id DESC
5354             LIMIT 1",
5355            fparams![],
5356            |row| row.get_typed(0),
5357        )
5358        .context("finding maximum message conversation id for orphan FK cleanup")?;
5359
5360    let parent_conversation_ids: Vec<i64> = conn
5361        .query_map_collect(
5362            "SELECT id
5363             FROM conversations
5364             WHERE id BETWEEN ?1 AND ?2
5365             ORDER BY id",
5366            fparams![min_conversation_id, max_conversation_id],
5367            |row| row.get_typed(0),
5368        )
5369        .context("listing parent conversation ids for orphan FK cleanup")?;
5370
5371    let mut message_ids = Vec::new();
5372    let mut gap_start = min_conversation_id;
5373    for parent_id in parent_conversation_ids {
5374        if parent_id < gap_start {
5375            continue;
5376        }
5377        if parent_id > max_conversation_id {
5378            break;
5379        }
5380        if gap_start < parent_id {
5381            collect_message_ids_for_conversation_gap(
5382                conn,
5383                gap_start,
5384                parent_id.saturating_sub(1),
5385                &mut message_ids,
5386            )?;
5387        }
5388        if parent_id == i64::MAX {
5389            return Ok(message_ids);
5390        }
5391        gap_start = parent_id + 1;
5392    }
5393    if gap_start <= max_conversation_id {
5394        collect_message_ids_for_conversation_gap(
5395            conn,
5396            gap_start,
5397            max_conversation_id,
5398            &mut message_ids,
5399        )?;
5400    }
5401
5402    Ok(message_ids)
5403}
5404
5405fn collect_message_ids_for_conversation_gap(
5406    conn: &FrankenConnection,
5407    gap_start: i64,
5408    gap_end: i64,
5409    message_ids: &mut Vec<i64>,
5410) -> Result<()> {
5411    let (sql, params) = if gap_start == gap_end {
5412        (
5413            "SELECT id FROM messages WHERE conversation_id = ?1",
5414            vec![SqliteValue::from(gap_start)],
5415        )
5416    } else {
5417        (
5418            "SELECT id FROM messages WHERE conversation_id BETWEEN ?1 AND ?2",
5419            vec![SqliteValue::from(gap_start), SqliteValue::from(gap_end)],
5420        )
5421    };
5422    let rows = conn.query_with_params(sql, &params).with_context(|| {
5423        format!("listing orphan message ids for conversation-id gap {gap_start}..={gap_end}")
5424    })?;
5425    message_ids.reserve(rows.len());
5426    for row in rows {
5427        message_ids.push(row.get_typed(0)?);
5428    }
5429    Ok(())
5430}
5431
5432fn delete_rows_by_i64_chunks(
5433    tx: &FrankenTransaction<'_>,
5434    delete_many_sql_prefix: &'static str,
5435    ids: &[i64],
5436) -> Result<usize> {
5437    if ids.is_empty() {
5438        return Ok(0);
5439    }
5440
5441    let full_chunk_sql = delete_rows_by_i64_sql(delete_many_sql_prefix, ORPHAN_FK_ID_CHUNK_SIZE);
5442    let tail_len = ids.len() % ORPHAN_FK_ID_CHUNK_SIZE;
5443    let tail_sql =
5444        (tail_len != 0).then(|| delete_rows_by_i64_sql(delete_many_sql_prefix, tail_len));
5445
5446    let mut deleted = 0;
5447    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5448        let sql = if chunk.len() == ORPHAN_FK_ID_CHUNK_SIZE {
5449            &full_chunk_sql
5450        } else {
5451            tail_sql.as_ref().unwrap_or(&full_chunk_sql)
5452        };
5453        let params = chunk
5454            .iter()
5455            .map(|id| SqliteValue::from(*id))
5456            .collect::<Vec<_>>();
5457        deleted += tx.execute_with_params(sql, &params)?;
5458    }
5459    Ok(deleted)
5460}
5461
5462fn delete_rows_by_i64_sql(delete_many_sql_prefix: &'static str, count: usize) -> String {
5463    let placeholders = sql_placeholders(count);
5464    format!("{delete_many_sql_prefix} ({placeholders})")
5465}
5466
/// Returns `count` `?` placeholders separated by `", "` (empty for zero).
fn sql_placeholders(count: usize) -> String {
    let mut list = String::new();
    for position in 0..count {
        if position > 0 {
            list.push_str(", ");
        }
        list.push('?');
    }
    list
}
5470
5471fn delete_orphan_message_ids_bisecting_oom(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5472    let mut deleted = 0usize;
5473    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5474        deleted = deleted.saturating_add(delete_orphan_message_id_chunk(conn, chunk)?);
5475    }
5476    Ok(deleted)
5477}
5478
5479fn delete_orphan_message_id_chunk(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5480    if ids.is_empty() {
5481        return Ok(0);
5482    }
5483
5484    match delete_orphan_message_id_chunk_once(conn, ids) {
5485        Ok(deleted) => Ok(deleted),
5486        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5487            let split_at = ids.len() / 2;
5488            tracing::warn!(
5489                target: "cass::fk_repair",
5490                rows = ids.len(),
5491                left = split_at,
5492                right = ids.len().saturating_sub(split_at),
5493                error = %err,
5494                "orphan-message cleanup ran out of memory; retrying as smaller batches"
5495            );
5496            let left = delete_orphan_message_id_chunk(conn, &ids[..split_at])?;
5497            let right = delete_orphan_message_id_chunk(conn, &ids[split_at..])?;
5498            Ok(left.saturating_add(right))
5499        }
5500        Err(err) => Err(err),
5501    }
5502}
5503
5504fn delete_orphan_message_id_chunk_once(conn: &FrankenConnection, ids: &[i64]) -> Result<usize> {
5505    let mut tx = conn.transaction()?;
5506    let mut deleted = 0usize;
5507    for entry in ORPHAN_MESSAGE_DEPENDENT_TABLES {
5508        match delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids) {
5509            Ok(count) => {
5510                deleted = deleted.saturating_add(count);
5511            }
5512            Err(err) if error_indicates_missing_table(&err) => {
5513                tracing::debug!(
5514                    target: "cass::fk_repair",
5515                    child_table = entry.child_table,
5516                    error = %err,
5517                    "skipping orphan-message dependent cleanup (table unavailable)"
5518                );
5519            }
5520            Err(err) => {
5521                return Err(err).with_context(|| {
5522                    format!(
5523                        "deleting rows from {} that depend on orphan messages",
5524                        entry.child_table
5525                    )
5526                });
5527            }
5528        }
5529    }
5530    deleted = deleted.saturating_add(
5531        delete_rows_by_i64_chunks(&tx, "DELETE FROM messages WHERE id IN", ids)
5532            .context("deleting orphan rows from messages")?,
5533    );
5534    tx.commit()?;
5535    Ok(deleted)
5536}
5537
5538fn collect_direct_orphan_id_page(
5539    conn: &FrankenConnection,
5540    entry: &'static OrphanFkTable,
5541) -> Result<Vec<i64>> {
5542    Ok(conn.query_map_collect(
5543        entry.orphan_id_page_sql,
5544        fparams![i64::try_from(ORPHAN_FK_ID_CHUNK_SIZE).unwrap_or(i64::MAX)],
5545        |row| row.get_typed(0),
5546    )?)
5547}
5548
5549fn delete_direct_orphan_ids_bisecting_oom(
5550    conn: &FrankenConnection,
5551    entry: &'static OrphanFkTable,
5552    ids: &[i64],
5553) -> Result<usize> {
5554    let mut deleted = 0usize;
5555    for chunk in ids.chunks(ORPHAN_FK_ID_CHUNK_SIZE) {
5556        deleted = deleted.saturating_add(delete_direct_orphan_id_chunk(conn, entry, chunk)?);
5557    }
5558    Ok(deleted)
5559}
5560
5561fn delete_direct_orphan_id_chunk(
5562    conn: &FrankenConnection,
5563    entry: &'static OrphanFkTable,
5564    ids: &[i64],
5565) -> Result<usize> {
5566    if ids.is_empty() {
5567        return Ok(0);
5568    }
5569
5570    match delete_direct_orphan_id_chunk_once(conn, entry, ids) {
5571        Ok(deleted) => Ok(deleted),
5572        Err(err) if is_out_of_memory_error(&err) && ids.len() > 1 => {
5573            let split_at = ids.len() / 2;
5574            tracing::warn!(
5575                target: "cass::fk_repair",
5576                child_table = entry.child_table,
5577                rows = ids.len(),
5578                left = split_at,
5579                right = ids.len().saturating_sub(split_at),
5580                error = %err,
5581                "direct orphan cleanup ran out of memory; retrying as smaller batches"
5582            );
5583            let left = delete_direct_orphan_id_chunk(conn, entry, &ids[..split_at])?;
5584            let right = delete_direct_orphan_id_chunk(conn, entry, &ids[split_at..])?;
5585            Ok(left.saturating_add(right))
5586        }
5587        Err(err) => Err(err),
5588    }
5589}
5590
5591fn delete_direct_orphan_id_chunk_once(
5592    conn: &FrankenConnection,
5593    entry: &'static OrphanFkTable,
5594    ids: &[i64],
5595) -> Result<usize> {
5596    let mut tx = conn.transaction()?;
5597    let deleted = delete_rows_by_i64_chunks(&tx, entry.delete_many_sql_prefix, ids)?;
5598    tx.commit()?;
5599    Ok(deleted)
5600}
5601
/// Tables whose FK parent rows can go missing when an index transaction is
/// dropped mid-flight. The select and delete SQL strings are intentionally
/// static (no dynamic table names) so they can be audited at a glance and so
/// they cannot be subverted by injected identifiers. The select statement
/// yields the integer FK key used by the matching chunked delete.
struct OrphanFkTable {
    // Child table name; used as a log field and in error context messages.
    child_table: &'static str,
    // Query returning orphaned FK ids, with the row limit bound as `?1`.
    orphan_id_page_sql: &'static str,
    // DELETE statement up to and including `IN`; the parenthesised placeholder
    // list is appended by `delete_rows_by_i64_sql`.
    delete_many_sql_prefix: &'static str,
}
5612
/// Child tables that are probed directly for rows whose parent is missing.
///
/// `message_metrics`, `token_usage` and `snippets` are checked against
/// `messages.id` through their `message_id` column; `conversation_tags` is
/// checked against `conversations.id` through `conversation_id`. Every page
/// query selects the orphaned key, orders by it, and caps the page with
/// `LIMIT ?1`; the paired delete prefix targets the same table and key column.
const ORPHAN_DIRECT_CHILD_TABLES: &[OrphanFkTable] = &[
    OrphanFkTable {
        child_table: "message_metrics",
        orphan_id_page_sql: "SELECT message_id FROM message_metrics \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = message_metrics.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "token_usage",
        orphan_id_page_sql: "SELECT message_id FROM token_usage \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = token_usage.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "snippets",
        orphan_id_page_sql: "SELECT message_id FROM snippets \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM messages \
                                 WHERE messages.id = snippets.message_id\
                             ) \
                             ORDER BY message_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
    OrphanFkTable {
        child_table: "conversation_tags",
        orphan_id_page_sql: "SELECT conversation_id FROM conversation_tags \
                             WHERE NOT EXISTS (\
                                 SELECT 1 FROM conversations \
                                 WHERE conversations.id = conversation_tags.conversation_id\
                             ) \
                             ORDER BY conversation_id \
                             LIMIT ?1",
        delete_many_sql_prefix: "DELETE FROM conversation_tags WHERE conversation_id IN",
    },
];
5659
/// A table whose rows reference a message through `message_id`, paired with
/// the static delete statement prefix used to remove those rows in bulk.
struct OrphanMessageDependentTable {
    /// Name of the dependent table.
    child_table: &'static str,
    /// Leading part of the delete statement, ending in `IN`
    /// (presumably completed with an id list by the caller — confirm at the use site).
    delete_many_sql_prefix: &'static str,
}
5664
/// The message-dependent tables (`message_metrics`, `token_usage`,
/// `snippets`), each with a delete prefix keyed on `message_id`.
const ORPHAN_MESSAGE_DEPENDENT_TABLES: &[OrphanMessageDependentTable] = &[
    OrphanMessageDependentTable {
        child_table: "message_metrics",
        delete_many_sql_prefix: "DELETE FROM message_metrics WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "token_usage",
        delete_many_sql_prefix: "DELETE FROM token_usage WHERE message_id IN",
    },
    OrphanMessageDependentTable {
        child_table: "snippets",
        delete_many_sql_prefix: "DELETE FROM snippets WHERE message_id IN",
    },
];
5679
/// Tally of the orphan rows found and removed by `cleanup_orphan_fk_rows`.
///
/// Counts for message roots are taken from the probe phase; counts for direct
/// child tables are taken from the bounded page deletes. The cleanup is meant
/// to run as a single indexer-startup pass while the index run lock is held,
/// so there are no concurrent writers and the counts equal the primary orphan
/// roots that were identified and removed. Rows that hang below an orphan
/// message (`message_metrics` / `token_usage` / `snippets`) disappear as a
/// consequence of removing that root and are *not* counted separately in
/// `total` or `per_table`.
#[derive(Debug, Default, Clone)]
pub(crate) struct OrphanFkCleanupReport {
    /// Saturating sum of every count passed to `record`.
    pub total: i64,
    /// One `(child_table, count)` entry per distinct table, in the order the
    /// tables were first recorded.
    pub per_table: Vec<(&'static str, i64)>,
}

impl OrphanFkCleanupReport {
    /// Adds `count` to the entry for `child_table` (creating the entry on
    /// first use) and to `total`. Both additions saturate instead of wrapping.
    fn record(&mut self, child_table: &'static str, count: i64) {
        let slot = self
            .per_table
            .iter()
            .position(|(table, _)| *table == child_table);
        match slot {
            Some(pos) => {
                let tally = &mut self.per_table[pos].1;
                *tally = tally.saturating_add(count);
            }
            None => self.per_table.push((child_table, count)),
        }
        self.total = self.total.saturating_add(count);
    }
}
5710
/// Outcome of inserting a conversation and its messages.
///
/// NOTE(review): the code that fills this struct is outside this section; the
/// field descriptions below are inferred from the names — confirm at the
/// producer.
pub struct InsertOutcome {
    /// Row id of the conversation the insert applied to.
    pub conversation_id: i64,
    /// Presumably `true` when a new conversation row was created rather than
    /// an existing one reused.
    pub conversation_inserted: bool,
    /// Presumably the `idx` values of the messages that were newly inserted.
    pub inserted_indices: Vec<i64>,
}
5716
/// Test-only timing breakdown of the message-insert stage.
///
/// The five durations are reported by
/// `InsertConversationTreePerfProfile::log_summary` as the `msg_*_ms` fields.
/// NOTE(review): the code that increments the counters is outside this
/// section; counter meanings are inferred from their names.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct MessageInsertSubstageProfile {
    /// Number of single-row insert calls (inferred from the name).
    single_row_calls: usize,
    /// Number of batched insert calls (inferred from the name).
    batch_calls: usize,
    /// Number of rows covered by batched inserts (inferred from the name).
    batch_rows: usize,
    /// Time reported as `msg_payload_ms`.
    payload_duration: Duration,
    /// Time reported as `msg_sql_ms`.
    sql_build_duration: Duration,
    /// Time reported as `msg_param_ms`.
    param_build_duration: Duration,
    /// Time reported as `msg_execute_ms`.
    execute_duration: Duration,
    /// Time reported as `msg_rowid_ms`.
    rowid_duration: Duration,
}
5729
/// Test-only accumulator of per-stage timings for conversation-tree inserts.
///
/// `log_summary` prints every field; each `*_duration` stage listed here
/// (except `total_duration`) is summed into the "accounted" time, and
/// whatever remains of `total_duration` is printed as `residual_ms`.
#[cfg(test)]
#[derive(Debug, Clone, Default)]
struct InsertConversationTreePerfProfile {
    /// Number of profiled calls; used as the divisor for the `avg_*` figures.
    invocations: usize,
    /// Message count, printed as `messages`.
    messages: usize,
    /// Inserted-message count, printed as `inserted_messages`.
    inserted_messages: usize,
    /// Overall time, printed as `total_ms`.
    total_duration: Duration,
    source_duration: Duration,
    tx_open_duration: Duration,
    existing_lookup_duration: Duration,
    existing_idx_lookup_duration: Duration,
    existing_replay_lookup_duration: Duration,
    dedupe_filter_duration: Duration,
    conversation_row_duration: Duration,
    message_insert_duration: Duration,
    /// Finer-grained split of the message-insert stage; printed as the
    /// `msg_*_ms` fields but not added to the accounted time.
    message_insert_breakdown: MessageInsertSubstageProfile,
    snippet_insert_duration: Duration,
    fts_entry_duration: Duration,
    fts_flush_duration: Duration,
    analytics_duration: Duration,
    commit_duration: Duration,
}
5752
#[cfg(test)]
impl InsertConversationTreePerfProfile {
    /// Converts `duration` to fractional milliseconds.
    fn millis(duration: Duration) -> f64 {
        duration.as_secs_f64() * 1000.0
    }

    /// Prints one `CASS_INSERT_TREE_STAGE_PROFILE` line to stderr.
    ///
    /// The line carries the totals for every stage, the message-insert
    /// sub-stage breakdown, the residual (total minus the sum of the
    /// top-level stages, saturating at zero) and per-call averages. Averages
    /// divide by `invocations`, treating zero invocations as one.
    fn log_summary(&self, label: &str) {
        let divisor = self.invocations.max(1) as f64;
        let per_call = |duration: Duration| Self::millis(duration) / divisor;
        let breakdown = &self.message_insert_breakdown;

        // Top-level stages only; the message-insert breakdown is a subdivision
        // of `message_insert_duration` and is not added again.
        let accounted: Duration = [
            self.source_duration,
            self.tx_open_duration,
            self.existing_lookup_duration,
            self.existing_idx_lookup_duration,
            self.existing_replay_lookup_duration,
            self.dedupe_filter_duration,
            self.conversation_row_duration,
            self.message_insert_duration,
            self.snippet_insert_duration,
            self.fts_entry_duration,
            self.fts_flush_duration,
            self.analytics_duration,
            self.commit_duration,
        ]
        .into_iter()
        .sum();
        let residual = self.total_duration.saturating_sub(accounted);

        eprintln!(
            concat!(
                "CASS_INSERT_TREE_STAGE_PROFILE ",
                "label={} calls={} messages={} inserted_messages={} ",
                "total_ms={:.3} source_ms={:.3} tx_open_ms={:.3} existing_lookup_ms={:.3} ",
                "existing_idx_lookup_ms={:.3} existing_replay_lookup_ms={:.3} dedupe_filter_ms={:.3} ",
                "conversation_row_ms={:.3} message_insert_ms={:.3} snippet_insert_ms={:.3} ",
                "fts_entry_ms={:.3} fts_flush_ms={:.3} analytics_ms={:.3} commit_ms={:.3} ",
                "msg_payload_ms={:.3} msg_sql_ms={:.3} msg_param_ms={:.3} msg_execute_ms={:.3} msg_rowid_ms={:.3} ",
                "residual_ms={:.3} avg_total_ms={:.3} avg_message_insert_ms={:.3} ",
                "avg_msg_execute_ms={:.3} avg_msg_payload_ms={:.3} avg_snippet_insert_ms={:.3} avg_fts_entry_ms={:.3} avg_commit_ms={:.3}"
            ),
            label,
            self.invocations,
            self.messages,
            self.inserted_messages,
            Self::millis(self.total_duration),
            Self::millis(self.source_duration),
            Self::millis(self.tx_open_duration),
            Self::millis(self.existing_lookup_duration),
            Self::millis(self.existing_idx_lookup_duration),
            Self::millis(self.existing_replay_lookup_duration),
            Self::millis(self.dedupe_filter_duration),
            Self::millis(self.conversation_row_duration),
            Self::millis(self.message_insert_duration),
            Self::millis(self.snippet_insert_duration),
            Self::millis(self.fts_entry_duration),
            Self::millis(self.fts_flush_duration),
            Self::millis(self.analytics_duration),
            Self::millis(self.commit_duration),
            Self::millis(breakdown.payload_duration),
            Self::millis(breakdown.sql_build_duration),
            Self::millis(breakdown.param_build_duration),
            Self::millis(breakdown.execute_duration),
            Self::millis(breakdown.rowid_duration),
            Self::millis(residual),
            per_call(self.total_duration),
            per_call(self.message_insert_duration),
            per_call(breakdown.execute_duration),
            per_call(breakdown.payload_duration),
            per_call(self.snippet_insert_duration),
            per_call(self.fts_entry_duration),
            per_call(self.commit_duration),
        );
    }
}
5821
/// Identity used to match an incoming conversation against stored ones.
///
/// `conversation_merge_key` builds the `External` variant when the
/// conversation carries an external id and the `SourcePath` variant otherwise.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
enum PendingConversationKey {
    /// Conversation identified by the external id supplied with it.
    External {
        source_id: String,
        agent_id: i64,
        external_id: String,
    },
    /// Conversation identified by its source file path and effective start
    /// time (see `conversation_effective_started_at`).
    SourcePath {
        source_id: String,
        agent_id: i64,
        source_path: String,
        started_at: Option<i64>,
    },
}
5836
/// Builds the lookup key string for an externally identified conversation.
///
/// The layout is
/// `<source_id char count>:<source_id>:<agent_id>:<external_id char count>:<external_id>`;
/// the counts are Unicode scalar counts, not byte lengths.
fn conversation_external_lookup_key(source_id: &str, agent_id: i64, external_id: &str) -> String {
    let source_len = source_id.chars().count();
    let external_len = external_id.chars().count();
    format!(
        "{}:{source_id}:{agent_id}:{}:{external_id}",
        source_len, external_len
    )
}
5844
5845fn conversation_external_lookup_key_for_conv(agent_id: i64, conv: &Conversation) -> Option<String> {
5846    conv.external_id
5847        .as_deref()
5848        .map(|external_id| conversation_external_lookup_key(&conv.source_id, agent_id, external_id))
5849}
5850
/// Position-sensitive identity of a message, built by
/// `message_merge_fingerprint`: the message's `idx`, timestamp, role and
/// author plus a BLAKE3 hash of its content bytes.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageMergeFingerprint {
    idx: i64,
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    /// BLAKE3 digest of the message content bytes.
    content_hash: [u8; 32],
}
5859
/// Position-independent identity of a message: the same fields as
/// `MessageMergeFingerprint` without `idx`, so the same message stored under
/// a different index still compares equal.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
struct MessageReplayFingerprint {
    created_at: Option<i64>,
    role: MessageRole,
    author: Option<String>,
    /// BLAKE3 digest of the message content bytes.
    content_hash: [u8; 32],
}
5867
/// Overlap statistics produced by `conversation_merge_evidence` when two
/// conversations are considered mergeable.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
struct ConversationMergeEvidence {
    /// Number of merge fingerprints present in both conversations.
    exact_overlap: usize,
    /// Number of replay fingerprints present in both conversations.
    replay_overlap: usize,
    /// Size of the smaller of the two replay-fingerprint sets.
    smaller_replay_set: usize,
    /// Whether the two start times are within
    /// `SOURCE_PATH_MERGE_START_TOLERANCE_MS` of each other.
    started_close: bool,
    /// Absolute difference of the start times; `i64::MAX` when either start
    /// time is unknown or the difference does not fit in an `i64`.
    start_distance_ms: i64,
}
5876
/// Incoming messages selected for insertion into an already stored
/// conversation, plus bookkeeping gathered while selecting them.
struct ExistingConversationNewMessages<'a> {
    /// Messages to insert, borrowed from the incoming conversation.
    messages: Vec<&'a Message>,
    /// Sum of `content.len()` (byte length) over `messages`.
    new_chars: i64,
    /// Number of incoming messages whose `idx` is already stored with a
    /// different fingerprint.
    idx_collision_count: usize,
    /// `idx` of the first such collision, if any.
    first_collision_idx: Option<i64>,
}
5883
/// Tail position of a stored conversation: the last message index and
/// timestamp, plus the recorded end time when one exists.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationTailState {
    last_message_idx: i64,
    last_message_created_at: i64,
    ended_at: Option<i64>,
}
5890
/// A stored conversation's row id together with its tail state, when one
/// could be determined.
#[derive(Debug, Clone, Copy)]
struct ExistingConversationWithTail {
    id: i64,
    /// `None` when no source provided both a last index and a last timestamp.
    tail_state: Option<ExistingConversationTailState>,
}
5896
5897fn conversation_effective_started_at(conv: &Conversation) -> Option<i64> {
5898    conv.started_at
5899        .or_else(|| conv.messages.iter().filter_map(|msg| msg.created_at).min())
5900}
5901
5902fn conversation_tail_state(conv: &Conversation) -> (Option<i64>, Option<i64>) {
5903    (
5904        conv.messages.iter().map(|msg| msg.idx).max(),
5905        conv.messages.iter().filter_map(|msg| msg.created_at).max(),
5906    )
5907}
5908
5909fn borrowed_messages_tail_state(messages: &[&Message]) -> (Option<i64>, Option<i64>) {
5910    (
5911        messages.iter().map(|msg| msg.idx).max(),
5912        messages.iter().filter_map(|msg| msg.created_at).max(),
5913    )
5914}
5915
5916fn role_from_str(role: &str) -> MessageRole {
5917    match role {
5918        "user" => MessageRole::User,
5919        "agent" | "assistant" => MessageRole::Agent,
5920        "tool" => MessageRole::Tool,
5921        "system" => MessageRole::System,
5922        other => MessageRole::Other(other.to_string()),
5923    }
5924}
5925
5926fn message_merge_fingerprint(msg: &Message) -> MessageMergeFingerprint {
5927    MessageMergeFingerprint {
5928        idx: msg.idx,
5929        created_at: msg.created_at,
5930        role: msg.role.clone(),
5931        author: msg.author.clone(),
5932        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
5933    }
5934}
5935
5936fn message_replay_fingerprint(msg: &Message) -> MessageReplayFingerprint {
5937    MessageReplayFingerprint {
5938        created_at: msg.created_at,
5939        role: msg.role.clone(),
5940        author: msg.author.clone(),
5941        content_hash: *blake3::hash(msg.content.as_bytes()).as_bytes(),
5942    }
5943}
5944
5945fn conversation_message_fingerprints(conv: &Conversation) -> HashSet<MessageMergeFingerprint> {
5946    conv.messages
5947        .iter()
5948        .map(message_merge_fingerprint)
5949        .collect()
5950}
5951
5952fn conversation_message_replay_fingerprints(
5953    conv: &Conversation,
5954) -> HashSet<MessageReplayFingerprint> {
5955    conv.messages
5956        .iter()
5957        .map(message_replay_fingerprint)
5958        .collect()
5959}
5960
5961fn replay_fingerprint_from_merge(
5962    fingerprint: &MessageMergeFingerprint,
5963) -> MessageReplayFingerprint {
5964    MessageReplayFingerprint {
5965        created_at: fingerprint.created_at,
5966        role: fingerprint.role.clone(),
5967        author: fingerprint.author.clone(),
5968        content_hash: fingerprint.content_hash,
5969    }
5970}
5971
5972fn replay_fingerprints_from_merge_set(
5973    fingerprints: &HashSet<MessageMergeFingerprint>,
5974) -> HashSet<MessageReplayFingerprint> {
5975    fingerprints
5976        .iter()
5977        .map(replay_fingerprint_from_merge)
5978        .collect()
5979}
5980
5981fn collect_new_messages_for_existing_conversation<'a>(
5982    conversation_id: i64,
5983    conv: &'a Conversation,
5984    existing_messages: &mut HashMap<i64, MessageMergeFingerprint>,
5985    existing_replay_fingerprints: &mut HashSet<MessageReplayFingerprint>,
5986    replay_skip_log: &'static str,
5987) -> ExistingConversationNewMessages<'a> {
5988    let mut idx_collision_count = 0usize;
5989    let mut first_collision_idx: Option<i64> = None;
5990    let mut new_chars: i64 = 0;
5991    let mut messages = Vec::new();
5992
5993    for msg in &conv.messages {
5994        let incoming_fingerprint = message_merge_fingerprint(msg);
5995        if let Some(existing_fingerprint) = existing_messages.get(&msg.idx) {
5996            if existing_fingerprint != &incoming_fingerprint {
5997                idx_collision_count = idx_collision_count.saturating_add(1);
5998                first_collision_idx.get_or_insert(msg.idx);
5999            }
6000            continue;
6001        }
6002
6003        let incoming_replay = replay_fingerprint_from_merge(&incoming_fingerprint);
6004        if existing_replay_fingerprints.contains(&incoming_replay) {
6005            tracing::debug!(
6006                conversation_id,
6007                idx = msg.idx,
6008                source_path = %conv.source_path.display(),
6009                "{replay_skip_log}"
6010            );
6011            continue;
6012        }
6013
6014        existing_messages.insert(msg.idx, incoming_fingerprint);
6015        existing_replay_fingerprints.insert(incoming_replay);
6016        new_chars += msg.content.len() as i64;
6017        messages.push(msg);
6018    }
6019
6020    ExistingConversationNewMessages {
6021        messages,
6022        new_chars,
6023        idx_collision_count,
6024        first_collision_idx,
6025    }
6026}
6027
6028fn franken_existing_conversation_append_tail_state(
6029    tx: &FrankenTransaction<'_>,
6030    conversation_id: i64,
6031) -> Result<Option<ExistingConversationTailState>> {
6032    let cached: Option<(Option<i64>, Option<i64>, Option<i64>)> = tx
6033        .query_row_map(
6034            "SELECT last_message_idx, last_message_created_at, ended_at
6035             FROM conversation_tail_state
6036             WHERE conversation_id = ?1",
6037            fparams![conversation_id],
6038            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6039        )
6040        .optional()?;
6041    if let Some(cached) = cached {
6042        let (_, _, cached_ended_at) = cached;
6043        if let Some(tail_state) =
6044            existing_conversation_tail_state_from_cached(cached.0, cached.1, cached_ended_at)
6045        {
6046            return Ok(Some(tail_state));
6047        }
6048    }
6049
6050    let legacy_cached: (Option<i64>, Option<i64>, Option<i64>) = tx.query_row_map(
6051        "SELECT last_message_idx, last_message_created_at, ended_at
6052         FROM conversations
6053         WHERE id = ?1",
6054        fparams![conversation_id],
6055        |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
6056    )?;
6057    let (_, _, cached_ended_at) = legacy_cached;
6058    if let Some(tail_state) = existing_conversation_tail_state_from_cached(
6059        legacy_cached.0,
6060        legacy_cached.1,
6061        cached_ended_at,
6062    ) {
6063        franken_insert_conversation_tail_state(
6064            tx,
6065            conversation_id,
6066            cached_ended_at,
6067            Some(tail_state.last_message_idx),
6068            Some(tail_state.last_message_created_at),
6069        )?;
6070        return Ok(Some(tail_state));
6071    }
6072
6073    let (max_idx, max_created_at): (Option<i64>, Option<i64>) = tx.query_row_map(
6074        "SELECT MAX(idx), MAX(created_at)
6075         FROM messages
6076         WHERE conversation_id = ?1",
6077        fparams![conversation_id],
6078        |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
6079    )?;
6080    if let Some((last_message_idx, last_message_created_at)) = max_idx.zip(max_created_at) {
6081        franken_update_conversation_tail_state(
6082            tx,
6083            conversation_id,
6084            None,
6085            Some(last_message_idx),
6086            Some(last_message_created_at),
6087        )?;
6088        return Ok(Some(ExistingConversationTailState {
6089            last_message_idx,
6090            last_message_created_at,
6091            ended_at: cached_ended_at,
6092        }));
6093    }
6094    Ok(None)
6095}
6096
6097fn existing_conversation_tail_state_from_cached(
6098    last_message_idx: Option<i64>,
6099    last_message_created_at: Option<i64>,
6100    ended_at: Option<i64>,
6101) -> Option<ExistingConversationTailState> {
6102    let (last_message_idx, last_message_created_at) =
6103        last_message_idx.zip(last_message_created_at)?;
6104    Some(ExistingConversationTailState {
6105        last_message_idx,
6106        last_message_created_at,
6107        ended_at,
6108    })
6109}
6110
6111fn franken_find_existing_conversation_with_tail_by_key(
6112    tx: &FrankenTransaction<'_>,
6113    key: &PendingConversationKey,
6114    conv: Option<&Conversation>,
6115) -> Result<Option<ExistingConversationWithTail>> {
6116    if let PendingConversationKey::External {
6117        source_id,
6118        agent_id,
6119        external_id,
6120    } = key
6121    {
6122        let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
6123        if let Some(existing) = franken_find_external_conversation_tail_lookup(tx, &lookup_key)? {
6124            return Ok(Some(existing));
6125        }
6126        return Ok(None);
6127    }
6128
6129    let Some(id) = franken_find_existing_conversation_by_key(tx, key, conv)? else {
6130        return Ok(None);
6131    };
6132    let tail_state = franken_existing_conversation_append_tail_state(tx, id)?;
6133    Ok(Some(ExistingConversationWithTail { id, tail_state }))
6134}
6135
6136fn franken_insert_conversation_tail_state(
6137    tx: &FrankenTransaction<'_>,
6138    conversation_id: i64,
6139    ended_at: Option<i64>,
6140    last_message_idx: Option<i64>,
6141    last_message_created_at: Option<i64>,
6142) -> Result<()> {
6143    if ended_at.is_none() && last_message_idx.is_none() && last_message_created_at.is_none() {
6144        return Ok(());
6145    }
6146    tx.execute_compat(
6147        "INSERT OR REPLACE INTO conversation_tail_state (
6148             conversation_id, ended_at, last_message_idx, last_message_created_at
6149         ) VALUES (?1, ?2, ?3, ?4)",
6150        fparams![
6151            conversation_id,
6152            ended_at,
6153            last_message_idx,
6154            last_message_created_at
6155        ],
6156    )?;
6157    Ok(())
6158}
6159
6160fn franken_update_conversation_tail_columns(
6161    tx: &FrankenTransaction<'_>,
6162    conversation_id: i64,
6163    ended_at_candidate: Option<i64>,
6164    last_message_idx_candidate: Option<i64>,
6165    last_message_created_at_candidate: Option<i64>,
6166) -> Result<()> {
6167    if ended_at_candidate.is_none()
6168        && last_message_idx_candidate.is_none()
6169        && last_message_created_at_candidate.is_none()
6170    {
6171        return Ok(());
6172    }
6173
6174    tx.execute_compat(
6175        "UPDATE conversations
6176         SET ended_at = CASE
6177                 WHEN ?1 IS NULL THEN ended_at
6178                 WHEN ended_at IS NULL OR ended_at < ?1 THEN ?1
6179                 ELSE ended_at
6180             END,
6181             last_message_idx = CASE
6182                 WHEN ?2 IS NULL THEN last_message_idx
6183                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6184                 ELSE last_message_idx
6185             END,
6186             last_message_created_at = CASE
6187                 WHEN ?3 IS NULL THEN last_message_created_at
6188                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6189                 ELSE last_message_created_at
6190             END
6191         WHERE id = ?4",
6192        fparams![
6193            ended_at_candidate,
6194            last_message_idx_candidate,
6195            last_message_created_at_candidate,
6196            conversation_id
6197        ],
6198    )?;
6199    Ok(())
6200}
6201
6202fn franken_tail_state_insert_ended_at(
6203    tx: &FrankenTransaction<'_>,
6204    conversation_id: i64,
6205    candidate: Option<i64>,
6206) -> Result<Option<i64>> {
6207    let canonical: Option<i64> = tx
6208        .query_row_map(
6209            "SELECT ended_at FROM conversations WHERE id = ?1",
6210            fparams![conversation_id],
6211            |row| row.get_typed(0),
6212        )
6213        .optional()?
6214        .flatten();
6215    Ok(canonical.max(candidate))
6216}
6217
6218fn franken_update_conversation_tail_state(
6219    tx: &FrankenTransaction<'_>,
6220    conversation_id: i64,
6221    ended_at_candidate: Option<i64>,
6222    last_message_idx_candidate: Option<i64>,
6223    last_message_created_at_candidate: Option<i64>,
6224) -> Result<()> {
6225    if ended_at_candidate.is_none()
6226        && last_message_idx_candidate.is_none()
6227        && last_message_created_at_candidate.is_none()
6228    {
6229        return Ok(());
6230    }
6231
6232    let changed = tx.execute_compat(
6233        "UPDATE conversation_tail_state
6234         SET ended_at = CASE
6235                 WHEN ?1 IS NULL THEN ended_at
6236                 ELSE MAX(IFNULL(ended_at, 0), ?1)
6237             END,
6238             last_message_idx = CASE
6239                 WHEN ?2 IS NULL THEN last_message_idx
6240                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
6241                 ELSE last_message_idx
6242             END,
6243             last_message_created_at = CASE
6244                 WHEN ?3 IS NULL THEN last_message_created_at
6245                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
6246                 ELSE last_message_created_at
6247             END
6248         WHERE conversation_id = ?4",
6249        fparams![
6250            ended_at_candidate,
6251            last_message_idx_candidate,
6252            last_message_created_at_candidate,
6253            conversation_id
6254        ],
6255    )?;
6256    if changed == 0 {
6257        let insert_ended_at =
6258            franken_tail_state_insert_ended_at(tx, conversation_id, ended_at_candidate)?;
6259        franken_insert_conversation_tail_state(
6260            tx,
6261            conversation_id,
6262            insert_ended_at,
6263            last_message_idx_candidate,
6264            last_message_created_at_candidate,
6265        )?;
6266    }
6267    franken_update_conversation_tail_columns(
6268        tx,
6269        conversation_id,
6270        ended_at_candidate,
6271        last_message_idx_candidate,
6272        last_message_created_at_candidate,
6273    )?;
6274    Ok(())
6275}
6276
6277fn franken_set_conversation_tail_state_after_append(
6278    tx: &FrankenTransaction<'_>,
6279    conversation_id: i64,
6280    ended_at: i64,
6281    last_message_idx: i64,
6282    last_message_created_at: i64,
6283) -> Result<()> {
6284    let changed = tx.execute_compat(
6285        "UPDATE conversation_tail_state
6286         SET ended_at = ?1,
6287             last_message_idx = ?2,
6288             last_message_created_at = ?3
6289         WHERE conversation_id = ?4",
6290        fparams![
6291            ended_at,
6292            last_message_idx,
6293            last_message_created_at,
6294            conversation_id
6295        ],
6296    )?;
6297    if changed == 0 {
6298        let insert_ended_at =
6299            franken_tail_state_insert_ended_at(tx, conversation_id, Some(ended_at))?;
6300        franken_insert_conversation_tail_state(
6301            tx,
6302            conversation_id,
6303            insert_ended_at,
6304            Some(last_message_idx),
6305            Some(last_message_created_at),
6306        )?;
6307    }
6308    franken_update_conversation_tail_columns(
6309        tx,
6310        conversation_id,
6311        Some(ended_at),
6312        Some(last_message_idx),
6313        Some(last_message_created_at),
6314    )?;
6315    Ok(())
6316}
6317
6318fn collect_append_only_tail_messages<'a>(
6319    conv: &'a Conversation,
6320    existing_max_idx: i64,
6321    existing_max_created_at: i64,
6322) -> Option<ExistingConversationNewMessages<'a>> {
6323    if conv.messages.is_empty() {
6324        return Some(ExistingConversationNewMessages {
6325            messages: Vec::new(),
6326            new_chars: 0,
6327            idx_collision_count: 0,
6328            first_collision_idx: None,
6329        });
6330    }
6331
6332    let mut split_idx = None;
6333    let mut prev_idx = None;
6334    for (pos, msg) in conv.messages.iter().enumerate() {
6335        if prev_idx.is_some_and(|prev| msg.idx < prev) {
6336            return None;
6337        }
6338        prev_idx = Some(msg.idx);
6339        if split_idx.is_none() && msg.idx > existing_max_idx {
6340            split_idx = Some(pos);
6341        }
6342    }
6343    let split_idx = split_idx?;
6344
6345    let mut seen_tail_idx = HashSet::new();
6346    let mut seen_tail_replay = HashSet::new();
6347    let mut new_chars = 0i64;
6348    let mut messages = Vec::new();
6349    for msg in &conv.messages[split_idx..] {
6350        let created_at = msg.created_at?;
6351        if created_at <= existing_max_created_at {
6352            return None;
6353        }
6354
6355        if !seen_tail_idx.insert(msg.idx) {
6356            return None;
6357        }
6358
6359        let replay_fingerprint = message_replay_fingerprint(msg);
6360        if !seen_tail_replay.insert(replay_fingerprint) {
6361            return None;
6362        }
6363
6364        new_chars += msg.content.len() as i64;
6365        messages.push(msg);
6366    }
6367
6368    Some(ExistingConversationNewMessages {
6369        messages,
6370        new_chars,
6371        idx_collision_count: 0,
6372        first_collision_idx: None,
6373    })
6374}
6375
/// Returns the absolute difference between two timestamps.
///
/// Yields `i64::MAX` when either timestamp is missing or when the difference
/// does not fit in an `i64`. The subtraction is done in 128-bit arithmetic so
/// it cannot overflow.
fn start_distance_ms(left: Option<i64>, right: Option<i64>) -> i64 {
    let Some((lhs, rhs)) = left.zip(right) else {
        return i64::MAX;
    };
    let gap = (i128::from(lhs) - i128::from(rhs)).unsigned_abs();
    i64::try_from(gap).unwrap_or(i64::MAX)
}
6385
6386fn conversation_merge_evidence(
6387    incoming_exact: &HashSet<MessageMergeFingerprint>,
6388    incoming_replay: &HashSet<MessageReplayFingerprint>,
6389    existing_exact: &HashSet<MessageMergeFingerprint>,
6390    existing_replay: &HashSet<MessageReplayFingerprint>,
6391    incoming_started_at: Option<i64>,
6392    existing_started_at: Option<i64>,
6393) -> Option<ConversationMergeEvidence> {
6394    let exact_overlap = incoming_exact.intersection(existing_exact).count();
6395    let replay_overlap = incoming_replay.intersection(existing_replay).count();
6396    if exact_overlap == 0 && replay_overlap == 0 {
6397        return None;
6398    }
6399
6400    let smaller_replay_set = incoming_replay.len().min(existing_replay.len());
6401    let started_close = timestamps_within_tolerance(
6402        incoming_started_at,
6403        existing_started_at,
6404        SOURCE_PATH_MERGE_START_TOLERANCE_MS,
6405    );
6406    let full_replay_subset_match = smaller_replay_set >= 2 && replay_overlap == smaller_replay_set;
6407
6408    let merge_allowed = if started_close {
6409        exact_overlap >= 1 || replay_overlap >= 2
6410    } else {
6411        exact_overlap >= 2 || full_replay_subset_match
6412    };
6413
6414    merge_allowed.then_some(ConversationMergeEvidence {
6415        exact_overlap,
6416        replay_overlap,
6417        smaller_replay_set,
6418        started_close,
6419        start_distance_ms: start_distance_ms(incoming_started_at, existing_started_at),
6420    })
6421}
6422
/// Returns `true` when both timestamps are present and differ by at most
/// `tolerance_ms`.
///
/// A missing timestamp always yields `false`, as does a negative tolerance.
/// The comparison is done in 128-bit arithmetic so it cannot overflow.
fn timestamps_within_tolerance(left: Option<i64>, right: Option<i64>, tolerance_ms: i64) -> bool {
    left.zip(right).is_some_and(|(lhs, rhs)| {
        let gap = (i128::from(lhs) - i128::from(rhs)).abs();
        gap <= i128::from(tolerance_ms)
    })
}
6431
6432fn conversation_merge_key(agent_id: i64, conv: &Conversation) -> PendingConversationKey {
6433    if let Some(external_id) = conv.external_id.clone() {
6434        PendingConversationKey::External {
6435            source_id: conv.source_id.clone(),
6436            agent_id,
6437            external_id,
6438        }
6439    } else {
6440        PendingConversationKey::SourcePath {
6441            source_id: conv.source_id.clone(),
6442            agent_id,
6443            source_path: path_to_string(&conv.source_path),
6444            started_at: conversation_effective_started_at(conv),
6445        }
6446    }
6447}
6448
/// Message data needed for semantic embedding generation.
///
/// NOTE(review): the code that fills this struct is outside this section; the
/// field notes below are inferred from names and types — confirm at the
/// producer.
pub struct MessageForEmbedding {
    /// Row id of the message.
    pub message_id: i64,
    /// Message timestamp, when known.
    pub created_at: Option<i64>,
    /// Row id of the agent that owns the message's conversation.
    pub agent_id: i64,
    /// Row id of the workspace, when the conversation has one.
    pub workspace_id: Option<i64>,
    /// Presumably a 32-bit hash of the source id.
    pub source_id_hash: u32,
    /// Role of the message author, as a string.
    pub role: String,
    /// Message text to embed.
    pub content: String,
}
6459
6460// =========================================================================
6461// FrankenStorage CRUD operations
6462// =========================================================================
6463
6464impl FrankenStorage {
6465    /// Ensure an agent exists in the database, returning its ID.
6466    pub fn ensure_agent(&self, agent: &Agent) -> Result<i64> {
6467        let cache_key = EnsuredAgentKey::from_agent(agent);
6468        if let Some(id) = self.cached_agent_id(&cache_key) {
6469            return Ok(id);
6470        }
6471
6472        let now = Self::now_millis();
6473        self.conn.execute_compat(
6474            "INSERT INTO agents(slug, name, version, kind, created_at, updated_at)
6475             VALUES(?1, ?2, ?3, ?4, ?5, ?6)
6476             ON CONFLICT(slug) DO UPDATE SET
6477                 name = excluded.name,
6478                 version = excluded.version,
6479                 kind = excluded.kind,
6480                 updated_at = excluded.updated_at
6481             WHERE NOT (
6482                 agents.name IS excluded.name
6483                 AND agents.version IS excluded.version
6484                 AND agents.kind IS excluded.kind
6485             )",
6486            fparams![
6487                agent.slug.as_str(),
6488                agent.name.as_str(),
6489                agent.version.as_deref(),
6490                cache_key.kind.as_str(),
6491                now,
6492                now
6493            ],
6494        )?;
6495
6496        let id = self
6497            .conn
6498            .query_row_map(
6499                "SELECT id FROM agents WHERE slug = ?1 LIMIT 1",
6500                fparams![agent.slug.as_str()],
6501                |row| row.get_typed(0),
6502            )
6503            .with_context(|| format!("fetching agent id for {}", agent.slug))?;
6504        self.mark_agent_ensured(cache_key, id);
6505        Ok(id)
6506    }
6507
6508    /// Ensure a workspace exists in the database, returning its ID.
6509    pub fn ensure_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
6510        let path_str = path.to_string_lossy().to_string();
6511        let cache_key = EnsuredWorkspaceKey::new(path_str.clone(), display_name);
6512        if let Some(id) = self.cached_workspace_id(&cache_key) {
6513            return Ok(id);
6514        }
6515
6516        if let Some(display_name) = display_name {
6517            self.conn.execute_compat(
6518                "INSERT INTO workspaces(path, display_name)
6519                 VALUES(?1, ?2)
6520                 ON CONFLICT(path) DO UPDATE SET
6521                     display_name = excluded.display_name
6522                 WHERE NOT (workspaces.display_name IS excluded.display_name)",
6523                fparams![path_str.as_str(), display_name],
6524            )?;
6525        } else {
6526            self.conn.execute_compat(
6527                "INSERT OR IGNORE INTO workspaces(path, display_name) VALUES(?1, NULL)",
6528                fparams![path_str.as_str()],
6529            )?;
6530        }
6531
6532        let id = self
6533            .conn
6534            .query_row_map(
6535                "SELECT id FROM workspaces WHERE path = ?1 LIMIT 1",
6536                fparams![path_str.as_str()],
6537                |row| row.get_typed(0),
6538            )
6539            .with_context(|| format!("fetching workspace id for {path_str}"))?;
6540        self.mark_workspace_ensured(cache_key, id);
6541        Ok(id)
6542    }
6543
6544    /// Get current time as milliseconds since epoch.
6545    pub fn now_millis() -> i64 {
6546        SystemTime::now()
6547            .duration_since(UNIX_EPOCH)
6548            .map(|d| i64::try_from(d.as_millis()).unwrap_or(i64::MAX))
6549            .unwrap_or(0)
6550    }
6551
6552    /// Convert a millisecond timestamp to a day ID (days since 2020-01-01).
6553    pub fn day_id_from_millis(timestamp_ms: i64) -> i64 {
6554        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6555        let secs = timestamp_ms.div_euclid(1000);
6556        (secs - EPOCH_2020_SECS).div_euclid(86400)
6557    }
6558
6559    /// Convert a millisecond timestamp to an hour ID (hours since 2020-01-01 00:00 UTC).
6560    pub fn hour_id_from_millis(timestamp_ms: i64) -> i64 {
6561        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6562        let secs = timestamp_ms.div_euclid(1000);
6563        (secs - EPOCH_2020_SECS).div_euclid(3600)
6564    }
6565
6566    /// Convert a day ID back to milliseconds (start of day).
6567    pub fn millis_from_day_id(day_id: i64) -> i64 {
6568        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6569        (EPOCH_2020_SECS + day_id * 86400) * 1000
6570    }
6571
6572    /// Convert an hour ID back to milliseconds (start of hour).
6573    pub fn millis_from_hour_id(hour_id: i64) -> i64 {
6574        const EPOCH_2020_SECS: i64 = 1_577_836_800;
6575        (EPOCH_2020_SECS + hour_id * 3600) * 1000
6576    }
6577
6578    /// Get the timestamp of the last successful scan.
6579    pub fn get_last_scan_ts(&self) -> Result<Option<i64>> {
6580        let result: Result<String, _> = self.conn.query_row_map(
6581            "SELECT value FROM meta WHERE key = 'last_scan_ts'",
6582            fparams![],
6583            |row| row.get_typed(0),
6584        );
6585        match result.optional() {
6586            Ok(Some(s)) => Ok(s.parse().ok()),
6587            Ok(None) => Ok(None),
6588            Err(e) => Err(e.into()),
6589        }
6590    }
6591
6592    /// Set the timestamp of the last successful scan (milliseconds since epoch).
6593    pub fn set_last_scan_ts(&self, ts: i64) -> Result<()> {
6594        self.conn.execute_compat(
6595            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_scan_ts', ?1)",
6596            fparams![ts.to_string()],
6597        )?;
6598        Ok(())
6599    }
6600
6601    /// Get the timestamp of the last successful index completion.
6602    pub fn get_last_indexed_at(&self) -> Result<Option<i64>> {
6603        let result: Result<String, _> = self.conn.query_row_map(
6604            "SELECT value FROM meta WHERE key = 'last_indexed_at'",
6605            fparams![],
6606            |row| row.get_typed(0),
6607        );
6608        match result.optional() {
6609            Ok(Some(s)) => Ok(s.parse().ok()),
6610            Ok(None) => Ok(None),
6611            Err(e) => Err(e.into()),
6612        }
6613    }
6614
6615    /// Set the timestamp of the last successful index completion (milliseconds since epoch).
6616    pub fn set_last_indexed_at(&self, ts: i64) -> Result<()> {
6617        self.conn.execute_compat(
6618            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_indexed_at', ?1)",
6619            fparams![ts.to_string()],
6620        )?;
6621        Ok(())
6622    }
6623
6624    /// List all registered agents.
6625    pub fn list_agents(&self) -> Result<Vec<Agent>> {
6626        self.conn
6627            .query_map_collect(
6628                "SELECT id, slug, name, version, kind FROM agents ORDER BY slug",
6629                fparams![],
6630                |row| {
6631                    let kind: String = row.get_typed(4)?;
6632                    Ok(Agent {
6633                        id: Some(row.get_typed(0)?),
6634                        slug: row.get_typed(1)?,
6635                        name: row.get_typed(2)?,
6636                        version: row.get_typed(3)?,
6637                        kind: match kind.as_str() {
6638                            "cli" => AgentKind::Cli,
6639                            "vscode" => AgentKind::VsCode,
6640                            _ => AgentKind::Hybrid,
6641                        },
6642                    })
6643                },
6644            )
6645            .with_context(|| "listing agents")
6646    }
6647
6648    /// Count all archived conversations.
6649    pub fn total_conversation_count(&self) -> Result<usize> {
6650        let count: i64 =
6651            self.conn
6652                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6653                    row.get_typed(0)
6654                })?;
6655        Ok(count.max(0) as usize)
6656    }
6657
6658    /// Count all archived messages.
6659    pub fn total_message_count(&self) -> Result<usize> {
6660        let count: i64 =
6661            self.conn
6662                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
6663                    row.get_typed(0)
6664                })?;
6665        Ok(count.max(0) as usize)
6666    }
6667
6668    /// Remove all archived conversations/messages for one agent slug.
6669    ///
6670    /// This only affects cass's local archive database. Source session files on
6671    /// disk are untouched.
6672    pub fn purge_agent_archive_data(&self, agent_slug: &str) -> Result<AgentArchivePurgeResult> {
6673        let normalized = agent_slug.trim().to_ascii_lowercase();
6674        if normalized.is_empty() {
6675            return Err(anyhow!("agent slug cannot be empty"));
6676        }
6677
6678        let Some(agent_id) = self
6679            .conn
6680            .query_row_map(
6681                "SELECT id FROM agents WHERE slug = ?1",
6682                fparams![normalized.as_str()],
6683                |row| row.get_typed::<i64>(0),
6684            )
6685            .optional()?
6686        else {
6687            return Ok(AgentArchivePurgeResult::default());
6688        };
6689
6690        let conversations_deleted: i64 = self.conn.query_row_map(
6691            "SELECT COUNT(*) FROM conversations WHERE agent_id = ?1",
6692            fparams![agent_id],
6693            |row| row.get_typed(0),
6694        )?;
6695        if conversations_deleted == 0 {
6696            return Ok(AgentArchivePurgeResult::default());
6697        }
6698
6699        let messages_deleted: i64 = self.conn.query_row_map(
6700            "SELECT COUNT(*)
6701             FROM messages
6702             WHERE conversation_id IN (
6703                 SELECT id FROM conversations WHERE agent_id = ?1
6704             )",
6705            fparams![agent_id],
6706            |row| row.get_typed(0),
6707        )?;
6708
6709        let mut tx = self.conn.transaction()?;
6710        tx.execute_compat(
6711            "DELETE FROM conversation_external_lookup
6712             WHERE conversation_id IN (
6713                 SELECT id FROM conversations WHERE agent_id = ?1
6714             )",
6715            fparams![agent_id],
6716        )?;
6717        tx.execute_compat(
6718            "DELETE FROM conversation_external_tail_lookup
6719             WHERE conversation_id IN (
6720                 SELECT id FROM conversations WHERE agent_id = ?1
6721             )",
6722            fparams![agent_id],
6723        )?;
6724        tx.execute_compat(
6725            "DELETE FROM conversations WHERE agent_id = ?1",
6726            fparams![agent_id],
6727        )?;
6728        tx.execute_compat(
6729            "DELETE FROM agents
6730             WHERE id = ?1
6731               AND NOT EXISTS (
6732                   SELECT 1 FROM conversations WHERE agent_id = ?1
6733               )",
6734            fparams![agent_id],
6735        )?;
6736        tx.commit()?;
6737
6738        Ok(AgentArchivePurgeResult {
6739            conversations_deleted: conversations_deleted.max(0) as usize,
6740            messages_deleted: messages_deleted.max(0) as usize,
6741        })
6742    }
6743
6744    /// List all registered workspaces.
6745    pub fn list_workspaces(&self) -> Result<Vec<crate::model::types::Workspace>> {
6746        self.conn
6747            .query_map_collect(
6748                "SELECT id, path, display_name FROM workspaces ORDER BY path",
6749                fparams![],
6750                |row| {
6751                    let path_str: String = row.get_typed(1)?;
6752                    Ok(crate::model::types::Workspace {
6753                        id: Some(row.get_typed(0)?),
6754                        path: Path::new(&path_str).to_path_buf(),
6755                        display_name: row.get_typed(2)?,
6756                    })
6757                },
6758            )
6759            .with_context(|| "listing workspaces")
6760    }
6761
6762    /// List conversations with pagination.
6763    pub fn list_conversations(&self, limit: i64, offset: i64) -> Result<Vec<Conversation>> {
6764        // Avoid the multi-table JOIN with LIMIT/OFFSET that triggers
6765        // frankensqlite's materialization fallback (see c38edcd9, 860acb12).
6766        // Use correlated subqueries for the tiny agents (~20 rows) and
6767        // workspaces (~30 rows) lookup tables and degrade NULL agent_id to
6768        // the same 'unknown' sentinel that 8a0c547c established for the
6769        // lexical rebuild path.
6770        self.conn
6771            .query_map_collect(
6772                r"SELECT c.id,
6773                         COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
6774                         (SELECT w.path FROM workspaces w WHERE w.id = c.workspace_id),
6775                         c.external_id, c.title, c.source_path,
6776                         c.started_at,
6777                         COALESCE(
6778                             (SELECT ts.ended_at
6779                              FROM conversation_tail_state ts
6780                              WHERE ts.conversation_id = c.id),
6781                             c.ended_at
6782                         ),
6783                         c.approx_tokens, c.metadata_json,
6784                         c.source_id, c.origin_host, c.metadata_bin
6785                FROM conversations c
6786                ORDER BY CASE WHEN c.started_at IS NULL THEN 1 ELSE 0 END, c.started_at DESC, c.id DESC
6787                LIMIT ?1 OFFSET ?2",
6788                fparams![limit, offset],
6789                |row| {
6790                    let workspace_path: Option<String> = row.get_typed(2)?;
6791                    let source_path: String = row.get_typed(5)?;
6792                    let raw_source_id: Option<String> = row.get_typed(10)?;
6793                    let raw_origin_host: Option<String> = row.get_typed(11)?;
6794                    let (source_id, _, origin_host) = normalized_storage_source_parts(
6795                        raw_source_id.as_deref(),
6796                        None,
6797                        raw_origin_host.as_deref(),
6798                    );
6799                    Ok(Conversation {
6800                        id: Some(row.get_typed(0)?),
6801                        agent_slug: row.get_typed(1)?,
6802                        workspace: workspace_path.map(|p| Path::new(&p).to_path_buf()),
6803                        external_id: row.get_typed(3)?,
6804                        title: row.get_typed(4)?,
6805                        source_path: Path::new(&source_path).to_path_buf(),
6806                        started_at: row.get_typed(6)?,
6807                        ended_at: row.get_typed(7)?,
6808                        approx_tokens: row.get_typed(8)?,
6809                        metadata_json: franken_read_metadata_compat(row, 9, 12),
6810                        messages: Vec::new(),
6811                        source_id,
6812                        origin_host,
6813                    })
6814                },
6815            )
6816            .with_context(|| "listing conversations")
6817    }
6818
6819    /// Build lookup maps for agents and workspaces to avoid JOINs in
6820    /// paged conversation queries.  Both tables are tiny (tens of rows)
6821    /// so this is effectively free.
6822    pub fn build_lexical_rebuild_lookups(
6823        &self,
6824    ) -> Result<(HashMap<i64, String>, HashMap<i64, PathBuf>)> {
6825        let agents: HashMap<i64, String> = self
6826            .conn
6827            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
6828                Ok((row.get_typed::<i64>(0)?, row.get_typed::<String>(1)?))
6829            })
6830            .with_context(|| "loading agent lookup for lexical rebuild")?
6831            .into_iter()
6832            .collect();
6833        let workspaces: HashMap<i64, PathBuf> = self
6834            .conn
6835            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
6836                let path_str: String = row.get_typed(1)?;
6837                Ok((row.get_typed::<i64>(0)?, PathBuf::from(path_str)))
6838            })
6839            .with_context(|| "loading workspace lookup for lexical rebuild")?
6840            .into_iter()
6841            .collect();
6842        Ok((agents, workspaces))
6843    }
6844
6845    /// List per-conversation message footprints in primary-key order.
6846    ///
6847    /// This deliberately avoids rebuild-path JOINs. Instead we merge ordered
6848    /// single-table reads over `conversations` and the narrow
6849    /// `conversation_tail_state` cache in Rust, then use `last_message_idx + 1`
6850    /// as a planning estimate.
6851    ///
6852    /// The planner only needs a sizing heuristic; exact message and byte
6853    /// accounting is performed later by the rebuild packet pipeline as it reads
6854    /// message content for indexing. Rows missing both tail-cache sources fall
6855    /// back to `MAX(messages.idx) + 1`, which preserves legacy upgraded
6856    /// databases without treating populated conversations as empty.
6857    pub fn list_conversation_footprints_for_lexical_rebuild(
6858        &self,
6859    ) -> Result<Vec<LexicalRebuildConversationFootprintRow>> {
6860        let tail_state_rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
6861            "SELECT conversation_id, last_message_idx
6862             FROM conversation_tail_state
6863             ORDER BY conversation_id ASC",
6864            fparams![],
6865            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
6866        ) {
6867            Ok(rows) => rows,
6868            Err(err) if error_indicates_missing_table(&err) => Vec::new(),
6869            Err(err) => {
6870                return Err(err).with_context(|| "listing lexical rebuild tail-state estimates");
6871            }
6872        };
6873        let tail_state_by_conversation: HashMap<i64, Option<i64>> =
6874            tail_state_rows.into_iter().collect();
6875
6876        let rows: Vec<(i64, Option<i64>)> = match self.conn.query_map_collect(
6877            "SELECT id, last_message_idx
6878             FROM conversations
6879             ORDER BY id ASC",
6880            fparams![],
6881            |row| Ok((row.get_typed::<i64>(0)?, row.get_typed::<Option<i64>>(1)?)),
6882        ) {
6883            Ok(rows) => rows,
6884            Err(err) if error_indicates_missing_column(&err) => self
6885                .conn
6886                .query_map_collect(
6887                    "SELECT id
6888                     FROM conversations
6889                     ORDER BY id ASC",
6890                    fparams![],
6891                    |row| Ok((row.get_typed::<i64>(0)?, None)),
6892                )
6893                .with_context(|| {
6894                    "listing lexical rebuild conversation ids after missing tail column fallback"
6895                })?,
6896            Err(err) => {
6897                return Err(err)
6898                    .with_context(|| "listing lexical rebuild conversation footprint estimates");
6899            }
6900        };
6901
6902        let mut footprints = Vec::with_capacity(rows.len());
6903        let mut missing_tail_positions = HashMap::new();
6904        for (conversation_id, conversation_last_message_idx) in rows {
6905            let last_message_idx = tail_state_by_conversation
6906                .get(&conversation_id)
6907                .copied()
6908                .flatten()
6909                .or(conversation_last_message_idx);
6910            let Some(message_count) = lexical_rebuild_message_count_from_tail_idx(last_message_idx)
6911            else {
6912                missing_tail_positions.insert(conversation_id, footprints.len());
6913                footprints.push(LexicalRebuildConversationFootprintRow {
6914                    conversation_id,
6915                    message_count: 0,
6916                    message_bytes: 0,
6917                });
6918                continue;
6919            };
6920            footprints.push(lexical_rebuild_conversation_footprint_from_count(
6921                conversation_id,
6922                message_count,
6923            ));
6924        }
6925
6926        let every_footprint_was_missing_tail = missing_tail_positions.len() == footprints.len();
6927        if !missing_tail_positions.is_empty() {
6928            self.fill_missing_lexical_rebuild_footprint_tails(
6929                &mut footprints,
6930                &missing_tail_positions,
6931            )?;
6932        }
6933        if !every_footprint_was_missing_tail {
6934            self.raise_lexical_rebuild_footprints_to_exact_message_counts(&mut footprints)?;
6935        }
6936
6937        Ok(footprints)
6938    }
6939
6940    pub fn lexical_rebuild_has_tail_footprint_metadata(&self) -> Result<bool> {
6941        let total_conversations: i64 = self
6942            .conn
6943            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
6944                row.get_typed(0)
6945            })
6946            .with_context(|| "counting conversations for lexical rebuild tail metadata coverage")?;
6947        let total_conversations = usize::try_from(total_conversations.max(0)).unwrap_or(usize::MAX);
6948        if total_conversations == 0 {
6949            return Ok(true);
6950        }
6951
6952        let conversation_columns = franken_table_column_names(&self.conn, "conversations")?;
6953        let conversations_have_tail_column = conversation_columns.contains("last_message_idx");
6954        let tail_state_has_tail_column =
6955            match franken_table_column_names(&self.conn, "conversation_tail_state") {
6956                Ok(columns) => columns.contains("last_message_idx"),
6957                Err(err) if error_indicates_missing_table(&err) => false,
6958                Err(err) => {
6959                    return Err(err)
6960                        .with_context(|| "reading lexical rebuild tail-state metadata columns");
6961                }
6962            };
6963        if !conversations_have_tail_column && !tail_state_has_tail_column {
6964            return Ok(false);
6965        }
6966
6967        let covered_sql = match (conversations_have_tail_column, tail_state_has_tail_column) {
6968            (true, true) => {
6969                "SELECT COUNT(*)
6970                 FROM conversations c
6971                 LEFT JOIN conversation_tail_state ts ON ts.conversation_id = c.id
6972                 WHERE c.last_message_idx IS NOT NULL
6973                    OR ts.last_message_idx IS NOT NULL"
6974            }
6975            (true, false) => {
6976                "SELECT COUNT(*)
6977                 FROM conversations
6978                 WHERE last_message_idx IS NOT NULL"
6979            }
6980            (false, true) => {
6981                "SELECT COUNT(*)
6982                 FROM conversations c
6983                 WHERE EXISTS (
6984                     SELECT 1
6985                     FROM conversation_tail_state ts
6986                     WHERE ts.conversation_id = c.id
6987                       AND ts.last_message_idx IS NOT NULL
6988                 )"
6989            }
6990            (false, false) => unreachable!("checked before covered_sql selection"),
6991        };
6992        let covered_conversations: i64 = self
6993            .conn
6994            .query_row_map(covered_sql, fparams![], |row| row.get_typed(0))
6995            .with_context(
6996                || "counting conversations covered by lexical rebuild tail footprint metadata",
6997            )?;
6998        let covered_conversations =
6999            usize::try_from(covered_conversations.max(0)).unwrap_or(usize::MAX);
7000
7001        Ok(lexical_rebuild_tail_metadata_coverage_is_sufficient(
7002            total_conversations,
7003            covered_conversations,
7004        ))
7005    }
7006
7007    fn raise_lexical_rebuild_footprints_to_exact_message_counts(
7008        &self,
7009        footprints: &mut [LexicalRebuildConversationFootprintRow],
7010    ) -> Result<()> {
7011        if footprints.is_empty() {
7012            return Ok(());
7013        }
7014
7015        let positions_by_conversation: HashMap<i64, usize> = footprints
7016            .iter()
7017            .enumerate()
7018            .map(|(position, footprint)| (footprint.conversation_id, position))
7019            .collect();
7020        self.conn
7021            .query_with_params_for_each(
7022                "SELECT conversation_id, COUNT(*) AS message_count
7023                 FROM messages
7024                 GROUP BY conversation_id
7025                 ORDER BY conversation_id ASC",
7026                &[] as &[SqliteValue],
7027                |row| {
7028                    let conversation_id: i64 = row.get_typed(0)?;
7029                    let exact_count: i64 = row.get_typed(1)?;
7030                    let Some(position) = positions_by_conversation.get(&conversation_id) else {
7031                        return Ok(());
7032                    };
7033                    let exact_count = usize::try_from(exact_count.max(0)).unwrap_or(usize::MAX);
7034                    let footprint = &mut footprints[*position];
7035                    if exact_count > footprint.message_count {
7036                        footprint.message_count = exact_count;
7037                        footprint.message_bytes =
7038                            footprint.message_bytes.max(exact_count.saturating_mul(
7039                                LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
7040                            ));
7041                    }
7042                    Ok(())
7043                },
7044            )
7045            .with_context(|| "raising lexical rebuild footprints to exact message counts")?;
7046        Ok(())
7047    }
7048
7049    fn fill_missing_lexical_rebuild_footprint_tails(
7050        &self,
7051        footprints: &mut [LexicalRebuildConversationFootprintRow],
7052        missing_tail_positions: &HashMap<i64, usize>,
7053    ) -> Result<()> {
7054        if missing_tail_positions.len() <= LEXICAL_REBUILD_FOOTPRINT_POINT_TAIL_FALLBACK_LIMIT {
7055            for (conversation_id, position) in missing_tail_positions {
7056                let last_message_idx: Option<i64> = self
7057                    .conn
7058                    .query_row_map(
7059                        "SELECT MAX(idx) FROM messages WHERE conversation_id = ?1",
7060                        fparams![*conversation_id],
7061                        |row| row.get_typed(0),
7062                    )
7063                    .with_context(|| {
7064                        format!(
7065                            "looking up missing lexical rebuild tail estimate for conversation {conversation_id}"
7066                        )
7067                    })?;
7068                if let Some(message_count) =
7069                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7070                {
7071                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7072                        *conversation_id,
7073                        message_count,
7074                    );
7075                }
7076            }
7077            return Ok(());
7078        }
7079
7080        self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7081            footprints,
7082            missing_tail_positions,
7083            "SELECT conversation_id, MAX(idx) AS last_message_idx
7084             FROM messages INDEXED BY idx_messages_conv_idx
7085             GROUP BY conversation_id
7086             ORDER BY conversation_id ASC",
7087        )
7088        .or_else(|err| {
7089            if err
7090                .to_string()
7091                .contains("no such index: idx_messages_conv_idx")
7092            {
7093                return self.fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7094                    footprints,
7095                    missing_tail_positions,
7096                    "SELECT conversation_id, MAX(idx) AS last_message_idx
7097                     FROM messages
7098                     GROUP BY conversation_id
7099                     ORDER BY conversation_id ASC",
7100                );
7101            }
7102            Err(err)
7103        })
7104        .with_context(|| "grouping missing lexical rebuild tail estimates from messages")?;
7105
7106        Ok(())
7107    }
7108
7109    fn fill_missing_lexical_rebuild_footprint_tails_from_grouped_messages(
7110        &self,
7111        footprints: &mut [LexicalRebuildConversationFootprintRow],
7112        missing_tail_positions: &HashMap<i64, usize>,
7113        sql: &str,
7114    ) -> Result<()> {
7115        self.conn
7116            .query_with_params_for_each(sql, &[] as &[SqliteValue], |row| {
7117                let conversation_id: i64 = row.get_typed(0)?;
7118                let last_message_idx: Option<i64> = row.get_typed(1)?;
7119                let Some(position) = missing_tail_positions.get(&conversation_id) else {
7120                    return Ok(());
7121                };
7122                if let Some(message_count) =
7123                    lexical_rebuild_message_count_from_tail_idx(last_message_idx)
7124                {
7125                    footprints[*position] = lexical_rebuild_conversation_footprint_from_count(
7126                        conversation_id,
7127                        message_count,
7128                    );
7129                }
7130                Ok(())
7131            })
7132            .with_context(|| "grouping lexical rebuild missing tail estimates")
7133    }
7134
7135    /// List conversation ids in the stable order used by lexical rebuilds.
7136    pub fn list_conversation_ids_for_lexical_rebuild(&self) -> Result<Vec<i64>> {
7137        self.conn
7138            .query_map_collect(
7139                "SELECT id FROM conversations ORDER BY id ASC",
7140                fparams![],
7141                |row| row.get_typed(0),
7142            )
7143            .with_context(|| "listing conversation ids for lexical rebuild")
7144    }
7145    /// Legacy OFFSET-based traversal for one-time checkpoint migration only.
7146    ///
7147    /// New code must use `list_conversations_for_lexical_rebuild_after_id`
7148    /// for keyset pagination.
7149    pub fn list_conversations_for_lexical_rebuild_by_offset(
7150        &self,
7151        limit: i64,
7152        offset: i64,
7153        agent_slugs: &HashMap<i64, String>,
7154        workspace_paths: &HashMap<i64, PathBuf>,
7155    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7156        // Single-table query avoids the 3-table JOIN that triggers
7157        // frankensqlite's full-materialization fallback path.
7158        self.conn
7159            .query_map_collect(
7160                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7161                       started_at,
7162                       COALESCE(
7163                           (SELECT ts.ended_at
7164                            FROM conversation_tail_state ts
7165                            WHERE ts.conversation_id = conversations.id),
7166                           ended_at
7167                       ),
7168                       source_id, origin_host
7169                FROM conversations
7170                ORDER BY id ASC
7171                LIMIT ?1 OFFSET ?2",
7172                fparams![limit, offset],
7173                |row| {
7174                    let agent_id: Option<i64> = row.get_typed(1)?;
7175                    let workspace_id: Option<i64> = row.get_typed(2)?;
7176                    let source_path: String = row.get_typed(5)?;
7177                    let raw_source_id: Option<String> = row.get_typed(8)?;
7178                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7179                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7180                        raw_source_id.as_deref(),
7181                        None,
7182                        raw_origin_host.as_deref(),
7183                    );
7184                    Ok(LexicalRebuildConversationRow {
7185                        id: Some(row.get_typed(0)?),
7186                        agent_slug: agent_id
7187                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7188                            .unwrap_or_else(|| "unknown".to_string()),
7189                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7190                        external_id: row.get_typed(3)?,
7191                        title: row.get_typed(4)?,
7192                        source_path: Path::new(&source_path).to_path_buf(),
7193                        started_at: row.get_typed(6)?,
7194                        ended_at: row.get_typed(7)?,
7195                        source_id,
7196                        origin_host,
7197                    })
7198                },
7199            )
7200            .with_context(|| "listing conversations for lexical rebuild")
7201    }
7202
7203    /// List lexical rebuild conversations strictly after the given primary key.
7204    ///
7205    /// Keyset pagination keeps later rebuild pages as cheap as earlier ones,
7206    /// avoiding the ever-growing `OFFSET` scan cost during large rebuilds.
7207    pub fn list_conversations_for_lexical_rebuild_after_id(
7208        &self,
7209        limit: i64,
7210        after_conversation_id: i64,
7211        agent_slugs: &HashMap<i64, String>,
7212        workspace_paths: &HashMap<i64, PathBuf>,
7213    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7214        self.conn
7215            .query_map_collect(
7216                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7217                       started_at,
7218                       COALESCE(
7219                           (SELECT ts.ended_at
7220                            FROM conversation_tail_state ts
7221                            WHERE ts.conversation_id = conversations.id),
7222                           ended_at
7223                       ),
7224                       source_id, origin_host
7225                FROM conversations
7226                WHERE id > ?2
7227                ORDER BY id ASC
7228                LIMIT ?1",
7229                fparams![limit, after_conversation_id],
7230                |row| {
7231                    let agent_id: Option<i64> = row.get_typed(1)?;
7232                    let workspace_id: Option<i64> = row.get_typed(2)?;
7233                    let source_path: String = row.get_typed(5)?;
7234                    let raw_source_id: Option<String> = row.get_typed(8)?;
7235                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7236                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7237                        raw_source_id.as_deref(),
7238                        None,
7239                        raw_origin_host.as_deref(),
7240                    );
7241                    Ok(LexicalRebuildConversationRow {
7242                        id: Some(row.get_typed(0)?),
7243                        agent_slug: agent_id
7244                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7245                            .unwrap_or_else(|| "unknown".to_string()),
7246                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7247                        external_id: row.get_typed(3)?,
7248                        title: row.get_typed(4)?,
7249                        source_path: Path::new(&source_path).to_path_buf(),
7250                        started_at: row.get_typed(6)?,
7251                        ended_at: row.get_typed(7)?,
7252                        source_id,
7253                        origin_host,
7254                    })
7255                },
7256            )
7257            .with_context(|| {
7258                format!(
7259                    "listing conversations for lexical rebuild after id {after_conversation_id}"
7260                )
7261            })
7262    }
7263
7264    /// List lexical rebuild conversations inside an `(after_id, through_id]`
7265    /// primary-key window.
7266    ///
7267    /// This lets the rebuild producer respect planned shard boundaries without
7268    /// falling back to client-side trimming or multi-table joins.
7269    pub fn list_conversations_for_lexical_rebuild_after_id_through_id(
7270        &self,
7271        limit: i64,
7272        after_conversation_id: i64,
7273        through_conversation_id: i64,
7274        agent_slugs: &HashMap<i64, String>,
7275        workspace_paths: &HashMap<i64, PathBuf>,
7276    ) -> Result<Vec<LexicalRebuildConversationRow>> {
7277        if through_conversation_id <= after_conversation_id {
7278            return Ok(Vec::new());
7279        }
7280        self.conn
7281            .query_map_collect(
7282                r"SELECT id, agent_id, workspace_id, external_id, title, source_path,
7283                       started_at,
7284                       COALESCE(
7285                           (SELECT ts.ended_at
7286                            FROM conversation_tail_state ts
7287                            WHERE ts.conversation_id = conversations.id),
7288                           ended_at
7289                       ),
7290                       source_id, origin_host
7291                FROM conversations
7292                WHERE id > ?2 AND id <= ?3
7293                ORDER BY id ASC
7294                LIMIT ?1",
7295                fparams![limit, after_conversation_id, through_conversation_id],
7296                |row| {
7297                    let agent_id: Option<i64> = row.get_typed(1)?;
7298                    let workspace_id: Option<i64> = row.get_typed(2)?;
7299                    let source_path: String = row.get_typed(5)?;
7300                    let raw_source_id: Option<String> = row.get_typed(8)?;
7301                    let raw_origin_host: Option<String> = row.get_typed(9)?;
7302                    let (source_id, _, origin_host) = normalized_storage_source_parts(
7303                        raw_source_id.as_deref(),
7304                        None,
7305                        raw_origin_host.as_deref(),
7306                    );
7307                    Ok(LexicalRebuildConversationRow {
7308                        id: Some(row.get_typed(0)?),
7309                        agent_slug: agent_id
7310                            .and_then(|aid| agent_slugs.get(&aid).cloned())
7311                            .unwrap_or_else(|| "unknown".to_string()),
7312                        workspace: workspace_id.and_then(|wid| workspace_paths.get(&wid).cloned()),
7313                        external_id: row.get_typed(3)?,
7314                        title: row.get_typed(4)?,
7315                        source_path: Path::new(&source_path).to_path_buf(),
7316                        started_at: row.get_typed(6)?,
7317                        ended_at: row.get_typed(7)?,
7318                        source_id,
7319                        origin_host,
7320                    })
7321                },
7322            )
7323            .with_context(|| {
7324                format!(
7325                    "listing conversations for lexical rebuild after id {after_conversation_id} through id {through_conversation_id}"
7326                )
7327            })
7328    }
7329
7330    /// Fetch messages for a conversation.
7331    pub fn fetch_messages(&self, conversation_id: i64) -> Result<Vec<Message>> {
7332        let hinted_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7333             FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7334             WHERE conversation_id = ?1 ORDER BY idx";
7335        let fallback_sql = "SELECT id, idx, role, author, created_at, content, extra_json, extra_bin \
7336             FROM messages \
7337             WHERE conversation_id = ?1 ORDER BY idx";
7338
7339        self.conn
7340            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7341                let role: String = row.get_typed(2)?;
7342                Ok(Message {
7343                    id: Some(row.get_typed(0)?),
7344                    idx: row.get_typed(1)?,
7345                    role: match role.as_str() {
7346                        "user" => MessageRole::User,
7347                        "agent" | "assistant" => MessageRole::Agent,
7348                        "tool" => MessageRole::Tool,
7349                        "system" => MessageRole::System,
7350                        other => MessageRole::Other(other.to_string()),
7351                    },
7352                    author: row.get_typed(3)?,
7353                    created_at: row.get_typed(4)?,
7354                    content: row.get_typed(5)?,
7355                    extra_json: franken_read_message_extra_compat(row, 6, 7),
7356                    snippets: Vec::new(),
7357                })
7358            })
7359            .or_else(|err| {
7360                if err
7361                    .to_string()
7362                    .contains("no such index: sqlite_autoindex_messages_1")
7363                {
7364                    return self.conn.query_map_collect(
7365                        fallback_sql,
7366                        fparams![conversation_id],
7367                        |row| {
7368                            let role: String = row.get_typed(2)?;
7369                            Ok(Message {
7370                                id: Some(row.get_typed(0)?),
7371                                idx: row.get_typed(1)?,
7372                                role: match role.as_str() {
7373                                    "user" => MessageRole::User,
7374                                    "agent" | "assistant" => MessageRole::Agent,
7375                                    "tool" => MessageRole::Tool,
7376                                    "system" => MessageRole::System,
7377                                    other => MessageRole::Other(other.to_string()),
7378                                },
7379                                author: row.get_typed(3)?,
7380                                created_at: row.get_typed(4)?,
7381                                content: row.get_typed(5)?,
7382                                extra_json: franken_read_message_extra_compat(row, 6, 7),
7383                                snippets: Vec::new(),
7384                            })
7385                        },
7386                    );
7387                }
7388                Err(err)
7389            })
7390            .with_context(|| format!("fetching messages for conversation {conversation_id}"))
7391    }
7392
7393    /// Fetch messages for lexical index rebuilds without deserializing extra metadata.
7394    ///
7395    /// Tantivy only needs message text and core envelope fields, so avoiding
7396    /// `extra_json` here prevents rebuilds from rehydrating enormous historical
7397    /// payloads that are irrelevant to lexical search.
7398    pub fn fetch_messages_for_lexical_rebuild(&self, conversation_id: i64) -> Result<Vec<Message>> {
7399        let hinted_sql = "SELECT id, idx, role, author, created_at, content \
7400                 FROM messages INDEXED BY sqlite_autoindex_messages_1 \
7401                 WHERE conversation_id = ?1 ORDER BY idx";
7402        let fallback_sql = "SELECT id, idx, role, author, created_at, content \
7403                 FROM messages \
7404                 WHERE conversation_id = ?1 ORDER BY idx";
7405
7406        self.conn
7407            .query_map_collect(hinted_sql, fparams![conversation_id], |row| {
7408                let role: String = row.get_typed(2)?;
7409                Ok(Message {
7410                    id: Some(row.get_typed(0)?),
7411                    idx: row.get_typed(1)?,
7412                    role: match role.as_str() {
7413                        "user" => MessageRole::User,
7414                        "agent" | "assistant" => MessageRole::Agent,
7415                        "tool" => MessageRole::Tool,
7416                        "system" => MessageRole::System,
7417                        other => MessageRole::Other(other.to_string()),
7418                    },
7419                    author: row.get_typed(3)?,
7420                    created_at: row.get_typed(4)?,
7421                    content: row.get_typed(5)?,
7422                    extra_json: serde_json::Value::Null,
7423                    snippets: Vec::new(),
7424                })
7425            })
7426            .or_else(|err| {
7427                if err
7428                    .to_string()
7429                    .contains("no such index: sqlite_autoindex_messages_1")
7430                {
7431                    return self.conn.query_map_collect(
7432                        fallback_sql,
7433                        fparams![conversation_id],
7434                        |row| {
7435                            let role: String = row.get_typed(2)?;
7436                            Ok(Message {
7437                                id: Some(row.get_typed(0)?),
7438                                idx: row.get_typed(1)?,
7439                                role: match role.as_str() {
7440                                    "user" => MessageRole::User,
7441                                    "agent" | "assistant" => MessageRole::Agent,
7442                                    "tool" => MessageRole::Tool,
7443                                    "system" => MessageRole::System,
7444                                    other => MessageRole::Other(other.to_string()),
7445                                },
7446                                author: row.get_typed(3)?,
7447                                created_at: row.get_typed(4)?,
7448                                content: row.get_typed(5)?,
7449                                extra_json: serde_json::Value::Null,
7450                                snippets: Vec::new(),
7451                            })
7452                        },
7453                    );
7454                }
7455                Err(err)
7456            })
7457            .with_context(|| {
7458                format!("fetching messages for lexical rebuild of conversation {conversation_id}")
7459            })
7460    }
7461
7462    /// Fetch messages for multiple conversations during lexical rebuilds.
7463    ///
7464    /// This preserves the lightweight lexical-rebuild projection while avoiding
7465    /// one round-trip per conversation when rebuilding large canonical indexes.
7466    pub fn fetch_messages_for_lexical_rebuild_batch(
7467        &self,
7468        conversation_ids: &[i64],
7469        max_messages: Option<usize>,
7470        max_content_bytes: Option<usize>,
7471    ) -> Result<HashMap<i64, Vec<Message>>> {
7472        if conversation_ids.is_empty() {
7473            return Ok(HashMap::new());
7474        }
7475
7476        let mut grouped: HashMap<i64, Vec<Message>> =
7477            HashMap::with_capacity(conversation_ids.len());
7478        let mut fetched_conversation_ids = HashSet::with_capacity(conversation_ids.len());
7479        let mut total_messages = 0usize;
7480        let mut total_content_bytes = 0usize;
7481
7482        // The apparent single-query shape (`WHERE conversation_id IN (...) ORDER BY ...`)
7483        // is a bad frankensqlite plan for large live databases: it can
7484        // materialize far more of `messages` than the requested conversations.
7485        // Reuse the hinted per-conversation primary-key lookup instead.
7486        for conversation_id in conversation_ids {
7487            if !fetched_conversation_ids.insert(*conversation_id) {
7488                continue;
7489            }
7490
7491            let messages = self
7492                .fetch_messages_for_lexical_rebuild(*conversation_id)
7493                .with_context(|| {
7494                    format!("fetching lexical rebuild messages for conversation {conversation_id}")
7495                })?;
7496            total_messages = total_messages.saturating_add(messages.len());
7497            if let Some(limit) = max_messages
7498                && total_messages > limit
7499            {
7500                return Err(anyhow!(
7501                    "lexical rebuild batch fetch exceeded message guardrail: messages={total_messages} limit={limit} conversations={}",
7502                    conversation_ids.len()
7503                ));
7504            }
7505
7506            let message_bytes = messages
7507                .iter()
7508                .map(|message| message.content.len())
7509                .sum::<usize>();
7510            total_content_bytes = total_content_bytes.saturating_add(message_bytes);
7511            if let Some(limit) = max_content_bytes
7512                && total_content_bytes > limit
7513            {
7514                return Err(anyhow!(
7515                    "lexical rebuild batch fetch exceeded content-byte guardrail: bytes={total_content_bytes} limit={limit} conversations={}",
7516                    conversation_ids.len()
7517                ));
7518            }
7519
7520            if !messages.is_empty() {
7521                grouped.insert(*conversation_id, messages);
7522            }
7523        }
7524
7525        Ok(grouped)
7526    }
7527
7528    /// Stream lexical rebuild message rows in `(conversation_id, idx)` order
7529    /// without materializing the full result set.
7530    pub fn stream_messages_for_lexical_rebuild_between_conversation_ids<F>(
7531        &self,
7532        start_conversation_id: i64,
7533        end_conversation_id: i64,
7534        mut f: F,
7535    ) -> Result<()>
7536    where
7537        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7538    {
7539        if end_conversation_id < start_conversation_id {
7540            return Ok(());
7541        }
7542
7543        let conversation_ids: Vec<i64> = self
7544            .conn
7545            .query_map_collect(
7546                "SELECT id FROM conversations WHERE id >= ?1 AND id <= ?2 ORDER BY id ASC",
7547                fparams![start_conversation_id, end_conversation_id],
7548                |row| row.get_typed(0),
7549            )
7550            .with_context(|| "listing conversation ids for streamed lexical rebuild")?;
7551
7552        for conversation_id in conversation_ids {
7553            let messages = self
7554                .fetch_messages_for_lexical_rebuild(conversation_id)
7555                .with_context(|| {
7556                    format!("streaming lexical rebuild messages for conversation {conversation_id}")
7557                })?;
7558
7559            for message in messages {
7560                let message_id = message.id.ok_or_else(|| {
7561                    anyhow!(
7562                        "lexical rebuild message missing id for conversation {conversation_id} idx {}",
7563                        message.idx
7564                    )
7565                })?;
7566                f(LexicalRebuildMessageRow {
7567                    conversation_id,
7568                    id: message_id,
7569                    idx: message.idx,
7570                    role: role_str(&message.role),
7571                    author: message.author,
7572                    created_at: message.created_at,
7573                    content: message.content,
7574                })?;
7575            }
7576        }
7577
7578        Ok(())
7579    }
7580
7581    /// Stream grouped lexical rebuild message rows in `(conversation_id, idx)`
7582    /// order by reusing the canonical per-message stream and coalescing rows
7583    /// per conversation.
7584    pub fn stream_grouped_messages_for_lexical_rebuild_between_conversation_ids<F>(
7585        &self,
7586        start_conversation_id: i64,
7587        end_conversation_id: i64,
7588        mut f: F,
7589    ) -> Result<()>
7590    where
7591        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7592    {
7593        if end_conversation_id < start_conversation_id {
7594            return Ok(());
7595        }
7596
7597        let mut current_conversation_id: Option<i64> = None;
7598        let mut current_messages: LexicalRebuildGroupedMessageRows = SmallVec::new();
7599        let mut current_last_message_id = 0i64;
7600        let mut flush_current = |current_conversation_id: &mut Option<i64>,
7601                                 current_messages: &mut LexicalRebuildGroupedMessageRows,
7602                                 current_last_message_id: &mut i64|
7603         -> Result<()> {
7604            let Some(conversation_id) = current_conversation_id.take() else {
7605                return Ok(());
7606            };
7607            let messages = std::mem::take(current_messages);
7608            let last_message_id = std::mem::take(current_last_message_id);
7609            f(conversation_id, messages, last_message_id)
7610        };
7611
7612        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7613            start_conversation_id,
7614            end_conversation_id,
7615            |row| {
7616                if current_conversation_id != Some(row.conversation_id) {
7617                    flush_current(
7618                        &mut current_conversation_id,
7619                        &mut current_messages,
7620                        &mut current_last_message_id,
7621                    )?;
7622                    current_conversation_id = Some(row.conversation_id);
7623                }
7624                current_last_message_id = row.id;
7625                current_messages.push(LexicalRebuildGroupedMessageRow {
7626                    idx: row.idx,
7627                    is_tool_role: row.role == "tool",
7628                    created_at: row.created_at,
7629                    content: row.content,
7630                });
7631                Ok(())
7632            },
7633        )
7634        .with_context(|| "streaming grouped lexical rebuild messages")?;
7635
7636        flush_current(
7637            &mut current_conversation_id,
7638            &mut current_messages,
7639            &mut current_last_message_id,
7640        )
7641        .with_context(|| "flushing grouped lexical rebuild messages")
7642    }
7643
7644    /// Stream grouped lexical rebuild message rows from a starting conversation
7645    /// id to the end of the table.
7646    pub fn stream_grouped_messages_for_lexical_rebuild_from_conversation_id<F>(
7647        &self,
7648        start_conversation_id: i64,
7649        f: F,
7650    ) -> Result<()>
7651    where
7652        F: FnMut(i64, LexicalRebuildGroupedMessageRows, i64) -> Result<()>,
7653    {
7654        self.stream_grouped_messages_for_lexical_rebuild_between_conversation_ids(
7655            start_conversation_id,
7656            i64::MAX,
7657            f,
7658        )
7659    }
7660
7661    /// Stream lexical rebuild message rows from a starting conversation id to
7662    /// the end of the table.
7663    pub fn stream_messages_for_lexical_rebuild_from_conversation_id<F>(
7664        &self,
7665        start_conversation_id: i64,
7666        f: F,
7667    ) -> Result<()>
7668    where
7669        F: FnMut(LexicalRebuildMessageRow) -> Result<()>,
7670    {
7671        self.stream_messages_for_lexical_rebuild_between_conversation_ids(
7672            start_conversation_id,
7673            i64::MAX,
7674            f,
7675        )
7676    }
7677
7678    /// Get a source by ID.
7679    pub fn get_source(&self, id: &str) -> Result<Option<Source>> {
7680        let result = self.conn.query_row_map(
7681            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources WHERE id = ?1",
7682            fparams![id],
7683            |row| {
7684                let kind_str: String = row.get_typed(1)?;
7685                let config_json_str: Option<String> = row.get_typed(5)?;
7686                Ok(Source {
7687                    id: row.get_typed(0)?,
7688                    kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7689                    host_label: row.get_typed(2)?,
7690                    machine_id: row.get_typed(3)?,
7691                    platform: row.get_typed(4)?,
7692                    config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7693                    created_at: row.get_typed(6)?,
7694                    updated_at: row.get_typed(7)?,
7695                })
7696            },
7697        );
7698        Ok(result.optional()?)
7699    }
7700
7701    /// List all sources.
7702    pub fn list_sources(&self) -> Result<Vec<Source>> {
7703        self.conn
7704            .query_map_collect(
7705                "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at FROM sources ORDER BY id",
7706                fparams![],
7707                |row| {
7708                    let kind_str: String = row.get_typed(1)?;
7709                    let config_json_str: Option<String> = row.get_typed(5)?;
7710                    Ok(Source {
7711                        id: row.get_typed(0)?,
7712                        kind: SourceKind::parse(&kind_str).unwrap_or_default(),
7713                        host_label: row.get_typed(2)?,
7714                        machine_id: row.get_typed(3)?,
7715                        platform: row.get_typed(4)?,
7716                        config_json: config_json_str.and_then(|s| serde_json::from_str(&s).ok()),
7717                        created_at: row.get_typed(6)?,
7718                        updated_at: row.get_typed(7)?,
7719                    })
7720                },
7721            )
7722            .with_context(|| "listing sources")
7723    }
7724
7725    /// Get IDs of all non-local sources.
7726    pub fn get_source_ids(&self) -> Result<Vec<String>> {
7727        self.conn
7728            .query_map_collect(
7729                "SELECT id FROM sources WHERE id != 'local' ORDER BY id",
7730                fparams![],
7731                |row| row.get_typed(0),
7732            )
7733            .with_context(|| "listing source ids")
7734    }
7735
7736    /// Create or update a source.
7737    pub fn upsert_source(&self, source: &Source) -> Result<()> {
7738        self.invalidate_conversation_source_cache(source.id.as_str());
7739        let now = Self::now_millis();
7740        let kind_str = source.kind.to_string();
7741        let config_json_str = source
7742            .config_json
7743            .as_ref()
7744            .map(serde_json::to_string)
7745            .transpose()?;
7746
7747        // Re-indexing commonly reuses the same normalized source metadata
7748        // across many conversations. Skip the write entirely when the row is
7749        // already identical so we avoid needless WAL churn and timestamp bumps.
7750        self.conn.execute_compat(
7751            "INSERT INTO sources(id, kind, host_label, machine_id, platform, config_json, created_at, updated_at)
7752             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)
7753             ON CONFLICT(id) DO UPDATE SET
7754                 kind = excluded.kind,
7755                 host_label = excluded.host_label,
7756                 machine_id = excluded.machine_id,
7757                 platform = excluded.platform,
7758                 config_json = excluded.config_json,
7759                 updated_at = excluded.updated_at
7760             WHERE NOT (
7761                 sources.kind IS excluded.kind
7762                 AND sources.host_label IS excluded.host_label
7763                 AND sources.machine_id IS excluded.machine_id
7764                 AND sources.platform IS excluded.platform
7765                 AND sources.config_json IS excluded.config_json
7766             )",
7767            fparams![
7768                source.id.as_str(),
7769                kind_str.as_str(),
7770                source.host_label.as_deref(),
7771                source.machine_id.as_deref(),
7772                source.platform.as_deref(),
7773                config_json_str.as_deref(),
7774                source.created_at.unwrap_or(now),
7775                now
7776            ],
7777        )?;
7778        Ok(())
7779    }
7780
7781    fn historical_bundle_key_hash(
7782        version: u32,
7783        bundle: &HistoricalDatabaseBundle,
7784        include_bundle_stats: bool,
7785    ) -> String {
7786        let signature = if include_bundle_stats {
7787            format!(
7788                "{}:{}:{}:{}",
7789                version,
7790                bundle.root_path.display(),
7791                bundle.total_bytes,
7792                bundle.modified_at_ms
7793            )
7794        } else {
7795            format!("{}:{}", version, bundle.root_path.display())
7796        };
7797        blake3::hash(signature.as_bytes()).to_hex().to_string()
7798    }
7799
7800    fn historical_bundle_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7801        format!(
7802            "historical_bundle_salvaged:{}",
7803            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_LEDGER_VERSION, bundle, false)
7804        )
7805    }
7806
7807    fn historical_bundle_legacy_meta_key(bundle: &HistoricalDatabaseBundle) -> String {
7808        let signature = format!(
7809            "{}:{}:{}:{}",
7810            HISTORICAL_SALVAGE_LEDGER_VERSION,
7811            bundle.root_path.display(),
7812            bundle.total_bytes,
7813            bundle.modified_at_ms
7814        );
7815        format!(
7816            "historical_bundle_salvaged:{}",
7817            blake3::hash(signature.as_bytes()).to_hex()
7818        )
7819    }
7820
7821    fn historical_bundle_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
7822        format!(
7823            "historical_bundle_progress:{}",
7824            Self::historical_bundle_key_hash(HISTORICAL_SALVAGE_PROGRESS_VERSION, bundle, false)
7825        )
7826    }
7827
7828    fn historical_bundle_legacy_progress_key(bundle: &HistoricalDatabaseBundle) -> String {
7829        let signature = format!(
7830            "{}:{}:{}:{}",
7831            HISTORICAL_SALVAGE_PROGRESS_VERSION,
7832            bundle.root_path.display(),
7833            bundle.total_bytes,
7834            bundle.modified_at_ms
7835        );
7836        format!(
7837            "historical_bundle_progress:{}",
7838            blake3::hash(signature.as_bytes()).to_hex()
7839        )
7840    }
7841
7842    fn historical_bundle_already_imported(
7843        &self,
7844        bundle: &HistoricalDatabaseBundle,
7845    ) -> Result<bool> {
7846        for key in [
7847            Self::historical_bundle_meta_key(bundle),
7848            Self::historical_bundle_legacy_meta_key(bundle),
7849        ] {
7850            let existing: Option<String> = self
7851                .conn
7852                .query_row_map(
7853                    "SELECT value FROM meta WHERE key = ?1",
7854                    fparams![key.as_str()],
7855                    |row| row.get_typed(0),
7856                )
7857                .optional()?;
7858            if existing.is_some() {
7859                return Ok(true);
7860            }
7861        }
7862        Ok(false)
7863    }
7864
7865    pub(crate) fn has_pending_historical_bundles(&self, canonical_db_path: &Path) -> Result<bool> {
7866        for bundle in discover_historical_database_bundles(canonical_db_path) {
7867            if !self.historical_bundle_already_imported(&bundle)? {
7868                return Ok(true);
7869            }
7870        }
7871        Ok(false)
7872    }
7873
7874    fn load_historical_bundle_progress(
7875        &self,
7876        bundle: &HistoricalDatabaseBundle,
7877    ) -> Result<Option<HistoricalBundleProgress>> {
7878        for key in [
7879            Self::historical_bundle_progress_key(bundle),
7880            Self::historical_bundle_legacy_progress_key(bundle),
7881        ] {
7882            let raw: Option<String> = self
7883                .conn
7884                .query_row_map(
7885                    "SELECT value FROM meta WHERE key = ?1",
7886                    fparams![key.as_str()],
7887                    |row| row.get_typed(0),
7888                )
7889                .optional()?;
7890            let Some(raw) = raw else {
7891                continue;
7892            };
7893            let parsed: HistoricalBundleProgress =
7894                serde_json::from_str(&raw).with_context(|| {
7895                    format!(
7896                        "parsing historical salvage progress checkpoint for {}",
7897                        bundle.root_path.display()
7898                    )
7899                })?;
7900            if parsed.progress_version == HISTORICAL_SALVAGE_PROGRESS_VERSION {
7901                return Ok(Some(parsed));
7902            }
7903        }
7904        Ok(None)
7905    }
7906
7907    fn record_historical_bundle_progress(
7908        &self,
7909        bundle: &HistoricalDatabaseBundle,
7910        method: &str,
7911        last_completed_source_row_id: i64,
7912        conversations_imported: usize,
7913        messages_imported: usize,
7914    ) -> Result<()> {
7915        let key = Self::historical_bundle_progress_key(bundle);
7916        let value = HistoricalBundleProgress {
7917            progress_version: HISTORICAL_SALVAGE_PROGRESS_VERSION,
7918            path: bundle.root_path.display().to_string(),
7919            bytes: bundle.total_bytes,
7920            modified_at_ms: bundle.modified_at_ms,
7921            method: method.to_string(),
7922            last_completed_source_row_id,
7923            conversations_imported,
7924            messages_imported,
7925            updated_at_ms: Self::now_millis(),
7926        };
7927        let value_str = serde_json::to_string(&value)?;
7928        self.conn.execute_compat(
7929            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7930            fparams![key.as_str(), value_str.as_str()],
7931        )?;
7932        Ok(())
7933    }
7934
7935    fn clear_historical_bundle_progress(&self, bundle: &HistoricalDatabaseBundle) -> Result<()> {
7936        for key in [
7937            Self::historical_bundle_progress_key(bundle),
7938            Self::historical_bundle_legacy_progress_key(bundle),
7939        ] {
7940            self.conn
7941                .execute_compat("DELETE FROM meta WHERE key = ?1", fparams![key.as_str()])?;
7942        }
7943        Ok(())
7944    }
7945
7946    fn record_historical_bundle_import(
7947        &self,
7948        bundle: &HistoricalDatabaseBundle,
7949        method: &str,
7950        conversations_imported: usize,
7951        messages_imported: usize,
7952    ) -> Result<()> {
7953        let key = Self::historical_bundle_meta_key(bundle);
7954        let value = serde_json::json!({
7955            "salvage_version": HISTORICAL_SALVAGE_LEDGER_VERSION,
7956            "path": bundle.root_path.display().to_string(),
7957            "bytes": bundle.total_bytes,
7958            "modified_at_ms": bundle.modified_at_ms,
7959            "method": method,
7960            "conversations_imported": conversations_imported,
7961            "messages_imported": messages_imported,
7962            "recorded_at_ms": Self::now_millis(),
7963        });
7964        let value_str = serde_json::to_string(&value)?;
7965        self.conn.execute_compat(
7966            "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
7967            fparams![key.as_str(), value_str.as_str()],
7968        )?;
7969        Ok(())
7970    }
7971
7972    fn historical_import_error_is_split_retryable(err: &anyhow::Error) -> bool {
7973        const RETRYABLE_PATTERNS: &[&str] = &[
7974            "out of memory",
7975            "string or blob too big",
7976            "too many sql variables",
7977        ];
7978        err.chain().any(|cause| {
7979            let rendered = cause.to_string().to_ascii_lowercase();
7980            RETRYABLE_PATTERNS
7981                .iter()
7982                .any(|pattern| rendered.contains(pattern))
7983        })
7984    }
7985
7986    fn split_historical_batch_entry_messages(
7987        entry: &HistoricalBatchEntry,
7988    ) -> Option<(HistoricalBatchEntry, HistoricalBatchEntry)> {
7989        if entry.conversation.messages.len() < 2 {
7990            return None;
7991        }
7992        let split_at = entry.conversation.messages.len() / 2;
7993        if split_at == 0 || split_at >= entry.conversation.messages.len() {
7994            return None;
7995        }
7996
7997        let mut left = entry.clone();
7998        left.conversation.messages = entry.conversation.messages[..split_at].to_vec();
7999
8000        let mut right = entry.clone();
8001        right.conversation.messages = entry.conversation.messages[split_at..].to_vec();
8002
8003        Some((left, right))
8004    }
8005
8006    fn import_historical_batch_with_retry<F>(
8007        entries: &[HistoricalBatchEntry],
8008        insert_batch: &mut F,
8009    ) -> Result<HistoricalBatchImportTotals>
8010    where
8011        F: FnMut(&[HistoricalBatchEntry]) -> Result<HistoricalBatchImportTotals>,
8012    {
8013        match insert_batch(entries) {
8014            Ok(totals) => Ok(totals),
8015            Err(err) if Self::historical_import_error_is_split_retryable(&err) => {
8016                if entries.len() > 1 {
8017                    let mid = entries.len() / 2;
8018                    tracing::warn!(
8019                        batch_entries = entries.len(),
8020                        split_left = mid,
8021                        split_right = entries.len() - mid,
8022                        error = %err,
8023                        "historical salvage batch failed; retrying in smaller sub-batches"
8024                    );
8025                    let left =
8026                        Self::import_historical_batch_with_retry(&entries[..mid], insert_batch)?;
8027                    let right =
8028                        Self::import_historical_batch_with_retry(&entries[mid..], insert_batch)?;
8029                    return Ok(HistoricalBatchImportTotals {
8030                        inserted_source_rows: left.inserted_source_rows
8031                            + right.inserted_source_rows,
8032                        inserted_messages: left.inserted_messages + right.inserted_messages,
8033                    });
8034                }
8035
8036                if let Some(entry) = entries.first()
8037                    && let Some((left, right)) = Self::split_historical_batch_entry_messages(entry)
8038                {
8039                    tracing::warn!(
8040                        source_row_id = entry.source_row_id,
8041                        message_count = entry.conversation.messages.len(),
8042                        error = %err,
8043                        "historical salvage conversation failed; retrying in smaller message slices"
8044                    );
8045                    let left_totals = Self::import_historical_batch_with_retry(
8046                        std::slice::from_ref(&left),
8047                        insert_batch,
8048                    )?;
8049                    let right_totals = Self::import_historical_batch_with_retry(
8050                        std::slice::from_ref(&right),
8051                        insert_batch,
8052                    )?;
8053                    return Ok(HistoricalBatchImportTotals {
8054                        inserted_source_rows: usize::from(
8055                            left_totals.inserted_source_rows > 0
8056                                || right_totals.inserted_source_rows > 0,
8057                        ),
8058                        inserted_messages: left_totals
8059                            .inserted_messages
8060                            .saturating_add(right_totals.inserted_messages),
8061                    });
8062                }
8063
8064                Err(err)
8065            }
8066            Err(err) => Err(err),
8067        }
8068    }
8069
8070    fn import_historical_sources(&self, source_conn: &FrankenConnection) -> Result<()> {
8071        let sources: Vec<Source> = match source_conn.query_map_collect(
8072            "SELECT id, kind, host_label, machine_id, platform, config_json, created_at, updated_at
8073             FROM sources",
8074            fparams![],
8075            |row| {
8076                let raw_source_id: String = row.get_typed(0)?;
8077                let kind_str: String = row.get_typed(1)?;
8078                let raw_host_label: Option<String> = row.get_typed(2)?;
8079                let config_json_raw: Option<String> = row.get_typed(5)?;
8080                let (source_id, source_kind, host_label) = normalized_storage_source_parts(
8081                    Some(raw_source_id.as_str()),
8082                    Some(kind_str.as_str()),
8083                    raw_host_label.as_deref(),
8084                );
8085                Ok(Source {
8086                    id: source_id,
8087                    kind: source_kind,
8088                    host_label,
8089                    machine_id: row.get_typed(3)?,
8090                    platform: row.get_typed(4)?,
8091                    config_json: config_json_raw.and_then(|raw| serde_json::from_str(&raw).ok()),
8092                    created_at: row.get_typed(6)?,
8093                    updated_at: row.get_typed(7)?,
8094                })
8095            },
8096        ) {
8097            Ok(rows) => rows,
8098            Err(err) => {
8099                tracing::warn!(error = %err, "historical sources table unavailable; skipping source import");
8100                return Ok(());
8101            }
8102        };
8103
8104        for source in sources {
8105            self.upsert_source(&source)?;
8106        }
8107        Ok(())
8108    }
8109
8110    fn import_historical_conversations(
8111        &self,
8112        bundle: &HistoricalDatabaseBundle,
8113        salvage_method: &str,
8114        source_conn: &FrankenConnection,
8115    ) -> Result<(usize, usize)> {
8116        let batch_limits = historical_import_batch_limits();
8117        let cache_enabled = IndexingCache::is_enabled();
8118        let mut indexing_cache = IndexingCache::new();
8119        let mut known_sources: HashSet<String> = self
8120            .list_sources()?
8121            .into_iter()
8122            .map(|source| source.id)
8123            .collect();
8124        let resume_progress = self.load_historical_bundle_progress(bundle)?;
8125        let resume_after_row_id = resume_progress
8126            .as_ref()
8127            .map(|progress| progress.last_completed_source_row_id)
8128            .filter(|row_id| *row_id > 0);
8129
8130        tracing::info!(
8131            target: "cass::historical_salvage",
8132            batch_conversations = batch_limits.conversations,
8133            batch_messages = batch_limits.messages,
8134            batch_payload_chars = batch_limits.payload_chars,
8135            cache_enabled,
8136            resume_after_row_id,
8137            "configured historical salvage batch limits"
8138        );
8139
8140        if let Some(progress) = &resume_progress {
8141            tracing::info!(
8142                target: "cass::historical_salvage",
8143                path = %bundle.root_path.display(),
8144                resume_after_row_id = progress.last_completed_source_row_id,
8145                prior_conversations_imported = progress.conversations_imported,
8146                prior_messages_imported = progress.messages_imported,
8147                "resuming historical salvage bundle from durable checkpoint"
8148            );
8149        }
8150
8151        // LEFT JOIN + COALESCE on agents so legacy source databases with NULL
8152        // agent_id (the V1 schema did not require NOT NULL) still have their
8153        // conversations imported, degrading to 'unknown' slug like the other
8154        // rebuild paths.  Using INNER JOIN here would silently drop those
8155        // conversations during historical salvage, which is data loss.
8156        let conv_sql = if resume_after_row_id.is_some() {
8157            "SELECT
8158                c.id,
8159                COALESCE(a.slug, 'unknown'),
8160                w.path,
8161                c.external_id,
8162                c.title,
8163                c.source_path,
8164                c.started_at,
8165                c.ended_at,
8166                c.approx_tokens,
8167                c.metadata_json,
8168                c.source_id,
8169                c.origin_host
8170             FROM conversations c
8171             LEFT JOIN agents a ON c.agent_id = a.id
8172             LEFT JOIN workspaces w ON c.workspace_id = w.id
8173             WHERE c.id > ?1
8174             ORDER BY c.id"
8175        } else {
8176            "SELECT
8177                c.id,
8178                COALESCE(a.slug, 'unknown'),
8179                w.path,
8180                c.external_id,
8181                c.title,
8182                c.source_path,
8183                c.started_at,
8184                c.ended_at,
8185                c.approx_tokens,
8186                c.metadata_json,
8187                c.source_id,
8188                c.origin_host
8189             FROM conversations c
8190             LEFT JOIN agents a ON c.agent_id = a.id
8191             LEFT JOIN workspaces w ON c.workspace_id = w.id
8192             ORDER BY c.id"
8193        };
8194        let conv_params: &[ParamValue] =
8195            if let Some(last_completed_source_row_id) = resume_after_row_id {
8196                &[ParamValue::from(last_completed_source_row_id)]
8197            } else {
8198                &[]
8199            };
8200
8201        #[allow(clippy::type_complexity)]
8202        let conv_rows: Vec<(
8203            i64,
8204            String,
8205            Option<String>,
8206            Option<String>,
8207            Option<String>,
8208            String,
8209            Option<i64>,
8210            Option<i64>,
8211            Option<i64>,
8212            Option<String>,
8213            Option<String>,
8214            Option<String>,
8215        )> = source_conn
8216            .query_map_collect(conv_sql, conv_params, |row| {
8217                Ok((
8218                    row.get_typed::<i64>(0)?,
8219                    row.get_typed::<String>(1)?,
8220                    row.get_typed::<Option<String>>(2)?,
8221                    row.get_typed::<Option<String>>(3)?,
8222                    row.get_typed::<Option<String>>(4)?,
8223                    row.get_typed::<String>(5)?,
8224                    row.get_typed::<Option<i64>>(6)?,
8225                    row.get_typed::<Option<i64>>(7)?,
8226                    row.get_typed::<Option<i64>>(8)?,
8227                    row.get_typed::<Option<String>>(9)?,
8228                    row.get_typed::<Option<String>>(10)?,
8229                    row.get_typed::<Option<String>>(11)?,
8230                ))
8231            })
8232            .context("querying historical conversations")?;
8233
8234        let msg_sql = "SELECT idx, role, author, created_at, content, extra_json
8235             FROM messages
8236             WHERE conversation_id = ?1
8237             ORDER BY idx";
8238
8239        let mut imported_conversations = resume_progress
8240            .as_ref()
8241            .map(|progress| progress.conversations_imported)
8242            .unwrap_or(0);
8243        let mut imported_messages = resume_progress
8244            .as_ref()
8245            .map(|progress| progress.messages_imported)
8246            .unwrap_or(0);
8247        let mut pending_batch: Vec<HistoricalBatchEntry> = Vec::new();
8248        let mut pending_batch_messages = 0usize;
8249        let mut pending_batch_chars = 0usize;
8250        let mut pending_batch_first_row_id: Option<i64> = None;
8251        let mut pending_batch_last_row_id: Option<i64> = None;
8252
8253        let flush_batch = |storage: &FrankenStorage,
8254                           batch: &mut Vec<HistoricalBatchEntry>,
8255                           pending_messages: &mut usize,
8256                           pending_chars: &mut usize,
8257                           first_row_id: &mut Option<i64>,
8258                           last_row_id: &mut Option<i64>,
8259                           imported_conversations: &mut usize,
8260                           imported_messages: &mut usize|
8261         -> Result<()> {
8262            if batch.is_empty() {
8263                return Ok(());
8264            }
8265
8266            let batch_first_row_id = *first_row_id;
8267            let batch_last_row_id = *last_row_id;
8268            if historical_salvage_debug_enabled() {
8269                eprintln!(
8270                    "[historical-salvage] flushing batch rows {:?}..{:?} conversations={} messages={} payload_chars={}",
8271                    batch_first_row_id,
8272                    batch_last_row_id,
8273                    batch.len(),
8274                    *pending_messages,
8275                    *pending_chars
8276                );
8277            }
8278            tracing::info!(
8279                target: "cass::historical_salvage",
8280                batch_conversations = batch.len(),
8281                batch_messages = *pending_messages,
8282                batch_payload_chars = *pending_chars,
8283                first_source_row_id = batch_first_row_id,
8284                last_source_row_id = batch_last_row_id,
8285                "flushing historical salvage batch"
8286            );
8287
8288            let mut insert_batch =
8289                |entries: &[HistoricalBatchEntry]| -> Result<HistoricalBatchImportTotals> {
8290                    let borrowed_batch: Vec<(i64, Option<i64>, &Conversation)> = entries
8291                        .iter()
8292                        .map(|entry| (entry.agent_id, entry.workspace_id, &entry.conversation))
8293                        .collect();
8294                    let outcomes = storage
8295                        .insert_conversations_batched(&borrowed_batch)
8296                        .with_context(|| {
8297                            let first_source_row_id =
8298                                entries.first().map(|entry| entry.source_row_id);
8299                            let last_source_row_id =
8300                                entries.last().map(|entry| entry.source_row_id);
8301                            format!(
8302                                "inserting historical salvage batch source rows {:?}..{:?}",
8303                                first_source_row_id, last_source_row_id
8304                            )
8305                        })?;
8306                    let mut totals = HistoricalBatchImportTotals::default();
8307                    for outcome in outcomes {
8308                        if !outcome.inserted_indices.is_empty() {
8309                            totals.inserted_source_rows += 1;
8310                            totals.inserted_messages += outcome.inserted_indices.len();
8311                        }
8312                    }
8313                    Ok(totals)
8314                };
8315            let totals =
8316                Self::import_historical_batch_with_retry(batch.as_slice(), &mut insert_batch)?;
8317            *imported_conversations =
8318                (*imported_conversations).saturating_add(totals.inserted_source_rows);
8319            *imported_messages = (*imported_messages).saturating_add(totals.inserted_messages);
8320            if let Some(last_completed_row_id) = batch_last_row_id {
8321                storage.record_historical_bundle_progress(
8322                    bundle,
8323                    salvage_method,
8324                    last_completed_row_id,
8325                    *imported_conversations,
8326                    *imported_messages,
8327                )?;
8328            }
8329            tracing::info!(
8330                target: "cass::historical_salvage",
8331                batch_conversations = batch.len(),
8332                batch_messages = *pending_messages,
8333                imported_conversations = *imported_conversations,
8334                imported_messages = *imported_messages,
8335                first_source_row_id = batch_first_row_id,
8336                last_source_row_id = batch_last_row_id,
8337                "historical salvage batch committed"
8338            );
8339            if historical_salvage_debug_enabled() {
8340                eprintln!(
8341                    "[historical-salvage] committed batch rows {:?}..{:?} imported_conversations={} imported_messages={}",
8342                    batch_first_row_id,
8343                    batch_last_row_id,
8344                    *imported_conversations,
8345                    *imported_messages
8346                );
8347            }
8348            batch.clear();
8349            *pending_messages = 0;
8350            *pending_chars = 0;
8351            *first_row_id = None;
8352            *last_row_id = None;
8353            Ok(())
8354        };
8355
8356        for (
8357            conversation_row_id,
8358            agent_slug,
8359            workspace_path,
8360            external_id,
8361            title,
8362            source_path,
8363            started_at,
8364            ended_at,
8365            approx_tokens,
8366            metadata_json_raw,
8367            raw_source_id,
8368            raw_origin_host,
8369        ) in conv_rows
8370        {
8371            let source_id = crate::search::tantivy::normalized_index_source_id(
8372                raw_source_id.as_deref(),
8373                None,
8374                raw_origin_host.as_deref(),
8375            );
8376            let origin_host =
8377                crate::search::tantivy::normalized_index_origin_host(raw_origin_host.as_deref());
8378
8379            let messages: Vec<Message> = source_conn
8380                .query_map_collect(msg_sql, fparams![conversation_row_id], |msg_row| {
8381                    let role: String = msg_row.get_typed(1)?;
8382                    Ok(Message {
8383                        id: None,
8384                        idx: msg_row.get_typed(0)?,
8385                        role: match role.as_str() {
8386                            "user" => MessageRole::User,
8387                            "agent" | "assistant" => MessageRole::Agent,
8388                            "tool" => MessageRole::Tool,
8389                            "system" => MessageRole::System,
8390                            other => MessageRole::Other(other.to_string()),
8391                        },
8392                        author: msg_row.get_typed(2)?,
8393                        created_at: msg_row.get_typed(3)?,
8394                        content: msg_row.get_typed(4)?,
8395                        extra_json: parse_historical_json_column(msg_row.get_typed(5)?),
8396                        snippets: Vec::new(),
8397                    })
8398                })
8399                .context("collecting historical message rows")?;
8400
8401            if messages.is_empty() {
8402                continue;
8403            }
8404
8405            let conversation_message_count = messages.len();
8406            let conversation_chars = messages
8407                .iter()
8408                .map(message_payload_size_hint)
8409                .sum::<usize>();
8410
8411            let conversation = Conversation {
8412                id: None,
8413                agent_slug: agent_slug.clone(),
8414                workspace: workspace_path.map(PathBuf::from),
8415                external_id,
8416                title,
8417                source_path: PathBuf::from(source_path),
8418                started_at,
8419                ended_at,
8420                approx_tokens,
8421                metadata_json: parse_json_column(metadata_json_raw),
8422                messages,
8423                source_id,
8424                origin_host,
8425            };
8426
8427            if !known_sources.contains(&conversation.source_id) {
8428                let placeholder = if conversation.source_id == LOCAL_SOURCE_ID {
8429                    Source::local()
8430                } else {
8431                    Source {
8432                        id: conversation.source_id.clone(),
8433                        kind: SourceKind::Ssh,
8434                        host_label: conversation.origin_host.clone(),
8435                        machine_id: None,
8436                        platform: None,
8437                        config_json: None,
8438                        created_at: None,
8439                        updated_at: None,
8440                    }
8441                };
8442                self.upsert_source(&placeholder)?;
8443                known_sources.insert(conversation.source_id.clone());
8444            }
8445
8446            let agent = Agent {
8447                id: None,
8448                slug: agent_slug.clone(),
8449                name: agent_slug,
8450                version: None,
8451                kind: AgentKind::Cli,
8452            };
8453            let agent_id = if cache_enabled {
8454                indexing_cache.get_or_insert_agent(self, &agent)?
8455            } else {
8456                self.ensure_agent(&agent)?
8457            };
8458            let workspace_id = if let Some(workspace) = &conversation.workspace {
8459                if cache_enabled {
8460                    Some(indexing_cache.get_or_insert_workspace(self, workspace, None)?)
8461                } else {
8462                    Some(self.ensure_workspace(workspace, None)?)
8463                }
8464            } else {
8465                None
8466            };
8467
8468            let exceeds_pending_limits = !pending_batch.is_empty()
8469                && (pending_batch.len() >= batch_limits.conversations
8470                    || pending_batch_messages.saturating_add(conversation_message_count)
8471                        > batch_limits.messages
8472                    || pending_batch_chars.saturating_add(conversation_chars)
8473                        > batch_limits.payload_chars);
8474            if exceeds_pending_limits {
8475                flush_batch(
8476                    self,
8477                    &mut pending_batch,
8478                    &mut pending_batch_messages,
8479                    &mut pending_batch_chars,
8480                    &mut pending_batch_first_row_id,
8481                    &mut pending_batch_last_row_id,
8482                    &mut imported_conversations,
8483                    &mut imported_messages,
8484                )?;
8485            }
8486
8487            if pending_batch_first_row_id.is_none() {
8488                pending_batch_first_row_id = Some(conversation_row_id);
8489            }
8490            pending_batch_last_row_id = Some(conversation_row_id);
8491            pending_batch_messages =
8492                pending_batch_messages.saturating_add(conversation_message_count);
8493            pending_batch_chars = pending_batch_chars.saturating_add(conversation_chars);
8494            pending_batch.push(HistoricalBatchEntry {
8495                source_row_id: conversation_row_id,
8496                agent_id,
8497                workspace_id,
8498                conversation,
8499            });
8500
8501            if pending_batch.len() >= batch_limits.conversations
8502                || pending_batch_messages >= batch_limits.messages
8503                || pending_batch_chars >= batch_limits.payload_chars
8504            {
8505                flush_batch(
8506                    self,
8507                    &mut pending_batch,
8508                    &mut pending_batch_messages,
8509                    &mut pending_batch_chars,
8510                    &mut pending_batch_first_row_id,
8511                    &mut pending_batch_last_row_id,
8512                    &mut imported_conversations,
8513                    &mut imported_messages,
8514                )?;
8515            }
8516        }
8517
8518        flush_batch(
8519            self,
8520            &mut pending_batch,
8521            &mut pending_batch_messages,
8522            &mut pending_batch_chars,
8523            &mut pending_batch_first_row_id,
8524            &mut pending_batch_last_row_id,
8525            &mut imported_conversations,
8526            &mut imported_messages,
8527        )?;
8528
8529        if cache_enabled {
8530            let (hits, misses, hit_rate) = indexing_cache.stats();
8531            tracing::info!(
8532                target: "cass::historical_salvage",
8533                hits,
8534                misses,
8535                hit_rate = format!("{:.1}%", hit_rate * 100.0),
8536                agents = indexing_cache.agent_count(),
8537                workspaces = indexing_cache.workspace_count(),
8538                sources = known_sources.len(),
8539                "historical salvage cache stats"
8540            );
8541        }
8542
8543        Ok((imported_conversations, imported_messages))
8544    }
8545
8546    pub fn salvage_historical_databases(
8547        &self,
8548        canonical_db_path: &Path,
8549    ) -> Result<HistoricalSalvageOutcome> {
8550        let ordered_bundles = discover_historical_database_bundles(canonical_db_path);
8551        let mut outcome = HistoricalSalvageOutcome {
8552            bundles_considered: ordered_bundles.len(),
8553            ..HistoricalSalvageOutcome::default()
8554        };
8555
8556        for bundle in ordered_bundles {
8557            if self.historical_bundle_already_imported(&bundle)? {
8558                self.clear_historical_bundle_progress(&bundle)?;
8559                continue;
8560            }
8561
8562            let source = match open_historical_bundle_for_salvage(&bundle).with_context(|| {
8563                format!(
8564                    "opening historical bundle {} for salvage",
8565                    bundle.root_path.display()
8566                )
8567            }) {
8568                Ok(source) => source,
8569                Err(err) => {
8570                    tracing::warn!(
8571                        path = %bundle.root_path.display(),
8572                        error = %err,
8573                        "skipping unreadable historical cass database bundle during salvage"
8574                    );
8575                    self.clear_historical_bundle_progress(&bundle)?;
8576                    continue;
8577                }
8578            };
8579
8580            // #247 (coding_agent_session_search-r8pcy): if a per-bundle progress
8581            // checkpoint already covers the backup's entire conversation row-id
8582            // space, the bundle was effectively fully imported but the daemon was
8583            // killed (e.g. OOM) before the completion ledger marker landed.
8584            // Re-scanning it is a pure O(n) no-op — every batch commits
8585            // imported=0 while taking 5-12 min. Detect it via the high-water
8586            // checkpoint, write the ledger marker, drop the checkpoint, and skip.
8587            if let Some(progress) = self.load_historical_bundle_progress(&bundle)? {
8588                let backup_max_conversation_id: i64 = source
8589                    .conn
8590                    .query_row_map(
8591                        "SELECT COALESCE(MAX(id), 0) FROM conversations",
8592                        fparams![],
8593                        |row| row.get_typed(0),
8594                    )
8595                    .unwrap_or(0);
8596                if backup_max_conversation_id > 0
8597                    && progress.last_completed_source_row_id >= backup_max_conversation_id
8598                {
8599                    self.record_historical_bundle_import(
8600                        &bundle,
8601                        source.method,
8602                        progress.conversations_imported,
8603                        progress.messages_imported,
8604                    )?;
8605                    self.clear_historical_bundle_progress(&bundle)?;
8606                    tracing::info!(
8607                        path = %bundle.root_path.display(),
8608                        last_completed_source_row_id = progress.last_completed_source_row_id,
8609                        backup_max_conversation_id,
8610                        conversations_imported = progress.conversations_imported,
8611                        messages_imported = progress.messages_imported,
8612                        "historical bundle already fully imported per checkpoint; marking salvaged and skipping O(n) re-scan"
8613                    );
8614                    continue;
8615                }
8616            }
8617
8618            self.import_historical_sources(&source.conn)?;
8619            let (imported_conversations, imported_messages) =
8620                self.import_historical_conversations(&bundle, source.method, &source.conn)?;
8621            self.record_historical_bundle_import(
8622                &bundle,
8623                source.method,
8624                imported_conversations,
8625                imported_messages,
8626            )?;
8627            self.clear_historical_bundle_progress(&bundle)?;
8628
8629            outcome.bundles_imported += 1;
8630            outcome.conversations_imported += imported_conversations;
8631            outcome.messages_imported += imported_messages;
8632
8633            tracing::info!(
8634                path = %bundle.root_path.display(),
8635                bytes = bundle.total_bytes,
8636                method = source.method,
8637                imported_conversations,
8638                imported_messages,
8639                "salvaged historical cass database bundle"
8640            );
8641        }
8642
8643        Ok(outcome)
8644    }
8645
8646    /// Delete a source by ID. Returns true if a row was deleted.
8647    pub fn delete_source(&self, id: &str, _cascade: bool) -> Result<bool> {
8648        if id == LOCAL_SOURCE_ID {
8649            anyhow::bail!("cannot delete the local source");
8650        }
8651        let count = self
8652            .conn
8653            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams![id])?;
8654        if count > 0 {
8655            self.invalidate_conversation_source_cache(id);
8656        }
8657        Ok(count > 0)
8658    }
8659
8660    /// Insert a conversation tree (conversation + messages + snippets + FTS).
8661    pub fn insert_conversation_tree(
8662        &self,
8663        agent_id: i64,
8664        workspace_id: Option<i64>,
8665        conv: &Conversation,
8666    ) -> Result<InsertOutcome> {
8667        let normalized_conv = normalized_conversation_for_storage(conv);
8668        let conv = normalized_conv.as_ref();
8669        self.ensure_source_for_conversation(conv)?;
8670        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
8671        let defer_analytics_updates = defer_analytics_updates_enabled();
8672        let conversation_key = conversation_merge_key(agent_id, conv);
8673        let mut tx = self.conn.transaction()?;
8674        let existing = franken_find_existing_conversation_with_tail_by_key(
8675            &tx,
8676            &conversation_key,
8677            Some(conv),
8678        )?;
8679        if let Some(existing) = existing {
8680            let outcome = self.franken_append_messages_with_tail_in_tx(
8681                &tx,
8682                agent_id,
8683                existing.id,
8684                conv,
8685                existing.tail_state,
8686                defer_lexical_updates,
8687                defer_analytics_updates,
8688            )?;
8689            tx.commit()?;
8690            return Ok(outcome);
8691        }
8692
8693        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
8694            &tx,
8695            agent_id,
8696            workspace_id,
8697            conv,
8698            &conversation_key,
8699        )? {
8700            ConversationInsertStatus::Inserted(conv_id) => conv_id,
8701            ConversationInsertStatus::Existing(existing_id) => {
8702                let ExistingMessageLookup {
8703                    by_idx: mut existing_messages,
8704                    replay: mut existing_replay_fingerprints,
8705                } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
8706                let ExistingConversationNewMessages {
8707                    messages: new_messages,
8708                    new_chars,
8709                    idx_collision_count,
8710                    first_collision_idx,
8711                } = collect_new_messages_for_existing_conversation(
8712                    existing_id,
8713                    conv,
8714                    &mut existing_messages,
8715                    &mut existing_replay_fingerprints,
8716                    "skipping replay-equivalent recovered message with shifted idx",
8717                );
8718                let (inserted_last_idx, inserted_last_created_at) =
8719                    borrowed_messages_tail_state(&new_messages);
8720                let mut inserted_indices = Vec::new();
8721                let mut fts_entries = Vec::new();
8722                let mut fts_pending_chars = 0usize;
8723                let mut _fts_inserted_total = 0usize;
8724                let inserted_message_ids =
8725                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
8726                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8727                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8728                    if !defer_lexical_updates {
8729                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8730                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8731                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8732                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8733                        {
8734                            flush_pending_fts_entries(
8735                                self,
8736                                &tx,
8737                                &mut fts_entries,
8738                                &mut fts_pending_chars,
8739                                &mut _fts_inserted_total,
8740                            )?;
8741                        }
8742                    }
8743                    inserted_indices.push(msg.idx);
8744                }
8745
8746                if idx_collision_count > 0 {
8747                    tracing::warn!(
8748                        conversation_id = existing_id,
8749                        collision_count = idx_collision_count,
8750                        first_idx = first_collision_idx,
8751                        source_path = %conv.source_path.display(),
8752                        "message idx collisions encountered while merging recovered conversation; retaining canonical message variants"
8753                    );
8754                }
8755
8756                if !defer_lexical_updates {
8757                    flush_pending_fts_entries(
8758                        self,
8759                        &tx,
8760                        &mut fts_entries,
8761                        &mut fts_pending_chars,
8762                        &mut _fts_inserted_total,
8763                    )?;
8764                }
8765
8766                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
8767                franken_update_conversation_tail_state(
8768                    &tx,
8769                    existing_id,
8770                    conv_last_ts,
8771                    inserted_last_idx,
8772                    inserted_last_created_at,
8773                )?;
8774                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
8775                {
8776                    franken_update_external_conversation_tail_lookup_key(
8777                        &tx,
8778                        &lookup_key,
8779                        conv_last_ts,
8780                        inserted_last_idx,
8781                        inserted_last_created_at,
8782                    )?;
8783                }
8784
8785                if !defer_analytics_updates && !inserted_indices.is_empty() {
8786                    franken_update_daily_stats_in_tx(
8787                        self,
8788                        &tx,
8789                        &conv.agent_slug,
8790                        &conv.source_id,
8791                        conversation_effective_started_at(conv),
8792                        StatsDelta {
8793                            session_count_delta: 0,
8794                            message_count_delta: inserted_indices.len() as i64,
8795                            total_chars_delta: new_chars,
8796                        },
8797                    )?;
8798                }
8799
8800                tx.commit()?;
8801                return Ok(InsertOutcome {
8802                    conversation_id: existing_id,
8803                    conversation_inserted: false,
8804                    inserted_indices,
8805                });
8806            }
8807        };
8808        let mut fts_entries = Vec::new();
8809        let mut fts_pending_chars = 0usize;
8810        let mut _fts_inserted_total = 0usize;
8811        let mut total_chars: i64 = 0;
8812        let mut inserted_indices = Vec::new();
8813        let mut pending_messages = HashMap::new();
8814        let mut pending_replay_fingerprints = HashSet::new();
8815        let mut idx_collision_count = 0usize;
8816        let mut first_collision_idx: Option<i64> = None;
8817        let mut new_messages = Vec::new();
8818        for msg in &conv.messages {
8819            let incoming_fingerprint = message_merge_fingerprint(msg);
8820            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
8821                if existing_fingerprint != &incoming_fingerprint {
8822                    idx_collision_count = idx_collision_count.saturating_add(1);
8823                    first_collision_idx.get_or_insert(msg.idx);
8824                }
8825                continue;
8826            }
8827            let incoming_replay = message_replay_fingerprint(msg);
8828            if pending_replay_fingerprints.contains(&incoming_replay) {
8829                tracing::debug!(
8830                    conversation_id = conv_id,
8831                    idx = msg.idx,
8832                    source_path = %conv.source_path.display(),
8833                    "skipping replay-equivalent duplicate message within new conversation insert"
8834                );
8835                continue;
8836            }
8837            pending_messages.insert(msg.idx, incoming_fingerprint);
8838            pending_replay_fingerprints.insert(incoming_replay);
8839            new_messages.push(msg);
8840        }
8841        let inserted_message_ids = franken_batch_insert_new_messages(&tx, conv_id, &new_messages)?;
8842        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
8843            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
8844            if !defer_lexical_updates {
8845                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
8846                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
8847                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
8848                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
8849                {
8850                    flush_pending_fts_entries(
8851                        self,
8852                        &tx,
8853                        &mut fts_entries,
8854                        &mut fts_pending_chars,
8855                        &mut _fts_inserted_total,
8856                    )?;
8857                }
8858            }
8859            total_chars += msg.content.len() as i64;
8860            inserted_indices.push(msg.idx);
8861        }
8862        if idx_collision_count > 0 {
8863            tracing::warn!(
8864                conversation_id = conv_id,
8865                collision_count = idx_collision_count,
8866                first_idx = first_collision_idx,
8867                source_path = %conv.source_path.display(),
8868                "message idx collisions encountered while inserting a new conversation; retaining the first canonical variant per idx"
8869            );
8870        }
8871        if !defer_lexical_updates {
8872            flush_pending_fts_entries(
8873                self,
8874                &tx,
8875                &mut fts_entries,
8876                &mut fts_pending_chars,
8877                &mut _fts_inserted_total,
8878            )?;
8879        }
8880
8881        if !defer_analytics_updates {
8882            franken_update_daily_stats_in_tx(
8883                self,
8884                &tx,
8885                &conv.agent_slug,
8886                &conv.source_id,
8887                conversation_effective_started_at(conv),
8888                StatsDelta {
8889                    session_count_delta: 1,
8890                    message_count_delta: inserted_indices.len() as i64,
8891                    total_chars_delta: total_chars,
8892                },
8893            )?;
8894        }
8895
8896        tx.commit()?;
8897        Ok(InsertOutcome {
8898            conversation_id: conv_id,
8899            conversation_inserted: true,
8900            inserted_indices,
8901        })
8902    }
8903
8904    #[cfg(test)]
8905    fn insert_conversation_tree_with_profile(
8906        &self,
8907        agent_id: i64,
8908        workspace_id: Option<i64>,
8909        conv: &Conversation,
8910        profile: &mut InsertConversationTreePerfProfile,
8911    ) -> Result<InsertOutcome> {
8912        let total_start = Instant::now();
8913        let normalized_conv = normalized_conversation_for_storage(conv);
8914        let conv = normalized_conv.as_ref();
8915
8916        let source_start = Instant::now();
8917        self.ensure_source_for_conversation(conv)?;
8918        profile.source_duration += source_start.elapsed();
8919
8920        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
8921        let defer_analytics_updates = defer_analytics_updates_enabled();
8922        let conversation_key = conversation_merge_key(agent_id, conv);
8923
8924        let tx_open_start = Instant::now();
8925        let mut tx = self.conn.transaction()?;
8926        profile.tx_open_duration += tx_open_start.elapsed();
8927
8928        let existing_lookup_start = Instant::now();
8929        let existing =
8930            franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
8931        profile.existing_lookup_duration += existing_lookup_start.elapsed();
8932        if let Some(existing_id) = existing {
8933            return Err(anyhow!(
8934                "profile helper expects new conversation path, found existing id {existing_id}"
8935            ));
8936        }
8937
8938        let conversation_row_start = Instant::now();
8939        let conv_id = match franken_insert_conversation_or_get_existing_after_miss(
8940            &tx,
8941            agent_id,
8942            workspace_id,
8943            conv,
8944            &conversation_key,
8945        )? {
8946            ConversationInsertStatus::Inserted(conv_id) => conv_id,
8947            ConversationInsertStatus::Existing(existing_id) => {
8948                return Err(anyhow!(
8949                    "profile helper expected inserted conversation row, reused existing id {existing_id}"
8950                ));
8951            }
8952        };
8953        profile.conversation_row_duration += conversation_row_start.elapsed();
8954
8955        let mut fts_entries = Vec::new();
8956        let mut fts_pending_chars = 0usize;
8957        let mut fts_inserted_total = 0usize;
8958        let mut total_chars: i64 = 0;
8959        let mut inserted_indices = Vec::new();
8960        let mut pending_messages = HashMap::new();
8961        let mut pending_replay_fingerprints = HashSet::new();
8962        let mut idx_collision_count = 0usize;
8963        let mut first_collision_idx: Option<i64> = None;
8964        let mut new_messages = Vec::new();
8965
8966        for msg in &conv.messages {
8967            let incoming_fingerprint = message_merge_fingerprint(msg);
8968            if let Some(existing_fingerprint) = pending_messages.get(&msg.idx) {
8969                if existing_fingerprint != &incoming_fingerprint {
8970                    idx_collision_count = idx_collision_count.saturating_add(1);
8971                    first_collision_idx.get_or_insert(msg.idx);
8972                }
8973                continue;
8974            }
8975
8976            let incoming_replay = message_replay_fingerprint(msg);
8977            if pending_replay_fingerprints.contains(&incoming_replay) {
8978                tracing::debug!(
8979                    conversation_id = conv_id,
8980                    idx = msg.idx,
8981                    source_path = %conv.source_path.display(),
8982                    "skipping replay-equivalent duplicate message within profiled new conversation insert"
8983                );
8984                continue;
8985            }
8986
8987            pending_messages.insert(msg.idx, incoming_fingerprint);
8988            pending_replay_fingerprints.insert(incoming_replay);
8989            new_messages.push(msg);
8990        }
8991
8992        let message_insert_start = Instant::now();
8993        let inserted_message_ids = franken_batch_insert_new_messages_with_profile(
8994            &tx,
8995            conv_id,
8996            &new_messages,
8997            &mut profile.message_insert_breakdown,
8998        )?;
8999        profile.message_insert_duration += message_insert_start.elapsed();
9000
9001        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9002            let snippet_insert_start = Instant::now();
9003            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9004            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9005
9006            if !defer_lexical_updates {
9007                let fts_entry_start = Instant::now();
9008                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9009                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9010                profile.fts_entry_duration += fts_entry_start.elapsed();
9011                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9012                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9013                {
9014                    let fts_flush_start = Instant::now();
9015                    flush_pending_fts_entries(
9016                        self,
9017                        &tx,
9018                        &mut fts_entries,
9019                        &mut fts_pending_chars,
9020                        &mut fts_inserted_total,
9021                    )?;
9022                    profile.fts_flush_duration += fts_flush_start.elapsed();
9023                }
9024            }
9025
9026            total_chars += msg.content.len() as i64;
9027            inserted_indices.push(msg.idx);
9028        }
9029
9030        if idx_collision_count > 0 {
9031            tracing::warn!(
9032                conversation_id = conv_id,
9033                collision_count = idx_collision_count,
9034                first_idx = first_collision_idx,
9035                source_path = %conv.source_path.display(),
9036                "message idx collisions encountered while profiling a new conversation insert; retaining the first canonical variant per idx"
9037            );
9038        }
9039
9040        if !defer_lexical_updates {
9041            let fts_flush_start = Instant::now();
9042            flush_pending_fts_entries(
9043                self,
9044                &tx,
9045                &mut fts_entries,
9046                &mut fts_pending_chars,
9047                &mut fts_inserted_total,
9048            )?;
9049            profile.fts_flush_duration += fts_flush_start.elapsed();
9050        }
9051
9052        if !defer_analytics_updates {
9053            let analytics_start = Instant::now();
9054            franken_update_daily_stats_in_tx(
9055                self,
9056                &tx,
9057                &conv.agent_slug,
9058                &conv.source_id,
9059                conversation_effective_started_at(conv),
9060                StatsDelta {
9061                    session_count_delta: 1,
9062                    message_count_delta: inserted_indices.len() as i64,
9063                    total_chars_delta: total_chars,
9064                },
9065            )?;
9066            profile.analytics_duration += analytics_start.elapsed();
9067        }
9068
9069        let commit_start = Instant::now();
9070        tx.commit()?;
9071        profile.commit_duration += commit_start.elapsed();
9072        profile.invocations += 1;
9073        profile.messages += conv.messages.len();
9074        profile.inserted_messages += inserted_indices.len();
9075        profile.total_duration += total_start.elapsed();
9076
9077        Ok(InsertOutcome {
9078            conversation_id: conv_id,
9079            conversation_inserted: true,
9080            inserted_indices,
9081        })
9082    }
9083
9084    #[cfg(test)]
9085    fn append_existing_conversation_with_profile(
9086        &self,
9087        agent_id: i64,
9088        _workspace_id: Option<i64>,
9089        conv: &Conversation,
9090        profile: &mut InsertConversationTreePerfProfile,
9091    ) -> Result<InsertOutcome> {
9092        let total_start = Instant::now();
9093        let normalized_conv = normalized_conversation_for_storage(conv);
9094        let conv = normalized_conv.as_ref();
9095
9096        let source_start = Instant::now();
9097        self.ensure_source_for_conversation(conv)?;
9098        profile.source_duration += source_start.elapsed();
9099
9100        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
9101        let defer_analytics_updates = defer_analytics_updates_enabled();
9102        let conversation_key = conversation_merge_key(agent_id, conv);
9103
9104        let tx_open_start = Instant::now();
9105        let mut tx = self.conn.transaction()?;
9106        profile.tx_open_duration += tx_open_start.elapsed();
9107
9108        let existing_lookup_start = Instant::now();
9109        let existing = franken_find_existing_conversation_with_tail_by_key(
9110            &tx,
9111            &conversation_key,
9112            Some(conv),
9113        )?;
9114        profile.existing_lookup_duration += existing_lookup_start.elapsed();
9115        let existing = existing.ok_or_else(|| {
9116            anyhow!("append profile helper expects existing conversation for {conversation_key:?}")
9117        })?;
9118        let existing_id = existing.id;
9119
9120        let existing_idx_lookup_start = Instant::now();
9121        let append_tail_state = existing.tail_state;
9122        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
9123        let existing_plan = append_tail_state.as_ref().and_then(|state| {
9124            collect_append_only_tail_messages(
9125                conv,
9126                state.last_message_idx,
9127                state.last_message_created_at,
9128            )
9129        });
9130        let used_append_tail_plan = existing_plan.is_some();
9131        profile.existing_idx_lookup_duration += existing_idx_lookup_start.elapsed();
9132
9133        let dedupe_filter_start = Instant::now();
9134        let ExistingConversationNewMessages {
9135            messages: new_messages,
9136            new_chars,
9137            idx_collision_count,
9138            first_collision_idx,
9139        } = if let Some(existing_plan) = existing_plan {
9140            existing_plan
9141        } else {
9142            let ExistingMessageLookup {
9143                by_idx: mut existing_messages,
9144                replay: mut existing_replay_fingerprints,
9145            } = franken_existing_message_lookup(&tx, existing_id, &conv.messages)?;
9146            collect_new_messages_for_existing_conversation(
9147                existing_id,
9148                conv,
9149                &mut existing_messages,
9150                &mut existing_replay_fingerprints,
9151                "skipping replay-equivalent profiled append message with shifted idx",
9152            )
9153        };
9154        profile.dedupe_filter_duration += dedupe_filter_start.elapsed();
9155
9156        let mut inserted_indices = Vec::new();
9157        let mut fts_entries = Vec::new();
9158        let mut fts_pending_chars = 0usize;
9159        let mut fts_inserted_total = 0usize;
9160        let (inserted_last_idx, inserted_last_created_at) =
9161            borrowed_messages_tail_state(&new_messages);
9162
9163        let message_insert_start = Instant::now();
9164        let inserted_message_ids = franken_append_insert_new_messages_with_profile(
9165            &tx,
9166            existing_id,
9167            &new_messages,
9168            &mut profile.message_insert_breakdown,
9169        )?;
9170        profile.message_insert_duration += message_insert_start.elapsed();
9171
9172        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
9173            let snippet_insert_start = Instant::now();
9174            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
9175            profile.snippet_insert_duration += snippet_insert_start.elapsed();
9176
9177            if !defer_lexical_updates {
9178                let fts_entry_start = Instant::now();
9179                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
9180                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
9181                profile.fts_entry_duration += fts_entry_start.elapsed();
9182                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
9183                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
9184                {
9185                    let fts_flush_start = Instant::now();
9186                    flush_pending_fts_entries(
9187                        self,
9188                        &tx,
9189                        &mut fts_entries,
9190                        &mut fts_pending_chars,
9191                        &mut fts_inserted_total,
9192                    )?;
9193                    profile.fts_flush_duration += fts_flush_start.elapsed();
9194                }
9195            }
9196
9197            inserted_indices.push(msg.idx);
9198        }
9199
9200        if idx_collision_count > 0 {
9201            tracing::warn!(
9202                conversation_id = existing_id,
9203                collision_count = idx_collision_count,
9204                first_idx = first_collision_idx,
9205                source_path = %conv.source_path.display(),
9206                "message idx collisions encountered while profiling append merge; retaining canonical message variants"
9207            );
9208        }
9209
9210        if !defer_lexical_updates {
9211            let fts_flush_start = Instant::now();
9212            flush_pending_fts_entries(
9213                self,
9214                &tx,
9215                &mut fts_entries,
9216                &mut fts_pending_chars,
9217                &mut fts_inserted_total,
9218            )?;
9219            profile.fts_flush_duration += fts_flush_start.elapsed();
9220        }
9221
9222        let conversation_row_start = Instant::now();
9223        let mut exact_append_tail_set = false;
9224        if used_append_tail_plan {
9225            if let (Some(last_message_idx), Some(last_message_created_at)) =
9226                (inserted_last_idx, inserted_last_created_at)
9227            {
9228                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
9229                    franken_set_conversation_tail_state_after_append(
9230                        &tx,
9231                        existing_id,
9232                        last_message_created_at,
9233                        last_message_idx,
9234                        last_message_created_at,
9235                    )?;
9236                    exact_append_tail_set = true;
9237                } else {
9238                    franken_update_conversation_tail_state(
9239                        &tx,
9240                        existing_id,
9241                        Some(last_message_created_at),
9242                        inserted_last_idx,
9243                        inserted_last_created_at,
9244                    )?;
9245                }
9246            }
9247        } else {
9248            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
9249            franken_update_conversation_tail_state(
9250                &tx,
9251                existing_id,
9252                conv_last_ts,
9253                inserted_last_idx,
9254                inserted_last_created_at,
9255            )?;
9256        }
9257        franken_update_external_conversation_tail_after_append(
9258            &tx,
9259            agent_id,
9260            conv,
9261            used_append_tail_plan,
9262            exact_append_tail_set,
9263            inserted_last_idx,
9264            inserted_last_created_at,
9265        )?;
9266        profile.conversation_row_duration += conversation_row_start.elapsed();
9267
9268        if !defer_analytics_updates && !inserted_indices.is_empty() {
9269            let analytics_start = Instant::now();
9270            franken_update_daily_stats_in_tx(
9271                self,
9272                &tx,
9273                &conv.agent_slug,
9274                &conv.source_id,
9275                conversation_effective_started_at(conv),
9276                StatsDelta {
9277                    session_count_delta: 0,
9278                    message_count_delta: inserted_indices.len() as i64,
9279                    total_chars_delta: new_chars,
9280                },
9281            )?;
9282            profile.analytics_duration += analytics_start.elapsed();
9283        }
9284
9285        let commit_start = Instant::now();
9286        tx.commit()?;
9287        profile.commit_duration += commit_start.elapsed();
9288        profile.invocations += 1;
9289        profile.messages += conv.messages.len();
9290        profile.inserted_messages += inserted_indices.len();
9291        profile.total_duration += total_start.elapsed();
9292
9293        Ok(InsertOutcome {
9294            conversation_id: existing_id,
9295            conversation_inserted: false,
9296            inserted_indices,
9297        })
9298    }
9299
    /// Append new messages to an existing conversation within an active transaction.
    ///
    /// The set of messages to insert is chosen in one of two ways:
    /// * if `append_tail_state` is supplied and `collect_append_only_tail_messages`
    ///   returns a plan for it, that plan is used as-is;
    /// * otherwise the stored messages are looked up with
    ///   `franken_existing_message_lookup` and the new ones are selected by
    ///   `collect_new_messages_for_existing_conversation`.
    ///
    /// Every selected message is inserted together with its snippets.  FTS
    /// entries are batched and flushed unless `defer_lexical_updates` is set,
    /// and daily stats are updated unless `defer_analytics_updates` is set or
    /// nothing was inserted.  The conversation tail state is refreshed in both
    /// cases.
    ///
    /// Returns an `InsertOutcome` with `conversation_inserted: false` and the
    /// `idx` of every inserted message.  This function only borrows `tx` and
    /// does not commit it.
    #[allow(clippy::too_many_arguments)]
    fn franken_append_messages_with_tail_in_tx(
        &self,
        tx: &FrankenTransaction<'_>,
        agent_id: i64,
        conversation_id: i64,
        conv: &Conversation,
        append_tail_state: Option<ExistingConversationTailState>,
        defer_lexical_updates: bool,
        defer_analytics_updates: bool,
    ) -> Result<InsertOutcome> {
        // Previously recorded `ended_at`; consulted below to decide whether the
        // tail state may be set exactly or has to go through the generic update.
        let append_tail_ended_at = append_tail_state.and_then(|state| state.ended_at);
        // Try the append-only plan first; `None` means we must fall back to a
        // full lookup of the messages already stored for this conversation.
        let append_plan = append_tail_state.as_ref().and_then(|state| {
            collect_append_only_tail_messages(
                conv,
                state.last_message_idx,
                state.last_message_created_at,
            )
        });
        let used_append_tail_plan = append_plan.is_some();
        let ExistingConversationNewMessages {
            messages: new_messages,
            new_chars,
            idx_collision_count,
            first_collision_idx,
        } = if let Some(append_plan) = append_plan {
            append_plan
        } else {
            let ExistingMessageLookup {
                by_idx: mut existing_messages,
                replay: mut existing_replay_fingerprints,
            } = franken_existing_message_lookup(tx, conversation_id, &conv.messages)?;
            collect_new_messages_for_existing_conversation(
                conversation_id,
                conv,
                &mut existing_messages,
                &mut existing_replay_fingerprints,
                "skipping replay-equivalent recovered message with shifted idx",
            )
        };

        let mut inserted_indices = Vec::new();
        let mut fts_entries = Vec::new();
        let mut fts_pending_chars = 0usize;
        // Running total maintained by `flush_pending_fts_entries`; it is not read
        // again in this function, hence the leading underscore.
        let mut _fts_inserted_total = 0usize;
        // Capture the tail (last idx / last created_at) of the new messages
        // before `new_messages` is consumed by the loop below.
        let (inserted_last_idx, inserted_last_created_at) =
            borrowed_messages_tail_state(&new_messages);
        let inserted_message_ids =
            franken_append_insert_new_messages(tx, conversation_id, &new_messages)?;
        // Pair each returned message id with the message it was produced for.
        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
            franken_insert_snippets(tx, msg_id, &msg.snippets)?;
            if !defer_lexical_updates {
                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
                fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
                // Flush as soon as either the document-count or the character
                // budget for one FTS batch is reached.
                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    flush_pending_fts_entries(
                        self,
                        tx,
                        &mut fts_entries,
                        &mut fts_pending_chars,
                        &mut _fts_inserted_total,
                    )?;
                }
            }
            inserted_indices.push(msg.idx);
        }

        if idx_collision_count > 0 {
            tracing::warn!(
                conversation_id,
                collision_count = idx_collision_count,
                first_idx = first_collision_idx,
                source_path = %conv.source_path.display(),
                "message idx collisions encountered while appending to an existing conversation; retaining canonical message variants"
            );
        }

        // Flush whatever stayed below the batch thresholds inside the loop.
        if !defer_lexical_updates {
            flush_pending_fts_entries(
                self,
                tx,
                &mut fts_entries,
                &mut fts_pending_chars,
                &mut _fts_inserted_total,
            )?;
        }

        // Tracks whether the tail state was written through the "set" helper
        // rather than the generic "update" helper; forwarded to the external
        // tail update below.
        let mut exact_append_tail_set = false;
        if used_append_tail_plan {
            // Nothing is written here when the plan produced no message with
            // both an idx and a timestamp.
            if let (Some(last_message_idx), Some(last_message_created_at)) =
                (inserted_last_idx, inserted_last_created_at)
            {
                // The tail is set directly only when there is no recorded
                // `ended_at`, or it is not later than the last inserted message.
                if append_tail_ended_at.is_none_or(|ended_at| ended_at <= last_message_created_at) {
                    franken_set_conversation_tail_state_after_append(
                        tx,
                        conversation_id,
                        last_message_created_at,
                        last_message_idx,
                        last_message_created_at,
                    )?;
                    exact_append_tail_set = true;
                } else {
                    franken_update_conversation_tail_state(
                        tx,
                        conversation_id,
                        Some(last_message_created_at),
                        inserted_last_idx,
                        inserted_last_created_at,
                    )?;
                }
            }
        } else {
            // Fallback path: use the latest timestamp across every message in
            // `conv`, not just the newly inserted ones.
            let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
            franken_update_conversation_tail_state(
                tx,
                conversation_id,
                conv_last_ts,
                inserted_last_idx,
                inserted_last_created_at,
            )?;
        }
        franken_update_external_conversation_tail_after_append(
            tx,
            agent_id,
            conv,
            used_append_tail_plan,
            exact_append_tail_set,
            inserted_last_idx,
            inserted_last_created_at,
        )?;

        // Appending never adds a session, so only message and character counts
        // are adjusted.
        if !defer_analytics_updates && !inserted_indices.is_empty() {
            let message_count = inserted_indices.len() as i64;
            franken_update_daily_stats_in_tx(
                self,
                tx,
                &conv.agent_slug,
                &conv.source_id,
                conversation_effective_started_at(conv),
                StatsDelta {
                    session_count_delta: 0,
                    message_count_delta: message_count,
                    total_chars_delta: new_chars,
                },
            )?;
        }

        Ok(InsertOutcome {
            conversation_id,
            conversation_inserted: false,
            inserted_indices,
        })
    }
9456
9457    /// Rebuild the FTS5 index from scratch (chunked to avoid OOM on large databases, #110).
9458    pub fn rebuild_fts(&self) -> Result<()> {
9459        self.rebuild_fts_via_frankensqlite().map(|_| ())
9460    }
9461
9462    /// Best-effort repair for the derived SQLite FTS fallback index.
9463    ///
9464    /// The canonical archive and Tantivy index remain authoritative, so callers
9465    /// should invoke this from maintenance paths rather than ordinary opens.
9466    pub(crate) fn ensure_search_fallback_fts_consistency(&self) -> Result<FtsConsistencyRepair> {
9467        self.ensure_fts_consistency_via_frankensqlite()
9468    }
9469
9470    pub(crate) fn fallback_fts_is_known_healthy_for_archive_fingerprint(
9471        &self,
9472        archive_fingerprint: &str,
9473    ) -> Result<bool> {
9474        Ok(
9475            self.read_fts_franken_rebuild_generation()? == Some(FTS_FRANKEN_REBUILD_GENERATION)
9476                && self
9477                    .read_fts_franken_rebuild_archive_fingerprint()?
9478                    .as_deref()
9479                    == Some(archive_fingerprint),
9480        )
9481    }
9482
9483    pub(crate) fn record_search_fallback_fts_archive_fingerprint(
9484        &self,
9485        archive_fingerprint: &str,
9486    ) -> Result<()> {
9487        self.conn
9488            .execute_compat(
9489                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9490                fparams![
9491                    FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY,
9492                    archive_fingerprint.to_string()
9493                ],
9494            )
9495            .with_context(|| "recording frankensqlite FTS archive fingerprint")?;
9496        Ok(())
9497    }
9498
9499    pub(crate) fn daily_stats_is_known_healthy_for_archive_fingerprint(
9500        &self,
9501        archive_fingerprint: &str,
9502    ) -> Result<bool> {
9503        Ok(
9504            self.read_daily_stats_health_generation()? == Some(DAILY_STATS_HEALTH_GENERATION)
9505                && self.read_daily_stats_archive_fingerprint()?.as_deref()
9506                    == Some(archive_fingerprint),
9507        )
9508    }
9509
9510    pub(crate) fn record_daily_stats_archive_fingerprint(
9511        &self,
9512        archive_fingerprint: &str,
9513    ) -> Result<()> {
9514        self.conn
9515            .execute_compat(
9516                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9517                fparams![
9518                    DAILY_STATS_HEALTH_GENERATION_META_KEY,
9519                    DAILY_STATS_HEALTH_GENERATION.to_string()
9520                ],
9521            )
9522            .with_context(|| "recording daily_stats health generation")?;
9523        self.conn
9524            .execute_compat(
9525                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9526                fparams![DAILY_STATS_HEALTH_META_KEY, archive_fingerprint.to_string()],
9527            )
9528            .with_context(|| "recording daily_stats archive fingerprint")?;
9529        Ok(())
9530    }
9531
9532    fn read_fts_franken_rebuild_generation(&self) -> Result<Option<i64>> {
9533        let value: Option<String> = self
9534            .conn
9535            .query_row_map(
9536                "SELECT value FROM meta WHERE key = ?1",
9537                fparams![FTS_FRANKEN_REBUILD_META_KEY],
9538                |row| row.get_typed(0),
9539            )
9540            .optional()?;
9541        Ok(value.and_then(|v| v.parse::<i64>().ok()))
9542    }
9543
9544    fn read_fts_franken_rebuild_archive_fingerprint(&self) -> Result<Option<String>> {
9545        Ok(self
9546            .conn
9547            .query_row_map(
9548                "SELECT value FROM meta WHERE key = ?1",
9549                fparams![FTS_FRANKEN_REBUILD_FINGERPRINT_META_KEY],
9550                |row| row.get_typed(0),
9551            )
9552            .optional()?)
9553    }
9554
9555    fn read_daily_stats_health_generation(&self) -> Result<Option<i64>> {
9556        let value: Option<String> = self
9557            .conn
9558            .query_row_map(
9559                "SELECT value FROM meta WHERE key = ?1",
9560                fparams![DAILY_STATS_HEALTH_GENERATION_META_KEY],
9561                |row| row.get_typed(0),
9562            )
9563            .optional()?;
9564        Ok(value.and_then(|value| value.parse::<i64>().ok()))
9565    }
9566
9567    fn read_daily_stats_archive_fingerprint(&self) -> Result<Option<String>> {
9568        Ok(self
9569            .conn
9570            .query_row_map(
9571                "SELECT value FROM meta WHERE key = ?1",
9572                fparams![DAILY_STATS_HEALTH_META_KEY],
9573                |row| row.get_typed(0),
9574            )
9575            .optional()?)
9576    }
9577
9578    fn record_fts_franken_rebuild_generation(&self) -> Result<()> {
9579        self.conn
9580            .execute_compat(
9581                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
9582                fparams![
9583                    FTS_FRANKEN_REBUILD_META_KEY,
9584                    FTS_FRANKEN_REBUILD_GENERATION.to_string()
9585                ],
9586            )
9587            .with_context(|| "recording frankensqlite FTS rebuild generation")?;
9588        Ok(())
9589    }
9590
    /// Bring the derived `fts_messages` table back in line with `messages`,
    /// doing the least work that suffices.
    ///
    /// Stages, in order:
    /// 1. If the stored rebuild generation is not `FTS_FRANKEN_REBUILD_GENERATION`,
    ///    either just stamp the marker (when FTS already covers at least 90% of
    ///    messages) or do a full rebuild and return `Rebuilt`.
    /// 2. Verify that `fts_messages` exists in `sqlite_master` and can be
    ///    queried; rebuild if not, or if the probe itself errors.
    /// 3. Compare row counts: equal yields `AlreadyHealthy`; more FTS rows than
    ///    messages forces a rebuild; fewer triggers an incremental catch-up.
    /// 4. After catch-up: counts equal yields `IncrementalCatchUp`; nothing
    ///    inserted means the remaining gap is accepted (also reported as
    ///    `IncrementalCatchUp` with `inserted_rows: 0`); otherwise a full
    ///    rebuild is done.
    ///
    /// Every full-rebuild path also records the rebuild generation marker.
    fn ensure_fts_consistency_via_frankensqlite(&self) -> Result<FtsConsistencyRepair> {
        if self.read_fts_franken_rebuild_generation()? != Some(FTS_FRANKEN_REBUILD_GENERATION) {
            // Before triggering an expensive full rebuild, probe whether
            // fts_messages is already populated and consistent.  On large
            // databases the rebuild can take hours and OOM — skip it when
            // the only thing missing is the generation marker (#184).
            let fts_already_healthy = (|| -> Result<bool> {
                let fts_exists: i64 = self.conn.query_row_map(
                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                    fparams![],
                    |row| row.get_typed(0),
                )?;
                if fts_exists != 1 {
                    return Ok(false);
                }
                let total: i64 = self.conn.query_row_map(
                    "SELECT COUNT(*) FROM messages",
                    fparams![],
                    |row| row.get_typed(0),
                )?;
                // An empty archive is never treated as "already healthy" here.
                if total == 0 {
                    return Ok(false);
                }
                let indexed: i64 = self.conn.query_row_map(
                    "SELECT COUNT(*) FROM fts_messages",
                    fparams![],
                    |row| row.get_typed(0),
                )?;
                // Consider healthy if >=90% of messages are indexed
                Ok(indexed > 0 && indexed * 100 >= total * 90)
            })()
            // Any probe error is treated as "not healthy", which leads to a rebuild.
            .unwrap_or(false);

            if fts_already_healthy {
                tracing::info!(
                    target: "cass::fts_rebuild",
                    "FTS already populated and consistent; setting generation marker without rebuild"
                );
                self.record_fts_franken_rebuild_generation()?;
                self.set_fts_messages_present_cache(true);
                // No early return: the exact-count checks below still run and
                // can close the remaining (<=10%) gap incrementally.
            } else {
                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
                self.record_fts_franken_rebuild_generation()?;
                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
            }
        }

        // Check both that the schema entry exists and that the table can
        // actually be queried.
        let inspection = (|| -> Result<(i64, bool)> {
            let fts_schema_rows = self.conn.query_row_map(
                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                fparams![],
                |row| row.get_typed::<i64>(0),
            )?;
            let fts_queryable = fts_schema_rows == 1
                && self.conn.query("SELECT COUNT(*) FROM fts_messages").is_ok();
            Ok((fts_schema_rows, fts_queryable))
        })();

        let (fts_schema_rows, fts_queryable) = match inspection {
            Ok(result) => result,
            Err(err) => {
                tracing::warn!(
                    error = %err,
                    "frankensqlite FTS consistency probe failed; rebuilding authoritative FTS"
                );
                let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
                self.record_fts_franken_rebuild_generation()?;
                return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
            }
        };

        if fts_schema_rows != 1 || !fts_queryable {
            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
            self.record_fts_franken_rebuild_generation()?;
            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
        }

        let total_messages =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
                    row.get_typed::<i64>(0)
                })?;
        let indexed_messages =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
                    row.get_typed::<i64>(0)
                })?;

        if indexed_messages == total_messages {
            self.set_fts_messages_present_cache(true);
            return Ok(FtsConsistencyRepair::AlreadyHealthy {
                rows: usize::try_from(total_messages.max(0)).unwrap_or(usize::MAX),
            });
        }

        // More FTS rows than messages cannot be fixed by inserting, so the
        // table is rebuilt from scratch.
        if indexed_messages > total_messages {
            let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
            self.record_fts_franken_rebuild_generation()?;
            return Ok(FtsConsistencyRepair::Rebuilt { inserted_rows });
        }

        // Fewer FTS rows than messages: insert only the missing ones.
        let inserted_rows = self
            .stream_fts_rows_via_frankensqlite(true)
            .with_context(|| "incrementally repairing missing FTS rows via frankensqlite")?;
        let repaired_rows =
            self.conn
                .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
                    row.get_typed::<i64>(0)
                })?;
        if repaired_rows == total_messages {
            self.set_fts_messages_present_cache(true);
            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
                inserted_rows,
                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
            });
        }

        // The incremental catch-up found nothing to insert, yet the gap
        // between total_messages (all rows, including orphans) and
        // indexed_messages (only rows whose conversation exists, since the
        // streaming pass skips messages with no matching conversation row)
        // remains.  A full rebuild cannot close this gap either — the orphaned
        // messages will be excluded again — so falling through to one would
        // just re-do ~5 min of work on every startup.  Accept the current state.
        if inserted_rows == 0 {
            tracing::debug!(
                target: "cass::fts_rebuild",
                indexed_messages = repaired_rows,
                total_messages,
                un_indexable_gap = total_messages.saturating_sub(repaired_rows),
                "FTS catch-up inserted 0 rows; remaining gap is un-indexable (likely orphaned messages with dangling conversation_id)"
            );
            self.set_fts_messages_present_cache(true);
            return Ok(FtsConsistencyRepair::IncrementalCatchUp {
                inserted_rows: 0,
                total_rows: usize::try_from(repaired_rows.max(0)).unwrap_or(usize::MAX),
            });
        }

        // Incremental made progress but didn't fully close the gap — something
        // is genuinely inconsistent, so do a full rebuild.
        let inserted_rows = self.rebuild_fts_via_frankensqlite()?;
        self.record_fts_franken_rebuild_generation()?;
        Ok(FtsConsistencyRepair::Rebuilt { inserted_rows })
    }
9736
9737    pub(crate) fn rebuild_fts_via_frankensqlite(&self) -> Result<usize> {
9738        self.invalidate_fts_messages_present_cache();
9739        self.conn
9740            .execute("DROP TABLE IF EXISTS fts_messages;")
9741            .with_context(|| "dropping derived fts_messages before frankensqlite rebuild")?;
9742        self.conn
9743            .execute_compat(FTS5_REGISTER_SQL, fparams![])
9744            .with_context(|| "creating derived fts_messages via frankensqlite rebuild")?;
9745        self.set_fts_messages_present_cache(true);
9746
9747        self.stream_fts_rows_via_frankensqlite(false)
9748    }
9749
    /// Walk `messages` in rowid order, one page at a time, and insert an FTS
    /// entry for each eligible row.
    ///
    /// * `missing_only` — when `true`, the rowids already present in
    ///   `fts_messages` are loaded up front and any message whose id is in that
    ///   set is skipped; when `false`, every message is indexed.
    ///
    /// Messages whose `conversation_id` has no entry in the loaded conversation
    /// map are counted as orphans and skipped.  Returns the sum of the counts
    /// reported by `franken_batch_insert_fts_on_connection`.
    fn stream_fts_rows_via_frankensqlite(&self, missing_only: bool) -> Result<usize> {
        // Page size is clamped to at least 1 so the loop always makes progress.
        let batch_size = fts_rebuild_batch_size().max(1);
        let batch_limit = i64::try_from(batch_size).unwrap_or(i64::MAX);
        let mut total_inserted: usize = 0;
        let mut total_skipped_orphans: usize = 0;
        let mut total_skipped_existing: usize = 0;
        // Paging cursor: each fetch asks for rows with rowid greater than this.
        let mut last_rowid: i64 = 0;
        // Lookup tables are loaded once so each message row can be resolved
        // in memory instead of with a per-row query.
        let conversation_by_id = self.load_fts_conversation_projection_map()?;
        let agent_slug_by_id = self.load_fts_agent_slug_map()?;
        let workspace_path_by_id = self.load_fts_workspace_path_map()?;
        let existing_fts_rowids = if missing_only {
            Some(self.load_fts_message_rowid_set()?)
        } else {
            None
        };
        let mut entries = Vec::new();
        let mut pending_chars = 0usize;

        loop {
            let rows = self.fetch_fts_rebuild_message_rows(last_rowid, batch_limit)?;
            let fetched_count = rows.len();
            if fetched_count == 0 {
                break;
            }

            // Snapshots of the running totals, used only to report per-page
            // deltas in the debug log below.
            let inserted_before_batch = total_inserted;
            let skipped_before_batch = total_skipped_orphans;
            let existing_before_batch = total_skipped_existing;

            for row in rows {
                // Advance the cursor even for rows that end up being skipped.
                last_rowid = row.rowid;
                if existing_fts_rowids
                    .as_ref()
                    .is_some_and(|rowids| rowids.contains(&row.message_id))
                {
                    total_skipped_existing = total_skipped_existing.saturating_add(1);
                    continue;
                }
                let Some(conversation) = conversation_by_id.get(&row.conversation_id) else {
                    total_skipped_orphans = total_skipped_orphans.saturating_add(1);
                    continue;
                };
                // A missing agent, an unknown agent id, or an empty slug all
                // fall back to "unknown".
                let agent = conversation
                    .agent_id
                    .and_then(|agent_id| agent_slug_by_id.get(&agent_id))
                    .filter(|slug| !slug.is_empty())
                    .cloned()
                    .unwrap_or_else(|| "unknown".to_string());
                // A missing or unknown workspace falls back to an empty path.
                let workspace = conversation
                    .workspace_id
                    .and_then(|workspace_id| workspace_path_by_id.get(&workspace_id))
                    .cloned()
                    .unwrap_or_default();
                // Account for the content length before `row.content` is moved
                // into the entry.
                pending_chars = pending_chars.saturating_add(row.content.len());
                entries.push(FtsEntry {
                    content: row.content,
                    title: conversation.title.clone(),
                    agent,
                    workspace,
                    source_path: conversation.source_path.clone(),
                    created_at: row.created_at,
                    message_id: row.message_id,
                });
                // Flush as soon as either the document-count or the character
                // budget for one FTS batch is reached.
                if entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
                    || pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
                {
                    total_inserted = total_inserted.saturating_add(
                        franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
                    );
                    entries.clear();
                    pending_chars = 0;
                }
            }

            // Flush the remainder of this page so no entries carry over into
            // the next fetch.
            if !entries.is_empty() {
                total_inserted = total_inserted.saturating_add(
                    franken_batch_insert_fts_on_connection(&self.conn, &entries)?,
                );
                entries.clear();
                pending_chars = 0;
            }

            tracing::debug!(
                target: "cass::fts_rebuild",
                batch_rows = fetched_count,
                batch_inserted = total_inserted.saturating_sub(inserted_before_batch),
                batch_skipped_orphans = total_skipped_orphans.saturating_sub(skipped_before_batch),
                batch_skipped_existing = total_skipped_existing.saturating_sub(existing_before_batch),
                total_inserted,
                total_skipped_orphans,
                total_skipped_existing,
                last_rowid,
                missing_only,
                "FTS streaming maintenance batch complete"
            );

            // A short page means there are no further rows to fetch.
            if fetched_count < batch_size {
                break;
            }
        }

        Ok(total_inserted)
    }
9853
9854    fn fetch_fts_rebuild_message_rows(
9855        &self,
9856        last_rowid: i64,
9857        batch_limit: i64,
9858    ) -> Result<Vec<FtsRebuildMessageRow>> {
9859        self.conn
9860            .query_map_collect(
9861                "SELECT m.rowid, m.id, m.conversation_id, m.content, m.created_at
9862                 FROM messages m
9863                 WHERE m.rowid > ?1
9864                 ORDER BY m.rowid
9865                 LIMIT ?2",
9866                fparams![last_rowid, batch_limit],
9867                |row| {
9868                    Ok(FtsRebuildMessageRow {
9869                        rowid: row.get_typed(0)?,
9870                        message_id: row.get_typed(1)?,
9871                        conversation_id: row.get_typed(2)?,
9872                        content: row.get_typed::<Option<String>>(3)?.unwrap_or_default(),
9873                        created_at: row.get_typed(4)?,
9874                    })
9875                },
9876            )
9877            .with_context(|| format!("fetching FTS maintenance messages after rowid {last_rowid}"))
9878    }
9879
9880    fn load_fts_message_rowid_set(&self) -> Result<HashSet<i64>> {
9881        let rows: Vec<i64> = self
9882            .conn
9883            .query_map_collect("SELECT rowid FROM fts_messages", fparams![], |row| {
9884                row.get_typed(0)
9885            })
9886            .with_context(|| "loading existing FTS message rowids")?;
9887        Ok(rows.into_iter().collect())
9888    }
9889
9890    fn load_fts_conversation_projection_map(
9891        &self,
9892    ) -> Result<HashMap<i64, FtsConversationProjection>> {
9893        let rows: Vec<(i64, FtsConversationProjection)> = self
9894            .conn
9895            .query_map_collect(
9896                "SELECT id, title, agent_id, workspace_id, source_path
9897                 FROM conversations",
9898                fparams![],
9899                |row| {
9900                    Ok((
9901                        row.get_typed(0)?,
9902                        FtsConversationProjection {
9903                            title: row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
9904                            agent_id: row.get_typed(2)?,
9905                            workspace_id: row.get_typed(3)?,
9906                            source_path: row.get_typed::<Option<String>>(4)?.unwrap_or_default(),
9907                        },
9908                    ))
9909                },
9910            )
9911            .with_context(|| "loading FTS conversation projection map")?;
9912        Ok(rows.into_iter().collect())
9913    }
9914
9915    fn load_fts_agent_slug_map(&self) -> Result<HashMap<i64, String>> {
9916        let rows: Vec<(i64, String)> = self
9917            .conn
9918            .query_map_collect("SELECT id, slug FROM agents", fparams![], |row| {
9919                Ok((
9920                    row.get_typed(0)?,
9921                    row.get_typed::<Option<String>>(1)?
9922                        .unwrap_or_else(|| "unknown".to_string()),
9923                ))
9924            })
9925            .with_context(|| "loading FTS agent slug map")?;
9926        Ok(rows.into_iter().collect())
9927    }
9928
9929    fn load_fts_workspace_path_map(&self) -> Result<HashMap<i64, String>> {
9930        let rows: Vec<(i64, String)> = self
9931            .conn
9932            .query_map_collect("SELECT id, path FROM workspaces", fparams![], |row| {
9933                Ok((
9934                    row.get_typed(0)?,
9935                    row.get_typed::<Option<String>>(1)?.unwrap_or_default(),
9936                ))
9937            })
9938            .with_context(|| "loading FTS workspace path map")?;
9939        Ok(rows.into_iter().collect())
9940    }
9941
9942    /// Fetch all messages for embedding generation.
9943    pub fn fetch_messages_for_embedding(&self) -> Result<Vec<MessageForEmbedding>> {
9944        // COALESCE(c.agent_id, 0) so legacy V1 conversations with NULL
9945        // agent_id don't cause a runtime row-decode failure (agent_id in
9946        // MessageForEmbedding is i64).  saturating_u32_from_i64 downstream
9947        // turns 0 into the "unknown agent" sentinel for doc-id hashing.
9948        self.conn
9949            .query_map_collect(
9950                "SELECT m.id, m.created_at, COALESCE(c.agent_id, 0), c.workspace_id, c.source_id, m.role, m.content
9951                 FROM messages m
9952                 JOIN conversations c ON m.conversation_id = c.id
9953                 ORDER BY m.id",
9954                fparams![],
9955                |row| {
9956                    let source_id: String = row.get_typed::<Option<String>>(4)?
9957                        .unwrap_or_else(|| "local".to_string());
9958                    Ok(MessageForEmbedding {
9959                        message_id: row.get_typed(0)?,
9960                        created_at: row.get_typed(1)?,
9961                        agent_id: row.get_typed(2)?,
9962                        workspace_id: row.get_typed(3)?,
9963                        source_id_hash: crc32fast::hash(source_id.as_bytes()),
9964                        role: row.get_typed(5)?,
9965                        content: row.get_typed(6)?,
9966                    })
9967                },
9968            )
9969            .with_context(|| "fetching messages for embedding")
9970    }
9971
9972    /// Get the watermark for incremental semantic embedding.
9973    pub fn get_last_embedded_message_id(&self) -> Result<Option<i64>> {
9974        let result: Result<String, _> = self.conn.query_row_map(
9975            "SELECT value FROM meta WHERE key = 'last_embedded_message_id'",
9976            fparams![],
9977            |row| row.get_typed(0),
9978        );
9979        match result.optional() {
9980            Ok(Some(s)) => Ok(s.parse().ok()),
9981            Ok(None) => Ok(None),
9982            Err(e) => Err(e.into()),
9983        }
9984    }
9985
9986    /// Set the watermark for incremental semantic embedding.
9987    pub fn set_last_embedded_message_id(&self, id: i64) -> Result<()> {
9988        self.conn.execute_compat(
9989            "INSERT OR REPLACE INTO meta(key, value) VALUES('last_embedded_message_id', ?1)",
9990            fparams![id.to_string()],
9991        )?;
9992        Ok(())
9993    }
9994
9995    /// Get embedding jobs for a database path.
9996    pub fn get_embedding_jobs(&self, db_path: &str) -> Result<Vec<EmbeddingJobRow>> {
9997        self.conn
9998            .query_map_collect(
9999                "SELECT id, db_path, model_id, status, total_docs, completed_docs, error_message, created_at, started_at, completed_at
10000                 FROM embedding_jobs WHERE db_path = ?1 ORDER BY id DESC",
10001                fparams![db_path],
10002                |row| {
10003                    Ok(EmbeddingJobRow {
10004                        id: row.get_typed(0)?,
10005                        db_path: row.get_typed(1)?,
10006                        model_id: row.get_typed(2)?,
10007                        status: row.get_typed(3)?,
10008                        total_docs: row.get_typed(4)?,
10009                        completed_docs: row.get_typed(5)?,
10010                        error_message: row.get_typed(6)?,
10011                        created_at: row.get_typed(7)?,
10012                        started_at: row.get_typed(8)?,
10013                        completed_at: row.get_typed(9)?,
10014                    })
10015                },
10016            )
10017            .with_context(|| format!("fetching embedding jobs for {db_path}"))
10018    }
10019
10020    /// Create or update an embedding job.
10021    pub fn upsert_embedding_job(
10022        &self,
10023        db_path: &str,
10024        model_id: &str,
10025        total_docs: i64,
10026    ) -> Result<i64> {
10027        let updated = self.conn.execute_compat(
10028            "UPDATE embedding_jobs
10029             SET total_docs = ?3
10030             WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10031            fparams![db_path, model_id, total_docs],
10032        )?;
10033        if updated == 0 {
10034            let insert_result = self.conn.execute_compat(
10035                "INSERT INTO embedding_jobs(db_path, model_id, total_docs) VALUES(?1,?2,?3)",
10036                fparams![db_path, model_id, total_docs],
10037            );
10038            if let Err(err) = insert_result {
10039                if !matches!(err, frankensqlite::FrankenError::UniqueViolation { .. }) {
10040                    return Err(err.into());
10041                }
10042                self.conn.execute_compat(
10043                    "UPDATE embedding_jobs
10044                     SET total_docs = ?3
10045                     WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10046                    fparams![db_path, model_id, total_docs],
10047                )?;
10048            }
10049        }
10050        self.conn
10051            .query_row_map(
10052                "SELECT id FROM embedding_jobs
10053                 WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')
10054                 ORDER BY id DESC
10055                 LIMIT 1",
10056                fparams![db_path, model_id],
10057                |row| row.get_typed(0),
10058            )
10059            .with_context(|| "resolving embedding job id after upsert")
10060    }
10061
10062    /// Mark an embedding job as started.
10063    pub fn start_embedding_job(&self, job_id: i64) -> Result<()> {
10064        self.conn.execute_compat(
10065            "UPDATE embedding_jobs SET status = 'running', started_at = datetime('now') WHERE id = ?1",
10066            fparams![job_id],
10067        )?;
10068        Ok(())
10069    }
10070
10071    /// Mark an embedding job as completed.
10072    pub fn complete_embedding_job(&self, job_id: i64) -> Result<()> {
10073        self.conn.execute_compat(
10074            "UPDATE embedding_jobs SET status = 'completed', completed_at = datetime('now') WHERE id = ?1",
10075            fparams![job_id],
10076        )?;
10077        Ok(())
10078    }
10079
10080    /// Mark an embedding job as failed.
10081    pub fn fail_embedding_job(&self, job_id: i64, error: &str) -> Result<()> {
10082        self.conn.execute_compat(
10083            "UPDATE embedding_jobs SET status = 'failed', error_message = ?2, completed_at = datetime('now') WHERE id = ?1",
10084            fparams![job_id, error],
10085        )?;
10086        Ok(())
10087    }
10088
10089    /// Cancel embedding jobs for a database path.
10090    pub fn cancel_embedding_jobs(&self, db_path: &str, model_id: Option<&str>) -> Result<usize> {
10091        if let Some(mid) = model_id {
10092            Ok(self.conn.execute_compat(
10093                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND model_id = ?2 AND status IN ('pending', 'running')",
10094                fparams![db_path, mid],
10095            )?)
10096        } else {
10097            Ok(self.conn.execute_compat(
10098                "UPDATE embedding_jobs SET status = 'cancelled' WHERE db_path = ?1 AND status IN ('pending', 'running')",
10099                fparams![db_path],
10100            )?)
10101        }
10102    }
10103
10104    /// Update embedding job progress.
10105    pub fn update_job_progress(&self, job_id: i64, completed_docs: i64) -> Result<()> {
10106        self.conn.execute_compat(
10107            "UPDATE embedding_jobs SET completed_docs = ?2 WHERE id = ?1",
10108            fparams![job_id, completed_docs],
10109        )?;
10110        Ok(())
10111    }
10112
10113    // =====================================================================
10114    // Analytics query methods
10115    // =====================================================================
10116
10117    /// Get session count for a date range using materialized stats.
10118    /// Returns (count, is_from_cache) where is_from_cache is true if from daily_stats.
10119    ///
10120    /// Falls back to COUNT(*) query when daily_stats table is empty or stale.
10121    pub fn count_sessions_in_range(
10122        &self,
10123        start_ts_ms: Option<i64>,
10124        end_ts_ms: Option<i64>,
10125        agent_slug: Option<&str>,
10126        source_id: Option<&str>,
10127    ) -> Result<(i64, bool)> {
10128        let agent = agent_slug.unwrap_or("all");
10129        let source = source_id.unwrap_or("all");
10130
10131        // Check if we have materialized stats
10132        let stats_count: i64 = self
10133            .conn
10134            .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10135                row.get_typed(0)
10136            })
10137            .unwrap_or(0);
10138
10139        if stats_count == 0 {
10140            return self.count_sessions_direct(start_ts_ms, end_ts_ms, agent_slug, source_id);
10141        }
10142
10143        // Use materialized stats
10144        let start_day = start_ts_ms.map(Self::day_id_from_millis);
10145        let end_day = end_ts_ms.map(Self::day_id_from_millis);
10146
10147        let count: i64 = match (start_day, end_day) {
10148            (Some(start), Some(end)) => self.conn.query_row_map(
10149                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10150                 WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4",
10151                fparams![start, end, agent, source],
10152                |row| row.get_typed(0),
10153            )?,
10154            (Some(start), None) => self.conn.query_row_map(
10155                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10156                 WHERE day_id >= ?1 AND agent_slug = ?2 AND source_id = ?3",
10157                fparams![start, agent, source],
10158                |row| row.get_typed(0),
10159            )?,
10160            (None, Some(end)) => self.conn.query_row_map(
10161                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10162                 WHERE day_id <= ?1 AND agent_slug = ?2 AND source_id = ?3",
10163                fparams![end, agent, source],
10164                |row| row.get_typed(0),
10165            )?,
10166            (None, None) => self.conn.query_row_map(
10167                "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10168                 WHERE agent_slug = ?1 AND source_id = ?2",
10169                fparams![agent, source],
10170                |row| row.get_typed(0),
10171            )?,
10172        };
10173
10174        Ok((count, true))
10175    }
10176
10177    /// Direct COUNT(*) query as fallback when daily_stats is empty.
10178    fn count_sessions_direct(
10179        &self,
10180        start_ts_ms: Option<i64>,
10181        end_ts_ms: Option<i64>,
10182        agent_slug: Option<&str>,
10183        source_id: Option<&str>,
10184    ) -> Result<(i64, bool)> {
10185        // Build dynamic SQL with positional params.  Single-table scan of
10186        // conversations; filter on agent slug via an EXISTS subquery only
10187        // when that filter is actually requested.  This avoids the unneeded
10188        // 2-table JOIN (which also silently dropped legacy conversations
10189        // with NULL agent_id) and sidesteps frankensqlite's materialization
10190        // fallback entirely.
10191        let mut sql = "SELECT COUNT(*) FROM conversations c WHERE 1=1".to_string();
10192        let mut param_values: Vec<ParamValue> = Vec::new();
10193        let mut idx = 1;
10194
10195        if let Some(start) = start_ts_ms {
10196            sql.push_str(&format!(" AND c.started_at >= ?{idx}"));
10197            param_values.push(ParamValue::from(start));
10198            idx += 1;
10199        }
10200        if let Some(end) = end_ts_ms {
10201            sql.push_str(&format!(" AND c.started_at <= ?{idx}"));
10202            param_values.push(ParamValue::from(end));
10203            idx += 1;
10204        }
10205        if let Some(agent) = agent_slug
10206            && agent != "all"
10207        {
10208            sql.push_str(&format!(
10209                " AND EXISTS (SELECT 1 FROM agents a WHERE a.id = c.agent_id AND a.slug = ?{idx})"
10210            ));
10211            param_values.push(ParamValue::from(agent));
10212            idx += 1;
10213        }
10214        if let Some(source) = source_id
10215            && source != "all"
10216        {
10217            sql.push_str(&format!(" AND c.source_id = ?{idx}"));
10218            param_values.push(ParamValue::from(source));
10219            let _ = idx; // suppress unused warning
10220        }
10221
10222        let count: i64 = self
10223            .conn
10224            .query_row_map(&sql, &param_values, |row| row.get_typed(0))?;
10225        Ok((count, false))
10226    }
10227
10228    /// Get daily histogram data for a date range.
10229    pub fn get_daily_histogram(
10230        &self,
10231        start_ts_ms: i64,
10232        end_ts_ms: i64,
10233        agent_slug: Option<&str>,
10234        source_id: Option<&str>,
10235    ) -> Result<Vec<DailyCount>> {
10236        let start_day = Self::day_id_from_millis(start_ts_ms);
10237        let end_day = Self::day_id_from_millis(end_ts_ms);
10238        let agent = agent_slug.unwrap_or("all");
10239        let source = source_id.unwrap_or("all");
10240
10241        let rows = self.conn.query_map_collect(
10242            "SELECT day_id, session_count, message_count, total_chars
10243             FROM daily_stats
10244             WHERE day_id BETWEEN ?1 AND ?2 AND agent_slug = ?3 AND source_id = ?4
10245             ORDER BY day_id",
10246            fparams![start_day, end_day, agent, source],
10247            |row| {
10248                Ok(DailyCount {
10249                    day_id: row.get_typed(0)?,
10250                    sessions: row.get_typed(1)?,
10251                    messages: row.get_typed(2)?,
10252                    chars: row.get_typed(3)?,
10253                })
10254            },
10255        )?;
10256
10257        Ok(rows)
10258    }
10259
10260    /// Check health of daily stats table.
10261    pub fn daily_stats_health(&self) -> Result<DailyStatsHealth> {
10262        let row_count: i64 =
10263            self.conn
10264                .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
10265                    row.get_typed(0)
10266                })?;
10267
10268        let oldest_update: Option<i64> = self.conn.query_row_map(
10269            "SELECT MIN(last_updated) FROM daily_stats",
10270            fparams![],
10271            |row| row.get_typed(0),
10272        )?;
10273
10274        let conversation_count: i64 =
10275            self.conn
10276                .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
10277                    row.get_typed(0)
10278                })?;
10279
10280        let materialized_total: i64 = self.conn.query_row_map(
10281            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats
10282                 WHERE agent_slug = 'all' AND source_id = 'all'",
10283            fparams![],
10284            |row| row.get_typed(0),
10285        )?;
10286
10287        Ok(DailyStatsHealth {
10288            populated: row_count > 0,
10289            row_count,
10290            oldest_update_ms: oldest_update,
10291            conversation_count,
10292            materialized_total,
10293            drift: (conversation_count - materialized_total).abs(),
10294        })
10295    }
10296
10297    /// Batch insert multiple conversations with full analytics (token usage,
10298    /// message metrics, rollups).  Frankensqlite equivalent of
10299    /// `SqliteStorage::insert_conversations_batched`.
10300    pub fn insert_conversations_batched(
10301        &self,
10302        conversations: &[(i64, Option<i64>, &Conversation)],
10303    ) -> Result<Vec<InsertOutcome>> {
10304        if conversations.is_empty() {
10305            return Ok(Vec::new());
10306        }
10307
10308        self.ensure_sources_for_batch(conversations)?;
10309
10310        let defer_lexical_updates = defer_storage_lexical_updates_enabled();
10311        let defer_analytics_updates = defer_analytics_updates_enabled();
10312
10313        let pricing_table = PricingTable::franken_load(&self.conn).unwrap_or_else(|e| {
10314            tracing::warn!(target: "cass::analytics::pricing", error = %e, "failed to load pricing table");
10315            PricingTable { entries: Vec::new() }
10316        });
10317        let mut pricing_diag = PricingDiagnostics::default();
10318
10319        let mut tx = self.conn.transaction()?;
10320
10321        // Bug #167: Ensure all referenced agents, workspaces, and sources
10322        // exist inside the transaction so FK checks pass.  The caller resolves
10323        // IDs via ensure_agent / ensure_workspace / ensure_sources_for_batch
10324        // outside the transaction, but those autocommit writes may not be
10325        // visible inside the transaction snapshot in frankensqlite.  Re-verify
10326        // (and insert if missing) within the tx.
10327        ensure_agents_in_tx(&tx, conversations)?;
10328        ensure_workspaces_in_tx(&tx, conversations)?;
10329        ensure_sources_in_tx(&tx, conversations)?;
10330
10331        let mut outcomes = Vec::with_capacity(conversations.len());
10332        let mut fts_entries = Vec::new();
10333        let mut fts_pending_chars = 0usize;
10334        let mut fts_inserted_total = 0usize;
10335        let mut fts_count_total = 0usize;
10336        let mut stats = StatsAggregator::new();
10337        let mut token_stats = TokenStatsAggregator::new();
10338        let mut token_entries: Vec<TokenUsageEntry> = Vec::new();
10339        let mut metrics_entries: Vec<MessageMetricsEntry> = Vec::new();
10340        let mut rollup_agg = AnalyticsRollupAggregator::new();
10341        let mut conv_ids_to_summarize: Vec<i64> = Vec::new();
10342        let mut pending_conversation_ids: HashMap<PendingConversationKey, i64> = HashMap::new();
10343        let mut pending_message_fingerprints: HashMap<i64, HashMap<i64, MessageMergeFingerprint>> =
10344            HashMap::new();
10345        let mut pending_message_replay_fingerprints: HashMap<
10346            i64,
10347            HashSet<MessageReplayFingerprint>,
10348        > = HashMap::new();
10349
10350        for &(agent_id, workspace_id, raw_conv) in conversations {
10351            let normalized_conv = normalized_conversation_for_storage(raw_conv);
10352            let conv = normalized_conv.as_ref();
10353            let mut total_chars: i64 = 0;
10354            let mut inserted_indices = Vec::with_capacity(conv.messages.len());
10355            let mut inserted_messages: Vec<(i64, &Message)> =
10356                Vec::with_capacity(conv.messages.len());
10357            let mut session_count_delta = 1_i64;
10358            let conversation_key = conversation_merge_key(agent_id, conv);
10359
10360            let existing_conv_id = if let Some(existing_id) =
10361                pending_conversation_ids.get(&conversation_key)
10362            {
10363                Some(*existing_id)
10364            } else {
10365                let existing_id =
10366                    franken_find_existing_conversation_by_key(&tx, &conversation_key, Some(conv))?;
10367                if let Some(existing_id) = existing_id {
10368                    pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10369                }
10370                existing_id
10371            };
10372
10373            let conv_id = if let Some(existing_id) = existing_conv_id {
10374                session_count_delta = 0;
10375                let ExistingMessageLookup {
10376                    by_idx: mut existing_messages,
10377                    replay: mut existing_replay_fingerprints,
10378                } = franken_existing_message_lookup_with_pending(
10379                    &tx,
10380                    existing_id,
10381                    &conv.messages,
10382                    &mut pending_message_fingerprints,
10383                    &mut pending_message_replay_fingerprints,
10384                )?;
10385                let ExistingConversationNewMessages {
10386                    messages: new_messages,
10387                    new_chars,
10388                    idx_collision_count,
10389                    first_collision_idx,
10390                } = collect_new_messages_for_existing_conversation(
10391                    existing_id,
10392                    conv,
10393                    &mut existing_messages,
10394                    &mut existing_replay_fingerprints,
10395                    "skipping replay-equivalent recovered message with shifted idx during batched merge",
10396                );
10397                let (inserted_last_idx, inserted_last_created_at) =
10398                    borrowed_messages_tail_state(&new_messages);
10399                let inserted_message_ids =
10400                    franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10401                total_chars += new_chars;
10402                for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10403                    franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10404                    if !defer_lexical_updates {
10405                        fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10406                        fts_count_total += 1;
10407                        fts_pending_chars = fts_pending_chars.saturating_add(msg.content.len());
10408                        if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10409                            || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10410                        {
10411                            flush_pending_fts_entries(
10412                                self,
10413                                &tx,
10414                                &mut fts_entries,
10415                                &mut fts_pending_chars,
10416                                &mut fts_inserted_total,
10417                            )?;
10418                        }
10419                    }
10420                    inserted_indices.push(msg.idx);
10421                    inserted_messages.push((msg_id, msg));
10422                }
10423
10424                if idx_collision_count > 0 {
10425                    tracing::warn!(
10426                        conversation_id = existing_id,
10427                        collision_count = idx_collision_count,
10428                        first_idx = first_collision_idx,
10429                        source_path = %conv.source_path.display(),
10430                        "message idx collisions encountered during batched conversation merge; retaining canonical message variants"
10431                    );
10432                }
10433
10434                let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10435                franken_update_conversation_tail_state(
10436                    &tx,
10437                    existing_id,
10438                    conv_last_ts,
10439                    inserted_last_idx,
10440                    inserted_last_created_at,
10441                )?;
10442                if let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv)
10443                {
10444                    franken_update_external_conversation_tail_lookup_key(
10445                        &tx,
10446                        &lookup_key,
10447                        conv_last_ts,
10448                        inserted_last_idx,
10449                        inserted_last_created_at,
10450                    )?;
10451                }
10452
10453                pending_message_fingerprints.insert(existing_id, existing_messages);
10454                pending_message_replay_fingerprints
10455                    .insert(existing_id, existing_replay_fingerprints);
10456
10457                existing_id
10458            } else {
10459                match franken_insert_conversation_or_get_existing(
10460                    &tx,
10461                    agent_id,
10462                    workspace_id,
10463                    conv,
10464                )? {
10465                    ConversationInsertStatus::Inserted(new_conv_id) => {
10466                        pending_conversation_ids.insert(conversation_key.clone(), new_conv_id);
10467                        let pending_messages =
10468                            pending_message_fingerprints.entry(new_conv_id).or_default();
10469                        let pending_replay_fingerprints = pending_message_replay_fingerprints
10470                            .entry(new_conv_id)
10471                            .or_default();
10472                        let mut new_messages = Vec::new();
10473                        for msg in &conv.messages {
10474                            let incoming_replay = message_replay_fingerprint(msg);
10475                            if pending_messages.contains_key(&msg.idx)
10476                                || pending_replay_fingerprints.contains(&incoming_replay)
10477                            {
10478                                continue;
10479                            }
10480                            pending_messages.insert(msg.idx, message_merge_fingerprint(msg));
10481                            pending_replay_fingerprints.insert(incoming_replay);
10482                            new_messages.push(msg);
10483                        }
10484                        let inserted_message_ids =
10485                            franken_batch_insert_new_messages(&tx, new_conv_id, &new_messages)?;
10486                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10487                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10488                            if !defer_lexical_updates {
10489                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10490                                fts_count_total += 1;
10491                                fts_pending_chars =
10492                                    fts_pending_chars.saturating_add(msg.content.len());
10493                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10494                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10495                                {
10496                                    flush_pending_fts_entries(
10497                                        self,
10498                                        &tx,
10499                                        &mut fts_entries,
10500                                        &mut fts_pending_chars,
10501                                        &mut fts_inserted_total,
10502                                    )?;
10503                                }
10504                            }
10505                            total_chars += msg.content.len() as i64;
10506                            inserted_indices.push(msg.idx);
10507                            inserted_messages.push((msg_id, msg));
10508                        }
10509                        new_conv_id
10510                    }
10511                    ConversationInsertStatus::Existing(existing_id) => {
10512                        session_count_delta = 0;
10513                        pending_conversation_ids.insert(conversation_key.clone(), existing_id);
10514                        let ExistingMessageLookup {
10515                            by_idx: mut existing_messages,
10516                            replay: mut existing_replay_fingerprints,
10517                        } = franken_existing_message_lookup_with_pending(
10518                            &tx,
10519                            existing_id,
10520                            &conv.messages,
10521                            &mut pending_message_fingerprints,
10522                            &mut pending_message_replay_fingerprints,
10523                        )?;
10524                        let ExistingConversationNewMessages {
10525                            messages: new_messages,
10526                            new_chars,
10527                            idx_collision_count,
10528                            first_collision_idx,
10529                        } = collect_new_messages_for_existing_conversation(
10530                            existing_id,
10531                            conv,
10532                            &mut existing_messages,
10533                            &mut existing_replay_fingerprints,
10534                            "skipping replay-equivalent recovered message with shifted idx after duplicate conversation recovery",
10535                        );
10536                        let (inserted_last_idx, inserted_last_created_at) =
10537                            borrowed_messages_tail_state(&new_messages);
10538                        let inserted_message_ids =
10539                            franken_append_insert_new_messages(&tx, existing_id, &new_messages)?;
10540                        total_chars += new_chars;
10541                        for (msg_id, msg) in inserted_message_ids.into_iter().zip(new_messages) {
10542                            franken_insert_snippets(&tx, msg_id, &msg.snippets)?;
10543                            if !defer_lexical_updates {
10544                                fts_entries.push(FtsEntry::from_message(msg_id, msg, conv));
10545                                fts_count_total += 1;
10546                                fts_pending_chars =
10547                                    fts_pending_chars.saturating_add(msg.content.len());
10548                                if fts_entries.len() >= FTS_ENTRY_BATCH_MAX_DOCS
10549                                    || fts_pending_chars >= FTS_ENTRY_BATCH_MAX_CHARS
10550                                {
10551                                    flush_pending_fts_entries(
10552                                        self,
10553                                        &tx,
10554                                        &mut fts_entries,
10555                                        &mut fts_pending_chars,
10556                                        &mut fts_inserted_total,
10557                                    )?;
10558                                }
10559                            }
10560                            inserted_indices.push(msg.idx);
10561                            inserted_messages.push((msg_id, msg));
10562                        }
10563
10564                        if idx_collision_count > 0 {
10565                            tracing::warn!(
10566                                conversation_id = existing_id,
10567                                collision_count = idx_collision_count,
10568                                first_idx = first_collision_idx,
10569                                source_path = %conv.source_path.display(),
10570                                "message idx collisions encountered after duplicate conversation recovery; retaining canonical message variants"
10571                            );
10572                        }
10573
10574                        let conv_last_ts = conv.messages.iter().filter_map(|m| m.created_at).max();
10575                        franken_update_conversation_tail_state(
10576                            &tx,
10577                            existing_id,
10578                            conv_last_ts,
10579                            inserted_last_idx,
10580                            inserted_last_created_at,
10581                        )?;
10582                        if let Some(lookup_key) =
10583                            conversation_external_lookup_key_for_conv(agent_id, conv)
10584                        {
10585                            franken_update_external_conversation_tail_lookup_key(
10586                                &tx,
10587                                &lookup_key,
10588                                conv_last_ts,
10589                                inserted_last_idx,
10590                                inserted_last_created_at,
10591                            )?;
10592                        }
10593
10594                        pending_message_fingerprints.insert(existing_id, existing_messages);
10595                        pending_message_replay_fingerprints
10596                            .insert(existing_id, existing_replay_fingerprints);
10597
10598                        existing_id
10599                    }
10600                }
10601            };
10602
10603            if !defer_analytics_updates {
10604                let delta = StatsDelta {
10605                    session_count_delta,
10606                    message_count_delta: inserted_messages.len() as i64,
10607                    total_chars_delta: total_chars,
10608                };
10609
10610                let effective_started_at = conversation_effective_started_at(conv);
10611                let day_id = effective_started_at
10612                    .map(FrankenStorage::day_id_from_millis)
10613                    .unwrap_or(0);
10614                stats.record_delta(
10615                    &conv.agent_slug,
10616                    &conv.source_id,
10617                    day_id,
10618                    delta.session_count_delta,
10619                    delta.message_count_delta,
10620                    delta.total_chars_delta,
10621                );
10622
10623                let conv_day_id = day_id;
10624                let mut session_model_family = String::from("unknown");
10625                let mut has_any_tokens = false;
10626
10627                for &(message_id, msg) in &inserted_messages {
10628                    let role_s = role_str(&msg.role);
10629                    let usage = if historical_raw_json(&msg.extra_json).is_some() {
10630                        crate::connectors::extract_tokens_for_agent(
10631                            &conv.agent_slug,
10632                            &serde_json::Value::Null,
10633                            &msg.content,
10634                            &role_s,
10635                        )
10636                    } else {
10637                        crate::connectors::extract_tokens_for_agent(
10638                            &conv.agent_slug,
10639                            &msg.extra_json,
10640                            &msg.content,
10641                            &role_s,
10642                        )
10643                    };
10644
10645                    let msg_ts = msg
10646                        .created_at
10647                        .or(conversation_effective_started_at(conv))
10648                        .unwrap_or(0);
10649                    let msg_day_id = if msg_ts > 0 {
10650                        FrankenStorage::day_id_from_millis(msg_ts)
10651                    } else {
10652                        conv_day_id
10653                    };
10654
10655                    let model_info = usage
10656                        .model_name
10657                        .as_deref()
10658                        .map(crate::connectors::normalize_model);
10659
10660                    let model_family = model_info
10661                        .as_ref()
10662                        .map(|i| i.family.clone())
10663                        .unwrap_or_else(|| "unknown".into());
10664                    let model_tier = model_info
10665                        .as_ref()
10666                        .map(|i| i.tier.clone())
10667                        .unwrap_or_else(|| "unknown".into());
10668                    let provider = usage
10669                        .provider
10670                        .clone()
10671                        .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
10672                        .unwrap_or_else(|| "unknown".into());
10673
10674                    if model_family != "unknown" {
10675                        session_model_family = model_family.clone();
10676                    }
10677
10678                    let estimated_cost = pricing_table.compute_cost(
10679                        usage.model_name.as_deref(),
10680                        msg_day_id,
10681                        usage.input_tokens,
10682                        usage.output_tokens,
10683                        usage.cache_read_tokens,
10684                        usage.cache_creation_tokens,
10685                    );
10686                    if estimated_cost.is_some() {
10687                        pricing_diag.record_priced();
10688                    } else if usage.has_token_data() {
10689                        pricing_diag.record_unpriced(usage.model_name.as_deref());
10690                    }
10691
10692                    token_stats.record(
10693                        &conv.agent_slug,
10694                        &conv.source_id,
10695                        msg_day_id,
10696                        &model_family,
10697                        &role_s,
10698                        &usage,
10699                        msg.content.len() as i64,
10700                        estimated_cost.unwrap_or(0.0),
10701                    );
10702
10703                    if usage.has_token_data() {
10704                        has_any_tokens = true;
10705                    }
10706
10707                    let content_chars = msg.content.len() as i64;
10708                    let content_tokens_est = content_chars / 4;
10709                    let msg_hour_id = FrankenStorage::hour_id_from_millis(msg_ts);
10710                    let has_plan = has_plan_for_role(&role_s, &msg.content);
10711
10712                    token_entries.push(TokenUsageEntry {
10713                        message_id,
10714                        conversation_id: conv_id,
10715                        agent_id,
10716                        workspace_id,
10717                        source_id: conv.source_id.clone(),
10718                        timestamp_ms: msg_ts,
10719                        day_id: msg_day_id,
10720                        model_name: usage.model_name.clone(),
10721                        model_family: Some(model_family.clone()),
10722                        model_tier: Some(model_tier.clone()),
10723                        service_tier: usage.service_tier.clone(),
10724                        provider: Some(provider.clone()),
10725                        input_tokens: usage.input_tokens,
10726                        output_tokens: usage.output_tokens,
10727                        cache_read_tokens: usage.cache_read_tokens,
10728                        cache_creation_tokens: usage.cache_creation_tokens,
10729                        thinking_tokens: usage.thinking_tokens,
10730                        total_tokens: usage.total_tokens(),
10731                        estimated_cost_usd: estimated_cost,
10732                        role: role_s.to_string(),
10733                        content_chars,
10734                        has_tool_calls: usage.has_tool_calls,
10735                        tool_call_count: usage.tool_call_count,
10736                        data_source: usage.data_source.as_str().to_string(),
10737                    });
10738
10739                    let mm = MessageMetricsEntry {
10740                        message_id,
10741                        created_at_ms: msg_ts,
10742                        hour_id: msg_hour_id,
10743                        day_id: msg_day_id,
10744                        agent_slug: conv.agent_slug.clone(),
10745                        workspace_id: workspace_id.unwrap_or(0),
10746                        source_id: conv.source_id.clone(),
10747                        role: role_s.to_string(),
10748                        content_chars,
10749                        content_tokens_est,
10750                        model_name: usage.model_name.clone(),
10751                        model_family: model_family.clone(),
10752                        model_tier: model_tier.clone(),
10753                        provider,
10754                        api_input_tokens: usage.input_tokens,
10755                        api_output_tokens: usage.output_tokens,
10756                        api_cache_read_tokens: usage.cache_read_tokens,
10757                        api_cache_creation_tokens: usage.cache_creation_tokens,
10758                        api_thinking_tokens: usage.thinking_tokens,
10759                        api_service_tier: usage.service_tier.clone(),
10760                        api_data_source: usage.data_source.as_str().to_string(),
10761                        tool_call_count: usage.tool_call_count as i64,
10762                        has_tool_calls: usage.has_tool_calls,
10763                        has_plan,
10764                    };
10765                    rollup_agg.record(&mm);
10766                    metrics_entries.push(mm);
10767                }
10768
10769                if session_count_delta > 0 {
10770                    token_stats.record_session(
10771                        &conv.agent_slug,
10772                        &conv.source_id,
10773                        conv_day_id,
10774                        &session_model_family,
10775                    );
10776                }
10777
10778                if has_any_tokens {
10779                    conv_ids_to_summarize.push(conv_id);
10780                }
10781            }
10782
10783            outcomes.push(InsertOutcome {
10784                conversation_id: conv_id,
10785                conversation_inserted: session_count_delta > 0,
10786                inserted_indices,
10787            });
10788        }
10789
10790        // Batch insert all FTS entries at once
10791        if !defer_lexical_updates {
10792            flush_pending_fts_entries(
10793                self,
10794                &tx,
10795                &mut fts_entries,
10796                &mut fts_pending_chars,
10797                &mut fts_inserted_total,
10798            )?;
10799        }
10800        if !defer_lexical_updates && fts_count_total > 0 {
10801            tracing::debug!(
10802                target: "cass::perf::fts5",
10803                total = fts_count_total,
10804                inserted = fts_inserted_total,
10805                conversations = conversations.len(),
10806                "franken_batch_fts_insert_complete"
10807            );
10808        }
10809
10810        // Batched daily_stats update
10811        if !defer_analytics_updates && !stats.is_empty() {
10812            let entries = stats.expand();
10813            let affected = franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
10814            tracing::debug!(
10815                target: "cass::perf::daily_stats",
10816                raw = stats.raw_entry_count(),
10817                expanded = entries.len(),
10818                affected = affected,
10819                "franken_batched_stats_update_complete"
10820            );
10821        }
10822
10823        // Batch insert token_usage rows
10824        if !defer_analytics_updates && !token_entries.is_empty() {
10825            let token_count = token_entries.len();
10826            let inserted = franken_insert_token_usage_batched_in_tx(&tx, &token_entries)?;
10827            tracing::debug!(
10828                target: "cass::perf::token_usage",
10829                total = token_count,
10830                inserted = inserted,
10831                "franken_batch_token_usage_insert_complete"
10832            );
10833        }
10834
10835        // Batched token_daily_stats update
10836        if !defer_analytics_updates && !token_stats.is_empty() {
10837            let entries = token_stats.expand();
10838            let affected = franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
10839            tracing::debug!(
10840                target: "cass::perf::token_daily_stats",
10841                raw = token_stats.raw_entry_count(),
10842                expanded = entries.len(),
10843                affected = affected,
10844                "franken_batched_token_stats_update_complete"
10845            );
10846        }
10847
10848        // Batch insert message_metrics rows
10849        if !defer_analytics_updates && !metrics_entries.is_empty() {
10850            let mm_count = metrics_entries.len();
10851            let inserted = franken_insert_message_metrics_batched_in_tx(&tx, &metrics_entries)?;
10852            tracing::debug!(
10853                target: "cass::perf::message_metrics",
10854                total = mm_count,
10855                inserted = inserted,
10856                "franken_batch_message_metrics_insert_complete"
10857            );
10858        }
10859
10860        // Flush usage_hourly + usage_daily rollups
10861        if !defer_analytics_updates && !rollup_agg.is_empty() {
10862            let (hourly, daily, models_daily) =
10863                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
10864            tracing::debug!(
10865                target: "cass::perf::usage_rollups",
10866                hourly_buckets = rollup_agg.hourly_entry_count(),
10867                daily_buckets = rollup_agg.daily_entry_count(),
10868                models_daily_buckets = rollup_agg.models_daily_entry_count(),
10869                hourly_affected = hourly,
10870                daily_affected = daily,
10871                models_daily_affected = models_daily,
10872                "franken_batched_usage_rollups_complete"
10873            );
10874        }
10875
10876        // Update conversation-level token summaries
10877        if !defer_analytics_updates {
10878            for conv_id in &conv_ids_to_summarize {
10879                franken_update_conversation_token_summaries_in_tx(&tx, *conv_id)?;
10880            }
10881        }
10882
10883        tx.commit()?;
10884
10885        pricing_diag.log_summary();
10886
10887        Ok(outcomes)
10888    }
10889}
10890
10891fn normalized_storage_source_parts(
10892    source_id: Option<&str>,
10893    origin_kind: Option<&str>,
10894    origin_host: Option<&str>,
10895) -> (String, SourceKind, Option<String>) {
10896    let host_label = crate::search::tantivy::normalized_index_origin_host(origin_host);
10897    let source_id = crate::search::tantivy::normalized_index_source_id(
10898        source_id,
10899        origin_kind,
10900        host_label.as_deref(),
10901    );
10902
10903    if source_id == LOCAL_SOURCE_ID {
10904        (source_id, SourceKind::Local, None)
10905    } else {
10906        (source_id, SourceKind::Ssh, host_label)
10907    }
10908}
10909
10910fn normalized_source_for_conversation(conv: &Conversation) -> Source {
10911    let (id, kind, host_label) = normalized_storage_source_parts(
10912        Some(conv.source_id.as_str()),
10913        None,
10914        conv.origin_host.as_deref(),
10915    );
10916    Source {
10917        id,
10918        kind,
10919        host_label,
10920        machine_id: None,
10921        platform: None,
10922        config_json: None,
10923        created_at: None,
10924        updated_at: None,
10925    }
10926}
10927
10928fn is_bootstrap_local_source(source: &Source) -> bool {
10929    source.id == LOCAL_SOURCE_ID
10930        && matches!(source.kind, SourceKind::Local)
10931        && source.host_label.is_none()
10932        && source.machine_id.is_none()
10933        && source.platform.is_none()
10934        && source.config_json.is_none()
10935        && source.created_at.is_none()
10936        && source.updated_at.is_none()
10937}
10938
10939fn normalized_conversation_for_storage<'a>(conv: &'a Conversation) -> Cow<'a, Conversation> {
10940    let normalized_source = normalized_source_for_conversation(conv);
10941    if normalized_source.id == conv.source_id && normalized_source.host_label == conv.origin_host {
10942        Cow::Borrowed(conv)
10943    } else {
10944        let mut normalized = conv.clone();
10945        normalized.source_id = normalized_source.id;
10946        normalized.origin_host = normalized_source.host_label;
10947        Cow::Owned(normalized)
10948    }
10949}
10950
10951impl FrankenStorage {
10952    fn ensure_source_for_conversation(&self, conv: &Conversation) -> Result<()> {
10953        let source = normalized_source_for_conversation(conv);
10954        if is_bootstrap_local_source(&source) {
10955            // `open()` and schema repair always seed the canonical local source row.
10956            // Avoid an autocommit UPDATE on every local conversation insert.
10957            return Ok(());
10958        }
10959        let cache_key = EnsuredConversationSourceKey::from_source(&source);
10960        if self.conversation_source_already_ensured(&cache_key) {
10961            return Ok(());
10962        }
10963        self.upsert_source(&source)?;
10964        self.mark_conversation_source_ensured(cache_key);
10965        Ok(())
10966    }
10967
10968    fn ensure_sources_for_batch(
10969        &self,
10970        conversations: &[(i64, Option<i64>, &Conversation)],
10971    ) -> Result<()> {
10972        let mut seen = HashSet::with_capacity(conversations.len());
10973        for &(_, _, conv) in conversations {
10974            let source = normalized_source_for_conversation(conv);
10975            if seen.insert(source.id.clone()) {
10976                if is_bootstrap_local_source(&source) {
10977                    continue;
10978                }
10979                self.upsert_source(&source)?;
10980                self.mark_conversation_source_ensured(EnsuredConversationSourceKey::from_source(
10981                    &source,
10982                ));
10983            }
10984        }
10985        Ok(())
10986    }
10987}
10988
10989// =========================================================================
10990// FrankenStorage transaction helper functions
10991// =========================================================================
10992
10993/// Get last_insert_rowid from a frankensqlite transaction.
10994fn franken_last_rowid(tx: &FrankenTransaction<'_>) -> Result<i64> {
10995    tx.last_insert_rowid()
10996        .ok()
10997        .filter(|&id| id > 0)
10998        .with_context(|| "last_insert_rowid() returned NULL or 0 after INSERT")
10999}
11000
11001/// Bug #167: Ensure all agents referenced by a batch exist within the
11002/// transaction.  The caller already resolved `agent_id` values via
11003/// `ensure_agent` outside the transaction, but those autocommit writes may
11004/// not be visible inside a frankensqlite transaction snapshot.  This function
11005/// checks each unique agent_id and creates a stub row if it's missing.
11006fn ensure_agents_in_tx(
11007    tx: &FrankenTransaction<'_>,
11008    conversations: &[(i64, Option<i64>, &Conversation)],
11009) -> Result<()> {
11010    let mut seen = HashSet::new();
11011    let now = FrankenStorage::now_millis();
11012    for &(agent_id, _, conv) in conversations {
11013        if !seen.insert(agent_id) {
11014            continue;
11015        }
11016        let exists: i64 = tx.query_row_map(
11017            "SELECT COUNT(*) FROM agents WHERE id = ?1",
11018            fparams![agent_id],
11019            |row| row.get_typed(0),
11020        )?;
11021        if exists == 0 {
11022            tracing::debug!(
11023                target: "cass::fk_guard",
11024                agent_id,
11025                slug = %conv.agent_slug,
11026                "inserting agent row inside transaction to satisfy FK constraint"
11027            );
11028            // INSERT OR IGNORE: the slug might already exist with a different
11029            // id from a concurrent writer.  If the slug row exists, the FK
11030            // constraint is already satisfied (the caller just got a stale id).
11031            tx.execute_compat(
11032                "INSERT OR IGNORE INTO agents(id, slug, name, kind, created_at, updated_at)
11033                 VALUES(?1, ?2, ?3, 'cli', ?4, ?5)",
11034                fparams![
11035                    agent_id,
11036                    conv.agent_slug.as_str(),
11037                    conv.agent_slug.as_str(),
11038                    now,
11039                    now
11040                ],
11041            )?;
11042        }
11043    }
11044    Ok(())
11045}
11046
11047/// Bug #167: Ensure all workspaces referenced by a batch exist within the
11048/// transaction.  Same rationale as `ensure_agents_in_tx`.
11049fn ensure_workspaces_in_tx(
11050    tx: &FrankenTransaction<'_>,
11051    conversations: &[(i64, Option<i64>, &Conversation)],
11052) -> Result<()> {
11053    let mut seen = HashSet::new();
11054    for &(_, workspace_id, conv) in conversations {
11055        let ws_id = match workspace_id {
11056            Some(id) => id,
11057            None => continue,
11058        };
11059        if !seen.insert(ws_id) {
11060            continue;
11061        }
11062        let exists: i64 = tx.query_row_map(
11063            "SELECT COUNT(*) FROM workspaces WHERE id = ?1",
11064            fparams![ws_id],
11065            |row| row.get_typed(0),
11066        )?;
11067        if exists == 0 {
11068            let path_str = conv
11069                .workspace
11070                .as_ref()
11071                .map(|p| p.to_string_lossy().to_string())
11072                .unwrap_or_default();
11073            tracing::debug!(
11074                target: "cass::fk_guard",
11075                workspace_id = ws_id,
11076                path = %path_str,
11077                "inserting workspace row inside transaction to satisfy FK constraint"
11078            );
11079            tx.execute_compat(
11080                "INSERT OR IGNORE INTO workspaces(id, path) VALUES(?1, ?2)",
11081                fparams![ws_id, path_str.as_str()],
11082            )?;
11083        }
11084    }
11085    Ok(())
11086}
11087
11088/// Bug #167: Ensure all sources referenced by a batch exist within the
11089/// transaction.  Same rationale as `ensure_agents_in_tx` — source_id is a
11090/// TEXT FK on the conversations table.
11091fn ensure_sources_in_tx(
11092    tx: &FrankenTransaction<'_>,
11093    conversations: &[(i64, Option<i64>, &Conversation)],
11094) -> Result<()> {
11095    let mut seen = HashSet::new();
11096    for &(_, _, conv) in conversations {
11097        let (source_id, source_kind, host_label) = normalized_storage_source_parts(
11098            Some(conv.source_id.as_str()),
11099            None,
11100            conv.origin_host.as_deref(),
11101        );
11102        if !seen.insert(source_id.clone()) {
11103            continue;
11104        }
11105        let exists: i64 = tx.query_row_map(
11106            "SELECT COUNT(*) FROM sources WHERE id = ?1",
11107            fparams![source_id.as_str()],
11108            |row| row.get_typed(0),
11109        )?;
11110        if exists == 0 {
11111            let kind_str = source_kind.to_string();
11112            let now = FrankenStorage::now_millis();
11113            tracing::debug!(
11114                target: "cass::fk_guard",
11115                source_id = %source_id,
11116                kind = kind_str.as_str(),
11117                "inserting source row inside transaction to satisfy FK constraint"
11118            );
11119            tx.execute_compat(
11120                "INSERT OR IGNORE INTO sources(id, kind, host_label, created_at, updated_at)
11121                 VALUES(?1, ?2, ?3, ?4, ?5)",
11122                fparams![
11123                    source_id.as_str(),
11124                    kind_str.as_str(),
11125                    host_label.as_deref(),
11126                    now,
11127                    now
11128                ],
11129            )?;
11130        }
11131    }
11132    Ok(())
11133}
11134
11135fn env_flag_enabled(name: &str) -> bool {
11136    dotenvy::var(name)
11137        .map(|v| !(v == "0" || v.eq_ignore_ascii_case("false")))
11138        .unwrap_or(false)
11139}
11140
11141fn defer_storage_lexical_updates_enabled() -> bool {
11142    env_flag_enabled("CASS_DEFER_LEXICAL_UPDATES")
11143}
11144
11145fn defer_analytics_updates_enabled() -> bool {
11146    env_flag_enabled("CASS_DEFER_ANALYTICS_UPDATES")
11147}
11148
/// Outcome of attempting to insert a conversation row.
///
/// Both variants carry an `i64` id (presumably the conversation row id —
/// confirm against the callers that construct this type).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum ConversationInsertStatus {
    /// A new row was inserted.
    Inserted(i64),
    /// A matching row already existed.
    Existing(i64),
}
11153
11154fn franken_find_external_conversation_tail_lookup(
11155    tx: &FrankenTransaction<'_>,
11156    lookup_key: &str,
11157) -> Result<Option<ExistingConversationWithTail>> {
11158    let params = [SqliteValue::from(lookup_key)];
11159    let row = tx
11160        .query_row_with_params(
11161            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
11162             FROM conversation_external_tail_lookup
11163             WHERE lookup_key = ?1",
11164            &params,
11165        )
11166        .optional()?;
11167    let Some(row) = row else {
11168        return Ok(None);
11169    };
11170    let id = row.get_typed(0)?;
11171    let ended_at = row.get_typed(1)?;
11172    let last_message_idx = row.get_typed(2)?;
11173    let last_message_created_at = row.get_typed(3)?;
11174    Ok(Some(ExistingConversationWithTail {
11175        id,
11176        tail_state: existing_conversation_tail_state_from_cached(
11177            last_message_idx,
11178            last_message_created_at,
11179            ended_at,
11180        ),
11181    }))
11182}
11183
11184fn franken_find_external_conversation_lookup(
11185    tx: &FrankenTransaction<'_>,
11186    lookup_key: &str,
11187) -> Result<Option<i64>> {
11188    Ok(franken_find_external_conversation_tail_lookup(tx, lookup_key)?.map(|existing| existing.id))
11189}
11190
11191fn franken_insert_external_conversation_tail_lookup_key(
11192    tx: &FrankenTransaction<'_>,
11193    lookup_key: &str,
11194    conversation_id: i64,
11195    ended_at: Option<i64>,
11196    last_message_idx: Option<i64>,
11197    last_message_created_at: Option<i64>,
11198) -> Result<()> {
11199    let params = [
11200        SqliteValue::from(lookup_key),
11201        SqliteValue::from(conversation_id),
11202        SqliteValue::from(ended_at),
11203        SqliteValue::from(last_message_idx),
11204        SqliteValue::from(last_message_created_at),
11205    ];
11206    tx.execute_with_params(
11207        "INSERT OR REPLACE INTO conversation_external_tail_lookup(
11208             lookup_key, conversation_id, ended_at, last_message_idx, last_message_created_at
11209         ) VALUES(?1, ?2, ?3, ?4, ?5)",
11210        &params,
11211    )?;
11212    Ok(())
11213}
11214
11215fn franken_insert_external_conversation_tail_lookup(
11216    tx: &FrankenTransaction<'_>,
11217    source_id: &str,
11218    agent_id: i64,
11219    external_id: &str,
11220    existing: ExistingConversationWithTail,
11221) -> Result<()> {
11222    let lookup_key = conversation_external_lookup_key(source_id, agent_id, external_id);
11223    let ended_at = existing.tail_state.and_then(|state| state.ended_at);
11224    let last_message_idx = existing.tail_state.map(|state| state.last_message_idx);
11225    let last_message_created_at = existing
11226        .tail_state
11227        .map(|state| state.last_message_created_at);
11228    franken_insert_external_conversation_tail_lookup_key(
11229        tx,
11230        &lookup_key,
11231        existing.id,
11232        ended_at,
11233        last_message_idx,
11234        last_message_created_at,
11235    )
11236}
11237
11238fn franken_update_external_conversation_tail_lookup_key(
11239    tx: &FrankenTransaction<'_>,
11240    lookup_key: &str,
11241    ended_at_candidate: Option<i64>,
11242    last_message_idx_candidate: Option<i64>,
11243    last_message_created_at_candidate: Option<i64>,
11244) -> Result<()> {
11245    if ended_at_candidate.is_none()
11246        && last_message_idx_candidate.is_none()
11247        && last_message_created_at_candidate.is_none()
11248    {
11249        return Ok(());
11250    }
11251    tx.execute_compat(
11252        "UPDATE conversation_external_tail_lookup
11253         SET ended_at = CASE
11254                 WHEN ?1 IS NULL THEN ended_at
11255                 ELSE MAX(IFNULL(ended_at, 0), ?1)
11256             END,
11257             last_message_idx = CASE
11258                 WHEN ?2 IS NULL THEN last_message_idx
11259                 WHEN last_message_idx IS NULL OR last_message_idx < ?2 THEN ?2
11260                 ELSE last_message_idx
11261             END,
11262             last_message_created_at = CASE
11263                 WHEN ?3 IS NULL THEN last_message_created_at
11264                 WHEN last_message_created_at IS NULL OR last_message_created_at < ?3 THEN ?3
11265                 ELSE last_message_created_at
11266             END
11267         WHERE lookup_key = ?4",
11268        fparams![
11269            ended_at_candidate,
11270            last_message_idx_candidate,
11271            last_message_created_at_candidate,
11272            lookup_key
11273        ],
11274    )?;
11275    Ok(())
11276}
11277
11278fn franken_set_external_conversation_tail_lookup_after_append(
11279    tx: &FrankenTransaction<'_>,
11280    lookup_key: &str,
11281    ended_at: i64,
11282    last_message_idx: i64,
11283    last_message_created_at: i64,
11284) -> Result<()> {
11285    tx.execute_compat(
11286        "UPDATE conversation_external_tail_lookup
11287         SET ended_at = ?1,
11288             last_message_idx = ?2,
11289             last_message_created_at = ?3
11290         WHERE lookup_key = ?4",
11291        fparams![
11292            ended_at,
11293            last_message_idx,
11294            last_message_created_at,
11295            lookup_key
11296        ],
11297    )?;
11298    Ok(())
11299}
11300
11301fn franken_update_external_conversation_tail_after_append(
11302    tx: &FrankenTransaction<'_>,
11303    agent_id: i64,
11304    conv: &Conversation,
11305    used_append_tail_plan: bool,
11306    exact_append_set: bool,
11307    inserted_last_idx: Option<i64>,
11308    inserted_last_created_at: Option<i64>,
11309) -> Result<()> {
11310    let Some(lookup_key) = conversation_external_lookup_key_for_conv(agent_id, conv) else {
11311        return Ok(());
11312    };
11313
11314    if exact_append_set
11315        && let (Some(last_message_idx), Some(last_message_created_at)) =
11316            (inserted_last_idx, inserted_last_created_at)
11317    {
11318        return franken_set_external_conversation_tail_lookup_after_append(
11319            tx,
11320            &lookup_key,
11321            last_message_created_at,
11322            last_message_idx,
11323            last_message_created_at,
11324        );
11325    }
11326
11327    let ended_at_candidate = if used_append_tail_plan {
11328        inserted_last_created_at
11329    } else {
11330        conv.messages.iter().filter_map(|m| m.created_at).max()
11331    };
11332    franken_update_external_conversation_tail_lookup_key(
11333        tx,
11334        &lookup_key,
11335        ended_at_candidate,
11336        inserted_last_idx,
11337        inserted_last_created_at,
11338    )
11339}
11340
11341fn franken_find_existing_conversation_by_key(
11342    tx: &FrankenTransaction<'_>,
11343    key: &PendingConversationKey,
11344    conv: Option<&Conversation>,
11345) -> Result<Option<i64>> {
11346    franken_find_existing_conversation_by_key_impl(tx, key, conv, false)
11347}
11348
11349fn franken_find_existing_conversation_by_key_after_conflict(
11350    tx: &FrankenTransaction<'_>,
11351    key: &PendingConversationKey,
11352    conv: Option<&Conversation>,
11353) -> Result<Option<i64>> {
11354    franken_find_existing_conversation_by_key_impl(tx, key, conv, true)
11355}
11356
11357fn franken_find_existing_conversation_by_key_impl(
11358    tx: &FrankenTransaction<'_>,
11359    key: &PendingConversationKey,
11360    conv: Option<&Conversation>,
11361    allow_legacy_external_scan: bool,
11362) -> Result<Option<i64>> {
11363    match key {
11364        PendingConversationKey::External {
11365            source_id,
11366            agent_id,
11367            external_id,
11368        } => {
11369            let lookup_key = conversation_external_lookup_key(source_id, *agent_id, external_id);
11370            if let Some(existing_id) = franken_find_external_conversation_lookup(tx, &lookup_key)? {
11371                return Ok(Some(existing_id));
11372            }
11373            if !allow_legacy_external_scan {
11374                return Ok(None);
11375            }
11376
11377            let existing_id = tx
11378                .query_row_map(
11379                    "SELECT id
11380                 FROM conversations
11381                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
11382                    fparams![source_id.as_str(), *agent_id, external_id.as_str()],
11383                    |row| row.get_typed(0),
11384                )
11385                .optional()?;
11386            if let Some(existing_id) = existing_id {
11387                let tail_state = franken_existing_conversation_append_tail_state(tx, existing_id)?;
11388                franken_insert_external_conversation_tail_lookup_key(
11389                    tx,
11390                    &lookup_key,
11391                    existing_id,
11392                    tail_state.and_then(|state| state.ended_at),
11393                    tail_state.map(|state| state.last_message_idx),
11394                    tail_state.map(|state| state.last_message_created_at),
11395                )?;
11396                Ok(Some(existing_id))
11397            } else {
11398                Ok(None)
11399            }
11400        }
11401        PendingConversationKey::SourcePath {
11402            source_id,
11403            agent_id,
11404            source_path,
11405            started_at,
11406        } => {
11407            let exact_match = tx
11408                .query_row_map(
11409                    "SELECT c.id
11410                     FROM conversations c
11411                     WHERE c.source_id = ?1
11412                       AND c.agent_id = ?2
11413                       AND c.source_path = ?3
11414                       AND ((
11415                            COALESCE(
11416                                c.started_at,
11417                                (SELECT MIN(created_at)
11418                                 FROM messages
11419                                 WHERE conversation_id = c.id
11420                                   AND created_at IS NOT NULL)
11421                            ) IS NULL
11422                            AND ?4 IS NULL
11423                       ) OR COALESCE(
11424                            c.started_at,
11425                            (SELECT MIN(created_at)
11426                             FROM messages
11427                             WHERE conversation_id = c.id
11428                               AND created_at IS NOT NULL)
11429                       ) = ?4)
11430                     ORDER BY c.id
11431                     LIMIT 1",
11432                    fparams![
11433                        source_id.as_str(),
11434                        *agent_id,
11435                        source_path.as_str(),
11436                        *started_at
11437                    ],
11438                    |row| row.get_typed(0),
11439                )
11440                .optional()?;
11441            if exact_match.is_some() {
11442                return Ok(exact_match);
11443            }
11444
11445            let Some(conv) = conv else {
11446                return Ok(None);
11447            };
11448            let incoming_fingerprints = conversation_message_fingerprints(conv);
11449            if incoming_fingerprints.is_empty() {
11450                return Ok(None);
11451            }
11452            let incoming_replay_fingerprints = conversation_message_replay_fingerprints(conv);
11453
11454            let candidates: Vec<(i64, Option<i64>)> = tx.query_map_collect(
11455                "SELECT
11456                     c.id,
11457                     COALESCE(
11458                         c.started_at,
11459                         (SELECT MIN(created_at)
11460                          FROM messages
11461                          WHERE conversation_id = c.id
11462                            AND created_at IS NOT NULL)
11463                     ) AS effective_started_at
11464                 FROM conversations c
11465                 WHERE c.source_id = ?1
11466                   AND c.agent_id = ?2
11467                   AND c.source_path = ?3
11468                 ORDER BY c.id",
11469                fparams![source_id.as_str(), *agent_id, source_path.as_str()],
11470                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
11471            )?;
11472
11473            let mut best_candidate: Option<(i64, ConversationMergeEvidence)> = None;
11474            for (candidate_id, candidate_started_at) in candidates {
11475                let existing_fingerprints =
11476                    franken_existing_message_fingerprints(tx, candidate_id)?;
11477                let existing_replay_fingerprints =
11478                    replay_fingerprints_from_merge_set(&existing_fingerprints);
11479                let Some(evidence) = conversation_merge_evidence(
11480                    &incoming_fingerprints,
11481                    &incoming_replay_fingerprints,
11482                    &existing_fingerprints,
11483                    &existing_replay_fingerprints,
11484                    *started_at,
11485                    candidate_started_at,
11486                ) else {
11487                    continue;
11488                };
11489
11490                let candidate_key = (
11491                    evidence.exact_overlap,
11492                    evidence.replay_overlap,
11493                    evidence.started_close,
11494                    evidence.smaller_replay_set,
11495                    std::cmp::Reverse(evidence.start_distance_ms),
11496                );
11497                let should_replace = best_candidate
11498                    .as_ref()
11499                    .map(|(_, best_evidence)| {
11500                        candidate_key
11501                            > (
11502                                best_evidence.exact_overlap,
11503                                best_evidence.replay_overlap,
11504                                best_evidence.started_close,
11505                                best_evidence.smaller_replay_set,
11506                                std::cmp::Reverse(best_evidence.start_distance_ms),
11507                            )
11508                    })
11509                    .unwrap_or(true);
11510
11511                if should_replace {
11512                    best_candidate = Some((candidate_id, evidence));
11513                }
11514            }
11515
11516            Ok(best_candidate.map(|(candidate_id, _)| candidate_id))
11517        }
11518    }
11519}
11520
11521fn franken_insert_conversation_or_get_existing(
11522    tx: &FrankenTransaction<'_>,
11523    agent_id: i64,
11524    workspace_id: Option<i64>,
11525    conv: &Conversation,
11526) -> Result<ConversationInsertStatus> {
11527    let conversation_key = conversation_merge_key(agent_id, conv);
11528    if let Some(existing_id) =
11529        franken_find_existing_conversation_by_key(tx, &conversation_key, Some(conv))?
11530    {
11531        return Ok(ConversationInsertStatus::Existing(existing_id));
11532    }
11533
11534    franken_insert_conversation_or_get_existing_after_miss(
11535        tx,
11536        agent_id,
11537        workspace_id,
11538        conv,
11539        &conversation_key,
11540    )
11541}
11542
11543fn franken_insert_conversation_or_get_existing_after_miss(
11544    tx: &FrankenTransaction<'_>,
11545    agent_id: i64,
11546    workspace_id: Option<i64>,
11547    conv: &Conversation,
11548    conversation_key: &PendingConversationKey,
11549) -> Result<ConversationInsertStatus> {
11550    match franken_insert_conversation(tx, agent_id, workspace_id, conv) {
11551        Ok(Some(conv_id)) => Ok(ConversationInsertStatus::Inserted(conv_id)),
11552        Ok(None) => {
11553            // A concurrent writer won the unique-provenance race. Resolve the
11554            // canonical row so callers can merge messages into it.
11555            let existing_id =
11556                franken_find_existing_conversation_by_key_after_conflict(
11557                    tx,
11558                    conversation_key,
11559                    Some(conv),
11560                )?
11561                    .with_context(|| {
11562                        format!(
11563                            "conversation INSERT produced a duplicate conflict but existing row was not found for source_id={} agent_id={} external_id={:?} source_path={}",
11564                            conv.source_id,
11565                            agent_id,
11566                            conv.external_id,
11567                            conv.source_path.display()
11568                        )
11569                    })?;
11570            tracing::warn!(
11571                source_id = %conv.source_id,
11572                agent_id,
11573                external_id = ?conv.external_id,
11574                existing_id,
11575                source_path = %conv.source_path.display(),
11576                "conversation INSERT: duplicate gracefully recovered, reusing existing row"
11577            );
11578            Ok(ConversationInsertStatus::Existing(existing_id))
11579        }
11580        Err(error) => {
11581            tracing::error!(
11582                source_id = %conv.source_id,
11583                agent_id,
11584                external_id = ?conv.external_id,
11585                error = %error,
11586                source_path = %conv.source_path.display(),
11587                "franken_insert_conversation failed"
11588            );
11589            Err(error)
11590        }
11591    }
11592}
11593
11594/// Insert a conversation into the DB within a frankensqlite transaction.
11595///
11596/// Uses a plain `INSERT` so the common miss path stays on the slim direct
11597/// insert lane. Duplicate provenance conflicts are converted into `Ok(None)`
11598/// so callers can recover the canonical row and merge messages into it.
11599fn franken_insert_conversation(
11600    tx: &FrankenTransaction<'_>,
11601    agent_id: i64,
11602    workspace_id: Option<i64>,
11603    conv: &Conversation,
11604) -> Result<Option<i64>> {
11605    let (metadata_json_str, metadata_bin) = franken_metadata_insert_payload(&conv.metadata_json)?;
11606    let (last_message_idx, last_message_created_at) = conversation_tail_state(conv);
11607    let metadata_bin_bytes = metadata_bin.as_deref();
11608
11609    match tx.execute_compat(
11610        "INSERT INTO conversations(
11611            agent_id, workspace_id, source_id, external_id, title, source_path,
11612            started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin,
11613            last_message_idx, last_message_created_at
11614        ) VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14)",
11615        fparams![
11616            agent_id,
11617            workspace_id,
11618            conv.source_id.as_str(),
11619            conv.external_id.as_deref(),
11620            conv.title.as_deref(),
11621            path_to_string(&conv.source_path),
11622            conv.started_at,
11623            conv.ended_at,
11624            conv.approx_tokens,
11625            metadata_json_str.as_deref(),
11626            conv.origin_host.as_deref(),
11627            metadata_bin_bytes,
11628            last_message_idx,
11629            last_message_created_at
11630        ],
11631    ) {
11632        Ok(_) => {
11633            let conv_id = franken_last_rowid(tx)?;
11634            franken_insert_conversation_tail_state(
11635                tx,
11636                conv_id,
11637                conv.ended_at,
11638                last_message_idx,
11639                last_message_created_at,
11640            )?;
11641            if let Some(external_id) = conv.external_id.as_deref() {
11642                franken_insert_external_conversation_tail_lookup(
11643                    tx,
11644                    conv.source_id.as_str(),
11645                    agent_id,
11646                    external_id,
11647                    ExistingConversationWithTail {
11648                        id: conv_id,
11649                        tail_state: existing_conversation_tail_state_from_cached(
11650                            last_message_idx,
11651                            last_message_created_at,
11652                            conv.ended_at,
11653                        ),
11654                    },
11655                )?;
11656            }
11657            Ok(Some(conv_id))
11658        }
11659        Err(frankensqlite::FrankenError::UniqueViolation { .. }) => {
11660            tracing::debug!(
11661                source_id = %conv.source_id,
11662                agent_id,
11663                external_id = ?conv.external_id,
11664                source_path = %conv.source_path.display(),
11665                "conversation INSERT: duplicate provenance conflict"
11666            );
11667            Ok(None)
11668        }
11669        Err(error) => Err(error.into()),
11670    }
11671}
11672
11673type MetadataInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11674
11675fn franken_metadata_insert_payload(value: &serde_json::Value) -> Result<MetadataInsertPayload<'_>> {
11676    if let Some(raw) = historical_raw_json(value) {
11677        Ok((Some(Cow::Borrowed(raw)), None))
11678    } else if value.is_null() {
11679        Ok((Some(Cow::Borrowed("null")), None))
11680    } else if value.as_object().is_some_and(|object| object.is_empty()) {
11681        Ok((None, None))
11682    } else if let Some(metadata_bin) = serialize_json_to_msgpack(value) {
11683        Ok((None, Some(metadata_bin)))
11684    } else {
11685        Ok((Some(Cow::Owned(serde_json::to_string(value)?)), None))
11686    }
11687}
11688
11689fn franken_insert_new_message(
11690    tx: &FrankenTransaction<'_>,
11691    conversation_id: i64,
11692    msg: &Message,
11693) -> Result<i64> {
11694    let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11695    let extra_bin_bytes = extra_bin.as_deref();
11696
11697    tx.execute_compat(
11698        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
11699         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
11700            fparams![
11701                conversation_id,
11702                msg.idx,
11703                role_as_str(&msg.role),
11704                msg.author.as_deref(),
11705                msg.created_at,
11706                msg.content.as_str(),
11707                extra_json_str.as_deref(),
11708                extra_bin_bytes
11709        ],
11710    )?;
11711    franken_last_rowid(tx)
11712}
11713
11714type MessageInsertPayload<'a> = (Option<Cow<'a, str>>, Option<Vec<u8>>);
11715
11716fn franken_message_insert_payload(msg: &Message) -> Result<MessageInsertPayload<'_>> {
11717    if let Some(raw) = historical_raw_json(&msg.extra_json) {
11718        Ok((Some(Cow::Borrowed(raw)), None))
11719    } else if msg.extra_json.is_null() {
11720        Ok((None, None))
11721    } else {
11722        let extra_bin = serialize_json_to_msgpack(&msg.extra_json);
11723        if extra_bin.is_some() {
11724            Ok((None, extra_bin))
11725        } else {
11726            Ok((
11727                Some(Cow::Owned(serde_json::to_string(&msg.extra_json)?)),
11728                None,
11729            ))
11730        }
11731    }
11732}
11733
/// Batch size for proven-new message inserts.
///
/// Each row binds 8 values, so a full 100-row batch binds 800 parameters. That
/// stays under the 999-parameter `SQLITE_MAX_VARIABLE_NUMBER` default used by
/// SQLite releases before 3.32.0 (later releases default to 32766) while still
/// amortizing parse cost.
// NOTE(review): the limit actually enforced by frankensqlite is not visible
// here — confirm it is at least 800.
const MESSAGE_INSERT_BATCH_SIZE: usize = 100;

/// Batch size for append inserts, which profile fastest with chunks larger
/// than the previous 20-row setting on current frankensqlite.
///
/// After the tail-state hot table removed conversation-row rewrites from the
/// append path, 50-row chunks beat the old 20-row setting on the append-merge
/// profile. 100-row chunks slightly regress the 20-message workload.
const APPEND_MESSAGE_INSERT_BATCH_SIZE: usize = 50;
11746
11747fn message_insert_batch_sql(row_count: usize) -> &'static str {
11748    static MESSAGE_INSERT_BATCH_SQL: std::sync::OnceLock<Vec<String>> = std::sync::OnceLock::new();
11749
11750    let max_batch_size = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
11751    let cached_sql = MESSAGE_INSERT_BATCH_SQL.get_or_init(|| {
11752        let mut sql_by_row_count = Vec::with_capacity(max_batch_size + 1);
11753        sql_by_row_count.push(String::new());
11754        for row_count in 1..=max_batch_size {
11755            let placeholders = (0..row_count)
11756                .map(|idx| {
11757                    let base = idx * 8;
11758                    format!(
11759                        "(?{},?{},?{},?{},?{},?{},?{},?{})",
11760                        base + 1,
11761                        base + 2,
11762                        base + 3,
11763                        base + 4,
11764                        base + 5,
11765                        base + 6,
11766                        base + 7,
11767                        base + 8
11768                    )
11769                })
11770                .collect::<Vec<_>>()
11771                .join(",");
11772            sql_by_row_count.push(format!(
11773                "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin) VALUES {placeholders}"
11774            ));
11775        }
11776        sql_by_row_count
11777    });
11778
11779    cached_sql
11780        .get(row_count)
11781        .map(String::as_str)
11782        .expect("message insert batch size must be covered by the cached SQL table")
11783}
11784
11785fn franken_batch_insert_new_messages(
11786    tx: &FrankenTransaction<'_>,
11787    conversation_id: i64,
11788    messages: &[&Message],
11789) -> Result<Vec<i64>> {
11790    franken_batch_insert_new_messages_with_batch_size(
11791        tx,
11792        conversation_id,
11793        messages,
11794        MESSAGE_INSERT_BATCH_SIZE,
11795    )
11796}
11797
11798fn franken_append_insert_new_messages(
11799    tx: &FrankenTransaction<'_>,
11800    conversation_id: i64,
11801    messages: &[&Message],
11802) -> Result<Vec<i64>> {
11803    franken_batch_insert_new_messages_with_batch_size(
11804        tx,
11805        conversation_id,
11806        messages,
11807        APPEND_MESSAGE_INSERT_BATCH_SIZE,
11808    )
11809}
11810
11811fn franken_batch_insert_new_messages_with_batch_size(
11812    tx: &FrankenTransaction<'_>,
11813    conversation_id: i64,
11814    messages: &[&Message],
11815    batch_size: usize,
11816) -> Result<Vec<i64>> {
11817    let batch_size = batch_size.max(1);
11818    let mut inserted_ids = Vec::with_capacity(messages.len());
11819    for chunk in messages.chunks(batch_size) {
11820        if chunk.len() == 1 {
11821            inserted_ids.push(franken_insert_new_message(tx, conversation_id, chunk[0])?);
11822            continue;
11823        }
11824        let sql = message_insert_batch_sql(chunk.len());
11825
11826        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 8);
11827        for msg in chunk {
11828            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
11829            param_values.push(SqliteValue::from(conversation_id));
11830            param_values.push(SqliteValue::from(msg.idx));
11831            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
11832            param_values.push(SqliteValue::from(msg.author.as_deref()));
11833            param_values.push(SqliteValue::from(msg.created_at));
11834            param_values.push(SqliteValue::from(msg.content.as_str()));
11835            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
11836            param_values.push(SqliteValue::from(extra_bin.as_deref()));
11837        }
11838
11839        tx.execute_with_params(sql, &param_values)?;
11840
11841        let last_id = franken_last_rowid(tx)?;
11842        let first_id = last_id
11843            .checked_sub((chunk.len() - 1) as i64)
11844            .with_context(|| {
11845                format!(
11846                    "inferring rowid range for {}-row message batch ending at {last_id}",
11847                    chunk.len()
11848                )
11849            })?;
11850        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
11851    }
11852
11853    Ok(inserted_ids)
11854}
11855
/// Test-only single-row message insert that records per-substage timings.
///
/// Counts one single-row call and one row in `profile`, then times payload
/// construction, statement execution, and rowid retrieval separately. Returns
/// the rowid of the inserted message.
#[cfg(test)]
fn franken_insert_new_message_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    msg: &Message,
    profile: &mut MessageInsertSubstageProfile,
) -> Result<i64> {
    profile.single_row_calls += 1;
    profile.batch_rows += 1;

    let payload_timer = Instant::now();
    let (extra_text, extra_packed) = franken_message_insert_payload(msg)?;
    profile.payload_duration += payload_timer.elapsed();
    let extra_packed_bytes = extra_packed.as_deref();

    let execute_timer = Instant::now();
    tx.execute_compat(
        "INSERT INTO messages(conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
         VALUES(?1,?2,?3,?4,?5,?6,?7,?8)",
        fparams![
            conversation_id,
            msg.idx,
            role_as_str(&msg.role),
            msg.author.as_deref(),
            msg.created_at,
            msg.content.as_str(),
            extra_text.as_deref(),
            extra_packed_bytes
        ],
    )?;
    profile.execute_duration += execute_timer.elapsed();

    let rowid_timer = Instant::now();
    let inserted_rowid = franken_last_rowid(tx)?;
    profile.rowid_duration += rowid_timer.elapsed();
    Ok(inserted_rowid)
}
11893
/// Test-only profiled batch insert using [`MESSAGE_INSERT_BATCH_SIZE`]-row
/// chunks; timings and counters accumulate into `profile`.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
11909
/// Test-only profiled append insert using
/// [`APPEND_MESSAGE_INSERT_BATCH_SIZE`]-row chunks; timings and counters
/// accumulate into `profile`.
#[cfg(test)]
fn franken_append_insert_new_messages_with_profile(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
) -> Result<Vec<i64>> {
    let rows_per_statement = APPEND_MESSAGE_INSERT_BATCH_SIZE;
    franken_batch_insert_new_messages_with_profile_batch_size(
        tx,
        conversation_id,
        messages,
        profile,
        rows_per_statement,
    )
}
11925
/// Test-only profiled counterpart of the batched message insert.
///
/// Inserts `messages` in chunks of at most `batch_size` rows and accumulates
/// call counts plus SQL-lookup, payload, parameter-build, execute, and rowid
/// timings into `profile`. Returns the new rowids in input order.
///
/// `batch_size` is clamped to `1..=max(MESSAGE_INSERT_BATCH_SIZE,
/// APPEND_MESSAGE_INSERT_BATCH_SIZE)`: the cached statement table in
/// [`message_insert_batch_sql`] only covers that range and panics beyond it.
/// A chunk of exactly one message takes the profiled single-row path.
///
/// # Errors
///
/// Propagates payload-serialization and database errors, and fails if the
/// inferred first rowid of a batch would underflow `i64`.
#[cfg(test)]
fn franken_batch_insert_new_messages_with_profile_batch_size(
    tx: &FrankenTransaction<'_>,
    conversation_id: i64,
    messages: &[&Message],
    profile: &mut MessageInsertSubstageProfile,
    batch_size: usize,
) -> Result<Vec<i64>> {
    const VALUES_PER_ROW: usize = 8;

    let max_cached_rows = MESSAGE_INSERT_BATCH_SIZE.max(APPEND_MESSAGE_INSERT_BATCH_SIZE);
    let batch_size = batch_size.clamp(1, max_cached_rows);

    let mut inserted_ids = Vec::with_capacity(messages.len());
    for chunk in messages.chunks(batch_size) {
        if let [only] = chunk {
            inserted_ids.push(franken_insert_new_message_with_profile(
                tx,
                conversation_id,
                only,
                profile,
            )?);
            continue;
        }

        profile.batch_calls += 1;
        profile.batch_rows += chunk.len();

        let sql_build_start = Instant::now();
        let sql = message_insert_batch_sql(chunk.len());
        profile.sql_build_duration += sql_build_start.elapsed();

        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * VALUES_PER_ROW);
        for msg in chunk {
            let payload_start = Instant::now();
            let (extra_json_str, extra_bin) = franken_message_insert_payload(msg)?;
            profile.payload_duration += payload_start.elapsed();

            // Order must match the column list in the cached statement.
            let param_build_start = Instant::now();
            param_values.push(SqliteValue::from(conversation_id));
            param_values.push(SqliteValue::from(msg.idx));
            param_values.push(SqliteValue::from(role_as_str(&msg.role)));
            param_values.push(SqliteValue::from(msg.author.as_deref()));
            param_values.push(SqliteValue::from(msg.created_at));
            param_values.push(SqliteValue::from(msg.content.as_str()));
            param_values.push(SqliteValue::from(extra_json_str.as_deref()));
            param_values.push(SqliteValue::from(extra_bin.as_deref()));
            profile.param_build_duration += param_build_start.elapsed();
        }

        let execute_start = Instant::now();
        tx.execute_with_params(sql, &param_values)?;
        profile.execute_duration += execute_start.elapsed();

        // Only the last rowid is reported, so the rest of the batch is
        // inferred by counting backwards.
        // NOTE(review): this assumes a multi-row INSERT assigns consecutive
        // rowids — confirm against frankensqlite's rowid allocation.
        let rowid_start = Instant::now();
        let last_id = franken_last_rowid(tx)?;
        // Lossless cast: chunk.len() is capped at `max_cached_rows` above.
        let first_id = last_id
            .checked_sub((chunk.len() - 1) as i64)
            .with_context(|| {
                format!(
                    "inferring rowid range for {}-row message batch ending at {last_id}",
                    chunk.len()
                )
            })?;
        inserted_ids.extend((0..chunk.len()).map(|offset| first_id + offset as i64));
        profile.rowid_duration += rowid_start.elapsed();
    }

    Ok(inserted_ids)
}
11992
11993/// Insert snippets within a frankensqlite transaction.
11994fn franken_insert_snippets(
11995    tx: &FrankenTransaction<'_>,
11996    message_id: i64,
11997    snippets: &[Snippet],
11998) -> Result<()> {
11999    for snip in snippets {
12000        let file_path_str = snip.file_path.as_ref().map(path_to_string);
12001        tx.execute_compat(
12002            "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text)
12003             VALUES(?1,?2,?3,?4,?5,?6)",
12004            fparams![
12005                message_id,
12006                file_path_str.as_deref(),
12007                snip.start_line,
12008                snip.end_line,
12009                snip.language.as_deref(),
12010                snip.snippet_text.as_deref()
12011            ],
12012        )?;
12013    }
12014    Ok(())
12015}
12016
12017fn franken_existing_message_fingerprints(
12018    tx: &FrankenTransaction<'_>,
12019    conversation_id: i64,
12020) -> Result<HashSet<MessageMergeFingerprint>> {
12021    let rows = tx.query_params(
12022        "SELECT idx, role, author, created_at, content
12023         FROM messages
12024         WHERE conversation_id = ?1",
12025        fparams![conversation_id],
12026    )?;
12027    let mut fingerprints = HashSet::with_capacity(rows.len());
12028    for row in rows {
12029        let role: String = row.get_typed(1)?;
12030        let content: String = row.get_typed(4)?;
12031        fingerprints.insert(MessageMergeFingerprint {
12032            idx: row.get_typed(0)?,
12033            created_at: row.get_typed(3)?,
12034            role: role_from_str(&role),
12035            author: row.get_typed(2)?,
12036            content_hash: *blake3::hash(content.as_bytes()).as_bytes(),
12037        });
12038    }
12039    Ok(fingerprints)
12040}
12041
/// Fingerprints of messages already stored for one conversation, as gathered
/// by `franken_existing_message_lookup`.
struct ExistingMessageLookup {
    /// Stored message fingerprints keyed by message `idx`.
    by_idx: HashMap<i64, MessageMergeFingerprint>,
    /// Fingerprints of stored messages that omit `idx` (created_at, role,
    /// author, content hash).
    replay: HashSet<MessageReplayFingerprint>,
}
12046
12047fn franken_existing_message_lookup(
12048    tx: &FrankenTransaction<'_>,
12049    conversation_id: i64,
12050    incoming_messages: &[Message],
12051) -> Result<ExistingMessageLookup> {
12052    if incoming_messages.is_empty() {
12053        return Ok(ExistingMessageLookup {
12054            by_idx: HashMap::new(),
12055            replay: HashSet::new(),
12056        });
12057    }
12058
12059    let min_idx = incoming_messages
12060        .iter()
12061        .map(|msg| msg.idx)
12062        .min()
12063        .unwrap_or(0);
12064    let max_idx = incoming_messages
12065        .iter()
12066        .map(|msg| msg.idx)
12067        .max()
12068        .unwrap_or(min_idx);
12069    let requires_full_scan = incoming_messages.iter().any(|msg| msg.created_at.is_none());
12070    let created_bounds = incoming_messages
12071        .iter()
12072        .filter_map(|msg| msg.created_at)
12073        .fold(None, |bounds: Option<(i64, i64)>, created_at| {
12074            Some(match bounds {
12075                Some((min_created_at, max_created_at)) => (
12076                    min_created_at.min(created_at),
12077                    max_created_at.max(created_at),
12078                ),
12079                None => (created_at, created_at),
12080            })
12081        });
12082
12083    let mut indexed_by_idx = HashMap::with_capacity(incoming_messages.len());
12084    let mut indexed_replay = HashSet::with_capacity(incoming_messages.len());
12085    let mut exact_idx_match = true;
12086    for msg in incoming_messages {
12087        record_message_lookup_exact_idx_probe();
12088        let Some((role, author, created_at, content)) = tx
12089            .query_row_map(
12090                "SELECT role, author, created_at, content
12091                 FROM messages INDEXED BY sqlite_autoindex_messages_1
12092                 WHERE conversation_id = ?1 AND idx = ?2
12093                 LIMIT 1",
12094                fparams![conversation_id, msg.idx],
12095                |row| {
12096                    Ok((
12097                        row.get_typed::<String>(0)?,
12098                        row.get_typed::<Option<String>>(1)?,
12099                        row.get_typed::<Option<i64>>(2)?,
12100                        row.get_typed::<String>(3)?,
12101                    ))
12102                },
12103            )
12104            .optional()?
12105        else {
12106            exact_idx_match = false;
12107            break;
12108        };
12109        let role = role_from_str(&role);
12110        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12111        let fingerprint = MessageMergeFingerprint {
12112            idx: msg.idx,
12113            created_at,
12114            role: role.clone(),
12115            author: author.clone(),
12116            content_hash,
12117        };
12118        if fingerprint != message_merge_fingerprint(msg) {
12119            exact_idx_match = false;
12120            break;
12121        }
12122        indexed_by_idx.insert(msg.idx, fingerprint);
12123        indexed_replay.insert(MessageReplayFingerprint {
12124            created_at,
12125            role,
12126            author,
12127            content_hash,
12128        });
12129    }
12130
12131    if exact_idx_match {
12132        return Ok(ExistingMessageLookup {
12133            by_idx: indexed_by_idx,
12134            replay: indexed_replay,
12135        });
12136    }
12137
12138    let (rows, replay_full_scan) = if requires_full_scan {
12139        let rows = tx.query_params(
12140            "SELECT idx, role, author, created_at, content
12141             FROM messages INDEXED BY sqlite_autoindex_messages_1
12142             WHERE conversation_id = ?1",
12143            fparams![conversation_id],
12144        )?;
12145        record_message_lookup_full_scan_query(rows.len());
12146        (rows, true)
12147    } else if let Some((min_created_at, max_created_at)) = created_bounds {
12148        let mut rows = tx.query_params(
12149            "SELECT idx, role, author, created_at, content
12150             FROM messages INDEXED BY sqlite_autoindex_messages_1
12151             WHERE conversation_id = ?1
12152               AND idx >= ?2
12153               AND idx <= ?3",
12154            fparams![conversation_id, min_idx, max_idx],
12155        )?;
12156        rows.extend(tx.query_params(
12157            "SELECT idx, role, author, created_at, content
12158             FROM messages INDEXED BY sqlite_autoindex_messages_1
12159             WHERE conversation_id = ?1
12160               AND created_at IS NOT NULL
12161               AND created_at >= ?2
12162               AND created_at <= ?3",
12163            fparams![conversation_id, min_created_at, max_created_at],
12164        )?);
12165        record_message_lookup_bounded_queries(2, rows.len());
12166        (rows, false)
12167    } else {
12168        let rows = tx.query_params(
12169            "SELECT idx, role, author, created_at, content
12170             FROM messages INDEXED BY sqlite_autoindex_messages_1
12171             WHERE conversation_id = ?1",
12172            fparams![conversation_id],
12173        )?;
12174        record_message_lookup_full_scan_query(rows.len());
12175        (rows, true)
12176    };
12177
12178    let mut by_idx = HashMap::with_capacity(rows.len());
12179    let mut replay = HashSet::with_capacity(rows.len());
12180    for row in rows {
12181        let idx: i64 = row.get_typed(0)?;
12182        let role: String = row.get_typed(1)?;
12183        let author: Option<String> = row.get_typed(2)?;
12184        let created_at: Option<i64> = row.get_typed(3)?;
12185        let content: String = row.get_typed(4)?;
12186        let role = role_from_str(&role);
12187        let content_hash = *blake3::hash(content.as_bytes()).as_bytes();
12188
12189        if idx >= min_idx && idx <= max_idx {
12190            by_idx.insert(
12191                idx,
12192                MessageMergeFingerprint {
12193                    idx,
12194                    created_at,
12195                    role: role.clone(),
12196                    author: author.clone(),
12197                    content_hash,
12198                },
12199            );
12200        }
12201
12202        let replay_matches = if replay_full_scan {
12203            true
12204        } else if let Some((min_created_at, max_created_at)) = created_bounds {
12205            created_at.is_some_and(|ts| ts >= min_created_at && ts <= max_created_at)
12206        } else {
12207            true
12208        };
12209        if replay_matches {
12210            replay.insert(MessageReplayFingerprint {
12211                created_at,
12212                role,
12213                author,
12214                content_hash,
12215            });
12216        }
12217    }
12218
12219    Ok(ExistingMessageLookup { by_idx, replay })
12220}
12221
12222fn franken_existing_message_lookup_with_pending(
12223    tx: &FrankenTransaction<'_>,
12224    conversation_id: i64,
12225    incoming_messages: &[Message],
12226    pending_message_fingerprints: &mut HashMap<i64, HashMap<i64, MessageMergeFingerprint>>,
12227    pending_message_replay_fingerprints: &mut HashMap<i64, HashSet<MessageReplayFingerprint>>,
12228) -> Result<ExistingMessageLookup> {
12229    if let (Some(by_idx), Some(replay)) = (
12230        pending_message_fingerprints.get(&conversation_id),
12231        pending_message_replay_fingerprints.get(&conversation_id),
12232    ) {
12233        if incoming_messages.iter().all(|msg| {
12234            by_idx.contains_key(&msg.idx) || replay.contains(&message_replay_fingerprint(msg))
12235        }) {
12236            return Ok(ExistingMessageLookup {
12237                by_idx: by_idx.clone(),
12238                replay: replay.clone(),
12239            });
12240        }
12241
12242        let fresh = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12243        let mut merged_by_idx = by_idx.clone();
12244        let mut merged_replay = replay.clone();
12245        merged_by_idx.extend(fresh.by_idx);
12246        merged_replay.extend(fresh.replay);
12247        pending_message_fingerprints.insert(conversation_id, merged_by_idx.clone());
12248        pending_message_replay_fingerprints.insert(conversation_id, merged_replay.clone());
12249        return Ok(ExistingMessageLookup {
12250            by_idx: merged_by_idx,
12251            replay: merged_replay,
12252        });
12253    }
12254
12255    let lookup = franken_existing_message_lookup(tx, conversation_id, incoming_messages)?;
12256    pending_message_fingerprints.insert(conversation_id, lookup.by_idx.clone());
12257    pending_message_replay_fingerprints.insert(conversation_id, lookup.replay.clone());
12258    Ok(lookup)
12259}
12260
12261/// Batch insert FTS5 entries within a frankensqlite transaction.
12262fn franken_batch_insert_fts(tx: &FrankenTransaction<'_>, entries: &[FtsEntry]) -> Result<usize> {
12263    if entries.is_empty() {
12264        return Ok(0);
12265    }
12266
12267    let mut inserted = 0;
12268
12269    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12270        let placeholders: String = chunk
12271            .iter()
12272            .enumerate()
12273            .map(|(i, _)| {
12274                let base = i * 7 + 1; // +1 for 1-indexed params
12275                format!(
12276                    "(?{},?{},?{},?{},?{},?{},?{})",
12277                    base,
12278                    base + 1,
12279                    base + 2,
12280                    base + 3,
12281                    base + 4,
12282                    base + 5,
12283                    base + 6
12284                )
12285            })
12286            .collect::<Vec<_>>()
12287            .join(",");
12288
12289        let sql = format!(
12290            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12291        );
12292
12293        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12294        for entry in chunk {
12295            param_values.push(SqliteValue::from(entry.message_id));
12296            param_values.push(SqliteValue::from(entry.content.as_str()));
12297            param_values.push(SqliteValue::from(entry.title.as_str()));
12298            param_values.push(SqliteValue::from(entry.agent.as_str()));
12299            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12300            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12301            param_values.push(SqliteValue::from(entry.created_at));
12302        }
12303
12304        match tx.execute_with_params(&sql, &param_values) {
12305            Ok(_) => {
12306                inserted += chunk.len();
12307            }
12308            Err(err) => {
12309                tracing::warn!(
12310                    error = %err,
12311                    chunk_docs = chunk.len(),
12312                    "frankensqlite FTS batch insert failed; skipping db-resident FTS maintenance because Tantivy is authoritative"
12313                );
12314                return Ok(inserted);
12315            }
12316        }
12317    }
12318
12319    Ok(inserted)
12320}
12321
12322fn franken_batch_insert_fts_on_connection(
12323    conn: &FrankenConnection,
12324    entries: &[FtsEntry],
12325) -> Result<usize> {
12326    if entries.is_empty() {
12327        return Ok(0);
12328    }
12329
12330    let mut inserted = 0;
12331
12332    for chunk in entries.chunks(FTS5_BATCH_SIZE) {
12333        let placeholders: String = chunk
12334            .iter()
12335            .enumerate()
12336            .map(|(i, _)| {
12337                let base = i * 7 + 1;
12338                format!(
12339                    "(?{},?{},?{},?{},?{},?{},?{})",
12340                    base,
12341                    base + 1,
12342                    base + 2,
12343                    base + 3,
12344                    base + 4,
12345                    base + 5,
12346                    base + 6
12347                )
12348            })
12349            .collect::<Vec<_>>()
12350            .join(",");
12351
12352        let sql = format!(
12353            "INSERT INTO fts_messages(rowid, content, title, agent, workspace, source_path, created_at) VALUES {placeholders}"
12354        );
12355
12356        let mut param_values: Vec<SqliteValue> = Vec::with_capacity(chunk.len() * 7);
12357        for entry in chunk {
12358            param_values.push(SqliteValue::from(entry.message_id));
12359            param_values.push(SqliteValue::from(entry.content.as_str()));
12360            param_values.push(SqliteValue::from(entry.title.as_str()));
12361            param_values.push(SqliteValue::from(entry.agent.as_str()));
12362            param_values.push(SqliteValue::from(entry.workspace.as_str()));
12363            param_values.push(SqliteValue::from(entry.source_path.as_str()));
12364            param_values.push(SqliteValue::from(entry.created_at));
12365        }
12366
12367        conn.execute_with_params(&sql, &param_values)
12368            .with_context(|| {
12369                format!(
12370                    "inserting {} rows into fts_messages during streaming FTS maintenance",
12371                    chunk.len()
12372                )
12373            })?;
12374        inserted += chunk.len();
12375    }
12376
12377    Ok(inserted)
12378}
12379
12380/// Update daily stats within a frankensqlite transaction.
12381fn franken_update_daily_stats_in_tx(
12382    storage: &FrankenStorage,
12383    tx: &FrankenTransaction<'_>,
12384    agent_slug: &str,
12385    source_id: &str,
12386    started_at: Option<i64>,
12387    delta: StatsDelta,
12388) -> Result<()> {
12389    let day_id = started_at
12390        .map(FrankenStorage::day_id_from_millis)
12391        .unwrap_or(0);
12392    let now = FrankenStorage::now_millis();
12393
12394    let targets = [
12395        DailyStatsTarget {
12396            day_id,
12397            agent_slug,
12398            source_id,
12399        },
12400        DailyStatsTarget {
12401            day_id,
12402            agent_slug: "all",
12403            source_id,
12404        },
12405        DailyStatsTarget {
12406            day_id,
12407            agent_slug,
12408            source_id: "all",
12409        },
12410        DailyStatsTarget {
12411            day_id,
12412            agent_slug: "all",
12413            source_id: "all",
12414        },
12415    ];
12416
12417    if agent_slug != "all"
12418        && source_id != "all"
12419        && franken_update_ensured_daily_stats_targets_in_tx(storage, tx, &targets, now, delta)?
12420    {
12421        return Ok(());
12422    }
12423
12424    for target in targets {
12425        franken_apply_daily_stats_delta_in_tx(storage, tx, target, now, delta)?;
12426    }
12427
12428    Ok(())
12429}
12430
/// Identifies one `daily_stats` row by its (day, agent, source) key.
///
/// The string fields are borrowed, so the value is a cheap `Copy` handle.
#[derive(Clone, Copy)]
struct DailyStatsTarget<'a> {
    /// Day bucket bound to the `day_id` column.
    day_id: i64,
    /// Agent slug bound to the `agent_slug` column; `"all"` is used for rollup rows.
    agent_slug: &'a str,
    /// Source id bound to the `source_id` column; `"all"` is used for rollup rows.
    source_id: &'a str,
}
12437
12438fn franken_update_ensured_daily_stats_targets_in_tx(
12439    storage: &FrankenStorage,
12440    tx: &FrankenTransaction<'_>,
12441    targets: &[DailyStatsTarget<'_>; 4],
12442    now: i64,
12443    delta: StatsDelta,
12444) -> Result<bool> {
12445    let cache_keys = targets.map(|target| {
12446        EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id)
12447    });
12448    if !storage.daily_stats_keys_already_ensured(&cache_keys) {
12449        return Ok(false);
12450    }
12451
12452    let primary = targets[0];
12453    let rows_changed = tx.execute_compat(
12454        "UPDATE daily_stats
12455         SET session_count = session_count + ?4,
12456             message_count = message_count + ?5,
12457             total_chars = total_chars + ?6,
12458             last_updated = ?7
12459         WHERE day_id = ?1
12460           AND ((agent_slug = ?2 AND source_id = ?3)
12461                OR (agent_slug = 'all' AND source_id = ?3)
12462                OR (agent_slug = ?2 AND source_id = 'all')
12463                OR (agent_slug = 'all' AND source_id = 'all'))",
12464        fparams![
12465            primary.day_id,
12466            primary.agent_slug,
12467            primary.source_id,
12468            delta.session_count_delta,
12469            delta.message_count_delta,
12470            delta.total_chars_delta,
12471            now
12472        ],
12473    )?;
12474    if rows_changed == targets.len() {
12475        return Ok(true);
12476    }
12477
12478    for (target, cache_key) in targets.iter().copied().zip(cache_keys) {
12479        let exists = tx
12480            .query_row_map(
12481                "SELECT 1 FROM daily_stats
12482                 WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3
12483                 LIMIT 1",
12484                fparams![target.day_id, target.agent_slug, target.source_id],
12485                |row| row.get_typed::<i64>(0),
12486            )
12487            .optional()?
12488            .is_some();
12489        if exists {
12490            continue;
12491        }
12492
12493        tx.execute_compat(
12494            "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12495             VALUES(?1,?2,?3,?4,?5,?6,?7)",
12496            fparams![
12497                target.day_id,
12498                target.agent_slug,
12499                target.source_id,
12500                delta.session_count_delta,
12501                delta.message_count_delta,
12502                delta.total_chars_delta,
12503                now
12504            ],
12505        )?;
12506        storage.mark_daily_stats_key_ensured(cache_key);
12507    }
12508
12509    Ok(true)
12510}
12511
12512fn franken_apply_daily_stats_delta_in_tx(
12513    storage: &FrankenStorage,
12514    tx: &FrankenTransaction<'_>,
12515    target: DailyStatsTarget<'_>,
12516    now: i64,
12517    delta: StatsDelta,
12518) -> Result<()> {
12519    let cache_key = EnsuredDailyStatsKey::new(target.day_id, target.agent_slug, target.source_id);
12520    if storage.daily_stats_key_already_ensured(&cache_key) {
12521        let rows_changed = tx.execute_compat(
12522            "UPDATE daily_stats
12523             SET session_count = session_count + ?4,
12524                 message_count = message_count + ?5,
12525                 total_chars = total_chars + ?6,
12526                 last_updated = ?7
12527             WHERE day_id = ?1 AND agent_slug = ?2 AND source_id = ?3",
12528            fparams![
12529                target.day_id,
12530                target.agent_slug,
12531                target.source_id,
12532                delta.session_count_delta,
12533                delta.message_count_delta,
12534                delta.total_chars_delta,
12535                now
12536            ],
12537        )?;
12538        if rows_changed > 0 {
12539            return Ok(());
12540        }
12541    }
12542
12543    tx.execute_compat(
12544        "INSERT INTO daily_stats(day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12545         VALUES(?1,?2,?3,?4,?5,?6,?7)
12546         ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12547            session_count = session_count + excluded.session_count,
12548            message_count = message_count + excluded.message_count,
12549            total_chars = total_chars + excluded.total_chars,
12550            last_updated = excluded.last_updated",
12551        fparams![
12552            target.day_id,
12553            target.agent_slug,
12554            target.source_id,
12555            delta.session_count_delta,
12556            delta.message_count_delta,
12557            delta.total_chars_delta,
12558            now
12559        ],
12560    )?;
12561    storage.mark_daily_stats_key_ensured(cache_key);
12562    Ok(())
12563}
12564
12565// -------------------------------------------------------------------------
12566// Frankensqlite batch helpers
12567// -------------------------------------------------------------------------
12568
12569/// Batch upsert daily_stats within a frankensqlite transaction.
12570fn franken_update_daily_stats_batched_in_tx(
12571    tx: &FrankenTransaction<'_>,
12572    entries: &[(i64, String, String, StatsDelta)],
12573) -> Result<usize> {
12574    if entries.is_empty() {
12575        return Ok(0);
12576    }
12577
12578    let now = FrankenStorage::now_millis();
12579    let mut total_affected = 0;
12580
12581    // Keep frankensqlite UPSERTs row-wise inside the transaction. The
12582    // multi-row VALUES ... ON CONFLICT form still falls back through
12583    // INSERT...SELECT in fsqlite-core, which rejects UPSERT/RETURNING during
12584    // real cass indexing.
12585    for (day_id, agent, source, delta) in entries {
12586        total_affected += tx.execute_compat(
12587            "INSERT INTO daily_stats (day_id, agent_slug, source_id, session_count, message_count, total_chars, last_updated)
12588             VALUES(?1,?2,?3,?4,?5,?6,?7)
12589             ON CONFLICT(day_id, agent_slug, source_id) DO UPDATE SET
12590                 session_count = session_count + excluded.session_count,
12591                 message_count = message_count + excluded.message_count,
12592                 total_chars = total_chars + excluded.total_chars,
12593                 last_updated = excluded.last_updated",
12594            fparams![
12595                *day_id,
12596                agent.as_str(),
12597                source.as_str(),
12598                delta.session_count_delta,
12599                delta.message_count_delta,
12600                delta.total_chars_delta,
12601                now
12602            ],
12603        )?;
12604    }
12605
12606    Ok(total_affected)
12607}
12608
12609/// Batch insert token_usage rows within a frankensqlite transaction.
12610///
12611/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12612/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12613/// UPSERT/OR IGNORE conflict clauses.
12614fn franken_insert_token_usage_batched_in_tx(
12615    tx: &FrankenTransaction<'_>,
12616    entries: &[TokenUsageEntry],
12617) -> Result<usize> {
12618    if entries.is_empty() {
12619        return Ok(0);
12620    }
12621
12622    let mut total_inserted = 0;
12623
12624    for e in entries {
12625        let params_vec: Vec<ParamValue> = vec![
12626            ParamValue::from(e.message_id),
12627            ParamValue::from(e.conversation_id),
12628            ParamValue::from(e.agent_id),
12629            ParamValue::from(e.workspace_id),
12630            ParamValue::from(e.source_id.clone()),
12631            ParamValue::from(e.timestamp_ms),
12632            ParamValue::from(e.day_id),
12633            ParamValue::from(e.model_name.clone()),
12634            ParamValue::from(e.model_family.clone()),
12635            ParamValue::from(e.model_tier.clone()),
12636            ParamValue::from(e.service_tier.clone()),
12637            ParamValue::from(e.provider.clone()),
12638            ParamValue::from(e.input_tokens),
12639            ParamValue::from(e.output_tokens),
12640            ParamValue::from(e.cache_read_tokens),
12641            ParamValue::from(e.cache_creation_tokens),
12642            ParamValue::from(e.thinking_tokens),
12643            ParamValue::from(e.total_tokens),
12644            ParamValue::from(e.estimated_cost_usd),
12645            ParamValue::from(e.role.clone()),
12646            ParamValue::from(e.content_chars),
12647            ParamValue::from(e.has_tool_calls as i64),
12648            ParamValue::from(e.tool_call_count as i64),
12649            ParamValue::from(e.data_source.clone()),
12650        ];
12651
12652        let values = param_slice_to_values(&params_vec);
12653        total_inserted += tx.execute_with_params(
12654            "INSERT OR IGNORE INTO token_usage (
12655                message_id, conversation_id, agent_id, workspace_id, source_id,
12656                timestamp_ms, day_id,
12657                model_name, model_family, model_tier, service_tier, provider,
12658                input_tokens, output_tokens, cache_read_tokens, cache_creation_tokens,
12659                thinking_tokens, total_tokens, estimated_cost_usd,
12660                role, content_chars, has_tool_calls, tool_call_count, data_source
12661            )
12662            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12663            &values,
12664        )?;
12665    }
12666
12667    Ok(total_inserted)
12668}
12669
12670/// Batch upsert token_daily_stats within a frankensqlite transaction.
12671fn franken_update_token_daily_stats_batched_in_tx(
12672    tx: &FrankenTransaction<'_>,
12673    entries: &[(i64, String, String, String, TokenStatsDelta)],
12674) -> Result<usize> {
12675    if entries.is_empty() {
12676        return Ok(0);
12677    }
12678
12679    let now = FrankenStorage::now_millis();
12680    let mut total_affected = 0;
12681
12682    for (day_id, agent, source, model, delta) in entries {
12683        total_affected += tx.execute_compat(
12684            "INSERT INTO token_daily_stats (
12685                day_id, agent_slug, source_id, model_family,
12686                api_call_count, user_message_count, assistant_message_count, tool_message_count,
12687                total_input_tokens, total_output_tokens, total_cache_read_tokens,
12688                total_cache_creation_tokens, total_thinking_tokens, grand_total_tokens,
12689                total_content_chars, total_tool_calls, estimated_cost_usd, session_count,
12690                last_updated
12691            )
12692            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19)
12693            ON CONFLICT(day_id, agent_slug, source_id, model_family) DO UPDATE SET
12694                api_call_count = api_call_count + excluded.api_call_count,
12695                user_message_count = user_message_count + excluded.user_message_count,
12696                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12697                tool_message_count = tool_message_count + excluded.tool_message_count,
12698                total_input_tokens = total_input_tokens + excluded.total_input_tokens,
12699                total_output_tokens = total_output_tokens + excluded.total_output_tokens,
12700                total_cache_read_tokens = total_cache_read_tokens + excluded.total_cache_read_tokens,
12701                total_cache_creation_tokens = total_cache_creation_tokens + excluded.total_cache_creation_tokens,
12702                total_thinking_tokens = total_thinking_tokens + excluded.total_thinking_tokens,
12703                grand_total_tokens = grand_total_tokens + excluded.grand_total_tokens,
12704                total_content_chars = total_content_chars + excluded.total_content_chars,
12705                total_tool_calls = total_tool_calls + excluded.total_tool_calls,
12706                estimated_cost_usd = estimated_cost_usd + excluded.estimated_cost_usd,
12707                session_count = session_count + excluded.session_count,
12708                last_updated = excluded.last_updated",
12709            fparams![
12710                *day_id,
12711                agent.as_str(),
12712                source.as_str(),
12713                model.as_str(),
12714                delta.api_call_count,
12715                delta.user_message_count,
12716                delta.assistant_message_count,
12717                delta.tool_message_count,
12718                delta.total_input_tokens,
12719                delta.total_output_tokens,
12720                delta.total_cache_read_tokens,
12721                delta.total_cache_creation_tokens,
12722                delta.total_thinking_tokens,
12723                delta.grand_total_tokens,
12724                delta.total_content_chars,
12725                delta.total_tool_calls,
12726                delta.estimated_cost_usd,
12727                delta.session_count,
12728                now
12729            ],
12730        )?;
12731    }
12732
12733    Ok(total_affected)
12734}
12735
12736/// Batch insert message_metrics rows within a frankensqlite transaction.
12737///
12738/// Uses row-wise INSERT OR IGNORE to avoid the frankensqlite limitation where
12739/// multi-row VALUES lists fall through to INSERT...SELECT, which rejects
12740/// UPSERT/OR IGNORE conflict clauses.
12741fn franken_insert_message_metrics_batched_in_tx(
12742    tx: &FrankenTransaction<'_>,
12743    entries: &[MessageMetricsEntry],
12744) -> Result<usize> {
12745    if entries.is_empty() {
12746        return Ok(0);
12747    }
12748
12749    let mut total_inserted = 0;
12750
12751    for e in entries {
12752        let params_vec: Vec<ParamValue> = vec![
12753            ParamValue::from(e.message_id),
12754            ParamValue::from(e.created_at_ms),
12755            ParamValue::from(e.hour_id),
12756            ParamValue::from(e.day_id),
12757            ParamValue::from(e.agent_slug.clone()),
12758            ParamValue::from(e.workspace_id),
12759            ParamValue::from(e.source_id.clone()),
12760            ParamValue::from(e.role.clone()),
12761            ParamValue::from(e.content_chars),
12762            ParamValue::from(e.content_tokens_est),
12763            ParamValue::from(e.model_name.clone()),
12764            ParamValue::from(e.model_family.clone()),
12765            ParamValue::from(e.model_tier.clone()),
12766            ParamValue::from(e.provider.clone()),
12767            ParamValue::from(e.api_input_tokens),
12768            ParamValue::from(e.api_output_tokens),
12769            ParamValue::from(e.api_cache_read_tokens),
12770            ParamValue::from(e.api_cache_creation_tokens),
12771            ParamValue::from(e.api_thinking_tokens),
12772            ParamValue::from(e.api_service_tier.clone()),
12773            ParamValue::from(e.api_data_source.clone()),
12774            ParamValue::from(e.tool_call_count),
12775            ParamValue::from(e.has_tool_calls as i64),
12776            ParamValue::from(e.has_plan as i64),
12777        ];
12778
12779        let values = param_slice_to_values(&params_vec);
12780        total_inserted += tx.execute_with_params(
12781            "INSERT OR IGNORE INTO message_metrics (
12782                message_id, created_at_ms, hour_id, day_id,
12783                agent_slug, workspace_id, source_id, role,
12784                content_chars, content_tokens_est,
12785                model_name, model_family, model_tier, provider,
12786                api_input_tokens, api_output_tokens, api_cache_read_tokens,
12787                api_cache_creation_tokens, api_thinking_tokens,
12788                api_service_tier, api_data_source,
12789                tool_call_count, has_tool_calls, has_plan
12790            )
12791            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22,?23,?24)",
12792            &values,
12793        )?;
12794    }
12795
12796    Ok(total_inserted)
12797}
12798
12799/// Flush one rollup table (shared logic for hourly + daily) within a frankensqlite transaction.
12800fn franken_flush_rollup_table(
12801    tx: &FrankenTransaction<'_>,
12802    table: &str,
12803    bucket_col: &str,
12804    deltas: &HashMap<(i64, String, i64, String), UsageRollupDelta>,
12805    now: i64,
12806) -> Result<usize> {
12807    if deltas.is_empty() {
12808        return Ok(0);
12809    }
12810
12811    let mut total_affected = 0;
12812
12813    for ((bucket_id, agent, workspace_id, source), d) in deltas {
12814        let sql = format!(
12815            "INSERT INTO {table} (
12816                {bucket_col}, agent_slug, workspace_id, source_id,
12817                message_count, user_message_count, assistant_message_count,
12818                tool_call_count, plan_message_count, plan_content_tokens_est_total,
12819                plan_api_tokens_total, api_coverage_message_count,
12820                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
12821                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
12822                api_cache_read_tokens_total, api_cache_creation_tokens_total,
12823                api_thinking_tokens_total, last_updated
12824            )
12825            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
12826            ON CONFLICT({bucket_col}, agent_slug, workspace_id, source_id) DO UPDATE SET
12827                message_count = message_count + excluded.message_count,
12828                user_message_count = user_message_count + excluded.user_message_count,
12829                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12830                tool_call_count = tool_call_count + excluded.tool_call_count,
12831                plan_message_count = plan_message_count + excluded.plan_message_count,
12832                plan_content_tokens_est_total = plan_content_tokens_est_total + excluded.plan_content_tokens_est_total,
12833                plan_api_tokens_total = plan_api_tokens_total + excluded.plan_api_tokens_total,
12834                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
12835                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
12836                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
12837                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
12838                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
12839                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
12840                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
12841                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
12842                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
12843                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
12844                last_updated = excluded.last_updated"
12845        );
12846
12847        total_affected += tx.execute_compat(
12848            &sql,
12849            fparams![
12850                *bucket_id,
12851                agent.as_str(),
12852                *workspace_id,
12853                source.as_str(),
12854                d.message_count,
12855                d.user_message_count,
12856                d.assistant_message_count,
12857                d.tool_call_count,
12858                d.plan_message_count,
12859                d.plan_content_tokens_est_total,
12860                d.plan_api_tokens_total,
12861                d.api_coverage_message_count,
12862                d.content_tokens_est_total,
12863                d.content_tokens_est_user,
12864                d.content_tokens_est_assistant,
12865                d.api_tokens_total,
12866                d.api_input_tokens_total,
12867                d.api_output_tokens_total,
12868                d.api_cache_read_tokens_total,
12869                d.api_cache_creation_tokens_total,
12870                d.api_thinking_tokens_total,
12871                now
12872            ],
12873        )?;
12874    }
12875
12876    Ok(total_affected)
12877}
12878
12879/// Flush usage_models_daily rollup within a frankensqlite transaction.
12880fn franken_flush_model_daily_rollup_table(
12881    tx: &FrankenTransaction<'_>,
12882    deltas: &HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
12883    now: i64,
12884) -> Result<usize> {
12885    if deltas.is_empty() {
12886        return Ok(0);
12887    }
12888
12889    let mut total_affected = 0;
12890
12891    for ((day_id, agent, workspace_id, source, model_family, model_tier), d) in deltas {
12892        total_affected += tx.execute_compat(
12893            "INSERT INTO usage_models_daily (
12894                day_id, agent_slug, workspace_id, source_id, model_family, model_tier,
12895                message_count, user_message_count, assistant_message_count,
12896                tool_call_count, plan_message_count, api_coverage_message_count,
12897                content_tokens_est_total, content_tokens_est_user, content_tokens_est_assistant,
12898                api_tokens_total, api_input_tokens_total, api_output_tokens_total,
12899                api_cache_read_tokens_total, api_cache_creation_tokens_total,
12900                api_thinking_tokens_total, last_updated
12901            )
12902            VALUES(?1,?2,?3,?4,?5,?6,?7,?8,?9,?10,?11,?12,?13,?14,?15,?16,?17,?18,?19,?20,?21,?22)
12903            ON CONFLICT(day_id, agent_slug, workspace_id, source_id, model_family, model_tier) DO UPDATE SET
12904                message_count = message_count + excluded.message_count,
12905                user_message_count = user_message_count + excluded.user_message_count,
12906                assistant_message_count = assistant_message_count + excluded.assistant_message_count,
12907                tool_call_count = tool_call_count + excluded.tool_call_count,
12908                plan_message_count = plan_message_count + excluded.plan_message_count,
12909                api_coverage_message_count = api_coverage_message_count + excluded.api_coverage_message_count,
12910                content_tokens_est_total = content_tokens_est_total + excluded.content_tokens_est_total,
12911                content_tokens_est_user = content_tokens_est_user + excluded.content_tokens_est_user,
12912                content_tokens_est_assistant = content_tokens_est_assistant + excluded.content_tokens_est_assistant,
12913                api_tokens_total = api_tokens_total + excluded.api_tokens_total,
12914                api_input_tokens_total = api_input_tokens_total + excluded.api_input_tokens_total,
12915                api_output_tokens_total = api_output_tokens_total + excluded.api_output_tokens_total,
12916                api_cache_read_tokens_total = api_cache_read_tokens_total + excluded.api_cache_read_tokens_total,
12917                api_cache_creation_tokens_total = api_cache_creation_tokens_total + excluded.api_cache_creation_tokens_total,
12918                api_thinking_tokens_total = api_thinking_tokens_total + excluded.api_thinking_tokens_total,
12919                last_updated = excluded.last_updated",
12920            fparams![
12921                *day_id,
12922                agent.as_str(),
12923                *workspace_id,
12924                source.as_str(),
12925                model_family.as_str(),
12926                model_tier.as_str(),
12927                d.message_count,
12928                d.user_message_count,
12929                d.assistant_message_count,
12930                d.tool_call_count,
12931                d.plan_message_count,
12932                d.api_coverage_message_count,
12933                d.content_tokens_est_total,
12934                d.content_tokens_est_user,
12935                d.content_tokens_est_assistant,
12936                d.api_tokens_total,
12937                d.api_input_tokens_total,
12938                d.api_output_tokens_total,
12939                d.api_cache_read_tokens_total,
12940                d.api_cache_creation_tokens_total,
12941                d.api_thinking_tokens_total,
12942                now
12943            ],
12944        )?;
12945    }
12946
12947    Ok(total_affected)
12948}
12949
12950/// Flush AnalyticsRollupAggregator deltas via frankensqlite transaction.
12951fn franken_flush_analytics_rollups_in_tx(
12952    tx: &FrankenTransaction<'_>,
12953    agg: &AnalyticsRollupAggregator,
12954) -> Result<(usize, usize, usize)> {
12955    let now = FrankenStorage::now_millis();
12956
12957    let hourly_affected =
12958        franken_flush_rollup_table(tx, "usage_hourly", "hour_id", &agg.hourly, now)?;
12959    let daily_affected = franken_flush_rollup_table(tx, "usage_daily", "day_id", &agg.daily, now)?;
12960    let models_daily_affected = franken_flush_model_daily_rollup_table(tx, &agg.models_daily, now)?;
12961
12962    Ok((hourly_affected, daily_affected, models_daily_affected))
12963}
12964
12965/// Update conversation-level token summary columns via frankensqlite transaction.
12966fn franken_update_conversation_token_summaries_in_tx(
12967    tx: &FrankenTransaction<'_>,
12968    conversation_id: i64,
12969) -> Result<()> {
12970    tx.execute_compat(
12971        "UPDATE conversations SET
12972            total_input_tokens = (SELECT SUM(input_tokens) FROM token_usage WHERE conversation_id = ?1),
12973            total_output_tokens = (SELECT SUM(output_tokens) FROM token_usage WHERE conversation_id = ?1),
12974            total_cache_read_tokens = (SELECT SUM(cache_read_tokens) FROM token_usage WHERE conversation_id = ?1),
12975            total_cache_creation_tokens = (SELECT SUM(cache_creation_tokens) FROM token_usage WHERE conversation_id = ?1),
12976            grand_total_tokens = (SELECT SUM(total_tokens) FROM token_usage WHERE conversation_id = ?1),
12977            estimated_cost_usd = (SELECT SUM(estimated_cost_usd) FROM token_usage WHERE conversation_id = ?1),
12978            primary_model = (SELECT model_name FROM token_usage WHERE conversation_id = ?1
12979                             AND model_name IS NOT NULL
12980                             GROUP BY model_name ORDER BY COUNT(*) DESC LIMIT 1),
12981            api_call_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12982                              AND data_source = 'api'),
12983            tool_call_count = (SELECT SUM(tool_call_count) FROM token_usage WHERE conversation_id = ?1),
12984            user_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12985                                  AND role = 'user'),
12986            assistant_message_count = (SELECT COUNT(*) FROM token_usage WHERE conversation_id = ?1
12987                                       AND role IN ('assistant', 'agent'))
12988         WHERE id = ?1",
12989        fparams![conversation_id],
12990    )?;
12991    Ok(())
12992}
12993
12994impl FrankenStorage {
12995    /// Rebuild token_daily_stats from the token_usage ledger.
12996    pub fn rebuild_token_daily_stats(&self) -> Result<usize> {
12997        const CONVERSATION_BATCH_SIZE: usize = 1_000;
12998        const TOKEN_USAGE_BATCH_SIZE: usize = 10_000;
12999
13000        let total_usage_rows: i64 =
13001            self.conn
13002                .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
13003                    row.get_typed(0)
13004                })?;
13005        tracing::info!(
13006            target: "cass::analytics",
13007            total_usage_rows,
13008            "token_daily_stats_rebuild_start"
13009        );
13010
13011        let mut tx = self.conn.transaction()?;
13012        tx.execute("DELETE FROM token_daily_stats")?;
13013
13014        let mut last_conversation_id = 0_i64;
13015        let mut rows_created = 0_usize;
13016
13017        loop {
13018            let conversation_rows = tx.query_map_collect(
13019                "SELECT c.id, c.started_at, c.source_id,
13020                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown')
13021                 FROM conversations c
13022                 WHERE c.id > ?1
13023                 ORDER BY c.id
13024                 LIMIT ?2",
13025                fparams![last_conversation_id, CONVERSATION_BATCH_SIZE as i64],
13026                |row| {
13027                    Ok((
13028                        row.get_typed::<i64>(0)?,
13029                        row.get_typed::<Option<i64>>(1)?,
13030                        row.get_typed::<String>(2)?,
13031                        row.get_typed::<String>(3)?,
13032                    ))
13033                },
13034            )?;
13035            if conversation_rows.is_empty() {
13036                break;
13037            }
13038
13039            let mut aggregate = TokenStatsAggregator::new();
13040
13041            for (conversation_id, started_at, source_id, agent_slug) in conversation_rows {
13042                last_conversation_id = conversation_id;
13043                let conversation_day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
13044                let mut last_token_usage_id = 0_i64;
13045                let mut session_model_family = String::from("unknown");
13046
13047                loop {
13048                    let usage_rows = tx.query_map_collect(
13049                        "SELECT id, day_id, role,
13050                                COALESCE(model_family, 'unknown'),
13051                                input_tokens, output_tokens, cache_read_tokens,
13052                                cache_creation_tokens, thinking_tokens,
13053                                has_tool_calls, tool_call_count,
13054                                content_chars, estimated_cost_usd
13055                         FROM token_usage
13056                         WHERE conversation_id = ?1
13057                           AND id > ?2
13058                         ORDER BY id
13059                         LIMIT ?3",
13060                        fparams![
13061                            conversation_id,
13062                            last_token_usage_id,
13063                            TOKEN_USAGE_BATCH_SIZE as i64
13064                        ],
13065                        |row| {
13066                            Ok((
13067                                row.get_typed::<i64>(0)?,
13068                                row.get_typed::<i64>(1)?,
13069                                row.get_typed::<String>(2)?,
13070                                row.get_typed::<String>(3)?,
13071                                row.get_typed::<Option<i64>>(4)?,
13072                                row.get_typed::<Option<i64>>(5)?,
13073                                row.get_typed::<Option<i64>>(6)?,
13074                                row.get_typed::<Option<i64>>(7)?,
13075                                row.get_typed::<Option<i64>>(8)?,
13076                                row.get_typed::<i64>(9)?,
13077                                row.get_typed::<i64>(10)?,
13078                                row.get_typed::<i64>(11)?,
13079                                row.get_typed::<Option<f64>>(12)?,
13080                            ))
13081                        },
13082                    )?;
13083                    if usage_rows.is_empty() {
13084                        break;
13085                    }
13086
13087                    for (
13088                        token_usage_id,
13089                        day_id,
13090                        role,
13091                        model_family,
13092                        input_tokens,
13093                        output_tokens,
13094                        cache_read_tokens,
13095                        cache_creation_tokens,
13096                        thinking_tokens,
13097                        has_tool_calls,
13098                        tool_call_count,
13099                        content_chars,
13100                        estimated_cost_usd,
13101                    ) in usage_rows
13102                    {
13103                        last_token_usage_id = token_usage_id;
13104                        if model_family != "unknown" {
13105                            session_model_family = model_family.clone();
13106                        }
13107                        let usage = crate::connectors::ExtractedTokenUsage {
13108                            model_name: None,
13109                            provider: None,
13110                            input_tokens,
13111                            output_tokens,
13112                            cache_read_tokens,
13113                            cache_creation_tokens,
13114                            thinking_tokens,
13115                            service_tier: None,
13116                            has_tool_calls: has_tool_calls != 0,
13117                            tool_call_count: u32::try_from(tool_call_count.max(0)).unwrap_or(0),
13118                            data_source: franken_agent_detection::TokenDataSource::Api,
13119                        };
13120                        aggregate.record(
13121                            &agent_slug,
13122                            &source_id,
13123                            day_id,
13124                            &model_family,
13125                            &role,
13126                            &usage,
13127                            content_chars,
13128                            estimated_cost_usd.unwrap_or(0.0),
13129                        );
13130                    }
13131                }
13132
13133                aggregate.record_session(
13134                    &agent_slug,
13135                    &source_id,
13136                    conversation_day_id,
13137                    &session_model_family,
13138                );
13139            }
13140
13141            let entries = aggregate.expand();
13142            rows_created = rows_created.saturating_add(entries.len());
13143            franken_update_token_daily_stats_batched_in_tx(&tx, &entries)?;
13144        }
13145
13146        tx.commit()?;
13147
13148        tracing::info!(
13149            target: "cass::analytics",
13150            rows_created,
13151            "token_daily_stats_rebuild_complete"
13152        );
13153
13154        Ok(rows_created)
13155    }
13156
13157    /// Rebuild analytics tables (message_metrics + rollups) from existing
13158    /// messages in the database. Does NOT re-parse raw agent session files.
13159    pub fn rebuild_analytics(&self) -> Result<AnalyticsRebuildResult> {
13160        let start = Instant::now();
13161
13162        let total_messages: i64 =
13163            self.conn
13164                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13165                    row.get_typed(0)
13166                })?;
13167        tracing::info!(
13168            target: "cass::analytics",
13169            total_messages,
13170            "analytics_rebuild_start"
13171        );
13172
13173        let mut tx = self.conn.transaction()?;
13174
13175        tx.execute("DELETE FROM message_metrics")?;
13176        tx.execute("DELETE FROM usage_hourly")?;
13177        tx.execute("DELETE FROM usage_daily")?;
13178        tx.execute("DELETE FROM usage_models_daily")?;
13179
13180        const CHUNK_SIZE: i64 = 10_000;
13181        let mut offset: i64 = 0;
13182        let mut total_inserted: usize = 0;
13183        let mut usage_hourly_rows: usize = 0;
13184        let mut usage_daily_rows: usize = 0;
13185        let mut usage_models_daily_rows: usize = 0;
13186
13187        loop {
13188            #[allow(clippy::type_complexity)]
13189            let rows: Vec<(
13190                i64,
13191                String,
13192                String,
13193                Option<serde_json::Value>,
13194                Option<i64>,
13195                Option<i64>,
13196                String,
13197                Option<i64>,
13198                String,
13199            )> = tx.query_map_collect(
13200                // Avoid the 3-table JOIN with LIMIT/OFFSET that triggers
13201                // frankensqlite's materialization fallback (see 860acb12).
13202                // Inline the agent slug lookup as a correlated subquery and
13203                // fall back to 'unknown' for NULL agent_id, matching the
13204                // FTS / lexical rebuild paths.
13205                "SELECT m.id, m.idx, m.role, m.content, m.extra_json, m.extra_bin,
13206                        m.created_at,
13207                        c.id AS conv_id, c.started_at AS conv_started_at,
13208                        c.source_id, c.workspace_id,
13209                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown') AS agent_slug
13210                 FROM messages m
13211                 JOIN conversations c ON m.conversation_id = c.id
13212                 ORDER BY m.id
13213                 LIMIT ?1 OFFSET ?2",
13214                fparams![CHUNK_SIZE, offset],
13215                |row| {
13216                    let msg_id: i64 = row.get_typed(0)?;
13217                    let role: String = row.get_typed(2)?;
13218                    let content: String = row.get_typed(3)?;
13219                    let extra_json = row
13220                        .get_typed::<Option<String>>(4)?
13221                        .and_then(|s| serde_json::from_str(&s).ok())
13222                        .or_else(|| {
13223                            row.get_typed::<Option<Vec<u8>>>(5)
13224                                .ok()
13225                                .flatten()
13226                                .and_then(|b| rmp_serde::from_slice(&b).ok())
13227                        });
13228                    let msg_ts: Option<i64> = row.get_typed(6)?;
13229                    let conv_started_at: Option<i64> = row.get_typed(8)?;
13230                    let source_id: String = row.get_typed(9)?;
13231                    let workspace_id: Option<i64> = row.get_typed(10)?;
13232                    let agent_slug: String = row.get_typed(11)?;
13233                    let effective_ts = msg_ts.or(conv_started_at).unwrap_or(0);
13234
13235                    Ok((
13236                        msg_id,
13237                        role,
13238                        content,
13239                        extra_json,
13240                        Some(effective_ts),
13241                        workspace_id,
13242                        source_id,
13243                        conv_started_at,
13244                        agent_slug,
13245                    ))
13246                },
13247            )?;
13248
13249            if rows.is_empty() {
13250                break;
13251            }
13252
13253            let chunk_len = rows.len();
13254            let mut entries = Vec::with_capacity(chunk_len);
13255            let mut rollup_agg = AnalyticsRollupAggregator::new();
13256
13257            for (
13258                msg_id,
13259                role,
13260                content,
13261                extra_json,
13262                effective_ts,
13263                workspace_id,
13264                source_id,
13265                _conv_started_at,
13266                agent_slug,
13267            ) in &rows
13268            {
13269                let ts = effective_ts.unwrap_or(0);
13270                let day_id = Self::day_id_from_millis(ts);
13271                let hour_id = Self::hour_id_from_millis(ts);
13272                let content_chars = content.len() as i64;
13273                let content_tokens_est = content_chars / 4;
13274                let extra = extra_json
13275                    .as_ref()
13276                    .cloned()
13277                    .unwrap_or(serde_json::Value::Null);
13278                let usage =
13279                    crate::connectors::extract_tokens_for_agent(agent_slug, &extra, content, role);
13280                let model_info = usage
13281                    .model_name
13282                    .as_deref()
13283                    .map(crate::connectors::normalize_model);
13284                let model_family = model_info
13285                    .as_ref()
13286                    .map(|i| i.family.clone())
13287                    .unwrap_or_else(|| "unknown".into());
13288                let model_tier = model_info
13289                    .as_ref()
13290                    .map(|i| i.tier.clone())
13291                    .unwrap_or_else(|| "unknown".into());
13292                let provider = usage
13293                    .provider
13294                    .clone()
13295                    .or_else(|| model_info.as_ref().map(|i| i.provider.clone()))
13296                    .unwrap_or_else(|| "unknown".into());
13297
13298                let entry = MessageMetricsEntry {
13299                    message_id: *msg_id,
13300                    created_at_ms: ts,
13301                    hour_id,
13302                    day_id,
13303                    agent_slug: agent_slug.clone(),
13304                    workspace_id: workspace_id.unwrap_or(0),
13305                    source_id: source_id.clone(),
13306                    role: role.clone(),
13307                    content_chars,
13308                    content_tokens_est,
13309                    model_name: usage.model_name.clone(),
13310                    model_family,
13311                    model_tier,
13312                    provider,
13313                    api_input_tokens: usage.input_tokens,
13314                    api_output_tokens: usage.output_tokens,
13315                    api_cache_read_tokens: usage.cache_read_tokens,
13316                    api_cache_creation_tokens: usage.cache_creation_tokens,
13317                    api_thinking_tokens: usage.thinking_tokens,
13318                    api_service_tier: usage.service_tier,
13319                    api_data_source: usage.data_source.as_str().to_string(),
13320                    tool_call_count: usage.tool_call_count as i64,
13321                    has_tool_calls: usage.has_tool_calls,
13322                    has_plan: has_plan_for_role(role, content),
13323                };
13324                rollup_agg.record(&entry);
13325                entries.push(entry);
13326            }
13327
13328            total_inserted += franken_insert_message_metrics_batched_in_tx(&tx, &entries)?;
13329            let (hourly, daily, models_daily) =
13330                franken_flush_analytics_rollups_in_tx(&tx, &rollup_agg)?;
13331            usage_hourly_rows += hourly;
13332            usage_daily_rows += daily;
13333            usage_models_daily_rows += models_daily;
13334            offset += chunk_len as i64;
13335
13336            tracing::debug!(
13337                target: "cass::analytics",
13338                offset,
13339                chunk = chunk_len,
13340                inserted = entries.len(),
13341                total = total_inserted,
13342                "analytics_rebuild_chunk"
13343            );
13344
13345            if (chunk_len as i64) < CHUNK_SIZE {
13346                break;
13347            }
13348        }
13349
13350        tx.commit()?;
13351
13352        let elapsed = start.elapsed();
13353        let elapsed_ms = elapsed.as_millis() as u64;
13354        let msgs_per_sec = if elapsed_ms > 0 {
13355            (total_inserted as f64) / (elapsed_ms as f64 / 1000.0)
13356        } else {
13357            0.0
13358        };
13359
13360        tracing::info!(
13361            target: "cass::analytics",
13362            message_metrics_rows = total_inserted,
13363            usage_hourly_rows,
13364            usage_daily_rows,
13365            usage_models_daily_rows,
13366            elapsed_ms,
13367            messages_per_sec = format!("{:.0}", msgs_per_sec),
13368            "analytics_rebuild_complete"
13369        );
13370
13371        Ok(AnalyticsRebuildResult {
13372            message_metrics_rows: total_inserted,
13373            usage_hourly_rows,
13374            usage_daily_rows,
13375            usage_models_daily_rows,
13376            elapsed_ms,
13377            messages_per_sec: msgs_per_sec,
13378        })
13379    }
13380
13381    /// Rebuild all daily stats from scratch.
13382    pub fn rebuild_daily_stats(&self) -> Result<DailyStatsRebuildResult> {
13383        const DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE: usize = 1_000;
13384        const DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE: usize = 10_000;
13385
13386        let mut conversation_batch_size = rebuild_batch_size_env(
13387            "CASS_DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE",
13388            DAILY_STATS_REBUILD_CONVERSATION_BATCH_SIZE,
13389        );
13390        let mut message_batch_size = rebuild_batch_size_env(
13391            "CASS_DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE",
13392            DAILY_STATS_REBUILD_MESSAGE_BATCH_SIZE,
13393        );
13394
13395        let total_messages: i64 =
13396            self.conn
13397                .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
13398                    row.get_typed(0)
13399                })?;
13400        let message_metrics_rows: i64 =
13401            self.conn
13402                .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
13403                    row.get_typed(0)
13404                })?;
13405        let use_message_metrics = total_messages > 0 && total_messages == message_metrics_rows;
13406
13407        tracing::info!(
13408            target: "cass::perf::daily_stats",
13409            total_messages,
13410            message_metrics_rows,
13411            use_message_metrics,
13412            "daily_stats rebuild selected message source"
13413        );
13414
13415        let mut tx = self.conn.transaction()?;
13416        tx.execute("DELETE FROM daily_stats")?;
13417
13418        let mut last_conversation_id = 0_i64;
13419        let mut conversation_batch_count = 0_usize;
13420        let mut conversations_processed = 0_usize;
13421        let mut messages_processed = 0_usize;
13422        let mut message_batch_count = 0_usize;
13423        let mut raw_entries_flushed = 0_usize;
13424        let mut expanded_entries_flushed = 0_usize;
13425        let message_scan_sql = if use_message_metrics {
13426            "SELECT m.idx, mm.content_chars
13427             FROM messages m
13428             JOIN message_metrics mm ON mm.message_id = m.id
13429             WHERE m.conversation_id = ?1
13430               AND m.idx > ?2
13431             ORDER BY m.conversation_id, m.idx
13432             LIMIT ?3"
13433        } else {
13434            "SELECT m.idx, COALESCE(LENGTH(CAST(m.content AS BLOB)), 0)
13435             FROM messages m
13436             WHERE m.conversation_id = ?1
13437               AND m.idx > ?2
13438             ORDER BY m.conversation_id, m.idx
13439             LIMIT ?3"
13440        };
13441
13442        loop {
13443            // Avoid the 2-table JOIN with LIMIT that triggers frankensqlite's
13444            // materialization fallback (which is what the OOM retry below is
13445            // defending against — see 860acb12).  Inline agent slug via
13446            // correlated subquery and degrade NULL agent_id to 'unknown' for
13447            // consistency with the lexical/FTS rebuild paths.
13448            let conversation_rows = match self.conn.query_with_params(
13449                "SELECT c.id, c.started_at,
13450                        COALESCE((SELECT a.slug FROM agents a WHERE a.id = c.agent_id), 'unknown'),
13451                        c.source_id
13452                 FROM conversations c
13453                 WHERE c.id > ?1
13454                 ORDER BY c.id
13455                 LIMIT ?2",
13456                &params_from_iter([
13457                    ParamValue::from(last_conversation_id),
13458                    ParamValue::from(conversation_batch_size as i64),
13459                ]),
13460            ) {
13461                Ok(rows) => rows,
13462                Err(err) if is_out_of_memory_error(&err) && conversation_batch_size > 1 => {
13463                    let previous_batch_size = conversation_batch_size;
13464                    conversation_batch_size = (conversation_batch_size / 2).max(1);
13465                    tracing::warn!(
13466                        previous_batch_size,
13467                        conversation_batch_size,
13468                        last_conversation_id,
13469                        "daily_stats conversation scan ran out of memory; retrying with smaller batch"
13470                    );
13471                    continue;
13472                }
13473                Err(err) => return Err(err.into()),
13474            };
13475            if conversation_rows.is_empty() {
13476                break;
13477            }
13478
13479            let mut aggregate = StatsAggregator::new();
13480            let mut conversation_batch_meta: Vec<(i64, i64, String, String)> =
13481                Vec::with_capacity(conversation_rows.len());
13482            for row in &conversation_rows {
13483                let conversation_id: i64 = row.get_typed(0)?;
13484                let started_at: Option<i64> = row.get_typed(1)?;
13485                let agent_slug: String = row.get_typed(2)?;
13486                let source_id: String = row.get_typed(3)?;
13487                last_conversation_id = conversation_id;
13488                let day_id = started_at.map(Self::day_id_from_millis).unwrap_or(0);
13489                aggregate.record_delta(&agent_slug, &source_id, day_id, 1, 0, 0);
13490                conversation_batch_meta.push((conversation_id, day_id, agent_slug, source_id));
13491                conversations_processed += 1;
13492            }
13493
13494            conversation_batch_count += 1;
13495            raw_entries_flushed += aggregate.raw_entry_count();
13496            let entries = aggregate.expand();
13497            expanded_entries_flushed += entries.len();
13498            if !entries.is_empty() {
13499                franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
13500            }
13501            if conversation_batch_count.is_multiple_of(25) {
13502                tracing::info!(
13503                    target: "cass::perf::daily_stats",
13504                    conversations_processed,
13505                    batches = conversation_batch_count,
13506                    batch_size = conversation_batch_size,
13507                    last_conversation_id,
13508                    "daily_stats rebuild conversation scan progress"
13509                );
13510            }
13511            if conversation_batch_meta.is_empty() {
13512                continue;
13513            }
13514
13515            for (conversation_id, day_id, agent_slug, source_id) in conversation_batch_meta {
13516                let mut cursor_message_idx = -1_i64;
13517                loop {
13518                    let message_rows = match self.conn.query_with_params(
13519                        message_scan_sql,
13520                        &params_from_iter([
13521                            ParamValue::from(conversation_id),
13522                            ParamValue::from(cursor_message_idx),
13523                            ParamValue::from(message_batch_size as i64),
13524                        ]),
13525                    ) {
13526                        Ok(rows) => rows,
13527                        Err(err) if is_out_of_memory_error(&err) && message_batch_size > 1 => {
13528                            let previous_batch_size = message_batch_size;
13529                            message_batch_size = (message_batch_size / 2).max(1);
13530                            tracing::warn!(
13531                                previous_batch_size,
13532                                message_batch_size,
13533                                conversation_id,
13534                                cursor_message_idx,
13535                                "daily_stats message scan ran out of memory; retrying with smaller batch"
13536                            );
13537                            continue;
13538                        }
13539                        Err(err) => return Err(err.into()),
13540                    };
13541                    if message_rows.is_empty() {
13542                        break;
13543                    }
13544
13545                    let mut aggregate = StatsAggregator::new();
13546                    for row in &message_rows {
13547                        let message_idx: i64 = row.get_typed(0)?;
13548                        let content_len: i64 = row.get_typed(1)?;
13549                        cursor_message_idx = message_idx;
13550                        aggregate.record_delta(&agent_slug, &source_id, day_id, 0, 1, content_len);
13551                        messages_processed += 1;
13552                    }
13553
13554                    message_batch_count += 1;
13555                    raw_entries_flushed += aggregate.raw_entry_count();
13556                    let entries = aggregate.expand();
13557                    expanded_entries_flushed += entries.len();
13558                    if !entries.is_empty() {
13559                        franken_update_daily_stats_batched_in_tx(&tx, &entries)?;
13560                    }
13561                    if message_batch_count.is_multiple_of(50) {
13562                        tracing::info!(
13563                            target: "cass::perf::daily_stats",
13564                            messages_processed,
13565                            batches = message_batch_count,
13566                            batch_size = message_batch_size,
13567                            source = if use_message_metrics {
13568                                "message_metrics"
13569                            } else {
13570                                "messages"
13571                            },
13572                            conversation_id,
13573                            cursor_message_idx,
13574                            "daily_stats rebuild message scan progress"
13575                        );
13576                    }
13577                }
13578            }
13579        }
13580
13581        let rows_created: i64 =
13582            tx.query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
13583                row.get_typed(0)
13584            })?;
13585        let total_sessions: i64 = tx.query_row_map(
13586            "SELECT COALESCE(SUM(session_count), 0) FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
13587            fparams![],
13588            |row| row.get_typed(0),
13589        )?;
13590
13591        tx.commit()?;
13592
13593        tracing::info!(
13594            target: "cass::perf::daily_stats",
13595            rows_created,
13596            total_sessions,
13597            conversations_processed,
13598            conversation_batches = conversation_batch_count,
13599            conversation_batch_size,
13600            message_batches = message_batch_count,
13601            message_batch_size,
13602            messages_processed,
13603            use_message_metrics,
13604            raw_entries_flushed,
13605            expanded_entries_flushed,
13606            "Daily stats rebuilt from conversations"
13607        );
13608
13609        Ok(DailyStatsRebuildResult {
13610            rows_created,
13611            total_sessions,
13612        })
13613    }
13614}
13615
13616// SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
13617// All methods are available through FrankenStorage.
13618
13619// -------------------------------------------------------------------------
13620// IndexingCache (Opt 7.2) - N+1 Prevention for Agent/Workspace IDs
13621// -------------------------------------------------------------------------
13622
/// Per-batch cache of agent and workspace row IDs used while indexing.
///
/// Each distinct agent slug and workspace path is resolved through storage
/// once; later lookups in the same batch are answered from memory, which
/// avoids an N+1 pattern of `ensure_agent` / `ensure_workspace` calls.
/// The mutating methods take `&mut self`, so the cache carries no internal
/// synchronization.
///
/// # Usage
/// ```ignore
/// let mut cache = IndexingCache::new();
/// for conv in conversations {
///     let agent_id = cache.get_or_insert_agent(storage, &agent)?;
///     let workspace_id = cache.get_or_insert_workspace(storage, &workspace_path, None)?;
///     // ... use agent_id and workspace_id
/// }
/// ```
///
/// # Rollback
/// Setting the environment variable `CASS_SQLITE_CACHE` to `0` (or `false`)
/// makes [`IndexingCache::is_enabled`] return `false`. Callers are expected
/// to consult it and fall back to direct DB calls (useful for debugging).
#[derive(Debug, Default)]
pub struct IndexingCache {
    /// Agent slug -> agent ID as returned by storage.
    agent_ids: HashMap<String, i64>,
    /// Workspace path -> workspace ID as returned by storage.
    workspace_ids: HashMap<PathBuf, i64>,
    /// Number of lookups answered from the maps above.
    hits: u64,
    /// Number of lookups that had to go to storage.
    misses: u64,
}
13649
13650pub trait IndexingCacheStorage {
13651    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64>;
13652    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64>;
13653}
13654
13655impl IndexingCacheStorage for FrankenStorage {
13656    fn ensure_indexing_agent(&self, agent: &Agent) -> Result<i64> {
13657        self.ensure_agent(agent)
13658    }
13659
13660    fn ensure_indexing_workspace(&self, path: &Path, display_name: Option<&str>) -> Result<i64> {
13661        self.ensure_workspace(path, display_name)
13662    }
13663}
13664
13665// IndexingCacheStorage for SqliteStorage removed: SqliteStorage is a type alias for FrankenStorage.
13666
13667impl IndexingCache {
13668    /// Create a new empty cache.
13669    pub fn new() -> Self {
13670        Self {
13671            agent_ids: HashMap::new(),
13672            workspace_ids: HashMap::new(),
13673            hits: 0,
13674            misses: 0,
13675        }
13676    }
13677
13678    /// Check if caching is enabled via environment variable.
13679    /// Returns true unless CASS_SQLITE_CACHE is set to "0" or "false".
13680    pub fn is_enabled() -> bool {
13681        dotenvy::var("CASS_SQLITE_CACHE")
13682            .map(|v| v != "0" && v.to_lowercase() != "false")
13683            .unwrap_or(true)
13684    }
13685
13686    /// Get or insert an agent ID, using cache if available.
13687    ///
13688    /// Returns the cached ID if present, otherwise calls ensure_agent
13689    /// and caches the result.
13690    pub fn get_or_insert_agent<S>(&mut self, storage: &S, agent: &Agent) -> Result<i64>
13691    where
13692        S: IndexingCacheStorage + ?Sized,
13693    {
13694        if let Some(&cached) = self.agent_ids.get(&agent.slug) {
13695            self.hits += 1;
13696            return Ok(cached);
13697        }
13698
13699        self.misses += 1;
13700        let id = storage.ensure_indexing_agent(agent)?;
13701        self.agent_ids.insert(agent.slug.clone(), id);
13702        Ok(id)
13703    }
13704
13705    /// Get or insert a workspace ID, using cache if available.
13706    ///
13707    /// Returns the cached ID if present, otherwise calls ensure_workspace
13708    /// and caches the result.
13709    pub fn get_or_insert_workspace(
13710        &mut self,
13711        storage: &(impl IndexingCacheStorage + ?Sized),
13712        path: &Path,
13713        display_name: Option<&str>,
13714    ) -> Result<i64> {
13715        if let Some(&cached) = self.workspace_ids.get(path) {
13716            self.hits += 1;
13717            return Ok(cached);
13718        }
13719
13720        self.misses += 1;
13721        let id = storage.ensure_indexing_workspace(path, display_name)?;
13722        self.workspace_ids.insert(path.to_path_buf(), id);
13723        Ok(id)
13724    }
13725
13726    /// Get cache statistics: (hits, misses, hit_rate).
13727    pub fn stats(&self) -> (u64, u64, f64) {
13728        let total = self.hits + self.misses;
13729        let hit_rate = if total > 0 {
13730            self.hits as f64 / total as f64
13731        } else {
13732            0.0
13733        };
13734        (self.hits, self.misses, hit_rate)
13735    }
13736
13737    /// Clear the cache, resetting all state.
13738    pub fn clear(&mut self) {
13739        self.agent_ids.clear();
13740        self.workspace_ids.clear();
13741        self.hits = 0;
13742        self.misses = 0;
13743    }
13744
13745    /// Number of cached agents.
13746    pub fn agent_count(&self) -> usize {
13747        self.agent_ids.len()
13748    }
13749
13750    /// Number of cached workspaces.
13751    pub fn workspace_count(&self) -> usize {
13752        self.workspace_ids.len()
13753    }
13754}
13755
13756// -------------------------------------------------------------------------
13757// StatsAggregator (kzxu) - Batched Daily Stats Updates
13758// -------------------------------------------------------------------------
13759// Aggregates daily stats in memory during batch ingestion, then flushes
13760// to the database in a single batched INSERT...ON CONFLICT operation.
13761// This prevents N×4 database writes (4 permutations per conversation).
13762
/// Signed change to apply to one daily-stats row, i.e. one
/// (day_id, agent, source) combination.
#[derive(Debug, Default, Clone, Copy)]
pub struct StatsDelta {
    /// Change in the number of sessions.
    pub session_count_delta: i64,
    /// Change in the number of messages.
    pub message_count_delta: i64,
    /// Change in the total character count.
    pub total_chars_delta: i64,
}
13770
13771/// In-memory aggregator for batched daily stats updates.
13772///
13773/// During batch ingestion, we accumulate deltas per (day_id, agent, source) key.
13774/// After processing all conversations, call `expand()` to generate the 4
13775/// permutations per raw entry, then flush via `SqliteStorage::update_daily_stats_batched`.
13776///
13777/// # Example
13778/// ```ignore
13779/// let mut agg = StatsAggregator::new();
13780/// for conv in conversations {
13781///     agg.record(&conv.agent_slug, source_id, day_id, msg_count, char_count);
13782/// }
13783/// let entries = agg.expand();
13784/// storage.update_daily_stats_batched(&entries)?;
13785/// ```
13786#[derive(Debug, Default)]
13787pub struct StatsAggregator {
13788    /// Raw deltas keyed by (day_id, agent_slug, source_id).
13789    /// Only stores specific (non-"all") combinations.
13790    deltas: HashMap<(i64, String, String), StatsDelta>,
13791}
13792
13793impl StatsAggregator {
13794    /// Create a new empty aggregator.
13795    pub fn new() -> Self {
13796        Self {
13797            deltas: HashMap::new(),
13798        }
13799    }
13800
13801    /// Record a conversation's contribution to stats (session + messages + chars).
13802    ///
13803    /// This increments session_count by 1.
13804    ///
13805    /// # Arguments
13806    /// * `agent_slug` - The specific agent slug (not "all")
13807    /// * `source_id` - The specific source ID (not "all")
13808    /// * `day_id` - Days since 2020-01-01 (from `SqliteStorage::day_id_from_millis`)
13809    /// * `message_count` - Number of messages in the conversation
13810    /// * `total_chars` - Total character count across all messages
13811    pub fn record(
13812        &mut self,
13813        agent_slug: &str,
13814        source_id: &str,
13815        day_id: i64,
13816        message_count: i64,
13817        total_chars: i64,
13818    ) {
13819        self.record_delta(agent_slug, source_id, day_id, 1, message_count, total_chars);
13820    }
13821
13822    /// Record an arbitrary delta. Use this for append-only updates where
13823    /// `session_count_delta` may be 0 but message/char deltas are non-zero.
13824    pub fn record_delta(
13825        &mut self,
13826        agent_slug: &str,
13827        source_id: &str,
13828        day_id: i64,
13829        session_count_delta: i64,
13830        message_count_delta: i64,
13831        total_chars_delta: i64,
13832    ) {
13833        if session_count_delta == 0 && message_count_delta == 0 && total_chars_delta == 0 {
13834            return;
13835        }
13836        let key = (day_id, agent_slug.to_owned(), source_id.to_owned());
13837        let delta = self.deltas.entry(key).or_default();
13838        delta.session_count_delta += session_count_delta;
13839        delta.message_count_delta += message_count_delta;
13840        delta.total_chars_delta += total_chars_delta;
13841    }
13842
13843    /// Expand raw deltas into the 4 permutation keys:
13844    /// - (agent, source) - specific both
13845    /// - ("all", source) - all agents, specific source
13846    /// - (agent, "all") - specific agent, all sources
13847    /// - ("all", "all") - totals
13848    ///
13849    /// Returns entries sorted by (day_id, agent_slug, source_id) for deterministic batching.
13850    pub fn expand(&self) -> Vec<(i64, String, String, StatsDelta)> {
13851        let mut expanded: HashMap<(i64, String, String), StatsDelta> = HashMap::new();
13852
13853        for ((day_id, agent, source), delta) in &self.deltas {
13854            let permutations = [
13855                (agent.as_str(), source.as_str()),
13856                ("all", source.as_str()),
13857                (agent.as_str(), "all"),
13858                ("all", "all"),
13859            ];
13860
13861            // Ensure we don't double-apply deltas if agent/source is already "all".
13862            for idx in 0..permutations.len() {
13863                let (a, s) = permutations[idx];
13864                if permutations[..idx].contains(&(a, s)) {
13865                    continue;
13866                }
13867                let key = (*day_id, a.to_owned(), s.to_owned());
13868                let entry = expanded.entry(key).or_default();
13869                entry.session_count_delta += delta.session_count_delta;
13870                entry.message_count_delta += delta.message_count_delta;
13871                entry.total_chars_delta += delta.total_chars_delta;
13872            }
13873        }
13874
13875        let mut out: Vec<(i64, String, String, StatsDelta)> = expanded
13876            .into_iter()
13877            .map(|((d, a, s), delta)| (d, a, s, delta))
13878            .collect();
13879        out.sort_by(|(d1, a1, s1, _), (d2, a2, s2, _)| {
13880            d1.cmp(d2).then_with(|| a1.cmp(a2)).then_with(|| s1.cmp(s2))
13881        });
13882        out
13883    }
13884
13885    /// Check if the aggregator is empty (no data recorded).
13886    pub fn is_empty(&self) -> bool {
13887        self.deltas.is_empty()
13888    }
13889
13890    /// Get number of distinct raw (day, agent, source) combinations recorded.
13891    pub fn raw_entry_count(&self) -> usize {
13892        self.deltas.len()
13893    }
13894}
13895
13896// -------------------------------------------------------------------------
13897// TokenStatsAggregator — Batched Token Analytics Daily Stats
13898// -------------------------------------------------------------------------
13899// Mirrors StatsAggregator pattern for token-level metrics.
13900// Aggregates token usage in memory during batch ingestion, then flushes
13901// to token_daily_stats in a single batched INSERT...ON CONFLICT operation.
13902
/// Running token-usage totals for one
/// (day_id, agent, source, model_family) combination.
#[derive(Debug, Default, Clone)]
pub struct TokenStatsDelta {
    /// Number of recorded messages (incremented once per `record` call).
    pub api_call_count: i64,
    /// Messages whose role was `"user"`.
    pub user_message_count: i64,
    /// Messages whose role was `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Messages whose role was `"tool"`.
    pub tool_message_count: i64,
    /// Sum of input tokens.
    pub total_input_tokens: i64,
    /// Sum of output tokens.
    pub total_output_tokens: i64,
    /// Sum of cache-read tokens.
    pub total_cache_read_tokens: i64,
    /// Sum of cache-creation tokens.
    pub total_cache_creation_tokens: i64,
    /// Sum of thinking tokens.
    pub total_thinking_tokens: i64,
    /// Sum of each message's overall token total.
    pub grand_total_tokens: i64,
    /// Sum of message content lengths in characters.
    pub total_content_chars: i64,
    /// Sum of tool calls.
    pub total_tool_calls: i64,
    /// Sum of estimated cost in USD.
    pub estimated_cost_usd: f64,
    /// Number of sessions attributed to this bucket.
    pub session_count: i64,
}
13921
13922/// In-memory aggregator for batched token daily stats updates.
13923///
13924/// During batch ingestion, accumulate token deltas per (day_id, agent, source, model_family) key.
13925/// After processing, call `expand()` to generate the 5 permutation keys, then flush via
13926/// `update_token_daily_stats_batched_in_tx`.
13927#[derive(Debug, Default)]
13928pub struct TokenStatsAggregator {
13929    /// Raw deltas keyed by (day_id, agent_slug, source_id, model_family).
13930    deltas: HashMap<(i64, String, String, String), TokenStatsDelta>,
13931}
13932
13933impl TokenStatsAggregator {
13934    pub fn new() -> Self {
13935        Self {
13936            deltas: HashMap::new(),
13937        }
13938    }
13939
13940    /// Record a single message's token contribution.
13941    #[allow(clippy::too_many_arguments)]
13942    pub fn record(
13943        &mut self,
13944        agent_slug: &str,
13945        source_id: &str,
13946        day_id: i64,
13947        model_family: &str,
13948        role: &str,
13949        usage: &crate::connectors::ExtractedTokenUsage,
13950        content_chars: i64,
13951        estimated_cost_usd: f64,
13952    ) {
13953        let key = (
13954            day_id,
13955            agent_slug.to_owned(),
13956            source_id.to_owned(),
13957            model_family.to_owned(),
13958        );
13959        let delta = self.deltas.entry(key).or_default();
13960
13961        delta.api_call_count += 1;
13962        match role {
13963            "user" => delta.user_message_count += 1,
13964            "assistant" | "agent" => delta.assistant_message_count += 1,
13965            "tool" => delta.tool_message_count += 1,
13966            _ => {}
13967        }
13968
13969        delta.total_input_tokens += usage.input_tokens.unwrap_or(0);
13970        delta.total_output_tokens += usage.output_tokens.unwrap_or(0);
13971        delta.total_cache_read_tokens += usage.cache_read_tokens.unwrap_or(0);
13972        delta.total_cache_creation_tokens += usage.cache_creation_tokens.unwrap_or(0);
13973        delta.total_thinking_tokens += usage.thinking_tokens.unwrap_or(0);
13974        delta.grand_total_tokens += usage.total_tokens().unwrap_or(0);
13975        delta.total_content_chars += content_chars;
13976        delta.total_tool_calls += usage.tool_call_count as i64;
13977        delta.estimated_cost_usd += estimated_cost_usd;
13978    }
13979
13980    /// Record a session count bump for a given day/agent/source/model.
13981    pub fn record_session(
13982        &mut self,
13983        agent_slug: &str,
13984        source_id: &str,
13985        day_id: i64,
13986        model_family: &str,
13987    ) {
13988        let key = (
13989            day_id,
13990            agent_slug.to_owned(),
13991            source_id.to_owned(),
13992            model_family.to_owned(),
13993        );
13994        self.deltas.entry(key).or_default().session_count += 1;
13995    }
13996
13997    /// Expand raw deltas into 5 permutation keys for the 4-dimensional composite PK:
13998    /// - (agent, source, model)  — specific all three
13999    /// - ("all", source, model)  — all agents
14000    /// - (agent, "all", model)   — all sources
14001    /// - (agent, source, "all")  — all models
14002    /// - ("all", "all", "all")   — global total
14003    pub fn expand(&self) -> Vec<(i64, String, String, String, TokenStatsDelta)> {
14004        let mut expanded: HashMap<(i64, String, String, String), TokenStatsDelta> = HashMap::new();
14005
14006        for ((day_id, agent, source, model), delta) in &self.deltas {
14007            let permutations = [
14008                (agent.as_str(), source.as_str(), model.as_str()),
14009                ("all", source.as_str(), model.as_str()),
14010                (agent.as_str(), "all", model.as_str()),
14011                (agent.as_str(), source.as_str(), "all"),
14012                ("all", "all", "all"),
14013            ];
14014
14015            for idx in 0..permutations.len() {
14016                let (a, s, m) = permutations[idx];
14017                // Deduplicate if agent/source/model is already "all"
14018                if permutations[..idx].contains(&(a, s, m)) {
14019                    continue;
14020                }
14021                let key = (*day_id, a.to_owned(), s.to_owned(), m.to_owned());
14022                let entry = expanded.entry(key).or_default();
14023                entry.api_call_count += delta.api_call_count;
14024                entry.user_message_count += delta.user_message_count;
14025                entry.assistant_message_count += delta.assistant_message_count;
14026                entry.tool_message_count += delta.tool_message_count;
14027                entry.total_input_tokens += delta.total_input_tokens;
14028                entry.total_output_tokens += delta.total_output_tokens;
14029                entry.total_cache_read_tokens += delta.total_cache_read_tokens;
14030                entry.total_cache_creation_tokens += delta.total_cache_creation_tokens;
14031                entry.total_thinking_tokens += delta.total_thinking_tokens;
14032                entry.grand_total_tokens += delta.grand_total_tokens;
14033                entry.total_content_chars += delta.total_content_chars;
14034                entry.total_tool_calls += delta.total_tool_calls;
14035                entry.estimated_cost_usd += delta.estimated_cost_usd;
14036                entry.session_count += delta.session_count;
14037            }
14038        }
14039
14040        let mut out: Vec<(i64, String, String, String, TokenStatsDelta)> = expanded
14041            .into_iter()
14042            .map(|((d, a, s, m), delta)| (d, a, s, m, delta))
14043            .collect();
14044        out.sort_by(|(d1, a1, s1, m1, _), (d2, a2, s2, m2, _)| {
14045            d1.cmp(d2)
14046                .then_with(|| a1.cmp(a2))
14047                .then_with(|| s1.cmp(s2))
14048                .then_with(|| m1.cmp(m2))
14049        });
14050        out
14051    }
14052
14053    pub fn is_empty(&self) -> bool {
14054        self.deltas.is_empty()
14055    }
14056
14057    pub fn raw_entry_count(&self) -> usize {
14058        self.deltas.len()
14059    }
14060}
14061
14062// -------------------------------------------------------------------------
14063// AnalyticsRollupAggregator — Batched usage_hourly + usage_daily Updates
14064// -------------------------------------------------------------------------
14065// Accumulates per-message deltas in memory, then flushes to both
14066// usage_hourly and usage_daily in a single batched operation.
14067
/// Accumulated counters for one usage rollup bucket, i.e. one
/// (bucket, agent_slug, workspace_id, source_id) key.
#[derive(Debug, Default, Clone)]
pub struct UsageRollupDelta {
    /// Messages recorded in this bucket.
    pub message_count: i64,
    /// Messages whose role was `"user"`.
    pub user_message_count: i64,
    /// Messages whose role was `"assistant"` or `"agent"`.
    pub assistant_message_count: i64,
    /// Sum of per-message tool call counts.
    pub tool_call_count: i64,
    /// Messages flagged as containing a plan.
    pub plan_message_count: i64,
    /// Estimated content tokens of plan messages.
    pub plan_content_tokens_est_total: i64,
    /// API token totals of plan messages whose data source was `"api"`.
    pub plan_api_tokens_total: i64,
    /// Messages whose data source was `"api"`.
    pub api_coverage_message_count: i64,
    /// Estimated content tokens across all messages.
    pub content_tokens_est_total: i64,
    /// Estimated content tokens of user messages.
    pub content_tokens_est_user: i64,
    /// Estimated content tokens of assistant/agent messages.
    pub content_tokens_est_assistant: i64,
    /// Sum of all API token categories (API-sourced messages only).
    pub api_tokens_total: i64,
    /// API input tokens (API-sourced messages only).
    pub api_input_tokens_total: i64,
    /// API output tokens (API-sourced messages only).
    pub api_output_tokens_total: i64,
    /// API cache-read tokens (API-sourced messages only).
    pub api_cache_read_tokens_total: i64,
    /// API cache-creation tokens (API-sourced messages only).
    pub api_cache_creation_tokens_total: i64,
    /// API thinking tokens (API-sourced messages only).
    pub api_thinking_tokens_total: i64,
}
14089
/// One `message_metrics` row waiting to be batch-inserted.
#[derive(Clone, Debug)]
pub struct MessageMetricsEntry {
    /// ID of the message this row describes.
    pub message_id: i64,
    /// Message creation time in milliseconds.
    pub created_at_ms: i64,
    /// Hour bucket; used as the key of the hourly rollup.
    pub hour_id: i64,
    /// Day bucket; used as the key of the daily and per-model rollups.
    pub day_id: i64,
    /// Slug of the agent that produced the conversation.
    pub agent_slug: String,
    /// Workspace the message belongs to.
    pub workspace_id: i64,
    /// Source the message was ingested from.
    pub source_id: String,
    /// Message role; rollups recognise `"user"`, `"assistant"` and `"agent"`.
    pub role: String,
    /// Content length in characters.
    pub content_chars: i64,
    /// Estimated token count of the content.
    pub content_tokens_est: i64,
    /// Model name, when known.
    pub model_name: Option<String>,
    /// Model family; part of the per-model rollup key.
    pub model_family: String,
    /// Model tier; part of the per-model rollup key.
    pub model_tier: String,
    /// Provider of the model.
    pub provider: String,
    /// Input tokens reported for the message, if any.
    pub api_input_tokens: Option<i64>,
    /// Output tokens reported for the message, if any.
    pub api_output_tokens: Option<i64>,
    /// Cache-read tokens reported for the message, if any.
    pub api_cache_read_tokens: Option<i64>,
    /// Cache-creation tokens reported for the message, if any.
    pub api_cache_creation_tokens: Option<i64>,
    /// Thinking tokens reported for the message, if any.
    pub api_thinking_tokens: Option<i64>,
    /// Service tier reported for the message, if any.
    pub api_service_tier: Option<String>,
    /// Origin of the token numbers; rollups treat `"api"` as API-sourced.
    pub api_data_source: String,
    /// Number of tool calls in the message.
    pub tool_call_count: i64,
    /// Whether the message contains any tool call.
    pub has_tool_calls: bool,
    /// Whether the message was classified as containing a plan.
    pub has_plan: bool,
}
14118
14119/// In-memory aggregator for batched usage_hourly and usage_daily rollup updates.
14120///
14121/// Keyed by (bucket_id, agent_slug, workspace_id, source_id).
14122/// Maintains separate hourly and daily delta maps.
14123#[derive(Debug, Default)]
14124pub struct AnalyticsRollupAggregator {
14125    hourly: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14126    daily: HashMap<(i64, String, i64, String), UsageRollupDelta>,
14127    models_daily: HashMap<(i64, String, i64, String, String, String), UsageRollupDelta>,
14128}
14129
14130impl AnalyticsRollupAggregator {
14131    pub fn new() -> Self {
14132        Self::default()
14133    }
14134
14135    /// Record a single message's contribution to both hourly and daily rollups.
14136    pub fn record(&mut self, entry: &MessageMetricsEntry) {
14137        let content_est = entry.content_tokens_est;
14138        let api_total = entry.api_input_tokens.unwrap_or(0)
14139            + entry.api_output_tokens.unwrap_or(0)
14140            + entry.api_cache_read_tokens.unwrap_or(0)
14141            + entry.api_cache_creation_tokens.unwrap_or(0)
14142            + entry.api_thinking_tokens.unwrap_or(0);
14143        let is_api = entry.api_data_source == "api";
14144        let is_user = entry.role == "user";
14145        let is_assistant = entry.role == "assistant" || entry.role == "agent";
14146
14147        // Apply to both hourly and daily
14148        for (map, bucket_id) in [
14149            (&mut self.hourly, entry.hour_id),
14150            (&mut self.daily, entry.day_id),
14151        ] {
14152            let key = (
14153                bucket_id,
14154                entry.agent_slug.clone(),
14155                entry.workspace_id,
14156                entry.source_id.clone(),
14157            );
14158            let d = map.entry(key).or_default();
14159            d.message_count += 1;
14160            if is_user {
14161                d.user_message_count += 1;
14162                d.content_tokens_est_user += content_est;
14163            }
14164            if is_assistant {
14165                d.assistant_message_count += 1;
14166                d.content_tokens_est_assistant += content_est;
14167            }
14168            d.tool_call_count += entry.tool_call_count;
14169            if entry.has_plan {
14170                d.plan_message_count += 1;
14171                d.plan_content_tokens_est_total += content_est;
14172                if is_api {
14173                    d.plan_api_tokens_total += api_total;
14174                }
14175            }
14176            if is_api {
14177                d.api_coverage_message_count += 1;
14178                d.api_tokens_total += api_total;
14179                d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14180                d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14181                d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14182                d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14183                d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14184            }
14185            d.content_tokens_est_total += content_est;
14186        }
14187
14188        let model_key = (
14189            entry.day_id,
14190            entry.agent_slug.clone(),
14191            entry.workspace_id,
14192            entry.source_id.clone(),
14193            entry.model_family.clone(),
14194            entry.model_tier.clone(),
14195        );
14196        let d = self.models_daily.entry(model_key).or_default();
14197        d.message_count += 1;
14198        if is_user {
14199            d.user_message_count += 1;
14200            d.content_tokens_est_user += content_est;
14201        }
14202        if is_assistant {
14203            d.assistant_message_count += 1;
14204            d.content_tokens_est_assistant += content_est;
14205        }
14206        d.tool_call_count += entry.tool_call_count;
14207        if entry.has_plan {
14208            d.plan_message_count += 1;
14209            d.plan_content_tokens_est_total += content_est;
14210            if is_api {
14211                d.plan_api_tokens_total += api_total;
14212            }
14213        }
14214        if is_api {
14215            d.api_coverage_message_count += 1;
14216            d.api_tokens_total += api_total;
14217            d.api_input_tokens_total += entry.api_input_tokens.unwrap_or(0);
14218            d.api_output_tokens_total += entry.api_output_tokens.unwrap_or(0);
14219            d.api_cache_read_tokens_total += entry.api_cache_read_tokens.unwrap_or(0);
14220            d.api_cache_creation_tokens_total += entry.api_cache_creation_tokens.unwrap_or(0);
14221            d.api_thinking_tokens_total += entry.api_thinking_tokens.unwrap_or(0);
14222        }
14223        d.content_tokens_est_total += content_est;
14224    }
14225
14226    pub fn is_empty(&self) -> bool {
14227        self.hourly.is_empty() && self.daily.is_empty() && self.models_daily.is_empty()
14228    }
14229
14230    pub fn hourly_entry_count(&self) -> usize {
14231        self.hourly.len()
14232    }
14233
14234    pub fn daily_entry_count(&self) -> usize {
14235        self.daily.len()
14236    }
14237
14238    pub fn models_daily_entry_count(&self) -> usize {
14239        self.models_daily.len()
14240    }
14241}
14242
14243/// Whether the current role should be considered for plan attribution.
14244///
14245/// Plan attribution v2 defaults to assistant/agent messages only.
14246fn has_plan_for_role(role: &str, content: &str) -> bool {
14247    let role = role.trim();
14248    (role.eq_ignore_ascii_case("assistant") || role.eq_ignore_ascii_case("agent"))
14249        && has_plan_heuristic(content)
14250}
14251
14252/// Heuristic to detect "plan" messages.
14253///
14254/// v2 behavior:
14255/// - Require an explicit plan marker near the top of the message.
14256/// - Require structured steps (numbered or bullets) to reduce false positives.
14257/// - Avoid classifying tool-output blobs as plans.
14258fn has_plan_heuristic(content: &str) -> bool {
14259    if content.len() < 24 {
14260        return false;
14261    }
14262
14263    let lower = content.to_lowercase();
14264
14265    // Ignore tool-output-like blobs unless they also have a strong plan header.
14266    let looks_like_tool_blob = lower.contains("```")
14267        || lower.contains("\"tool\"")
14268        || lower.contains("stdout:")
14269        || lower.contains("stderr:")
14270        || lower.contains("exit code:");
14271
14272    let mut lines: Vec<&str> = Vec::with_capacity(60);
14273    let mut in_fenced_code = false;
14274    for raw in lower.lines() {
14275        let line = raw.trim();
14276        if line.starts_with("```") {
14277            in_fenced_code = !in_fenced_code;
14278            continue;
14279        }
14280        if in_fenced_code || line.is_empty() {
14281            continue;
14282        }
14283        lines.push(line);
14284        if lines.len() >= 60 {
14285            break;
14286        }
14287    }
14288
14289    let header_pos = lines.iter().position(|line| {
14290        line.starts_with("## plan")
14291            || line.starts_with("# plan")
14292            || line.starts_with("plan:")
14293            || line.starts_with("implementation plan")
14294            || line.starts_with("next steps:")
14295            || line.starts_with("action plan:")
14296    });
14297    let preview_top = lines.iter().take(8).copied().collect::<Vec<_>>().join("\n");
14298    let header_near_top = header_pos.is_some_and(|idx| idx <= 6) || preview_top.contains("plan:");
14299
14300    if !header_near_top {
14301        return false;
14302    }
14303    if looks_like_tool_blob && header_pos.is_none() {
14304        return false;
14305    }
14306
14307    let numbered_steps = lines
14308        .iter()
14309        .filter(|line| is_numbered_step_line(line))
14310        .count();
14311    let bullet_steps = lines
14312        .iter()
14313        .filter(|line| {
14314            line.starts_with("- ")
14315                || line.starts_with("* ")
14316                || line.starts_with("+ ")
14317                || line.starts_with("- [ ] ")
14318                || line.starts_with("- [x] ")
14319        })
14320        .count();
14321
14322    numbered_steps >= 2 || (numbered_steps >= 1 && bullet_steps >= 1) || bullet_steps >= 3
14323}
14324
/// Report whether `line` starts like a numbered list item.
///
/// After leading whitespace the line must begin with 1 to 3 ASCII digits
/// followed immediately by `". "` or `") "`.
fn is_numbered_step_line(line: &str) -> bool {
    let body = line.trim_start();
    let after_digits = body.trim_start_matches(|c: char| c.is_ascii_digit());
    // ASCII digits are one byte each, so the byte difference is the digit count.
    let digit_count = body.len() - after_digits.len();
    (1..=3).contains(&digit_count)
        && (after_digits.starts_with(". ") || after_digits.starts_with(") "))
}
14334
/// One `token_usage` row waiting to be batch-inserted.
#[derive(Clone, Debug)]
pub struct TokenUsageEntry {
    /// ID of the message the usage belongs to.
    pub message_id: i64,
    /// ID of the conversation containing the message.
    pub conversation_id: i64,
    /// ID of the agent that produced the conversation.
    pub agent_id: i64,
    /// Workspace ID, when the conversation has one.
    pub workspace_id: Option<i64>,
    /// Source the message was ingested from.
    pub source_id: String,
    /// Message timestamp in milliseconds.
    pub timestamp_ms: i64,
    /// Day bucket of the message.
    pub day_id: i64,
    /// Model name, when known.
    pub model_name: Option<String>,
    /// Model family, when known.
    pub model_family: Option<String>,
    /// Model tier, when known.
    pub model_tier: Option<String>,
    /// Service tier, when known.
    pub service_tier: Option<String>,
    /// Provider, when known.
    pub provider: Option<String>,
    /// Input tokens, when reported.
    pub input_tokens: Option<i64>,
    /// Output tokens, when reported.
    pub output_tokens: Option<i64>,
    /// Cache-read tokens, when reported.
    pub cache_read_tokens: Option<i64>,
    /// Cache-creation tokens, when reported.
    pub cache_creation_tokens: Option<i64>,
    /// Thinking tokens, when reported.
    pub thinking_tokens: Option<i64>,
    /// Overall token total, when available.
    pub total_tokens: Option<i64>,
    /// Estimated cost in USD, when it could be priced.
    pub estimated_cost_usd: Option<f64>,
    /// Message role.
    pub role: String,
    /// Content length in characters.
    pub content_chars: i64,
    /// Whether the message contains any tool call.
    pub has_tool_calls: bool,
    /// Number of tool calls in the message.
    pub tool_call_count: u32,
    /// Origin of the token numbers.
    pub data_source: String,
}
14363
14364// -------------------------------------------------------------------------
14365// PricingTable — In-memory cache for model_pricing lookups (bead z9fse.10)
14366// -------------------------------------------------------------------------
14367
/// A single row of the `model_pricing` table, held in memory.
#[derive(Clone, Debug)]
pub struct PricingEntry {
    /// Pattern that model names are matched against.
    pub model_pattern: String,
    /// Provider this price applies to.
    pub provider: String,
    /// Input cost per "mtok" (presumably one million tokens; confirm).
    pub input_cost_per_mtok: f64,
    /// Output cost per "mtok".
    pub output_cost_per_mtok: f64,
    /// Cache-read cost per "mtok", when the row defines one.
    pub cache_read_cost_per_mtok: Option<f64>,
    /// Cache-creation cost per "mtok", when the row defines one.
    pub cache_creation_cost_per_mtok: Option<f64>,
    /// Effective date as day_id (days since 2020-01-01).
    pub effective_day_id: i64,
}
14380
/// Diagnostics for pricing coverage during a batch operation.
#[derive(Debug, Clone, Default)]
pub struct PricingDiagnostics {
    /// Number of `record_priced` calls.
    pub priced_count: u64,
    /// Number of `record_unpriced` calls.
    pub unpriced_count: u64,
    /// Unpriced model name → occurrence count. Holds every name recorded (a
    /// missing name is stored as `"(none)"`); only `log_summary` narrows this
    /// down to the five most frequent.
    pub unknown_models: HashMap<String, u64>,
}
14389
14390impl PricingDiagnostics {
14391    fn record_priced(&mut self) {
14392        self.priced_count += 1;
14393    }
14394
14395    fn record_unpriced(&mut self, model_name: Option<&str>) {
14396        self.unpriced_count += 1;
14397        let key = model_name.unwrap_or("(none)").to_string();
14398        *self.unknown_models.entry(key).or_insert(0) += 1;
14399    }
14400
14401    /// Log a summary of pricing coverage.
14402    pub fn log_summary(&self) {
14403        let total = self.priced_count + self.unpriced_count;
14404        if total == 0 {
14405            return;
14406        }
14407        let pct = (self.priced_count as f64 / total as f64) * 100.0;
14408        tracing::info!(
14409            target: "cass::analytics::pricing",
14410            priced = self.priced_count,
14411            unpriced = self.unpriced_count,
14412            total = total,
14413            coverage_pct = format!("{pct:.1}%"),
14414            "pricing coverage"
14415        );
14416        if !self.unknown_models.is_empty() {
14417            let mut sorted: Vec<_> = self.unknown_models.iter().collect();
14418            sorted.sort_by(|a, b| b.1.cmp(a.1));
14419            for (model, count) in sorted.iter().take(5) {
14420                tracing::debug!(
14421                    target: "cass::analytics::pricing",
14422                    model = model.as_str(),
14423                    count = count,
14424                    "unknown model (no pricing)"
14425                );
14426            }
14427        }
14428    }
14429}
14430
/// In-memory pricing table loaded from `model_pricing` for fast lookups.
#[derive(Debug, Clone)]
pub struct PricingTable {
    /// Rows in the order they were loaded; `franken_load` requests them with
    /// `ORDER BY effective_date DESC`.
    entries: Vec<PricingEntry>,
}
14436
14437impl PricingTable {
14438    /// Load all pricing entries from the database.
14439    pub fn load(conn: &FrankenConnection) -> Result<Self> {
14440        Self::franken_load(conn)
14441    }
14442
14443    /// Load all pricing entries from a frankensqlite connection.
14444    pub fn franken_load(conn: &FrankenConnection) -> Result<Self> {
14445        let rows = conn.query(
14446            "SELECT model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
14447                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
14448             FROM model_pricing
14449             ORDER BY effective_date DESC",
14450        )?;
14451        let mut entries = Vec::with_capacity(rows.len());
14452        for row in &rows {
14453            let effective_date: String = row.get_typed(6)?;
14454            let effective_day_id = date_str_to_day_id(&effective_date)?;
14455            entries.push(PricingEntry {
14456                model_pattern: row.get_typed(0)?,
14457                provider: row.get_typed(1)?,
14458                input_cost_per_mtok: row.get_typed(2)?,
14459                output_cost_per_mtok: row.get_typed(3)?,
14460                cache_read_cost_per_mtok: row.get_typed(4)?,
14461                cache_creation_cost_per_mtok: row.get_typed(5)?,
14462                effective_day_id,
14463            });
14464        }
14465        Ok(Self { entries })
14466    }
14467
14468    /// Look up the best pricing entry for a given model name and date.
14469    ///
14470    /// Selection rules:
14471    /// 1. Pattern must match model_name (SQL LIKE semantics).
14472    /// 2. effective_day_id must be <= message_day_id.
14473    /// 3. Among matches, prefer the most recent effective_date.
14474    /// 4. Tie-break by pattern specificity (longest pattern wins).
14475    pub fn lookup(&self, model_name: &str, message_day_id: i64) -> Option<&PricingEntry> {
14476        let mut best: Option<&PricingEntry> = None;
14477
14478        for entry in &self.entries {
14479            if entry.effective_day_id > message_day_id {
14480                continue;
14481            }
14482            if !sql_like_match(model_name, &entry.model_pattern) {
14483                continue;
14484            }
14485
14486            match best {
14487                None => best = Some(entry),
14488                Some(current) => {
14489                    if entry.effective_day_id > current.effective_day_id
14490                        || (entry.effective_day_id == current.effective_day_id
14491                            && entry.model_pattern.len() > current.model_pattern.len())
14492                    {
14493                        best = Some(entry);
14494                    }
14495                }
14496            }
14497        }
14498
14499        best
14500    }
14501
14502    /// Compute estimated cost in USD for a set of token counts.
14503    ///
14504    /// Returns `None` if no pricing entry matches or if no token counts are available.
14505    pub fn compute_cost(
14506        &self,
14507        model_name: Option<&str>,
14508        message_day_id: i64,
14509        input_tokens: Option<i64>,
14510        output_tokens: Option<i64>,
14511        cache_read_tokens: Option<i64>,
14512        cache_creation_tokens: Option<i64>,
14513    ) -> Option<f64> {
14514        let model = model_name?;
14515        let pricing = self.lookup(model, message_day_id)?;
14516
14517        if input_tokens.is_none() && output_tokens.is_none() {
14518            return None;
14519        }
14520
14521        let mut cost = 0.0;
14522        let cache_read = cache_read_tokens.unwrap_or(0);
14523        let cache_creation = cache_creation_tokens.unwrap_or(0);
14524        // input_tokens includes cache tokens as a subset; subtract them
14525        // so we don't charge at both the full input rate AND the cache rate.
14526        let non_cache_input = input_tokens
14527            .unwrap_or(0)
14528            .saturating_sub(cache_read)
14529            .saturating_sub(cache_creation)
14530            .max(0);
14531        cost += non_cache_input as f64 * pricing.input_cost_per_mtok / 1_000_000.0;
14532        cost += output_tokens.unwrap_or(0) as f64 * pricing.output_cost_per_mtok / 1_000_000.0;
14533
14534        if let Some(cache_price) = pricing.cache_read_cost_per_mtok {
14535            cost += cache_read as f64 * cache_price / 1_000_000.0;
14536        }
14537        if let Some(cache_price) = pricing.cache_creation_cost_per_mtok {
14538            cost += cache_creation as f64 * cache_price / 1_000_000.0;
14539        }
14540
14541        Some(cost)
14542    }
14543
14544    /// Whether the pricing table has any entries.
14545    pub fn is_empty(&self) -> bool {
14546        self.entries.is_empty()
14547    }
14548}
14549
14550/// Convert "YYYY-MM-DD" date string to day_id (days since 2020-01-01),
14551/// matching the format produced by `day_id_from_millis`.
14552fn date_str_to_day_id(s: &str) -> Result<i64> {
14553    use chrono::NaiveDate;
14554    const EPOCH_2020: NaiveDate = match NaiveDate::from_ymd_opt(2020, 1, 1) {
14555        Some(d) => d,
14556        None => unreachable!(),
14557    };
14558    NaiveDate::parse_from_str(s, "%Y-%m-%d")
14559        .map(|d| (d - EPOCH_2020).num_days())
14560        .with_context(|| format!("invalid effective_date '{s}'"))
14561}
14562
/// SQL LIKE pattern matcher (case-insensitive). `%` = any sequence, `_` = any single char.
///
/// Case folding is ASCII-only: both operands are ASCII-lowercased before
/// matching, so non-ASCII letters must match exactly.
fn sql_like_match(value: &str, pattern: &str) -> bool {
    sql_like_match_bytes(
        value.to_ascii_lowercase().as_bytes(),
        pattern.to_ascii_lowercase().as_bytes(),
    )
}

/// Determine the byte length of the UTF-8 character starting at `b`.
///
/// `b` must be the first byte of a character; the result is derived from the
/// lead-byte range only and is not meaningful for continuation bytes.
fn utf8_char_len(b: u8) -> usize {
    match b {
        0x00..=0x7F => 1,
        0x80..=0xDF => 2,
        0xE0..=0xEF => 3,
        _ => 4,
    }
}

/// Case-sensitive LIKE matcher over the UTF-8 bytes of `val` and `pat`.
///
/// `%` matches any run of whole characters (including none) and `_` matches
/// exactly one whole character; every other pattern byte must equal the
/// corresponding value byte.
///
/// The match is iterative: only the most recent `%` is remembered as a
/// backtrack point, and on a mismatch that `%` absorbs one more character.
/// This bounds the work at roughly `val.len() * pat.len()` steps and uses
/// constant stack, so long values and patterns with many `%` cannot overflow
/// the stack or backtrack exponentially.
fn sql_like_match_bytes(val: &[u8], pat: &[u8]) -> bool {
    let mut v = 0; // index of the next unmatched byte in `val`
    let mut p = 0; // index of the next unmatched byte in `pat`
    // Most recent `%`: (pattern index just past it, value index where the
    // text it absorbs currently ends).
    let mut resume: Option<(usize, usize)> = None;

    while v < val.len() {
        if p < pat.len() {
            match pat[p] {
                b'%' => {
                    // Start by letting `%` absorb nothing.
                    p += 1;
                    resume = Some((p, v));
                    continue;
                }
                b'_' => {
                    // Consume one full UTF-8 character, not just one byte.
                    let width = utf8_char_len(val[v]);
                    if v + width <= val.len() {
                        v += width;
                        p += 1;
                        continue;
                    }
                }
                c if c == val[v] => {
                    v += 1;
                    p += 1;
                    continue;
                }
                _ => {}
            }
        }

        // Mismatch, or pattern exhausted with value left over: widen the last
        // `%` by one character and retry from just after it.
        let Some((after_wildcard, absorbed_end)) = resume else {
            return false;
        };
        let widened = absorbed_end + utf8_char_len(val[absorbed_end]);
        if widened > val.len() {
            return false;
        }
        resume = Some((after_wildcard, widened));
        p = after_wildcard;
        v = widened;
    }

    // The value is consumed; whatever remains of the pattern must be able to
    // match the empty string, i.e. consist solely of `%`.
    pat[p..].iter().all(|&b| b == b'%')
}
14620
14621fn rebuild_batch_size_env(var: &str, default: usize) -> usize {
14622    dotenvy::var(var)
14623        .ok()
14624        .and_then(|raw| raw.parse::<usize>().ok())
14625        .filter(|value| *value > 0)
14626        .unwrap_or(default)
14627}
14628
14629/// Returns true when the error chain represents a real `FrankenError::OutOfMemory`
14630/// (typed variant) or a bare "out of memory" / "not enough memory" message.
14631///
14632/// We *deliberately* do not do substring matching on the rendered chain: frankensqlite's
14633/// `FrankenError::OutOfMemory` renders as the literal "out of memory" and is also emitted
14634/// for several non-process-OOM internal conditions (VFS buffer / VDBE register allocation).
14635/// Contextual messages like "connector parse failed: not enough memory in record" must not
14636/// be promoted into the OOM-bisect/quarantine path. See `retryable_franken_anyhow` above
14637/// for the same downcast idiom.
14638fn is_out_of_memory_error<E: OutOfMemoryProbe + ?Sized>(err: &E) -> bool {
14639    err.is_out_of_memory()
14640}
14641
/// Error types that can report whether they represent an out-of-memory
/// condition; this is the bound required by `is_out_of_memory_error`.
trait OutOfMemoryProbe {
    /// Returns `true` when this error should be treated as out-of-memory.
    fn is_out_of_memory(&self) -> bool;
}
14645
14646impl OutOfMemoryProbe for anyhow::Error {
14647    fn is_out_of_memory(&self) -> bool {
14648        self.chain().any(|cause| {
14649            if cause
14650                .downcast_ref::<frankensqlite::FrankenError>()
14651                .is_some_and(|err| matches!(err, frankensqlite::FrankenError::OutOfMemory))
14652            {
14653                return true;
14654            }
14655            is_exact_out_of_memory_message(&cause.to_string())
14656        })
14657    }
14658}
14659
14660impl OutOfMemoryProbe for frankensqlite::FrankenError {
14661    fn is_out_of_memory(&self) -> bool {
14662        matches!(self, frankensqlite::FrankenError::OutOfMemory)
14663    }
14664}
14665
/// Reports whether `message`, once surrounding whitespace is removed, is
/// exactly "out of memory" or "not enough memory" (ASCII case-insensitive).
/// Longer messages that merely contain those phrases do not qualify.
fn is_exact_out_of_memory_message(message: &str) -> bool {
    let bare = message.trim();
    ["out of memory", "not enough memory"]
        .iter()
        .any(|phrase| bare.eq_ignore_ascii_case(phrase))
}
14672
14673// Second SqliteStorage impl block removed: SqliteStorage is now a type alias for FrankenStorage.
14674// All methods (insert_conversation_tree, list_agents, list_conversations, etc.) are
14675// available through FrankenStorage.
14676
/// Daily count data for histogram display.
///
/// NOTE(review): `day_id` presumably uses the same days-since-2020-01-01
/// scale as `PricingEntry::effective_day_id` — confirm against the query
/// that fills this struct.
#[derive(Debug, Clone)]
pub struct DailyCount {
    pub day_id: i64,
    pub sessions: i64,
    pub messages: i64,
    pub chars: i64,
}
14685
/// Result of an analytics rebuild operation.
///
/// NOTE(review): the `*_rows` fields presumably count rows written to the
/// like-named analytics tables, and `messages_per_sec` is presumably derived
/// from `elapsed_ms` — confirm against the rebuild routine.
#[derive(Debug, Clone)]
pub struct AnalyticsRebuildResult {
    pub message_metrics_rows: usize,
    pub usage_hourly_rows: usize,
    pub usage_daily_rows: usize,
    pub usage_models_daily_rows: usize,
    pub elapsed_ms: u64,
    pub messages_per_sec: f64,
}
14696
/// Result of rebuilding daily stats.
///
/// NOTE(review): the exact meaning of each counter is set by the rebuild
/// routine, which is not visible here — confirm before interpreting them.
#[derive(Debug, Clone)]
pub struct DailyStatsRebuildResult {
    pub rows_created: i64,
    pub total_sessions: i64,
}
14703
/// Result of purging archived data for a single agent.
///
/// Derives `Default`, so a freshly defaulted value reports zero deletions.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct AgentArchivePurgeResult {
    pub conversations_deleted: usize,
    pub messages_deleted: usize,
}
14710
/// Health status of daily stats table.
///
/// NOTE(review): `drift` presumably measures the gap between
/// `conversation_count` and `materialized_total`; the sign convention is not
/// visible here — confirm against the health check that builds this value.
#[derive(Debug, Clone)]
pub struct DailyStatsHealth {
    pub populated: bool,
    pub row_count: i64,
    pub oldest_update_ms: Option<i64>,
    pub conversation_count: i64,
    pub materialized_total: i64,
    pub drift: i64,
}
14721
14722// -------------------------------------------------------------------------
14723// FTS5 Batch Insert (P2 Opt 2.1)
14724// -------------------------------------------------------------------------
14725
/// Batch size for FTS5 inserts. With 7 columns per row (rowid + 6 cols) and
/// SQLite's SQLITE_MAX_VARIABLE_NUMBER default of 999, max batch is ~142 rows
/// (999 / 7, rounded down).
/// Using 100 for safety margin and memory efficiency.
const FTS5_BATCH_SIZE: usize = 100;
14730
/// One message row as read during an FTS rebuild.
///
/// NOTE(review): `rowid` and `message_id` are carried separately; whether
/// they can differ is not visible here — confirm against the rebuild query.
#[derive(Debug, Clone)]
struct FtsRebuildMessageRow {
    rowid: i64,
    message_id: i64,
    conversation_id: i64,
    content: String,
    created_at: Option<i64>,
}
14739
/// Conversation-level fields carried alongside messages for FTS work.
///
/// NOTE(review): presumably the per-conversation columns joined onto each
/// `FtsRebuildMessageRow` during a rebuild — confirm against the caller.
#[derive(Debug, Clone)]
struct FtsConversationProjection {
    title: String,
    agent_id: Option<i64>,
    workspace_id: Option<i64>,
    source_path: String,
}
14747
/// Entry for pending FTS5 insert.
///
/// The per-field notes describe how `FtsEntry::from_message` fills each field.
#[derive(Debug, Clone)]
pub struct FtsEntry {
    /// Message text.
    pub content: String,
    /// Conversation title, or the empty string when the conversation has none.
    pub title: String,
    /// Agent slug of the owning conversation.
    pub agent: String,
    /// Workspace path (lossy UTF-8), or the empty string when there is none.
    pub workspace: String,
    /// Conversation source path (lossy UTF-8).
    pub source_path: String,
    /// Message timestamp, falling back to the conversation start time.
    pub created_at: Option<i64>,
    /// Id of the message this entry belongs to.
    pub message_id: i64,
}
14759
14760impl FtsEntry {
14761    /// Create an FTS entry from a message and conversation.
14762    pub fn from_message(message_id: i64, msg: &Message, conv: &Conversation) -> Self {
14763        FtsEntry {
14764            content: msg.content.clone(),
14765            title: conv.title.clone().unwrap_or_default(),
14766            agent: conv.agent_slug.clone(),
14767            workspace: conv
14768                .workspace
14769                .as_ref()
14770                .map(|p| p.to_string_lossy().into_owned())
14771                .unwrap_or_default(),
14772            source_path: path_to_string(&conv.source_path),
14773            created_at: msg.created_at.or(conv.started_at),
14774            message_id,
14775        }
14776    }
14777}
14778
/// NOTE(review): presumably the maximum number of pending `FtsEntry` values
/// accumulated before a flush — confirm against the code that reads it.
const FTS_ENTRY_BATCH_MAX_DOCS: usize = 512;
/// NOTE(review): presumably the maximum total content size (1 MiB worth of
/// characters) accumulated before a flush — confirm against the code that
/// reads it.
const FTS_ENTRY_BATCH_MAX_CHARS: usize = 1024 * 1024;

/// Default batch size for the FTS rebuild INSERT (Bug #168).  When
/// `fts_messages` is empty but `messages` has 100K+ rows, a single unbounded
/// INSERT-SELECT OOMs.  This constant caps each batch so peak memory stays
/// bounded.  Override via `CASS_FTS_REBUILD_BATCH_SIZE` for tuning.
const FTS_REBUILD_BATCH_SIZE_DEFAULT: usize = 5_000;
14787
14788/// Read the FTS rebuild batch size from the environment, falling back to the
14789/// compiled-in default.
14790fn fts_rebuild_batch_size() -> usize {
14791    dotenvy::var("CASS_FTS_REBUILD_BATCH_SIZE")
14792        .ok()
14793        .and_then(|v| v.parse::<usize>().ok())
14794        .filter(|&n| n > 0)
14795        .unwrap_or(FTS_REBUILD_BATCH_SIZE_DEFAULT)
14796}
14797
14798fn flush_pending_fts_entries(
14799    storage: &FrankenStorage,
14800    tx: &FrankenTransaction<'_>,
14801    entries: &mut Vec<FtsEntry>,
14802    pending_chars: &mut usize,
14803    inserted_total: &mut usize,
14804) -> Result<()> {
14805    if entries.is_empty() {
14806        return Ok(());
14807    }
14808
14809    if storage.fts_messages_present_cached(tx) {
14810        *inserted_total += franken_batch_insert_fts(tx, entries)?;
14811    }
14812    entries.clear();
14813    *pending_chars = 0;
14814    Ok(())
14815}
14816
/// Render a path as an owned `String`, converting lossily when it is not
/// valid UTF-8.
fn path_to_string<P: AsRef<Path>>(p: P) -> String {
    String::from(p.as_ref().to_string_lossy())
}
14820
14821fn role_str(role: &MessageRole) -> String {
14822    role_as_str(role).to_owned()
14823}
14824
14825fn role_as_str(role: &MessageRole) -> &str {
14826    match role {
14827        MessageRole::User => "user",
14828        MessageRole::Agent => "agent",
14829        MessageRole::Tool => "tool",
14830        MessageRole::System => "system",
14831        MessageRole::Other(v) => v.as_str(),
14832    }
14833}
14834
14835fn agent_kind_str(kind: AgentKind) -> String {
14836    match kind {
14837        AgentKind::Cli => "cli".into(),
14838        AgentKind::VsCode => "vscode".into(),
14839        AgentKind::Hybrid => "hybrid".into(),
14840    }
14841}
14842
14843// =============================================================================
14844// Tests (bead yln.4)
14845// =============================================================================
14846
14847#[cfg(test)]
14848mod tests {
14849    use super::*;
14850    use serial_test::serial;
14851    use tempfile::TempDir;
14852
    /// Guard returned by `set_env_var`; its `Drop` impl puts the environment
    /// variable back the way it was.
    struct EnvGuard {
        /// Name of the environment variable being guarded.
        key: &'static str,
        /// Value to restore on drop; `None` means the variable is removed.
        previous: Option<String>,
    }
14857
14858    impl Drop for EnvGuard {
14859        fn drop(&mut self) {
14860            if let Some(value) = &self.previous {
14861                // SAFETY: test helper restores prior process env for isolation.
14862                unsafe {
14863                    std::env::set_var(self.key, value);
14864                }
14865            } else {
14866                // SAFETY: test helper restores prior process env for isolation.
14867                unsafe {
14868                    std::env::remove_var(self.key);
14869                }
14870            }
14871        }
14872    }
14873
    /// Set environment variable `key` to `value` for the duration of a test.
    ///
    /// Returns an `EnvGuard` that restores the previous state when dropped.
    fn set_env_var(key: &'static str, value: impl AsRef<str>) -> EnvGuard {
        // Capture the current value first; an unreadable or missing variable
        // is recorded as `None` and therefore removed again on drop.
        let previous = dotenvy::var(key).ok();
        // SAFETY: test helper toggles a process-local env var for isolation.
        unsafe {
            std::env::set_var(key, value.as_ref());
        }
        EnvGuard { key, previous }
    }
14882
14883    #[test]
14884    fn doctor_mutation_open_guard_only_targets_canonical_archive_db() {
14885        let dir = TempDir::new().unwrap();
14886        let canonical = dir.path().join("agent_search.db");
14887        let scratch = dir.path().join("scratch.db");
14888
14889        assert_eq!(
14890            doctor_mutation_lock_path_for_db_open(&canonical),
14891            Some(dir.path().join("doctor/locks/doctor-repair.lock"))
14892        );
14893        assert_eq!(doctor_mutation_lock_path_for_db_open(&scratch), None);
14894    }
14895
14896    #[test]
14897    fn doctor_lock_metadata_pid_detection_is_exact() {
14898        let current = std::process::id();
14899
14900        assert!(doctor_lock_metadata_pid_is_current_process(&format!(
14901            "schema_version=1\npid={current}\nmode=safe_auto_run\n"
14902        )));
14903        assert!(!doctor_lock_metadata_pid_is_current_process(
14904            "schema_version=1\npid=not-a-pid\n"
14905        ));
14906        assert!(!doctor_lock_metadata_pid_is_current_process(&format!(
14907            "pid={}\n",
14908            current.saturating_add(1)
14909        )));
14910    }
14911
    /// Opening the canonical DB must fail while an exclusively locked doctor
    /// mutation lock file names a different process.
    #[test]
    fn doctor_storage_open_refuses_active_doctor_mutation_lock_from_other_process() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Create the database up front, then close it again before any lock
        // file is put in place.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        // Hold the lock exclusively and replace its contents with metadata
        // naming pid 1 as the owner.
        // NOTE(review): pid=1 stands in for "some other process"; this would
        // equal the current pid if the test binary itself ran as PID 1.
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        lock_file.write_all(b"schema_version=1\npid=1\n").unwrap();
        lock_file.sync_all().unwrap();

        // A read-only open with a 25 ms timeout must be refused.
        let err =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect_err("active doctor mutation lock must block canonical DB opens");
        let message = err.to_string();
        assert!(
            message.contains("doctor mutation lock") && message.contains("active"),
            "error should identify the active doctor mutation lock: {message}"
        );

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
14947
    /// Opening the canonical DB must succeed while the doctor mutation lock is
    /// held and its metadata names the current process.
    #[test]
    fn doctor_storage_open_allows_current_doctor_process_probe() {
        use std::io::Write as _;

        let dir = TempDir::new().unwrap();
        let db_path = dir.path().join("agent_search.db");
        // Create the database up front, then close it again before any lock
        // file is put in place.
        {
            let storage = FrankenStorage::open(&db_path).unwrap();
            storage.close().unwrap();
        }

        let lock_path = doctor_mutation_lock_path_for_db_open(&db_path).unwrap();
        let mut lock_file = fs::OpenOptions::new()
            .create(true)
            .truncate(false)
            .read(true)
            .write(true)
            .open(&lock_path)
            .unwrap();
        // Hold the lock exclusively and record this process as its owner.
        fs2::FileExt::try_lock_exclusive(&lock_file).unwrap();
        lock_file.set_len(0).unwrap();
        write!(lock_file, "schema_version=1\npid={}\n", std::process::id()).unwrap();
        lock_file.sync_all().unwrap();

        // The same read-only open that is refused for a foreign owner must
        // succeed here.
        let conn =
            open_franken_raw_readonly_connection_with_timeout(&db_path, Duration::from_millis(25))
                .expect(
                    "doctor process must be able to run post-repair read probes under its own lock",
                );
        drop(conn);

        fs2::FileExt::unlock(&lock_file).unwrap();
    }
14981
    /// When the `fsqlite.`-namespaced pragma is rejected, the helper must fall
    /// back to the plain pragma and report that one as selected.
    #[test]
    fn autocommit_retain_disable_tries_compat_then_canonical_pragma() {
        // Records every pragma handed to the executor, in call order.
        let mut attempts = Vec::new();

        let selected = disable_autocommit_retain(|pragma| {
            attempts.push(pragma);
            // Fake executor: reject only the namespaced form.
            if pragma == "PRAGMA fsqlite.autocommit_retain = OFF;" {
                Err("compat namespace unavailable")
            } else {
                Ok(())
            }
        })
        .expect("canonical pragma should disable autocommit retain");

        assert_eq!(selected, "PRAGMA autocommit_retain = OFF;");
        // Every candidate in AUTOCOMMIT_RETAIN_OFF_PRAGMAS was tried, in order.
        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
    }
14999
    /// When every candidate pragma is rejected, the helper must return an
    /// error that names each pragma it attempted.
    #[test]
    fn autocommit_retain_disable_fails_closed_when_no_pragma_works() {
        // Records every pragma handed to the executor, in call order.
        let mut attempts = Vec::new();

        let err = disable_autocommit_retain(|pragma| {
            attempts.push(pragma);
            // Fake executor: reject everything.
            Err("unsupported pragma")
        })
        .expect_err("unsupported autocommit retain controls should fail closed");

        assert_eq!(attempts, AUTOCOMMIT_RETAIN_OFF_PRAGMAS);
        let message = err.to_string();
        assert!(
            message.contains("refusing to keep a long-lived MVCC connection"),
            "error should force callers away from unbounded snapshot retention: {message}"
        );
        assert!(
            message.contains("PRAGMA fsqlite.autocommit_retain = OFF;")
                && message.contains("PRAGMA autocommit_retain = OFF;"),
            "error should preserve attempted PRAGMAs for diagnostics: {message}"
        );
    }
15022
15023    /// Open a rusqlite connection on `db_path` for the narrow purpose of
15024    /// injecting (or inspecting the raw projection of) sqlite_master
15025    /// corruption patterns in test fixtures. Frankensqlite intentionally does
15026    /// not support `PRAGMA writable_schema` writes or raw inserts to
15027    /// sqlite_master (see AGENTS.md: "PRAGMA writable_schema: Not supported for
15028    /// write operations"), so these fixtures retain rusqlite as the standard-
15029    /// SQLite interop layer. All callers are in this test module and run under
15030    /// #[cfg(test)]; no production code path touches rusqlite here.
15031    fn rusqlite_test_fixture_conn(db_path: &Path) -> rusqlite::Connection {
15032        rusqlite::Connection::open(db_path).expect("open rusqlite test fixture connection")
15033    }
15034
    /// Seed a database at `db_path` with `conversations` using raw SQL.
    ///
    /// Creates the parent directory if needed, applies
    /// `HISTORICAL_RECOVERY_CORE_SCHEMA`, inserts a single agent row (id 1,
    /// slug `codex`) and then one conversation row per input conversation plus
    /// its messages. Ids are assigned sequentially: conversations from 1 in
    /// input order, messages from 1 across all conversations. Every step
    /// panics on failure.
    fn seed_historical_db_direct(
        db_path: &Path,
        conversations: &[crate::model::types::Conversation],
    ) {
        if let Some(parent) = db_path.parent() {
            fs::create_dir_all(parent).unwrap();
        }

        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
        conn.execute_batch(HISTORICAL_RECOVERY_CORE_SCHEMA).unwrap();
        // All seeded conversations are attributed to this one agent.
        conn.execute_compat(
            "INSERT INTO agents(id, slug, name, version, kind, created_at, updated_at)
             VALUES(?1, ?2, ?3, ?4, ?5, ?6, ?7)",
            fparams![1_i64, "codex", "Codex", "0.2.3", "cli", 0_i64, 0_i64],
        )
        .unwrap();

        // Message ids run continuously across conversations.
        let mut next_message_id = 1_i64;
        for (conv_index, conv) in conversations.iter().enumerate() {
            let conversation_id = i64::try_from(conv_index + 1).unwrap();
            // A conversation with a workspace gets its own workspace row whose
            // id is reused from the conversation id.
            let workspace_id = conv.workspace.as_ref().map(|workspace| {
                let workspace_id = conversation_id;
                let workspace_path = workspace.to_string_lossy().into_owned();
                conn.execute_compat(
                    "INSERT INTO workspaces(id, path, display_name) VALUES(?1, ?2, ?3)",
                    fparams![
                        workspace_id,
                        workspace_path.as_str(),
                        workspace_path.as_str()
                    ],
                )
                .unwrap();
                workspace_id
            });
            let source_path = conv.source_path.to_string_lossy().into_owned();
            let metadata_json = conv.metadata_json.to_string();
            conn.execute_compat(
                "INSERT INTO conversations (
                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
                    started_at, ended_at, approx_tokens, metadata_json, origin_host
                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8, ?9, ?10, ?11, ?12)",
                fparams![
                    conversation_id,
                    1_i64,
                    workspace_id,
                    conv.source_id.as_str(),
                    conv.external_id.as_deref(),
                    conv.title.as_deref(),
                    source_path.as_str(),
                    conv.started_at,
                    conv.ended_at,
                    conv.approx_tokens,
                    metadata_json.as_str(),
                    conv.origin_host.as_deref()
                ],
            )
            .unwrap();

            for msg in &conv.messages {
                let extra_json = msg.extra_json.to_string();
                let role = role_str(&msg.role);
                conn.execute_compat(
                    "INSERT INTO messages(
                        id, conversation_id, idx, role, author, created_at, content, extra_json
                     ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7, ?8)",
                    fparams![
                        next_message_id,
                        conversation_id,
                        msg.idx,
                        role.as_str(),
                        msg.author.as_deref(),
                        msg.created_at,
                        msg.content.as_str(),
                        extra_json.as_str()
                    ],
                )
                .unwrap();
                next_message_id += 1;
            }
        }
    }
15116
15117    // =========================================================================
15118    // User data file protection tests (bead yln.4)
15119    // =========================================================================
15120
15121    #[test]
15122    fn is_user_data_file_detects_bookmarks() {
15123        assert!(is_user_data_file(Path::new("/data/bookmarks.db")));
15124        assert!(is_user_data_file(Path::new("bookmarks.db")));
15125    }
15126
15127    #[test]
15128    fn is_user_data_file_detects_tui_state() {
15129        assert!(is_user_data_file(Path::new("/data/tui_state.json")));
15130    }
15131
15132    #[test]
15133    fn is_user_data_file_detects_sources_toml() {
15134        assert!(is_user_data_file(Path::new("/config/sources.toml")));
15135    }
15136
15137    #[test]
15138    fn is_user_data_file_detects_env() {
15139        assert!(is_user_data_file(Path::new(".env")));
15140    }
15141
15142    #[test]
15143    fn is_user_data_file_rejects_other_files() {
15144        assert!(!is_user_data_file(Path::new("index.db")));
15145        assert!(!is_user_data_file(Path::new("conversations.db")));
15146        assert!(!is_user_data_file(Path::new("random.txt")));
15147    }
15148
15149    // =========================================================================
15150    // Backup creation tests (bead yln.4)
15151    // =========================================================================
15152
15153    #[test]
15154    fn create_backup_returns_none_for_nonexistent() {
15155        let dir = TempDir::new().unwrap();
15156        let db_path = dir.path().join("nonexistent.db");
15157        let result = create_backup(&db_path).unwrap();
15158        assert!(result.is_none());
15159    }
15160
15161    #[test]
15162    fn create_backup_creates_named_file() {
15163        let dir = TempDir::new().unwrap();
15164        let db_path = dir.path().join("test.db");
15165        std::fs::write(&db_path, b"test data").unwrap();
15166
15167        let backup_path = create_backup(&db_path).unwrap();
15168        assert!(backup_path.is_some());
15169        let backup = backup_path.unwrap();
15170        assert!(backup.exists());
15171        assert!(
15172            backup
15173                .file_name()
15174                .unwrap()
15175                .to_str()
15176                .unwrap()
15177                .contains("backup")
15178        );
15179    }
15180
15181    #[test]
15182    fn create_backup_paths_are_unique() {
15183        let dir = TempDir::new().unwrap();
15184        let db_path = dir.path().join("test.db");
15185        std::fs::write(&db_path, b"test data").unwrap();
15186
15187        let first = create_backup(&db_path).unwrap().unwrap();
15188        let second = create_backup(&db_path).unwrap().unwrap();
15189
15190        assert_ne!(first, second);
15191        assert!(first.exists());
15192        assert!(second.exists());
15193    }
15194
15195    #[test]
15196    fn lexical_rebuild_messages_query_uses_conversation_idx_access_path() {
15197        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
15198        use std::path::PathBuf;
15199
15200        let dir = TempDir::new().unwrap();
15201        let db_path = dir.path().join("agent_search.db");
15202        let storage = SqliteStorage::open(&db_path).unwrap();
15203
15204        let agent = Agent {
15205            id: None,
15206            slug: "claude_code".into(),
15207            name: "Claude Code".into(),
15208            version: None,
15209            kind: AgentKind::Cli,
15210        };
15211        let agent_id = storage.ensure_agent(&agent).unwrap();
15212        let conversation = Conversation {
15213            id: None,
15214            agent_slug: "claude_code".into(),
15215            workspace: Some(PathBuf::from("/tmp/workspace")),
15216            external_id: Some("conv-1".into()),
15217            title: Some("Lexical rebuild".into()),
15218            source_path: PathBuf::from("/tmp/conv-1.jsonl"),
15219            started_at: Some(1_700_000_000_000),
15220            ended_at: Some(1_700_000_000_100),
15221            approx_tokens: None,
15222            metadata_json: serde_json::Value::Null,
15223            messages: vec![
15224                Message {
15225                    id: None,
15226                    idx: 0,
15227                    role: MessageRole::User,
15228                    author: Some("user".into()),
15229                    created_at: Some(1_700_000_000_010),
15230                    content: "first".into(),
15231                    extra_json: serde_json::Value::Null,
15232                    snippets: Vec::new(),
15233                },
15234                Message {
15235                    id: None,
15236                    idx: 1,
15237                    role: MessageRole::Agent,
15238                    author: Some("assistant".into()),
15239                    created_at: Some(1_700_000_000_020),
15240                    content: "second".into(),
15241                    extra_json: serde_json::Value::Null,
15242                    snippets: Vec::new(),
15243                },
15244            ],
15245            source_id: LOCAL_SOURCE_ID.into(),
15246            origin_host: None,
15247        };
15248        storage
15249            .insert_conversation_tree(agent_id, None, &conversation)
15250            .unwrap();
15251        let conversation_id = storage
15252            .conn
15253            .query_row_map(
15254                "SELECT id FROM conversations WHERE external_id = ?1",
15255                fparams!["conv-1"],
15256                |row| row.get_typed::<i64>(0),
15257            )
15258            .unwrap();
15259
15260        let opcodes: Vec<String> = storage
15261            .conn
15262            .query_map_collect(
15263                "EXPLAIN \
15264                 SELECT id, idx, role, author, created_at, content \
15265                 FROM messages \
15266                 WHERE conversation_id = ?1 ORDER BY idx",
15267                fparams![conversation_id],
15268                |row| row.get_typed(1),
15269            )
15270            .unwrap();
15271
15272        assert!(
15273            opcodes.iter().any(|opcode| opcode == "SeekGE"),
15274            "expected lexical rebuild message fetch to seek into the conversation_id/idx access path, got {opcodes:?}"
15275        );
15276        assert!(
15277            !opcodes.iter().any(|opcode| opcode == "SorterOpen"),
15278            "expected lexical rebuild message fetch to avoid sorter temp b-trees, got {opcodes:?}"
15279        );
15280    }
15281
15282    #[test]
15283    fn schema_check_rebuild_classification_ignores_transient_errors() {
15284        assert!(!schema_check_error_requires_rebuild(
15285            &frankensqlite::FrankenError::Busy
15286        ));
15287        assert!(!schema_check_error_requires_rebuild(
15288            &frankensqlite::FrankenError::DatabaseLocked {
15289                path: PathBuf::from("/tmp/test.db"),
15290            }
15291        ));
15292        assert!(!schema_check_error_requires_rebuild(
15293            &frankensqlite::FrankenError::CannotOpen {
15294                path: PathBuf::from("/tmp/test.db"),
15295            }
15296        ));
15297        assert!(!schema_check_error_requires_rebuild(
15298            &frankensqlite::FrankenError::Io(std::io::Error::other("disk hiccup"))
15299        ));
15300    }
15301
15302    #[test]
15303    fn schema_check_rebuild_classification_keeps_corruption_errors() {
15304        assert!(schema_check_error_requires_rebuild(
15305            &frankensqlite::FrankenError::DatabaseCorrupt {
15306                detail: "bad header".to_string(),
15307            }
15308        ));
15309        assert!(schema_check_error_requires_rebuild(
15310            &frankensqlite::FrankenError::WalCorrupt {
15311                detail: "bad wal".to_string(),
15312            }
15313        ));
15314        assert!(schema_check_error_requires_rebuild(
15315            &frankensqlite::FrankenError::NotADatabase {
15316                path: PathBuf::from("/tmp/test.db"),
15317            }
15318        ));
15319        assert!(schema_check_error_requires_rebuild(
15320            &frankensqlite::FrankenError::ShortRead {
15321                expected: 4096,
15322                actual: 64,
15323            }
15324        ));
15325    }
15326
15327    #[test]
15328    fn create_backup_refuses_raw_copy_after_retryable_vacuum_errors() {
15329        let retryable_errors = [
15330            frankensqlite::FrankenError::Busy,
15331            frankensqlite::FrankenError::BusyRecovery,
15332            frankensqlite::FrankenError::BusySnapshot {
15333                conflicting_pages: "1,2".to_string(),
15334            },
15335            frankensqlite::FrankenError::DatabaseLocked {
15336                path: PathBuf::from("/tmp/test.db"),
15337            },
15338            frankensqlite::FrankenError::LockFailed {
15339                detail: "fcntl lock still held".to_string(),
15340            },
15341            frankensqlite::FrankenError::WriteConflict { page: 7, holder: 9 },
15342            frankensqlite::FrankenError::SerializationFailure { page: 11 },
15343            frankensqlite::FrankenError::Internal("database is locked".to_string()),
15344        ];
15345
15346        for err in retryable_errors {
15347            assert!(
15348                backup_vacuum_error_requires_consistent_retry(&err),
15349                "retryable VACUUM failure must not fall back to raw bundle copy: {err}"
15350            );
15351        }
15352
15353        assert!(!backup_vacuum_error_requires_consistent_retry(
15354            &frankensqlite::FrankenError::NotADatabase {
15355                path: PathBuf::from("/tmp/test.db")
15356            }
15357        ));
15358        assert!(!backup_vacuum_error_requires_consistent_retry(
15359            &frankensqlite::FrankenError::DatabaseCorrupt {
15360                detail: "bad header".to_string()
15361            }
15362        ));
15363    }
15364
15365    #[test]
15366    fn create_backup_uses_hidden_vacuum_stage_path() {
15367        let backup_path = PathBuf::from("/tmp/test.db.backup.123.456.0");
15368        let stage_path = vacuum_stage_backup_path(&backup_path);
15369        let stage_name = stage_path
15370            .file_name()
15371            .and_then(|name| name.to_str())
15372            .unwrap_or_default();
15373
15374        assert!(stage_name.starts_with('.'));
15375        assert!(stage_name.ends_with(".vacuum-in-progress"));
15376        assert!(
15377            !is_backup_root_name(stage_name, "test.db.backup."),
15378            "incomplete VACUUM output must not be discoverable as a backup root"
15379        );
15380    }
15381
15382    #[test]
15383    fn create_backup_preserves_content() {
15384        let dir = TempDir::new().unwrap();
15385        let db_path = dir.path().join("test.db");
15386        let original_content = b"test database content 12345";
15387        std::fs::write(&db_path, original_content).unwrap();
15388
15389        let backup_path = create_backup(&db_path).unwrap().unwrap();
15390        let backup_content = std::fs::read(&backup_path).unwrap();
15391        assert_eq!(backup_content, original_content);
15392    }
15393
15394    #[test]
15395    fn create_backup_copies_sidecars_when_present() {
15396        let dir = TempDir::new().unwrap();
15397        let db_path = dir.path().join("test.db");
15398        std::fs::write(&db_path, b"db").unwrap();
15399        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15400        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15401
15402        let backup_path = create_backup(&db_path).unwrap().unwrap();
15403
15404        assert_eq!(
15405            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15406            b"wal"
15407        );
15408        assert_eq!(
15409            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15410            b"shm"
15411        );
15412    }
15413
15414    #[test]
15415    #[cfg(unix)]
15416    fn create_backup_rejects_symlink_root_during_raw_fallback() {
15417        use std::os::unix::fs::symlink;
15418
15419        let dir = TempDir::new().unwrap();
15420        let outside_db = dir.path().join("outside.db");
15421        let db_path = dir.path().join("test.db");
15422        std::fs::write(&outside_db, b"not sqlite").unwrap();
15423        symlink(&outside_db, &db_path).unwrap();
15424
15425        let err = create_backup(&db_path).unwrap_err();
15426
15427        assert!(
15428            err.to_string().contains("bundle symlink"),
15429            "unexpected error: {err:#}"
15430        );
15431        assert_eq!(std::fs::read(&outside_db).unwrap(), b"not sqlite");
15432        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15433            .unwrap()
15434            .filter_map(|entry| entry.ok())
15435            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15436            .filter(|name| name.starts_with("test.db.backup."))
15437            .collect();
15438        assert!(
15439            backup_roots.is_empty(),
15440            "symlinked backup source must not publish backup roots: {backup_roots:?}"
15441        );
15442    }
15443
15444    #[test]
15445    #[cfg(unix)]
15446    fn create_backup_rejects_symlink_sidecar_without_partial_backup() {
15447        use std::os::unix::fs::symlink;
15448
15449        let dir = TempDir::new().unwrap();
15450        let db_path = dir.path().join("test.db");
15451        let outside_wal = dir.path().join("outside.wal");
15452        let wal_path = database_sidecar_path(&db_path, "-wal");
15453        std::fs::write(&db_path, b"not sqlite").unwrap();
15454        std::fs::write(&outside_wal, b"outside wal").unwrap();
15455        symlink(&outside_wal, &wal_path).unwrap();
15456
15457        let err = create_backup(&db_path).unwrap_err();
15458
15459        assert!(
15460            err.to_string().contains("bundle symlink"),
15461            "unexpected error: {err:#}"
15462        );
15463        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15464        let backup_roots: Vec<_> = std::fs::read_dir(dir.path())
15465            .unwrap()
15466            .filter_map(|entry| entry.ok())
15467            .map(|entry| entry.file_name().to_string_lossy().into_owned())
15468            .filter(|name| name.starts_with("test.db.backup."))
15469            .collect();
15470        assert!(
15471            backup_roots.is_empty(),
15472            "sidecar preflight failure must not leave a partial backup root: {backup_roots:?}"
15473        );
15474    }
15475
15476    // =========================================================================
15477    // Backup cleanup tests (bead yln.4)
15478    // =========================================================================
15479
15480    #[test]
15481    fn cleanup_old_backups_keeps_recent() {
15482        let dir = TempDir::new().unwrap();
15483        let db_path = dir.path().join("test.db");
15484
15485        // Create 5 backup files with different timestamps
15486        for i in 0..5 {
15487            let backup_name = format!("test.db.backup.{}", 1000 + i);
15488            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15489        }
15490
15491        cleanup_old_backups(&db_path, 3).unwrap();
15492
15493        // Count remaining backup files
15494        let backups: Vec<_> = std::fs::read_dir(dir.path())
15495            .unwrap()
15496            .filter_map(|e| e.ok())
15497            .filter(|e| e.file_name().to_str().unwrap_or("").contains("backup"))
15498            .collect();
15499
15500        assert_eq!(backups.len(), 3);
15501    }
15502
15503    #[test]
15504    fn cleanup_old_backups_ignores_wal_and_shm_sidecars() {
15505        let dir = TempDir::new().unwrap();
15506        let db_path = dir.path().join("test.db");
15507
15508        for i in 0..3 {
15509            let backup_name = format!("test.db.backup.{}", 1000 + i);
15510            let backup_path = dir.path().join(&backup_name);
15511            std::fs::write(&backup_path, format!("backup {i}")).unwrap();
15512            std::fs::write(format!("{}-wal", backup_path.display()), b"wal").unwrap();
15513            std::fs::write(format!("{}-shm", backup_path.display()), b"shm").unwrap();
15514            std::thread::sleep(std::time::Duration::from_millis(20));
15515        }
15516
15517        cleanup_old_backups(&db_path, 2).unwrap();
15518
15519        let mut roots = Vec::new();
15520        let mut wals = Vec::new();
15521        let mut shms = Vec::new();
15522        for entry in std::fs::read_dir(dir.path())
15523            .unwrap()
15524            .filter_map(|e| e.ok())
15525        {
15526            let name = entry.file_name().to_string_lossy().into_owned();
15527            if name.ends_with("-wal") {
15528                wals.push(name);
15529            } else if name.ends_with("-shm") {
15530                shms.push(name);
15531            } else if name.contains("backup") {
15532                roots.push(name);
15533            }
15534        }
15535
15536        assert_eq!(roots.len(), 2, "should keep two backup roots");
15537        assert_eq!(
15538            wals.len(),
15539            2,
15540            "should keep WAL sidecars only for retained backups"
15541        );
15542        assert_eq!(
15543            shms.len(),
15544            2,
15545            "should keep SHM sidecars only for retained backups"
15546        );
15547    }
15548
15549    #[test]
15550    fn move_database_bundle_moves_database_and_sidecars() {
15551        let dir = TempDir::new().unwrap();
15552        let db_path = dir.path().join("test.db");
15553        let backup_path = dir.path().join("test.db.corrupt");
15554
15555        std::fs::write(&db_path, b"db").unwrap();
15556        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15557        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15558
15559        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15560        assert_eq!(
15561            moved,
15562            DatabaseBundleMoveResult {
15563                database: true,
15564                wal: true,
15565                shm: true
15566            }
15567        );
15568        assert!(moved.moved_any());
15569
15570        assert!(!db_path.exists());
15571        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15572        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15573
15574        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15575        assert_eq!(
15576            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15577            b"wal"
15578        );
15579        assert_eq!(
15580            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15581            b"shm"
15582        );
15583    }
15584
15585    #[test]
15586    fn move_database_bundle_preserves_orphan_sidecars_without_main_db() {
15587        let dir = TempDir::new().unwrap();
15588        let db_path = dir.path().join("test.db");
15589        let backup_path = dir.path().join("test.db.corrupt");
15590
15591        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15592        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15593
15594        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15595        assert_eq!(
15596            moved,
15597            DatabaseBundleMoveResult {
15598                database: false,
15599                wal: true,
15600                shm: true
15601            }
15602        );
15603        assert!(moved.moved_any());
15604        assert!(!db_path.exists());
15605        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15606        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15607        assert_eq!(
15608            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15609            b"wal"
15610        );
15611        assert_eq!(
15612            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15613            b"shm"
15614        );
15615    }
15616
15617    #[test]
15618    #[cfg(unix)]
15619    fn move_database_bundle_moves_dangling_symlink_database_root() {
15620        use std::os::unix::fs::symlink;
15621
15622        let dir = TempDir::new().unwrap();
15623        let db_path = dir.path().join("test.db");
15624        let backup_path = dir.path().join("test.db.corrupt");
15625        let missing_target = dir.path().join("missing-target.db");
15626
15627        symlink(&missing_target, &db_path).unwrap();
15628
15629        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15630
15631        assert_eq!(
15632            moved,
15633            DatabaseBundleMoveResult {
15634                database: true,
15635                wal: false,
15636                shm: false
15637            }
15638        );
15639        assert!(std::fs::symlink_metadata(&db_path).is_err());
15640        assert!(
15641            std::fs::symlink_metadata(&backup_path)
15642                .unwrap()
15643                .file_type()
15644                .is_symlink()
15645        );
15646        assert!(!missing_target.exists());
15647    }
15648
15649    #[test]
15650    #[cfg(unix)]
15651    fn move_database_bundle_moves_dangling_symlink_sidecars_without_main_db() {
15652        use std::os::unix::fs::symlink;
15653
15654        let dir = TempDir::new().unwrap();
15655        let db_path = dir.path().join("test.db");
15656        let backup_path = dir.path().join("test.db.corrupt");
15657        let missing_wal_target = dir.path().join("missing-wal");
15658        let missing_shm_target = dir.path().join("missing-shm");
15659        let wal_path = database_sidecar_path(&db_path, "-wal");
15660        let shm_path = database_sidecar_path(&db_path, "-shm");
15661
15662        symlink(&missing_wal_target, &wal_path).unwrap();
15663        symlink(&missing_shm_target, &shm_path).unwrap();
15664
15665        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15666
15667        assert_eq!(
15668            moved,
15669            DatabaseBundleMoveResult {
15670                database: false,
15671                wal: true,
15672                shm: true
15673            }
15674        );
15675        assert!(std::fs::symlink_metadata(&wal_path).is_err());
15676        assert!(std::fs::symlink_metadata(&shm_path).is_err());
15677        assert!(
15678            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-wal"))
15679                .unwrap()
15680                .file_type()
15681                .is_symlink()
15682        );
15683        assert!(
15684            std::fs::symlink_metadata(database_sidecar_path(&backup_path, "-shm"))
15685                .unwrap()
15686                .file_type()
15687                .is_symlink()
15688        );
15689        assert!(!missing_wal_target.exists());
15690        assert!(!missing_shm_target.exists());
15691    }
15692
15693    #[test]
15694    fn copy_database_bundle_copies_database_and_sidecars() {
15695        let dir = TempDir::new().unwrap();
15696        let db_path = dir.path().join("test.db");
15697        let copied_path = dir.path().join("copy.db");
15698
15699        std::fs::write(&db_path, b"db").unwrap();
15700        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15701        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15702
15703        copy_database_bundle(&db_path, &copied_path).unwrap();
15704
15705        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15706        assert_eq!(
15707            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15708            b"wal"
15709        );
15710        assert_eq!(
15711            std::fs::read(database_sidecar_path(&copied_path, "-shm")).unwrap(),
15712            b"shm"
15713        );
15714        assert_eq!(std::fs::read(&db_path).unwrap(), b"db");
15715    }
15716
15717    #[test]
15718    fn copy_database_bundle_creates_destination_parent() {
15719        let dir = TempDir::new().unwrap();
15720        let db_path = dir.path().join("test.db");
15721        let copied_path = dir.path().join("nested/copies/copy.db");
15722
15723        std::fs::write(&db_path, b"db").unwrap();
15724        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15725
15726        copy_database_bundle(&db_path, &copied_path).unwrap();
15727
15728        assert!(copied_path.parent().unwrap().is_dir());
15729        assert_eq!(std::fs::read(&copied_path).unwrap(), b"db");
15730        assert_eq!(
15731            std::fs::read(database_sidecar_path(&copied_path, "-wal")).unwrap(),
15732            b"wal"
15733        );
15734    }
15735
15736    #[test]
15737    #[cfg(unix)]
15738    fn copy_database_bundle_rejects_symlink_source_root() {
15739        use std::os::unix::fs::symlink;
15740
15741        let dir = TempDir::new().unwrap();
15742        let outside_db = dir.path().join("outside.db");
15743        let db_path = dir.path().join("test.db");
15744        let copied_path = dir.path().join("copy.db");
15745
15746        std::fs::write(&outside_db, b"outside").unwrap();
15747        symlink(&outside_db, &db_path).unwrap();
15748
15749        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15750
15751        assert!(
15752            err.to_string().contains("bundle symlink"),
15753            "unexpected error: {err:#}"
15754        );
15755        assert!(!copied_path.exists());
15756        assert_eq!(std::fs::read(&outside_db).unwrap(), b"outside");
15757    }
15758
15759    #[test]
15760    #[cfg(unix)]
15761    fn copy_database_bundle_rejects_symlink_sidecar() {
15762        use std::os::unix::fs::symlink;
15763
15764        let dir = TempDir::new().unwrap();
15765        let db_path = dir.path().join("test.db");
15766        let copied_path = dir.path().join("copy.db");
15767        let outside_wal = dir.path().join("outside.wal");
15768        let wal_path = database_sidecar_path(&db_path, "-wal");
15769
15770        std::fs::write(&db_path, b"db").unwrap();
15771        std::fs::write(&outside_wal, b"outside wal").unwrap();
15772        symlink(&outside_wal, &wal_path).unwrap();
15773
15774        let err = copy_database_bundle(&db_path, &copied_path).unwrap_err();
15775
15776        assert!(
15777            err.to_string().contains("bundle symlink"),
15778            "unexpected error: {err:#}"
15779        );
15780        assert_eq!(std::fs::read(&outside_wal).unwrap(), b"outside wal");
15781        assert!(!copied_path.exists());
15782        assert!(!database_sidecar_path(&copied_path, "-wal").exists());
15783    }
15784
15785    #[test]
15786    fn move_database_bundle_creates_destination_parent_and_moves_sidecars() {
15787        let dir = TempDir::new().unwrap();
15788        let db_path = dir.path().join("test.db");
15789        let backup_path = dir.path().join("nested/backups/test.db.corrupt");
15790
15791        std::fs::write(&db_path, b"db").unwrap();
15792        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15793        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15794
15795        let moved = move_database_bundle(&db_path, &backup_path).unwrap();
15796        assert_eq!(
15797            moved,
15798            DatabaseBundleMoveResult {
15799                database: true,
15800                wal: true,
15801                shm: true
15802            }
15803        );
15804        assert!(backup_path.parent().unwrap().is_dir());
15805        assert_eq!(std::fs::read(&backup_path).unwrap(), b"db");
15806        assert_eq!(
15807            std::fs::read(database_sidecar_path(&backup_path, "-wal")).unwrap(),
15808            b"wal"
15809        );
15810        assert_eq!(
15811            std::fs::read(database_sidecar_path(&backup_path, "-shm")).unwrap(),
15812            b"shm"
15813        );
15814    }
15815
15816    #[test]
15817    fn remove_database_files_removes_orphan_sidecars_without_main_db() {
15818        let dir = TempDir::new().unwrap();
15819        let db_path = dir.path().join("test.db");
15820
15821        std::fs::write(database_sidecar_path(&db_path, "-wal"), b"wal").unwrap();
15822        std::fs::write(database_sidecar_path(&db_path, "-shm"), b"shm").unwrap();
15823
15824        remove_database_files(&db_path).unwrap();
15825
15826        assert!(!db_path.exists());
15827        assert!(!database_sidecar_path(&db_path, "-wal").exists());
15828        assert!(!database_sidecar_path(&db_path, "-shm").exists());
15829    }
15830
15831    #[test]
15832    fn cleanup_old_backups_ignores_backup_named_directories() {
15833        let dir = TempDir::new().unwrap();
15834        let db_path = dir.path().join("test.db");
15835
15836        for i in 0..3 {
15837            let backup_name = format!("test.db.backup.{}", 1000 + i);
15838            std::fs::write(dir.path().join(&backup_name), format!("backup {i}")).unwrap();
15839        }
15840        std::fs::create_dir(dir.path().join("test.db.backup.directory")).unwrap();
15841
15842        cleanup_old_backups(&db_path, 2).unwrap();
15843
15844        let mut backup_files = Vec::new();
15845        let mut backup_dirs = Vec::new();
15846        for entry in std::fs::read_dir(dir.path())
15847            .unwrap()
15848            .filter_map(|e| e.ok())
15849        {
15850            let name = entry.file_name().to_string_lossy().into_owned();
15851            if !name.starts_with("test.db.backup.") {
15852                continue;
15853            }
15854            if entry.path().is_dir() {
15855                backup_dirs.push(name);
15856            } else {
15857                backup_files.push(name);
15858            }
15859        }
15860
15861        assert_eq!(
15862            backup_files.len(),
15863            2,
15864            "only real backup files count toward retention"
15865        );
15866        assert_eq!(
15867            backup_dirs.len(),
15868            1,
15869            "backup-named directories should be ignored"
15870        );
15871    }
15872
15873    // =========================================================================
15874    // Storage open/create tests (bead yln.4)
15875    // =========================================================================
15876
15877    #[test]
15878    fn open_creates_new_database() {
15879        let dir = TempDir::new().unwrap();
15880        let db_path = dir.path().join("new.db");
15881        assert!(!db_path.exists());
15882
15883        let storage = SqliteStorage::open(&db_path).unwrap();
15884        assert!(db_path.exists());
15885        storage.close().unwrap();
15886    }
15887
15888    #[test]
15889    fn open_readonly_fails_for_nonexistent() {
15890        let dir = TempDir::new().unwrap();
15891        let db_path = dir.path().join("nonexistent.db");
15892        let result = SqliteStorage::open_readonly(&db_path);
15893        assert!(result.is_err());
15894    }
15895
15896    #[test]
15897    fn open_readonly_succeeds_for_existing() {
15898        let dir = TempDir::new().unwrap();
15899        let db_path = dir.path().join("existing.db");
15900
15901        // Create first
15902        let _storage = SqliteStorage::open(&db_path).unwrap();
15903        drop(_storage);
15904
15905        // Now open readonly
15906        let storage = SqliteStorage::open_readonly(&db_path).unwrap();
15907        assert!(storage.schema_version().is_ok());
15908    }
15909
15910    #[test]
15911    fn reopen_existing_current_schema_is_idempotent() {
15912        let dir = TempDir::new().unwrap();
15913        let db_path = dir.path().join("existing.db");
15914
15915        // First open creates and migrates to current schema.
15916        {
15917            let storage = SqliteStorage::open(&db_path).unwrap();
15918            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15919        }
15920
15921        // Re-open should not fail on current schema.
15922        let reopened = SqliteStorage::open(&db_path).unwrap();
15923        assert_eq!(
15924            reopened.schema_version().unwrap(),
15925            CURRENT_SCHEMA_VERSION,
15926            "reopening current schema DB should be idempotent"
15927        );
15928    }
15929
15930    #[test]
15931    fn open_or_rebuild_current_schema_does_not_trigger_rebuild() {
15932        let dir = TempDir::new().unwrap();
15933        let db_path = dir.path().join("existing.db");
15934
15935        // Create DB at current schema.
15936        {
15937            let storage = SqliteStorage::open(&db_path).unwrap();
15938            assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15939        }
15940
15941        // Should open normally, not require rebuild.
15942        let reopened = SqliteStorage::open_or_rebuild(&db_path)
15943            .expect("current schema DB should open without rebuild");
15944        assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
15945    }
15946
/// Pointing `open_or_rebuild` at a directory must surface the underlying
/// open failure (`Database` or `Io`) and leave the directory untouched; it
/// must not be classified as a corrupt database that needs rebuilding.
#[test]
fn open_or_rebuild_does_not_treat_non_database_paths_as_corruption() {
    let tmp = TempDir::new().unwrap();
    let db_path = tmp.path().join("db_dir");
    std::fs::create_dir(&db_path).unwrap();

    // Every variant is listed explicitly so a newly added `MigrationError`
    // variant forces this test to be revisited.
    match SqliteStorage::open_or_rebuild(&db_path) {
        Ok(_) => panic!("directory path must not open as a database"),
        Err(MigrationError::RebuildRequired { reason, .. }) => {
            panic!("should not rebuild non-database path: {reason}")
        }
        Err(MigrationError::Other(msg)) => {
            panic!("should preserve underlying open error, got Other: {msg}")
        }
        // The only acceptable outcomes: the raw open error is passed through.
        Err(MigrationError::Database(_) | MigrationError::Io(_)) => {}
    }

    let still_a_directory = db_path.is_dir();
    assert!(
        still_a_directory,
        "non-database directory must be left in place"
    );
}
15971
15972    // =========================================================================
15973    // Schema version tests (bead yln.4)
15974    // =========================================================================
15975
/// A freshly created database must report exactly `CURRENT_SCHEMA_VERSION`.
///
/// The previous `>= 5` floor would keep passing if `schema_version()`
/// reported a stale (or too-new) version; sibling tests in this module
/// already rely on a fresh open landing on `CURRENT_SCHEMA_VERSION`, so this
/// test pins the same value.
#[test]
fn schema_version_returns_current() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    let version = storage.schema_version().unwrap();
    assert_eq!(
        version, CURRENT_SCHEMA_VERSION,
        "Schema version of a fresh DB should be the current schema version"
    );
}
15984
15985    // =========================================================================
15986    // Current analytics/schema smoke test (bead z9fse.11)
15987    // =========================================================================
15988
/// Smoke test for a freshly created database: the analytics tables, their
/// expected columns and indexes, and the conversation tail columns must all
/// exist; no `fts_messages` entry may be present in `sqlite_master`; and
/// `PRAGMA integrity_check` must report `ok`.
#[test]
fn migration_v13_creates_analytics_tables() {
    /// Runs `PRAGMA <pragma>(<table>)` and collects column 1 of every row
    /// (the name column for both `table_info` and `index_list`).
    fn pragma_names(conn: &FrankenConnection, pragma: &str, table: &str) -> Vec<String> {
        conn.query_map_collect(
            &format!("PRAGMA {pragma}({table})"),
            fparams![],
            |row: &FrankenRow| row.get_typed(1),
        )
        .unwrap()
    }

    /// Column names of `table`.
    fn col_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
        pragma_names(conn, "table_info", table)
    }

    /// Index names attached to `table`.
    fn idx_names(conn: &FrankenConnection, table: &str) -> Vec<String> {
        pragma_names(conn, "index_list", table)
    }

    /// Asserts that every column in `expected` exists on `table`.
    fn assert_columns(conn: &FrankenConnection, table: &str, expected: &[&str]) {
        let present = col_names(conn, table);
        for column in expected {
            assert!(
                present.iter().any(|name| name.as_str() == *column),
                "{table} missing column: {column}"
            );
        }
    }

    /// True when any index name in `names` contains `needle`.
    fn has_index(names: &[String], needle: &str) -> bool {
        names.iter().any(|name| name.contains(needle))
    }

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    // Schema version should be current.
    assert_eq!(
        storage.schema_version().unwrap(),
        CURRENT_SCHEMA_VERSION,
        "Schema version must match CURRENT_SCHEMA_VERSION after migration"
    );

    let conn = storage.raw();

    // --- Columns -----------------------------------------------------------
    assert_columns(
        conn,
        "message_metrics",
        &[
            "message_id",
            "hour_id",
            "day_id",
            "content_tokens_est",
            "model_name",
            "model_family",
            "model_tier",
            "provider",
            "api_input_tokens",
            "has_plan",
            "agent_slug",
            "role",
            "api_data_source",
        ],
    );
    assert_columns(
        conn,
        "usage_hourly",
        &[
            "hour_id",
            "plan_message_count",
            "plan_content_tokens_est_total",
            "plan_api_tokens_total",
            "api_coverage_message_count",
            "content_tokens_est_user",
            "api_thinking_tokens_total",
        ],
    );
    assert_columns(
        conn,
        "usage_daily",
        &[
            "day_id",
            "plan_content_tokens_est_total",
            "plan_api_tokens_total",
            "api_thinking_tokens_total",
            "content_tokens_est_assistant",
            "message_count",
        ],
    );
    assert_columns(
        conn,
        "usage_models_daily",
        &[
            "day_id",
            "model_family",
            "model_tier",
            "message_count",
            "api_tokens_total",
            "api_coverage_message_count",
        ],
    );

    // --- Indexes -----------------------------------------------------------
    let metrics_indexes = idx_names(conn, "message_metrics");
    assert!(
        has_index(&metrics_indexes, "idx_mm_hour"),
        "message_metrics must have hour index"
    );
    assert!(
        has_index(&metrics_indexes, "idx_mm_agent_day"),
        "message_metrics must have agent+day index"
    );
    assert!(
        has_index(&metrics_indexes, "idx_mm_model_family_day"),
        "message_metrics must have model_family+day index"
    );

    let hourly_indexes = idx_names(conn, "usage_hourly");
    assert!(
        has_index(&hourly_indexes, "idx_uh_agent"),
        "usage_hourly must have agent index"
    );

    let daily_indexes = idx_names(conn, "usage_daily");
    assert!(
        has_index(&daily_indexes, "idx_ud_agent"),
        "usage_daily must have agent index"
    );

    let model_indexes = idx_names(conn, "usage_models_daily");
    assert!(
        has_index(&model_indexes, "idx_umd_model_day"),
        "usage_models_daily must have model+day index"
    );

    // --- Conversation tail columns ------------------------------------------
    let conversation_cols = col_names(conn, "conversations");
    let tail_columns_present = ["last_message_idx", "last_message_created_at"]
        .iter()
        .all(|wanted| conversation_cols.iter().any(|name| name.as_str() == *wanted));
    assert!(
        tail_columns_present,
        "fresh schema must include V15 tail columns without ALTER TABLE on conversations"
    );

    // --- No fts_messages entry in sqlite_master -------------------------------
    let fts_schema_rows: i64 = conn
        .query_row_map(
            "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
            fparams![],
            |row: &FrankenRow| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        fts_schema_rows, 0,
        "fresh schema should not create and immediately drop derived fts_messages"
    );

    // --- Integrity -----------------------------------------------------------
    let integrity: Vec<String> = conn
        .query_map_collect("PRAGMA integrity_check;", fparams![], |row: &FrankenRow| {
            row.get_typed(0)
        })
        .unwrap();
    assert_eq!(
        integrity,
        vec!["ok".to_string()],
        "fresh schema must pass SQLite integrity_check"
    );
}
16162
/// The day and hour bucket ids derived from one timestamp must agree with
/// each other, and `millis_from_hour_id` must map an hour id back to a
/// timestamp inside that same hour.
#[test]
fn hour_id_round_trip() {
    const HOUR_MS: i64 = 3_600_000;

    // 1_770_508_800 s after the Unix epoch is 2026-02-08 00:00:00 UTC
    // (2026-01-01T00:00:00Z is 1_767_225_600 s; the difference is exactly
    // 38 days). The value sits on an hour (and day) boundary.
    let ts_ms = 1_770_508_800_000_i64;
    let hour_id = SqliteStorage::hour_id_from_millis(ts_ms);
    let day_id = SqliteStorage::day_id_from_millis(ts_ms);

    // There are 24 hour buckets per day bucket.
    assert_eq!(hour_id / 24, day_id, "hour_id/24 should equal day_id");

    // Round-trip: millis_from_hour_id should give start of that hour
    let back = SqliteStorage::millis_from_hour_id(hour_id);
    assert!(
        back <= ts_ms && ts_ms - back < HOUR_MS,
        "Round-trip should land within the same hour"
    );

    // The fixture above lies exactly on an hour boundary, where flooring is
    // a no-op. Probe a timestamp ~20 minutes into the same hour so the
    // flooring is actually exercised.
    let mid_hour_ms = ts_ms + 1_234_567;
    let mid_hour_id = SqliteStorage::hour_id_from_millis(mid_hour_ms);
    assert_eq!(
        mid_hour_id, hour_id,
        "timestamps within one hour should share an hour_id"
    );
    let mid_back = SqliteStorage::millis_from_hour_id(mid_hour_id);
    assert!(
        mid_back <= mid_hour_ms && mid_hour_ms - mid_back < HOUR_MS,
        "Round-trip should land within the same hour"
    );
}
16180
/// Pre-epoch timestamps must be floored, not truncated toward zero, when
/// they are converted into day and hour bucket ids.
#[test]
fn day_and_hour_ids_floor_negative_millis() {
    const SECS_PER_DAY: i64 = 86_400;
    const SECS_PER_HOUR: i64 = 3_600;
    // Seconds from the Unix epoch to 2020-01-01T00:00:00Z, the origin the
    // expected bucket ids below are measured from.
    const EPOCH_2020_SECS: i64 = 1_577_836_800;

    // -1 ms belongs to second -1 when floored (truncation would give 0).
    let ts_ms: i64 = -1;
    let floored_secs: i64 = -1;
    let secs_since_2020 = floored_secs - EPOCH_2020_SECS;

    let want_day = secs_since_2020.div_euclid(SECS_PER_DAY);
    let want_hour = secs_since_2020.div_euclid(SECS_PER_HOUR);

    assert_eq!(SqliteStorage::day_id_from_millis(ts_ms), want_day);
    assert_eq!(SqliteStorage::hour_id_from_millis(ts_ms), want_hour);
}
16198
/// Builds a database stamped as schema version 10 by hand, then checks that
/// `SqliteStorage::open` brings it to `CURRENT_SCHEMA_VERSION` and that the
/// four analytics tables exist afterwards.
#[test]
fn migration_v13_from_v10() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");

    // Fake a v10 database using a raw connection.
    let legacy = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    legacy.execute_batch("PRAGMA journal_mode=WAL;").unwrap();
    legacy
        .execute_batch("CREATE TABLE IF NOT EXISTS meta (key TEXT PRIMARY KEY, value TEXT);")
        .unwrap();
    legacy
        .execute("INSERT OR REPLACE INTO meta(key, value) VALUES('schema_version', '10')")
        .unwrap();

    // Apply the historical DDL so the schema is correct. Each batch stays in
    // autocommit mode: the fixture is testing cass migration transition
    // behavior, not frankensqlite's handling of a giant synthetic legacy-DDL
    // transaction.
    // NOTE(review): V3 is not applied here — presumably its FTS objects are
    // the ones materialized via rusqlite below; confirm.
    let historical_ddl = [
        MIGRATION_V1,
        MIGRATION_V2,
        MIGRATION_V4,
        MIGRATION_V5,
        MIGRATION_V6,
        MIGRATION_V7,
        MIGRATION_V8,
        MIGRATION_V9,
        MIGRATION_V10,
    ];
    for ddl in historical_ddl {
        legacy.execute_batch(ddl).unwrap();
    }

    // Re-stamp the version after the DDL batches have run.
    legacy
        .execute("UPDATE meta SET value = '10' WHERE key = 'schema_version'")
        .unwrap();
    // Release the raw handle before anything else touches the file.
    drop(legacy);

    materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();

    // Now open with SqliteStorage — should auto-migrate to current schema
    let storage = SqliteStorage::open(&db_path).unwrap();
    assert_eq!(
        storage.schema_version().unwrap(),
        CURRENT_SCHEMA_VERSION,
        "Should have migrated from v10 to the current schema"
    );

    // Verify new tables exist
    let analytics_tables: i64 = storage
        .raw()
        .query_row_map(
            "SELECT COUNT(*) FROM sqlite_master WHERE type='table' AND name IN ('message_metrics', 'usage_hourly', 'usage_daily', 'usage_models_daily')",
            &[],
            |row: &FrankenRow| row.get_typed::<i64>(0),
        )
        .unwrap();
    assert_eq!(analytics_tables, 4, "All 4 analytics tables should exist");
}
16251
16252    // =========================================================================
16253    // Analytics ingest integration test (bead z9fse.2)
16254    // =========================================================================
16255
/// End-to-end ingest check: inserting one conversation (user / assistant /
/// user) through `insert_conversations_batched` must populate
/// `message_metrics` and keep the `usage_hourly`, `usage_daily` and
/// `usage_models_daily` rollups consistent with it.
#[test]
fn analytics_ingest_populates_metrics_and_rollups() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    // Register agent + workspace
    let agent = Agent {
        id: None,
        slug: "claude_code".into(),
        name: "Claude Code".into(),
        version: Some("1.0".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();

    // Create a synthetic conversation with 3 messages at a known timestamp.
    // 1_770_551_400 s is 2026-02-08 11:50:00 UTC. The three messages span
    // 11:50:00..=11:51:00, so they share one hour bucket and one day bucket.
    // With the 2020-01-01 origin used by
    // `day_and_hour_ids_floor_negative_millis` that should be day_id 2230 and
    // hour_id 53531; the assertions below derive both ids from the helpers
    // rather than hard-coding them.
    let ts_ms = 1_770_551_400_000_i64;
    let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
    let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);

    // Include a JSON usage block on the assistant message (like Claude Code data)
    let usage_json = serde_json::json!({
        "message": {
            "model": "claude-opus-4-6",
            "usage": {
                "input_tokens": 100,
                "output_tokens": 50,
                "cache_read_input_tokens": 200,
                "cache_creation_input_tokens": 30,
                "service_tier": "standard"
            }
        }
    });

    let conv = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: None,
        external_id: Some("test-conv-1".into()),
        title: Some("Test conversation".into()),
        source_path: PathBuf::from("/tmp/test.jsonl"),
        started_at: Some(ts_ms),
        ended_at: Some(ts_ms + 60_000),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![
            Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: None,
                created_at: Some(ts_ms),
                content: "Hello, can you help me with a plan?".into(),
                extra_json: serde_json::Value::Null,
                snippets: vec![],
            },
            Message {
                id: None,
                idx: 1,
                role: MessageRole::Agent,
                author: None,
                created_at: Some(ts_ms + 30_000),
                content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
                extra_json: usage_json,
                snippets: vec![],
            },
            Message {
                id: None,
                idx: 2,
                role: MessageRole::User,
                author: None,
                created_at: Some(ts_ms + 60_000),
                content: "Great, let's proceed!".into(),
                extra_json: serde_json::Value::Null,
                snippets: vec![],
            },
        ],
        source_id: "local".into(),
        origin_host: None,
    };

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();
    assert_eq!(outcomes.len(), 1);
    assert_eq!(outcomes[0].inserted_indices.len(), 3);

    let conn = storage.raw();

    // Verify message_metrics rows
    let mm_count: i64 = conn
        .query_row_map("SELECT COUNT(*) FROM message_metrics", &[], |row| {
            row.get_typed::<i64>(0)
        })
        .unwrap();
    assert_eq!(mm_count, 3, "Should have 3 message_metrics rows");

    // Verify hour_id and day_id are correct.
    // Tuple layout: (hour_id, day_id, role, content_tokens_est, has_plan,
    // api_data_source, model_family, model_tier, provider).
    #[allow(clippy::type_complexity)]
    let rows: Vec<(i64, i64, String, i64, i64, String, String, String, String)> = conn
        .query_map_collect(
            "SELECT hour_id, day_id, role, content_tokens_est, has_plan, api_data_source, model_family, model_tier, provider FROM message_metrics ORDER BY message_id",
            fparams![],
            |row: &FrankenRow| {
                Ok((
                    row.get_typed(0)?,
                    row.get_typed(1)?,
                    row.get_typed(2)?,
                    row.get_typed(3)?,
                    row.get_typed(4)?,
                    row.get_typed(5)?,
                    row.get_typed(6)?,
                    row.get_typed(7)?,
                    row.get_typed(8)?,
                ))
            },
        )
        .unwrap();

    assert_eq!(rows.len(), 3);
    // All messages are in the same hour/day — check every row, not just the
    // first one.
    for (idx, row) in rows.iter().enumerate() {
        assert_eq!(
            row.0, expected_hour,
            "message {idx} should land in the expected hour bucket"
        );
        assert_eq!(
            row.1, expected_day,
            "message {idx} should land in the expected day bucket"
        );
    }
    // First message is user
    assert_eq!(rows[0].2, "user");
    // Second message (assistant) should have has_plan=1 (contains "## Plan" + numbered steps)
    assert_eq!(
        rows[1].4, 1,
        "Assistant message with plan should have has_plan=1"
    );
    // Second message should have api data source
    assert_eq!(
        rows[1].5, "api",
        "Claude Code assistant message should have api data source"
    );
    // First and third (user) messages should be estimated
    assert_eq!(rows[0].5, "estimated");
    assert_eq!(rows[2].5, "estimated");
    assert_eq!(rows[1].6, "claude");
    assert_eq!(rows[1].7, "opus");
    assert_eq!(rows[1].8, "anthropic");
    assert_eq!(rows[0].6, "unknown");
    // content_tokens_est = chars / 4 (the fixture text is ASCII, so the byte
    // length from `len()` equals the character count).
    let user_chars = "Hello, can you help me with a plan?".len() as i64;
    assert_eq!(rows[0].3, user_chars / 4);

    // Verify usage_hourly rollup
    let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api, uh_api_cov): (
        i64,
        i64,
        i64,
        i64,
        i64,
        i64,
        i64,
    ) = conn
        .query_row_map(
            "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
                    plan_content_tokens_est_total, plan_api_tokens_total, api_coverage_message_count
             FROM usage_hourly WHERE hour_id = ?",
            fparams![expected_hour],
            |row: &FrankenRow| {
                Ok((
                    row.get_typed(0)?,
                    row.get_typed(1)?,
                    row.get_typed(2)?,
                    row.get_typed(3)?,
                    row.get_typed(4)?,
                    row.get_typed(5)?,
                    row.get_typed(6)?,
                ))
            },
        )
        .unwrap();
    assert_eq!(uh_msg, 3, "Hourly rollup should have 3 messages");
    assert_eq!(uh_user, 2, "Hourly rollup should have 2 user messages");
    assert_eq!(uh_asst, 1, "Hourly rollup should have 1 assistant message");
    assert_eq!(uh_plan, 1, "Hourly rollup should have 1 plan message");
    assert!(
        uh_plan_content > 0,
        "Hourly rollup should include plan content tokens"
    );
    assert!(
        uh_plan_api > 0,
        "Hourly rollup should include plan API tokens"
    );
    assert_eq!(
        uh_api_cov, 1,
        "Hourly rollup should have 1 API-covered message"
    );

    // Verify usage_daily rollup matches hourly (same day)
    let (ud_msg, ud_api_cov): (i64, i64) = conn
        .query_row_map(
            "SELECT message_count, api_coverage_message_count FROM usage_daily WHERE day_id = ?",
            fparams![expected_day],
            |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert_eq!(ud_msg, 3, "Daily rollup should match hourly");
    assert_eq!(
        ud_api_cov, 1,
        "Daily api_coverage should be 1 (only assistant msg has real API data)"
    );

    // Verify the API input tokens from message_metrics (only API-sourced)
    let api_only_input: i64 = conn
        .query_row_map(
            "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE day_id = ? AND api_data_source = 'api'",
            fparams![expected_day],
            |row: &FrankenRow| row.get_typed::<i64>(0),
        )
        .unwrap();
    assert_eq!(
        api_only_input, 100,
        "Only API-sourced input tokens should be 100"
    );

    // Verify rollups match summed message_metrics
    let mm_total_content_est: i64 = conn
        .query_row_map(
            "SELECT SUM(content_tokens_est) FROM message_metrics WHERE day_id = ?",
            fparams![expected_day],
            |row| row.get_typed::<i64>(0),
        )
        .unwrap();
    let mm_plan_content_est: i64 = conn
        .query_row_map(
            "SELECT COALESCE(SUM(content_tokens_est), 0) FROM message_metrics WHERE day_id = ? AND has_plan = 1",
            fparams![expected_day],
            |row: &FrankenRow| row.get_typed::<i64>(0),
        )
        .unwrap();
    let mm_plan_api_total: i64 = conn
        .query_row_map(
            "SELECT COALESCE(SUM(COALESCE(api_input_tokens, 0) + COALESCE(api_output_tokens, 0) + COALESCE(api_cache_read_tokens, 0) + COALESCE(api_cache_creation_tokens, 0) + COALESCE(api_thinking_tokens, 0)), 0)
             FROM message_metrics WHERE day_id = ? AND has_plan = 1 AND api_data_source = 'api'",
            fparams![expected_day],
            |row: &FrankenRow| row.get_typed::<i64>(0),
        )
        .unwrap();
    let ud_content_est: i64 = conn
        .query_row_map(
            "SELECT content_tokens_est_total FROM usage_daily WHERE day_id = ?",
            fparams![expected_day],
            |row| row.get_typed::<i64>(0),
        )
        .unwrap();
    let (ud_plan_content_est, ud_plan_api_total): (i64, i64) = conn
        .query_row_map(
            "SELECT plan_content_tokens_est_total, plan_api_tokens_total FROM usage_daily WHERE day_id = ?",
            fparams![expected_day],
            |row: &FrankenRow| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    assert_eq!(
        mm_total_content_est, ud_content_est,
        "Daily rollup content_tokens_est_total must equal SUM of message_metrics"
    );
    assert_eq!(
        mm_plan_content_est, ud_plan_content_est,
        "Daily rollup plan_content_tokens_est_total must equal planned message_metrics content sum"
    );
    assert_eq!(
        mm_plan_api_total, ud_plan_api_total,
        "Daily rollup plan_api_tokens_total must equal planned message_metrics API token sum"
    );

    // Verify model rollup rows
    let (claude_msg, claude_user, claude_asst, claude_api_total, claude_api_cov): (
        i64,
        i64,
        i64,
        i64,
        i64,
    ) = conn
        .query_row_map(
            "SELECT message_count, user_message_count, assistant_message_count, api_tokens_total, api_coverage_message_count
             FROM usage_models_daily
             WHERE day_id = ? AND model_family = 'claude' AND model_tier = 'opus'",
            fparams![expected_day],
            |row: &FrankenRow| {
                Ok((
                    row.get_typed(0)?,
                    row.get_typed(1)?,
                    row.get_typed(2)?,
                    row.get_typed(3)?,
                    row.get_typed(4)?,
                ))
            },
        )
        .unwrap();
    assert_eq!(claude_msg, 1);
    assert_eq!(claude_user, 0);
    assert_eq!(claude_asst, 1);
    // 380 = 100 input + 50 output + 200 cache-read + 30 cache-creation tokens
    // from the usage block attached to the assistant message.
    assert_eq!(claude_api_total, 380);
    assert_eq!(claude_api_cov, 1);

    let unknown_msg: i64 = conn
        .query_row_map(
            "SELECT message_count FROM usage_models_daily
             WHERE day_id = ? AND model_family = 'unknown' AND model_tier = 'unknown'",
            fparams![expected_day],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        unknown_msg, 2,
        "user messages should land in unknown model bucket"
    );
}
16564
/// Spot checks for `has_plan_heuristic`: headed or labelled step lists count
/// as plans; plain prose and numbered text inside a fenced tool payload do not.
#[test]
fn has_plan_heuristic_detects_plans() {
    let plan_like = [
        "## Plan\n\n1. First step\n2. Second step",
        "# Plan\nHere is what we will do:\n1. Step one\n2. Step two",
        "Plan:\n- Gather baseline\n- Implement changes\n- Validate with tests",
        "Next steps:\n1. Update schema\n2. Rebuild rollups",
    ];
    let not_plans = [
        "Hello world",
        "Short",
        "This is a regular message without plans",
        "```json\n{\"tool\":\"shell\",\"stdout\":\"1. install\\n2. run\"}\n```",
    ];

    for text in plan_like {
        assert!(has_plan_heuristic(text), "expected a plan: {text:?}");
    }
    for text in not_plans {
        assert!(!has_plan_heuristic(text), "expected no plan: {text:?}");
    }
}
16588
/// `has_plan_for_role` must only report a plan for assistant-side roles
/// ("assistant", "agent", and a capitalised "Assistant"); the same text from
/// a "user" or "tool" role is not a plan.
#[test]
fn has_plan_for_role_only_counts_assistant_messages() {
    let plan_text = "## Plan\n1. First\n2. Second";

    let expectations = [
        ("assistant", true),
        ("agent", true),
        ("Assistant", true),
        ("user", false),
        ("tool", false),
    ];
    for (role, should_count) in expectations {
        assert_eq!(
            has_plan_for_role(role, plan_text),
            should_count,
            "unexpected has_plan_for_role result for role {role:?}"
        );
    }
}
16598
/// Records two plan-bearing assistant messages in the same buckets — one
/// whose token counts are tagged "estimated" and one tagged "api" — and
/// checks that content rollups count both while every API token rollup only
/// counts the "api" row.
#[test]
fn api_rollups_require_api_data_source() {
    // Both fixtures share everything except id, content size, API token
    // counts and the data-source tag.
    let plan_entry = |message_id,
                      content_chars,
                      content_tokens_est,
                      api_input,
                      api_output,
                      data_source: &str| MessageMetricsEntry {
        message_id,
        created_at_ms: 0,
        hour_id: 1,
        day_id: 1,
        agent_slug: "codex".into(),
        workspace_id: 0,
        source_id: "local".into(),
        role: "assistant".into(),
        content_chars,
        content_tokens_est,
        model_name: None,
        model_family: "unknown".into(),
        model_tier: "unknown".into(),
        provider: "unknown".into(),
        api_input_tokens: Some(api_input),
        api_output_tokens: Some(api_output),
        api_cache_read_tokens: Some(0),
        api_cache_creation_tokens: Some(0),
        api_thinking_tokens: Some(0),
        api_service_tier: None,
        api_data_source: data_source.into(),
        tool_call_count: 0,
        has_tool_calls: false,
        has_plan: true,
    };

    let estimated_plan = plan_entry(1, 120, 30, 100, 50, "estimated");
    let api_plan = plan_entry(2, 80, 20, 40, 10, "api");

    let mut agg = AnalyticsRollupAggregator::new();
    agg.record(&estimated_plan);
    agg.record(&api_plan);

    // The hourly and daily maps are looked up with the same
    // (bucket id, agent, workspace, source) tuple because hour_id == day_id == 1.
    let bucket_key = (1_i64, "codex".to_string(), 0_i64, "local".to_string());
    let model_key = (
        1_i64,
        "codex".to_string(),
        0_i64,
        "local".to_string(),
        "unknown".to_string(),
        "unknown".to_string(),
    );
    let hourly = agg
        .hourly
        .get(&bucket_key)
        .expect("hourly rollup key must exist");
    let daily = agg
        .daily
        .get(&bucket_key)
        .expect("daily rollup key must exist");
    let models_daily = agg
        .models_daily
        .get(&model_key)
        .expect("model rollup key must exist");

    // Content rollup includes both plan messages (30 + 20 estimated tokens).
    assert_eq!(hourly.plan_message_count, 2);
    assert_eq!(hourly.plan_content_tokens_est_total, 50);

    // API plan tokens must include only api_data_source='api' rows
    // (40 input + 10 output from the second entry).
    assert_eq!(hourly.plan_api_tokens_total, 50);
    assert_eq!(daily.plan_api_tokens_total, 50);
    assert_eq!(models_daily.plan_api_tokens_total, 50);

    // Overall API totals must also exclude estimated rows.
    assert_eq!(hourly.api_tokens_total, 50);
    assert_eq!(hourly.api_input_tokens_total, 40);
    assert_eq!(hourly.api_output_tokens_total, 10);
    assert_eq!(hourly.api_coverage_message_count, 1);
    assert_eq!(daily.api_tokens_total, 50);
    assert_eq!(models_daily.api_tokens_total, 50);
}
16690
/// Runs `has_plan_heuristic` over a small curated corpus and requires at
/// least 80% recall on the plan-like messages and at most a 20%
/// false-positive rate on the look-alikes.
#[test]
fn has_plan_heuristic_curated_corpus_thresholds() {
    /// Fraction of `corpus` that the heuristic flags as a plan.
    fn flagged_fraction(corpus: &[&str]) -> f64 {
        let flagged = corpus
            .iter()
            .filter(|text| has_plan_heuristic(text))
            .count();
        flagged as f64 / corpus.len() as f64
    }

    // Cross-agent-style positives.
    let positives = [
        "## Plan\n1. Inspect current schema\n2. Add migration\n3. Verify rebuild",
        "Plan:\n1) Reproduce\n2) Patch\n3) Add tests",
        "Implementation plan:\n- Parse inputs\n- Update rollups\n- Run checks",
        "Next steps:\n1. Reserve file\n2. Implement\n3. Report status",
        "# Plan\n1. Gather requirements\n2. Ship changes",
        "Action plan:\n- Identify root cause\n- Fix it\n- Validate",
    ];

    // Typical false positives we want to avoid.
    let negatives = [
        "The plan is to move fast and fix things later.",
        "```json\n{\"tool\":\"shell\",\"stdout\":\"1. ls\\n2. cat\"}\n```",
        "stdout:\n1. Build started\n2. Build finished\nexit code: 0",
        "I can help with that request. Let me know if you want details.",
        "Here is a list:\n- apples\n- oranges",
        "Status update: completed tasks and blockers below.",
    ];

    let recall = flagged_fraction(&positives);
    let false_positive_rate = flagged_fraction(&negatives);

    assert!(
        recall >= 0.80,
        "plan heuristic recall too low: got {recall:.2}"
    );
    assert!(
        false_positive_rate <= 0.20,
        "plan heuristic false-positive rate too high: got {false_positive_rate:.2}"
    );
}
16734
/// Inserts one three-message conversation, wipes the analytics tables, and checks that
/// `rebuild_analytics` restores `message_metrics` and the hourly/daily/model rollups to
/// the row counts and API token totals they had before the wipe.
#[test]
fn rebuild_analytics_repopulates_from_messages() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    const API_INPUT_SQL: &str = "SELECT COALESCE(SUM(api_input_tokens), 0) FROM message_metrics WHERE api_data_source = 'api'";

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test.db");
    let storage = SqliteStorage::open(&db_path).unwrap();

    // Register agent
    let agent = Agent {
        id: None,
        slug: "claude_code".into(),
        name: "Claude Code".into(),
        version: Some("1.0".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&agent).unwrap();

    // 2026-02-08 11:50:00 UTC. The three messages below span the following 60 seconds;
    // the rollup assertions at the end expect all of them under this hour and day.
    let ts_ms = 1_770_551_400_000_i64;
    let expected_day = SqliteStorage::day_id_from_millis(ts_ms);
    let expected_hour = SqliteStorage::hour_id_from_millis(ts_ms);

    // Usage payload attached to the agent message only.
    let usage_json = serde_json::json!({
        "message": {
            "model": "claude-opus-4-6",
            "usage": {
                "input_tokens": 100,
                "output_tokens": 50,
                "cache_read_input_tokens": 200,
                "cache_creation_input_tokens": 30,
                "service_tier": "standard"
            }
        }
    });

    let conv = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: None,
        external_id: Some("test-rebuild-1".into()),
        title: Some("Test conversation".into()),
        source_path: PathBuf::from("/tmp/test.jsonl"),
        started_at: Some(ts_ms),
        ended_at: Some(ts_ms + 60_000),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![
            Message {
                id: None,
                idx: 0,
                role: MessageRole::User,
                author: None,
                created_at: Some(ts_ms),
                content: "Hello, can you help me with a plan?".into(),
                extra_json: serde_json::Value::Null,
                snippets: vec![],
            },
            Message {
                id: None,
                idx: 1,
                role: MessageRole::Agent,
                author: None,
                created_at: Some(ts_ms + 30_000),
                content: "## Plan\n\n1. First step\n2. Second step\n3. Third step".into(),
                extra_json: usage_json,
                snippets: vec![],
            },
            Message {
                id: None,
                idx: 2,
                role: MessageRole::User,
                author: None,
                created_at: Some(ts_ms + 60_000),
                content: "Great, let's proceed!".into(),
                extra_json: serde_json::Value::Null,
                snippets: vec![],
            },
        ],
        source_id: "local".into(),
        origin_host: None,
    };

    storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();

    let conn = storage.raw();
    // Runs a parameterless query that yields a single integer column.
    let scalar = |sql: &str| -> i64 {
        conn.query_row_map(sql, &[], |row: &FrankenRow| row.get_typed(0))
            .unwrap()
    };

    // Save original analytics state
    let orig_mm = scalar("SELECT COUNT(*) FROM message_metrics");
    let orig_hourly = scalar("SELECT COUNT(*) FROM usage_hourly");
    let orig_daily = scalar("SELECT COUNT(*) FROM usage_daily");
    let orig_models_daily = scalar("SELECT COUNT(*) FROM usage_models_daily");
    let orig_api_input = scalar(API_INPUT_SQL);

    assert_eq!(orig_mm, 3);
    assert!(orig_hourly > 0);
    assert!(orig_daily > 0);
    assert!(orig_models_daily > 0);

    // Destroy analytics tables (simulate corruption)
    conn.execute("DELETE FROM message_metrics").unwrap();
    conn.execute("DELETE FROM usage_hourly").unwrap();
    conn.execute("DELETE FROM usage_daily").unwrap();
    conn.execute("DELETE FROM usage_models_daily").unwrap();

    // Verify they're empty
    assert_eq!(scalar("SELECT COUNT(*) FROM message_metrics"), 0);

    // Rebuild analytics
    let result = storage.rebuild_analytics().unwrap();

    assert_eq!(result.message_metrics_rows, 3);
    assert!(result.usage_hourly_rows > 0);
    assert!(result.usage_daily_rows > 0);
    assert!(result.usage_models_daily_rows > 0);
    assert!(
        result.elapsed_ms < 10_000,
        "Rebuild should be fast for 3 msgs"
    );

    // Verify rebuilt row counts match the snapshot taken before the wipe
    assert_eq!(
        scalar("SELECT COUNT(*) FROM message_metrics"),
        orig_mm,
        "Rebuilt message_metrics count should match"
    );
    assert_eq!(
        scalar("SELECT COUNT(*) FROM usage_hourly"),
        orig_hourly,
        "Rebuilt hourly rows should match"
    );
    assert_eq!(
        scalar("SELECT COUNT(*) FROM usage_daily"),
        orig_daily,
        "Rebuilt daily rows should match"
    );
    assert_eq!(
        scalar("SELECT COUNT(*) FROM usage_models_daily"),
        orig_models_daily,
        "Rebuilt model rollup rows should match"
    );

    // Verify API token data preserved through rebuild
    assert_eq!(
        scalar(API_INPUT_SQL),
        orig_api_input,
        "Rebuilt API input tokens should match original"
    );

    // Verify rollups have correct data
    let (uh_msg, uh_user, uh_asst, uh_plan, uh_plan_content, uh_plan_api): (
        i64,
        i64,
        i64,
        i64,
        i64,
        i64,
    ) = conn
        .query_row_map(
            "SELECT message_count, user_message_count, assistant_message_count, plan_message_count,
                        plan_content_tokens_est_total, plan_api_tokens_total
                 FROM usage_hourly WHERE hour_id = ?",
            fparams![expected_hour],
            |row: &FrankenRow| {
                Ok((
                    row.get_typed(0)?,
                    row.get_typed(1)?,
                    row.get_typed(2)?,
                    row.get_typed(3)?,
                    row.get_typed(4)?,
                    row.get_typed(5)?,
                ))
            },
        )
        .unwrap();
    assert_eq!(uh_msg, 3);
    assert_eq!(uh_user, 2);
    assert_eq!(uh_asst, 1);
    assert_eq!(uh_plan, 1);
    assert!(uh_plan_content > 0);
    assert!(uh_plan_api > 0);

    let ud_msg: i64 = conn
        .query_row_map(
            "SELECT message_count FROM usage_daily WHERE day_id = ?",
            fparams![expected_day],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(ud_msg, 3);
}
16978
/// Inserts two messages that are each longer than half of `FTS_ENTRY_BATCH_MAX_CHARS`
/// and checks that both end up in `messages` and in `fts_messages`.
#[test]
fn insert_conversations_batched_flushes_large_fts_batches() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use std::path::PathBuf;

    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("test.db")).unwrap();
    // The V14 migration drops fts_messages, and cass normally recreates it at startup
    // through `ensure_search_fallback_fts_consistency`. A test that reads fts_messages
    // directly has to run that repair pass itself before exercising the
    // "insert flushes FTS" contract.
    storage
        .ensure_search_fallback_fts_consistency()
        .expect("ensure FTS consistency before insert");

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: Some("0.2.3".into()),
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();

    // Each body is just over half the batch limit, so two of them together exceed it.
    let padding = "y".repeat((FTS_ENTRY_BATCH_MAX_CHARS / 2) + 1);
    let mut messages = Vec::with_capacity(2);
    for i in 0_i64..2 {
        messages.push(Message {
            id: None,
            idx: i,
            role: MessageRole::Agent,
            author: None,
            created_at: Some(1_700_000_000_000 + i),
            content: format!("{i}-{padding}"),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        });
    }
    let conv = Conversation {
        id: None,
        agent_slug: "codex".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("fts-large-batch".into()),
        title: Some("FTS Large Batch".into()),
        source_path: PathBuf::from("/tmp/rollout.jsonl"),
        started_at: Some(1_700_000_000_000),
        ended_at: Some(1_700_000_000_999),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages,
        source_id: "local".into(),
        origin_host: None,
    };
    let expected_rows = conv.messages.len();

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();
    assert_eq!(outcomes.len(), 1);
    assert_eq!(outcomes[0].inserted_indices.len(), expected_rows);

    // Runs a parameterless query that yields a single integer column.
    let count_rows = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let message_count = count_rows("SELECT COUNT(*) FROM messages");
    let fts_count = count_rows("SELECT COUNT(*) FROM fts_messages");

    assert_eq!(message_count, expected_rows as i64);
    assert_eq!(fts_count, expected_rows as i64);
}
17055
17056    fn make_profiled_storage_remote_conversation(
17057        external_id: i64,
17058        msg_count: usize,
17059    ) -> Conversation {
17060        Conversation {
17061            id: None,
17062            agent_slug: "codex".into(),
17063            workspace: Some(PathBuf::from("/ws/profiled-storage-remote")),
17064            external_id: Some(format!("profiled-storage-remote-{external_id}")),
17065            title: Some(format!(
17066                "Profiled storage remote conversation {external_id}"
17067            )),
17068            source_path: PathBuf::from(format!("/log/profiled-storage-remote-{external_id}.jsonl")),
17069            started_at: Some(10_000 + external_id * 100),
17070            ended_at: Some(10_000 + external_id * 100 + msg_count as i64),
17071            approx_tokens: Some(msg_count as i64 * 32),
17072            metadata_json: serde_json::json!({ "bench": true }),
17073            messages: (0..msg_count)
17074                .map(|idx| Message {
17075                    id: None,
17076                    idx: idx as i64,
17077                    role: if idx % 2 == 0 {
17078                        MessageRole::User
17079                    } else {
17080                        MessageRole::Agent
17081                    },
17082                    author: Some("tester".into()),
17083                    created_at: Some(20_000 + external_id * 100 + idx as i64),
17084                    content: format!(
17085                        "profiled storage remote content ext={external_id} idx={idx} {}",
17086                        "x".repeat(64)
17087                    ),
17088                    extra_json: serde_json::json!({ "idx": idx }),
17089                    snippets: Vec::new(),
17090                })
17091                .collect(),
17092            source_id: "profiled-storage-remote-source".into(),
17093            origin_host: Some("builder-profile".into()),
17094        }
17095    }
17096
17097    fn make_profiled_append_remote_merge_conversation(
17098        external_id: i64,
17099        msg_count: usize,
17100    ) -> Conversation {
17101        let base_ts = 100_000 + external_id * 1_000;
17102        Conversation {
17103            id: None,
17104            agent_slug: "codex".into(),
17105            workspace: Some(PathBuf::from("/ws/profiled-append-remote")),
17106            external_id: Some(format!("profiled-append-remote-{external_id}")),
17107            title: Some(format!("Profiled append remote conversation {external_id}")),
17108            source_path: PathBuf::from(format!("/log/profiled-append-remote-{external_id}.jsonl")),
17109            started_at: Some(base_ts),
17110            ended_at: Some(base_ts + msg_count as i64),
17111            approx_tokens: Some(msg_count as i64 * 50),
17112            metadata_json: serde_json::json!({ "bench": true }),
17113            messages: (0..msg_count)
17114                .map(|idx| Message {
17115                    id: None,
17116                    idx: idx as i64,
17117                    role: if idx % 2 == 0 {
17118                        MessageRole::User
17119                    } else {
17120                        MessageRole::Agent
17121                    },
17122                    author: Some(format!("model-{}", external_id % 5)),
17123                    created_at: Some(base_ts + idx as i64),
17124                    content: format!(
17125                        "Profiled append remote conversation {} message {}: Lorem ipsum dolor sit amet, consectetur adipiscing elit.",
17126                        external_id, idx
17127                    ),
17128                    extra_json: serde_json::json!({ "bench": true }),
17129                    snippets: Vec::new(),
17130                })
17131                .collect(),
17132            source_id: "profiled-append-remote-source".into(),
17133            origin_host: Some("builder-profile".into()),
17134        }
17135    }
17136
/// Inserts a new five-message conversation with one snippet per message and checks that
/// every snippet row joins back to a message of that conversation.
#[test]
fn insert_conversation_tree_batched_new_message_ids_match_snippet_rows() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("batched-message-ids.db")).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace = PathBuf::from("/ws/profiled-storage-remote");
    let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();

    // Give every message exactly one snippet whose path and line range encode its index.
    let mut conv = make_profiled_storage_remote_conversation(42, 5);
    conv.messages
        .iter_mut()
        .enumerate()
        .for_each(|(idx, msg)| {
            msg.snippets.push(Snippet {
                id: None,
                file_path: Some(PathBuf::from(format!("src/file_{idx}.rs"))),
                start_line: Some((idx + 1) as i64),
                end_line: Some((idx + 2) as i64),
                language: Some("rust".into()),
                snippet_text: Some(format!("fn snippet_{idx}() {{}}")),
            });
        });
    let expected_rows = conv.messages.len() as i64;

    let outcome = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
        .unwrap();

    // Runs a single-integer query bound to the inserted conversation's id.
    let count_for_conversation = |sql: &str| -> i64 {
        storage
            .conn
            .query_row_map(sql, fparams![outcome.conversation_id], |row| {
                row.get_typed(0)
            })
            .unwrap()
    };
    let message_count =
        count_for_conversation("SELECT COUNT(*) FROM messages WHERE conversation_id = ?1");
    let joined_snippet_count = count_for_conversation(
        "SELECT COUNT(*)
                 FROM snippets s
                 JOIN messages m ON s.message_id = m.id
                 WHERE m.conversation_id = ?1",
    );

    assert_eq!(message_count, expected_rows);
    assert_eq!(joined_snippet_count, expected_rows);
}
17192
/// Inserts a two-message conversation, then re-inserts it grown to five messages, and
/// checks that only indices 2..=4 are appended and that each message joins to exactly
/// the snippet from the insert that created it.
#[test]
fn insert_conversation_tree_batched_appended_message_ids_match_snippet_rows() {
    let tmp = TempDir::new().unwrap();
    let storage =
        SqliteStorage::open(&tmp.path().join("batched-append-message-ids.db")).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace = PathBuf::from("/ws/profiled-storage-remote");
    let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();

    // First pass: two messages, each tagged with an "append_initial" snippet.
    let mut initial = make_profiled_storage_remote_conversation(77, 2);
    initial
        .messages
        .iter_mut()
        .enumerate()
        .for_each(|(idx, msg)| {
            msg.snippets.push(Snippet {
                id: None,
                file_path: Some(PathBuf::from(format!("src/append_initial_{idx}.rs"))),
                start_line: Some((idx + 1) as i64),
                end_line: Some((idx + 2) as i64),
                language: Some("rust".into()),
                snippet_text: Some(format!("fn append_initial_{idx}() {{}}")),
            });
        });
    let first = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
        .unwrap();
    assert_eq!(first.inserted_indices, vec![0, 1]);

    // Second pass: same external id, five messages, each tagged with an "append_full"
    // snippet.
    let mut appended = make_profiled_storage_remote_conversation(77, 5);
    appended
        .messages
        .iter_mut()
        .enumerate()
        .for_each(|(idx, msg)| {
            msg.snippets.push(Snippet {
                id: None,
                file_path: Some(PathBuf::from(format!("src/append_full_{idx}.rs"))),
                start_line: Some((idx + 10) as i64),
                end_line: Some((idx + 11) as i64),
                language: Some("rust".into()),
                snippet_text: Some(format!("fn append_full_{idx}() {{}}")),
            });
        });
    let second = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
        .unwrap();
    assert_eq!(second.conversation_id, first.conversation_id);
    assert_eq!(second.inserted_indices, vec![2, 3, 4]);

    let conversation_id = first.conversation_id;
    let message_count: i64 = storage
        .conn
        .query_row_map(
            "SELECT COUNT(*) FROM messages WHERE conversation_id = ?1",
            fparams![conversation_id],
            |row| row.get_typed(0),
        )
        .unwrap();
    let joined_snippets: Vec<(i64, String)> = storage
        .conn
        .query_map_collect(
            "SELECT m.idx, s.file_path
                 FROM snippets s
                 JOIN messages m ON s.message_id = m.id
                 WHERE m.conversation_id = ?1
                 ORDER BY m.idx, s.id",
            fparams![conversation_id],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();

    // Messages 0 and 1 keep their first-pass snippets; 2..=4 carry second-pass ones.
    let expected_snippets: Vec<(i64, String)> = vec![
        (0, "src/append_initial_0.rs".to_string()),
        (1, "src/append_initial_1.rs".to_string()),
        (2, "src/append_full_2.rs".to_string()),
        (3, "src/append_full_3.rs".to_string()),
        (4, "src/append_full_4.rs".to_string()),
    ];

    assert_eq!(message_count, 5);
    assert_eq!(joined_snippets, expected_snippets);
}
17277
/// Deletes a conversation's row from `conversation_external_tail_lookup`, re-inserts the
/// conversation with two extra messages, and checks that the insert reuses the existing
/// conversation and writes the lookup row back in agreement with
/// `conversation_tail_state`.
#[test]
fn insert_conversation_tree_rehydrates_external_lookup_after_manual_clear() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("external-lookup-rehydrate.db")).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace = PathBuf::from("/ws/profiled-storage-remote");
    let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();

    // Initial insert must create the lookup row pointing at the new conversation.
    let initial = make_profiled_storage_remote_conversation(88, 2);
    let first = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
        .unwrap();
    let external_id = initial.external_id.as_deref().unwrap();
    let lookup_key = conversation_external_lookup_key(&initial.source_id, agent_id, external_id);
    let key = lookup_key.as_str();

    let lookup_id: i64 = storage
        .conn
        .query_row_map(
            "SELECT conversation_id
                 FROM conversation_external_tail_lookup
                 WHERE lookup_key = ?1",
            fparams![key],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(lookup_id, first.conversation_id);

    // Remove the lookup row by hand.
    storage
        .conn
        .execute_compat(
            "DELETE FROM conversation_external_tail_lookup WHERE lookup_key = ?1",
            fparams![key],
        )
        .unwrap();

    // Re-insert the same external id with four messages; only the two new ones append.
    let appended = make_profiled_storage_remote_conversation(88, 4);
    let second = storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
        .unwrap();
    assert_eq!(second.conversation_id, first.conversation_id);
    assert_eq!(second.inserted_indices, vec![2, 3]);

    let conversation_count: i64 = storage
        .conn
        .query_row_map(
            "SELECT COUNT(*)
                 FROM conversations
                 WHERE source_id = ?1 AND agent_id = ?2 AND external_id = ?3",
            fparams![initial.source_id.as_str(), agent_id, external_id],
            |row| row.get_typed(0),
        )
        .unwrap();
    let restored_lookup: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
        .conn
        .query_row_map(
            "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
                 FROM conversation_external_tail_lookup
                 WHERE lookup_key = ?1",
            fparams![key],
            |row| {
                let conversation_id = row.get_typed(0)?;
                let ended_at = row.get_typed(1)?;
                let last_message_idx = row.get_typed(2)?;
                let last_message_created_at = row.get_typed(3)?;
                Ok((
                    conversation_id,
                    ended_at,
                    last_message_idx,
                    last_message_created_at,
                ))
            },
        )
        .unwrap();
    let tail_state: (Option<i64>, Option<i64>, Option<i64>) = storage
        .conn
        .query_row_map(
            "SELECT ended_at, last_message_idx, last_message_created_at
                 FROM conversation_tail_state
                 WHERE conversation_id = ?1",
            fparams![first.conversation_id],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
        )
        .unwrap();

    // Still a single conversation row for this (source, agent, external id) triple.
    assert_eq!(conversation_count, 1);

    // The restored lookup row mirrors the tail-state row.
    let (tail_ended_at, tail_last_idx, tail_last_created_at) = tail_state;
    assert_eq!(
        restored_lookup,
        (
            first.conversation_id,
            tail_ended_at,
            tail_last_idx,
            tail_last_created_at
        )
    );

    // The tail state itself points at the last appended message (index 3).
    let last_created_at = appended.messages[3].created_at;
    assert_eq!(tail_state, (last_created_at, Some(3), last_created_at));
}
17386
/// Empties `daily_stats` after a first insert and checks that a second insert recreates
/// the rows, counting only the second conversation (1 session, 2 messages) in the
/// `all`/`all` aggregate row.
#[test]
fn insert_conversation_tree_recreates_daily_stats_after_manual_clear() {
    let tmp = TempDir::new().unwrap();
    let storage = SqliteStorage::open(&tmp.path().join("test.db")).unwrap();

    let codex = Agent {
        id: None,
        slug: "codex".into(),
        name: "Codex".into(),
        version: None,
        kind: AgentKind::Cli,
    };
    let agent_id = storage.ensure_agent(&codex).unwrap();
    let workspace = PathBuf::from("/ws/profiled-storage-remote");
    let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();

    // First conversation (3 messages); its stats are wiped immediately afterwards.
    let wiped = make_profiled_storage_remote_conversation(0, 3);
    storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &wiped)
        .unwrap();
    storage.conn.execute("DELETE FROM daily_stats").unwrap();

    // Second conversation (2 messages) is inserted against the emptied stats table.
    let survivor = make_profiled_storage_remote_conversation(1, 2);
    storage
        .insert_conversation_tree(agent_id, Some(workspace_id), &survivor)
        .unwrap();

    let row_count: i64 = storage
        .conn
        .query_row_map("SELECT COUNT(*) FROM daily_stats", fparams![], |row| {
            row.get_typed(0)
        })
        .unwrap();
    let totals: (i64, i64) = storage
        .conn
        .query_row_map(
            "SELECT session_count, message_count
                 FROM daily_stats
                 WHERE agent_slug = 'all' AND source_id = 'all'",
            fparams![],
            |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
        )
        .unwrap();
    let (session_count, message_count) = totals;

    assert_eq!(row_count, 4);
    assert_eq!(session_count, 1);
    assert_eq!(message_count, 2);
}
17442
17443    #[test]
17444    #[serial]
17445    fn insert_conversation_tree_stage_profile_tracks_steady_state_remote_reuse() {
17446        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17447
17448        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17449            let dir = TempDir::new().unwrap();
17450            let db_path = dir.path().join(format!("profile-{msg_count}.db"));
17451            let storage = SqliteStorage::open(&db_path).unwrap();
17452            let agent_id = storage
17453                .ensure_agent(&Agent {
17454                    id: None,
17455                    slug: "codex".into(),
17456                    name: "Codex".into(),
17457                    version: None,
17458                    kind: AgentKind::Cli,
17459                })
17460                .unwrap();
17461            let workspace = PathBuf::from("/ws/profiled-storage-remote");
17462            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17463
17464            storage
17465                .insert_conversation_tree(
17466                    agent_id,
17467                    Some(workspace_id),
17468                    &make_profiled_storage_remote_conversation(0, msg_count),
17469                )
17470                .unwrap();
17471
17472            let mut profile = InsertConversationTreePerfProfile::default();
17473            for external_id in 1..=iterations {
17474                storage
17475                    .insert_conversation_tree_with_profile(
17476                        agent_id,
17477                        Some(workspace_id),
17478                        &make_profiled_storage_remote_conversation(external_id as i64, msg_count),
17479                        &mut profile,
17480                    )
17481                    .unwrap();
17482            }
17483
17484            let accounted_duration = profile.source_duration
17485                + profile.tx_open_duration
17486                + profile.existing_lookup_duration
17487                + profile.conversation_row_duration
17488                + profile.message_insert_duration
17489                + profile.snippet_insert_duration
17490                + profile.fts_entry_duration
17491                + profile.fts_flush_duration
17492                + profile.analytics_duration
17493                + profile.commit_duration;
17494            assert_eq!(profile.invocations, iterations);
17495            assert_eq!(profile.messages, iterations * msg_count);
17496            assert_eq!(profile.inserted_messages, iterations * msg_count);
17497            assert!(
17498                profile.total_duration >= accounted_duration,
17499                "accounted stage durations cannot exceed total duration"
17500            );
17501
17502            profile.log_summary(&format!("remote_reuse_{msg_count}_msgs"));
17503        }
17504    }
17505
17506    #[test]
17507    #[serial]
17508    fn insert_conversation_tree_stage_profile_tracks_append_remote_source_merge() {
17509        let _defer_guard = set_env_var("CASS_DEFER_LEXICAL_UPDATES", "0");
17510
17511        for &(msg_count, iterations) in &[(5usize, 80usize), (20, 50), (50, 24)] {
17512            let dir = TempDir::new().unwrap();
17513            let db_path = dir.path().join(format!("append-profile-{msg_count}.db"));
17514            let storage = SqliteStorage::open(&db_path).unwrap();
17515            let agent_id = storage
17516                .ensure_agent(&Agent {
17517                    id: None,
17518                    slug: "codex".into(),
17519                    name: "Codex".into(),
17520                    version: None,
17521                    kind: AgentKind::Cli,
17522                })
17523                .unwrap();
17524            let workspace = PathBuf::from("/ws/profiled-append-remote");
17525            let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
17526
17527            for external_id in 0..iterations {
17528                storage
17529                    .insert_conversation_tree(
17530                        agent_id,
17531                        Some(workspace_id),
17532                        &make_profiled_append_remote_merge_conversation(
17533                            external_id as i64,
17534                            msg_count,
17535                        ),
17536                    )
17537                    .unwrap();
17538            }
17539
17540            let mut profile = InsertConversationTreePerfProfile::default();
17541            for external_id in 0..iterations {
17542                storage
17543                    .append_existing_conversation_with_profile(
17544                        agent_id,
17545                        Some(workspace_id),
17546                        &make_profiled_append_remote_merge_conversation(
17547                            external_id as i64,
17548                            msg_count * 2,
17549                        ),
17550                        &mut profile,
17551                    )
17552                    .unwrap();
17553            }
17554
17555            let accounted_duration = profile.source_duration
17556                + profile.tx_open_duration
17557                + profile.existing_lookup_duration
17558                + profile.existing_idx_lookup_duration
17559                + profile.existing_replay_lookup_duration
17560                + profile.dedupe_filter_duration
17561                + profile.conversation_row_duration
17562                + profile.message_insert_duration
17563                + profile.snippet_insert_duration
17564                + profile.fts_entry_duration
17565                + profile.fts_flush_duration
17566                + profile.analytics_duration
17567                + profile.commit_duration;
17568            assert_eq!(profile.invocations, iterations);
17569            assert_eq!(profile.messages, iterations * msg_count * 2);
17570            assert_eq!(profile.inserted_messages, iterations * msg_count);
17571            assert!(
17572                profile.total_duration >= accounted_duration,
17573                "accounted append stage durations cannot exceed total duration"
17574            );
17575
17576            profile.log_summary(&format!("append_remote_merge_{msg_count}_msgs"));
17577        }
17578    }
17579
17580    #[test]
17581    fn rebuild_daily_stats_recomputes_materialized_totals_without_monolithic_group_by() {
17582        let dir = TempDir::new().unwrap();
17583        let db_path = dir.path().join("test.db");
17584        let storage = SqliteStorage::open(&db_path).unwrap();
17585        let started_at = 1_700_000_000_000_i64;
17586        let day_id = FrankenStorage::day_id_from_millis(started_at);
17587        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
17588
17589        storage
17590            .conn
17591            .execute_compat(
17592                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17593                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17594                fparams![1_i64, "codex", "Codex", "cli"],
17595            )
17596            .unwrap();
17597        storage
17598            .conn
17599            .execute_compat(
17600                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17601                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17602                fparams![2_i64, "claude", "Claude", "cli"],
17603            )
17604            .unwrap();
17605
17606        storage
17607            .conn
17608            .execute_compat(
17609                "INSERT INTO conversations (
17610                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17611                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17612                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
17613                fparams![
17614                    1_i64,
17615                    1_i64,
17616                    LOCAL_SOURCE_ID,
17617                    "daily-a",
17618                    "Daily A",
17619                    "/tmp/daily-a.jsonl",
17620                    started_at,
17621                    started_at + 200,
17622                    "{}"
17623                ],
17624            )
17625            .unwrap();
17626        storage
17627            .conn
17628            .execute_compat(
17629                "INSERT INTO conversations (
17630                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17631                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17632                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, ?8, NULL, ?9, NULL, NULL)",
17633                fparams![
17634                    2_i64,
17635                    2_i64,
17636                    LOCAL_SOURCE_ID,
17637                    "daily-b",
17638                    "Daily B",
17639                    "/tmp/daily-b.jsonl",
17640                    started_at,
17641                    started_at + 300,
17642                    "{}"
17643                ],
17644            )
17645            .unwrap();
17646
17647        storage
17648            .conn
17649            .execute_compat(
17650                "INSERT INTO messages (
17651                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17652                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17653                fparams![1_i64, 1_i64, 0_i64, "user", started_at, "hello"],
17654            )
17655            .unwrap();
17656        storage
17657            .conn
17658            .execute_compat(
17659                "INSERT INTO messages (
17660                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17661                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17662                fparams![2_i64, 1_i64, 1_i64, "assistant", started_at + 100, "response"],
17663            )
17664            .unwrap();
17665        storage
17666            .conn
17667            .execute_compat(
17668                "INSERT INTO messages (
17669                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17670                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17671                fparams![3_i64, 2_i64, 0_i64, "user", started_at + 50, "abc"],
17672            )
17673            .unwrap();
17674
17675        for (message_id, agent_slug, role, content_len) in [
17676            (1_i64, "codex", "user", 5_i64),
17677            (2_i64, "codex", "assistant", 8_i64),
17678            (3_i64, "claude", "user", 3_i64),
17679        ] {
17680            storage
17681                .conn
17682                .execute_compat(
17683                    "INSERT INTO message_metrics (
17684                        message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
17685                        role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
17686                        api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
17687                        api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
17688                        model_name, model_family, model_tier, provider
17689                     ) VALUES (
17690                        ?1, ?2, ?3, ?4, ?5, ?6, ?7,
17691                        ?8, ?9, ?10, ?11, ?12,
17692                        ?13, ?14, ?15,
17693                        ?16, ?17, ?18, ?19, ?20,
17694                        ?21, ?22, ?23, ?24
17695                     )",
17696                    fparams![
17697                        message_id,
17698                        started_at,
17699                        hour_id,
17700                        day_id,
17701                        agent_slug,
17702                        0_i64,
17703                        LOCAL_SOURCE_ID,
17704                        role,
17705                        content_len,
17706                        content_len / 4,
17707                        0_i64,
17708                        0_i64,
17709                        0_i64,
17710                        0_i64,
17711                        0_i64,
17712                        "",
17713                        "estimated",
17714                        0_i64,
17715                        0_i64,
17716                        0_i64,
17717                        "",
17718                        "unknown",
17719                        "unknown",
17720                        "unknown"
17721                    ],
17722                )
17723                .unwrap();
17724        }
17725
17726        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17727
17728        let rebuilt = storage.rebuild_daily_stats().unwrap();
17729        assert_eq!(rebuilt.total_sessions, 2);
17730
17731        let health = storage.daily_stats_health().unwrap();
17732        assert_eq!(health.conversation_count, 2);
17733        assert_eq!(health.materialized_total, 2);
17734        assert_eq!(health.drift, 0);
17735
17736        let total_messages: i64 = storage
17737            .conn
17738            .query_row_map(
17739                "SELECT message_count FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17740                fparams![],
17741                |row| row.get_typed(0),
17742            )
17743            .unwrap();
17744        assert_eq!(total_messages, 3);
17745    }
17746
17747    #[test]
17748    fn rebuild_daily_stats_preserves_byte_counts_with_message_metrics() {
17749        let dir = TempDir::new().unwrap();
17750        let db_path = dir.path().join("test.db");
17751        let storage = SqliteStorage::open(&db_path).unwrap();
17752
17753        let content = "ASCII🙂é漢字";
17754        let expected_bytes = content.len() as i64;
17755        let started_at = 1_704_067_200_000_i64;
17756        let day_id = FrankenStorage::day_id_from_millis(started_at);
17757        let hour_id = FrankenStorage::hour_id_from_millis(started_at);
17758
17759        storage
17760            .conn
17761            .execute_compat(
17762                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17763                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17764                fparams![1_i64, "tester", "Tester", "cli"],
17765            )
17766            .unwrap();
17767        storage
17768            .conn
17769            .execute_compat(
17770                "INSERT INTO conversations (
17771                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17772                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17773                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
17774                fparams![
17775                    1_i64,
17776                    1_i64,
17777                    LOCAL_SOURCE_ID,
17778                    "unicode-metrics",
17779                    "Unicode Metrics",
17780                    "/tmp/unicode-metrics.jsonl",
17781                    started_at,
17782                    "{}"
17783                ],
17784            )
17785            .unwrap();
17786        storage
17787            .conn
17788            .execute_compat(
17789                "INSERT INTO messages (
17790                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17791                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17792                fparams![1_i64, 1_i64, 0_i64, "user", started_at, content],
17793            )
17794            .unwrap();
17795        storage
17796            .conn
17797            .execute_compat(
17798                "INSERT INTO message_metrics (
17799                    message_id, created_at_ms, hour_id, day_id, agent_slug, workspace_id, source_id,
17800                    role, content_chars, content_tokens_est, api_input_tokens, api_output_tokens,
17801                    api_cache_read_tokens, api_cache_creation_tokens, api_thinking_tokens,
17802                    api_service_tier, api_data_source, tool_call_count, has_tool_calls, has_plan,
17803                    model_name, model_family, model_tier, provider
17804                 ) VALUES (
17805                    ?1, ?2, ?3, ?4, ?5, ?6, ?7,
17806                    ?8, ?9, ?10, ?11, ?12,
17807                    ?13, ?14, ?15,
17808                    ?16, ?17, ?18, ?19, ?20,
17809                    ?21, ?22, ?23, ?24
17810                 )",
17811                fparams![
17812                    1_i64,
17813                    started_at,
17814                    hour_id,
17815                    day_id,
17816                    "tester",
17817                    0_i64,
17818                    LOCAL_SOURCE_ID,
17819                    "user",
17820                    expected_bytes,
17821                    expected_bytes / 4,
17822                    0_i64,
17823                    0_i64,
17824                    0_i64,
17825                    0_i64,
17826                    0_i64,
17827                    "",
17828                    "estimated",
17829                    0_i64,
17830                    0_i64,
17831                    0_i64,
17832                    "",
17833                    "unknown",
17834                    "unknown",
17835                    "unknown"
17836                ],
17837            )
17838            .unwrap();
17839
17840        let mut tx = storage.conn.transaction().unwrap();
17841        franken_update_daily_stats_in_tx(
17842            &storage,
17843            &tx,
17844            "tester",
17845            LOCAL_SOURCE_ID,
17846            Some(started_at),
17847            StatsDelta {
17848                session_count_delta: 1,
17849                message_count_delta: 1,
17850                total_chars_delta: expected_bytes,
17851            },
17852        )
17853        .unwrap();
17854        tx.commit().unwrap();
17855
17856        let inline_total: i64 = storage
17857            .conn
17858            .query_row_map(
17859                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17860                fparams![],
17861                |row| row.get_typed(0),
17862            )
17863            .unwrap();
17864        assert_eq!(inline_total, expected_bytes);
17865
17866        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17867
17868        let rebuilt = storage.rebuild_daily_stats().unwrap();
17869        assert_eq!(rebuilt.total_sessions, 1);
17870
17871        let rebuilt_total: i64 = storage
17872            .conn
17873            .query_row_map(
17874                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17875                fparams![],
17876                |row| row.get_typed(0),
17877            )
17878            .unwrap();
17879        assert_eq!(rebuilt_total, expected_bytes);
17880    }
17881
17882    #[test]
17883    fn rebuild_daily_stats_raw_fallback_preserves_byte_counts() {
17884        let dir = TempDir::new().unwrap();
17885        let db_path = dir.path().join("test.db");
17886        let storage = SqliteStorage::open(&db_path).unwrap();
17887
17888        let content = "fallback🙂é漢字";
17889        let expected_bytes = content.len() as i64;
17890        let started_at = 1_704_067_200_000_i64;
17891        storage
17892            .conn
17893            .execute_compat(
17894                "INSERT INTO agents (id, slug, name, version, kind, created_at, updated_at)
17895                 VALUES (?1, ?2, ?3, NULL, ?4, 0, 0)",
17896                fparams![1_i64, "tester", "Tester", "cli"],
17897            )
17898            .unwrap();
17899        storage
17900            .conn
17901            .execute_compat(
17902                "INSERT INTO conversations (
17903                    id, agent_id, workspace_id, source_id, external_id, title, source_path,
17904                    started_at, ended_at, approx_tokens, metadata_json, origin_host, metadata_bin
17905                 ) VALUES (?1, ?2, NULL, ?3, ?4, ?5, ?6, ?7, NULL, NULL, ?8, NULL, NULL)",
17906                fparams![
17907                    1_i64,
17908                    1_i64,
17909                    LOCAL_SOURCE_ID,
17910                    "unicode-fallback",
17911                    "Unicode Fallback",
17912                    "/tmp/unicode-fallback.jsonl",
17913                    started_at,
17914                    "{}"
17915                ],
17916            )
17917            .unwrap();
17918        storage
17919            .conn
17920            .execute_compat(
17921                "INSERT INTO messages (
17922                    id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin
17923                 ) VALUES (?1, ?2, ?3, ?4, NULL, ?5, ?6, NULL, NULL)",
17924                fparams![1_i64, 1_i64, 0_i64, "assistant", started_at, content],
17925            )
17926            .unwrap();
17927
17928        let mut tx = storage.conn.transaction().unwrap();
17929        franken_update_daily_stats_in_tx(
17930            &storage,
17931            &tx,
17932            "tester",
17933            LOCAL_SOURCE_ID,
17934            Some(started_at),
17935            StatsDelta {
17936                session_count_delta: 1,
17937                message_count_delta: 1,
17938                total_chars_delta: expected_bytes,
17939            },
17940        )
17941        .unwrap();
17942        tx.commit().unwrap();
17943
17944        storage.conn.execute("DELETE FROM daily_stats").unwrap();
17945
17946        let rebuilt = storage.rebuild_daily_stats().unwrap();
17947        assert_eq!(rebuilt.total_sessions, 1);
17948
17949        let rebuilt_total: i64 = storage
17950            .conn
17951            .query_row_map(
17952                "SELECT total_chars FROM daily_stats WHERE agent_slug = 'all' AND source_id = 'all'",
17953                fparams![],
17954                |row| row.get_typed(0),
17955            )
17956            .unwrap();
17957        assert_eq!(rebuilt_total, expected_bytes);
17958    }
17959
17960    #[test]
17961    fn insert_conversations_batched_appends_duplicate_external_id() {
17962        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
17963        use std::path::PathBuf;
17964
17965        let dir = TempDir::new().unwrap();
17966        let db_path = dir.path().join("test.db");
17967        let storage = SqliteStorage::open(&db_path).unwrap();
17968
17969        let agent = Agent {
17970            id: None,
17971            slug: "codex".into(),
17972            name: "Codex".into(),
17973            version: Some("0.2.3".into()),
17974            kind: AgentKind::Cli,
17975        };
17976        let agent_id = storage.ensure_agent(&agent).unwrap();
17977
17978        let base_conv = |messages: Vec<Message>| Conversation {
17979            id: None,
17980            agent_slug: "codex".into(),
17981            workspace: Some(PathBuf::from("/tmp/workspace")),
17982            external_id: Some("shared-session".into()),
17983            title: Some("Shared Session".into()),
17984            source_path: PathBuf::from("/tmp/rollout.jsonl"),
17985            started_at: Some(1_700_000_000_000),
17986            ended_at: Some(1_700_000_000_999),
17987            approx_tokens: None,
17988            metadata_json: serde_json::Value::Null,
17989            messages,
17990            source_id: "local".into(),
17991            origin_host: None,
17992        };
17993
17994        let conv_a = base_conv(vec![
17995            Message {
17996                id: None,
17997                idx: 0,
17998                role: MessageRole::User,
17999                author: None,
18000                created_at: Some(1_700_000_000_000),
18001                content: "first".into(),
18002                extra_json: serde_json::Value::Null,
18003                snippets: Vec::new(),
18004            },
18005            Message {
18006                id: None,
18007                idx: 1,
18008                role: MessageRole::Agent,
18009                author: None,
18010                created_at: Some(1_700_000_000_100),
18011                content: "second".into(),
18012                extra_json: serde_json::Value::Null,
18013                snippets: Vec::new(),
18014            },
18015        ]);
18016        let conv_b = base_conv(vec![
18017            Message {
18018                id: None,
18019                idx: 0,
18020                role: MessageRole::User,
18021                author: None,
18022                created_at: Some(1_700_000_000_000),
18023                content: "first".into(),
18024                extra_json: serde_json::Value::Null,
18025                snippets: Vec::new(),
18026            },
18027            Message {
18028                id: None,
18029                idx: 1,
18030                role: MessageRole::Agent,
18031                author: None,
18032                created_at: Some(1_700_000_000_100),
18033                content: "second".into(),
18034                extra_json: serde_json::Value::Null,
18035                snippets: Vec::new(),
18036            },
18037            Message {
18038                id: None,
18039                idx: 2,
18040                role: MessageRole::User,
18041                author: None,
18042                created_at: Some(1_700_000_000_200),
18043                content: "third".into(),
18044                extra_json: serde_json::Value::Null,
18045                snippets: Vec::new(),
18046            },
18047            Message {
18048                id: None,
18049                idx: 3,
18050                role: MessageRole::Agent,
18051                author: None,
18052                created_at: Some(1_700_000_000_300),
18053                content: "fourth".into(),
18054                extra_json: serde_json::Value::Null,
18055                snippets: Vec::new(),
18056            },
18057        ]);
18058
18059        let outcomes = storage
18060            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
18061            .unwrap();
18062        assert_eq!(outcomes.len(), 2);
18063        assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);
18064        assert_eq!(outcomes[1].inserted_indices, vec![2, 3]);
18065        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
18066
18067        let conversation_count: i64 = storage
18068            .conn
18069            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18070                row.get_typed(0)
18071            })
18072            .unwrap();
18073        let conversation_count_not_indexed: i64 = storage
18074            .conn
18075            .query_row_map(
18076                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18077                fparams![],
18078                |row| row.get_typed(0),
18079            )
18080            .unwrap();
18081        let conversation_count_source_index: i64 = storage
18082            .conn
18083            .query_row_map(
18084                "SELECT COUNT(*) FROM conversations INDEXED BY idx_conversations_source_id",
18085                fparams![],
18086                |row| row.get_typed(0),
18087            )
18088            .unwrap();
18089        let message_count: i64 = storage
18090            .conn
18091            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18092                row.get_typed(0)
18093            })
18094            .unwrap();
18095        let reopened_storage = SqliteStorage::open(&db_path).unwrap();
18096        let reopened_conversation_count: i64 = reopened_storage
18097            .conn
18098            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18099                row.get_typed(0)
18100            })
18101            .unwrap();
18102        let reopened_conversation_count_not_indexed: i64 = reopened_storage
18103            .conn
18104            .query_row_map(
18105                "SELECT COUNT(*) FROM conversations NOT INDEXED",
18106                fparams![],
18107                |row| row.get_typed(0),
18108            )
18109            .unwrap();
18110        let reopened_conversation_ids: Vec<i64> = reopened_storage
18111            .conn
18112            .query_map_collect(
18113                "SELECT id FROM conversations ORDER BY id",
18114                fparams![],
18115                |row| row.get_typed(0),
18116            )
18117            .unwrap();
18118        let reopened_conversation_ids_not_indexed: Vec<i64> = reopened_storage
18119            .conn
18120            .query_map_collect(
18121                "SELECT id FROM conversations NOT INDEXED ORDER BY id",
18122                fparams![],
18123                |row| row.get_typed(0),
18124            )
18125            .unwrap();
18126        let reopened_conversation_ids_source_index: Vec<i64> = reopened_storage
18127            .conn
18128            .query_map_collect(
18129                "SELECT id FROM conversations INDEXED BY idx_conversations_source_id ORDER BY id",
18130                fparams![],
18131                |row| row.get_typed(0),
18132            )
18133            .unwrap();
18134
18135        assert_eq!(reopened_conversation_ids, vec![outcomes[0].conversation_id]);
18136        assert_eq!(
18137            reopened_conversation_ids_not_indexed,
18138            vec![outcomes[0].conversation_id]
18139        );
18140        assert_eq!(
18141            reopened_conversation_ids_source_index,
18142            vec![outcomes[0].conversation_id]
18143        );
18144        assert_eq!(reopened_conversation_count, 1);
18145        assert_eq!(reopened_conversation_count_not_indexed, 1);
18146        assert_eq!(conversation_count_not_indexed, 1);
18147        assert_eq!(conversation_count_source_index, 1);
18148        assert_eq!(conversation_count, 1);
18149        assert_eq!(message_count, 4);
18150    }
18151
18152    #[test]
18153    fn franken_insert_conversation_or_get_existing_recovers_unique_conflict() {
18154        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18155        use std::path::PathBuf;
18156
18157        let dir = TempDir::new().unwrap();
18158        let db_path = dir.path().join("test.db");
18159        let storage = SqliteStorage::open(&db_path).unwrap();
18160
18161        let agent = Agent {
18162            id: None,
18163            slug: "codex".into(),
18164            name: "Codex".into(),
18165            version: Some("0.2.3".into()),
18166            kind: AgentKind::Cli,
18167        };
18168        let agent_id = storage.ensure_agent(&agent).unwrap();
18169
18170        let conv = Conversation {
18171            id: None,
18172            agent_slug: "codex".into(),
18173            workspace: Some(PathBuf::from("/tmp/workspace")),
18174            external_id: Some("recover-duplicate".into()),
18175            title: Some("Recover Duplicate".into()),
18176            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18177            started_at: Some(1_700_000_000_000),
18178            ended_at: Some(1_700_000_000_100),
18179            approx_tokens: None,
18180            metadata_json: serde_json::Value::Null,
18181            messages: vec![Message {
18182                id: None,
18183                idx: 0,
18184                role: MessageRole::User,
18185                author: None,
18186                created_at: Some(1_700_000_000_000),
18187                content: "hello".into(),
18188                extra_json: serde_json::Value::Null,
18189                snippets: Vec::new(),
18190            }],
18191            source_id: "local".into(),
18192            origin_host: None,
18193        };
18194
18195        let tx = storage.conn.transaction().unwrap();
18196        let inserted_id = franken_insert_conversation(&tx, agent_id, None, &conv)
18197            .unwrap()
18198            .expect("first insert should succeed");
18199
18200        let conversation_key = conversation_merge_key(agent_id, &conv);
18201        let resolved = franken_insert_conversation_or_get_existing_after_miss(
18202            &tx,
18203            agent_id,
18204            None,
18205            &conv,
18206            &conversation_key,
18207        )
18208        .unwrap();
18209
18210        match resolved {
18211            ConversationInsertStatus::Existing(existing_id) => {
18212                assert_eq!(existing_id, inserted_id);
18213            }
18214            ConversationInsertStatus::Inserted(new_id) => {
18215                panic!("expected existing conversation id, got freshly inserted {new_id}");
18216            }
18217        }
18218
18219        let conversation_count: i64 = tx
18220            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18221                row.get_typed(0)
18222            })
18223            .unwrap();
18224        assert_eq!(conversation_count, 1);
18225    }
18226
18227    #[test]
18228    fn insert_conversations_batched_merges_duplicate_external_id_with_gaps() {
18229        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18230        use std::path::PathBuf;
18231
18232        let dir = TempDir::new().unwrap();
18233        let db_path = dir.path().join("test.db");
18234        let storage = SqliteStorage::open(&db_path).unwrap();
18235
18236        let agent = Agent {
18237            id: None,
18238            slug: "codex".into(),
18239            name: "Codex".into(),
18240            version: Some("0.2.3".into()),
18241            kind: AgentKind::Cli,
18242        };
18243        let agent_id = storage.ensure_agent(&agent).unwrap();
18244
18245        let base_conv = |messages: Vec<Message>| Conversation {
18246            id: None,
18247            agent_slug: "codex".into(),
18248            workspace: Some(PathBuf::from("/tmp/workspace")),
18249            external_id: Some("shared-session-gap".into()),
18250            title: Some("Shared Session Gap".into()),
18251            source_path: PathBuf::from("/tmp/rollout.jsonl"),
18252            started_at: Some(1_700_000_000_000),
18253            ended_at: Some(1_700_000_000_999),
18254            approx_tokens: None,
18255            metadata_json: serde_json::Value::Null,
18256            messages,
18257            source_id: "local".into(),
18258            origin_host: None,
18259        };
18260
18261        let conv_a = base_conv(vec![
18262            Message {
18263                id: None,
18264                idx: 2,
18265                role: MessageRole::User,
18266                author: None,
18267                created_at: Some(1_700_000_000_200),
18268                content: "third".into(),
18269                extra_json: serde_json::Value::Null,
18270                snippets: Vec::new(),
18271            },
18272            Message {
18273                id: None,
18274                idx: 3,
18275                role: MessageRole::Agent,
18276                author: None,
18277                created_at: Some(1_700_000_000_300),
18278                content: "fourth".into(),
18279                extra_json: serde_json::Value::Null,
18280                snippets: Vec::new(),
18281            },
18282        ]);
18283        let conv_b = base_conv(vec![
18284            Message {
18285                id: None,
18286                idx: 0,
18287                role: MessageRole::User,
18288                author: None,
18289                created_at: Some(1_700_000_000_000),
18290                content: "first".into(),
18291                extra_json: serde_json::Value::Null,
18292                snippets: Vec::new(),
18293            },
18294            Message {
18295                id: None,
18296                idx: 1,
18297                role: MessageRole::Agent,
18298                author: None,
18299                created_at: Some(1_700_000_000_100),
18300                content: "second".into(),
18301                extra_json: serde_json::Value::Null,
18302                snippets: Vec::new(),
18303            },
18304            Message {
18305                id: None,
18306                idx: 3,
18307                role: MessageRole::Agent,
18308                author: None,
18309                created_at: Some(1_700_000_000_300),
18310                content: "fourth".into(),
18311                extra_json: serde_json::Value::Null,
18312                snippets: Vec::new(),
18313            },
18314        ]);
18315
18316        let outcomes = storage
18317            .insert_conversations_batched(&[(agent_id, None, &conv_a), (agent_id, None, &conv_b)])
18318            .unwrap();
18319        assert_eq!(outcomes.len(), 2);
18320        assert_eq!(outcomes[0].inserted_indices, vec![2, 3]);
18321        assert_eq!(outcomes[1].inserted_indices, vec![0, 1]);
18322        assert_eq!(outcomes[0].conversation_id, outcomes[1].conversation_id);
18323
18324        let stored_indices: Vec<i64> = storage
18325            .conn
18326            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18327                row.get_typed(0)
18328            })
18329            .unwrap();
18330        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
18331    }
18332
18333    #[test]
18334    fn insert_conversations_batched_refreshes_partial_pending_message_lookup() {
18335        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18336        use std::path::PathBuf;
18337
18338        let dir = TempDir::new().unwrap();
18339        let db_path = dir.path().join("test.db");
18340        let storage = SqliteStorage::open(&db_path).unwrap();
18341
18342        let agent = Agent {
18343            id: None,
18344            slug: "codex".into(),
18345            name: "Codex".into(),
18346            version: Some("0.2.3".into()),
18347            kind: AgentKind::Cli,
18348        };
18349        let agent_id = storage.ensure_agent(&agent).unwrap();
18350
18351        let make_message = |idx: i64, content: &str| Message {
18352            id: None,
18353            idx,
18354            role: if idx == 0 {
18355                MessageRole::User
18356            } else {
18357                MessageRole::Agent
18358            },
18359            author: None,
18360            created_at: Some(1_700_000_000_000 + idx),
18361            content: content.into(),
18362            extra_json: serde_json::Value::Null,
18363            snippets: Vec::new(),
18364        };
18365
18366        let base_conv = |messages: Vec<Message>| Conversation {
18367            id: None,
18368            agent_slug: "codex".into(),
18369            workspace: Some(PathBuf::from("/tmp/workspace")),
18370            external_id: Some("partial-cache-session".into()),
18371            title: Some("Partial cache session".into()),
18372            source_path: PathBuf::from("/tmp/partial-cache.jsonl"),
18373            started_at: Some(1_700_000_000_000),
18374            ended_at: Some(1_700_000_000_100),
18375            approx_tokens: None,
18376            metadata_json: serde_json::Value::Null,
18377            messages,
18378            source_id: "local".into(),
18379            origin_host: None,
18380        };
18381
18382        let canonical = base_conv(vec![
18383            make_message(0, "canonical zero"),
18384            make_message(20, "canonical twenty"),
18385        ]);
18386        storage
18387            .insert_conversation_tree(agent_id, None, &canonical)
18388            .unwrap();
18389
18390        let exact_prefix = base_conv(vec![make_message(0, "canonical zero")]);
18391        let conflicting_tail = base_conv(vec![make_message(20, "conflicting twenty")]);
18392
18393        let outcomes = storage
18394            .insert_conversations_batched(&[
18395                (agent_id, None, &exact_prefix),
18396                (agent_id, None, &conflicting_tail),
18397            ])
18398            .unwrap();
18399
18400        assert_eq!(outcomes.len(), 2);
18401        assert!(outcomes[0].inserted_indices.is_empty());
18402        assert!(
18403            outcomes[1].inserted_indices.is_empty(),
18404            "the second batch item must refresh the partial pending lookup and retain the canonical idx=20 row"
18405        );
18406
18407        let stored_messages: Vec<(i64, String)> = storage
18408            .conn
18409            .query_map_collect(
18410                "SELECT idx, content FROM messages ORDER BY idx",
18411                fparams![],
18412                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18413            )
18414            .unwrap();
18415        assert_eq!(
18416            stored_messages,
18417            vec![
18418                (0, "canonical zero".to_string()),
18419                (20, "canonical twenty".to_string()),
18420            ]
18421        );
18422    }
18423
18424    #[test]
18425    fn insert_conversations_batched_reprocessing_conversation_is_idempotent() {
18426        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18427        use std::path::PathBuf;
18428
18429        const MESSAGE_COUNT: i64 = 64;
18430
18431        let dir = TempDir::new().unwrap();
18432        let db_path = dir.path().join("test.db");
18433        let storage = SqliteStorage::open(&db_path).unwrap();
18434
18435        let agent = Agent {
18436            id: None,
18437            slug: "codex".into(),
18438            name: "Codex".into(),
18439            version: Some("0.2.3".into()),
18440            kind: AgentKind::Cli,
18441        };
18442        let agent_id = storage.ensure_agent(&agent).unwrap();
18443
18444        let messages: Vec<Message> = (0..MESSAGE_COUNT)
18445            .map(|idx| Message {
18446                id: None,
18447                idx,
18448                role: if idx % 2 == 0 {
18449                    MessageRole::User
18450                } else {
18451                    MessageRole::Agent
18452                },
18453                author: None,
18454                created_at: Some(1_700_000_000_000 + idx),
18455                content: format!("message {idx}"),
18456                extra_json: serde_json::Value::Null,
18457                snippets: Vec::new(),
18458            })
18459            .collect();
18460
18461        let conversation = Conversation {
18462            id: None,
18463            agent_slug: "codex".into(),
18464            workspace: Some(PathBuf::from("/tmp/workspace")),
18465            external_id: Some("large-reprocess-session".into()),
18466            title: Some("Large Reprocess Session".into()),
18467            source_path: PathBuf::from("/tmp/large-reprocess-session.jsonl"),
18468            started_at: Some(1_700_000_000_000),
18469            ended_at: Some(1_700_000_000_000 + MESSAGE_COUNT - 1),
18470            approx_tokens: None,
18471            metadata_json: serde_json::Value::Null,
18472            messages,
18473            source_id: "local".into(),
18474            origin_host: None,
18475        };
18476
18477        let first = storage
18478            .insert_conversations_batched(&[(agent_id, None, &conversation)])
18479            .unwrap();
18480        let second = storage
18481            .insert_conversations_batched(&[(agent_id, None, &conversation)])
18482            .unwrap();
18483
18484        assert_eq!(first.len(), 1);
18485        assert_eq!(second.len(), 1);
18486        assert_eq!(first[0].inserted_indices.len(), MESSAGE_COUNT as usize);
18487        assert!(
18488            second[0].inserted_indices.is_empty(),
18489            "full reprocessing of a large conversation must not attempt duplicate idx inserts"
18490        );
18491        assert_eq!(first[0].conversation_id, second[0].conversation_id);
18492
18493        let conversation_count: i64 = storage
18494            .conn
18495            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
18496                row.get_typed(0)
18497            })
18498            .unwrap();
18499        let message_count: i64 = storage
18500            .conn
18501            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
18502                row.get_typed(0)
18503            })
18504            .unwrap();
18505
18506        assert_eq!(conversation_count, 1);
18507        assert_eq!(message_count, MESSAGE_COUNT);
18508    }
18509
18510    #[test]
18511    fn parallel_insert_conversation_tree_keeps_unique_external_ids_distinct() {
18512        use crate::connectors::{NormalizedConversation, NormalizedMessage};
18513        use crate::indexer::persist::map_to_internal;
18514        use crate::model::types::{Agent, AgentKind};
18515        use frankensqlite::compat::{ConnectionExt, RowExt};
18516        use rand::RngExt;
18517        use rayon::prelude::*;
18518
18519        fn retryable_franken_error(err: &anyhow::Error) -> bool {
18520            err.downcast_ref::<frankensqlite::FrankenError>()
18521                .or_else(|| {
18522                    err.root_cause()
18523                        .downcast_ref::<frankensqlite::FrankenError>()
18524                })
18525                .is_some_and(|inner| {
18526                    matches!(
18527                        inner,
18528                        frankensqlite::FrankenError::Busy
18529                            | frankensqlite::FrankenError::BusyRecovery
18530                            | frankensqlite::FrankenError::BusySnapshot { .. }
18531                            | frankensqlite::FrankenError::WriteConflict { .. }
18532                            | frankensqlite::FrankenError::SerializationFailure { .. }
18533                    )
18534                })
18535        }
18536
18537        fn with_retry<F, T>(mut f: F) -> anyhow::Result<T>
18538        where
18539            F: FnMut() -> anyhow::Result<T>,
18540        {
18541            let mut rng = rand::rng();
18542            let mut backoff_ms = 4_u64;
18543            for attempt in 0..=24 {
18544                match f() {
18545                    Ok(value) => return Ok(value),
18546                    Err(err) if attempt < 24 && retryable_franken_error(&err) => {
18547                        let sleep_ms = backoff_ms + rng.random_range(0..=backoff_ms);
18548                        std::thread::sleep(Duration::from_millis(sleep_ms));
18549                        backoff_ms = (backoff_ms * 2).min(512);
18550                    }
18551                    Err(err) => return Err(err),
18552                }
18553            }
18554            unreachable!("retry loop must return on success or final failure")
18555        }
18556
18557        let dir = TempDir::new().unwrap();
18558        let db_path = dir.path().join("parallel_insert_conversation_tree.db");
18559        let seed = FrankenStorage::open(&db_path).unwrap();
18560        drop(seed);
18561
18562        let conversations: Vec<NormalizedConversation> = (0..10)
18563            .map(|i| NormalizedConversation {
18564                agent_slug: format!("agent-{}", i % 3),
18565                external_id: Some(format!("conv-{i}")),
18566                title: Some(format!("Conversation {i}")),
18567                workspace: Some(PathBuf::from(format!("/ws/{i}"))),
18568                source_path: PathBuf::from(format!("/log/{i}.jsonl")),
18569                started_at: Some(1_000 + i * 100),
18570                ended_at: Some(1_000 + i * 100 + 50),
18571                metadata: serde_json::json!({}),
18572                messages: (0..3)
18573                    .map(|j| NormalizedMessage {
18574                        idx: j,
18575                        role: if j % 2 == 0 { "user" } else { "assistant" }.to_string(),
18576                        author: Some("tester".into()),
18577                        created_at: Some(1_000 + i * 100 + j * 10),
18578                        content: format!("parallel-distinct-test conv={i} msg={j}"),
18579                        extra: serde_json::json!({}),
18580                        snippets: vec![],
18581                        invocations: Vec::new(),
18582                    })
18583                    .collect(),
18584            })
18585            .collect();
18586
18587        let mut outcomes: Vec<(String, i64, Vec<i64>)> = conversations
18588            .par_chunks(3)
18589            .map(|chunk| {
18590                let storage = FrankenStorage::open_writer(&db_path).unwrap();
18591                let mut agent_cache: HashMap<String, i64> = HashMap::new();
18592                let mut workspace_cache: HashMap<PathBuf, i64> = HashMap::new();
18593                let mut chunk_outcomes = Vec::with_capacity(chunk.len());
18594
18595                for conv in chunk {
18596                    let agent_slug = conv.agent_slug.clone();
18597                    let workspace = conv.workspace.clone();
18598                    let external_id = conv.external_id.clone().expect("external id");
18599                    let internal = map_to_internal(conv);
18600                    let outcome = with_retry(|| {
18601                        let agent_id = if let Some(id) = agent_cache.get(&agent_slug) {
18602                            *id
18603                        } else {
18604                            let agent = Agent {
18605                                id: None,
18606                                slug: agent_slug.clone(),
18607                                name: agent_slug.clone(),
18608                                version: None,
18609                                kind: AgentKind::Cli,
18610                            };
18611                            let id = storage.ensure_agent(&agent)?;
18612                            agent_cache.insert(agent_slug.clone(), id);
18613                            id
18614                        };
18615                        let workspace_id = if let Some(path) = &workspace {
18616                            if let Some(id) = workspace_cache.get(path) {
18617                                Some(*id)
18618                            } else {
18619                                let id = storage.ensure_workspace(path, None)?;
18620                                workspace_cache.insert(path.clone(), id);
18621                                Some(id)
18622                            }
18623                        } else {
18624                            None
18625                        };
18626                        storage.insert_conversation_tree(agent_id, workspace_id, &internal)
18627                    })
18628                    .unwrap();
18629                    chunk_outcomes.push((
18630                        external_id,
18631                        outcome.conversation_id,
18632                        outcome.inserted_indices,
18633                    ));
18634                }
18635
18636                storage.close().unwrap();
18637                chunk_outcomes
18638            })
18639            .flatten()
18640            .collect();
18641        outcomes.sort_by(|left, right| left.0.cmp(&right.0));
18642
18643        assert!(
18644            outcomes
18645                .iter()
18646                .all(|(_, _, inserted_indices)| inserted_indices == &vec![0, 1, 2]),
18647            "unique external ids must not be routed through the existing-conversation merge path: {outcomes:?}"
18648        );
18649
18650        let distinct_ids: HashSet<i64> = outcomes
18651            .iter()
18652            .map(|(_, conversation_id, _)| *conversation_id)
18653            .collect();
18654        assert_eq!(
18655            distinct_ids.len(),
18656            conversations.len(),
18657            "unique external ids must produce distinct conversation ids: {outcomes:?}"
18658        );
18659
18660        let reader = FrankenStorage::open(&db_path).unwrap();
18661        let stored_rows: Vec<(i64, String)> = reader
18662            .raw()
18663            .query_map_collect(
18664                "SELECT id, external_id FROM conversations ORDER BY id",
18665                &[],
18666                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18667            )
18668            .unwrap();
18669        let stored_count: i64 = reader
18670            .raw()
18671            .query_row_map("SELECT COUNT(*) FROM conversations", &[], |row| {
18672                row.get_typed(0)
18673            })
18674            .unwrap();
18675
18676        assert_eq!(
18677            stored_count as usize,
18678            conversations.len(),
18679            "parallel distinct inserts must persist one row per external id; rows={stored_rows:?}; outcomes={outcomes:?}"
18680        );
18681        assert_eq!(
18682            stored_rows.len(),
18683            conversations.len(),
18684            "parallel distinct inserts must remain visible after reopening; rows={stored_rows:?}; outcomes={outcomes:?}"
18685        );
18686    }
18687
18688    #[test]
18689    fn insert_conversation_tree_merges_duplicate_external_id_with_gaps() {
18690        use crate::connectors::{NormalizedConversation, NormalizedMessage};
18691        use crate::indexer::persist::map_to_internal;
18692        use crate::model::types::{Agent, AgentKind};
18693        use std::path::PathBuf;
18694
18695        let dir = TempDir::new().unwrap();
18696        let db_path = dir.path().join("test.db");
18697        let storage = SqliteStorage::open(&db_path).unwrap();
18698
18699        let agent = Agent {
18700            id: None,
18701            slug: "codex".into(),
18702            name: "Codex".into(),
18703            version: Some("0.2.3".into()),
18704            kind: AgentKind::Cli,
18705        };
18706        let agent_id = storage.ensure_agent(&agent).unwrap();
18707
18708        let base_conv = |messages: Vec<NormalizedMessage>| NormalizedConversation {
18709            agent_slug: "codex".into(),
18710            workspace: Some(PathBuf::from("/tmp/workspace")),
18711            external_id: Some("tree-gap-session".into()),
18712            title: Some("Tree Gap Session".into()),
18713            source_path: PathBuf::from("/tmp/tree.jsonl"),
18714            started_at: Some(1_700_000_000_000),
18715            ended_at: Some(1_700_000_000_999),
18716            metadata: serde_json::Value::Null,
18717            messages,
18718        };
18719
18720        let conv_a = map_to_internal(&base_conv(vec![
18721            NormalizedMessage {
18722                idx: 2,
18723                role: "user".into(),
18724                author: None,
18725                created_at: Some(1_700_000_000_200),
18726                content: "third".into(),
18727                extra: serde_json::Value::Null,
18728                snippets: Vec::new(),
18729                invocations: Vec::new(),
18730            },
18731            NormalizedMessage {
18732                idx: 3,
18733                role: "assistant".into(),
18734                author: None,
18735                created_at: Some(1_700_000_000_300),
18736                content: "fourth".into(),
18737                extra: serde_json::Value::Null,
18738                snippets: Vec::new(),
18739                invocations: Vec::new(),
18740            },
18741        ]));
18742        let conv_b = map_to_internal(&base_conv(vec![
18743            NormalizedMessage {
18744                idx: 0,
18745                role: "user".into(),
18746                author: None,
18747                created_at: Some(1_700_000_000_000),
18748                content: "first".into(),
18749                extra: serde_json::Value::Null,
18750                snippets: Vec::new(),
18751                invocations: Vec::new(),
18752            },
18753            NormalizedMessage {
18754                idx: 1,
18755                role: "assistant".into(),
18756                author: None,
18757                created_at: Some(1_700_000_000_100),
18758                content: "second".into(),
18759                extra: serde_json::Value::Null,
18760                snippets: Vec::new(),
18761                invocations: Vec::new(),
18762            },
18763            NormalizedMessage {
18764                idx: 3,
18765                role: "assistant".into(),
18766                author: None,
18767                created_at: Some(1_700_000_000_300),
18768                content: "fourth".into(),
18769                extra: serde_json::Value::Null,
18770                snippets: Vec::new(),
18771                invocations: Vec::new(),
18772            },
18773        ]));
18774
18775        let first = storage
18776            .insert_conversation_tree(agent_id, None, &conv_a)
18777            .unwrap();
18778        let second = storage
18779            .insert_conversation_tree(agent_id, None, &conv_b)
18780            .unwrap();
18781
18782        assert_eq!(first.inserted_indices, vec![2, 3]);
18783        assert_eq!(second.inserted_indices, vec![0, 1]);
18784        assert_eq!(first.conversation_id, second.conversation_id);
18785
18786        let stored_indices: Vec<i64> = storage
18787            .conn
18788            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18789                row.get_typed(0)
18790            })
18791            .unwrap();
18792        assert_eq!(stored_indices, vec![0, 1, 2, 3]);
18793    }
18794
18795    #[test]
18796    fn insert_conversation_tree_skips_duplicate_message_indices_for_new_conversation() {
18797        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18798        use std::path::PathBuf;
18799
18800        let dir = TempDir::new().unwrap();
18801        let db_path = dir.path().join("test.db");
18802        let storage = SqliteStorage::open(&db_path).unwrap();
18803
18804        let agent = Agent {
18805            id: None,
18806            slug: "codex".into(),
18807            name: "Codex".into(),
18808            version: Some("0.2.3".into()),
18809            kind: AgentKind::Cli,
18810        };
18811        let agent_id = storage.ensure_agent(&agent).unwrap();
18812
18813        let conversation = Conversation {
18814            id: None,
18815            agent_slug: "codex".into(),
18816            workspace: Some(PathBuf::from("/tmp/workspace")),
18817            external_id: Some("duplicate-new-session".into()),
18818            title: Some("Duplicate New Session".into()),
18819            source_path: PathBuf::from("/tmp/duplicate-new-session.jsonl"),
18820            started_at: Some(1_700_000_000_000),
18821            ended_at: Some(1_700_000_000_999),
18822            approx_tokens: None,
18823            metadata_json: serde_json::Value::Null,
18824            messages: vec![
18825                Message {
18826                    id: None,
18827                    idx: 0,
18828                    role: MessageRole::User,
18829                    author: None,
18830                    created_at: Some(1_700_000_000_000),
18831                    content: "first canonical".into(),
18832                    extra_json: serde_json::Value::Null,
18833                    snippets: Vec::new(),
18834                },
18835                Message {
18836                    id: None,
18837                    idx: 0,
18838                    role: MessageRole::User,
18839                    author: None,
18840                    created_at: Some(1_700_000_000_001),
18841                    content: "duplicate idx should be skipped".into(),
18842                    extra_json: serde_json::Value::Null,
18843                    snippets: Vec::new(),
18844                },
18845                Message {
18846                    id: None,
18847                    idx: 1,
18848                    role: MessageRole::Agent,
18849                    author: None,
18850                    created_at: Some(1_700_000_000_100),
18851                    content: "second".into(),
18852                    extra_json: serde_json::Value::Null,
18853                    snippets: Vec::new(),
18854                },
18855            ],
18856            source_id: "local".into(),
18857            origin_host: None,
18858        };
18859
18860        let outcome = storage
18861            .insert_conversation_tree(agent_id, None, &conversation)
18862            .unwrap();
18863
18864        assert_eq!(outcome.inserted_indices, vec![0, 1]);
18865
18866        let stored_messages: Vec<(i64, String)> = storage
18867            .conn
18868            .query_map_collect(
18869                "SELECT idx, content FROM messages ORDER BY idx",
18870                fparams![],
18871                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
18872            )
18873            .unwrap();
18874        assert_eq!(
18875            stored_messages,
18876            vec![
18877                (0, "first canonical".to_string()),
18878                (1, "second".to_string())
18879            ]
18880        );
18881    }
18882
18883    #[test]
18884    fn insert_conversation_tree_merges_duplicate_source_path_without_external_id() {
18885        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18886        use std::path::PathBuf;
18887
18888        let dir = TempDir::new().unwrap();
18889        let db_path = dir.path().join("test.db");
18890        let storage = SqliteStorage::open(&db_path).unwrap();
18891
18892        let agent = Agent {
18893            id: None,
18894            slug: "codex".into(),
18895            name: "Codex".into(),
18896            version: Some("0.2.3".into()),
18897            kind: AgentKind::Cli,
18898        };
18899        let agent_id = storage.ensure_agent(&agent).unwrap();
18900
18901        let base_conv = |messages: Vec<Message>| Conversation {
18902            id: None,
18903            agent_slug: "codex".into(),
18904            workspace: Some(PathBuf::from("/tmp/workspace")),
18905            external_id: None,
18906            title: Some("Source Path Merge".into()),
18907            source_path: PathBuf::from("/tmp/shared-session.jsonl"),
18908            started_at: Some(1_700_000_000_000),
18909            ended_at: Some(1_700_000_000_999),
18910            approx_tokens: None,
18911            metadata_json: serde_json::Value::Null,
18912            messages,
18913            source_id: "local".into(),
18914            origin_host: None,
18915        };
18916
18917        let first = storage
18918            .insert_conversation_tree(
18919                agent_id,
18920                None,
18921                &base_conv(vec![
18922                    Message {
18923                        id: None,
18924                        idx: 0,
18925                        role: MessageRole::User,
18926                        author: None,
18927                        created_at: Some(1_700_000_000_000),
18928                        content: "first".into(),
18929                        extra_json: serde_json::Value::Null,
18930                        snippets: Vec::new(),
18931                    },
18932                    Message {
18933                        id: None,
18934                        idx: 1,
18935                        role: MessageRole::Agent,
18936                        author: None,
18937                        created_at: Some(1_700_000_000_100),
18938                        content: "second".into(),
18939                        extra_json: serde_json::Value::Null,
18940                        snippets: Vec::new(),
18941                    },
18942                ]),
18943            )
18944            .unwrap();
18945
18946        let second = storage
18947            .insert_conversation_tree(
18948                agent_id,
18949                None,
18950                &base_conv(vec![
18951                    Message {
18952                        id: None,
18953                        idx: 1,
18954                        role: MessageRole::Agent,
18955                        author: None,
18956                        created_at: Some(1_700_000_000_100),
18957                        content: "second".into(),
18958                        extra_json: serde_json::Value::Null,
18959                        snippets: Vec::new(),
18960                    },
18961                    Message {
18962                        id: None,
18963                        idx: 2,
18964                        role: MessageRole::User,
18965                        author: None,
18966                        created_at: Some(1_700_000_000_200),
18967                        content: "third".into(),
18968                        extra_json: serde_json::Value::Null,
18969                        snippets: Vec::new(),
18970                    },
18971                ]),
18972            )
18973            .unwrap();
18974
18975        assert_eq!(first.conversation_id, second.conversation_id);
18976        assert_eq!(first.inserted_indices, vec![0, 1]);
18977        assert_eq!(second.inserted_indices, vec![2]);
18978
18979        let stored_indices: Vec<i64> = storage
18980            .conn
18981            .query_map_collect("SELECT idx FROM messages ORDER BY idx", fparams![], |row| {
18982                row.get_typed(0)
18983            })
18984            .unwrap();
18985        assert_eq!(stored_indices, vec![0, 1, 2]);
18986    }
18987
18988    #[test]
18989    fn insert_conversation_tree_merges_source_path_duplicates_with_start_drift() {
18990        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
18991        use std::path::PathBuf;
18992
18993        let dir = TempDir::new().unwrap();
18994        let db_path = dir.path().join("test.db");
18995        let storage = SqliteStorage::open(&db_path).unwrap();
18996
18997        let agent = Agent {
18998            id: None,
18999            slug: "codex".into(),
19000            name: "Codex".into(),
19001            version: Some("0.2.3".into()),
19002            kind: AgentKind::Cli,
19003        };
19004        let agent_id = storage.ensure_agent(&agent).unwrap();
19005
19006        let base_conv = |started_at: Option<i64>, messages: Vec<Message>| Conversation {
19007            id: None,
19008            agent_slug: "codex".into(),
19009            workspace: Some(PathBuf::from("/tmp/workspace")),
19010            external_id: None,
19011            title: Some("Drift Merge".into()),
19012            source_path: PathBuf::from("/tmp/drift-session.jsonl"),
19013            started_at,
19014            ended_at: Some(1_700_000_000_999),
19015            approx_tokens: None,
19016            metadata_json: serde_json::Value::Null,
19017            messages,
19018            source_id: "local".into(),
19019            origin_host: None,
19020        };
19021
19022        let first = storage
19023            .insert_conversation_tree(
19024                agent_id,
19025                None,
19026                &base_conv(
19027                    Some(1_700_000_000_000),
19028                    vec![
19029                        Message {
19030                            id: None,
19031                            idx: 0,
19032                            role: MessageRole::User,
19033                            author: None,
19034                            created_at: Some(1_700_000_000_000),
19035                            content: "first".into(),
19036                            extra_json: serde_json::Value::Null,
19037                            snippets: Vec::new(),
19038                        },
19039                        Message {
19040                            id: None,
19041                            idx: 1,
19042                            role: MessageRole::Agent,
19043                            author: None,
19044                            created_at: Some(1_700_000_000_100),
19045                            content: "second".into(),
19046                            extra_json: serde_json::Value::Null,
19047                            snippets: Vec::new(),
19048                        },
19049                    ],
19050                ),
19051            )
19052            .unwrap();
19053
19054        let second = storage
19055            .insert_conversation_tree(
19056                agent_id,
19057                None,
19058                &base_conv(
19059                    Some(1_700_000_004_000),
19060                    vec![
19061                        Message {
19062                            id: None,
19063                            idx: 1,
19064                            role: MessageRole::Agent,
19065                            author: None,
19066                            created_at: Some(1_700_000_000_100),
19067                            content: "second".into(),
19068                            extra_json: serde_json::Value::Null,
19069                            snippets: Vec::new(),
19070                        },
19071                        Message {
19072                            id: None,
19073                            idx: 2,
19074                            role: MessageRole::User,
19075                            author: None,
19076                            created_at: Some(1_700_000_004_200),
19077                            content: "third".into(),
19078                            extra_json: serde_json::Value::Null,
19079                            snippets: Vec::new(),
19080                        },
19081                    ],
19082                ),
19083            )
19084            .unwrap();
19085
19086        assert_eq!(first.conversation_id, second.conversation_id);
19087        assert_eq!(second.inserted_indices, vec![2]);
19088    }
19089
19090    #[test]
19091    fn insert_conversation_tree_keeps_single_message_overlap_sessions_separate() {
19092        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19093        use std::path::PathBuf;
19094
19095        let dir = TempDir::new().unwrap();
19096        let db_path = dir.path().join("test.db");
19097        let storage = SqliteStorage::open(&db_path).unwrap();
19098
19099        let agent = Agent {
19100            id: None,
19101            slug: "codex".into(),
19102            name: "Codex".into(),
19103            version: Some("0.2.3".into()),
19104            kind: AgentKind::Cli,
19105        };
19106        let agent_id = storage.ensure_agent(&agent).unwrap();
19107
19108        let make_conv = |started_at: i64, idx: i64, content: &str| Conversation {
19109            id: None,
19110            agent_slug: "codex".into(),
19111            workspace: Some(PathBuf::from("/tmp/workspace")),
19112            external_id: None,
19113            title: Some("Partial overlap".into()),
19114            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19115            started_at: Some(started_at),
19116            ended_at: Some(started_at + 500),
19117            approx_tokens: None,
19118            metadata_json: serde_json::Value::Null,
19119            messages: vec![Message {
19120                id: None,
19121                idx,
19122                role: MessageRole::User,
19123                author: None,
19124                created_at: Some(started_at),
19125                content: content.into(),
19126                extra_json: serde_json::Value::Null,
19127                snippets: Vec::new(),
19128            }],
19129            source_id: "local".into(),
19130            origin_host: None,
19131        };
19132
19133        storage
19134            .insert_conversation_tree(
19135                agent_id,
19136                None,
19137                &Conversation {
19138                    messages: vec![
19139                        Message {
19140                            id: None,
19141                            idx: 0,
19142                            role: MessageRole::User,
19143                            author: None,
19144                            created_at: Some(1_700_000_000_000),
19145                            content: "shared opener".into(),
19146                            extra_json: serde_json::Value::Null,
19147                            snippets: Vec::new(),
19148                        },
19149                        Message {
19150                            id: None,
19151                            idx: 1,
19152                            role: MessageRole::Agent,
19153                            author: None,
19154                            created_at: Some(1_700_000_000_100),
19155                            content: "first session unique".into(),
19156                            extra_json: serde_json::Value::Null,
19157                            snippets: Vec::new(),
19158                        },
19159                    ],
19160                    ..make_conv(1_700_000_000_000, 0, "unused")
19161                },
19162            )
19163            .unwrap();
19164        storage
19165            .insert_conversation_tree(
19166                agent_id,
19167                None,
19168                &make_conv(1_700_000_900_000, 0, "shared opener"),
19169            )
19170            .unwrap();
19171
19172        let conversation_count: i64 = storage
19173            .conn
19174            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19175                row.get_typed(0)
19176            })
19177            .unwrap();
19178        assert_eq!(conversation_count, 2);
19179    }
19180
19181    #[test]
19182    fn insert_conversation_tree_keeps_distinct_source_path_sessions_separate() {
19183        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19184        use std::path::PathBuf;
19185
19186        let dir = TempDir::new().unwrap();
19187        let db_path = dir.path().join("test.db");
19188        let storage = SqliteStorage::open(&db_path).unwrap();
19189
19190        let agent = Agent {
19191            id: None,
19192            slug: "codex".into(),
19193            name: "Codex".into(),
19194            version: Some("0.2.3".into()),
19195            kind: AgentKind::Cli,
19196        };
19197        let agent_id = storage.ensure_agent(&agent).unwrap();
19198
19199        let make_conv = |started_at: i64, created_at: i64, content: &str| Conversation {
19200            id: None,
19201            agent_slug: "codex".into(),
19202            workspace: Some(PathBuf::from("/tmp/workspace")),
19203            external_id: None,
19204            title: Some("Same Path Different Session".into()),
19205            source_path: PathBuf::from("/tmp/reused-session.jsonl"),
19206            started_at: Some(started_at),
19207            ended_at: Some(started_at + 500),
19208            approx_tokens: None,
19209            metadata_json: serde_json::Value::Null,
19210            messages: vec![Message {
19211                id: None,
19212                idx: 0,
19213                role: MessageRole::User,
19214                author: None,
19215                created_at: Some(created_at),
19216                content: content.into(),
19217                extra_json: serde_json::Value::Null,
19218                snippets: Vec::new(),
19219            }],
19220            source_id: "local".into(),
19221            origin_host: None,
19222        };
19223
19224        storage
19225            .insert_conversation_tree(
19226                agent_id,
19227                None,
19228                &make_conv(1_700_000_000_000, 1_700_000_000_000, "first session"),
19229            )
19230            .unwrap();
19231        storage
19232            .insert_conversation_tree(
19233                agent_id,
19234                None,
19235                &make_conv(1_700_000_900_000, 1_700_000_900_000, "second session"),
19236            )
19237            .unwrap();
19238
19239        let conversation_count: i64 = storage
19240            .conn
19241            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
19242                row.get_typed(0)
19243            })
19244            .unwrap();
19245        assert_eq!(conversation_count, 2);
19246    }
19247
19248    #[test]
19249    fn insert_conversation_tree_merges_replay_equivalent_messages_with_shifted_idx() {
19250        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19251        use std::path::PathBuf;
19252
19253        let dir = TempDir::new().unwrap();
19254        let db_path = dir.path().join("test.db");
19255        let storage = SqliteStorage::open(&db_path).unwrap();
19256
19257        let agent = Agent {
19258            id: None,
19259            slug: "codex".into(),
19260            name: "Codex".into(),
19261            version: Some("0.2.3".into()),
19262            kind: AgentKind::Cli,
19263        };
19264        let agent_id = storage.ensure_agent(&agent).unwrap();
19265
19266        let make_conv = |started_at: i64, messages: Vec<Message>| Conversation {
19267            id: None,
19268            agent_slug: "codex".into(),
19269            workspace: Some(PathBuf::from("/tmp/workspace")),
19270            external_id: None,
19271            title: Some("Shifted replay".into()),
19272            source_path: PathBuf::from("/tmp/replay-session.jsonl"),
19273            started_at: Some(started_at),
19274            ended_at: Some(started_at + 500),
19275            approx_tokens: None,
19276            metadata_json: serde_json::Value::Null,
19277            messages,
19278            source_id: "local".into(),
19279            origin_host: None,
19280        };
19281
19282        let first = storage
19283            .insert_conversation_tree(
19284                agent_id,
19285                None,
19286                &make_conv(
19287                    1_700_000_000_000,
19288                    vec![
19289                        Message {
19290                            id: None,
19291                            idx: 0,
19292                            role: MessageRole::User,
19293                            author: None,
19294                            created_at: Some(1_700_000_000_000),
19295                            content: "first".into(),
19296                            extra_json: serde_json::Value::Null,
19297                            snippets: Vec::new(),
19298                        },
19299                        Message {
19300                            id: None,
19301                            idx: 1,
19302                            role: MessageRole::Agent,
19303                            author: None,
19304                            created_at: Some(1_700_000_000_100),
19305                            content: "second".into(),
19306                            extra_json: serde_json::Value::Null,
19307                            snippets: Vec::new(),
19308                        },
19309                    ],
19310                ),
19311            )
19312            .unwrap();
19313
19314        let second = storage
19315            .insert_conversation_tree(
19316                agent_id,
19317                None,
19318                &make_conv(
19319                    1_700_000_900_000,
19320                    vec![
19321                        Message {
19322                            id: None,
19323                            idx: 10,
19324                            role: MessageRole::User,
19325                            author: None,
19326                            created_at: Some(1_700_000_000_000),
19327                            content: "first".into(),
19328                            extra_json: serde_json::Value::Null,
19329                            snippets: Vec::new(),
19330                        },
19331                        Message {
19332                            id: None,
19333                            idx: 11,
19334                            role: MessageRole::Agent,
19335                            author: None,
19336                            created_at: Some(1_700_000_000_100),
19337                            content: "second".into(),
19338                            extra_json: serde_json::Value::Null,
19339                            snippets: Vec::new(),
19340                        },
19341                        Message {
19342                            id: None,
19343                            idx: 12,
19344                            role: MessageRole::User,
19345                            author: None,
19346                            created_at: Some(1_700_000_000_200),
19347                            content: "third".into(),
19348                            extra_json: serde_json::Value::Null,
19349                            snippets: Vec::new(),
19350                        },
19351                    ],
19352                ),
19353            )
19354            .unwrap();
19355
19356        assert_eq!(first.conversation_id, second.conversation_id);
19357        assert_eq!(second.inserted_indices, vec![12]);
19358
19359        let stored_indices: Vec<i64> = storage
19360            .conn
19361            .query_map_collect(
19362                "SELECT idx FROM messages WHERE conversation_id = ?1 ORDER BY idx",
19363                fparams![first.conversation_id],
19364                |row| row.get_typed(0),
19365            )
19366            .unwrap();
19367        assert_eq!(stored_indices, vec![0, 1, 12]);
19368    }
19369
19370    #[test]
19371    fn salvage_historical_databases_imports_backups_once_and_merges_overlap() {
19372        use crate::model::types::{Conversation, Message, MessageRole};
19373        use std::path::PathBuf;
19374
19375        fn base_conv(source_path: &str, messages: Vec<Message>) -> Conversation {
19376            Conversation {
19377                id: None,
19378                agent_slug: "codex".into(),
19379                workspace: Some(PathBuf::from("/tmp/workspace")),
19380                external_id: None,
19381                title: Some("Recovered".into()),
19382                source_path: PathBuf::from(source_path),
19383                started_at: Some(1_700_000_000_000),
19384                ended_at: Some(1_700_000_000_999),
19385                approx_tokens: None,
19386                metadata_json: serde_json::Value::Null,
19387                messages,
19388                source_id: "local".into(),
19389                origin_host: None,
19390            }
19391        }
19392
19393        let dir = TempDir::new().unwrap();
19394        let canonical_db = dir.path().join("agent_search.db");
19395        let storage = SqliteStorage::open(&canonical_db).unwrap();
19396
19397        let overlapping_a = base_conv(
19398            "/tmp/shared-history.jsonl",
19399            vec![
19400                Message {
19401                    id: None,
19402                    idx: 0,
19403                    role: MessageRole::User,
19404                    author: None,
19405                    created_at: Some(1_700_000_000_000),
19406                    content: "first".into(),
19407                    extra_json: serde_json::Value::Null,
19408                    snippets: Vec::new(),
19409                },
19410                Message {
19411                    id: None,
19412                    idx: 1,
19413                    role: MessageRole::Agent,
19414                    author: None,
19415                    created_at: Some(1_700_000_000_100),
19416                    content: "second".into(),
19417                    extra_json: serde_json::Value::Null,
19418                    snippets: Vec::new(),
19419                },
19420            ],
19421        );
19422        let overlapping_b = base_conv(
19423            "/tmp/shared-history.jsonl",
19424            vec![
19425                Message {
19426                    id: None,
19427                    idx: 1,
19428                    role: MessageRole::Agent,
19429                    author: None,
19430                    created_at: Some(1_700_000_000_100),
19431                    content: "second".into(),
19432                    extra_json: serde_json::Value::Null,
19433                    snippets: Vec::new(),
19434                },
19435                Message {
19436                    id: None,
19437                    idx: 2,
19438                    role: MessageRole::User,
19439                    author: None,
19440                    created_at: Some(1_700_000_000_200),
19441                    content: "third".into(),
19442                    extra_json: serde_json::Value::Null,
19443                    snippets: Vec::new(),
19444                },
19445            ],
19446        );
19447        let unique = Conversation {
19448            source_path: PathBuf::from("/tmp/unique-history.jsonl"),
19449            messages: vec![Message {
19450                id: None,
19451                idx: 0,
19452                role: MessageRole::User,
19453                author: None,
19454                created_at: Some(1_700_000_001_000),
19455                content: "unique".into(),
19456                extra_json: serde_json::Value::Null,
19457                snippets: Vec::new(),
19458            }],
19459            started_at: Some(1_700_000_001_000),
19460            ended_at: Some(1_700_000_001_100),
19461            ..base_conv("/tmp/unique-history.jsonl", Vec::new())
19462        };
19463
19464        seed_historical_db_direct(
19465            &dir.path()
19466                .join("backups/agent_search.db.20260322T020200.bak"),
19467            std::slice::from_ref(&overlapping_a),
19468        );
19469        seed_historical_db_direct(
19470            &dir.path().join("agent_search.corrupt.20260324_212907"),
19471            &[overlapping_b, unique],
19472        );
19473
19474        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19475        assert_eq!(first.bundles_considered, 2);
19476        assert_eq!(first.bundles_imported, 2);
19477        assert_eq!(first.messages_imported, 4);
19478
19479        let conversations = storage.list_conversations(10, 0).unwrap();
19480        assert_eq!(conversations.len(), 2);
19481
19482        let shared_id = conversations
19483            .iter()
19484            .find(|conv| conv.source_path == std::path::Path::new("/tmp/shared-history.jsonl"))
19485            .and_then(|conv| conv.id)
19486            .unwrap();
19487        let shared_indices: Vec<i64> = storage
19488            .fetch_messages(shared_id)
19489            .unwrap()
19490            .into_iter()
19491            .map(|msg| msg.idx)
19492            .collect();
19493        assert_eq!(shared_indices, vec![0, 1, 2]);
19494
19495        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19496        assert_eq!(second.bundles_imported, 0);
19497        assert_eq!(second.messages_imported, 0);
19498    }
19499
19500    #[test]
19501    fn salvage_historical_databases_normalizes_host_only_remote_provenance() {
19502        use crate::model::types::{Conversation, Message, MessageRole};
19503        use std::path::PathBuf;
19504
19505        let dir = TempDir::new().unwrap();
19506        let canonical_db = dir.path().join("agent_search.db");
19507        let storage = SqliteStorage::open(&canonical_db).unwrap();
19508
19509        let host_only_remote = Conversation {
19510            id: None,
19511            agent_slug: "codex".into(),
19512            workspace: Some(PathBuf::from("/tmp/workspace")),
19513            external_id: None,
19514            title: Some("Recovered Host Only Remote".into()),
19515            source_path: PathBuf::from("/tmp/host-only-history.jsonl"),
19516            started_at: Some(1_700_000_000_000),
19517            ended_at: Some(1_700_000_000_999),
19518            approx_tokens: None,
19519            metadata_json: serde_json::Value::Null,
19520            messages: vec![Message {
19521                id: None,
19522                idx: 0,
19523                role: MessageRole::User,
19524                author: None,
19525                created_at: Some(1_700_000_000_000),
19526                content: "host-only remote".into(),
19527                extra_json: serde_json::Value::Null,
19528                snippets: Vec::new(),
19529            }],
19530            source_id: "   ".into(),
19531            origin_host: Some("builder-5".into()),
19532        };
19533
19534        let historical_db = dir
19535            .path()
19536            .join("backups/agent_search.db.20260322T020200.bak");
19537        seed_historical_db_direct(&historical_db, std::slice::from_ref(&host_only_remote));
19538
19539        let historical_conn =
19540            FrankenConnection::open(historical_db.to_string_lossy().into_owned()).unwrap();
19541        historical_conn
19542            .execute_compat(
19543                "INSERT INTO sources(id, kind, host_label, created_at, updated_at) VALUES(?1, ?2, ?3, ?4, ?5)",
19544                fparams!["   ", "ssh", "builder-5", 0_i64, 0_i64],
19545            )
19546            .unwrap();
19547        historical_conn
19548            .execute_compat(
19549                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE source_path = ?3",
19550                fparams!["   ", "builder-5", "/tmp/host-only-history.jsonl"],
19551            )
19552            .unwrap();
19553        historical_conn
19554            .execute_compat("DELETE FROM sources WHERE id = ?1", fparams!["builder-5"])
19555            .unwrap();
19556        drop(historical_conn);
19557
19558        let first = storage.salvage_historical_databases(&canonical_db).unwrap();
19559        assert_eq!(first.bundles_imported, 1);
19560        assert_eq!(first.messages_imported, 1);
19561
19562        let source_ids = storage.get_source_ids().unwrap();
19563        assert_eq!(source_ids, vec!["builder-5".to_string()]);
19564
19565        let conversations = storage.list_conversations(10, 0).unwrap();
19566        assert_eq!(conversations.len(), 1);
19567        assert_eq!(conversations[0].source_id, "builder-5");
19568        assert_eq!(conversations[0].origin_host.as_deref(), Some("builder-5"));
19569    }
19570
19571    #[test]
19572    fn historical_salvage_retry_splits_single_conversation_until_it_fits() {
19573        use crate::model::types::{Conversation, Message, MessageRole};
19574        use std::path::PathBuf;
19575
19576        let mut attempts: Vec<Vec<usize>> = Vec::new();
19577        let entry = HistoricalBatchEntry {
19578            source_row_id: 77,
19579            agent_id: 1,
19580            workspace_id: None,
19581            conversation: Conversation {
19582                id: None,
19583                agent_slug: "gemini".into(),
19584                workspace: Some(PathBuf::from("/tmp/workspace")),
19585                external_id: Some("conv-77".into()),
19586                title: Some("Large recovered conversation".into()),
19587                source_path: PathBuf::from("/tmp/history.jsonl"),
19588                started_at: Some(1_700_000_000_000),
19589                ended_at: Some(1_700_000_000_999),
19590                approx_tokens: None,
19591                metadata_json: serde_json::Value::Null,
19592                messages: (0..4)
19593                    .map(|idx| Message {
19594                        id: None,
19595                        idx,
19596                        role: MessageRole::User,
19597                        author: None,
19598                        created_at: Some(1_700_000_000_000 + idx),
19599                        content: format!("message-{idx}"),
19600                        extra_json: serde_json::Value::Null,
19601                        snippets: Vec::new(),
19602                    })
19603                    .collect(),
19604                source_id: LOCAL_SOURCE_ID.into(),
19605                origin_host: None,
19606            },
19607        };
19608
19609        let totals = SqliteStorage::import_historical_batch_with_retry(
19610            std::slice::from_ref(&entry),
19611            &mut |batch| {
19612                attempts.push(
19613                    batch
19614                        .iter()
19615                        .map(|entry| entry.conversation.messages.len())
19616                        .collect(),
19617                );
19618                let total_messages: usize = batch
19619                    .iter()
19620                    .map(|entry| entry.conversation.messages.len())
19621                    .sum();
19622                if total_messages > 1 {
19623                    Err(anyhow!("out of memory"))
19624                } else {
19625                    Ok(HistoricalBatchImportTotals {
19626                        inserted_source_rows: batch.len(),
19627                        inserted_messages: total_messages,
19628                    })
19629                }
19630            },
19631        )
19632        .unwrap();
19633
19634        assert_eq!(
19635            totals,
19636            HistoricalBatchImportTotals {
19637                inserted_source_rows: 1,
19638                inserted_messages: 4,
19639            }
19640        );
19641        assert_eq!(attempts.first().cloned(), Some(vec![4]));
19642        assert!(
19643            attempts.iter().filter(|sizes| sizes == &&vec![1]).count() >= 4,
19644            "expected recursive fallback to reach one-message slices"
19645        );
19646    }
19647
19648    #[test]
19649    fn salvage_historical_databases_resumes_from_progress_checkpoint() {
19650        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19651        use std::path::PathBuf;
19652
19653        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19654            Conversation {
19655                id: None,
19656                agent_slug: "codex".into(),
19657                workspace: Some(PathBuf::from("/tmp/workspace")),
19658                external_id: Some(format!("conv-{idx_seed}")),
19659                title: Some(format!("Recovered {idx_seed}")),
19660                source_path: PathBuf::from(source_path),
19661                started_at: Some(1_700_000_000_000 + idx_seed),
19662                ended_at: Some(1_700_000_000_100 + idx_seed),
19663                approx_tokens: None,
19664                metadata_json: serde_json::Value::Null,
19665                messages: vec![Message {
19666                    id: None,
19667                    idx: 0,
19668                    role: MessageRole::User,
19669                    author: None,
19670                    created_at: Some(1_700_000_000_000 + idx_seed),
19671                    content: format!("message-{idx_seed}"),
19672                    extra_json: serde_json::Value::Null,
19673                    snippets: Vec::new(),
19674                }],
19675                source_id: LOCAL_SOURCE_ID.into(),
19676                origin_host: None,
19677            }
19678        }
19679
19680        let dir = TempDir::new().unwrap();
19681        let canonical_db = dir.path().join("agent_search.db");
19682        let backup_db = dir
19683            .path()
19684            .join("backups/agent_search.db.20260322T020200.bak");
19685        let storage = SqliteStorage::open(&canonical_db).unwrap();
19686        let conv_a = make_conv("/tmp/one.jsonl", 1);
19687        let conv_b = make_conv("/tmp/two.jsonl", 2);
19688        let conv_c = make_conv("/tmp/three.jsonl", 3);
19689        seed_historical_db_direct(
19690            &backup_db,
19691            &[conv_a.clone(), conv_b.clone(), conv_c.clone()],
19692        );
19693
19694        let agent = Agent {
19695            id: None,
19696            slug: "codex".into(),
19697            name: "Codex".into(),
19698            version: Some("0.2.3".into()),
19699            kind: AgentKind::Cli,
19700        };
19701        let agent_id = storage.ensure_agent(&agent).unwrap();
19702        storage
19703            .insert_conversation_tree(agent_id, None, &conv_a)
19704            .unwrap();
19705
19706        let bundle = discover_historical_database_bundles(&canonical_db)
19707            .into_iter()
19708            .find(|bundle| bundle.root_path == backup_db)
19709            .unwrap();
19710        let first_row_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19711            .unwrap()
19712            .query_row_map(
19713                "SELECT id FROM conversations WHERE source_path = ?1",
19714                fparams!["/tmp/one.jsonl"],
19715                |row| row.get_typed(0),
19716            )
19717            .unwrap();
19718        storage
19719            .record_historical_bundle_progress(&bundle, "direct-readonly", first_row_id, 50, 99)
19720            .unwrap();
19721
19722        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19723        assert_eq!(outcome.bundles_imported, 1);
19724        assert_eq!(outcome.conversations_imported, 52);
19725        assert_eq!(outcome.messages_imported, 101);
19726        assert_eq!(storage.list_conversations(10, 0).unwrap().len(), 3);
19727
19728        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19729        let progress_left: Option<String> = storage
19730            .conn
19731            .query_row_map(
19732                "SELECT value FROM meta WHERE key = ?1",
19733                fparams![progress_key.as_str()],
19734                |row| row.get_typed(0),
19735            )
19736            .optional()
19737            .unwrap();
19738        assert!(
19739            progress_left.is_none(),
19740            "completed salvage should clear bundle progress"
19741        );
19742
19743        let second = storage.salvage_historical_databases(&canonical_db).unwrap();
19744        assert_eq!(second.bundles_imported, 0);
19745        assert_eq!(second.messages_imported, 0);
19746    }
19747
19748    #[test]
19749    fn salvage_historical_databases_skips_bundle_when_checkpoint_covers_backup() {
19750        // Regression for issue #247 (coding_agent_session_search-r8pcy): a bundle
19751        // whose progress checkpoint already covers the backup's entire conversation
19752        // row-id space (daemon OOM-killed after the last batch committed but before
19753        // the completion ledger marker landed) must be ledgered + skipped, not
19754        // re-scanned O(n) with imported=0 every batch.
19755        use crate::model::types::{Conversation, Message, MessageRole};
19756        use std::path::PathBuf;
19757
19758        fn make_conv(source_path: &str, idx_seed: i64) -> Conversation {
19759            Conversation {
19760                id: None,
19761                agent_slug: "codex".into(),
19762                workspace: Some(PathBuf::from("/tmp/workspace")),
19763                external_id: Some(format!("conv-{idx_seed}")),
19764                title: Some(format!("Recovered {idx_seed}")),
19765                source_path: PathBuf::from(source_path),
19766                started_at: Some(1_700_000_000_000 + idx_seed),
19767                ended_at: Some(1_700_000_000_100 + idx_seed),
19768                approx_tokens: None,
19769                metadata_json: serde_json::Value::Null,
19770                messages: vec![Message {
19771                    id: None,
19772                    idx: 0,
19773                    role: MessageRole::User,
19774                    author: None,
19775                    created_at: Some(1_700_000_000_000 + idx_seed),
19776                    content: format!("message-{idx_seed}"),
19777                    extra_json: serde_json::Value::Null,
19778                    snippets: Vec::new(),
19779                }],
19780                source_id: LOCAL_SOURCE_ID.into(),
19781                origin_host: None,
19782            }
19783        }
19784
19785        let dir = TempDir::new().unwrap();
19786        let canonical_db = dir.path().join("agent_search.db");
19787        let backup_db = dir
19788            .path()
19789            .join("backups/agent_search.db.20260322T020200.bak");
19790        let storage = SqliteStorage::open(&canonical_db).unwrap();
19791        seed_historical_db_direct(
19792            &backup_db,
19793            &[
19794                make_conv("/tmp/one.jsonl", 1),
19795                make_conv("/tmp/two.jsonl", 2),
19796                make_conv("/tmp/three.jsonl", 3),
19797            ],
19798        );
19799
19800        let bundle = discover_historical_database_bundles(&canonical_db)
19801            .into_iter()
19802            .find(|bundle| bundle.root_path == backup_db)
19803            .unwrap();
19804
19805        // Checkpoint high-water mark == backup's max conversation id.
19806        let backup_max_id: i64 = FrankenConnection::open(backup_db.to_string_lossy().into_owned())
19807            .unwrap()
19808            .query_row_map(
19809                "SELECT COALESCE(MAX(id), 0) FROM conversations",
19810                fparams![],
19811                |row| row.get_typed(0),
19812            )
19813            .unwrap();
19814        assert!(backup_max_id > 0, "seeded backup should have conversations");
19815        storage
19816            .record_historical_bundle_progress(&bundle, "direct-readonly", backup_max_id, 3, 3)
19817            .unwrap();
19818
19819        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
19820        assert_eq!(
19821            outcome.bundles_imported, 0,
19822            "fully-checkpointed bundle must not be re-scanned"
19823        );
19824        assert_eq!(outcome.conversations_imported, 0);
19825        assert_eq!(outcome.messages_imported, 0);
19826        assert_eq!(
19827            storage.list_conversations(10, 0).unwrap().len(),
19828            0,
19829            "skip path must not import anything"
19830        );
19831        assert!(
19832            storage.historical_bundle_already_imported(&bundle).unwrap(),
19833            "skipped bundle must be ledgered as salvaged so future runs short-circuit"
19834        );
19835
19836        let progress_key = SqliteStorage::historical_bundle_progress_key(&bundle);
19837        let progress_left: Option<String> = storage
19838            .conn
19839            .query_row_map(
19840                "SELECT value FROM meta WHERE key = ?1",
19841                fparams![progress_key.as_str()],
19842                |row| row.get_typed(0),
19843            )
19844            .optional()
19845            .unwrap();
19846        assert!(
19847            progress_left.is_none(),
19848            "skip path must clear the bundle progress checkpoint"
19849        );
19850    }
19851
19852    #[test]
19853    fn list_conversations_for_lexical_rebuild_uses_stable_id_order() {
19854        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19855        use std::path::PathBuf;
19856
19857        let dir = TempDir::new().unwrap();
19858        let db_path = dir.path().join("agent_search.db");
19859        let storage = SqliteStorage::open(&db_path).unwrap();
19860        let agent = Agent {
19861            id: None,
19862            slug: "codex".into(),
19863            name: "Codex".into(),
19864            version: Some("0.2.3".into()),
19865            kind: AgentKind::Cli,
19866        };
19867        let agent_id = storage.ensure_agent(&agent).unwrap();
19868
19869        let make_conv = |source_path: &str, started_at: i64| Conversation {
19870            id: None,
19871            agent_slug: "codex".into(),
19872            workspace: Some(PathBuf::from("/tmp/workspace")),
19873            external_id: Some(source_path.to_string()),
19874            title: Some(source_path.to_string()),
19875            source_path: PathBuf::from(source_path),
19876            started_at: Some(started_at),
19877            ended_at: Some(started_at + 1),
19878            approx_tokens: None,
19879            metadata_json: serde_json::Value::Null,
19880            messages: vec![Message {
19881                id: None,
19882                idx: 0,
19883                role: MessageRole::User,
19884                author: None,
19885                created_at: Some(started_at),
19886                content: format!("message for {source_path}"),
19887                extra_json: serde_json::Value::Null,
19888                snippets: Vec::new(),
19889            }],
19890            source_id: LOCAL_SOURCE_ID.into(),
19891            origin_host: None,
19892        };
19893
19894        let conv_a = make_conv("/tmp/a.jsonl", 3_000);
19895        let conv_b = make_conv("/tmp/b.jsonl", 1_000);
19896        let conv_c = make_conv("/tmp/c.jsonl", 2_000);
19897
19898        storage
19899            .insert_conversation_tree(agent_id, None, &conv_a)
19900            .unwrap();
19901        storage
19902            .insert_conversation_tree(agent_id, None, &conv_b)
19903            .unwrap();
19904        storage
19905            .insert_conversation_tree(agent_id, None, &conv_c)
19906            .unwrap();
19907
19908        let user_order: Vec<PathBuf> = storage
19909            .list_conversations(10, 0)
19910            .unwrap()
19911            .into_iter()
19912            .map(|conv| conv.source_path)
19913            .collect();
19914        assert_eq!(
19915            user_order,
19916            vec![
19917                PathBuf::from("/tmp/a.jsonl"),
19918                PathBuf::from("/tmp/c.jsonl"),
19919                PathBuf::from("/tmp/b.jsonl"),
19920            ]
19921        );
19922
19923        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
19924        let rebuild_order: Vec<PathBuf> = storage
19925            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
19926            .unwrap()
19927            .into_iter()
19928            .map(|conv| conv.source_path)
19929            .collect();
19930        assert_eq!(
19931            rebuild_order,
19932            vec![
19933                PathBuf::from("/tmp/a.jsonl"),
19934                PathBuf::from("/tmp/b.jsonl"),
19935                PathBuf::from("/tmp/c.jsonl"),
19936            ]
19937        );
19938
19939        let first_page = storage
19940            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
19941            .unwrap();
19942        let first_page_paths: Vec<PathBuf> = first_page
19943            .iter()
19944            .map(|conv| conv.source_path.clone())
19945            .collect();
19946        assert_eq!(
19947            first_page_paths,
19948            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
19949        );
19950
19951        let second_page = storage
19952            .list_conversations_for_lexical_rebuild_after_id(
19953                2,
19954                first_page
19955                    .last()
19956                    .and_then(|conv| conv.id)
19957                    .expect("first page should include an id"),
19958                &agent_slugs,
19959                &workspace_paths,
19960            )
19961            .unwrap();
19962        let second_page_paths: Vec<PathBuf> = second_page
19963            .iter()
19964            .map(|conv| conv.source_path.clone())
19965            .collect();
19966        assert_eq!(second_page_paths, vec![PathBuf::from("/tmp/c.jsonl")]);
19967
19968        let bounded_page = storage
19969            .list_conversations_for_lexical_rebuild_after_id_through_id(
19970                10,
19971                0,
19972                first_page
19973                    .last()
19974                    .and_then(|conv| conv.id)
19975                    .expect("first page should include an id"),
19976                &agent_slugs,
19977                &workspace_paths,
19978            )
19979            .unwrap();
19980        let bounded_paths: Vec<PathBuf> = bounded_page
19981            .iter()
19982            .map(|conv| conv.source_path.clone())
19983            .collect();
19984        assert_eq!(
19985            bounded_paths,
19986            vec![PathBuf::from("/tmp/a.jsonl"), PathBuf::from("/tmp/b.jsonl")]
19987        );
19988    }
19989
19990    #[test]
19991    fn keyset_traversal_handles_sparse_holey_conversation_ids() {
19992        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
19993        use std::path::PathBuf;
19994
19995        let dir = TempDir::new().unwrap();
19996        let db_path = dir.path().join("agent_search.db");
19997        let storage = SqliteStorage::open(&db_path).unwrap();
19998        let agent = Agent {
19999            id: None,
20000            slug: "codex".into(),
20001            name: "Codex".into(),
20002            version: Some("0.2.3".into()),
20003            kind: AgentKind::Cli,
20004        };
20005        let agent_id = storage.ensure_agent(&agent).unwrap();
20006
20007        let make_conv = |label: &str, ts: i64| Conversation {
20008            id: None,
20009            agent_slug: "codex".into(),
20010            workspace: Some(PathBuf::from("/tmp/workspace")),
20011            external_id: Some(label.to_string()),
20012            title: Some(label.to_string()),
20013            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20014            started_at: Some(ts),
20015            ended_at: Some(ts + 1),
20016            approx_tokens: None,
20017            metadata_json: serde_json::Value::Null,
20018            messages: vec![Message {
20019                id: None,
20020                idx: 0,
20021                role: MessageRole::User,
20022                author: None,
20023                created_at: Some(ts),
20024                content: format!("msg for {label}"),
20025                extra_json: serde_json::Value::Null,
20026                snippets: Vec::new(),
20027            }],
20028            source_id: LOCAL_SOURCE_ID.into(),
20029            origin_host: None,
20030        };
20031
20032        for i in 0..6 {
20033            storage
20034                .insert_conversation_tree(
20035                    agent_id,
20036                    None,
20037                    &make_conv(&format!("conv-{i}"), 1000 + i),
20038                )
20039                .unwrap();
20040        }
20041
20042        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20043        storage
20044            .conn
20045            .execute_compat("DELETE FROM conversations WHERE id IN (2, 4)", fparams![])
20046            .unwrap();
20047        storage
20048            .conn
20049            .execute_compat(
20050                "DELETE FROM messages WHERE conversation_id IN (2, 4)",
20051                fparams![],
20052            )
20053            .unwrap();
20054        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20055
20056        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20057
20058        let page1 = storage
20059            .list_conversations_for_lexical_rebuild_after_id(2, 0, &agent_slugs, &workspace_paths)
20060            .unwrap();
20061        assert_eq!(page1.len(), 2);
20062        let page1_ids: Vec<i64> = page1.iter().map(|c| c.id.unwrap()).collect();
20063        assert_eq!(page1_ids, vec![1, 3]);
20064
20065        let page2 = storage
20066            .list_conversations_for_lexical_rebuild_after_id(
20067                2,
20068                *page1_ids.last().unwrap(),
20069                &agent_slugs,
20070                &workspace_paths,
20071            )
20072            .unwrap();
20073        assert_eq!(page2.len(), 2);
20074        let page2_ids: Vec<i64> = page2.iter().map(|c| c.id.unwrap()).collect();
20075        assert_eq!(page2_ids, vec![5, 6]);
20076
20077        let page3 = storage
20078            .list_conversations_for_lexical_rebuild_after_id(
20079                2,
20080                *page2_ids.last().unwrap(),
20081                &agent_slugs,
20082                &workspace_paths,
20083            )
20084            .unwrap();
20085        assert!(page3.is_empty());
20086
20087        let all_ids: Vec<i64> = page1_ids.iter().chain(page2_ids.iter()).copied().collect();
20088        assert_eq!(all_ids, vec![1, 3, 5, 6]);
20089    }
20090
20091    #[test]
20092    fn keyset_traversal_through_id_with_sparse_ranges() {
20093        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20094        use std::path::PathBuf;
20095
20096        let dir = TempDir::new().unwrap();
20097        let db_path = dir.path().join("agent_search.db");
20098        let storage = SqliteStorage::open(&db_path).unwrap();
20099        let agent = Agent {
20100            id: None,
20101            slug: "codex".into(),
20102            name: "Codex".into(),
20103            version: Some("0.2.3".into()),
20104            kind: AgentKind::Cli,
20105        };
20106        let agent_id = storage.ensure_agent(&agent).unwrap();
20107
20108        let make_conv = |label: &str, ts: i64| Conversation {
20109            id: None,
20110            agent_slug: "codex".into(),
20111            workspace: Some(PathBuf::from("/tmp/workspace")),
20112            external_id: Some(label.to_string()),
20113            title: Some(label.to_string()),
20114            source_path: PathBuf::from(format!("/tmp/{label}.jsonl")),
20115            started_at: Some(ts),
20116            ended_at: Some(ts + 1),
20117            approx_tokens: None,
20118            metadata_json: serde_json::Value::Null,
20119            messages: vec![Message {
20120                id: None,
20121                idx: 0,
20122                role: MessageRole::User,
20123                author: None,
20124                created_at: Some(ts),
20125                content: format!("msg for {label}"),
20126                extra_json: serde_json::Value::Null,
20127                snippets: Vec::new(),
20128            }],
20129            source_id: LOCAL_SOURCE_ID.into(),
20130            origin_host: None,
20131        };
20132
20133        for i in 0..10 {
20134            storage
20135                .insert_conversation_tree(
20136                    agent_id,
20137                    None,
20138                    &make_conv(&format!("conv-{i}"), 1000 + i),
20139                )
20140                .unwrap();
20141        }
20142
20143        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20144        storage
20145            .conn
20146            .execute_compat(
20147                "DELETE FROM conversations WHERE id IN (3, 5, 7, 8)",
20148                fparams![],
20149            )
20150            .unwrap();
20151        storage
20152            .conn
20153            .execute_compat(
20154                "DELETE FROM messages WHERE conversation_id IN (3, 5, 7, 8)",
20155                fparams![],
20156            )
20157            .unwrap();
20158        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20159
20160        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20161
20162        let through_5 = storage
20163            .list_conversations_for_lexical_rebuild_after_id_through_id(
20164                100,
20165                0,
20166                5,
20167                &agent_slugs,
20168                &workspace_paths,
20169            )
20170            .unwrap();
20171        let through_5_ids: Vec<i64> = through_5.iter().map(|c| c.id.unwrap()).collect();
20172        assert_eq!(through_5_ids, vec![1, 2, 4]);
20173
20174        let after_4_through_10 = storage
20175            .list_conversations_for_lexical_rebuild_after_id_through_id(
20176                100,
20177                4,
20178                10,
20179                &agent_slugs,
20180                &workspace_paths,
20181            )
20182            .unwrap();
20183        let ids: Vec<i64> = after_4_through_10.iter().map(|c| c.id.unwrap()).collect();
20184        assert_eq!(ids, vec![6, 9, 10]);
20185
20186        let after_10 = storage
20187            .list_conversations_for_lexical_rebuild_after_id_through_id(
20188                100,
20189                10,
20190                20,
20191                &agent_slugs,
20192                &workspace_paths,
20193            )
20194            .unwrap();
20195        assert!(after_10.is_empty());
20196    }
20197
20198    #[test]
20199    fn list_conversation_footprints_for_lexical_rebuild_estimates_bytes_and_keeps_empty_conversations()
20200     {
20201        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20202        use std::path::PathBuf;
20203
20204        let dir = TempDir::new().unwrap();
20205        let db_path = dir.path().join("agent_search.db");
20206        let storage = SqliteStorage::open(&db_path).unwrap();
20207        let agent = Agent {
20208            id: None,
20209            slug: "codex".into(),
20210            name: "Codex".into(),
20211            version: Some("0.2.3".into()),
20212            kind: AgentKind::Cli,
20213        };
20214        let agent_id = storage.ensure_agent(&agent).unwrap();
20215
20216        let insert = |external_id: &str, base_ts: i64, messages: Vec<Message>| {
20217            storage
20218                .insert_conversation_tree(
20219                    agent_id,
20220                    None,
20221                    &Conversation {
20222                        id: None,
20223                        agent_slug: "codex".into(),
20224                        workspace: Some(PathBuf::from("/tmp/workspace")),
20225                        external_id: Some(external_id.to_string()),
20226                        title: Some(external_id.to_string()),
20227                        source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
20228                        started_at: Some(base_ts),
20229                        ended_at: Some(base_ts + 100),
20230                        approx_tokens: None,
20231                        metadata_json: serde_json::Value::Null,
20232                        messages,
20233                        source_id: LOCAL_SOURCE_ID.into(),
20234                        origin_host: None,
20235                    },
20236                )
20237                .unwrap()
20238                .conversation_id
20239        };
20240
20241        let ascii_id = insert(
20242            "footprint-ascii",
20243            1_700_000_000_000,
20244            vec![
20245                Message {
20246                    id: None,
20247                    idx: 0,
20248                    role: MessageRole::User,
20249                    author: None,
20250                    created_at: Some(1_700_000_000_001),
20251                    content: "abc".into(),
20252                    extra_json: serde_json::Value::Null,
20253                    snippets: Vec::new(),
20254                },
20255                Message {
20256                    id: None,
20257                    idx: 1,
20258                    role: MessageRole::Agent,
20259                    author: None,
20260                    created_at: Some(1_700_000_000_002),
20261                    content: "defg".into(),
20262                    extra_json: serde_json::Value::Null,
20263                    snippets: Vec::new(),
20264                },
20265            ],
20266        );
20267        let empty_id = insert("footprint-empty", 1_700_000_001_000, Vec::new());
20268        let utf8_id = insert(
20269            "footprint-utf8",
20270            1_700_000_002_000,
20271            vec![Message {
20272                id: None,
20273                idx: 0,
20274                role: MessageRole::Tool,
20275                author: None,
20276                created_at: Some(1_700_000_002_001),
20277                content: "hé🙂".into(),
20278                extra_json: serde_json::Value::Null,
20279                snippets: Vec::new(),
20280            }],
20281        );
20282        let sparse_id = insert(
20283            "footprint-sparse",
20284            1_700_000_003_000,
20285            vec![Message {
20286                id: None,
20287                idx: 10,
20288                role: MessageRole::User,
20289                author: None,
20290                created_at: Some(1_700_000_003_010),
20291                content: "sparse".into(),
20292                extra_json: serde_json::Value::Null,
20293                snippets: Vec::new(),
20294            }],
20295        );
20296        storage
20297            .conn
20298            .execute_compat(
20299                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20300                fparams![utf8_id],
20301            )
20302            .unwrap();
20303
20304        let footprints = storage
20305            .list_conversation_footprints_for_lexical_rebuild()
20306            .unwrap();
20307        assert_eq!(
20308            footprints,
20309            vec![
20310                LexicalRebuildConversationFootprintRow {
20311                    conversation_id: ascii_id,
20312                    message_count: 2,
20313                    message_bytes: 2 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20314                },
20315                LexicalRebuildConversationFootprintRow {
20316                    conversation_id: empty_id,
20317                    message_count: 0,
20318                    message_bytes: 0,
20319                },
20320                LexicalRebuildConversationFootprintRow {
20321                    conversation_id: utf8_id,
20322                    message_count: 1,
20323                    message_bytes: LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20324                },
20325                LexicalRebuildConversationFootprintRow {
20326                    conversation_id: sparse_id,
20327                    message_count: 11,
20328                    message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20329                },
20330            ]
20331        );
20332    }
20333
20334    #[test]
20335    fn list_conversation_footprints_for_lexical_rebuild_falls_back_for_missing_tail_cache() {
20336        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20337        use std::path::PathBuf;
20338
20339        let dir = TempDir::new().unwrap();
20340        let db_path = dir.path().join("agent_search.db");
20341        let storage = SqliteStorage::open(&db_path).unwrap();
20342        let agent = Agent {
20343            id: None,
20344            slug: "codex".into(),
20345            name: "Codex".into(),
20346            version: Some("0.2.3".into()),
20347            kind: AgentKind::Cli,
20348        };
20349        let agent_id = storage.ensure_agent(&agent).unwrap();
20350        let conversation_id = storage
20351            .insert_conversation_tree(
20352                agent_id,
20353                None,
20354                &Conversation {
20355                    id: None,
20356                    agent_slug: "codex".into(),
20357                    workspace: Some(PathBuf::from("/tmp/workspace")),
20358                    external_id: Some("footprint-missing-tail".to_string()),
20359                    title: Some("footprint-missing-tail".to_string()),
20360                    source_path: PathBuf::from("/tmp/footprint-missing-tail.jsonl"),
20361                    started_at: Some(1_700_000_000_000),
20362                    ended_at: Some(1_700_000_000_100),
20363                    approx_tokens: None,
20364                    metadata_json: serde_json::Value::Null,
20365                    messages: vec![Message {
20366                        id: None,
20367                        idx: 10,
20368                        role: MessageRole::User,
20369                        author: None,
20370                        created_at: Some(1_700_000_000_010),
20371                        content: "legacy sparse tail".into(),
20372                        extra_json: serde_json::Value::Null,
20373                        snippets: Vec::new(),
20374                    }],
20375                    source_id: LOCAL_SOURCE_ID.into(),
20376                    origin_host: None,
20377                },
20378            )
20379            .unwrap()
20380            .conversation_id;
20381
20382        storage
20383            .conn
20384            .execute_compat(
20385                "UPDATE conversations
20386                 SET last_message_idx = NULL, last_message_created_at = NULL
20387                 WHERE id = ?1",
20388                fparams![conversation_id],
20389            )
20390            .unwrap();
20391        storage
20392            .conn
20393            .execute_compat(
20394                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
20395                fparams![conversation_id],
20396            )
20397            .unwrap();
20398
20399        let footprints = storage
20400            .list_conversation_footprints_for_lexical_rebuild()
20401            .unwrap();
20402
20403        assert_eq!(
20404            footprints,
20405            vec![LexicalRebuildConversationFootprintRow {
20406                conversation_id,
20407                message_count: 11,
20408                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20409            }],
20410            "missing tail-cache metadata should fall back to messages MAX(idx) instead of treating legacy conversations as empty"
20411        );
20412    }
20413
20414    #[test]
20415    fn list_conversation_footprints_for_lexical_rebuild_raises_stale_low_tail_cache() {
20416        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20417        use std::path::PathBuf;
20418
20419        let dir = TempDir::new().unwrap();
20420        let db_path = dir.path().join("agent_search.db");
20421        let storage = SqliteStorage::open(&db_path).unwrap();
20422        let agent = Agent {
20423            id: None,
20424            slug: "codex".into(),
20425            name: "Codex".into(),
20426            version: Some("0.2.3".into()),
20427            kind: AgentKind::Cli,
20428        };
20429        let agent_id = storage.ensure_agent(&agent).unwrap();
20430        let conversation_id = storage
20431            .insert_conversation_tree(
20432                agent_id,
20433                None,
20434                &Conversation {
20435                    id: None,
20436                    agent_slug: "codex".into(),
20437                    workspace: Some(PathBuf::from("/tmp/workspace")),
20438                    external_id: Some("footprint-stale-tail".to_string()),
20439                    title: Some("footprint-stale-tail".to_string()),
20440                    source_path: PathBuf::from("/tmp/footprint-stale-tail.jsonl"),
20441                    started_at: Some(1_700_000_000_000),
20442                    ended_at: Some(1_700_000_000_100),
20443                    approx_tokens: None,
20444                    metadata_json: serde_json::Value::Null,
20445                    messages: (0..3)
20446                        .map(|idx| Message {
20447                            id: None,
20448                            idx,
20449                            role: MessageRole::User,
20450                            author: None,
20451                            created_at: Some(1_700_000_000_010 + idx),
20452                            content: format!("message {idx}"),
20453                            extra_json: serde_json::Value::Null,
20454                            snippets: Vec::new(),
20455                        })
20456                        .collect(),
20457                    source_id: LOCAL_SOURCE_ID.into(),
20458                    origin_host: None,
20459                },
20460            )
20461            .unwrap()
20462            .conversation_id;
20463
20464        storage
20465            .conn
20466            .execute_compat(
20467                "UPDATE conversations
20468                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20469                 WHERE id = ?1",
20470                fparams![conversation_id],
20471            )
20472            .unwrap();
20473        storage
20474            .conn
20475            .execute_compat(
20476                "UPDATE conversation_tail_state
20477                 SET last_message_idx = 0, last_message_created_at = 1700000000010
20478                 WHERE conversation_id = ?1",
20479                fparams![conversation_id],
20480            )
20481            .unwrap();
20482
20483        let footprints = storage
20484            .list_conversation_footprints_for_lexical_rebuild()
20485            .unwrap();
20486
20487        assert_eq!(
20488            footprints,
20489            vec![LexicalRebuildConversationFootprintRow {
20490                conversation_id,
20491                message_count: 3,
20492                message_bytes: 3 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20493            }],
20494            "stale-low tail caches must not under-plan lexical shards and trip doc>plan invariants"
20495        );
20496    }
20497
20498    #[test]
20499    fn list_conversation_footprints_for_lexical_rebuild_tolerates_missing_tail_state_table() {
20500        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20501        use std::path::PathBuf;
20502
20503        let dir = TempDir::new().unwrap();
20504        let db_path = dir.path().join("agent_search.db");
20505        let storage = SqliteStorage::open(&db_path).unwrap();
20506        let agent = Agent {
20507            id: None,
20508            slug: "codex".into(),
20509            name: "Codex".into(),
20510            version: Some("0.2.3".into()),
20511            kind: AgentKind::Cli,
20512        };
20513        let agent_id = storage.ensure_agent(&agent).unwrap();
20514        let conversation_id = storage
20515            .insert_conversation_tree(
20516                agent_id,
20517                None,
20518                &Conversation {
20519                    id: None,
20520                    agent_slug: "codex".into(),
20521                    workspace: Some(PathBuf::from("/tmp/workspace")),
20522                    external_id: Some("footprint-missing-tail-table".to_string()),
20523                    title: Some("footprint-missing-tail-table".to_string()),
20524                    source_path: PathBuf::from("/tmp/footprint-missing-tail-table.jsonl"),
20525                    started_at: Some(1_700_000_000_000),
20526                    ended_at: Some(1_700_000_000_100),
20527                    approx_tokens: None,
20528                    metadata_json: serde_json::Value::Null,
20529                    messages: vec![Message {
20530                        id: None,
20531                        idx: 10,
20532                        role: MessageRole::User,
20533                        author: None,
20534                        created_at: Some(1_700_000_000_010),
20535                        content: "legacy sparse tail without hot table".into(),
20536                        extra_json: serde_json::Value::Null,
20537                        snippets: Vec::new(),
20538                    }],
20539                    source_id: LOCAL_SOURCE_ID.into(),
20540                    origin_host: None,
20541                },
20542            )
20543            .unwrap()
20544            .conversation_id;
20545
20546        storage
20547            .conn
20548            .execute_compat(
20549                "UPDATE conversations
20550                 SET last_message_idx = NULL, last_message_created_at = NULL
20551                 WHERE id = ?1",
20552                fparams![conversation_id],
20553            )
20554            .unwrap();
20555        storage
20556            .conn
20557            .execute_compat("DROP TABLE conversation_tail_state", fparams![])
20558            .unwrap();
20559
20560        let footprints = storage
20561            .list_conversation_footprints_for_lexical_rebuild()
20562            .unwrap();
20563
20564        assert_eq!(
20565            footprints,
20566            vec![LexicalRebuildConversationFootprintRow {
20567                conversation_id,
20568                message_count: 11,
20569                message_bytes: 11 * LEXICAL_REBUILD_PLANNER_ESTIMATED_BYTES_PER_MESSAGE,
20570            }],
20571            "read-only lexical self-heal must tolerate pre-tail-cache databases and use messages MAX(idx)"
20572        );
20573    }
20574
20575    #[test]
20576    fn list_conversation_footprints_for_lexical_rebuild_tolerates_legacy_search_demo_fixture() {
20577        let fixture_db = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
20578            .join("tests")
20579            .join("fixtures")
20580            .join("search_demo_data")
20581            .join("agent_search.db");
20582        let storage = FrankenStorage::open_readonly(&fixture_db).unwrap();
20583
20584        let footprints = storage
20585            .list_conversation_footprints_for_lexical_rebuild()
20586            .unwrap();
20587
20588        assert!(
20589            !footprints.is_empty(),
20590            "search self-heal should be able to plan a lexical rebuild from the legacy search demo fixture"
20591        );
20592        assert!(
20593            footprints
20594                .iter()
20595                .all(|footprint| footprint.message_count > 0),
20596            "legacy fixture conversations should derive message counts from messages when tail caches are absent"
20597        );
20598    }
20599
20600    #[test]
20601    fn lexical_rebuild_listing_normalizes_host_only_remote_source_from_blank_source_id() {
20602        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20603        use std::path::PathBuf;
20604
20605        let dir = TempDir::new().unwrap();
20606        let db_path = dir.path().join("agent_search.db");
20607        let storage = SqliteStorage::open(&db_path).unwrap();
20608        let agent = Agent {
20609            id: None,
20610            slug: "codex".into(),
20611            name: "Codex".into(),
20612            version: Some("0.2.3".into()),
20613            kind: AgentKind::Cli,
20614        };
20615        let agent_id = storage.ensure_agent(&agent).unwrap();
20616        let conversation = Conversation {
20617            id: None,
20618            agent_slug: "codex".into(),
20619            workspace: Some(PathBuf::from("/tmp/workspace")),
20620            external_id: Some("legacy-blank-source".into()),
20621            title: Some("Legacy blank source".into()),
20622            source_path: PathBuf::from("/tmp/legacy-blank-source.jsonl"),
20623            started_at: Some(1_700_000_000_000),
20624            ended_at: Some(1_700_000_000_100),
20625            approx_tokens: None,
20626            metadata_json: serde_json::Value::Null,
20627            messages: vec![Message {
20628                id: None,
20629                idx: 0,
20630                role: MessageRole::User,
20631                author: None,
20632                created_at: Some(1_700_000_000_000),
20633                content: "hello".into(),
20634                extra_json: serde_json::Value::Null,
20635                snippets: Vec::new(),
20636            }],
20637            source_id: LOCAL_SOURCE_ID.into(),
20638            origin_host: None,
20639        };
20640
20641        let conversation_id = storage
20642            .insert_conversation_tree(agent_id, None, &conversation)
20643            .unwrap()
20644            .conversation_id;
20645        storage.conn.execute("PRAGMA foreign_keys = OFF").unwrap();
20646        storage
20647            .conn
20648            .execute_compat(
20649                "UPDATE conversations SET source_id = ?1, origin_host = ?2 WHERE id = ?3",
20650                fparams!["   ", "dev@laptop", conversation_id],
20651            )
20652            .unwrap();
20653        storage.conn.execute("PRAGMA foreign_keys = ON").unwrap();
20654
20655        let listed = storage.list_conversations(10, 0).unwrap();
20656        assert_eq!(listed.len(), 1);
20657        assert_eq!(listed[0].source_id, "dev@laptop");
20658        assert_eq!(listed[0].origin_host.as_deref(), Some("dev@laptop"));
20659
20660        let (agent_slugs, workspace_paths) = storage.build_lexical_rebuild_lookups().unwrap();
20661        let rebuild_listed = storage
20662            .list_conversations_for_lexical_rebuild_after_id(10, 0, &agent_slugs, &workspace_paths)
20663            .unwrap();
20664        assert_eq!(rebuild_listed.len(), 1);
20665        assert_eq!(rebuild_listed[0].source_id, "dev@laptop");
20666        assert_eq!(rebuild_listed[0].origin_host.as_deref(), Some("dev@laptop"));
20667    }
20668
20669    #[test]
20670    fn seed_canonical_from_best_historical_bundle_copies_data_and_resets_runtime_meta() {
20671        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20672        use std::path::PathBuf;
20673
20674        let dir = TempDir::new().unwrap();
20675        let canonical_db = dir.path().join("agent_search.db");
20676        let source_db = dir
20677            .path()
20678            .join("backups/agent_search.db.20260322T020200.bak");
20679
20680        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
20681
20682        let source = SqliteStorage::open(&source_db).unwrap();
20683        let agent = Agent {
20684            id: None,
20685            slug: "codex".into(),
20686            name: "Codex".into(),
20687            version: Some("0.2.3".into()),
20688            kind: AgentKind::Cli,
20689        };
20690        let agent_id = source.ensure_agent(&agent).unwrap();
20691        let conversation = Conversation {
20692            id: None,
20693            agent_slug: "codex".into(),
20694            workspace: Some(PathBuf::from("/tmp/workspace")),
20695            external_id: Some("seed-conv".into()),
20696            title: Some("Historical seed".into()),
20697            source_path: PathBuf::from("/tmp/historical-seed.jsonl"),
20698            started_at: Some(1_700_000_000_000),
20699            ended_at: Some(1_700_000_000_100),
20700            approx_tokens: Some(42),
20701            metadata_json: serde_json::json!({"seed": true}),
20702            messages: vec![Message {
20703                id: None,
20704                idx: 0,
20705                role: MessageRole::Agent,
20706                author: Some("assistant".into()),
20707                created_at: Some(1_700_000_000_050),
20708                content: "seeded message".into(),
20709                extra_json: serde_json::json!({"usage": {"total_tokens": 12}}),
20710                snippets: Vec::new(),
20711            }],
20712            source_id: LOCAL_SOURCE_ID.into(),
20713            origin_host: None,
20714        };
20715        source
20716            .insert_conversation_tree(agent_id, None, &conversation)
20717            .unwrap();
20718        source.set_last_scan_ts(123).unwrap();
20719        source.set_last_indexed_at(456).unwrap();
20720        source.set_last_embedded_message_id(789).unwrap();
20721        source
20722            .conn
20723            .execute_compat(
20724                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
20725                fparams!["historical_bundle_salvaged:stale", "{\"stale\":true}"],
20726            )
20727            .unwrap();
20728        drop(source);
20729
20730        // Legacy "duplicate FTS" fixture reconstruction.
20731        //
20732        // Post-V14 migration cass drops the V13-era fts_messages virtual table
20733        // and recreates it lazily, so a freshly-opened canonical DB has zero
20734        // fts_messages entries in sqlite_master. To reproduce the historical
20735        // failure mode this test exercises — a legacy v13 bundle with a
20736        // duplicated CREATE VIRTUAL TABLE row — we have to inject *both*
20737        // entries: the original V13-era contentless row and the buggy duplicate
20738        // row. Before V14 existed the original was already present after
20739        // migration and only the duplicate needed manual injection.
20740        let legacy_v13_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, content='', tokenize='porter')";
20741        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
20742        let legacy = rusqlite_test_fixture_conn(&source_db);
20743        legacy
20744            .execute_batch(
20745                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
20746                 DELETE FROM _schema_migrations WHERE version = 14;
20747                 PRAGMA writable_schema = ON;",
20748            )
20749            .unwrap();
20750        legacy
20751            .execute(
20752                "DELETE FROM meta WHERE key = ?1",
20753                [FTS_FRANKEN_REBUILD_META_KEY],
20754            )
20755            .unwrap();
20756        // Inject the V13 original first.
20757        legacy
20758            .execute(
20759                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
20760                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
20761                [legacy_v13_fts_sql],
20762            )
20763            .unwrap();
20764        // Then the duplicate that's the real subject of the fixup logic.
20765        legacy
20766            .execute(
20767                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
20768                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
20769                [duplicate_legacy_fts_sql],
20770            )
20771            .unwrap();
20772        legacy
20773            .execute_batch("PRAGMA writable_schema = OFF;")
20774            .unwrap();
20775        drop(legacy);
20776
20777        // Verify fixture with rusqlite+writable_schema to see raw
20778        // sqlite_master rows (frankensqlite deduplicates schema entries).
20779        {
20780            let verify = rusqlite_test_fixture_conn(&source_db);
20781            verify
20782                .execute_batch("PRAGMA writable_schema = ON;")
20783                .unwrap();
20784            let fts_entries: i64 = verify
20785                .query_row(
20786                    "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
20787                    [],
20788                    |row| row.get(0),
20789                )
20790                .unwrap();
20791            assert_eq!(
20792                fts_entries, 2,
20793                "test fixture should reproduce the duplicate legacy fts_messages rows"
20794            );
20795            let msg_count: i64 = verify
20796                .query_row("SELECT COUNT(*) FROM messages", [], |row| row.get(0))
20797                .unwrap();
20798            assert_eq!(msg_count, 1);
20799        }
20800
20801        let fresh = SqliteStorage::open(&canonical_db).unwrap();
20802        drop(fresh);
20803
20804        let outcome = seed_canonical_from_best_historical_bundle(&canonical_db)
20805            .unwrap()
20806            .unwrap();
20807        assert_eq!(outcome.bundles_imported, 1);
20808        assert_eq!(outcome.conversations_imported, 1);
20809        assert_eq!(outcome.messages_imported, 1);
20810
20811        let readonly = open_franken_with_flags(
20812            &canonical_db.to_string_lossy(),
20813            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
20814        )
20815        .unwrap();
20816        let readonly_message_count: i64 = readonly
20817            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
20818                row.get_typed(0)
20819            })
20820            .unwrap();
20821        assert_eq!(readonly_message_count, 1);
20822
20823        let seeded = SqliteStorage::open(&canonical_db).unwrap();
20824        assert_eq!(
20825            seeded
20826                .count_sessions_in_range(None, None, None, None)
20827                .unwrap()
20828                .0,
20829            1
20830        );
20831        let message_count: i64 = seeded
20832            .conn
20833            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
20834                row.get_typed(0)
20835            })
20836            .unwrap();
20837        assert_eq!(message_count, 1);
20838        assert_eq!(seeded.get_last_scan_ts().unwrap(), None);
20839        assert_eq!(seeded.get_last_embedded_message_id().unwrap(), None);
20840
20841        let last_indexed: Option<String> = seeded
20842            .conn
20843            .query_row_map(
20844                "SELECT value FROM meta WHERE key = 'last_indexed_at'",
20845                fparams![],
20846                |row| row.get_typed(0),
20847            )
20848            .optional()
20849            .unwrap();
20850        assert!(last_indexed.is_none());
20851
20852        let salvage_keys: Vec<String> = seeded
20853            .conn
20854            .query_map_collect(
20855                "SELECT key FROM meta WHERE key LIKE 'historical_bundle_salvaged:%' ORDER BY key",
20856                fparams![],
20857                |row| row.get_typed(0),
20858            )
20859            .unwrap();
20860        assert_eq!(salvage_keys.len(), 1);
20861
20862        let reopened_readonly = open_franken_with_flags(
20863            &canonical_db.to_string_lossy(),
20864            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
20865        )
20866        .unwrap();
20867        let reopened_fts_entries: i64 = reopened_readonly
20868            .query_row_map(
20869                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
20870                fparams![],
20871                |row| row.get_typed(0),
20872            )
20873            .unwrap();
20874        assert_eq!(
20875            reopened_fts_entries, 1,
20876            "seeded canonical db should keep a single stock-SQLite fts_messages schema row"
20877        );
20878        let reopened_message_count: i64 = reopened_readonly
20879            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
20880                row.get_typed(0)
20881            })
20882            .unwrap();
20883        assert_eq!(reopened_message_count, 1);
20884
20885        let franken_seeded = FrankenStorage::open(&canonical_db).unwrap();
20886        assert_eq!(
20887            franken_seeded.schema_version().unwrap(),
20888            CURRENT_SCHEMA_VERSION
20889        );
20890        // Post-V14 fts_messages is recreated lazily. `FrankenStorage::open`
20891        // alone doesn't re-register the virtual table for the frankensqlite
20892        // query engine — the consistency pass does, and this is exactly what
20893        // normal cass startup runs before the first search. Invoke it
20894        // explicitly so the query below exercises the expected post-repair
20895        // state rather than the between-steps state.
20896        franken_seeded
20897            .ensure_search_fallback_fts_consistency()
20898            .expect("ensure FTS consistency after seed");
20899        let post_franken_schema_rows: i64 = franken_seeded
20900            .raw()
20901            .query_row_map(
20902                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
20903                fparams![],
20904                |row| row.get_typed(0),
20905            )
20906            .unwrap();
20907        assert_eq!(post_franken_schema_rows, 1);
20908        let fts_probe = franken_seeded
20909            .raw()
20910            .query("SELECT COUNT(*) FROM fts_messages");
20911        assert!(
20912            fts_probe.is_ok(),
20913            "expected post-seed FTS to be queryable, got {fts_probe:?}"
20914        );
20915    }
20916
20917    #[test]
20918    fn failed_baseline_seed_preserves_existing_canonical_bundle() {
20919        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
20920        use std::path::PathBuf;
20921
20922        let dir = TempDir::new().unwrap();
20923        let canonical_db = dir.path().join("agent_search.db");
20924        let source_db = dir
20925            .path()
20926            .join("backups/agent_search.db.20260325T120000Z.bad-seed.bak");
20927
20928        fs::create_dir_all(source_db.parent().unwrap()).unwrap();
20929
20930        let canonical = SqliteStorage::open(&canonical_db).unwrap();
20931        canonical
20932            .conn
20933            .execute_compat(
20934                "INSERT OR REPLACE INTO meta(key, value) VALUES(?1, ?2)",
20935                fparams!["sentinel", "keep-me"],
20936            )
20937            .unwrap();
20938        drop(canonical);
20939
20940        let source = SqliteStorage::open(&source_db).unwrap();
20941        let agent = Agent {
20942            id: None,
20943            slug: "codex".into(),
20944            name: "Codex".into(),
20945            version: Some("0.2.3".into()),
20946            kind: AgentKind::Cli,
20947        };
20948        let agent_id = source.ensure_agent(&agent).unwrap();
20949        let conversation = Conversation {
20950            id: None,
20951            agent_slug: "codex".into(),
20952            workspace: Some(PathBuf::from("/tmp/workspace")),
20953            external_id: Some("bad-seed-conv".into()),
20954            title: Some("Bad seed".into()),
20955            source_path: PathBuf::from("/tmp/bad-seed.jsonl"),
20956            started_at: Some(1_700_000_000_000),
20957            ended_at: Some(1_700_000_000_100),
20958            approx_tokens: Some(42),
20959            metadata_json: serde_json::json!({"seed": "bad"}),
20960            messages: vec![Message {
20961                id: None,
20962                idx: 0,
20963                role: MessageRole::Agent,
20964                author: Some("assistant".into()),
20965                created_at: Some(1_700_000_000_050),
20966                content: "this seed should fail".into(),
20967                extra_json: serde_json::Value::Null,
20968                snippets: Vec::new(),
20969            }],
20970            source_id: LOCAL_SOURCE_ID.into(),
20971            origin_host: None,
20972        };
20973        source
20974            .insert_conversation_tree(agent_id, None, &conversation)
20975            .unwrap();
20976        drop(source);
20977
20978        let legacy = FrankenConnection::open(source_db.to_string_lossy().into_owned()).unwrap();
20979        legacy
20980            .execute("UPDATE meta SET value = '12' WHERE key = 'schema_version'")
20981            .unwrap();
20982        drop(legacy);
20983
20984        let err = seed_canonical_from_best_historical_bundle(&canonical_db).unwrap_err();
20985        assert!(
20986            err.to_string()
20987                .contains("schema_version 12 is too old for baseline import"),
20988            "unexpected seed error: {err:#}"
20989        );
20990
20991        let reopened = SqliteStorage::open(&canonical_db).unwrap();
20992        let sentinel: Option<String> = reopened
20993            .conn
20994            .query_row_map(
20995                "SELECT value FROM meta WHERE key = 'sentinel'",
20996                fparams![],
20997                |row| row.get_typed(0),
20998            )
20999            .optional()
21000            .unwrap();
21001        assert_eq!(sentinel.as_deref(), Some("keep-me"));
21002
21003        let conversation_count: i64 = reopened
21004            .conn
21005            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
21006                row.get_typed(0)
21007            })
21008            .unwrap();
21009        assert_eq!(conversation_count, 0);
21010
21011        let readonly = open_franken_with_flags(
21012            &canonical_db.to_string_lossy(),
21013            FrankenOpenFlags::SQLITE_OPEN_READ_ONLY,
21014        )
21015        .unwrap();
21016        let readonly_conversation_count: i64 = readonly
21017            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |row| {
21018                row.get_typed(0)
21019            })
21020            .unwrap();
21021        assert_eq!(readonly_conversation_count, 0);
21022    }
21023
21024    #[test]
21025    fn fetch_messages_for_lexical_rebuild_skips_extra_json() {
21026        let dir = TempDir::new().unwrap();
21027        let db_path = dir.path().join("test.db");
21028        let storage = SqliteStorage::open(&db_path).unwrap();
21029
21030        let agent = Agent {
21031            id: None,
21032            slug: "codex".into(),
21033            name: "Codex".into(),
21034            version: Some("0.2.3".into()),
21035            kind: AgentKind::Cli,
21036        };
21037        let agent_id = storage.ensure_agent(&agent).unwrap();
21038
21039        let conversation = Conversation {
21040            id: None,
21041            agent_slug: "codex".into(),
21042            workspace: Some(PathBuf::from("/tmp/workspace")),
21043            external_id: Some("lexical-rebuild-test".into()),
21044            title: Some("Lexical rebuild".into()),
21045            source_path: PathBuf::from("/tmp/lexical-rebuild.jsonl"),
21046            started_at: Some(1_700_000_000_000),
21047            ended_at: Some(1_700_000_000_100),
21048            approx_tokens: Some(42),
21049            metadata_json: serde_json::Value::Null,
21050            messages: vec![Message {
21051                id: None,
21052                idx: 0,
21053                role: MessageRole::Agent,
21054                author: Some("assistant".into()),
21055                created_at: Some(1_700_000_000_050),
21056                content: "indexed text".into(),
21057                extra_json: serde_json::json!({
21058                    "usage": { "total_tokens": 1234 },
21059                    "irrelevant_blob": "still preserved in canonical storage"
21060                }),
21061                snippets: Vec::new(),
21062            }],
21063            source_id: LOCAL_SOURCE_ID.into(),
21064            origin_host: None,
21065        };
21066
21067        let inserted = storage
21068            .insert_conversation_tree(agent_id, None, &conversation)
21069            .unwrap();
21070        let conversation_id = inserted.conversation_id;
21071
21072        let stored = storage.fetch_messages(conversation_id).unwrap();
21073        assert_eq!(stored.len(), 1);
21074        assert!(!stored[0].extra_json.is_null());
21075
21076        let lexical = storage
21077            .fetch_messages_for_lexical_rebuild(conversation_id)
21078            .unwrap();
21079        assert_eq!(lexical.len(), 1);
21080        assert_eq!(lexical[0].content, "indexed text");
21081        assert_eq!(lexical[0].author.as_deref(), Some("assistant"));
21082        assert!(lexical[0].extra_json.is_null());
21083    }
21084
21085    #[test]
21086    fn fetch_messages_for_lexical_rebuild_batch_groups_and_orders_messages() {
21087        let dir = TempDir::new().unwrap();
21088        let db_path = dir.path().join("test.db");
21089        let storage = SqliteStorage::open(&db_path).unwrap();
21090
21091        let agent = Agent {
21092            id: None,
21093            slug: "codex".into(),
21094            name: "Codex".into(),
21095            version: Some("0.2.3".into()),
21096            kind: AgentKind::Cli,
21097        };
21098        let agent_id = storage.ensure_agent(&agent).unwrap();
21099
21100        let first = Conversation {
21101            id: None,
21102            agent_slug: "codex".into(),
21103            workspace: Some(PathBuf::from("/tmp/workspace")),
21104            external_id: Some("lexical-batch-1".into()),
21105            title: Some("Lexical batch 1".into()),
21106            source_path: PathBuf::from("/tmp/lexical-batch-1.jsonl"),
21107            started_at: Some(1_700_000_000_000),
21108            ended_at: Some(1_700_000_000_100),
21109            approx_tokens: Some(42),
21110            metadata_json: serde_json::Value::Null,
21111            messages: vec![
21112                Message {
21113                    id: None,
21114                    idx: 0,
21115                    role: MessageRole::User,
21116                    author: Some("user".into()),
21117                    created_at: Some(1_700_000_000_010),
21118                    content: "first-a".into(),
21119                    extra_json: serde_json::json!({"opaque": true}),
21120                    snippets: Vec::new(),
21121                },
21122                Message {
21123                    id: None,
21124                    idx: 1,
21125                    role: MessageRole::Agent,
21126                    author: Some("assistant".into()),
21127                    created_at: Some(1_700_000_000_020),
21128                    content: "first-b".into(),
21129                    extra_json: serde_json::json!({"opaque": true}),
21130                    snippets: Vec::new(),
21131                },
21132            ],
21133            source_id: LOCAL_SOURCE_ID.into(),
21134            origin_host: None,
21135        };
21136
21137        let second = Conversation {
21138            id: None,
21139            agent_slug: "codex".into(),
21140            workspace: Some(PathBuf::from("/tmp/workspace")),
21141            external_id: Some("lexical-batch-2".into()),
21142            title: Some("Lexical batch 2".into()),
21143            source_path: PathBuf::from("/tmp/lexical-batch-2.jsonl"),
21144            started_at: Some(1_700_000_000_200),
21145            ended_at: Some(1_700_000_000_300),
21146            approx_tokens: Some(84),
21147            metadata_json: serde_json::Value::Null,
21148            messages: vec![Message {
21149                id: None,
21150                idx: 0,
21151                role: MessageRole::Tool,
21152                author: Some("tool".into()),
21153                created_at: Some(1_700_000_000_210),
21154                content: "second-a".into(),
21155                extra_json: serde_json::json!({"opaque": true}),
21156                snippets: Vec::new(),
21157            }],
21158            source_id: LOCAL_SOURCE_ID.into(),
21159            origin_host: None,
21160        };
21161        let third = Conversation {
21162            external_id: Some("lexical-batch-3".into()),
21163            title: Some("Lexical batch 3".into()),
21164            source_path: PathBuf::from("/tmp/lexical-batch-3.jsonl"),
21165            messages: vec![Message {
21166                id: None,
21167                idx: 0,
21168                role: MessageRole::System,
21169                author: Some("system".into()),
21170                created_at: Some(1_700_000_000_410),
21171                content: "third-a".into(),
21172                extra_json: serde_json::json!({"opaque": true}),
21173                snippets: Vec::new(),
21174            }],
21175            ..second.clone()
21176        };
21177
21178        let first_id = storage
21179            .insert_conversation_tree(agent_id, None, &first)
21180            .unwrap()
21181            .conversation_id;
21182        let second_id = storage
21183            .insert_conversation_tree(agent_id, None, &second)
21184            .unwrap()
21185            .conversation_id;
21186        let third_id = storage
21187            .insert_conversation_tree(agent_id, None, &third)
21188            .unwrap()
21189            .conversation_id;
21190
21191        let lexical = storage
21192            .fetch_messages_for_lexical_rebuild_batch(&[third_id, first_id], None, None)
21193            .unwrap();
21194
21195        let first_messages = lexical.get(&first_id).expect("first conversation");
21196        assert_eq!(first_messages.len(), 2);
21197        assert_eq!(first_messages[0].content, "first-a");
21198        assert_eq!(first_messages[1].content, "first-b");
21199        assert!(
21200            first_messages
21201                .iter()
21202                .all(|message| message.extra_json.is_null())
21203        );
21204
21205        assert!(
21206            !lexical.contains_key(&second_id),
21207            "batch fetch must exclude conversations not requested by the caller"
21208        );
21209
21210        let third_messages = lexical.get(&third_id).expect("third conversation");
21211        assert_eq!(third_messages.len(), 1);
21212        assert_eq!(third_messages[0].content, "third-a");
21213        assert!(third_messages[0].extra_json.is_null());
21214    }
21215
21216    #[test]
21217    fn fetch_messages_for_lexical_rebuild_batch_enforces_content_byte_guardrail() {
21218        let dir = TempDir::new().unwrap();
21219        let db_path = dir.path().join("test.db");
21220        let storage = SqliteStorage::open(&db_path).unwrap();
21221
21222        let agent = Agent {
21223            id: None,
21224            slug: "codex".into(),
21225            name: "Codex".into(),
21226            version: Some("0.2.3".into()),
21227            kind: AgentKind::Cli,
21228        };
21229        let agent_id = storage.ensure_agent(&agent).unwrap();
21230
21231        let conversation = Conversation {
21232            id: None,
21233            agent_slug: "codex".into(),
21234            workspace: Some(PathBuf::from("/tmp/workspace")),
21235            external_id: Some("lexical-batch-guard".into()),
21236            title: Some("Lexical batch guard".into()),
21237            source_path: PathBuf::from("/tmp/lexical-batch-guard.jsonl"),
21238            started_at: Some(1_700_000_000_000),
21239            ended_at: Some(1_700_000_000_100),
21240            approx_tokens: Some(42),
21241            metadata_json: serde_json::Value::Null,
21242            messages: vec![
21243                Message {
21244                    id: None,
21245                    idx: 0,
21246                    role: MessageRole::User,
21247                    author: Some("user".into()),
21248                    created_at: Some(1_700_000_000_010),
21249                    content: "123456".into(),
21250                    extra_json: serde_json::Value::Null,
21251                    snippets: Vec::new(),
21252                },
21253                Message {
21254                    id: None,
21255                    idx: 1,
21256                    role: MessageRole::Agent,
21257                    author: Some("assistant".into()),
21258                    created_at: Some(1_700_000_000_020),
21259                    content: "abcdef".into(),
21260                    extra_json: serde_json::Value::Null,
21261                    snippets: Vec::new(),
21262                },
21263            ],
21264            source_id: LOCAL_SOURCE_ID.into(),
21265            origin_host: None,
21266        };
21267
21268        let conversation_id = storage
21269            .insert_conversation_tree(agent_id, None, &conversation)
21270            .unwrap()
21271            .conversation_id;
21272
21273        let error = storage
21274            .fetch_messages_for_lexical_rebuild_batch(&[conversation_id], Some(10), Some(8))
21275            .expect_err("guardrail should reject oversized batch content");
21276
21277        let message = format!("{error:#}");
21278        assert!(
21279            message.contains("content-byte guardrail"),
21280            "expected guardrail reason in error, got {message}"
21281        );
21282    }
21283
21284    #[test]
21285    fn fetch_messages_handles_manual_rows_inserted_via_raw_connection() {
21286        let dir = TempDir::new().unwrap();
21287        let db_path = dir.path().join("manual-rows.db");
21288        let storage = FrankenStorage::open(&db_path).unwrap();
21289        let conn = storage.raw();
21290
21291        conn.execute(
21292            "INSERT INTO agents (id, slug, name, kind, created_at, updated_at)
21293             VALUES (1, 'claude_code', 'Claude Code', 'local', 0, 0)",
21294        )
21295        .unwrap();
21296        conn.execute(
21297            "INSERT INTO conversations
21298             (id, agent_id, external_id, title, source_path, source_id, started_at)
21299             VALUES (1, 1, 'manual-ext', 'Manual Session', '/tmp/manual.jsonl', 'local', 200)",
21300        )
21301        .unwrap();
21302        conn.execute(
21303            "INSERT INTO messages
21304             (id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
21305             VALUES (1, 1, 0, 'user', 'tester', 1700000000000, 'manual body', '{\"k\":1}', NULL)",
21306        )
21307        .unwrap();
21308
21309        let lexical = storage.fetch_messages_for_lexical_rebuild(1).unwrap();
21310        assert_eq!(lexical.len(), 1);
21311        assert_eq!(lexical[0].content, "manual body");
21312
21313        let full = storage.fetch_messages(1).unwrap();
21314        assert_eq!(full.len(), 1);
21315        assert_eq!(full[0].content, "manual body");
21316        assert_eq!(full[0].author.as_deref(), Some("tester"));
21317        assert_eq!(full[0].extra_json, serde_json::json!({ "k": 1 }));
21318    }
21319
21320    #[test]
21321    fn lexical_rebuild_batch_messages_query_avoids_sorter_temp_btrees() {
21322        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21323        use std::path::PathBuf;
21324
21325        let dir = TempDir::new().unwrap();
21326        let db_path = dir.path().join("agent_search.db");
21327        let storage = SqliteStorage::open(&db_path).unwrap();
21328
21329        let agent = Agent {
21330            id: None,
21331            slug: "claude_code".into(),
21332            name: "Claude Code".into(),
21333            version: None,
21334            kind: AgentKind::Cli,
21335        };
21336        let agent_id = storage.ensure_agent(&agent).unwrap();
21337
21338        for (external_id, base_ts) in [
21339            ("conv-1", 1_700_000_000_000_i64),
21340            ("conv-2", 1_700_000_001_000_i64),
21341        ] {
21342            let conversation = Conversation {
21343                id: None,
21344                agent_slug: "claude_code".into(),
21345                workspace: Some(PathBuf::from("/tmp/workspace")),
21346                external_id: Some(external_id.to_string()),
21347                title: Some("Lexical rebuild".into()),
21348                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21349                started_at: Some(base_ts),
21350                ended_at: Some(base_ts + 100),
21351                approx_tokens: None,
21352                metadata_json: serde_json::Value::Null,
21353                messages: vec![
21354                    Message {
21355                        id: None,
21356                        idx: 0,
21357                        role: MessageRole::User,
21358                        author: Some("user".into()),
21359                        created_at: Some(base_ts + 10),
21360                        content: format!("{external_id}-first"),
21361                        extra_json: serde_json::Value::Null,
21362                        snippets: Vec::new(),
21363                    },
21364                    Message {
21365                        id: None,
21366                        idx: 1,
21367                        role: MessageRole::Agent,
21368                        author: Some("assistant".into()),
21369                        created_at: Some(base_ts + 20),
21370                        content: format!("{external_id}-second"),
21371                        extra_json: serde_json::Value::Null,
21372                        snippets: Vec::new(),
21373                    },
21374                ],
21375                source_id: LOCAL_SOURCE_ID.into(),
21376                origin_host: None,
21377            };
21378            storage
21379                .insert_conversation_tree(agent_id, None, &conversation)
21380                .unwrap();
21381        }
21382
21383        let conversation_ids: Vec<i64> = storage
21384            .conn
21385            .query_map_collect(
21386                "SELECT id FROM conversations ORDER BY id",
21387                fparams![],
21388                |row| row.get_typed(0),
21389            )
21390            .unwrap();
21391        assert_eq!(conversation_ids.len(), 2);
21392
21393        let plan_details: Vec<String> = storage
21394            .conn
21395            .query_map_collect(
21396                "EXPLAIN QUERY PLAN \
21397                 SELECT conversation_id, id, idx, role, author, created_at, content \
21398                 FROM messages \
21399                 WHERE conversation_id IN (?1, ?2) \
21400                 ORDER BY conversation_id ASC, idx ASC",
21401                fparams![conversation_ids[0], conversation_ids[1]],
21402                |row| row.get_typed(3),
21403            )
21404            .unwrap();
21405
21406        assert!(
21407            plan_details
21408                .iter()
21409                .any(|detail| detail.contains("sqlite_autoindex_messages_1")),
21410            "expected batched lexical rebuild fetch to use the conversation_id/idx composite index, got {plan_details:?}"
21411        );
21412        assert!(
21413            !plan_details
21414                .iter()
21415                .any(|detail| detail.contains("TEMP B-TREE")),
21416            "expected batched lexical rebuild fetch to avoid sorter temp b-trees, got {plan_details:?}"
21417        );
21418    }
21419
21420    #[test]
21421    fn stream_messages_for_lexical_rebuild_groups_and_orders_messages() {
21422        let dir = TempDir::new().unwrap();
21423        let db_path = dir.path().join("test.db");
21424        let storage = SqliteStorage::open(&db_path).unwrap();
21425
21426        let agent = Agent {
21427            id: None,
21428            slug: "codex".into(),
21429            name: "Codex".into(),
21430            version: Some("0.2.3".into()),
21431            kind: AgentKind::Cli,
21432        };
21433        let agent_id = storage.ensure_agent(&agent).unwrap();
21434
21435        let first = Conversation {
21436            id: None,
21437            agent_slug: "codex".into(),
21438            workspace: Some(PathBuf::from("/tmp/workspace")),
21439            external_id: Some("lexical-stream-1".into()),
21440            title: Some("Lexical stream 1".into()),
21441            source_path: PathBuf::from("/tmp/lexical-stream-1.jsonl"),
21442            started_at: Some(1_700_000_000_000),
21443            ended_at: Some(1_700_000_000_100),
21444            approx_tokens: Some(42),
21445            metadata_json: serde_json::Value::Null,
21446            messages: vec![
21447                Message {
21448                    id: None,
21449                    idx: 0,
21450                    role: MessageRole::User,
21451                    author: Some("user".into()),
21452                    created_at: Some(1_700_000_000_010),
21453                    content: "first-a".into(),
21454                    extra_json: serde_json::json!({"opaque": true}),
21455                    snippets: Vec::new(),
21456                },
21457                Message {
21458                    id: None,
21459                    idx: 1,
21460                    role: MessageRole::Agent,
21461                    author: Some("assistant".into()),
21462                    created_at: Some(1_700_000_000_020),
21463                    content: "first-b".into(),
21464                    extra_json: serde_json::json!({"opaque": true}),
21465                    snippets: Vec::new(),
21466                },
21467            ],
21468            source_id: LOCAL_SOURCE_ID.into(),
21469            origin_host: None,
21470        };
21471
21472        let second = Conversation {
21473            id: None,
21474            agent_slug: "codex".into(),
21475            workspace: Some(PathBuf::from("/tmp/workspace")),
21476            external_id: Some("lexical-stream-2".into()),
21477            title: Some("Lexical stream 2".into()),
21478            source_path: PathBuf::from("/tmp/lexical-stream-2.jsonl"),
21479            started_at: Some(1_700_000_000_200),
21480            ended_at: Some(1_700_000_000_300),
21481            approx_tokens: Some(84),
21482            metadata_json: serde_json::Value::Null,
21483            messages: vec![Message {
21484                id: None,
21485                idx: 0,
21486                role: MessageRole::Tool,
21487                author: Some("tool".into()),
21488                created_at: Some(1_700_000_000_210),
21489                content: "second-a".into(),
21490                extra_json: serde_json::json!({"opaque": true}),
21491                snippets: Vec::new(),
21492            }],
21493            source_id: LOCAL_SOURCE_ID.into(),
21494            origin_host: None,
21495        };
21496
21497        let first_id = storage
21498            .insert_conversation_tree(agent_id, None, &first)
21499            .unwrap()
21500            .conversation_id;
21501        let second_id = storage
21502            .insert_conversation_tree(agent_id, None, &second)
21503            .unwrap()
21504            .conversation_id;
21505
21506        let mut streamed = Vec::new();
21507        storage
21508            .stream_messages_for_lexical_rebuild_from_conversation_id(first_id, |row| {
21509                streamed.push((
21510                    row.conversation_id,
21511                    row.idx,
21512                    row.role,
21513                    row.author,
21514                    row.content,
21515                ));
21516                Ok(())
21517            })
21518            .unwrap();
21519
21520        assert_eq!(
21521            streamed,
21522            vec![
21523                (
21524                    first_id,
21525                    0,
21526                    "user".to_string(),
21527                    Some("user".to_string()),
21528                    "first-a".to_string(),
21529                ),
21530                (
21531                    first_id,
21532                    1,
21533                    "agent".to_string(),
21534                    Some("assistant".to_string()),
21535                    "first-b".to_string(),
21536                ),
21537                (
21538                    second_id,
21539                    0,
21540                    "tool".to_string(),
21541                    Some("tool".to_string()),
21542                    "second-a".to_string(),
21543                ),
21544            ]
21545        );
21546    }
21547
21548    #[test]
21549    fn stream_messages_for_lexical_rebuild_between_conversation_ids_respects_upper_bound() {
21550        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21551        use std::path::PathBuf;
21552
21553        let dir = TempDir::new().unwrap();
21554        let db_path = dir.path().join("agent_search.db");
21555        let storage = SqliteStorage::open(&db_path).unwrap();
21556
21557        let agent = Agent {
21558            id: None,
21559            slug: "claude_code".into(),
21560            name: "Claude Code".into(),
21561            version: Some("1.2.3".into()),
21562            kind: AgentKind::Cli,
21563        };
21564        let agent_id = storage.ensure_agent(&agent).unwrap();
21565
21566        let first = Conversation {
21567            id: None,
21568            agent_slug: "claude_code".into(),
21569            workspace: Some(PathBuf::from("/tmp/workspace")),
21570            external_id: Some("lexical-range-1".into()),
21571            title: Some("Lexical range 1".into()),
21572            source_path: PathBuf::from("/tmp/lexical-range-1.jsonl"),
21573            started_at: Some(1_700_000_000_000),
21574            ended_at: Some(1_700_000_000_100),
21575            approx_tokens: Some(42),
21576            metadata_json: serde_json::Value::Null,
21577            messages: vec![Message {
21578                id: None,
21579                idx: 0,
21580                role: MessageRole::User,
21581                author: Some("user".into()),
21582                created_at: Some(1_700_000_000_010),
21583                content: "first-only".into(),
21584                extra_json: serde_json::json!({"opaque": true}),
21585                snippets: Vec::new(),
21586            }],
21587            source_id: LOCAL_SOURCE_ID.into(),
21588            origin_host: None,
21589        };
21590
21591        let second = Conversation {
21592            id: None,
21593            agent_slug: "claude_code".into(),
21594            workspace: Some(PathBuf::from("/tmp/workspace")),
21595            external_id: Some("lexical-range-2".into()),
21596            title: Some("Lexical range 2".into()),
21597            source_path: PathBuf::from("/tmp/lexical-range-2.jsonl"),
21598            started_at: Some(1_700_000_000_200),
21599            ended_at: Some(1_700_000_000_300),
21600            approx_tokens: Some(84),
21601            metadata_json: serde_json::Value::Null,
21602            messages: vec![Message {
21603                id: None,
21604                idx: 0,
21605                role: MessageRole::Tool,
21606                author: Some("tool".into()),
21607                created_at: Some(1_700_000_000_210),
21608                content: "second-should-not-appear".into(),
21609                extra_json: serde_json::json!({"opaque": true}),
21610                snippets: Vec::new(),
21611            }],
21612            source_id: LOCAL_SOURCE_ID.into(),
21613            origin_host: None,
21614        };
21615
21616        let first_id = storage
21617            .insert_conversation_tree(agent_id, None, &first)
21618            .unwrap()
21619            .conversation_id;
21620        let second_id = storage
21621            .insert_conversation_tree(agent_id, None, &second)
21622            .unwrap()
21623            .conversation_id;
21624
21625        let mut streamed = Vec::new();
21626        storage
21627            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21628                first_id,
21629                first_id,
21630                |row| {
21631                    streamed.push((row.conversation_id, row.idx, row.content));
21632                    Ok(())
21633                },
21634            )
21635            .unwrap();
21636
21637        assert_eq!(streamed, vec![(first_id, 0, "first-only".to_string())]);
21638        assert!(
21639            streamed
21640                .iter()
21641                .all(|(conversation_id, _, _)| *conversation_id != second_id),
21642            "upper bound should exclude later conversation ids"
21643        );
21644    }
21645
21646    #[test]
21647    fn stream_messages_for_lexical_rebuild_between_conversation_ids_handles_mixed_ranges() {
21648        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21649        use std::path::PathBuf;
21650
21651        let dir = TempDir::new().unwrap();
21652        let db_path = dir.path().join("agent_search.db");
21653        let storage = SqliteStorage::open(&db_path).unwrap();
21654
21655        let claude_agent_id = storage
21656            .ensure_agent(&Agent {
21657                id: None,
21658                slug: "claude_code".into(),
21659                name: "Claude Code".into(),
21660                version: None,
21661                kind: AgentKind::Cli,
21662            })
21663            .unwrap();
21664        let aider_agent_id = storage
21665            .ensure_agent(&Agent {
21666                id: None,
21667                slug: "aider".into(),
21668                name: "Aider".into(),
21669                version: None,
21670                kind: AgentKind::Cli,
21671            })
21672            .unwrap();
21673
21674        type MessageSpec = (i64, MessageRole, Option<String>, Option<i64>, String);
21675
21676        let mut expected = Vec::new();
21677        let mut first_conversation_id = None;
21678        let mut last_conversation_id = None;
21679        let mut insert_conversation =
21680            |agent_id: i64,
21681             external_id: &str,
21682             title: &str,
21683             source_path: &str,
21684             started_at: i64,
21685             message_specs: Vec<MessageSpec>| {
21686                let conversation = Conversation {
21687                    id: None,
21688                    agent_slug: if agent_id == aider_agent_id {
21689                        "aider".into()
21690                    } else {
21691                        "claude_code".into()
21692                    },
21693                    workspace: Some(PathBuf::from("/tmp/workspace")),
21694                    external_id: Some(external_id.to_string()),
21695                    title: Some(title.to_string()),
21696                    source_path: PathBuf::from(source_path),
21697                    started_at: Some(started_at),
21698                    ended_at: Some(started_at + 100),
21699                    approx_tokens: None,
21700                    metadata_json: serde_json::Value::Null,
21701                    messages: message_specs
21702                        .iter()
21703                        .map(|(idx, role, author, created_at, content)| Message {
21704                            id: None,
21705                            idx: *idx,
21706                            role: role.clone(),
21707                            author: author.clone(),
21708                            created_at: *created_at,
21709                            content: content.clone(),
21710                            extra_json: serde_json::Value::Null,
21711                            snippets: Vec::new(),
21712                        })
21713                        .collect(),
21714                    source_id: LOCAL_SOURCE_ID.into(),
21715                    origin_host: None,
21716                };
21717                let conversation_id = storage
21718                    .insert_conversation_tree(agent_id, None, &conversation)
21719                    .unwrap()
21720                    .conversation_id;
21721                if first_conversation_id.is_none() {
21722                    first_conversation_id = Some(conversation_id);
21723                }
21724                last_conversation_id = Some(conversation_id);
21725                expected.extend(message_specs.into_iter().map(
21726                    |(idx, role, author, created_at, content)| {
21727                        (
21728                            conversation_id,
21729                            idx,
21730                            match role {
21731                                MessageRole::User => "user".to_string(),
21732                                MessageRole::Agent => "agent".to_string(),
21733                                MessageRole::Tool => "tool".to_string(),
21734                                MessageRole::System => "system".to_string(),
21735                                MessageRole::Other(other) => other,
21736                            },
21737                            author,
21738                            created_at,
21739                            content,
21740                        )
21741                    },
21742                ));
21743            };
21744
21745        for (label, base_ts) in [
21746            ("alpha", 1_700_000_000_000_i64),
21747            ("beta", 1_700_000_001_000_i64),
21748            ("gamma", 1_700_000_002_000_i64),
21749            ("delta", 1_700_000_003_000_i64),
21750            ("epsilon", 1_700_000_004_000_i64),
21751        ] {
21752            insert_conversation(
21753                claude_agent_id,
21754                &format!("lexical-{label}"),
21755                &format!("Lexical {label}"),
21756                &format!("/tmp/{label}.jsonl"),
21757                base_ts,
21758                vec![
21759                    (
21760                        0,
21761                        MessageRole::User,
21762                        None,
21763                        Some(base_ts + 10),
21764                        format!("{label}_content"),
21765                    ),
21766                    (
21767                        1,
21768                        MessageRole::Agent,
21769                        None,
21770                        Some(base_ts + 20),
21771                        format!("{label}_content_response"),
21772                    ),
21773                ],
21774            );
21775        }
21776
21777        insert_conversation(
21778            aider_agent_id,
21779            "lexical-aider-history",
21780            "Aider Chat: coding_agent_session_search",
21781            "/tmp/.aider.chat.history.md",
21782            1_764_619_673_394,
21783            vec![
21784                (
21785                    0,
21786                    MessageRole::System,
21787                    Some("system".to_string()),
21788                    None,
21789                    "# aider chat started at 2025-12-01 20:07:47".to_string(),
21790                ),
21791                (
21792                    1,
21793                    MessageRole::User,
21794                    Some("user".to_string()),
21795                    None,
21796                    "/tmp/workspace/.venv/bin/aider --no-git --message hello world".to_string(),
21797                ),
21798            ],
21799        );
21800        insert_conversation(
21801            aider_agent_id,
21802            "lexical-aider-fixture",
21803            "Aider Chat: aider",
21804            "/tmp/tests/fixtures/aider/.aider.chat.history.md",
21805            1_764_621_401_399,
21806            vec![
21807                (
21808                    0,
21809                    MessageRole::User,
21810                    Some("user".to_string()),
21811                    None,
21812                    "/add src/main.rs".to_string(),
21813                ),
21814                (
21815                    1,
21816                    MessageRole::Agent,
21817                    Some("assistant".to_string()),
21818                    None,
21819                    "Added src/main.rs to the chat.
21820
21821#### /add src/main.rs"
21822                        .to_string(),
21823                ),
21824                (
21825                    2,
21826                    MessageRole::User,
21827                    Some("user".to_string()),
21828                    None,
21829                    "Please refactor.".to_string(),
21830                ),
21831                (
21832                    3,
21833                    MessageRole::Agent,
21834                    Some("assistant".to_string()),
21835                    None,
21836                    "Sure, here is the code.".to_string(),
21837                ),
21838            ],
21839        );
21840
21841        let mut streamed = Vec::new();
21842        storage
21843            .stream_messages_for_lexical_rebuild_between_conversation_ids(
21844                first_conversation_id.unwrap(),
21845                last_conversation_id.unwrap(),
21846                |row| {
21847                    streamed.push((
21848                        row.conversation_id,
21849                        row.idx,
21850                        row.role,
21851                        row.author,
21852                        row.created_at,
21853                        row.content,
21854                    ));
21855                    Ok(())
21856                },
21857            )
21858            .unwrap();
21859
21860        assert_eq!(streamed, expected);
21861    }
21862
21863    #[test]
21864    fn lexical_rebuild_stream_queries_use_rowid_and_per_conversation_probes() {
21865        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
21866        use std::path::PathBuf;
21867
21868        let dir = TempDir::new().unwrap();
21869        let db_path = dir.path().join("agent_search.db");
21870        let storage = SqliteStorage::open(&db_path).unwrap();
21871
21872        let agent = Agent {
21873            id: None,
21874            slug: "claude_code".into(),
21875            name: "Claude Code".into(),
21876            version: None,
21877            kind: AgentKind::Cli,
21878        };
21879        let agent_id = storage.ensure_agent(&agent).unwrap();
21880
21881        for (external_id, base_ts) in [
21882            ("conv-1", 1_700_000_000_000_i64),
21883            ("conv-2", 1_700_000_001_000_i64),
21884        ] {
21885            let conversation = Conversation {
21886                id: None,
21887                agent_slug: "claude_code".into(),
21888                workspace: Some(PathBuf::from("/tmp/workspace")),
21889                external_id: Some(external_id.to_string()),
21890                title: Some("Lexical rebuild".into()),
21891                source_path: PathBuf::from(format!("/tmp/{external_id}.jsonl")),
21892                started_at: Some(base_ts),
21893                ended_at: Some(base_ts + 100),
21894                approx_tokens: None,
21895                metadata_json: serde_json::Value::Null,
21896                messages: vec![
21897                    Message {
21898                        id: None,
21899                        idx: 0,
21900                        role: MessageRole::User,
21901                        author: Some("user".into()),
21902                        created_at: Some(base_ts + 10),
21903                        content: format!("{external_id}-first"),
21904                        extra_json: serde_json::Value::Null,
21905                        snippets: Vec::new(),
21906                    },
21907                    Message {
21908                        id: None,
21909                        idx: 1,
21910                        role: MessageRole::Agent,
21911                        author: Some("assistant".into()),
21912                        created_at: Some(base_ts + 20),
21913                        content: format!("{external_id}-second"),
21914                        extra_json: serde_json::Value::Null,
21915                        snippets: Vec::new(),
21916                    },
21917                ],
21918                source_id: LOCAL_SOURCE_ID.into(),
21919                origin_host: None,
21920            };
21921            storage
21922                .insert_conversation_tree(agent_id, None, &conversation)
21923                .unwrap();
21924        }
21925
21926        let first_id: i64 = storage
21927            .conn
21928            .query_row_map(
21929                "SELECT id FROM conversations ORDER BY id LIMIT 1",
21930                fparams![],
21931                |row| row.get_typed(0),
21932            )
21933            .unwrap();
21934        let last_id: i64 = storage
21935            .conn
21936            .query_row_map(
21937                "SELECT id FROM conversations ORDER BY id DESC LIMIT 1",
21938                fparams![],
21939                |row| row.get_typed(0),
21940            )
21941            .unwrap();
21942
21943        let conversation_plan_details: Vec<String> = storage
21944            .conn
21945            .query_map_collect(
21946                "EXPLAIN QUERY PLAN                  SELECT id FROM conversations                  WHERE id >= ?1 AND id <= ?2                  ORDER BY id ASC",
21947                fparams![first_id, last_id],
21948                |row| row.get_typed(3),
21949            )
21950            .unwrap();
21951        assert!(
21952            !conversation_plan_details
21953                .iter()
21954                .any(|detail| detail.contains("TEMP B-TREE")),
21955            "expected streamed lexical rebuild conversation listing to avoid sorter temp b-trees, got {conversation_plan_details:?}"
21956        );
21957
21958        let message_plan_details: Vec<String> = storage
21959            .conn
21960            .query_map_collect(
21961                "EXPLAIN QUERY PLAN                  SELECT id, idx, role, author, created_at, content                  FROM messages INDEXED BY sqlite_autoindex_messages_1                  WHERE conversation_id = ?1                  ORDER BY idx",
21962                fparams![first_id],
21963                |row| row.get_typed(3),
21964            )
21965            .unwrap();
21966        assert!(
21967            message_plan_details
21968                .iter()
21969                .any(|detail| detail.contains("sqlite_autoindex_messages_1")
21970                    || detail.contains("idx_messages_conv_idx")),
21971            "expected per-conversation lexical rebuild fetch to use the conversation_id/idx index, got {message_plan_details:?}"
21972        );
21973        assert!(
21974            !message_plan_details
21975                .iter()
21976                .any(|detail| detail.contains("TEMP B-TREE")),
21977            "expected per-conversation lexical rebuild fetch to avoid sorter temp b-trees, got {message_plan_details:?}"
21978        );
21979    }
21980
21981    #[test]
21982    fn discover_historical_database_bundles_prefers_larger_archives_first() {
21983        let dir = TempDir::new().unwrap();
21984        let canonical_db = dir.path().join("agent_search.db");
21985        fs::write(&canonical_db, b"canonical").unwrap();
21986
21987        let smaller = dir.path().join("agent_search.corrupt.small");
21988        fs::write(&smaller, vec![0_u8; 32]).unwrap();
21989
21990        let backups_dir = dir.path().join("backups");
21991        fs::create_dir_all(&backups_dir).unwrap();
21992        let larger = backups_dir.join("agent_search.db.20260322T020200.bak");
21993        fs::write(&larger, vec![0_u8; 128]).unwrap();
21994
21995        let bundles = discover_historical_database_bundles(&canonical_db);
21996        let ordered_paths: Vec<PathBuf> =
21997            bundles.into_iter().map(|bundle| bundle.root_path).collect();
21998
21999        assert_eq!(ordered_paths, vec![larger, smaller]);
22000    }
22001
22002    #[test]
22003    fn discover_historical_database_bundles_prefers_queryable_direct_bundles_first() {
22004        let dir = TempDir::new().unwrap();
22005        let canonical_db = dir.path().join("agent_search.db");
22006        fs::write(&canonical_db, b"canonical").unwrap();
22007
22008        let larger_corrupt = dir.path().join("agent_search.corrupt.20260324_212907");
22009        fs::write(&larger_corrupt, vec![0_u8; 4096]).unwrap();
22010
22011        let backups_dir = dir.path().join("backups");
22012        fs::create_dir_all(&backups_dir).unwrap();
22013        let smaller_healthy = backups_dir.join("agent_search.db.20260322T020200.bak");
22014        let conn = FrankenConnection::open(smaller_healthy.to_string_lossy().into_owned()).unwrap();
22015        conn.execute_batch(
22016            "CREATE TABLE conversations (id INTEGER PRIMARY KEY, source_path TEXT);
22017             CREATE TABLE messages (
22018                 id INTEGER PRIMARY KEY,
22019                 conversation_id INTEGER NOT NULL,
22020                 idx INTEGER NOT NULL,
22021                 content TEXT
22022             );
22023             INSERT INTO conversations(id, source_path) VALUES (1, '/tmp/history.jsonl');
22024             INSERT INTO messages(id, conversation_id, idx, content)
22025             VALUES (1, 1, 0, 'seed');",
22026        )
22027        .unwrap();
22028        drop(conn);
22029
22030        let bundles = discover_historical_database_bundles(&canonical_db);
22031        let ordered_paths: Vec<PathBuf> = bundles
22032            .iter()
22033            .map(|bundle| bundle.root_path.clone())
22034            .collect();
22035
22036        assert_eq!(ordered_paths, vec![smaller_healthy, larger_corrupt]);
22037        assert!(bundles[0].supports_direct_readonly);
22038        assert!(!bundles[1].supports_direct_readonly);
22039    }
22040
22041    #[test]
22042    fn salvage_historical_databases_skips_unreadable_quarantined_bundles() {
22043        let dir = TempDir::new().unwrap();
22044        let canonical_db = dir.path().join("agent_search.db");
22045        let storage = SqliteStorage::open(&canonical_db).unwrap();
22046
22047        let quarantined = dir.path().join("agent_search.corrupt.20260324_212907");
22048        fs::write(&quarantined, b"not a sqlite database").unwrap();
22049
22050        let discovered: Vec<PathBuf> = discover_historical_database_bundles(&canonical_db)
22051            .into_iter()
22052            .map(|bundle| bundle.root_path)
22053            .collect();
22054        assert_eq!(discovered, vec![quarantined]);
22055
22056        let outcome = storage.salvage_historical_databases(&canonical_db).unwrap();
22057        assert_eq!(outcome.bundles_considered, 1);
22058        assert_eq!(outcome.bundles_imported, 0);
22059        assert_eq!(outcome.conversations_imported, 0);
22060        assert_eq!(outcome.messages_imported, 0);
22061        assert!(storage.list_conversations(10, 0).unwrap().is_empty());
22062    }
22063
22064    #[test]
22065    fn discover_historical_database_bundles_includes_repair_lab_and_snapshots_named_roots() {
22066        let dir = TempDir::new().unwrap();
22067        let canonical_db = dir.path().join("agent_search.db");
22068        fs::write(&canonical_db, b"canonical").unwrap();
22069
22070        let repair_lab_dir = dir.path().join("repair-lab").join("live-copy");
22071        fs::create_dir_all(&repair_lab_dir).unwrap();
22072        let repair_lab_db = repair_lab_dir.join("agent_search.db");
22073        fs::write(&repair_lab_db, vec![0_u8; 96]).unwrap();
22074        fs::write(
22075            repair_lab_dir.join("agent_search.rebuild-test.db"),
22076            vec![0_u8; 192],
22077        )
22078        .unwrap();
22079
22080        let snapshots_dir = dir.path().join("snapshots").join("20260324T013201Z");
22081        fs::create_dir_all(&snapshots_dir).unwrap();
22082        let snapshot_db = snapshots_dir.join("agent_search.db");
22083        fs::write(&snapshot_db, vec![0_u8; 64]).unwrap();
22084
22085        let bundles = discover_historical_database_bundles(&canonical_db);
22086        let ordered_paths: Vec<PathBuf> =
22087            bundles.into_iter().map(|bundle| bundle.root_path).collect();
22088
22089        assert!(ordered_paths.contains(&repair_lab_db));
22090        assert!(ordered_paths.contains(&snapshot_db));
22091        assert!(
22092            !ordered_paths
22093                .iter()
22094                .any(|path| path.file_name().and_then(|name| name.to_str())
22095                    == Some("agent_search.rebuild-test.db"))
22096        );
22097    }
22098
22099    #[test]
22100    fn discover_historical_database_bundles_prefers_healthy_backup_over_replay_priority() {
22101        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22102
22103        let dir = TempDir::new().unwrap();
22104        let canonical_db = dir.path().join("agent_search.db");
22105        fs::write(&canonical_db, b"canonical").unwrap();
22106
22107        let replay_dir = dir
22108            .path()
22109            .join("repair-lab")
22110            .join("replay-20260324T070101Z");
22111        fs::create_dir_all(&replay_dir).unwrap();
22112        let replay_db = replay_dir.join("agent_search.db");
22113        let replay_storage = SqliteStorage::open(&replay_db).unwrap();
22114        let agent = Agent {
22115            id: None,
22116            slug: "codex".into(),
22117            name: "Codex".into(),
22118            version: Some("0.2.3".into()),
22119            kind: AgentKind::Cli,
22120        };
22121        let agent_id = replay_storage.ensure_agent(&agent).unwrap();
22122        let conversation = Conversation {
22123            id: None,
22124            agent_slug: "codex".into(),
22125            workspace: Some(PathBuf::from("/tmp/workspace")),
22126            external_id: Some("replay-conv".into()),
22127            title: Some("Replay bundle".into()),
22128            source_path: PathBuf::from("/tmp/replay.jsonl"),
22129            started_at: Some(1_700_000_000_000),
22130            ended_at: Some(1_700_000_000_100),
22131            approx_tokens: Some(42),
22132            metadata_json: serde_json::Value::Null,
22133            messages: vec![Message {
22134                id: None,
22135                idx: 0,
22136                role: MessageRole::Agent,
22137                author: Some("assistant".into()),
22138                created_at: Some(1_700_000_000_050),
22139                content: "replay message".into(),
22140                extra_json: serde_json::Value::Null,
22141                snippets: Vec::new(),
22142            }],
22143            source_id: LOCAL_SOURCE_ID.into(),
22144            origin_host: None,
22145        };
22146        replay_storage
22147            .insert_conversation_tree(agent_id, None, &conversation)
22148            .unwrap();
22149        drop(replay_storage);
22150
22151        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
22152        let replay_legacy = rusqlite_test_fixture_conn(&replay_db);
22153        replay_legacy
22154            .execute_batch(
22155                "UPDATE meta SET value = '13' WHERE key = 'schema_version';
22156                 DELETE FROM _schema_migrations WHERE version = 14;
22157                 PRAGMA writable_schema = ON;",
22158            )
22159            .unwrap();
22160        replay_legacy
22161            .execute(
22162                "DELETE FROM meta WHERE key = ?1",
22163                [FTS_FRANKEN_REBUILD_META_KEY],
22164            )
22165            .unwrap();
22166        replay_legacy
22167            .execute(
22168                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22169                 VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22170                [duplicate_legacy_fts_sql],
22171            )
22172            .unwrap();
22173        replay_legacy
22174            .execute_batch("PRAGMA writable_schema = OFF;")
22175            .unwrap();
22176        drop(replay_legacy);
22177
22178        let backups_dir = dir.path().join("backups");
22179        fs::create_dir_all(&backups_dir).unwrap();
22180        let clean_backup = backups_dir.join("agent_search.db.20260322T020200.bak");
22181        let clean_storage = SqliteStorage::open(&clean_backup).unwrap();
22182        let clean_agent_id = clean_storage.ensure_agent(&agent).unwrap();
22183        clean_storage
22184            .insert_conversation_tree(clean_agent_id, None, &conversation)
22185            .unwrap();
22186        drop(clean_storage);
22187
22188        let bundles = discover_historical_database_bundles(&canonical_db);
22189        let ordered_paths: Vec<PathBuf> = bundles
22190            .iter()
22191            .map(|bundle| bundle.root_path.clone())
22192            .collect();
22193
22194        assert_eq!(ordered_paths[0], clean_backup);
22195        assert_eq!(ordered_paths[1], replay_db);
22196        assert_eq!(
22197            bundles[0].probe.schema_version,
22198            Some(CURRENT_SCHEMA_VERSION)
22199        );
22200        // Post-V14 cass drops the fts_messages virtual table during migration
22201        // and recreates it lazily on first open, so a freshly-migrated "clean"
22202        // backup has zero fts_messages rows in sqlite_master. The bundle is
22203        // still ranked as healthy by `bundle_health_rank` because 0 rows is a
22204        // legitimate lazy-FTS state (see comment there).
22205        assert_eq!(bundles[0].probe.fts_schema_rows, Some(0));
22206        // `fts_queryable` mirrors a direct rusqlite probe; with 0 sqlite_master
22207        // rows the table isn't queryable until lazy repair runs.
22208        assert!(!bundles[0].probe.fts_queryable);
22209        assert_eq!(bundles[1].probe.schema_version, Some(13));
22210        // The replay bundle had V14 run (dropping fts_messages → 0 rows), then
22211        // the test rolls meta.schema_version back to 13, deletes the V14
22212        // marker, and manually injects a duplicate sqlite_master row. Net
22213        // result: one synthetic (malformed) fts_messages entry.
22214        assert_eq!(bundles[1].probe.fts_schema_rows, Some(1));
22215    }
22216
22217    #[test]
22218    fn ensure_fts_consistency_via_rusqlite_catches_up_missing_rows() {
22219        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22220
22221        let dir = TempDir::new().unwrap();
22222        let db_path = dir.path().join("fts-catchup.db");
22223        let storage = SqliteStorage::open(&db_path).unwrap();
22224        let agent = Agent {
22225            id: None,
22226            slug: "codex".into(),
22227            name: "Codex".into(),
22228            version: Some("0.2.3".into()),
22229            kind: AgentKind::Cli,
22230        };
22231        let agent_id = storage.ensure_agent(&agent).unwrap();
22232        let conversation = Conversation {
22233            id: None,
22234            agent_slug: "codex".into(),
22235            workspace: Some(PathBuf::from("/tmp/workspace")),
22236            external_id: Some("fts-catchup".into()),
22237            title: Some("FTS catchup".into()),
22238            source_path: PathBuf::from("/tmp/fts-catchup.jsonl"),
22239            started_at: Some(1_700_000_000_000),
22240            ended_at: Some(1_700_000_000_100),
22241            approx_tokens: Some(42),
22242            metadata_json: serde_json::Value::Null,
22243            messages: vec![Message {
22244                id: None,
22245                idx: 0,
22246                role: MessageRole::User,
22247                author: Some("user".into()),
22248                created_at: Some(1_700_000_000_050),
22249                content: "initial message".into(),
22250                extra_json: serde_json::Value::Null,
22251                snippets: Vec::new(),
22252            }],
22253            source_id: LOCAL_SOURCE_ID.into(),
22254            origin_host: None,
22255        };
22256        storage
22257            .insert_conversation_tree(agent_id, None, &conversation)
22258            .unwrap();
22259        drop(storage);
22260
22261        rebuild_fts_via_rusqlite(&db_path).unwrap();
22262
22263        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22264        let conversation_id: i64 = conn
22265            .query_row_map("SELECT id FROM conversations LIMIT 1", fparams![], |row| {
22266                row.get_typed(0)
22267            })
22268            .unwrap();
22269        conn.execute_compat(
22270            "INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content, extra_json, extra_bin)
22271             VALUES(2, ?1, 1, 'assistant', 'assistant', 1700000000060, 'authentication catchup', NULL, NULL)",
22272            fparams![conversation_id],
22273        )
22274        .unwrap();
22275        drop(conn);
22276
22277        let repair = ensure_fts_consistency_via_rusqlite(&db_path).unwrap();
22278        assert_eq!(
22279            repair,
22280            FtsConsistencyRepair::IncrementalCatchUp {
22281                inserted_rows: 1,
22282                total_rows: 2
22283            }
22284        );
22285
22286        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22287        let auth_rows: i64 = conn
22288            .query_row_map(
22289                "SELECT COUNT(*) FROM fts_messages WHERE rowid = 2",
22290                fparams![],
22291                |row| row.get_typed(0),
22292            )
22293            .unwrap();
22294        assert_eq!(auth_rows, 1);
22295    }
22296
22297    #[test]
22298    fn rebuild_fts_via_rusqlite_cleans_duplicate_legacy_schema_rows() {
22299        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
22300
22301        let dir = TempDir::new().unwrap();
22302        let db_path = dir.path().join("fts-duplicate-rebuild.db");
22303
22304        let storage = SqliteStorage::open(&db_path).unwrap();
22305        let agent = Agent {
22306            id: None,
22307            slug: "codex".into(),
22308            name: "Codex".into(),
22309            version: Some("0.2.3".into()),
22310            kind: AgentKind::Cli,
22311        };
22312        let agent_id = storage.ensure_agent(&agent).unwrap();
22313        let conversation = Conversation {
22314            id: None,
22315            agent_slug: "codex".into(),
22316            workspace: Some(PathBuf::from("/ws")),
22317            external_id: Some("retro".into()),
22318            title: Some("retro".into()),
22319            source_path: PathBuf::from("/tmp/retro.jsonl"),
22320            started_at: Some(42),
22321            ended_at: Some(42),
22322            approx_tokens: None,
22323            metadata_json: serde_json::Value::Null,
22324            messages: vec![Message {
22325                id: None,
22326                idx: 0,
22327                role: MessageRole::User,
22328                author: None,
22329                created_at: Some(42),
22330                content: "retro investigation".into(),
22331                extra_json: serde_json::Value::Null,
22332                snippets: Vec::new(),
22333            }],
22334            source_id: LOCAL_SOURCE_ID.into(),
22335            origin_host: None,
22336        };
22337        storage
22338            .insert_conversation_tree(agent_id, None, &conversation)
22339            .unwrap();
22340        drop(storage);
22341        materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();
22342
22343        let conn = rusqlite_test_fixture_conn(&db_path);
22344        conn.execute_batch("PRAGMA writable_schema = ON;").unwrap();
22345        conn.execute(
22346            "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
22347             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
22348            ["CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')"],
22349        )
22350        .unwrap();
22351        conn.execute_batch("PRAGMA writable_schema = OFF;").unwrap();
22352        let duplicate_rows: i64 = conn
22353            .query_row(
22354                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
22355                [],
22356                |row| row.get(0),
22357            )
22358            .unwrap();
22359        assert_eq!(duplicate_rows, 2);
22360        drop(conn);
22361
22362        let inserted = rebuild_fts_via_rusqlite(&db_path).unwrap();
22363        assert_eq!(inserted, 1);
22364
22365        let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
22366        let schema_rows = franken_fts_schema_rows(&conn).unwrap();
22367        assert_eq!(
22368            schema_rows, 1,
22369            "DROP TABLE should leave one clean FTS schema"
22370        );
22371        let match_count: i64 = conn
22372            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
22373                row.get_typed(0)
22374            })
22375            .unwrap();
22376        assert_eq!(match_count, 1);
22377    }
22378
22379    // =========================================================================
22380    // Agent storage tests (bead yln.4)
22381    // =========================================================================
22382
22383    #[test]
22384    fn ensure_agent_creates_new() {
22385        let dir = TempDir::new().unwrap();
22386        let db_path = dir.path().join("test.db");
22387        let storage = SqliteStorage::open(&db_path).unwrap();
22388
22389        let agent = Agent {
22390            id: None,
22391            slug: "test_agent".into(),
22392            name: "Test Agent".into(),
22393            version: Some("1.0".into()),
22394            kind: AgentKind::Cli,
22395        };
22396
22397        let id = storage.ensure_agent(&agent).unwrap();
22398        assert!(id > 0);
22399    }
22400
22401    #[test]
22402    fn ensure_agent_returns_existing_id() {
22403        let dir = TempDir::new().unwrap();
22404        let db_path = dir.path().join("test.db");
22405        let storage = SqliteStorage::open(&db_path).unwrap();
22406
22407        let agent = Agent {
22408            id: None,
22409            slug: "codex".into(),
22410            name: "Codex".into(),
22411            version: None,
22412            kind: AgentKind::Cli,
22413        };
22414
22415        let id1 = storage.ensure_agent(&agent).unwrap();
22416        let id2 = storage.ensure_agent(&agent).unwrap();
22417        assert_eq!(id1, id2);
22418    }
22419
22420    #[test]
22421    fn ensure_agent_unchanged_preserves_updated_at() {
22422        let dir = TempDir::new().unwrap();
22423        let db_path = dir.path().join("test.db");
22424        let storage = SqliteStorage::open(&db_path).unwrap();
22425
22426        let agent = Agent {
22427            id: None,
22428            slug: "codex".into(),
22429            name: "Codex".into(),
22430            version: Some("1.0".into()),
22431            kind: AgentKind::Cli,
22432        };
22433
22434        storage.ensure_agent(&agent).unwrap();
22435        let initial_updated_at: i64 = storage
22436            .conn
22437            .query_row_map(
22438                "SELECT updated_at FROM agents WHERE slug = ?1",
22439                fparams![agent.slug.as_str()],
22440                |row| row.get_typed(0),
22441            )
22442            .unwrap();
22443        std::thread::sleep(std::time::Duration::from_millis(5));
22444
22445        storage.ensure_agent(&agent).unwrap();
22446        let fetched_updated_at: i64 = storage
22447            .conn
22448            .query_row_map(
22449                "SELECT updated_at FROM agents WHERE slug = ?1",
22450                fparams![agent.slug.as_str()],
22451                |row| row.get_typed(0),
22452            )
22453            .unwrap();
22454
22455        assert_eq!(fetched_updated_at, initial_updated_at);
22456    }
22457
22458    #[test]
22459    fn ensure_agent_changed_metadata_updates_cached_slug() {
22460        let dir = TempDir::new().unwrap();
22461        let db_path = dir.path().join("test.db");
22462        let storage = SqliteStorage::open(&db_path).unwrap();
22463
22464        let mut agent = Agent {
22465            id: None,
22466            slug: "codex".into(),
22467            name: "Codex".into(),
22468            version: Some("1.0".into()),
22469            kind: AgentKind::Cli,
22470        };
22471
22472        let id1 = storage.ensure_agent(&agent).unwrap();
22473        agent.name = "Codex CLI".into();
22474        agent.version = Some("1.1".into());
22475        let id2 = storage.ensure_agent(&agent).unwrap();
22476
22477        let fetched: (String, Option<String>) = storage
22478            .conn
22479            .query_row_map(
22480                "SELECT name, version FROM agents WHERE slug = ?1",
22481                fparams![agent.slug.as_str()],
22482                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
22483            )
22484            .unwrap();
22485
22486        assert_eq!(id1, id2);
22487        assert_eq!(fetched, ("Codex CLI".into(), Some("1.1".into())));
22488    }
22489
22490    #[test]
22491    fn list_agents_returns_inserted() {
22492        let dir = TempDir::new().unwrap();
22493        let db_path = dir.path().join("test.db");
22494        let storage = SqliteStorage::open(&db_path).unwrap();
22495
22496        let agent = Agent {
22497            id: None,
22498            slug: "new_agent".into(),
22499            name: "New Agent".into(),
22500            version: None,
22501            kind: AgentKind::VsCode,
22502        };
22503        storage.ensure_agent(&agent).unwrap();
22504
22505        let agents = storage.list_agents().unwrap();
22506        assert!(agents.iter().any(|a| a.slug == "new_agent"));
22507    }
22508
22509    // =========================================================================
22510    // Workspace storage tests (bead yln.4)
22511    // =========================================================================
22512
22513    #[test]
22514    fn ensure_workspace_creates_new() {
22515        let dir = TempDir::new().unwrap();
22516        let db_path = dir.path().join("test.db");
22517        let storage = SqliteStorage::open(&db_path).unwrap();
22518
22519        let id = storage
22520            .ensure_workspace(Path::new("/home/user/project"), Some("My Project"))
22521            .unwrap();
22522        assert!(id > 0);
22523    }
22524
22525    #[test]
22526    fn ensure_workspace_returns_existing() {
22527        let dir = TempDir::new().unwrap();
22528        let db_path = dir.path().join("test.db");
22529        let storage = SqliteStorage::open(&db_path).unwrap();
22530
22531        let path = Path::new("/home/user/myproject");
22532        let id1 = storage.ensure_workspace(path, None).unwrap();
22533        let id2 = storage.ensure_workspace(path, None).unwrap();
22534        assert_eq!(id1, id2);
22535    }
22536
22537    #[test]
22538    fn ensure_workspace_changed_display_name_updates_cached_path() {
22539        let dir = TempDir::new().unwrap();
22540        let db_path = dir.path().join("test.db");
22541        let storage = SqliteStorage::open(&db_path).unwrap();
22542
22543        let path = Path::new("/home/user/myproject");
22544        let id1 = storage.ensure_workspace(path, Some("Before")).unwrap();
22545        let id2 = storage.ensure_workspace(path, Some("After")).unwrap();
22546
22547        let display_name: Option<String> = storage
22548            .conn
22549            .query_row_map(
22550                "SELECT display_name FROM workspaces WHERE path = ?1",
22551                fparams![path.to_string_lossy().as_ref()],
22552                |row| row.get_typed(0),
22553            )
22554            .unwrap();
22555
22556        assert_eq!(id1, id2);
22557        assert_eq!(display_name.as_deref(), Some("After"));
22558    }
22559
22560    #[test]
22561    fn list_workspaces_returns_inserted() {
22562        let dir = TempDir::new().unwrap();
22563        let db_path = dir.path().join("test.db");
22564        let storage = SqliteStorage::open(&db_path).unwrap();
22565
22566        storage
22567            .ensure_workspace(Path::new("/test/workspace"), Some("Test WS"))
22568            .unwrap();
22569
22570        let workspaces = storage.list_workspaces().unwrap();
22571        assert!(
22572            workspaces
22573                .iter()
22574                .any(|w| w.path.to_str() == Some("/test/workspace"))
22575        );
22576    }
22577
22578    // =========================================================================
22579    // Source storage tests (bead yln.4)
22580    // =========================================================================
22581
22582    #[test]
22583    fn upsert_source_creates_new() {
22584        let dir = TempDir::new().unwrap();
22585        let db_path = dir.path().join("test.db");
22586        let storage = SqliteStorage::open(&db_path).unwrap();
22587
22588        let source = Source {
22589            id: "test-laptop".into(),
22590            kind: SourceKind::Ssh,
22591            host_label: Some("test.local".into()),
22592            machine_id: Some("test-machine-id".into()),
22593            platform: None,
22594            config_json: None,
22595            created_at: Some(SqliteStorage::now_millis()),
22596            updated_at: None,
22597        };
22598
22599        storage.upsert_source(&source).unwrap();
22600        let fetched = storage.get_source("test-laptop").unwrap();
22601        assert!(fetched.is_some());
22602        assert_eq!(fetched.unwrap().host_label, Some("test.local".into()));
22603    }
22604
22605    #[test]
22606    fn upsert_source_updates_existing() {
22607        let dir = TempDir::new().unwrap();
22608        let db_path = dir.path().join("test.db");
22609        let storage = SqliteStorage::open(&db_path).unwrap();
22610
22611        let source1 = Source {
22612            id: "my-source".into(),
22613            kind: SourceKind::Ssh,
22614            host_label: Some("Original Label".into()),
22615            machine_id: None,
22616            platform: None,
22617            config_json: None,
22618            created_at: Some(SqliteStorage::now_millis()),
22619            updated_at: None,
22620        };
22621        storage.upsert_source(&source1).unwrap();
22622
22623        let source2 = Source {
22624            id: "my-source".into(),
22625            kind: SourceKind::Ssh,
22626            host_label: Some("Updated Label".into()),
22627            machine_id: None,
22628            platform: Some("linux".into()),
22629            config_json: None,
22630            created_at: Some(SqliteStorage::now_millis()),
22631            updated_at: Some(SqliteStorage::now_millis()),
22632        };
22633        storage.upsert_source(&source2).unwrap();
22634
22635        let fetched = storage.get_source("my-source").unwrap().unwrap();
22636        assert_eq!(fetched.host_label, Some("Updated Label".into()));
22637        assert!(fetched.platform.is_some());
22638    }
22639
22640    #[test]
22641    fn upsert_source_unchanged_preserves_updated_at() {
22642        let dir = TempDir::new().unwrap();
22643        let db_path = dir.path().join("test.db");
22644        let storage = SqliteStorage::open(&db_path).unwrap();
22645
22646        let source = Source {
22647            id: "stable-source".into(),
22648            kind: SourceKind::Ssh,
22649            host_label: Some("builder.local".into()),
22650            machine_id: None,
22651            platform: Some("linux".into()),
22652            config_json: Some(serde_json::json!({"role": "bench"})),
22653            created_at: None,
22654            updated_at: None,
22655        };
22656
22657        storage.upsert_source(&source).unwrap();
22658        let initial = storage.get_source("stable-source").unwrap().unwrap();
22659        std::thread::sleep(std::time::Duration::from_millis(5));
22660
22661        storage.upsert_source(&source).unwrap();
22662        let fetched = storage.get_source("stable-source").unwrap().unwrap();
22663
22664        assert_eq!(fetched.created_at, initial.created_at);
22665        assert_eq!(fetched.updated_at, initial.updated_at);
22666        assert_eq!(fetched.host_label, initial.host_label);
22667        assert_eq!(fetched.platform, initial.platform);
22668        assert_eq!(fetched.config_json, initial.config_json);
22669    }
22670
22671    #[test]
22672    fn ensure_source_for_conversation_recreates_remote_source_after_delete() {
22673        let dir = TempDir::new().unwrap();
22674        let db_path = dir.path().join("test.db");
22675        let storage = SqliteStorage::open(&db_path).unwrap();
22676
22677        let conversation = Conversation {
22678            id: None,
22679            agent_slug: "codex".into(),
22680            workspace: Some(PathBuf::from("/ws/cache-recreate")),
22681            external_id: Some("cache-recreate".into()),
22682            title: Some("Cache Recreate".into()),
22683            source_path: PathBuf::from("/log/cache-recreate.jsonl"),
22684            started_at: Some(1_700_000_000_000),
22685            ended_at: Some(1_700_000_000_001),
22686            approx_tokens: Some(16),
22687            metadata_json: serde_json::json!({}),
22688            messages: vec![Message {
22689                id: None,
22690                idx: 0,
22691                role: MessageRole::User,
22692                author: Some("tester".into()),
22693                created_at: Some(1_700_000_000_000),
22694                content: "cache recreate".into(),
22695                extra_json: serde_json::json!({}),
22696                snippets: Vec::new(),
22697            }],
22698            source_id: "cache-remote-source".into(),
22699            origin_host: Some("builder-cache".into()),
22700        };
22701
22702        storage
22703            .ensure_source_for_conversation(&conversation)
22704            .unwrap();
22705        assert!(storage.get_source("cache-remote-source").unwrap().is_some());
22706
22707        let deleted = storage.delete_source("cache-remote-source", false).unwrap();
22708        assert!(deleted);
22709        assert!(storage.get_source("cache-remote-source").unwrap().is_none());
22710
22711        storage
22712            .ensure_source_for_conversation(&conversation)
22713            .unwrap();
22714        let recreated = storage.get_source("cache-remote-source").unwrap();
22715        assert!(recreated.is_some());
22716        assert_eq!(
22717            recreated.unwrap().host_label.as_deref(),
22718            Some("builder-cache")
22719        );
22720    }
22721
22722    #[test]
22723    fn delete_source_removes_entry() {
22724        let dir = TempDir::new().unwrap();
22725        let db_path = dir.path().join("test.db");
22726        let storage = SqliteStorage::open(&db_path).unwrap();
22727
22728        let source = Source {
22729            id: "to-delete".into(),
22730            kind: SourceKind::Local,
22731            host_label: None,
22732            machine_id: None,
22733            platform: None,
22734            config_json: None,
22735            created_at: Some(SqliteStorage::now_millis()),
22736            updated_at: None,
22737        };
22738        storage.upsert_source(&source).unwrap();
22739
22740        let deleted = storage.delete_source("to-delete", false).unwrap();
22741        assert!(deleted);
22742
22743        let fetched = storage.get_source("to-delete").unwrap();
22744        assert!(fetched.is_none());
22745    }
22746
22747    #[test]
22748    fn delete_source_cannot_delete_local() {
22749        let dir = TempDir::new().unwrap();
22750        let db_path = dir.path().join("test.db");
22751        let storage = SqliteStorage::open(&db_path).unwrap();
22752
22753        let result = storage.delete_source(LOCAL_SOURCE_ID, false);
22754        assert!(result.is_err());
22755    }
22756
22757    #[test]
22758    fn list_sources_includes_local() {
22759        let dir = TempDir::new().unwrap();
22760        let db_path = dir.path().join("test.db");
22761        let storage = SqliteStorage::open(&db_path).unwrap();
22762
22763        let sources = storage.list_sources().unwrap();
22764        assert!(sources.iter().any(|s| s.id == LOCAL_SOURCE_ID));
22765    }
22766
22767    #[test]
22768    fn insert_conversation_tree_blank_local_source_normalizes_to_local_id() {
22769        let dir = TempDir::new().unwrap();
22770        let db_path = dir.path().join("test.db");
22771        let storage = SqliteStorage::open(&db_path).unwrap();
22772
22773        let agent_id = storage
22774            .ensure_agent(&Agent {
22775                id: None,
22776                slug: "codex".into(),
22777                name: "Codex".into(),
22778                version: None,
22779                kind: AgentKind::Cli,
22780            })
22781            .unwrap();
22782
22783        let conversation = Conversation {
22784            id: None,
22785            agent_slug: "codex".into(),
22786            workspace: None,
22787            external_id: Some("blank-local-source".into()),
22788            title: Some("Blank local source".into()),
22789            source_path: dir.path().join("blank-local.jsonl"),
22790            started_at: Some(1_700_000_000_000),
22791            ended_at: Some(1_700_000_000_001),
22792            approx_tokens: None,
22793            metadata_json: serde_json::Value::Null,
22794            messages: vec![Message {
22795                id: None,
22796                idx: 0,
22797                role: MessageRole::User,
22798                author: None,
22799                created_at: Some(1_700_000_000_000),
22800                content: "hello".into(),
22801                extra_json: serde_json::Value::Null,
22802                snippets: Vec::new(),
22803            }],
22804            source_id: "   ".into(),
22805            origin_host: None,
22806        };
22807
22808        storage
22809            .insert_conversation_tree(agent_id, None, &conversation)
22810            .unwrap();
22811
22812        assert!(storage.get_source("   ").unwrap().is_none());
22813        let source = storage
22814            .get_source(LOCAL_SOURCE_ID)
22815            .unwrap()
22816            .expect("local source row should exist");
22817        assert_eq!(source.kind, SourceKind::Local);
22818        assert_eq!(source.host_label, None);
22819
22820        let conversations = storage.list_conversations(10, 0).unwrap();
22821        assert_eq!(conversations.len(), 1);
22822        assert_eq!(conversations[0].source_id, LOCAL_SOURCE_ID);
22823        assert_eq!(conversations[0].origin_host, None);
22824    }
22825
22826    #[test]
22827    fn repeated_local_inserts_do_not_touch_bootstrap_source_row() {
22828        let dir = TempDir::new().unwrap();
22829        let db_path = dir.path().join("test.db");
22830        let storage = SqliteStorage::open(&db_path).unwrap();
22831
22832        let agent_id = storage
22833            .ensure_agent(&Agent {
22834                id: None,
22835                slug: "codex".into(),
22836                name: "Codex".into(),
22837                version: None,
22838                kind: AgentKind::Cli,
22839            })
22840            .unwrap();
22841
22842        let bootstrap_updated_at: i64 = storage
22843            .conn
22844            .query_row_map(
22845                "SELECT updated_at FROM sources WHERE id = ?1",
22846                fparams![LOCAL_SOURCE_ID],
22847                |row| row.get_typed(0),
22848            )
22849            .unwrap();
22850
22851        let make_conversation = |external_id: &str, suffix: &str| Conversation {
22852            id: None,
22853            agent_slug: "codex".into(),
22854            workspace: None,
22855            external_id: Some(external_id.into()),
22856            title: Some(format!("Local source {suffix}")),
22857            source_path: dir.path().join(format!("local-{suffix}.jsonl")),
22858            started_at: Some(1_700_000_000_000),
22859            ended_at: Some(1_700_000_000_001),
22860            approx_tokens: None,
22861            metadata_json: serde_json::Value::Null,
22862            messages: vec![Message {
22863                id: None,
22864                idx: 0,
22865                role: MessageRole::User,
22866                author: None,
22867                created_at: Some(1_700_000_000_000),
22868                content: format!("hello-{suffix}"),
22869                extra_json: serde_json::Value::Null,
22870                snippets: Vec::new(),
22871            }],
22872            source_id: LOCAL_SOURCE_ID.into(),
22873            origin_host: None,
22874        };
22875
22876        std::thread::sleep(std::time::Duration::from_millis(5));
22877        storage
22878            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-1", "one"))
22879            .unwrap();
22880        let after_first_insert: i64 = storage
22881            .conn
22882            .query_row_map(
22883                "SELECT updated_at FROM sources WHERE id = ?1",
22884                fparams![LOCAL_SOURCE_ID],
22885                |row| row.get_typed(0),
22886            )
22887            .unwrap();
22888
22889        std::thread::sleep(std::time::Duration::from_millis(5));
22890        storage
22891            .insert_conversation_tree(agent_id, None, &make_conversation("local-source-2", "two"))
22892            .unwrap();
22893        let after_second_insert: i64 = storage
22894            .conn
22895            .query_row_map(
22896                "SELECT updated_at FROM sources WHERE id = ?1",
22897                fparams![LOCAL_SOURCE_ID],
22898                |row| row.get_typed(0),
22899            )
22900            .unwrap();
22901
22902        assert_eq!(after_first_insert, bootstrap_updated_at);
22903        assert_eq!(after_second_insert, bootstrap_updated_at);
22904    }
22905
22906    #[test]
22907    fn insert_conversation_tree_blank_remote_source_normalizes_to_origin_host() {
22908        let dir = TempDir::new().unwrap();
22909        let db_path = dir.path().join("test.db");
22910        let storage = SqliteStorage::open(&db_path).unwrap();
22911
22912        let agent_id = storage
22913            .ensure_agent(&Agent {
22914                id: None,
22915                slug: "codex".into(),
22916                name: "Codex".into(),
22917                version: None,
22918                kind: AgentKind::Cli,
22919            })
22920            .unwrap();
22921
22922        let conversation = Conversation {
22923            id: None,
22924            agent_slug: "codex".into(),
22925            workspace: None,
22926            external_id: Some("blank-remote-source".into()),
22927            title: Some("Blank remote source".into()),
22928            source_path: dir.path().join("blank-remote.jsonl"),
22929            started_at: Some(1_700_000_000_000),
22930            ended_at: Some(1_700_000_000_001),
22931            approx_tokens: None,
22932            metadata_json: serde_json::Value::Null,
22933            messages: vec![Message {
22934                id: None,
22935                idx: 0,
22936                role: MessageRole::User,
22937                author: None,
22938                created_at: Some(1_700_000_000_000),
22939                content: "hello".into(),
22940                extra_json: serde_json::Value::Null,
22941                snippets: Vec::new(),
22942            }],
22943            source_id: "   ".into(),
22944            origin_host: Some("user@work-laptop".into()),
22945        };
22946
22947        storage
22948            .insert_conversation_tree(agent_id, None, &conversation)
22949            .unwrap();
22950
22951        assert!(storage.get_source("   ").unwrap().is_none());
22952        let source = storage
22953            .get_source("user@work-laptop")
22954            .unwrap()
22955            .expect("normalized remote source row should exist");
22956        assert_eq!(source.kind, SourceKind::Ssh);
22957        assert_eq!(source.host_label.as_deref(), Some("user@work-laptop"));
22958
22959        let conversations = storage.list_conversations(10, 0).unwrap();
22960        assert_eq!(conversations.len(), 1);
22961        assert_eq!(conversations[0].source_id, "user@work-laptop");
22962        assert_eq!(
22963            conversations[0].origin_host.as_deref(),
22964            Some("user@work-laptop")
22965        );
22966    }
22967
22968    #[test]
22969    fn insert_conversations_batched_normalizes_host_only_remote_source_id() {
22970        let dir = TempDir::new().unwrap();
22971        let db_path = dir.path().join("test.db");
22972        let storage = SqliteStorage::open(&db_path).unwrap();
22973
22974        let agent_id = storage
22975            .ensure_agent(&Agent {
22976                id: None,
22977                slug: "codex".into(),
22978                name: "Codex".into(),
22979                version: None,
22980                kind: AgentKind::Cli,
22981            })
22982            .unwrap();
22983
22984        let conversation = Conversation {
22985            id: None,
22986            agent_slug: "codex".into(),
22987            workspace: None,
22988            external_id: Some("batched-blank-remote-source".into()),
22989            title: Some("Batched blank remote source".into()),
22990            source_path: dir.path().join("batched-blank-remote.jsonl"),
22991            started_at: Some(1_700_000_000_000),
22992            ended_at: Some(1_700_000_000_001),
22993            approx_tokens: None,
22994            metadata_json: serde_json::Value::Null,
22995            messages: vec![Message {
22996                id: None,
22997                idx: 0,
22998                role: MessageRole::User,
22999                author: None,
23000                created_at: Some(1_700_000_000_000),
23001                content: "hello".into(),
23002                extra_json: serde_json::Value::Null,
23003                snippets: Vec::new(),
23004            }],
23005            source_id: "   ".into(),
23006            origin_host: Some("user@batch-host".into()),
23007        };
23008
23009        storage
23010            .insert_conversations_batched(&[(agent_id, None, &conversation)])
23011            .unwrap();
23012
23013        assert!(storage.get_source("   ").unwrap().is_none());
23014        let source = storage
23015            .get_source("user@batch-host")
23016            .unwrap()
23017            .expect("normalized batched remote source row should exist");
23018        assert_eq!(source.kind, SourceKind::Ssh);
23019        assert_eq!(source.host_label.as_deref(), Some("user@batch-host"));
23020
23021        let conversations = storage.list_conversations(10, 0).unwrap();
23022        assert_eq!(conversations.len(), 1);
23023        assert_eq!(conversations[0].source_id, "user@batch-host");
23024        assert_eq!(
23025            conversations[0].origin_host.as_deref(),
23026            Some("user@batch-host")
23027        );
23028    }
23029
23030    #[test]
23031    fn get_source_ids_excludes_local() {
23032        let dir = TempDir::new().unwrap();
23033        let db_path = dir.path().join("test.db");
23034        let storage = SqliteStorage::open(&db_path).unwrap();
23035
23036        // Add a non-local source
23037        let source = Source {
23038            id: "remote-1".into(),
23039            kind: SourceKind::Ssh,
23040            host_label: Some("server".into()),
23041            machine_id: None,
23042            platform: None,
23043            config_json: None,
23044            created_at: Some(SqliteStorage::now_millis()),
23045            updated_at: None,
23046        };
23047        storage.upsert_source(&source).unwrap();
23048
23049        let ids = storage.get_source_ids().unwrap();
23050        assert!(!ids.contains(&LOCAL_SOURCE_ID.to_string()));
23051        assert!(ids.contains(&"remote-1".to_string()));
23052    }
23053
23054    // =========================================================================
23055    // Scan timestamp tests (bead yln.4)
23056    // =========================================================================
23057
23058    #[test]
23059    fn get_last_scan_ts_returns_none_initially() {
23060        let dir = TempDir::new().unwrap();
23061        let db_path = dir.path().join("test.db");
23062        let storage = SqliteStorage::open(&db_path).unwrap();
23063
23064        let ts = storage.get_last_scan_ts().unwrap();
23065        assert!(ts.is_none());
23066    }
23067
23068    #[test]
23069    fn set_and_get_last_scan_ts() {
23070        let dir = TempDir::new().unwrap();
23071        let db_path = dir.path().join("test.db");
23072        let storage = SqliteStorage::open(&db_path).unwrap();
23073
23074        let expected_ts = 1700000000000_i64;
23075        storage.set_last_scan_ts(expected_ts).unwrap();
23076
23077        let actual_ts = storage.get_last_scan_ts().unwrap();
23078        assert_eq!(actual_ts, Some(expected_ts));
23079    }
23080
23081    // =========================================================================
23082    // now_millis utility test (bead yln.4)
23083    // =========================================================================
23084
23085    #[test]
23086    fn now_millis_returns_reasonable_value() {
23087        let ts = SqliteStorage::now_millis();
23088        // Should be after Jan 1, 2020 (approx 1577836800000)
23089        assert!(ts > 1577836800000);
23090        // Should be before Jan 1, 2100 (approx 4102444800000)
23091        assert!(ts < 4102444800000);
23092    }
23093
23094    // =========================================================================
23095    // Binary Metadata Serialization Tests (Opt 3.1)
23096    // =========================================================================
23097
23098    #[test]
23099    fn msgpack_roundtrip_basic_object() {
23100        let value = serde_json::json!({
23101            "key": "value",
23102            "number": 42,
23103            "nested": { "inner": true }
23104        });
23105
23106        let bytes = serialize_json_to_msgpack(&value).expect("should serialize");
23107        let recovered = deserialize_msgpack_to_json(&bytes);
23108
23109        assert_eq!(value, recovered);
23110    }
23111
23112    #[test]
23113    fn msgpack_returns_none_for_null() {
23114        let value = serde_json::Value::Null;
23115        assert!(serialize_json_to_msgpack(&value).is_none());
23116    }
23117
23118    #[test]
23119    fn message_insert_stores_null_extra_json_as_sql_null() {
23120        let dir = TempDir::new().unwrap();
23121        let db_path = dir.path().join("test.db");
23122        let storage = SqliteStorage::open(&db_path).unwrap();
23123        let agent_id = storage
23124            .ensure_agent(&Agent {
23125                id: None,
23126                slug: "codex".into(),
23127                name: "Codex".into(),
23128                version: None,
23129                kind: AgentKind::Cli,
23130            })
23131            .unwrap();
23132        let conversation = Conversation {
23133            id: None,
23134            agent_slug: "codex".into(),
23135            workspace: None,
23136            external_id: Some("null-extra-json".into()),
23137            title: Some("Null extra_json".into()),
23138            source_path: PathBuf::from("/tmp/null-extra-json.jsonl"),
23139            started_at: Some(1_700_000_000_000),
23140            ended_at: Some(1_700_000_000_001),
23141            approx_tokens: None,
23142            metadata_json: serde_json::Value::Null,
23143            messages: vec![Message {
23144                id: None,
23145                idx: 0,
23146                role: MessageRole::User,
23147                author: None,
23148                created_at: Some(1_700_000_000_000),
23149                content: "null metadata message".into(),
23150                extra_json: serde_json::Value::Null,
23151                snippets: Vec::new(),
23152            }],
23153            source_id: LOCAL_SOURCE_ID.into(),
23154            origin_host: None,
23155        };
23156
23157        let conversation_id = storage
23158            .insert_conversation_tree(agent_id, None, &conversation)
23159            .unwrap()
23160            .conversation_id;
23161
23162        let (extra_json, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23163            .conn
23164            .query_row_map(
23165                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23166                fparams![conversation_id],
23167                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23168            )
23169            .unwrap();
23170        assert!(extra_json.is_none());
23171        assert!(extra_bin.is_none());
23172
23173        let stored = storage.fetch_messages(conversation_id).unwrap();
23174        assert!(stored[0].extra_json.is_null());
23175    }
23176
23177    #[test]
23178    fn message_insert_stores_nonempty_extra_json_as_msgpack_only() {
23179        let dir = TempDir::new().unwrap();
23180        let db_path = dir.path().join("test.db");
23181        let storage = SqliteStorage::open(&db_path).unwrap();
23182        let agent_id = storage
23183            .ensure_agent(&Agent {
23184                id: None,
23185                slug: "codex".into(),
23186                name: "Codex".into(),
23187                version: None,
23188                kind: AgentKind::Cli,
23189            })
23190            .unwrap();
23191        let extra_json = serde_json::json!({ "idx": 7, "kind": "profile" });
23192        let conversation = Conversation {
23193            id: None,
23194            agent_slug: "codex".into(),
23195            workspace: None,
23196            external_id: Some("msgpack-extra-json".into()),
23197            title: Some("MessagePack extra_json".into()),
23198            source_path: PathBuf::from("/tmp/msgpack-extra-json.jsonl"),
23199            started_at: Some(1_700_000_000_000),
23200            ended_at: Some(1_700_000_000_001),
23201            approx_tokens: None,
23202            metadata_json: serde_json::Value::Null,
23203            messages: vec![Message {
23204                id: None,
23205                idx: 0,
23206                role: MessageRole::User,
23207                author: None,
23208                created_at: Some(1_700_000_000_000),
23209                content: "msgpack metadata message".into(),
23210                extra_json: extra_json.clone(),
23211                snippets: Vec::new(),
23212            }],
23213            source_id: LOCAL_SOURCE_ID.into(),
23214            origin_host: None,
23215        };
23216
23217        let conversation_id = storage
23218            .insert_conversation_tree(agent_id, None, &conversation)
23219            .unwrap()
23220            .conversation_id;
23221
23222        let (extra_json_text, extra_bin): (Option<String>, Option<Vec<u8>>) = storage
23223            .conn
23224            .query_row_map(
23225                "SELECT extra_json, extra_bin FROM messages WHERE conversation_id = ?1",
23226                fparams![conversation_id],
23227                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23228            )
23229            .unwrap();
23230        assert!(extra_json_text.is_none());
23231        assert!(extra_bin.is_some());
23232
23233        let stored = storage.fetch_messages(conversation_id).unwrap();
23234        assert_eq!(stored[0].extra_json, extra_json);
23235    }
23236
23237    #[test]
23238    fn conversation_insert_preserves_null_metadata_json_as_json_null() {
23239        let dir = TempDir::new().unwrap();
23240        let db_path = dir.path().join("test.db");
23241        let storage = SqliteStorage::open(&db_path).unwrap();
23242        let agent_id = storage
23243            .ensure_agent(&Agent {
23244                id: None,
23245                slug: "codex".into(),
23246                name: "Codex".into(),
23247                version: None,
23248                kind: AgentKind::Cli,
23249            })
23250            .unwrap();
23251        let conversation = Conversation {
23252            id: None,
23253            agent_slug: "codex".into(),
23254            workspace: None,
23255            external_id: Some("null-conversation-metadata".into()),
23256            title: Some("Null conversation metadata".into()),
23257            source_path: PathBuf::from("/tmp/null-conversation-metadata.jsonl"),
23258            started_at: Some(1_700_000_000_000),
23259            ended_at: Some(1_700_000_000_001),
23260            approx_tokens: None,
23261            metadata_json: serde_json::Value::Null,
23262            messages: vec![Message {
23263                id: None,
23264                idx: 0,
23265                role: MessageRole::User,
23266                author: None,
23267                created_at: Some(1_700_000_000_000),
23268                content: "null conversation metadata message".into(),
23269                extra_json: serde_json::Value::Null,
23270                snippets: Vec::new(),
23271            }],
23272            source_id: LOCAL_SOURCE_ID.into(),
23273            origin_host: None,
23274        };
23275
23276        storage
23277            .insert_conversation_tree(agent_id, None, &conversation)
23278            .unwrap();
23279
23280        let (metadata_json, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23281            .conn
23282            .query_row_map(
23283                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23284                fparams!["null-conversation-metadata"],
23285                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23286            )
23287            .unwrap();
23288        assert_eq!(metadata_json.as_deref(), Some("null"));
23289        assert!(metadata_bin.is_none());
23290
23291        let listed = storage.list_conversations(10, 0).unwrap();
23292        assert!(listed[0].metadata_json.is_null());
23293    }
23294
23295    #[test]
23296    fn conversation_insert_stores_nonempty_metadata_as_msgpack_only() {
23297        let dir = TempDir::new().unwrap();
23298        let db_path = dir.path().join("test.db");
23299        let storage = SqliteStorage::open(&db_path).unwrap();
23300        let agent_id = storage
23301            .ensure_agent(&Agent {
23302                id: None,
23303                slug: "codex".into(),
23304                name: "Codex".into(),
23305                version: None,
23306                kind: AgentKind::Cli,
23307            })
23308            .unwrap();
23309        let metadata_json = serde_json::json!({ "bench": true, "source": "profile" });
23310        let conversation = Conversation {
23311            id: None,
23312            agent_slug: "codex".into(),
23313            workspace: None,
23314            external_id: Some("msgpack-conversation-metadata".into()),
23315            title: Some("MessagePack conversation metadata".into()),
23316            source_path: PathBuf::from("/tmp/msgpack-conversation-metadata.jsonl"),
23317            started_at: Some(1_700_000_000_000),
23318            ended_at: Some(1_700_000_000_001),
23319            approx_tokens: None,
23320            metadata_json: metadata_json.clone(),
23321            messages: vec![Message {
23322                id: None,
23323                idx: 0,
23324                role: MessageRole::User,
23325                author: None,
23326                created_at: Some(1_700_000_000_000),
23327                content: "msgpack conversation metadata message".into(),
23328                extra_json: serde_json::Value::Null,
23329                snippets: Vec::new(),
23330            }],
23331            source_id: LOCAL_SOURCE_ID.into(),
23332            origin_host: None,
23333        };
23334
23335        storage
23336            .insert_conversation_tree(agent_id, None, &conversation)
23337            .unwrap();
23338
23339        let (metadata_text, metadata_bin): (Option<String>, Option<Vec<u8>>) = storage
23340            .conn
23341            .query_row_map(
23342                "SELECT metadata_json, metadata_bin FROM conversations WHERE external_id = ?1",
23343                fparams!["msgpack-conversation-metadata"],
23344                |row| Ok((row.get_typed(0)?, row.get_typed(1)?)),
23345            )
23346            .unwrap();
23347        assert!(metadata_text.is_none());
23348        assert!(metadata_bin.is_some());
23349
23350        let listed = storage.list_conversations(10, 0).unwrap();
23351        assert_eq!(listed[0].metadata_json, metadata_json);
23352    }
23353
23354    #[test]
23355    fn msgpack_returns_none_for_empty_object() {
23356        let value = serde_json::json!({});
23357        assert!(serialize_json_to_msgpack(&value).is_none());
23358    }
23359
23360    #[test]
23361    fn parse_historical_json_column_preserves_large_payloads_as_raw_json() {
23362        let raw = format!("{{\"blob\":\"{}\"}}", "x".repeat(1_000_000));
23363
23364        let value = parse_historical_json_column(Some(raw.clone()));
23365
23366        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23367        assert_eq!(json_value_size_hint(&value), raw.len());
23368    }
23369
23370    #[test]
23371    fn parse_historical_json_column_preserves_small_payloads_as_raw_json() {
23372        let raw = String::from("{\"ok\":true,\"n\":1}");
23373
23374        let value = parse_historical_json_column(Some(raw.clone()));
23375
23376        assert_eq!(historical_raw_json(&value), Some(raw.as_str()));
23377    }
23378
23379    #[test]
23380    fn msgpack_serializes_non_empty_array() {
23381        let value = serde_json::json!([1, 2, 3]);
23382        let bytes = serialize_json_to_msgpack(&value).expect("should serialize array");
23383        let recovered = deserialize_msgpack_to_json(&bytes);
23384        assert_eq!(value, recovered);
23385    }
23386
23387    #[test]
23388    fn msgpack_smaller_than_json() {
23389        let value = serde_json::json!({
23390            "field_name_one": "some_value",
23391            "field_name_two": 123456,
23392            "field_name_three": [1, 2, 3, 4, 5],
23393            "field_name_four": { "nested": true }
23394        });
23395
23396        let json_bytes = serde_json::to_vec(&value).unwrap();
23397        let msgpack_bytes = serialize_json_to_msgpack(&value).unwrap();
23398
23399        // MessagePack should be smaller due to more compact encoding
23400        assert!(
23401            msgpack_bytes.len() < json_bytes.len(),
23402            "MessagePack ({} bytes) should be smaller than JSON ({} bytes)",
23403            msgpack_bytes.len(),
23404            json_bytes.len()
23405        );
23406    }
23407
23408    #[test]
23409    fn migration_v7_adds_binary_columns() {
23410        let dir = TempDir::new().unwrap();
23411        let db_path = dir.path().join("test.db");
23412        let storage = SqliteStorage::open(&db_path).unwrap();
23413
23414        // Verify metadata_bin column exists
23415        let has_metadata_bin = storage
23416            .raw()
23417            .query("PRAGMA table_info(conversations)")
23418            .unwrap()
23419            .iter()
23420            .any(|row| row.get_typed::<String>(1).unwrap() == "metadata_bin");
23421        assert!(
23422            has_metadata_bin,
23423            "conversations should have metadata_bin column"
23424        );
23425
23426        // Verify extra_bin column exists
23427        let has_extra_bin = storage
23428            .raw()
23429            .query("PRAGMA table_info(messages)")
23430            .unwrap()
23431            .iter()
23432            .any(|row| row.get_typed::<String>(1).unwrap() == "extra_bin");
23433        assert!(has_extra_bin, "messages should have extra_bin column");
23434    }
23435
23436    #[test]
23437    fn insert_conversation_tree_rehydrates_append_tail_state_cache_after_manual_clear() {
23438        let dir = TempDir::new().unwrap();
23439        let db_path = dir.path().join("append-tail-state-cache.db");
23440        let storage = SqliteStorage::open(&db_path).unwrap();
23441        let agent_id = storage
23442            .ensure_agent(&Agent {
23443                id: None,
23444                slug: "codex".into(),
23445                name: "Codex".into(),
23446                version: None,
23447                kind: AgentKind::Cli,
23448            })
23449            .unwrap();
23450        let workspace = PathBuf::from("/ws/profiled-append-remote");
23451        let workspace_id = storage.ensure_workspace(&workspace, None).unwrap();
23452
23453        let initial = make_profiled_append_remote_merge_conversation(11, 5);
23454        let insert_outcome = storage
23455            .insert_conversation_tree(agent_id, Some(workspace_id), &initial)
23456            .unwrap();
23457        let conversation_id = insert_outcome.conversation_id;
23458
23459        let initial_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23460            .raw()
23461            .query_row_map(
23462                "SELECT ended_at, last_message_idx, last_message_created_at
23463                 FROM conversation_tail_state
23464                 WHERE conversation_id = ?1",
23465                fparams![conversation_id],
23466                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23467            )
23468            .unwrap();
23469        assert_eq!(initial_tail, (Some(111_005), Some(4), Some(111_004)));
23470
23471        storage
23472            .raw()
23473            .execute_compat(
23474                "UPDATE conversations SET ended_at = ?1 WHERE id = ?2",
23475                fparams![111_999_i64, conversation_id],
23476            )
23477            .unwrap();
23478        storage
23479            .raw()
23480            .execute_compat(
23481                "DELETE FROM conversation_tail_state WHERE conversation_id = ?1",
23482                fparams![conversation_id],
23483            )
23484            .unwrap();
23485
23486        let appended = make_profiled_append_remote_merge_conversation(11, 10);
23487        let append_outcome = storage
23488            .insert_conversation_tree(agent_id, Some(workspace_id), &appended)
23489            .unwrap();
23490        assert_eq!(append_outcome.inserted_indices, vec![5, 6, 7, 8, 9]);
23491
23492        let final_tail: (Option<i64>, Option<i64>, Option<i64>) = storage
23493            .raw()
23494            .query_row_map(
23495                "SELECT ended_at, last_message_idx, last_message_created_at
23496                 FROM conversation_tail_state
23497                 WHERE conversation_id = ?1",
23498                fparams![conversation_id],
23499                |row| Ok((row.get_typed(0)?, row.get_typed(1)?, row.get_typed(2)?)),
23500            )
23501            .unwrap();
23502        assert_eq!(final_tail, (Some(111_999), Some(9), Some(111_009)));
23503    }
23504
23505    #[test]
23506    fn msgpack_deserialize_empty_returns_default() {
23507        let recovered = deserialize_msgpack_to_json(&[]);
23508        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23509    }
23510
23511    #[test]
23512    fn msgpack_deserialize_garbage_returns_default() {
23513        // Use truncated msgpack data that will fail to parse
23514        // 0x85 indicates a fixmap with 5 elements, but we don't provide them
23515        let recovered = deserialize_msgpack_to_json(&[0x85]);
23516        assert_eq!(recovered, serde_json::Value::Object(serde_json::Map::new()));
23517    }
23518
23519    #[test]
23520    fn stats_aggregator_collects_and_expands() {
23521        let mut agg = StatsAggregator::new();
23522        assert!(agg.is_empty());
23523
23524        // Record some stats
23525        // Day 100, agent "claude", source "local"
23526        agg.record("claude", "local", 100, 5, 500);
23527        // Day 100, agent "codex", source "local"
23528        agg.record("codex", "local", 100, 3, 300);
23529        // Day 101, agent "claude", source "local"
23530        agg.record("claude", "local", 101, 2, 200);
23531
23532        assert!(!agg.is_empty());
23533        assert_eq!(agg.raw_entry_count(), 3);
23534
23535        let entries = agg.expand();
23536        // Each raw entry expands to 4 permutations.
23537        // But (all, local) and (all, all) will aggregate.
23538        //
23539        // Raw:
23540        // 1. (100, claude, local) -> 1 sess, 5 msgs, 500 chars
23541        // 2. (100, codex, local)  -> 1 sess, 3 msgs, 300 chars
23542        // 3. (101, claude, local) -> 1 sess, 2 msgs, 200 chars
23543        //
23544        // Expanded 1 (day 100):
23545        // - (100, claude, local): 1 sess, 5 msgs, 500 chars
23546        // - (100, all, local):    1 (from claude) + 1 (from codex) = 2 sess, 8 msgs, 800 chars
23547        // - (100, claude, all):   1 sess, 5 msgs, 500 chars
23548        // - (100, codex, local):  1 sess, 3 msgs, 300 chars
23549        // - (100, codex, all):    1 sess, 3 msgs, 300 chars
23550        // - (100, all, all):      2 sess, 8 msgs, 800 chars
23551        //
23552        // Expanded 3 (day 101):
23553        // - (101, claude, local): 1 sess, 2 msgs, 200 chars
23554        // - (101, all, local):    1 sess, 2 msgs, 200 chars
23555        // - (101, claude, all):   1 sess, 2 msgs, 200 chars
23556        // - (101, all, all):      1 sess, 2 msgs, 200 chars
23557        //
23558        // Total unique keys in expanded map:
23559        // Day 100: (claude, local), (codex, local), (all, local), (claude, all), (codex, all), (all, all) = 6
23560        // Day 101: (claude, local), (all, local), (claude, all), (all, all) = 4
23561        // Total = 10 entries
23562
23563        assert_eq!(entries.len(), 10);
23564
23565        // Verify totals for day 100, all/all
23566        let day100_all = entries
23567            .iter()
23568            .find(|(d, a, s, _)| *d == 100 && a == "all" && s == "all")
23569            .unwrap();
23570        assert_eq!(day100_all.3.session_count_delta, 2);
23571        assert_eq!(day100_all.3.message_count_delta, 8);
23572        assert_eq!(day100_all.3.total_chars_delta, 800);
23573    }
23574
23575    // =========================================================================
23576    // LazyFrankenDb tests (bd-1ueu)
23577    // =========================================================================
23578
23579    #[test]
23580    fn lazy_franken_db_not_open_before_get() {
23581        let dir = TempDir::new().unwrap();
23582        let db_path = dir.path().join("lazy_test.db");
23583
23584        // Create a real DB so the path exists
23585        let _storage = SqliteStorage::open(&db_path).unwrap();
23586
23587        let lazy = LazyFrankenDb::new(db_path);
23588        assert!(
23589            !lazy.is_open(),
23590            "LazyFrankenDb must not open on construction"
23591        );
23592    }
23593
23594    #[test]
23595    fn lazy_franken_db_opens_on_first_get() {
23596        let dir = TempDir::new().unwrap();
23597        let db_path = dir.path().join("lazy_test.db");
23598
23599        // Create a real DB so the path exists
23600        let _storage = SqliteStorage::open(&db_path).unwrap();
23601        drop(_storage);
23602
23603        let lazy = LazyFrankenDb::new(db_path);
23604        assert!(!lazy.is_open());
23605
23606        let conn = lazy.get("test").expect("should open successfully");
23607        let count: i64 = conn
23608            .query_row_map("SELECT COUNT(*) FROM conversations", fparams![], |r| {
23609                r.get_typed(0)
23610            })
23611            .unwrap();
23612        assert_eq!(count, 0);
23613        drop(conn);
23614
23615        assert!(lazy.is_open(), "LazyFrankenDb must be open after get()");
23616    }
23617
23618    #[test]
23619    fn lazy_franken_db_reuses_connection() {
23620        let dir = TempDir::new().unwrap();
23621        let db_path = dir.path().join("lazy_test.db");
23622        let _storage = SqliteStorage::open(&db_path).unwrap();
23623        drop(_storage);
23624
23625        let lazy = LazyFrankenDb::new(db_path);
23626
23627        // First access opens
23628        {
23629            let conn = lazy.get("first").unwrap();
23630            conn.execute_batch("CREATE TABLE IF NOT EXISTS test_tbl (id INTEGER)")
23631                .unwrap();
23632        }
23633
23634        // Second access reuses (table still exists)
23635        {
23636            let conn = lazy.get("second").unwrap();
23637            let count: i64 = conn
23638                .query_row_map("SELECT COUNT(*) FROM test_tbl", fparams![], |r| {
23639                    r.get_typed(0)
23640                })
23641                .unwrap();
23642            assert_eq!(count, 0);
23643        }
23644    }
23645
23646    #[test]
23647    fn lazy_franken_db_not_found_error() {
23648        let dir = TempDir::new().unwrap();
23649        let db_path = dir.path().join("nonexistent.db");
23650
23651        let lazy = LazyFrankenDb::new(db_path);
23652        let result = lazy.get("test");
23653        assert!(result.is_err());
23654        assert!(
23655            matches!(result.unwrap_err(), LazyDbError::NotFound(_)),
23656            "should return NotFound for missing DB"
23657        );
23658    }
23659
23660    #[test]
23661    fn lazy_franken_db_path_accessor() {
23662        let path = PathBuf::from("/tmp/test_lazy.db");
23663        let lazy = LazyFrankenDb::new(path.clone());
23664        assert_eq!(lazy.path(), path.as_path());
23665    }
23666
23667    // =========================================================================
23668    // Pricing / cost estimation tests (bead z9fse.10)
23669    // =========================================================================
23670
23671    #[test]
23672    fn sql_like_match_basic_patterns() {
23673        assert!(sql_like_match("claude-opus-4-20250101", "claude-opus-4%"));
23674        assert!(sql_like_match("claude-opus-4", "claude-opus-4%"));
23675        assert!(!sql_like_match("claude-sonnet-4", "claude-opus-4%"));
23676
23677        // Middle wildcard (gemini pattern)
23678        assert!(sql_like_match("gemini-2.0-flash-001", "gemini-2%flash%"));
23679        assert!(sql_like_match("gemini-2-flash", "gemini-2%flash%"));
23680        assert!(!sql_like_match("gemini-2-pro", "gemini-2%flash%"));
23681
23682        // Exact match
23683        assert!(sql_like_match("hello", "hello"));
23684        assert!(!sql_like_match("hello!", "hello"));
23685
23686        // Underscore wildcard
23687        assert!(sql_like_match("gpt-4o", "gpt-4_"));
23688        assert!(!sql_like_match("gpt-4oo", "gpt-4_"));
23689
23690        // Case insensitive
23691        assert!(sql_like_match("Claude-Opus-4", "claude-opus-4%"));
23692    }
23693
23694    #[test]
23695    fn date_str_to_day_id_converts_correctly() {
23696        // 2025-10-01 is 2100 days after 2020-01-01
23697        assert_eq!(date_str_to_day_id("2025-10-01").unwrap(), 2100);
23698        // 2024-04-01 is 1552 days after 2020-01-01
23699        assert_eq!(date_str_to_day_id("2024-04-01").unwrap(), 1552);
23700        assert!(date_str_to_day_id("invalid").is_err());
23701    }
23702
23703    #[test]
23704    fn pricing_table_lookup_selects_matching_entry() {
23705        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23706        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23707        let table = PricingTable {
23708            entries: vec![
23709                PricingEntry {
23710                    model_pattern: "claude-opus-4%".into(),
23711                    provider: "anthropic".into(),
23712                    input_cost_per_mtok: 15.0,
23713                    output_cost_per_mtok: 75.0,
23714                    cache_read_cost_per_mtok: Some(1.5),
23715                    cache_creation_cost_per_mtok: Some(18.75),
23716                    effective_day_id: effective_day,
23717                },
23718                PricingEntry {
23719                    model_pattern: "claude-sonnet-4%".into(),
23720                    provider: "anthropic".into(),
23721                    input_cost_per_mtok: 3.0,
23722                    output_cost_per_mtok: 15.0,
23723                    cache_read_cost_per_mtok: Some(0.3),
23724                    cache_creation_cost_per_mtok: Some(3.75),
23725                    effective_day_id: effective_day,
23726                },
23727            ],
23728        };
23729
23730        let result = table.lookup("claude-opus-4-20260101", lookup_day);
23731        assert!(result.is_some());
23732        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
23733
23734        let result = table.lookup("claude-sonnet-4-latest", lookup_day);
23735        assert!(result.is_some());
23736        assert_eq!(result.unwrap().input_cost_per_mtok, 3.0);
23737
23738        assert!(table.lookup("unknown-model", lookup_day).is_none());
23739    }
23740
23741    #[test]
23742    fn pricing_table_lookup_respects_effective_date() {
23743        let effective_day_1 = date_str_to_day_id("2025-10-01").unwrap();
23744        let effective_day_2 = date_str_to_day_id("2026-01-01").unwrap();
23745        let table = PricingTable {
23746            entries: vec![
23747                PricingEntry {
23748                    model_pattern: "claude-opus-4%".into(),
23749                    provider: "anthropic".into(),
23750                    input_cost_per_mtok: 15.0,
23751                    output_cost_per_mtok: 75.0,
23752                    cache_read_cost_per_mtok: None,
23753                    cache_creation_cost_per_mtok: None,
23754                    effective_day_id: effective_day_1,
23755                },
23756                PricingEntry {
23757                    model_pattern: "claude-opus-4%".into(),
23758                    provider: "anthropic".into(),
23759                    input_cost_per_mtok: 12.0,
23760                    output_cost_per_mtok: 60.0,
23761                    cache_read_cost_per_mtok: None,
23762                    cache_creation_cost_per_mtok: None,
23763                    effective_day_id: effective_day_2,
23764                },
23765            ],
23766        };
23767
23768        // Before price drop
23769        let result = table.lookup("claude-opus-4", date_str_to_day_id("2025-11-01").unwrap());
23770        assert!(result.is_some());
23771        assert_eq!(result.unwrap().input_cost_per_mtok, 15.0);
23772
23773        // After price drop
23774        let result = table.lookup("claude-opus-4", date_str_to_day_id("2026-02-01").unwrap());
23775        assert!(result.is_some());
23776        assert_eq!(result.unwrap().input_cost_per_mtok, 12.0);
23777
23778        // Before all pricing
23779        assert!(
23780            table
23781                .lookup("claude-opus-4", date_str_to_day_id("2024-01-01").unwrap())
23782                .is_none()
23783        );
23784    }
23785
23786    #[test]
23787    fn pricing_table_lookup_specificity_tiebreak() {
23788        let effective_day = date_str_to_day_id("2025-01-01").unwrap();
23789        let lookup_day = date_str_to_day_id("2026-01-01").unwrap();
23790        let table = PricingTable {
23791            entries: vec![
23792                PricingEntry {
23793                    model_pattern: "gpt-4%".into(),
23794                    provider: "openai".into(),
23795                    input_cost_per_mtok: 10.0,
23796                    output_cost_per_mtok: 30.0,
23797                    cache_read_cost_per_mtok: None,
23798                    cache_creation_cost_per_mtok: None,
23799                    effective_day_id: effective_day,
23800                },
23801                PricingEntry {
23802                    model_pattern: "gpt-4-turbo%".into(),
23803                    provider: "openai".into(),
23804                    input_cost_per_mtok: 5.0,
23805                    output_cost_per_mtok: 15.0,
23806                    cache_read_cost_per_mtok: None,
23807                    cache_creation_cost_per_mtok: None,
23808                    effective_day_id: effective_day,
23809                },
23810            ],
23811        };
23812
23813        // Longer pattern wins for specific model
23814        let result = table.lookup("gpt-4-turbo-2025", lookup_day);
23815        assert!(result.is_some());
23816        assert_eq!(result.unwrap().input_cost_per_mtok, 5.0);
23817
23818        // Shorter pattern matches broader model
23819        let result = table.lookup("gpt-4o", lookup_day);
23820        assert!(result.is_some());
23821        assert_eq!(result.unwrap().input_cost_per_mtok, 10.0);
23822    }
23823
23824    #[test]
23825    fn pricing_table_compute_cost_basic() {
23826        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23827        let table = PricingTable {
23828            entries: vec![PricingEntry {
23829                model_pattern: "claude-opus-4%".into(),
23830                provider: "anthropic".into(),
23831                input_cost_per_mtok: 15.0,
23832                output_cost_per_mtok: 75.0,
23833                cache_read_cost_per_mtok: Some(1.5),
23834                cache_creation_cost_per_mtok: Some(18.75),
23835                effective_day_id: effective_day,
23836            }],
23837        };
23838
23839        let cost = table.compute_cost(
23840            Some("claude-opus-4-latest"),
23841            date_str_to_day_id("2026-02-06").unwrap(),
23842            Some(1000),
23843            Some(500),
23844            None,
23845            None,
23846        );
23847        assert!(cost.is_some());
23848        // 1000 * 15.0 / 1M + 500 * 75.0 / 1M = 0.015 + 0.0375 = 0.0525
23849        assert!((cost.unwrap() - 0.0525).abs() < 1e-10);
23850    }
23851
23852    #[test]
23853    fn pricing_table_compute_cost_with_cache() {
23854        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23855        let table = PricingTable {
23856            entries: vec![PricingEntry {
23857                model_pattern: "claude-opus-4%".into(),
23858                provider: "anthropic".into(),
23859                input_cost_per_mtok: 15.0,
23860                output_cost_per_mtok: 75.0,
23861                cache_read_cost_per_mtok: Some(1.5),
23862                cache_creation_cost_per_mtok: Some(18.75),
23863                effective_day_id: effective_day,
23864            }],
23865        };
23866
23867        let cost = table.compute_cost(
23868            Some("claude-opus-4-latest"),
23869            date_str_to_day_id("2026-02-06").unwrap(),
23870            Some(1_000_000),
23871            Some(100_000),
23872            Some(500_000),
23873            Some(200_000),
23874        );
23875        assert!(cost.is_some());
23876        // input excludes cache tokens to avoid double-charging them at both the
23877        // full input rate and the cache-specific rates.
23878        // non-cache input: 300K * 15/1M = 4.5, output: 100K * 75/1M = 7.5
23879        // cache_read: 500K * 1.5/1M = 0.75, cache_creation: 200K * 18.75/1M = 3.75
23880        // total = 16.5
23881        assert!((cost.unwrap() - 16.5).abs() < 1e-10);
23882    }
23883
23884    #[test]
23885    fn pricing_table_compute_cost_returns_none_for_unknown_model() {
23886        let effective_day = date_str_to_day_id("2025-10-01").unwrap();
23887        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23888        let table = PricingTable {
23889            entries: vec![PricingEntry {
23890                model_pattern: "claude-opus-4%".into(),
23891                provider: "anthropic".into(),
23892                input_cost_per_mtok: 15.0,
23893                output_cost_per_mtok: 75.0,
23894                cache_read_cost_per_mtok: None,
23895                cache_creation_cost_per_mtok: None,
23896                effective_day_id: effective_day,
23897            }],
23898        };
23899
23900        assert!(
23901            table
23902                .compute_cost(
23903                    Some("unknown-model"),
23904                    lookup_day,
23905                    Some(1000),
23906                    Some(500),
23907                    None,
23908                    None
23909                )
23910                .is_none()
23911        );
23912        assert!(
23913            table
23914                .compute_cost(None, lookup_day, Some(1000), Some(500), None, None)
23915                .is_none()
23916        );
23917        assert!(
23918            table
23919                .compute_cost(Some("claude-opus-4"), lookup_day, None, None, None, None)
23920                .is_none()
23921        );
23922    }
23923
23924    #[test]
23925    fn pricing_table_load_from_db() {
23926        let dir = TempDir::new().unwrap();
23927        let db_path = dir.path().join("test.db");
23928        let storage = SqliteStorage::open(&db_path).unwrap();
23929
23930        let table = PricingTable::load(&storage.conn).unwrap();
23931        assert!(!table.is_empty());
23932
23933        let lookup_day = date_str_to_day_id("2026-02-06").unwrap();
23934
23935        let opus = table.lookup("claude-opus-4-latest", lookup_day);
23936        assert!(opus.is_some());
23937        assert_eq!(opus.unwrap().input_cost_per_mtok, 15.0);
23938
23939        let flash = table.lookup("gemini-2.0-flash-001", lookup_day);
23940        assert!(flash.is_some());
23941        assert_eq!(flash.unwrap().input_cost_per_mtok, 0.075);
23942    }
23943
23944    #[test]
23945    fn pricing_table_load_rejects_invalid_effective_date() {
23946        let dir = TempDir::new().unwrap();
23947        let db_path = dir.path().join("test.db");
23948        let storage = SqliteStorage::open(&db_path).unwrap();
23949
23950        storage
23951            .conn
23952            .execute_compat(
23953                "INSERT INTO model_pricing (
23954                    model_pattern, provider, input_cost_per_mtok, output_cost_per_mtok,
23955                    cache_read_cost_per_mtok, cache_creation_cost_per_mtok, effective_date
23956                 ) VALUES (?1, ?2, ?3, ?4, ?5, ?6, ?7)",
23957                fparams![
23958                    "broken-model%",
23959                    "test",
23960                    1.0_f64,
23961                    2.0_f64,
23962                    Option::<f64>::None,
23963                    Option::<f64>::None,
23964                    "not-a-date"
23965                ],
23966            )
23967            .unwrap();
23968
23969        let err = PricingTable::load(&storage.conn).unwrap_err();
23970        assert!(err.to_string().contains("invalid effective_date"));
23971    }
23972
23973    #[test]
23974    fn pricing_diagnostics_tracks_coverage() {
23975        let mut diag = PricingDiagnostics::default();
23976        diag.record_priced();
23977        diag.record_priced();
23978        diag.record_unpriced(Some("custom-model-v1"));
23979        diag.record_unpriced(Some("custom-model-v1"));
23980        diag.record_unpriced(None);
23981
23982        assert_eq!(diag.priced_count, 2);
23983        assert_eq!(diag.unpriced_count, 3);
23984        assert_eq!(diag.unknown_models.len(), 2);
23985        assert_eq!(diag.unknown_models["custom-model-v1"], 2);
23986        assert_eq!(diag.unknown_models["(none)"], 1);
23987    }
23988
23989    // =========================================================================
23990    // FrankenStorage migration tests (bead 2j6p6)
23991    // =========================================================================
23992
23993    /// Helper: create a FrankenStorage wrapping an in-memory connection and
23994    /// run migrations. This exercises the same code path as `open()` but avoids
23995    /// frankensqlite's file-based autoindex renaming limitation (V5 uses
23996    /// ALTER TABLE RENAME which triggers sqlite_autoindex lookup issues on
23997    /// file-based pagers).
23998    fn franken_storage_in_memory() -> FrankenStorage {
23999        let conn = FrankenConnection::open(":memory:").unwrap();
24000        let storage = FrankenStorage::new(conn, PathBuf::from(":memory:"));
24001        storage.run_migrations().unwrap();
24002        storage.apply_config().unwrap();
24003        storage
24004    }
24005
24006    #[test]
24007    fn franken_migrations_create_all_tables() {
24008        let storage = franken_storage_in_memory();
24009
24010        // Should be at CURRENT_SCHEMA_VERSION.
24011        let version = storage.schema_version().unwrap();
24012        assert_eq!(
24013            version, CURRENT_SCHEMA_VERSION,
24014            "fresh FrankenStorage should be at current schema version"
24015        );
24016
24017        // Core tables from V1 should exist.
24018        let rows = storage
24019            .raw()
24020            .query("SELECT name FROM sqlite_master WHERE type='table' ORDER BY name;")
24021            .unwrap();
24022        let table_names: Vec<String> = rows
24023            .iter()
24024            .filter_map(|r| r.get_typed::<String>(0).ok())
24025            .collect();
24026
24027        for required in [
24028            "meta",
24029            "agents",
24030            "workspaces",
24031            "conversations",
24032            "messages",
24033            "snippets",
24034            "tags",
24035            "conversation_tags",
24036        ] {
24037            assert!(
24038                table_names.contains(&required.to_string()),
24039                "missing table: {required}"
24040            );
24041        }
24042
24043        // V4 sources table.
24044        assert!(
24045            table_names.contains(&"sources".to_string()),
24046            "missing sources table"
24047        );
24048
24049        // V8 daily_stats table.
24050        assert!(
24051            table_names.contains(&"daily_stats".to_string()),
24052            "missing daily_stats table"
24053        );
24054
24055        // V9 embedding_jobs table.
24056        assert!(
24057            table_names.contains(&"embedding_jobs".to_string()),
24058            "missing embedding_jobs table"
24059        );
24060
24061        // V11 message_metrics, usage_hourly, usage_daily tables.
24062        for analytics_table in ["message_metrics", "usage_hourly", "usage_daily"] {
24063            assert!(
24064                table_names.contains(&analytics_table.to_string()),
24065                "missing table: {analytics_table}"
24066            );
24067        }
24068        assert!(
24069            table_names.contains(&"conversation_tail_state".to_string()),
24070            "missing conversation_tail_state table"
24071        );
24072        assert!(
24073            table_names.contains(&"conversation_external_lookup".to_string()),
24074            "missing conversation_external_lookup table"
24075        );
24076        assert!(
24077            table_names.contains(&"conversation_external_tail_lookup".to_string()),
24078            "missing conversation_external_tail_lookup table"
24079        );
24080
24081        // Fresh frankensqlite databases should record the combined V13 base
24082        // schema plus every additive post-V13 migration.
24083        let rows = storage
24084            .raw()
24085            .query("SELECT COUNT(*) FROM _schema_migrations;")
24086            .unwrap();
24087        let count: i64 = rows.first().unwrap().get_typed(0).unwrap();
24088        assert_eq!(
24089            count,
24090            (13..=CURRENT_SCHEMA_VERSION).count() as i64,
24091            "_schema_migrations should record the V13 base schema and post-V13 migrations"
24092        );
24093
24094        // The latest applied migration should be the current schema version.
24095        let rows = storage
24096            .raw()
24097            .query("SELECT version FROM _schema_migrations ORDER BY version;")
24098            .unwrap();
24099        let versions: Vec<i64> = rows
24100            .iter()
24101            .map(|row| row.get_typed(0))
24102            .collect::<std::result::Result<_, _>>()
24103            .unwrap();
24104        assert_eq!(
24105            versions,
24106            (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
24107            "_schema_migrations should contain v13 through current"
24108        );
24109    }
24110
24111    #[test]
24112    fn franken_migrations_idempotent() {
24113        let storage = franken_storage_in_memory();
24114        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24115
24116        // Re-running migrations on the same connection is a no-op.
24117        storage.run_migrations().unwrap();
24118        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24119    }
24120
24121    #[test]
24122    fn migration_v20_backfills_conversation_external_tail_lookup() {
24123        let storage = franken_storage_in_memory();
24124        let agent_id = storage
24125            .ensure_agent(&Agent {
24126                id: None,
24127                slug: "codex".into(),
24128                name: "Codex".into(),
24129                version: None,
24130                kind: AgentKind::Cli,
24131            })
24132            .unwrap();
24133        let workspace_id = storage
24134            .ensure_workspace(&PathBuf::from("/ws/profiled-storage-remote"), None)
24135            .unwrap();
24136        let mut conv = make_profiled_storage_remote_conversation(1919, 2);
24137        conv.source_id = "profiled-storage-remote-source-東京".into();
24138        conv.external_id = Some("profiled-storage-remote-☃-1919".into());
24139        let outcome = storage
24140            .insert_conversation_tree(agent_id, Some(workspace_id), &conv)
24141            .unwrap();
24142        let external_id = conv.external_id.as_deref().unwrap();
24143        let lookup_key = conversation_external_lookup_key(&conv.source_id, agent_id, external_id);
24144
24145        storage
24146            .raw()
24147            .execute("DELETE FROM conversation_external_tail_lookup")
24148            .unwrap();
24149        storage
24150            .raw()
24151            .execute("DELETE FROM _schema_migrations WHERE version = 20")
24152            .unwrap();
24153        storage
24154            .raw()
24155            .execute_compat(
24156                "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
24157                fparams!["19"],
24158            )
24159            .unwrap();
24160
24161        storage.run_migrations().unwrap();
24162
24163        let backfilled: (i64, Option<i64>, Option<i64>, Option<i64>) = storage
24164            .raw()
24165            .query_row_map(
24166                "SELECT conversation_id, ended_at, last_message_idx, last_message_created_at
24167                 FROM conversation_external_tail_lookup
24168                 WHERE lookup_key = ?1",
24169                fparams![lookup_key.as_str()],
24170                |row| {
24171                    Ok((
24172                        row.get_typed(0)?,
24173                        row.get_typed(1)?,
24174                        row.get_typed(2)?,
24175                        row.get_typed(3)?,
24176                    ))
24177                },
24178            )
24179            .unwrap();
24180        assert_eq!(
24181            backfilled,
24182            (
24183                outcome.conversation_id,
24184                conv.ended_at,
24185                Some(1),
24186                conv.messages[1].created_at
24187            )
24188        );
24189        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
24190    }
24191
/// Runs the v15 tail-state-cache migration against a hand-built minimal
/// schema and checks that the first application reports `true`, a repeat
/// reports `false`, both `last_message_*` columns exist on `conversations`,
/// `conversation_tail_state` is empty, and exactly one version-15 row is
/// recorded in `_schema_migrations`.
#[test]
fn migration_v15_creates_lazy_tail_state_cache() {
    let conn = FrankenConnection::open(":memory:").unwrap();

    // Fixture: two conversations and three messages, without any tail columns.
    let fixture_sql = "CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 ended_at INTEGER
             );
             CREATE TABLE messages (
                 id INTEGER PRIMARY KEY,
                 conversation_id INTEGER NOT NULL,
                 idx INTEGER NOT NULL,
                 created_at INTEGER
             );
             INSERT INTO conversations(id, ended_at) VALUES
                 (1, 1710000000300),
                 (2, NULL);
             INSERT INTO messages(id, conversation_id, idx, created_at) VALUES
                 (10, 1, 0, 1710000000100),
                 (11, 1, 1, 1710000000200),
                 (12, 2, 0, 1710000000400);";
    conn.execute_batch(fixture_sql).unwrap();

    // Migration ledger that the version-15 row is looked up in further down.
    let ledger_sql = "CREATE TABLE _schema_migrations (
                version INTEGER PRIMARY KEY,
                name TEXT NOT NULL,
                applied_at TEXT NOT NULL DEFAULT (strftime('%Y-%m-%dT%H:%M:%SZ', 'now'))
             );";
    conn.execute(ledger_sql).unwrap();

    let first_pass = apply_conversation_tail_state_cache_migration(&conn).unwrap();
    assert!(first_pass, "v15 migration should apply once");
    let second_pass = apply_conversation_tail_state_cache_migration(&conn).unwrap();
    assert!(
        !second_pass,
        "v15 migration should be idempotent once recorded"
    );

    // Gather the column names reported by `PRAGMA table_info` (result column 1).
    let table_info = conn.query("PRAGMA table_info(conversations);").unwrap();
    let mut column_names: HashSet<String> = HashSet::new();
    for info_row in table_info.iter() {
        let name: String = info_row.get_typed(1).unwrap();
        column_names.insert(name);
    }
    assert!(column_names.contains("last_message_idx"));
    assert!(column_names.contains("last_message_created_at"));

    let tail_count = conn
        .query("SELECT COUNT(*) FROM conversation_tail_state;")
        .unwrap();
    let tail_rows: i64 = tail_count.first().unwrap().get_typed(0).unwrap();
    assert_eq!(
        tail_rows, 0,
        "v15 should create the cache without an open-time message scan"
    );

    let ledger_count = conn
        .query("SELECT COUNT(*) FROM _schema_migrations WHERE version = 15;")
        .unwrap();
    let applied: i64 = ledger_count.first().unwrap().get_typed(0).unwrap();
    assert_eq!(applied, 1);
}
24264
/// Starts from a `conversations` table that only has `id`, `agent_id` and
/// `source_path`, runs `repair_missing_conversation_token_columns` twice, and
/// checks that every column listed in `REQUIRED_CONVERSATION_TOKEN_COLUMNS`
/// is present afterwards.
#[test]
fn schema_repair_adds_missing_conversations_token_columns() {
    let bare_schema = "CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 agent_id INTEGER NOT NULL,
                 source_path TEXT NOT NULL
             );";
    let conn = FrankenConnection::open(":memory:").unwrap();
    conn.execute_batch(bare_schema).unwrap();
    let storage = FrankenStorage::new(conn, std::path::PathBuf::from(":memory:"));

    // The second pass runs against columns the first pass already handled and
    // must succeed as well.
    for _pass in 0..2 {
        storage.repair_missing_conversation_token_columns().unwrap();
    }

    let columns = franken_table_column_names(&storage.conn, "conversations").unwrap();
    REQUIRED_CONVERSATION_TOKEN_COLUMNS
        .iter()
        .for_each(|&(column_name, _)| {
            assert!(
                columns.contains(column_name),
                "schema repair should add conversations.{column_name}"
            );
        });
}
24289
/// The `schema_version` row in `meta` of a freshly built in-memory storage
/// must equal `CURRENT_SCHEMA_VERSION` rendered as a string.
#[test]
fn franken_meta_schema_version_in_sync() {
    let storage = franken_storage_in_memory();
    let expected = CURRENT_SCHEMA_VERSION.to_string();

    // Read the marker back straight from the meta table.
    let conn = storage.raw();
    let rows = conn
        .query("SELECT value FROM meta WHERE key = 'schema_version';")
        .unwrap();
    let marker_row = rows.first().unwrap();
    let meta_version: String = marker_row.get_typed(0).unwrap();

    assert_eq!(
        meta_version, expected,
        "meta.schema_version should match CURRENT_SCHEMA_VERSION"
    );
}
24306
/// A legacy database whose only version marker is `meta.schema_version = '10'`
/// must leave `transition_from_meta_version` with `_schema_migrations` rows
/// for versions 1 through 13.
#[test]
fn franken_transition_from_meta_version() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition.db");

    // Simulate an existing database created by SqliteStorage at version 10.
    // We create just enough schema to test the transition.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
        .unwrap();
    conn.execute("INSERT INTO meta(key, value) VALUES('schema_version', '10');")
        .unwrap();
    // Create a dummy conversations table so transition doesn't think it's corrupted.
    conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
        .unwrap();
    drop(conn);

    // Now run the transition function on a fresh connection to the same file.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    // The frankensqlite path uses a combined V13 base migration, so a
    // legacy V10 marker is bridged to V13 and later idempotent repair fills
    // in any missing V11-V13 objects.
    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Decode every row and fail loudly on a bad one: silently skipping an
    // undecodable ledger row would hide exactly the corruption this test
    // exists to catch.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
        .unwrap();
    assert_eq!(
        versions,
        (1..=13).collect::<Vec<i64>>(),
        "transition should bridge legacy V10 databases through the combined V13 base marker"
    );
}
24341
/// When `meta.schema_version` already holds `CURRENT_SCHEMA_VERSION`,
/// `transition_from_meta_version` must record every version from 1 up to
/// `CURRENT_SCHEMA_VERSION` in `_schema_migrations`.
#[test]
fn franken_transition_from_current_meta_backfills_current_schema_marker() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_current_transition.db");

    // Legacy-style database: a meta marker at the current version plus a
    // placeholder conversations table, and no migration ledger.
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    conn.execute("CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);")
        .unwrap();
    conn.execute_compat(
        "INSERT INTO meta(key, value) VALUES('schema_version', ?1);",
        &[ParamValue::from(CURRENT_SCHEMA_VERSION.to_string())],
    )
    .unwrap();
    conn.execute("CREATE TABLE conversations (id INTEGER PRIMARY KEY);")
        .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Surface decode failures instead of dropping the offending row, so a
    // malformed ledger entry cannot slip past the comparison below.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
        .unwrap();
    assert_eq!(
        versions,
        (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
        "current meta schema marker should backfill every known migration"
    );
}
24372
/// With a `_schema_migrations` table that already holds one row,
/// `transition_from_meta_version` must leave the row count at one.
#[test]
fn franken_transition_skips_when_already_done() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition_skip.db");
    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();

    // Pre-populate the ledger so the database looks already transitioned.
    for statement in [
        "CREATE TABLE _schema_migrations (version INTEGER PRIMARY KEY, name TEXT NOT NULL, applied_at TEXT NOT NULL DEFAULT 'now');",
        "INSERT INTO _schema_migrations (version, name) VALUES (1, 'test');",
    ] {
        conn.execute(statement).unwrap();
    }

    // Expected to be a no-op.
    transition_from_meta_version(&conn).unwrap();

    // The single seeded entry must still be the only one.
    let ledger_rows = conn
        .query("SELECT COUNT(*) FROM _schema_migrations;")
        .unwrap();
    let first_row = ledger_rows.first().unwrap();
    let count: i64 = first_row.get_typed(0).unwrap();
    assert_eq!(
        count, 1,
        "transition should not re-run on already-transitioned DB"
    );
}
24399
/// On a brand-new database file `transition_from_meta_version` must succeed
/// and must not leave a queryable `_schema_migrations` table behind.
#[test]
fn franken_transition_fresh_db_is_noop() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_fresh_noop.db");

    // Nothing has been created in this file: no meta table, no other tables.
    let location = db_path.to_string_lossy().to_string();
    let conn = FrankenConnection::open(location).unwrap();
    transition_from_meta_version(&conn).unwrap();

    // Selecting from the ledger has to fail, i.e. it was never created.
    let ledger_probe = conn.query("SELECT * FROM \"_schema_migrations\";");
    assert!(
        ledger_probe.is_err(),
        "transition should not create _schema_migrations on fresh DB"
    );
}
24416
/// A legacy database marked `schema_version = '13'` that also contains an
/// FTS5 `fts_messages` virtual table must pass through
/// `transition_from_meta_version` and end with ledger rows 1 through 13.
#[test]
fn franken_transition_with_fts_virtual_table_succeeds() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_transition_with_fts.db");

    // Legacy layout: meta marker, placeholder conversations table, and a
    // contentless FTS5 table.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    conn.execute_batch(
        "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
             INSERT INTO meta(key, value) VALUES('schema_version', '13');
             CREATE TABLE conversations (id INTEGER PRIMARY KEY);
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                 content,
                 title,
                 agent,
                 workspace,
                 source_path,
                 created_at,
                 content='',
                 tokenize='porter unicode61'
             );",
    )
    .unwrap();
    drop(conn);

    let conn = FrankenConnection::open(db_path.to_string_lossy().to_string()).unwrap();
    transition_from_meta_version(&conn).unwrap();

    let rows = conn
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // Fail on any row that cannot be decoded rather than quietly omitting it
    // from the list that is compared below.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
        .unwrap();
    assert_eq!(versions, (1..=13).collect::<Vec<i64>>());
}
24450
/// Builds a legacy database marked `schema_version = '13'` (core tables, one
/// conversation with one message, and an FTS5 `fts_messages` table) and checks
/// that `FrankenStorage::open` brings it to `CURRENT_SCHEMA_VERSION` with a
/// ledger row for every version from 1 upward.
#[test]
fn franken_storage_open_legacy_v13_with_fts_virtual_table_succeeds() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_open_legacy_v13_with_fts.db");

    // Hand-written legacy schema and seed data; there is no
    // `_schema_migrations` table, only the meta marker.
    let conn = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    conn.execute_batch(
        "CREATE TABLE meta (key TEXT PRIMARY KEY, value TEXT NOT NULL);
             INSERT INTO meta(key, value) VALUES('schema_version', '13');
             CREATE TABLE agents (
                 id INTEGER PRIMARY KEY,
                 slug TEXT NOT NULL
             );
             CREATE TABLE workspaces (
                 id INTEGER PRIMARY KEY,
                 path TEXT NOT NULL
             );
             CREATE TABLE sources (
                 id TEXT PRIMARY KEY,
                 kind TEXT NOT NULL,
                 host_label TEXT,
                 machine_id TEXT,
                 platform TEXT,
                 config_json TEXT,
                 created_at INTEGER NOT NULL,
                 updated_at INTEGER NOT NULL
             );
             CREATE TABLE conversations (
                 id INTEGER PRIMARY KEY,
                 agent_id INTEGER NOT NULL,
                 workspace_id INTEGER,
                 source_id TEXT NOT NULL DEFAULT 'local',
                 external_id TEXT,
                 title TEXT,
                 source_path TEXT NOT NULL,
                 started_at INTEGER,
                 ended_at INTEGER
             );
             CREATE TABLE messages (
                 id INTEGER PRIMARY KEY,
                 conversation_id INTEGER NOT NULL,
                 idx INTEGER NOT NULL,
                 role TEXT NOT NULL,
                 author TEXT,
                 created_at INTEGER,
                 content TEXT NOT NULL,
                 extra_json TEXT,
                 extra_bin BLOB
             );
             INSERT INTO agents(id, slug) VALUES (1, 'codex');
             INSERT INTO workspaces(id, path) VALUES (1, '/data/projects/coding_agent_session_search');
             INSERT INTO sources(id, kind, host_label, created_at, updated_at)
             VALUES ('local', 'local', NULL, 1710000000000, 1710000000000);
             INSERT INTO conversations(
                 id,
                 agent_id,
                 workspace_id,
                 source_id,
                 external_id,
                 title,
                 source_path,
                 started_at
             )
             VALUES (
                 1,
                 1,
                 1,
                 'local',
                 'legacy-session',
                 'legacy session',
                 '/tmp/legacy.jsonl',
                 1710000000000
             );
             INSERT INTO messages(id, conversation_id, idx, role, author, created_at, content)
             VALUES (1, 1, 0, 'user', 'tester', 1710000000000, 'legacy content');
             CREATE VIRTUAL TABLE fts_messages USING fts5(
                 content,
                 title,
                 agent,
                 workspace,
                 source_path,
                 created_at,
                 message_id,
                 content='',
                 tokenize='porter unicode61'
             );",
    )
    .unwrap();
    drop(conn);

    let storage = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    let rows = storage
        .raw()
        .query("SELECT version FROM _schema_migrations ORDER BY version;")
        .unwrap();
    // A ledger row that fails to decode must abort the test; dropping it
    // would let a damaged ledger go unreported.
    let versions: Vec<i64> = rows
        .iter()
        .map(|row| row.get_typed(0))
        .collect::<std::result::Result<_, frankensqlite::FrankenError>>()
        .unwrap();
    assert_eq!(versions, (1..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>());
}
24551
/// Opens a fresh database, stores one conversation, then uses a rusqlite
/// fixture connection to plant a second `fts_messages` row in `sqlite_master`
/// and delete the FTS rebuild marker from `meta`. Reopening must succeed
/// without writing the marker back, and
/// `ensure_search_fallback_fts_consistency` must leave a single schema row
/// and an FTS row count equal to the message count.
#[test]
fn franken_storage_open_repairs_duplicate_fts_messages_schema_rows() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_open_repairs_duplicate_fts_schema.db");

    // Seed phase: one agent and one conversation carrying a single user message.
    {
        let storage = FrankenStorage::open(&db_path).unwrap();
        let agent = Agent {
            id: None,
            slug: "codex".into(),
            name: "Codex".into(),
            version: None,
            kind: AgentKind::Cli,
        };
        let agent_id = storage.ensure_agent(&agent).unwrap();
        let only_message = Message {
            id: None,
            idx: 0,
            role: MessageRole::User,
            author: Some("user".into()),
            created_at: Some(1_700_000_000_050),
            content: "message that should remain queryable".into(),
            extra_json: serde_json::Value::Null,
            snippets: Vec::new(),
        };
        let conversation = Conversation {
            id: None,
            agent_slug: "codex".into(),
            workspace: Some(PathBuf::from("/tmp/workspace")),
            external_id: Some("dup-fts-schema".into()),
            title: Some("Duplicate FTS schema".into()),
            source_path: PathBuf::from("/tmp/dup-fts-schema.jsonl"),
            started_at: Some(1_700_000_000_000),
            ended_at: Some(1_700_000_000_100),
            approx_tokens: Some(42),
            metadata_json: serde_json::Value::Null,
            messages: vec![only_message],
            source_id: LOCAL_SOURCE_ID.into(),
            origin_host: None,
        };
        storage
            .insert_conversation_tree(agent_id, None, &conversation)
            .unwrap();
    }
    materialize_fresh_fts_schema_via_rusqlite(&db_path).unwrap();

    // Corruption phase, done through the rusqlite fixture connection.
    {
        let duplicate_legacy_fts_sql = "CREATE VIRTUAL TABLE fts_messages USING fts5(content, title, agent, workspace, source_path, created_at UNINDEXED, message_id UNINDEXED, tokenize='porter')";
        let fixture = rusqlite_test_fixture_conn(&db_path);
        fixture
            .execute_batch("PRAGMA writable_schema = ON;")
            .unwrap();
        fixture
            .execute(
                "INSERT INTO sqlite_master(type, name, tbl_name, rootpage, sql)
             VALUES('table', 'fts_messages', 'fts_messages', 0, ?1)",
                [duplicate_legacy_fts_sql],
            )
            .unwrap();
        // Simulate a pre-fix upgraded database that has never gone through the
        // authoritative frankensqlite FTS rebuild generation yet.
        fixture
            .execute(
                "DELETE FROM meta WHERE key = ?1",
                [FTS_FRANKEN_REBUILD_META_KEY],
            )
            .unwrap();
        fixture
            .execute_batch("PRAGMA writable_schema = OFF;")
            .unwrap();

        // Sanity check: the schema table now lists fts_messages twice.
        let duplicate_rows: i64 = fixture
            .query_row(
                "SELECT COUNT(*) FROM sqlite_master WHERE name = 'fts_messages'",
                [],
                |row| row.get(0),
            )
            .unwrap();
        assert_eq!(duplicate_rows, 2);
    }

    // Repair phase.
    let reopened = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(reopened.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    let generation_rows: Vec<String> = reopened
        .raw()
        .query_map_collect(
            "SELECT value FROM meta WHERE key = ?1",
            fparams![FTS_FRANKEN_REBUILD_META_KEY],
            |row| row.get_typed(0),
        )
        .unwrap();
    assert_eq!(
        generation_rows.len(),
        0,
        "canonical open should not eagerly rewrite FTS repair metadata"
    );
    reopened.ensure_search_fallback_fts_consistency().unwrap();
    let repaired = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    assert_eq!(franken_fts_schema_rows(&repaired).unwrap(), 1);

    // Every stored message must have a matching FTS row.
    let count_rows = |sql: &str| -> i64 {
        reopened
            .raw()
            .query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let total_messages = count_rows("SELECT COUNT(*) FROM messages");
    let total_fts_rows = count_rows("SELECT COUNT(*) FROM fts_messages");
    assert_eq!(total_fts_rows, total_messages);
}
24657
/// After a fresh `FrankenStorage::open` followed by
/// `ensure_search_fallback_fts_consistency`, the schema must list
/// `fts_messages` exactly once and the table must be queryable on reopen.
#[test]
fn franken_storage_open_fresh_db_keeps_single_franken_fts_schema_row() {
    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("fresh-franken-storage-open.db");

    // The FTS5 virtual table is no longer created eagerly by the
    // migration runner (V14 drops the old internal-content table and the
    // current contentless table is recreated lazily — see MIGRATION_V14).
    // Invoke the repair path to match normal cass startup before inspecting
    // the schema.
    {
        let storage = FrankenStorage::open(&db_path).unwrap();
        assert_eq!(storage.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
        storage
            .ensure_search_fallback_fts_consistency()
            .expect("ensure FTS consistency after fresh open");
    }

    // Inspect the schema through a plain connection: exactly one entry, no
    // duplicates.
    {
        let c_reader = FrankenConnection::open(db_path.to_string_lossy().into_owned())
            .expect("open DB via frankensqlite for sqlite_master inspection");
        let schema_rows = franken_fts_schema_rows(&c_reader).unwrap();
        assert_eq!(
            schema_rows,
            1,
            "exactly one fts_messages schema row should exist after ensure_search_fallback_fts_consistency"
        );
    }

    let storage = FrankenStorage::open(&db_path).unwrap();
    let fts_probe = storage.raw().query("SELECT COUNT(*) FROM fts_messages");
    assert!(
        fts_probe.is_ok(),
        "fts_messages must be queryable through frankensqlite after open"
    );
}
24695
/// Drops nine analytics tables from an up-to-date database while leaving
/// `meta.schema_version` at `CURRENT_SCHEMA_VERSION`, then checks that
/// `FrankenStorage::open` recreates all nine.
#[test]
fn franken_storage_open_repairs_missing_analytics_tables_when_version_markers_lie() {
    // Tables removed behind the storage layer's back, in drop order.
    const DROPPED_TABLES: [&str; 9] = [
        "usage_models_daily",
        "usage_daily",
        "usage_hourly",
        "message_metrics",
        "token_daily_stats",
        "token_usage",
        "model_pricing",
        "embedding_jobs",
        "daily_stats",
    ];

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("test_repair_missing_analytics.db");

    // Step 1: create a fully migrated database.
    let fresh = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(fresh.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);
    drop(fresh);

    // Step 2: remove the tables but keep the version marker claiming "current".
    let tamper = FrankenConnection::open(db_path.to_string_lossy().into_owned()).unwrap();
    for table in DROPPED_TABLES {
        tamper
            .execute(&format!("DROP TABLE IF EXISTS {table}"))
            .unwrap();
    }
    tamper
        .execute_compat(
            "UPDATE meta SET value = ?1 WHERE key = 'schema_version'",
            fparams![CURRENT_SCHEMA_VERSION.to_string()],
        )
        .unwrap();
    drop(tamper);

    // Step 3: reopen and count how many of the tables exist again.
    let repaired = FrankenStorage::open(&db_path).unwrap();
    assert_eq!(repaired.schema_version().unwrap(), CURRENT_SCHEMA_VERSION);

    let presence_sql = "SELECT COUNT(*) FROM sqlite_master
                 WHERE type='table'
                   AND name IN (
                     'daily_stats',
                     'embedding_jobs',
                     'token_usage',
                     'token_daily_stats',
                     'model_pricing',
                     'message_metrics',
                     'usage_hourly',
                     'usage_daily',
                     'usage_models_daily'
                   )";
    let analytics_count: i64 = repaired
        .raw()
        .query_row_map(presence_sql, fparams![], |row| row.get_typed(0))
        .unwrap();
    assert_eq!(
        analytics_count, 9,
        "open() should recreate the missing analytics tables even when schema_version already says current"
    );
}
24757
/// Treats every table named in `REQUIRED_CURRENT_SCHEMA_TABLE_PROBES` as
/// missing and checks that the repair batches returned for that list mention
/// each of those tables.
#[test]
fn current_schema_repair_batches_cover_every_required_probe() {
    let mut missing_tables: Vec<&'static str> = Vec::new();
    for &(probe_table, _) in REQUIRED_CURRENT_SCHEMA_TABLE_PROBES.iter() {
        missing_tables.push(probe_table);
    }

    let batches = current_schema_repair_batches_for_missing_tables(&missing_tables).unwrap();

    // Union of the tables claimed by all returned batches.
    let mut covered_tables: HashSet<&'static str> = HashSet::new();
    for batch in batches.iter() {
        covered_tables.extend(batch.tables.iter().copied());
    }

    for table_name in missing_tables {
        let is_covered = covered_tables.contains(table_name);
        assert!(is_covered, "missing repair coverage for {table_name}");
    }
}
24778
/// Scans the SQL of every entry in `CURRENT_SCHEMA_REPAIR_BATCHES` and
/// rejects any that contains a core-table `CREATE TABLE IF NOT EXISTS`, the
/// `fts_messages` virtual-table creation, or a `DROP TABLE`.
#[test]
fn current_schema_repair_batches_do_not_replay_core_schema_bootstrap() {
    for batch in CURRENT_SCHEMA_REPAIR_BATCHES {
        // Substring test against this batch's SQL text.
        let mentions = |fragment: &str| batch.sql.contains(fragment);

        // Core tables must not be recreated by a repair batch.
        assert!(
            !mentions("CREATE TABLE IF NOT EXISTS meta"),
            "repair batch {} should not recreate meta",
            batch.name
        );
        assert!(
            !mentions("CREATE TABLE IF NOT EXISTS agents"),
            "repair batch {} should not recreate agents",
            batch.name
        );
        assert!(
            !mentions("CREATE TABLE IF NOT EXISTS workspaces"),
            "repair batch {} should not recreate workspaces",
            batch.name
        );
        assert!(
            !mentions("CREATE TABLE IF NOT EXISTS conversations"),
            "repair batch {} should not recreate conversations",
            batch.name
        );
        assert!(
            !mentions("CREATE TABLE IF NOT EXISTS messages"),
            "repair batch {} should not recreate messages",
            batch.name
        );
        assert!(
            !mentions("CREATE TABLE IF NOT EXISTS snippets"),
            "repair batch {} should not recreate snippets",
            batch.name
        );

        // Neither may it touch the FTS table or drop anything.
        assert!(
            !mentions("CREATE VIRTUAL TABLE fts_messages"),
            "repair batch {} should not recreate FTS tables",
            batch.name
        );
        assert!(
            !mentions("DROP TABLE"),
            "repair batch {} should never drop tables",
            batch.name
        );
    }
}
24826
/// Runs the pre-tail-cache migrations, the tail-cache migration, and the
/// post-tail-cache migrations on an empty in-memory database and checks that
/// the applied versions are 13 through `CURRENT_SCHEMA_VERSION`, with the
/// ledger maximum equal to `CURRENT_SCHEMA_VERSION`.
#[test]
fn build_cass_migrations_applies_combined_v13() {
    let conn = FrankenConnection::open(":memory:").unwrap();

    // The three stages must run in this order.
    let base_result = build_cass_migrations_before_tail_cache()
        .run(&conn)
        .unwrap();
    assert!(apply_conversation_tail_state_cache_migration(&conn).unwrap());
    let post_result = build_cass_migrations_after_tail_cache().run(&conn).unwrap();

    assert!(base_result.was_fresh);

    // Stitch the versions together; 15 is the tail-cache step, which is
    // applied outside the two runners.
    let applied: Vec<i64> = base_result
        .applied
        .into_iter()
        .chain(std::iter::once(15))
        .chain(post_result.applied)
        .collect();
    assert_eq!(
        applied,
        (13..=CURRENT_SCHEMA_VERSION).collect::<Vec<i64>>(),
        "should apply combined V13 plus additive post-V13 migrations"
    );

    let max_rows = conn
        .query("SELECT MAX(version) FROM _schema_migrations;")
        .unwrap();
    let current: i64 = max_rows.first().unwrap().get_typed(0).unwrap();
    assert_eq!(current, CURRENT_SCHEMA_VERSION);
}
24854
/// Inserts one two-message conversation, whose agent message carries usage
/// JSON, through `insert_conversations_batched`, and checks that both message
/// indices are reported as inserted and that `daily_stats`,
/// `token_daily_stats`, `usage_daily` and `usage_models_daily` each hold at
/// least one row.
#[test]
fn franken_insert_conversations_batched_populates_analytics_rollups() {
    use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
    use frankensqlite::compat::{ConnectionExt, RowExt};
    use std::path::PathBuf;

    let dir = TempDir::new().unwrap();
    let db_path = dir.path().join("franken-index.db");
    let storage = FrankenStorage::open(&db_path).unwrap();

    let agent_id = storage
        .ensure_agent(&Agent {
            id: None,
            slug: "claude_code".into(),
            name: "Claude Code".into(),
            version: Some("1.0".into()),
            kind: AgentKind::Cli,
        })
        .unwrap();

    // Base timestamp in milliseconds; the other times are offsets from it.
    let ts_ms = 1_770_551_400_000_i64;

    let user_turn = Message {
        id: None,
        idx: 0,
        role: MessageRole::User,
        author: None,
        created_at: Some(ts_ms),
        content: "Please make a plan.".into(),
        extra_json: serde_json::Value::Null,
        snippets: vec![],
    };
    // The agent reply carries the model name and token usage payload.
    let agent_turn = Message {
        id: None,
        idx: 1,
        role: MessageRole::Agent,
        author: None,
        created_at: Some(ts_ms + 30_000),
        content: "## Plan\n\n1. Reproduce\n2. Patch\n3. Verify".into(),
        extra_json: serde_json::json!({
            "message": {
                "model": "claude-opus-4-6",
                "usage": {
                    "input_tokens": 100,
                    "output_tokens": 50,
                    "cache_read_input_tokens": 25,
                    "cache_creation_input_tokens": 10,
                    "service_tier": "standard"
                }
            }
        }),
        snippets: vec![],
    };

    let conv = Conversation {
        id: None,
        agent_slug: "claude_code".into(),
        workspace: Some(PathBuf::from("/tmp/workspace")),
        external_id: Some("franken-batch-upsert".into()),
        title: Some("Franken batch upsert".into()),
        source_path: PathBuf::from("/tmp/franken.jsonl"),
        started_at: Some(ts_ms),
        ended_at: Some(ts_ms + 60_000),
        approx_tokens: None,
        metadata_json: serde_json::Value::Null,
        messages: vec![user_turn, agent_turn],
        source_id: "local".into(),
        origin_host: None,
    };

    let outcomes = storage
        .insert_conversations_batched(&[(agent_id, None, &conv)])
        .unwrap();
    assert_eq!(outcomes.len(), 1);
    assert_eq!(outcomes[0].inserted_indices, vec![0, 1]);

    // Read all four rollup counts first, then assert on them.
    let conn = storage.raw();
    let count_rows = |sql: &str| -> i64 {
        conn.query_row_map(sql, fparams![], |row| row.get_typed(0))
            .unwrap()
    };
    let daily_stats_rows = count_rows("SELECT COUNT(*) FROM daily_stats");
    let token_daily_rows = count_rows("SELECT COUNT(*) FROM token_daily_stats");
    let usage_daily_rows = count_rows("SELECT COUNT(*) FROM usage_daily");
    let model_daily_rows = count_rows("SELECT COUNT(*) FROM usage_models_daily");

    assert!(daily_stats_rows > 0, "daily_stats should be populated");
    assert!(
        token_daily_rows > 0,
        "token_daily_stats should be populated"
    );
    assert!(usage_daily_rows > 0, "usage_daily should be populated");
    assert!(
        model_daily_rows > 0,
        "usage_models_daily should be populated"
    );
}
24968
24969    // =========================================================================
24970    // FrankenConnectionManager tests (bead 3rlf8)
24971    // =========================================================================
24972
24973    #[test]
24974    fn connection_manager_creates_readers() {
24975        let dir = TempDir::new().unwrap();
24976        let db_path = dir.path().join("cm.db");
24977
24978        // Create the DB first
24979        let fs = FrankenStorage::open(&db_path).unwrap();
24980        drop(fs);
24981
24982        let config = ConnectionManagerConfig {
24983            reader_count: 3,
24984            max_writers: 2,
24985        };
24986        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
24987        assert_eq!(mgr.reader_count(), 3);
24988        assert_eq!(mgr.max_writers(), 2);
24989    }
24990
24991    #[test]
24992    fn connection_manager_clamps_zero_writer_limit_to_prevent_deadlock() {
24993        let dir = TempDir::new().unwrap();
24994        let db_path = dir.path().join("cm.db");
24995
24996        let fs = FrankenStorage::open(&db_path).unwrap();
24997        drop(fs);
24998
24999        let mgr = std::sync::Arc::new(
25000            FrankenConnectionManager::new(
25001                &db_path,
25002                ConnectionManagerConfig {
25003                    reader_count: 0,
25004                    max_writers: 0,
25005                },
25006            )
25007            .unwrap(),
25008        );
25009        assert_eq!(mgr.reader_count(), 1);
25010        assert_eq!(mgr.max_writers(), 1);
25011
25012        let (tx, rx) = std::sync::mpsc::channel();
25013        let mgr_for_thread = std::sync::Arc::clone(&mgr);
25014        std::thread::spawn(move || {
25015            let result = mgr_for_thread.writer().map(|mut guard| {
25016                guard.mark_committed();
25017            });
25018            tx.send(result.is_ok()).expect("writer result send");
25019        });
25020
25021        assert!(
25022            rx.recv_timeout(Duration::from_secs(10)).unwrap(),
25023            "writer acquisition should not block forever when configured with zero writer slots"
25024        );
25025    }
25026
25027    #[test]
25028    fn connection_manager_reader_round_robin() {
25029        let dir = TempDir::new().unwrap();
25030        let db_path = dir.path().join("cm.db");
25031
25032        let fs = FrankenStorage::open(&db_path).unwrap();
25033        drop(fs);
25034
25035        let config = ConnectionManagerConfig {
25036            reader_count: 2,
25037            max_writers: 1,
25038        };
25039        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25040
25041        // Reader index should advance (round-robin)
25042        let idx_before = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
25043        let _r1 = mgr.reader();
25044        let idx_after = mgr.reader_idx.load(std::sync::atomic::Ordering::Relaxed);
25045        assert_eq!(idx_after, idx_before + 1, "reader index should advance");
25046    }
25047
25048    #[test]
25049    fn connection_manager_writer_reads_and_writes() {
25050        use frankensqlite::compat::RowExt;
25051
25052        let dir = TempDir::new().unwrap();
25053        let db_path = dir.path().join("cm.db");
25054
25055        let fs = FrankenStorage::open(&db_path).unwrap();
25056        drop(fs);
25057
25058        let mgr = FrankenConnectionManager::new(&db_path, Default::default()).unwrap();
25059
25060        // Acquire writer and insert data
25061        {
25062            let mut guard = mgr.writer().unwrap();
25063            guard
25064                .storage()
25065                .raw()
25066                .execute("CREATE TABLE IF NOT EXISTS cm_test (id INTEGER PRIMARY KEY, val TEXT)")
25067                .unwrap();
25068            guard
25069                .storage()
25070                .raw()
25071                .execute("INSERT INTO cm_test (val) VALUES ('hello')")
25072                .unwrap();
25073            guard.mark_committed();
25074        }
25075
25076        // Verify via reader (returns MutexGuard<SendFrankenConnection>)
25077        let reader_guard = mgr.reader();
25078        let rows = reader_guard.query("SELECT val FROM cm_test").unwrap();
25079        assert_eq!(rows.len(), 1);
25080        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "hello");
25081    }
25082
25083    #[test]
25084    fn connection_manager_writer_guard_drops_releases_slot() {
25085        let dir = TempDir::new().unwrap();
25086        let db_path = dir.path().join("cm.db");
25087
25088        let fs = FrankenStorage::open(&db_path).unwrap();
25089        drop(fs);
25090
25091        let config = ConnectionManagerConfig {
25092            reader_count: 1,
25093            max_writers: 1,
25094        };
25095        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25096
25097        // Acquire and release writer
25098        {
25099            let mut guard = mgr.writer().unwrap();
25100            guard.mark_committed();
25101        }
25102
25103        // Should be able to acquire again (slot released)
25104        let mut guard2 = mgr.writer().unwrap();
25105        guard2.mark_committed();
25106    }
25107
25108    #[test]
25109    fn connection_manager_concurrent_writer_works() {
25110        use frankensqlite::compat::RowExt;
25111
25112        let dir = TempDir::new().unwrap();
25113        let db_path = dir.path().join("cm.db");
25114
25115        let fs = FrankenStorage::open(&db_path).unwrap();
25116        drop(fs);
25117
25118        let config = ConnectionManagerConfig {
25119            reader_count: 1,
25120            max_writers: 2,
25121        };
25122        let mgr = FrankenConnectionManager::new(&db_path, config).unwrap();
25123
25124        {
25125            let mut guard = mgr.concurrent_writer().unwrap();
25126            guard
25127                .storage()
25128                .raw()
25129                .execute("CREATE TABLE IF NOT EXISTS cm_conc (id INTEGER PRIMARY KEY, val TEXT)")
25130                .unwrap();
25131            guard
25132                .storage()
25133                .raw()
25134                .execute("INSERT INTO cm_conc (val) VALUES ('concurrent')")
25135                .unwrap();
25136            guard.mark_committed();
25137        }
25138
25139        let reader_guard = mgr.reader();
25140        let rows = reader_guard.query("SELECT val FROM cm_conc").unwrap();
25141        assert_eq!(rows.len(), 1);
25142        assert_eq!(rows[0].get_typed::<String>(0).unwrap(), "concurrent");
25143    }
25144
25145    #[test]
25146    fn connection_manager_default_config() {
25147        let config = ConnectionManagerConfig::default();
25148        assert_eq!(config.reader_count, 4);
25149        assert!(config.max_writers > 0);
25150    }
25151
25152    #[test]
25153    fn purge_agent_archive_data_removes_only_target_agent_and_rebuilds_derived_tables() {
25154        use crate::model::types::{Agent, AgentKind, Conversation, Message, MessageRole};
25155        use std::path::PathBuf;
25156
25157        fn seed_conversation(storage: &FrankenStorage, agent_slug: &str, marker: &str) {
25158            let agent = Agent {
25159                id: None,
25160                slug: agent_slug.into(),
25161                name: agent_slug.into(),
25162                version: None,
25163                kind: AgentKind::Cli,
25164            };
25165            let agent_id = storage.ensure_agent(&agent).unwrap();
25166            let conversation = Conversation {
25167                id: None,
25168                agent_slug: agent_slug.into(),
25169                workspace: Some(PathBuf::from("/tmp/workspace")),
25170                external_id: Some(format!("{agent_slug}-{marker}")),
25171                title: Some(format!("{agent_slug} {marker}")),
25172                source_path: PathBuf::from(format!("/tmp/{agent_slug}-{marker}.jsonl")),
25173                started_at: Some(1_700_000_000_000),
25174                ended_at: Some(1_700_000_000_100),
25175                approx_tokens: None,
25176                metadata_json: serde_json::Value::Null,
25177                messages: vec![
25178                    Message {
25179                        id: None,
25180                        idx: 0,
25181                        role: MessageRole::User,
25182                        author: Some("user".into()),
25183                        created_at: Some(1_700_000_000_010),
25184                        content: format!("{agent_slug} {marker} user"),
25185                        extra_json: serde_json::Value::Null,
25186                        snippets: Vec::new(),
25187                    },
25188                    Message {
25189                        id: None,
25190                        idx: 1,
25191                        role: MessageRole::Agent,
25192                        author: Some("assistant".into()),
25193                        created_at: Some(1_700_000_000_020),
25194                        content: format!("{agent_slug} {marker} assistant"),
25195                        extra_json: serde_json::Value::Null,
25196                        snippets: Vec::new(),
25197                    },
25198                ],
25199                source_id: LOCAL_SOURCE_ID.into(),
25200                origin_host: None,
25201            };
25202            storage
25203                .insert_conversation_tree(agent_id, None, &conversation)
25204                .unwrap();
25205        }
25206
25207        let dir = TempDir::new().unwrap();
25208        let db_path = dir.path().join("agent_search.db");
25209        let storage = FrankenStorage::open(&db_path).unwrap();
25210
25211        seed_conversation(&storage, "openclaw", "purge-target");
25212        seed_conversation(&storage, "codex", "keep-target");
25213
25214        let purge = storage.purge_agent_archive_data("openclaw").unwrap();
25215        assert_eq!(purge.conversations_deleted, 1);
25216        assert_eq!(purge.messages_deleted, 2);
25217
25218        storage.rebuild_fts().unwrap();
25219        storage.rebuild_analytics().unwrap();
25220        storage.rebuild_daily_stats().unwrap();
25221        storage.rebuild_token_daily_stats().unwrap();
25222
25223        let agents = storage.list_agents().unwrap();
25224        assert_eq!(agents.len(), 1);
25225        assert_eq!(agents[0].slug, "codex");
25226        assert_eq!(storage.total_conversation_count().unwrap(), 1);
25227        assert_eq!(storage.total_message_count().unwrap(), 2);
25228
25229        let fts_rows: i64 = storage
25230            .raw()
25231            .query_row_map("SELECT COUNT(*) FROM fts_messages", fparams![], |row| {
25232                row.get_typed(0)
25233            })
25234            .unwrap();
25235        assert_eq!(fts_rows, 2);
25236
25237        let total_daily_sessions: i64 = storage
25238            .raw()
25239            .query_row_map(
25240                "SELECT COALESCE(SUM(session_count), 0)
25241                 FROM daily_stats
25242                 WHERE agent_slug = 'all' AND source_id = 'all'",
25243                fparams![],
25244                |row| row.get_typed(0),
25245            )
25246            .unwrap();
25247        assert_eq!(total_daily_sessions, 1);
25248
25249        let openclaw_token_rows: i64 = storage
25250            .raw()
25251            .query_row_map(
25252                "SELECT COUNT(*) FROM token_daily_stats WHERE agent_slug = 'openclaw'",
25253                fparams![],
25254                |row| row.get_typed(0),
25255            )
25256            .unwrap();
25257        assert_eq!(openclaw_token_rows, 0);
25258    }
25259
25260    /// Regression for cass#202: a `Connection` dropped mid-transaction can
25261    /// leave child rows persisted without a matching parent. The next indexer
25262    /// pass then trips `FOREIGN KEY constraint failed` on every write, the
25263    /// session never gets marked indexed, and the pending backlog grows
25264    /// without bound. `cleanup_orphan_fk_rows` is the indexer-startup
25265    /// self-heal that breaks the cycle.
25266    #[test]
25267    fn cleanup_orphan_fk_rows_removes_orphans_and_is_noop_on_clean_db() {
25268        let dir = TempDir::new().unwrap();
25269        let db_path = dir.path().join("orphan_fk_self_heal.db");
25270        let storage = FrankenStorage::open(&db_path).unwrap();
25271
25272        // Plant orphan rows directly: rows whose FK parent does not exist.
25273        // FK enforcement is temporarily off so the planted rows can land.
25274        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25275
25276        // Seed a real conversation so a subset of children DO have valid
25277        // parents — we want the cleanup to be precise, not a table-flush.
25278        storage
25279            .raw()
25280            .execute_compat(
25281                "INSERT INTO agents(id, slug, name, kind, created_at, updated_at) \
25282                 VALUES(1, 'test-agent', 'Test Agent', 'cli', 0, 0)",
25283                fparams![],
25284            )
25285            .unwrap();
25286        storage
25287            .raw()
25288            .execute_compat(
25289                "INSERT INTO conversations(id, agent_id, source_id, source_path, started_at) \
25290                 VALUES(1, 1, 'local', '/tmp/real.jsonl', 0)",
25291                fparams![],
25292            )
25293            .unwrap();
25294        storage
25295            .raw()
25296            .execute_compat(
25297                "INSERT INTO messages(id, conversation_id, idx, role, content) \
25298                 VALUES(1, 1, 0, 'user', 'real message')",
25299                fparams![],
25300            )
25301            .unwrap();
25302
25303        // Plant orphan messages referencing conversation_id=99999 (does not exist)
25304        // and conversation_id=0 (the specific shape reported in #202). Distinct
25305        // (conversation_id, idx) pairs are required by the UNIQUE constraint.
25306        for (mid, cid, idx) in [(101_i64, 99_999_i64, 0_i64), (102, 0, 0), (103, 0, 1)] {
25307            storage
25308                .raw()
25309                .execute_compat(
25310                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25311                     VALUES(?1, ?2, ?3, 'user', 'orphan message')",
25312                    fparams![mid, cid, idx],
25313                )
25314                .unwrap();
25315        }
25316
25317        // Rows below are not directly orphaned because their immediate
25318        // `messages` parent exists, but that parent is itself orphaned. The
25319        // cleanup deletes them explicitly before deleting orphan messages so the
25320        // FK cascade engine does not have to run one delete program per orphan.
25321        for message_id in [1_i64, 101_i64, 102_i64] {
25322            storage
25323                .raw()
25324                .execute_compat(
25325                    "INSERT INTO message_metrics(
25326                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25327                         role, content_chars, content_tokens_est
25328                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 13, 2)",
25329                    fparams![message_id],
25330                )
25331                .unwrap();
25332            storage
25333                .raw()
25334                .execute_compat(
25335                    "INSERT INTO token_usage(
25336                         message_id, conversation_id, agent_id, timestamp_ms, day_id,
25337                         role, content_chars
25338                     ) VALUES(?1, 1, 1, 0, 0, 'user', 13)",
25339                    fparams![message_id],
25340                )
25341                .unwrap();
25342        }
25343
25344        // Plant a directly-orphan snippet — message_id=99999 does not exist
25345        // anywhere, so this exercises the snippets DELETE path rather than
25346        // riding on the cascade from the orphan-message DELETE.
25347        storage
25348            .raw()
25349            .execute_compat(
25350                "INSERT INTO snippets(message_id, file_path, start_line, end_line, language, snippet_text) \
25351                 VALUES(99999, '/tmp/orphan-snippet.rs', 1, 2, 'rust', 'fn main() {}')",
25352                fparams![],
25353            )
25354            .unwrap();
25355
25356        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25357
25358        // Sanity: the planted orphans are visible.
25359        let messages_before: i64 = storage
25360            .raw()
25361            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25362                row.get_typed(0)
25363            })
25364            .unwrap();
25365        assert_eq!(messages_before, 4); // 1 real + 3 orphans
25366        let snippets_before: i64 = storage
25367            .raw()
25368            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25369                row.get_typed(0)
25370            })
25371            .unwrap();
25372        assert_eq!(snippets_before, 1);
25373        let metrics_before: i64 = storage
25374            .raw()
25375            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25376                row.get_typed(0)
25377            })
25378            .unwrap();
25379        assert_eq!(metrics_before, 3);
25380        let token_usage_before: i64 = storage
25381            .raw()
25382            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25383                row.get_typed(0)
25384            })
25385            .unwrap();
25386        assert_eq!(token_usage_before, 3);
25387
25388        // Run the self-heal.
25389        let report = storage.cleanup_orphan_fk_rows().unwrap();
25390
25391        // 3 orphan messages + 1 directly-orphan snippet = 4 primary orphans
25392        // reported. Dependent message_metrics/token_usage rows for orphan
25393        // messages are pruned too, but they are not double-counted because the
25394        // orphan message is the root row that made them invalid.
25395        let messages_after: i64 = storage
25396            .raw()
25397            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25398                row.get_typed(0)
25399            })
25400            .unwrap();
25401        assert_eq!(messages_after, 1, "real message must be preserved");
25402        let snippets_after: i64 = storage
25403            .raw()
25404            .query_row_map("SELECT COUNT(*) FROM snippets", fparams![], |row| {
25405                row.get_typed(0)
25406            })
25407            .unwrap();
25408        assert_eq!(snippets_after, 0);
25409        let metrics_after: i64 = storage
25410            .raw()
25411            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25412                row.get_typed(0)
25413            })
25414            .unwrap();
25415        assert_eq!(metrics_after, 1, "real message metric must be preserved");
25416        let token_usage_after: i64 = storage
25417            .raw()
25418            .query_row_map("SELECT COUNT(*) FROM token_usage", fparams![], |row| {
25419                row.get_typed(0)
25420            })
25421            .unwrap();
25422        assert_eq!(token_usage_after, 1, "real token row must be preserved");
25423
25424        assert_eq!(report.total, 4, "report total: {:?}", report);
25425        let messages_count = report
25426            .per_table
25427            .iter()
25428            .find(|(t, _)| *t == "messages")
25429            .map(|(_, c)| *c);
25430        assert_eq!(messages_count, Some(3));
25431        let snippets_count = report
25432            .per_table
25433            .iter()
25434            .find(|(t, _)| *t == "snippets")
25435            .map(|(_, c)| *c);
25436        assert_eq!(snippets_count, Some(1));
25437
25438        // Second invocation on a now-clean DB must be a no-op.
25439        let second = storage.cleanup_orphan_fk_rows().unwrap();
25440        assert_eq!(second.total, 0);
25441        assert!(second.per_table.is_empty());
25442    }
25443
25444    #[test]
25445    fn cleanup_orphan_fk_rows_handles_more_than_one_delete_chunk() {
25446        let dir = TempDir::new().unwrap();
25447        let db_path = dir.path().join("orphan_fk_chunked_self_heal.db");
25448        let storage = FrankenStorage::open(&db_path).unwrap();
25449        let orphan_count = ORPHAN_FK_ID_CHUNK_SIZE + 3;
25450
25451        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25452        {
25453            let mut tx = storage.raw().transaction().unwrap();
25454            for idx in 0..orphan_count {
25455                let message_id = 10_000_i64 + i64::try_from(idx).unwrap();
25456                let conversation_id = 20_000_i64 + i64::try_from(idx).unwrap();
25457                tx.execute_compat(
25458                    "INSERT INTO messages(id, conversation_id, idx, role, content) \
25459                     VALUES(?1, ?2, 0, 'user', 'orphan message')",
25460                    fparams![message_id, conversation_id],
25461                )
25462                .unwrap();
25463                tx.execute_compat(
25464                    "INSERT INTO message_metrics(
25465                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25466                         role, content_chars, content_tokens_est
25467                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 14, 2)",
25468                    fparams![message_id],
25469                )
25470                .unwrap();
25471            }
25472            tx.commit().unwrap();
25473        }
25474        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25475
25476        let report = storage.cleanup_orphan_fk_rows().unwrap();
25477
25478        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25479        let messages_count = report
25480            .per_table
25481            .iter()
25482            .find(|(table, _)| *table == "messages")
25483            .map(|(_, count)| *count);
25484        assert_eq!(messages_count, Some(i64::try_from(orphan_count).unwrap()));
25485        let messages_after: i64 = storage
25486            .raw()
25487            .query_row_map("SELECT COUNT(*) FROM messages", fparams![], |row| {
25488                row.get_typed(0)
25489            })
25490            .unwrap();
25491        assert_eq!(messages_after, 0);
25492        let metrics_after: i64 = storage
25493            .raw()
25494            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25495                row.get_typed(0)
25496            })
25497            .unwrap();
25498        assert_eq!(metrics_after, 0);
25499    }
25500
25501    #[test]
25502    fn cleanup_orphan_fk_rows_pages_direct_child_orphans() {
25503        let dir = TempDir::new().unwrap();
25504        let db_path = dir.path().join("direct_orphan_fk_paged_self_heal.db");
25505        let storage = FrankenStorage::open(&db_path).unwrap();
25506        let orphan_count = (ORPHAN_FK_ID_CHUNK_SIZE * 2) + 5;
25507
25508        storage.raw().execute("PRAGMA foreign_keys = OFF").unwrap();
25509        {
25510            let mut tx = storage.raw().transaction().unwrap();
25511            for idx in 0..orphan_count {
25512                let message_id = 50_000_i64 + i64::try_from(idx).unwrap();
25513                tx.execute_compat(
25514                    "INSERT INTO message_metrics(
25515                         message_id, created_at_ms, hour_id, day_id, agent_slug,
25516                         role, content_chars, content_tokens_est
25517                     ) VALUES(?1, 0, 0, 0, 'test-agent', 'user', 21, 3)",
25518                    fparams![message_id],
25519                )
25520                .unwrap();
25521            }
25522            tx.commit().unwrap();
25523        }
25524        storage.raw().execute("PRAGMA foreign_keys = ON").unwrap();
25525
25526        let report = storage.cleanup_orphan_fk_rows().unwrap();
25527
25528        assert_eq!(report.total, i64::try_from(orphan_count).unwrap());
25529        let metrics_count = report
25530            .per_table
25531            .iter()
25532            .filter(|(table, _)| *table == "message_metrics")
25533            .map(|(_, count)| *count)
25534            .sum::<i64>();
25535        assert_eq!(metrics_count, i64::try_from(orphan_count).unwrap());
25536        assert_eq!(
25537            report
25538                .per_table
25539                .iter()
25540                .filter(|(table, _)| *table == "message_metrics")
25541                .count(),
25542            1,
25543            "paged cleanup should aggregate report entries by table: {report:?}"
25544        );
25545        let metrics_after: i64 = storage
25546            .raw()
25547            .query_row_map("SELECT COUNT(*) FROM message_metrics", fparams![], |row| {
25548                row.get_typed(0)
25549            })
25550            .unwrap();
25551        assert_eq!(metrics_after, 0);
25552    }
25553}